In [1]:
import numpy as np
import pandas as pd

import random
import math
from scipy.stats import norm
import graphviz

from datetime import datetime
import pytz
import os

In [2]:
def write_to_csv(df, outname, outdir, index=None):
    '''
    inputs:
    output:
    '''
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    fullname = os.path.join(outdir, outname)    
    df.to_csv(fullname, index=index) 
    return

def prob_to_odds(p):
    '''
    input:
    output:
    '''
    p = max(p,0.000001)
    p = min(p,0.999999)
    if p <= 0.5:
        return round(100/(p/(1-p)))
    return round(-100*(p/(1-p)))

In [36]:
bracket_type = 'mens'
bi = bracket_type[0].upper()

In [37]:
pred = pd.read_csv('Output/{}/Regression/predictions_2023.csv'.format(bi+bracket_type[1:]))
college_names = pd.read_csv('Data/{}Teams.csv'.format(bi, bi))

pred = pred.merge(college_names, left_on='TeamID1', right_on='TeamID')
pred = pred.merge(college_names, left_on='TeamID2', right_on='TeamID')

pred

Unnamed: 0,Season,TeamID1,TeamID2,Pred,Win_Prob,PScore1,PScore2,StdScore1,StdScore2,TeamID_x,TeamName_x,FirstD1Season_x,LastD1Season_x,TeamID_y,TeamName_y,FirstD1Season_y,LastD1Season_y
0,2023,1101,1102,0.484804,0.551107,67.968220,68.40598,11.398313,11.581235,1101,Abilene Chr,2014,2023,1102,Air Force,1985,2023
1,2023,1101,1103,0.230862,0.279713,67.941350,76.76892,11.398313,12.620345,1101,Abilene Chr,2014,2023,1103,Akron,1985,2023
2,2023,1102,1103,0.239152,0.240300,61.537693,70.10966,11.581235,12.620345,1102,Air Force,1985,2023,1103,Akron,1985,2023
3,2023,1101,1104,0.042546,0.101431,67.784930,88.73936,11.398313,12.992937,1101,Abilene Chr,2014,2023,1104,Alabama,1985,2023
4,2023,1102,1104,0.052077,0.084203,61.424072,81.35815,11.581235,12.992937,1102,Air Force,1985,2023,1104,Alabama,1985,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65698,2023,1472,1477,0.690299,0.691325,70.280910,65.54291,9.319421,9.763690,1472,St Thomas MN,2022,2023,1477,TX A&M Commerce,2023,2023
65699,2023,1473,1477,0.325608,0.449096,64.861350,70.11245,13.818760,9.763690,1473,Lindenwood,2023,2023,1477,TX A&M Commerce,2023,2023
65700,2023,1474,1477,0.618967,0.605614,73.142660,69.95516,11.351692,9.763690,1474,Queens NC,2023,2023,1477,TX A&M Commerce,2023,2023
65701,2023,1475,1477,0.611416,0.568902,71.715065,68.95264,9.758033,9.763690,1475,Southern Indiana,2023,2023,1477,TX A&M Commerce,2023,2023


In [38]:
pred['Total'] = pred.apply(lambda row: round(row['PScore1'] + row['PScore2'], 1), axis=1)
pred['Spread'] = pred.apply(lambda row: round(row['PScore2'] - row['PScore1'], 1), axis=1)
pred['US_Odds'] = pred.apply(lambda row: prob_to_odds(row['Pred']), axis=1)
pred['Win'] = pred.apply(lambda row: 100*round(row['Pred'], 4), axis=1)
pred['Win_Elo'] = pred.apply(lambda row: 100*round(row['Win_Prob'], 4), axis=1)
pred['Score_x'] = pred.apply(lambda row: round(row['PScore1'], 2), axis=1)
pred['Score_y'] = pred.apply(lambda row: round(row['PScore2'], 2), axis=1)
pred['StdScore_x'] = pred.apply(lambda row: round(row['StdScore1'], 2), axis=1)
pred['StdScore_y'] = pred.apply(lambda row: round(row['StdScore2'], 2), axis=1)

cols = ['TeamID_x','TeamName_x','TeamID_y','TeamName_y','Total','Spread','US_Odds','Win','Win_Elo',\
        'Score_x','Score_y','StdScore_x','StdScore_y']
pred[cols]

Unnamed: 0,TeamID_x,TeamName_x,TeamID_y,TeamName_y,Total,Spread,US_Odds,Win,Win_Elo,Score_x,Score_y,StdScore_x,StdScore_y
0,1101,Abilene Chr,1102,Air Force,136.4,0.4,106,48.48,55.11,67.97,68.41,11.40,11.58
1,1101,Abilene Chr,1103,Akron,144.7,8.8,333,23.09,27.97,67.94,76.77,11.40,12.62
2,1102,Air Force,1103,Akron,131.6,8.6,318,23.92,24.03,61.54,70.11,11.58,12.62
3,1101,Abilene Chr,1104,Alabama,156.5,21.0,2250,4.25,10.14,67.78,88.74,11.40,12.99
4,1102,Air Force,1104,Alabama,142.8,19.9,1820,5.21,8.42,61.42,81.36,11.58,12.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65698,1472,St Thomas MN,1477,TX A&M Commerce,135.8,-4.7,-223,69.03,69.13,70.28,65.54,9.32,9.76
65699,1473,Lindenwood,1477,TX A&M Commerce,135.0,5.3,207,32.56,44.91,64.86,70.11,13.82,9.76
65700,1474,Queens NC,1477,TX A&M Commerce,143.1,-3.2,-162,61.90,60.56,73.14,69.96,11.35,9.76
65701,1475,Southern Indiana,1477,TX A&M Commerce,140.7,-2.8,-157,61.14,56.89,71.72,68.95,9.76,9.76


In [7]:
NROUNDS = 10000
playin_losers = [1411,1369,1280,1305]

teams = pd.read_csv('Data/{}NCAATourneySeeds.csv'.format(bi, bi))
teams = teams[teams['Season']==2023].reset_index(drop=True)

teams['SeedName'] = teams['Seed']
teams['Seed'] = teams.apply(lambda row: int(row['SeedName'][-2:]) if row['SeedName'][-1] not in 'ab' \
                            else int(row['SeedName'][-3:-1]), axis=1)
teams['Ord'] = teams.apply(lambda row: row['SeedName'][:1], axis=1)
teams['Playin'] = teams.apply(lambda row: True if row['SeedName'][-1] in 'ab' else False, axis=1)

teams['Slot1'] = teams.apply(lambda row: 8*(ord(row['Ord'])-ord('W')) + (16-row['Seed']) if row['Seed'] > 8 \
                             else 8*(ord(row['Ord'])-ord('W')) + (row['Seed']-1), axis=1)
teams['Slot2'] = teams.apply(lambda row: 4*(ord(row['Ord'])-ord('W')) + (7-row['Slot1']%8) if row['Slot1']%8 > 3 \
                                          else 4*(ord(row['Ord'])-ord('W')) + row['Slot1']%8, axis=1)
teams['Slot3'] = teams.apply(lambda row: 2*(ord(row['Ord'])-ord('W')) + (3-row['Slot2']%4) if row['Slot2']%4 > 1 \
                                          else 2*(ord(row['Ord'])-ord('W')) + row['Slot2']%4, axis=1)
teams['Slot4'] = teams.apply(lambda row: ord(row['Ord'])-ord('W'), axis=1)
teams['Slot5'] = teams.apply(lambda row: (ord(row['Ord'])-ord('W'))//2, axis=1)
teams['Slot6'] = 0

win_dict = dict(zip(teams['TeamID'].values, np.zeros(len(teams['TeamID'].values))))

for n in range(NROUNDS):
    if n % 1000 == 0:
        print('Completed {} simulations:\t{}'.format(n, datetime.now(pytz.timezone('US/Pacific'))))
    
    teams['Lost'] = False
    
    if len(playin_losers) != 4:
        for region in set(teams['Ord'].values):
            for s in range(1,17):
                tms = teams[(teams['Playin'])&(teams['Seed']==s)&(teams['Ord']==region)]['TeamID'].values
                df = pred[(pred['TeamID_x'].isin(tms))&(pred['TeamID_y'].isin(tms))]
                if len(df) > 0:
                    if random.uniform(0,1) < df['Pred'].values[0]:
                        loser = df['TeamID2'].values[0]
                    else:
                        loser = df['TeamID1'].values[0]
                    teams.loc[teams['TeamID']==loser, ['Lost']] = True
    else:
        for loser in playin_losers:
            teams.loc[teams['TeamID']==loser, ['Lost']] = True

    for r in range(1,7):
        for i in range(2**(6-r)):
            tms = teams[(teams['Slot{}'.format(r)]==i)&(teams['Lost']==False)]['TeamID'].values
            df = pred[(pred['TeamID_x'].isin(tms))&(pred['TeamID_y'].isin(tms))]
            if random.uniform(0,1) < df['Pred'].values[0]:
                loser = df['TeamID2'].values[0]
            else:
                loser = df['TeamID1'].values[0]
            teams.loc[teams['TeamID']==loser, ['Lost']] = True

        for t in teams[~teams['Lost']]['TeamID'].values:
            win_dict[t] += 1

win_dict = {k:v/NROUNDS for k,v in win_dict.items()}

print('Completed all simulations:\t{}'.format(datetime.now(pytz.timezone('US/Pacific'))))

Completed 0 simulations:	2023-03-17 14:51:24.549950-07:00
Completed 1000 simulations:	2023-03-17 14:52:59.922457-07:00
Completed 2000 simulations:	2023-03-17 14:54:35.812552-07:00
Completed 3000 simulations:	2023-03-17 14:56:11.885753-07:00
Completed 4000 simulations:	2023-03-17 14:57:47.328774-07:00
Completed 5000 simulations:	2023-03-17 14:59:22.008900-07:00
Completed 6000 simulations:	2023-03-17 15:00:56.730593-07:00
Completed 7000 simulations:	2023-03-17 15:02:32.778069-07:00
Completed 8000 simulations:	2023-03-17 15:04:07.337146-07:00
Completed 9000 simulations:	2023-03-17 15:06:05.177347-07:00
Completed all simulations:	2023-03-17 15:07:40.216891-07:00


In [8]:
tournament_teams = set(teams['TeamID'])
pred = pred[(pred['TeamID1'].isin(tournament_teams))&(pred['TeamID2'].isin(tournament_teams))].copy()

pred['Expected_Wins_x'] = pred.apply(lambda row: round(win_dict[row['TeamID1']],2), axis=1)
pred['Expected_Wins_y'] = pred.apply(lambda row: round(win_dict[row['TeamID2']],2), axis=1)

cols = ['TeamName_x','TeamName_y','Total','Spread','US_Odds','Win','Win_Elo',\
        'Score_x','Score_y','StdScore_x','StdScore_y','Expected_Wins_x','Expected_Wins_y']
pred[cols]

Unnamed: 0,TeamName_x,TeamName_y,Total,Spread,US_Odds,Win,Win_Elo,Score_x,Score_y,StdScore_x,StdScore_y,Expected_Wins_x,Expected_Wins_y
48,Alabama,Arizona,156.6,-2.2,-130,56.47,54.44,79.36,77.20,12.99,13.51,2.88,1.85
58,Alabama,Arizona St,145.9,-13.6,-796,88.84,79.56,79.78,66.17,12.99,9.61,2.88,0.42
65,Arizona,Arizona St,147.2,-8.8,-357,78.13,76.51,78.00,69.16,13.51,9.61,1.85,0.42
94,Alabama,Arkansas,149.2,-6.1,-226,69.28,65.98,77.65,71.51,12.99,11.41,2.88,0.96
101,Arizona,Arkansas,148.5,-3.0,-147,59.45,61.88,75.71,72.74,13.51,11.41,1.85,0.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60348,Utah St,Xavier,153.5,-3.2,-162,61.80,52.05,78.35,75.15,10.24,11.07,1.50,1.68
60351,VCU,Xavier,142.8,-1.8,-132,56.89,53.45,72.34,70.50,10.17,11.07,0.69,1.68
60354,Vermont,Xavier,146.0,2.6,145,40.89,38.56,71.68,74.27,11.42,11.07,0.65,1.68
60356,Virginia,Xavier,136.6,0.7,112,47.20,49.81,67.97,68.67,8.99,11.07,0.86,1.68


In [9]:
bracket = teams[['TeamID','Ord','Seed','Playin']].copy()
bracket['Expected_Wins'] = bracket.apply(lambda row: round(win_dict[row['TeamID']],2), axis=1)

cols = ['TeamName','Ord','Seed','Playin','Expected_Wins']
bracket = bracket.merge(college_names, on='TeamID')[cols]

write_to_csv(bracket, 'PredictedTournamentSuccess.csv', 'Output/{}/Summary/'.format(bi+bracket_type[1:]))

bracket.head()

Unnamed: 0,TeamName,Ord,Seed,Playin,Expected_Wins
0,Purdue,W,1,False,2.49
1,Marquette,W,2,False,1.38
2,Kansas St,W,3,False,0.83
3,Tennessee,W,4,False,1.44
4,Duke,W,5,False,1.2


In [16]:
cols = ['TeamName_x','TeamName_y','Total','Spread','US_Odds','Win','Win_Elo',\
        'Score_x','Score_y','StdScore_x','StdScore_y','Expected_Wins_x','Expected_Wins_y']
likeliest_bracket = pd.DataFrame(dict(zip(cols,[[]]*len(cols))))

teams['Lost'] = False

for region in set(teams['Ord'].values):
    for s in range(1,17):
        tms = teams[(teams['Playin'])&(teams['Seed']==s)&(teams['Ord']==region)]['TeamID'].values
        df = pred[(pred['TeamID_x'].isin(tms))&(pred['TeamID_y'].isin(tms))]
        if len(df) > 0:
            if df['Pred'].values[0] > 0.5:
                loser = df['TeamID2'].values[0]
            else:
                loser = df['TeamID1'].values[0]
            teams.loc[teams['TeamID']==loser, ['Lost']] = True
        likeliest_bracket = pd.concat([likeliest_bracket, df[cols]], axis=0)

for r in range(1,7):
    for i in range(2**(6-r)):
        tms = teams[(teams['Slot{}'.format(r)]==i)&(teams['Lost']==False)]['TeamID'].values
        df = pred[(pred['TeamID_x'].isin(tms))&(pred['TeamID_y'].isin(tms))]
        if df['Pred'].values[0] > 0.5:
            loser = df['TeamID2'].values[0]
        else:
            loser = df['TeamID1'].values[0]
        teams.loc[teams['TeamID']==loser, ['Lost']] = True
        likeliest_bracket = pd.concat([likeliest_bracket, df[cols]], axis=0)

write_to_csv(likeliest_bracket, 'LikeliestOutcomeBracket.csv', 'Output/{}/Summary/'.format(bi+bracket_type[1:]))
likeliest_bracket.reset_index(drop=True)

Unnamed: 0,TeamName_x,TeamName_y,Total,Spread,US_Odds,Win,Win_Elo,Score_x,Score_y,StdScore_x,StdScore_y,Expected_Wins_x,Expected_Wins_y
0,Mississippi St,Pittsburgh,131.1,-2.6,-159.0,61.34,57.26,66.84,64.22,8.74,9.46,0.00,0.60
1,F Dickinson,TX Southern,139.7,-6.0,-251.0,71.55,48.97,72.86,66.88,11.44,9.61,0.04,0.00
2,Arizona St,Nevada,133.4,1.6,130.0,43.45,50.11,65.88,67.49,9.61,9.95,0.42,0.00
3,SE Missouri St,TAM C. Christi,151.4,2.8,142.0,41.26,37.50,74.27,77.09,12.18,13.37,0.00,0.08
4,F Dickinson,Purdue,141.6,17.9,2484.0,3.87,8.58,61.86,79.79,11.44,9.01,0.04,2.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,Houston,Texas,134.3,-5.2,-213.0,68.05,62.61,69.78,64.55,11.02,11.29,3.32,2.01
63,Connecticut,Gonzaga,146.9,3.5,153.0,39.56,43.08,71.72,75.18,11.82,14.45,1.57,2.52
64,Alabama,Purdue,146.5,-1.3,-121.0,54.72,55.35,73.88,72.59,12.99,9.01,2.88,2.49
65,Gonzaga,Houston,140.9,1.1,115.0,46.42,45.84,69.88,71.01,14.45,11.02,2.52,3.32


In [10]:
write_to_csv(pred[cols], 'AllPossibleMatchups.csv', 'Output/{}/Summary/'.format(bi+bracket_type[1:]))

In [11]:
NROUNDS = 5000
playin_losers = []

elo_dict = dict(zip(teams['TeamID'].values, np.zeros(len(teams['TeamID'].values))))

for n in range(NROUNDS):
    if n % 1000 == 0:
        print('Completed {} simulations:\t{}'.format(n, datetime.now(pytz.timezone('US/Pacific'))))
    
    teams['Lost'] = False
    
    if len(playin_losers) != 4:
        for region in set(teams['Ord'].values):
            for s in range(1,17):
                tms = teams[(teams['Playin'])&(teams['Seed']==s)&(teams['Ord']==region)]['TeamID'].values
                df = pred[(pred['TeamID_x'].isin(tms))&(pred['TeamID_y'].isin(tms))]
                if len(df) > 0:
                    if random.uniform(0,1) < df['Win_Prob'].values[0]:
                        loser = df['TeamID2'].values[0]
                    else:
                        loser = df['TeamID1'].values[0]
                    teams.loc[teams['TeamID']==loser, ['Lost']] = True
    else:
        for loser in playin_losers:
            teams.loc[teams['TeamID']==loser, ['Lost']] = True

    for r in range(1,7):
        for i in range(2**(6-r)):
            tms = teams[(teams['Slot{}'.format(r)]==i)&(teams['Lost']==False)]['TeamID'].values
            df = pred[(pred['TeamID_x'].isin(tms))&(pred['TeamID_y'].isin(tms))]
            if random.uniform(0,1) < df['Win_Prob'].values[0]:
                loser = df['TeamID2'].values[0]
            else:
                loser = df['TeamID1'].values[0]
            teams.loc[teams['TeamID']==loser, ['Lost']] = True

        for t in teams[~teams['Lost']]['TeamID'].values:
            elo_dict[t] += 1

elo_dict = {k:v/NROUNDS for k,v in elo_dict.items()}

print('Completed all simulations:\t{}'.format(datetime.now(pytz.timezone('US/Pacific'))))

Completed 0 simulations:	2023-03-15 21:52:11.440624-07:00
Completed 1000 simulations:	2023-03-15 21:53:48.482709-07:00
Completed 2000 simulations:	2023-03-15 21:55:25.855296-07:00
Completed 3000 simulations:	2023-03-15 21:57:02.686540-07:00
Completed 4000 simulations:	2023-03-15 21:58:39.705567-07:00
Completed all simulations:	2023-03-15 22:00:16.919603-07:00


In [12]:
elo_bracket = teams[['TeamID','Ord','Seed','Playin']].copy()
elo_bracket['Expected_Wins'] = elo_bracket.apply(lambda row: round(elo_dict[row['TeamID']],2), axis=1)

cols = ['TeamName','Ord','Seed','Playin','Expected_Wins']
elo_bracket = elo_bracket.merge(college_names, on='TeamID')[cols]

write_to_csv(elo_bracket, 'PredictedEloTournamentSuccess.csv', 'Output/{}/Summary/'.format(bi+bracket_type[1:]))

bracket.head()

Unnamed: 0,TeamName,Ord,Seed,Playin,Expected_Wins
0,South Carolina,W,1,False,5.29
1,Maryland,W,2,False,1.96
2,Notre Dame,W,3,False,1.85
3,UCLA,W,4,False,1.31
4,Oklahoma,W,5,False,1.31


In [44]:
matchup = ['Texas','San Diego St']
cols = ['TeamName_x','TeamName_y','Total','Spread','US_Odds','Win','Win_Elo',\
        'Score_x','Score_y','StdScore_x','StdScore_y']
pred[(pred['TeamName_x'].isin(matchup))&(pred['TeamName_y'].isin(matchup))][cols]

Unnamed: 0,TeamName_x,TeamName_y,Total,Spread,US_Odds,Win,Win_Elo,Score_x,Score_y,StdScore_x,StdScore_y
41578,San Diego St,Texas,137.6,4.0,186,34.97,46.86,66.79,70.79,9.49,11.29


In [15]:
pred = pd.read_csv('Output/Mens/Summary/AllPossibleMatchups.csv')

matchup = ['Connecticut','Michigan']
pred[(pred['TeamName_x'].isin(matchup))&(pred['TeamName_y'].isin(matchup))]

Unnamed: 0,TeamName_x,TeamName_y,Total,Spread,US_Odds,Win,Win_Elo,Score_x,Score_y,StdScore_x,StdScore_y,Expected_Wins_x,Expected_Wins_y
