## Bracket Challenge Predictor

Reading data

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import re

from functools import reduce
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from xgboost import XGBClassifier

## Data Preprocessing

In [3]:
team_matchups = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Tournament Matchups.csv")

away_stats = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Barttorvik Away.csv")
home_stats = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Barttorvik Home.csv")
neutral_stats = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Barttorvik Neutral.csv")

conf_perf = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Conference Results.csv")

team_res = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Team Results.csv")
team_hist = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Resumes.csv")
team_v_ranked = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/TeamRankings.csv")
team_mis = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/KenPom Barttorvik.csv")

coach_perf = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Coach Results.csv")

seed_res = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Seed Results.csv")

In [4]:
team_ids = team_matchups[['YEAR','TEAM','TEAM NO']].drop_duplicates()

team_res.rename(columns={'TEAM ID':'TEAM NO'},inplace=True)

In [6]:
temp_away = away_stats.loc[:,['YEAR','TEAM NO','BADJ O','BADJ D','WIN%','EFG%','EFG%D','PPPO','PPPD']]
temp_home = home_stats.loc[:,['YEAR','TEAM NO','BADJ O','BADJ D','WIN%','EFG%','EFG%D','PPPO','PPPD']]
temp_neutral = neutral_stats.loc[:,['YEAR','TEAM NO','BADJ O','BADJ D','WIN%','EFG%','EFG%D','PPPO','PPPD']]

#adds _A to all columns to represent Away
temp_away.columns = list(temp_away.columns[:2]) + [col + "_A" for col in temp_away.columns[2:]]

#_H for Home
temp_home.columns = list(temp_home.columns[:2]) + [col + "_H" for col in temp_home.columns[2:]]

#_N for Neutral
temp_neutral.columns = list(temp_neutral.columns[:2]) + [col + "_N" for col in temp_neutral.columns[2:]]


#combined stats as they are highly correlated away, home, and neutral
#function to create new combined dataframe easily
def var_mean(dfs, vars):
    combined_df = [dfs[0].loc[:,vars],dfs[1].loc[:,vars],dfs[2].loc[:,vars]]
    combined_df = reduce(lambda left, right: pd.merge(left, right, on=['YEAR','TEAM NO']),combined_df)
    for var in vars[2:]:
        combined_df[var] = combined_df[[var+'_x',var+'_y',var]].mean(axis=1)
        combined_df.drop([var+'_x',var+'_y'],axis=1,inplace=True)
    return combined_df

vars = ['YEAR','TEAM NO','BADJ EM','FTR','FTRD',
        'BADJ T','TOV%','TOV%D','OREB%','OP OREB%','RAW T','WAB',
        '2PT%', '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%',
        'AST%', 'OP AST%', '2PTR', '3PTR', '2PTRD', '3PTRD','ELITE SOS']
dfs = [away_stats,home_stats,neutral_stats]

team_stats = var_mean(dfs,vars)

temp_res = team_res.loc[:,['TEAM','PAKE','PASE','R64','R32','S16','E8','F4',
                           'F2','CHAMP','TOP2']]

temp_his = team_hist.drop(['TEAM','SEED','ROUND','Q1 PLUS Q2 W','B POWER','BID TYPE'],axis=1)

temp_v_ranked = team_v_ranked.loc[:,['YEAR','TEAM NO','TR RATING','V 1-25 WINS','V 1-25 LOSS',
                                     'V 26-50 WINS','V 26-50 LOSS','LUCK RATING','CONSISTENCY TR RATING']]

temp_mis = team_mis.loc[:,['YEAR','CONF','CONF ID','TEAM NO','K OFF','K DEF','AVG HGT','EFF HGT','EXP','TALENT']]


Created a column on who won each matchup for every game in the dataset. Will now add columns of interest to this dataset and will try to do feature engineering as to model who'd win each matchup.

In [7]:
wins = []
for i in range(0,len(team_matchups)-1,2):
    if team_matchups.iloc[i,8] > team_matchups.iloc[i+1,8]:
        wins.append(1)
        wins.append(0)
    else:
        wins.append(0)
        wins.append(1)
    
comp_stats = team_matchups.iloc[:,[0,3,4,5,6,7,8]]
comp_stats['WIN'] = wins
        
comp_stats.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_stats['WIN'] = wins


Unnamed: 0,YEAR,TEAM NO,TEAM,SEED,ROUND,CURRENT ROUND,SCORE,WIN
0,2024,1067,Connecticut,1,1,64,91,1
1,2024,1026,Stetson,16,64,64,52,0
2,2024,1060,Florida Atlantic,8,64,64,65,0
3,2024,1036,Northwestern,9,32,64,77,1
4,2024,1029,San Diego St.,5,16,64,69,1


Creating matchup id which will be useful when performing train test split as we need the games split together

In [8]:
matchup_id = []

for i in range(len(comp_stats)//2):
    matchup_id.append(i)
    matchup_id.append(i)

comp_stats['MATCHUP ID'] = matchup_id
comp_stats = comp_stats.iloc[:,[0,1,2,8,4,5,6,7,3]]
comp_stats

Unnamed: 0,YEAR,TEAM NO,TEAM,MATCHUP ID,ROUND,CURRENT ROUND,SCORE,WIN,SEED
0,2024,1067,Connecticut,0,1,64,91,1,1
1,2024,1026,Stetson,0,64,64,52,0,16
2,2024,1060,Florida Atlantic,1,64,64,65,0,8
3,2024,1036,Northwestern,1,32,64,77,1,9
4,2024,1029,San Diego St.,2,16,64,69,1,5
...,...,...,...,...,...,...,...,...,...
2009,2008,43,Kansas,1004,1,4,84,1,1
2010,2008,37,Memphis,1005,2,4,78,1,1
2011,2008,13,UCLA,1005,4,4,63,0,1
2012,2008,43,Kansas,1006,1,2,75,1,1


In [9]:
comp_team_stats = comp_stats.merge(temp_mis,on=['YEAR','TEAM NO'],how='left')

#rearranging so CONF and CONF ID are near front
comp_team_stats = comp_team_stats.iloc[:,[0,1,2,9,10,3,4,5,6,7,8,11,12,13,14]]

dfs = [comp_team_stats.drop(['SCORE'],axis=1),temp_away,temp_home,temp_neutral,temp_v_ranked,temp_his,team_stats]

comp_team_stats = reduce(lambda left, right: pd.merge(left, right, on=['YEAR','TEAM NO']),dfs)

comp_team_stats = comp_team_stats.merge(temp_res,on='TEAM',how='left')

comp_team_stats.head()

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF,CONF ID,MATCHUP ID,ROUND,CURRENT ROUND,WIN,SEED,...,PAKE,PASE,R64,R32,S16,E8,F4,F2,CHAMP,TOP2
0,2024,1067,Connecticut,BE,8,0,1,64,1,1,...,10.8,13.3,10.0,6.0,5.0,5.0,5.0,4.0,4.0,2.0
1,2024,1026,Stetson,ASun,5,0,64,64,0,16,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024,1060,Florida Atlantic,Amer,4,1,64,64,0,8,...,2.8,2.7,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
3,2024,1036,Northwestern,B10,6,1,32,64,1,9,...,0.9,0.8,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024,1029,San Diego St.,MWC,20,2,16,64,1,5,...,0.8,1.1,11.0,6.0,4.0,1.0,1.0,1.0,0.0,1.0


In [63]:
len(comp_team_stats),len(team_matchups)

(2014, 2014)

## Feature engineering

In [10]:
def differenced_matchup(df):
    matchup = []
    for i in range(0,len(df)-1,2):
        team_a = df.iloc[i,:]
        team_b = df.iloc[i+1,:]
    
        if team_a['WIN'] == 1:
            diff = (team_a.loc['WIN':]-team_b.loc['WIN':]).drop('WIN')
            row = list(team_a.loc[:'WIN']) + list(diff)
            matchup.append(row)
            
            row = list(team_b.loc[:'WIN']) + list(-diff)
            matchup.append(row)
        else:
            diff = (team_b.loc['WIN':]-team_a.loc['WIN':]).drop('WIN')
            row = list(team_b.loc[:'WIN']) + list(diff)
            matchup.append(row)
            
            row = list(team_a.loc[:'WIN']) + list(-diff)
            matchup.append(row)

    columnss = list(df.columns)
    return pd.DataFrame(matchup,columns=columnss) 

diff_team_stats = differenced_matchup(comp_team_stats)
diff_team_stats

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF,CONF ID,MATCHUP ID,ROUND,CURRENT ROUND,WIN,SEED,...,PAKE,PASE,R64,R32,S16,E8,F4,F2,CHAMP,TOP2
0,2024,1067,Connecticut,BE,8,0,1,64,1,-15,...,10.8,13.3,9.0,6.0,5.0,5.0,5.0,4.0,4.0,2.0
1,2024,1026,Stetson,ASun,5,0,64,64,0,15,...,-10.8,-13.3,-9.0,-6.0,-5.0,-5.0,-5.0,-4.0,-4.0,-2.0
2,2024,1036,Northwestern,B10,6,1,32,64,1,1,...,-1.9,-1.9,1.0,2.0,-1.0,-1.0,-1.0,0.0,0.0,0.0
3,2024,1060,Florida Atlantic,Amer,4,1,64,64,0,-1,...,1.9,1.9,-1.0,-2.0,1.0,1.0,1.0,-0.0,-0.0,-0.0
4,2024,1029,San Diego St.,MWC,20,2,16,64,1,-7,...,0.5,1.3,8.0,5.0,4.0,1.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2009,2008,31,North Carolina,ACC,2,1004,4,4,0,0,...,7.8,11.0,-2.0,-3.0,1.0,-0.0,1.0,1.0,-0.0,-3.0
2010,2008,37,Memphis,CUSA,13,1005,2,4,1,0,...,-5.5,-5.0,-2.0,-4.0,-5.0,-1.0,-1.0,1.0,0.0,0.0
2011,2008,13,UCLA,P10,23,1005,4,4,0,0,...,5.5,5.0,2.0,4.0,5.0,1.0,1.0,-1.0,-0.0,-0.0
2012,2008,43,Kansas,B12,7,1006,1,2,1,0,...,4.9,-2.6,8.0,11.0,7.0,6.0,3.0,2.0,2.0,10.0


Add column to store what seeds they've beaten each round

## Model Testing

Choosing predictors and train test split

In [52]:
predictors = diff_team_stats.copy().loc[:, 'WIN':].drop('WIN', axis=1).columns
target = 'WIN'

#test on 2024 season using the raw data as we'll need to find differences dynamically
s24 = comp_team_stats.loc[(comp_team_stats['YEAR']==2024) & (comp_team_stats['CURRENT ROUND']==64),:]

#before 24 season
bf_24 = diff_team_stats.loc[diff_team_stats['YEAR']<2024]

#splitting on MATCHUP ID in order to ensure that the same matchups are split together
train_groups, test_groups = train_test_split(bf_24['MATCHUP ID'].unique(),test_size = .2, random_state=42)

x_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][predictors]
y_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][target]

x_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][predictors]
y_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][target]

Fitting on XGBoost Model as it usually tends to perform best on Tabular data but will try other models as well. Will also perform Cross Validation to find best hyper parameters.

Function to automately simulate tournament and get correct differenced feature dataset

In [11]:
#updates differenced data with matchup 
def create_matchup_features(team1, team2,cols):
    feature_diffs = team1.copy()
    indx = np.where(cols == "WIN")[0][0]
    for col in cols:
        if col not in cols[:indx+1]:
            feature_diffs[col] = team1[col] - team2[col]
    return feature_diffs

#simulates tournamnent
def sim_tourney(df, model):
    rounds = [64, 32, 16, 8, 4, 2]
    sim = {}

    features = df.loc[:, 'WIN':].drop('WIN', axis=1).columns

    for rd in rounds:
        
        cur_rd = df[df['CURRENT ROUND'] == rd].copy()
        next_rd_teams = []
        
        win = [0]*rd
        
        for i in range(0, len(cur_rd), 2):
            team1 = cur_rd.iloc[i]
            team2 = cur_rd.iloc[i + 1] if i + 1 < len(cur_rd) else None

            if team2 is None:
                next_rd_teams.append(team1)  # Odd number of teams, auto-advance
                cur_rd.loc[i, "WINNER"] = team1["TEAM"]
                continue

            matchup_features = create_matchup_features(team1, team2,df.columns)
            x_cur_rd = matchup_features[features]
            
            team1_win_prob = model.predict_proba([x_cur_rd])[0][1]
            team2_win_prob = 1 - team1_win_prob
               
            if  team1_win_prob > team2_win_prob:
                winner = team1
                win[i] = 1
            else:
                winner = team2
                win[i+1]=1
            winner['CURRENT ROUND'] = rd // 2
            
            next_rd_teams.append(winner)
            
            cur_rd['WINNER'] = win

        sim[f'{rd}'] = cur_rd[['TEAM','CONF','CURRENT ROUND','WINNER']].copy()
        df = pd.DataFrame(next_rd_teams)

    return sim

Trying a Random Grid Search to see if it has similar performance as previous search took 5 hrs to run.

In [56]:
parameter_grid = {
    'n_estimators':[200,500,1000],
    'max_depth':np.arange(5, 11, 1),
    'min_child_weight':np.arange(0, 7, 1),
    'colsample_bytree':np.arange(0.5, 1.1, .1)
}

boost = XGBClassifier(n_estimators=200,learning_rate=.01,booster='gbtree',early_stopping_rounds=20,random_state=42)

ran_grid_search = RandomizedSearchCV(boost,param_distributions=parameter_grid,n_iter=50,cv=5,scoring='roc_auc')

ran_rest = ran_grid_search.fit(x_train,y_train,eval_set=[(x_test, y_test)],verbose=False)

35 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-

In [57]:
print(ran_rest.best_params_)

best_ran_xg = XGBClassifier(n_estimators=200,learning_rate=.01,booster='gbtree',
                        colsample_bytree=0.6,min_child_weight=4,max_depth=5,random_state=42)
best_ran_xg.fit(x_train,y_train)
pred = best_ran_xg.predict(x_test)

confusion_matrix(y_test.values,pred,labels=[1,0])

{'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 5, 'colsample_bytree': 0.6}


array([[142,  47],
       [ 49, 140]])

In [58]:
(142+143)/(142+143+47+46)

0.753968253968254

In [59]:
sim = sim_tourney(s24,best_ran_xg)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [60]:
sim['2']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Connecticut,BE,2,1
42,Kentucky,SEC,2,0


In [62]:
imp = best_ran_xg.feature_importances_

sorted_idx = np.argsort(imp)

imp[sorted_idx],predictors[sorted_idx]

(array([0.00482178, 0.00535365, 0.00544822, 0.00563866, 0.00571649,
        0.00574301, 0.0057927 , 0.00612575, 0.00625169, 0.00625307,
        0.00640987, 0.00653989, 0.00655165, 0.00657166, 0.00663797,
        0.00668739, 0.00686324, 0.00686755, 0.00696196, 0.00708533,
        0.00710199, 0.00723315, 0.00730604, 0.00734729, 0.0074229 ,
        0.00744169, 0.00751685, 0.00755208, 0.007558  , 0.00755906,
        0.00756397, 0.00767391, 0.0077598 , 0.00784195, 0.00790525,
        0.00796676, 0.00799543, 0.00804781, 0.00808444, 0.00825292,
        0.00829465, 0.00836365, 0.00846878, 0.00850073, 0.00851561,
        0.00856056, 0.00869858, 0.00875136, 0.00880739, 0.00908039,
        0.00912259, 0.0092042 , 0.00920837, 0.00950912, 0.00970556,
        0.00982064, 0.01001146, 0.01024133, 0.01096581, 0.01113669,
        0.01137183, 0.01249821, 0.01328517, 0.01389585, 0.014504  ,
        0.01612008, 0.02158276, 0.02336351, 0.02743217, 0.02777286,
        0.02854432, 0.0304522 , 0.04112922, 0.08

## Optional: Trying adding supplemental features to model
Although from the results of the model it seems to do an incredible job at predicting the winner of each game let's see if we can slightly improve model performance by adding these supplemental data:
- Conference statistics
- Seed statistics
- Coach statistics 

## Conference

Starting with Conference:

In [12]:
temp = comp_team_stats.copy()
temp = temp.drop_duplicates(subset=['YEAR', 'TEAM NO'],keep='first')
temp = temp.drop(['TEAM NO','TEAM','CONF','MATCHUP ID','CURRENT ROUND'],axis=1)
grouped_conf = temp.groupby(['YEAR','CONF ID'])
grouped_conf = grouped_conf.mean()

In [13]:
conf_perf = conf_perf.copy().drop(['GAMES','W','L'],axis=1)
conf_perf.head()

Unnamed: 0,CONF ID,CONF,PAKE,PAKE RANK,PASE,PASE RANK,WIN%,R64,R32,S16,E8,F4,F2,CHAMP,TOP2,CHAMP%
0,1,A10,-1.5,24,-5.8,29,0.4,48,23,7,2,0,0,0,0,9.00%
1,2,ACC,17.5,1,12.4,1,0.636,95,65,43,25,12,7,5,27,95.30%
2,3,AE,-1.8,25,-0.9,24,0.059,16,1,0,0,0,0,0,0,0.00%
3,4,Amer,-5.3,30,-0.6,21,0.54,24,14,6,3,2,1,1,3,51.20%
4,5,ASun,0.2,13,1.2,11,0.2,16,3,1,0,0,0,0,0,0.50%


In [14]:
comp_conf = []

yrs = [2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2021,2022,2023,2024]

for yr in yrs:
    conf_yr = grouped_conf.xs(yr, level='YEAR').drop(['WIN'],axis=1)
    conf_yr = conf_yr.merge(conf_perf.loc[:,['CONF','CONF ID','WIN%']], on='CONF ID', how='left')  
    conf_yr['YEAR'] = yr  
    comp_conf.append(conf_yr)
    
comp_conf = pd.concat(comp_conf).reset_index()
#reordering columns
comp_conf = comp_conf.loc[:,['YEAR','CONF','CONF ID','SEED','ROUND','K OFF','K DEF','AVG HGT','EFF HGT', 'BADJ O_A', 'BADJ D_A', 'WIN%_A', 'EFG%_A', 'EFG%D_A',
       'PPPO_A', 'PPPD_A', 'BADJ O_H', 'BADJ D_H', 'WIN%_H', 'EFG%_H',
       'EFG%D_H', 'PPPO_H', 'PPPD_H', 'BADJ O_N', 'BADJ D_N', 'WIN%_N',
       'EFG%_N', 'EFG%D_N', 'PPPO_N', 'PPPD_N', 'TR RATING', 'V 1-25 WINS',
       'V 1-25 LOSS', 'V 26-50 WINS', 'V 26-50 LOSS', 'LUCK RATING',
       'CONSISTENCY TR RATING', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'Q1 W',
       'Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE', 'BADJ EM', 'FTR', 'FTRD',
       'BADJ T', 'TOV%', 'TOV%D', 'OREB%', 'OP OREB%', 'RAW T', 'WAB', '2PT%',
       '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%', 'AST%', 'OP AST%', '2PTR',
       '3PTR', '2PTRD', '3PTRD', 'ELITE SOS', 'PAKE', 'PASE','WIN%', 'R64', 'R32',
       'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2']]

Will now merge these conference stats onto the team matchups dataset but will only look at these for the stats and then will similarly feature engineer by matchup with these conference data

In [15]:
temp = comp_team_stats.copy()
conf_matchup = temp.iloc[:,[0,1,2,3,4,5,7,8]]

#replace P10 to be P12 for consistency
conf_matchup.loc[conf_matchup['CONF']=='P10','CONF'] = 'P12'

conf_matchup = conf_matchup.merge(comp_conf.drop('CONF',axis=1),on=['YEAR','CONF ID'],how='left')
conf_matchup.head()

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF,CONF ID,MATCHUP ID,CURRENT ROUND,WIN,SEED,ROUND,...,PASE,WIN%,R64,R32,S16,E8,F4,F2,CHAMP,TOP2
0,2024,1067,Connecticut,BE,8,0,64,1,2.0,11.0,...,3.966667,0.603,10.0,6.666667,4.0,2.333333,1.666667,1.333333,1.333333,1.333333
1,2024,1026,Stetson,ASun,5,0,64,0,16.0,64.0,...,0.0,0.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024,1060,Florida Atlantic,Amer,4,1,64,0,10.0,64.0,...,1.25,0.54,2.5,1.0,0.5,0.5,0.5,0.0,0.0,0.0
3,2024,1036,Northwestern,B10,6,1,64,1,5.833333,33.666667,...,1.016667,0.584,9.166667,7.0,4.0,1.666667,1.166667,0.5,0.0,1.666667
4,2024,1029,San Diego St.,MWC,20,2,64,1,8.8,48.0,...,-1.92,0.354,6.2,2.2,1.0,0.2,0.2,0.2,0.0,0.2


In [16]:
diff_conf_matchup = differenced_matchup(conf_matchup)
diff_conf_matchup.head()

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF,CONF ID,MATCHUP ID,CURRENT ROUND,WIN,SEED,ROUND,...,PASE,WIN%,R64,R32,S16,E8,F4,F2,CHAMP,TOP2
0,2024,1067,Connecticut,BE,8,0,64,1,-14.0,-53.0,...,3.966667,0.403,9.0,6.666667,4.0,2.333333,1.666667,1.333333,1.333333,1.333333
1,2024,1026,Stetson,ASun,5,0,64,0,14.0,53.0,...,-3.966667,-0.403,-9.0,-6.666667,-4.0,-2.333333,-1.666667,-1.333333,-1.333333,-1.333333
2,2024,1036,Northwestern,B10,6,1,64,1,-4.166667,-30.333333,...,-0.233333,0.044,6.666667,6.0,3.5,1.166667,0.666667,0.5,0.0,1.666667
3,2024,1060,Florida Atlantic,Amer,4,1,64,0,4.166667,30.333333,...,0.233333,-0.044,-6.666667,-6.0,-3.5,-1.166667,-0.666667,-0.5,-0.0,-1.666667
4,2024,1029,San Diego St.,MWC,20,2,64,1,-1.2,-16.0,...,-3.17,-0.186,3.7,1.2,0.5,-0.3,-0.3,0.2,0.0,0.2


Now model hyperparameter finding and model fitting with this new data

In [231]:
predictors = diff_conf_matchup.copy().loc[:, 'WIN':].drop('WIN', axis=1).columns
target = 'WIN'

#test on 2024 season 
#will use raw data and then take differences as matchups update during simulation
s24 = conf_matchup.loc[(conf_matchup['YEAR']==2024)&(conf_matchup['CURRENT ROUND']==64),:]
s24 = s24.loc[:,['YEAR', 'TEAM NO', 'TEAM', 'CONF', 'CONF ID', 'MATCHUP ID', 'CURRENT ROUND', 'WIN', 'SEED',
       'ROUND','K OFF', 'K DEF', 'AVG HGT', 'EFF HGT',
       'BADJ O_A', 'BADJ D_A', 'WIN%_A', 'EFG%_A', 'EFG%D_A', 'PPPO_A',
       'PPPD_A', 'BADJ O_H', 'BADJ D_H', 'WIN%_H', 'EFG%_H', 'EFG%D_H',
       'PPPO_H', 'PPPD_H', 'BADJ O_N', 'BADJ D_N', 'WIN%_N', 'EFG%_N',
       'EFG%D_N', 'PPPO_N', 'PPPD_N', 'TR RATING', 'V 1-25 WINS',
       'V 1-25 LOSS', 'V 26-50 WINS', 'V 26-50 LOSS', 'LUCK RATING',
       'CONSISTENCY TR RATING', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'Q1 W',
       'Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE', 'BADJ EM', 'FTR', 'FTRD',
       'BADJ T', 'TOV%', 'TOV%D', 'OREB%', 'OP OREB%', 'RAW T', 'WAB', '2PT%',
       '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%', 'AST%', 'OP AST%', '2PTR',
       '3PTR', '2PTRD', '3PTRD', 'ELITE SOS', 'PAKE', 'PASE','WIN%','R64', 'R32',
       'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2']]

#before 24 season
bf_24 = diff_conf_matchup.loc[diff_conf_matchup['YEAR']<2024]

#splitting on MATCHUP ID in order to ensure that the same matchups are split together
train_groups, test_groups = train_test_split(bf_24['MATCHUP ID'].unique(),test_size = .2, random_state=42)

x_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][predictors]
y_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][target]

x_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][predictors]
y_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][target]

In [None]:
parameter_grid = {
    'n_estimators':[200,500,1000],
    'max_depth':np.arange(5, 11, 1),
    'min_child_weight':np.arange(0, 7, 1),
    'colsample_bytree':np.arange(0.5, 1.1, .1)
}

boost = XGBClassifier(n_estimators=200,learning_rate=.01,booster='gbtree',early_stopping_rounds=20,random_state=42)

ran_grid_search = RandomizedSearchCV(boost,param_distributions=parameter_grid,n_iter=50,cv=5,scoring='roc_auc')

ran_rest = ran_grid_search.fit(x_train,y_train,eval_set=[(x_test, y_test)],verbose=False)

55 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-

In [141]:
best_model_conf = ran_rest.best_estimator_
pred = best_model_conf.predict(x_test)
confusion_matrix(y_test.values,pred,labels=[1,0])

array([[128,  61],
       [ 58, 131]])

See how model does with just conference data

In [142]:
sim = sim_tourney(s24,best_model_conf)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [148]:
sim['4']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Connecticut,BE,4,0
24,Clemson,ACC,4,1
41,North Carolina St.,ACC,4,1
58,Creighton,BE,4,0


Will create new var that use the prediction probability 

In [232]:
r64_data = diff_conf_matchup.loc[diff_conf_matchup['CURRENT ROUND'] == 64].copy()

r64_data['CONF PRED'] = best_model_conf.predict_proba(r64_data[predictors])[:, 1]  # Taking the win probability

conf_pred_df = r64_data[['YEAR', 'TEAM NO', 'CONF PRED']]

temp = comp_team_stats.copy()

temp = temp.merge(conf_pred_df, on=['YEAR', 'TEAM NO'], how='left')

#Fill missing values if any (for safety)
temp['CONF PRED'] = temp['CONF PRED'].fillna(0)

diff_team_n_conf = differenced_matchup(temp)

n_predictors = diff_team_n_conf.iloc[:,10:].columns

#before 24 season
bf_24 = temp.loc[temp['YEAR']<2024]

#splitting on MATCHUP ID in order to ensure that the same matchups are split together
train_groups, test_groups = train_test_split(bf_24['MATCHUP ID'].unique(),test_size = .2, random_state=42)

x_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][n_predictors]
y_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][target]

x_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][n_predictors]
y_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][target]

In [233]:
boost = XGBClassifier(n_estimators=200,learning_rate=.01,booster='gbtree',early_stopping_rounds=20,reg_lambda=10,reg_alpha=5,random_state=42)

ran_grid_search = RandomizedSearchCV(boost,param_distributions=parameter_grid,n_iter=50,cv=5,scoring='roc_auc')

ran_rest = ran_grid_search.fit(x_train,y_train,eval_set=[(x_test, y_test)],verbose=False)

35 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-

Testing on 2024

In [234]:
best_mod = ran_rest.best_estimator_

s24 = temp.loc[(temp['YEAR']==2024)&(temp['CURRENT ROUND']==64),:]
s24 = s24.merge(conf_matchup.loc[conf_matchup['CURRENT ROUND']==64,['TEAM NO','WIN%']],on=['TEAM NO'],how='left')
s24 = s24.loc[:,['YEAR', 'TEAM NO', 'TEAM', 'CONF', 'CONF ID', 'MATCHUP ID', 'CURRENT ROUND', 'SEED',
       'ROUND','WIN', 'K OFF', 'K DEF', 'AVG HGT', 'EFF HGT',
       'BADJ O_A', 'BADJ D_A', 'WIN%_A', 'EFG%_A', 'EFG%D_A', 'PPPO_A',
       'PPPD_A', 'BADJ O_H', 'BADJ D_H', 'WIN%_H', 'EFG%_H', 'EFG%D_H',
       'PPPO_H', 'PPPD_H', 'BADJ O_N', 'BADJ D_N', 'WIN%_N', 'EFG%_N',
       'EFG%D_N', 'PPPO_N', 'PPPD_N', 'TR RATING', 'V 1-25 WINS',
       'V 1-25 LOSS', 'V 26-50 WINS', 'V 26-50 LOSS', 'LUCK RATING',
       'CONSISTENCY TR RATING', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'Q1 W',
       'Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE', 'BADJ EM', 'FTR', 'FTRD',
       'BADJ T', 'TOV%', 'TOV%D', 'OREB%', 'OP OREB%', 'RAW T', 'WAB', '2PT%',
       '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%', 'AST%', 'OP AST%', '2PTR',
       '3PTR', '2PTRD', '3PTRD', 'ELITE SOS', 'PAKE', 'PASE','R64', 'R32',
       'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2','CONF PRED']]


Reworking function for the new data

In [236]:

sim = sim_tourney(s24,best_mod)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [237]:
sim['2']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
30,Arizona,P12,2,0
62,Tennessee,SEC,2,1


Can see the historical conference data might be to heavyily weighted

## Seeds

Next we'll look at historical performance by different seeds

In [17]:
temp = comp_team_stats.copy()
temp = temp.drop_duplicates(subset=['YEAR', 'TEAM NO'],keep='first')
temp = temp.drop(['TEAM NO','TEAM','CONF','CONF ID','MATCHUP ID','CURRENT ROUND'],axis=1)
grouped_seed = temp.groupby(['YEAR','SEED'])
grouped_seed = grouped_seed.mean()

comp_sd = []

for yr in yrs:
    conf_yr = grouped_seed.xs(yr, level='YEAR').drop(['WIN'],axis=1)
    conf_yr = conf_yr.merge(seed_res.loc[:,['SEED','WIN%']], on='SEED', how='left')  
    conf_yr['YEAR'] = yr  
    comp_sd.append(conf_yr)
    
comp_sd = pd.concat(comp_sd).reset_index()
comp_sd = comp_sd.loc[:,['YEAR','SEED','ROUND', 'K OFF', 'K DEF', 'AVG HGT', 'EFF HGT',
       'BADJ O_A', 'BADJ D_A', 'WIN%_A', 'EFG%_A', 'EFG%D_A', 'PPPO_A',
       'PPPD_A', 'BADJ O_H', 'BADJ D_H', 'WIN%_H', 'EFG%_H', 'EFG%D_H',
       'PPPO_H', 'PPPD_H', 'BADJ O_N', 'BADJ D_N', 'WIN%_N', 'EFG%_N',
       'EFG%D_N', 'PPPO_N', 'PPPD_N', 'TR RATING', 'V 1-25 WINS',
       'V 1-25 LOSS', 'V 26-50 WINS', 'V 26-50 LOSS', 'LUCK RATING',
       'CONSISTENCY TR RATING', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'Q1 W',
       'Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE', 'BADJ EM', 'FTR', 'FTRD',
       'BADJ T', 'TOV%', 'TOV%D', 'OREB%', 'OP OREB%', 'RAW T', 'WAB', '2PT%',
       '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%', 'AST%', 'OP AST%', '2PTR',
       '3PTR', '2PTRD', '3PTRD', 'ELITE SOS', 'PAKE', 'PASE', 'R64', 'R32',
       'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2', 'WIN%']]

temp = comp_team_stats.copy()
seed_matchup = temp.iloc[:,[0,1,2,3,5,7,8,9]]

seed_matchup = seed_matchup.merge(comp_sd,on=['YEAR','SEED'],how='left')
diff_seed_matchup = differenced_matchup(seed_matchup)
diff_seed_matchup.head()

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF,MATCHUP ID,CURRENT ROUND,WIN,SEED,ROUND,K OFF,...,PASE,R64,R32,S16,E8,F4,F2,CHAMP,TOP2,WIN%
0,2024,1067,Connecticut,BE,0,64,1,-15,-55.25,13.7365,...,5.025,10.0,8.75,6.75,4.0,3.0,2.25,1.5,4.25,0.768
1,2024,1026,Stetson,ASun,0,64,0,15,55.25,-13.7365,...,-5.025,-10.0,-8.75,-6.75,-4.0,-3.0,-2.25,-1.5,-4.25,-0.768
2,2024,1036,Northwestern,B10,1,64,1,1,-16.0,-1.623,...,3.375,3.75,5.25,2.5,1.0,0.75,0.25,0.0,1.0,-0.038
3,2024,1060,Florida Atlantic,Amer,1,64,0,-1,16.0,1.623,...,-3.375,-3.75,-5.25,-2.5,-1.0,-0.75,-0.25,-0.0,-1.0,0.038
4,2024,1029,San Diego St.,MWC,2,64,1,-7,-8.0,-0.042,...,1.4,10.25,8.25,5.5,2.0,1.25,1.0,0.0,2.25,0.175


In [15]:
predictors = diff_seed_matchup.copy().loc[:, 'WIN':].drop('WIN', axis=1).columns
target = 'WIN'

#before 24 season
bf_24 = diff_seed_matchup.loc[diff_seed_matchup['YEAR']<2024]

#splitting on MATCHUP ID in order to ensure that the same matchups are split together
train_groups, test_groups = train_test_split(bf_24['MATCHUP ID'].unique(),test_size = .2, random_state=42)

x_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][predictors]
y_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][target]

x_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][predictors]
y_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][target]

In [16]:
#test on 2024 season 
#will use raw data and then take differences as matchups update during simulation
s24 = seed_matchup.loc[(seed_matchup['YEAR']==2024)&(seed_matchup['CURRENT ROUND']==64),:]
s24 = s24.loc[:,['YEAR', 'TEAM NO', 'TEAM', 'CONF','CURRENT ROUND', 'WIN', 'SEED',
       'ROUND','K OFF', 'K DEF', 'AVG HGT', 'EFF HGT',
       'BADJ O_A', 'BADJ D_A', 'WIN%_A', 'EFG%_A', 'EFG%D_A', 'PPPO_A',
       'PPPD_A', 'BADJ O_H', 'BADJ D_H', 'WIN%_H', 'EFG%_H', 'EFG%D_H',
       'PPPO_H', 'PPPD_H', 'BADJ O_N', 'BADJ D_N', 'WIN%_N', 'EFG%_N',
       'EFG%D_N', 'PPPO_N', 'PPPD_N', 'TR RATING', 'V 1-25 WINS',
       'V 1-25 LOSS', 'V 26-50 WINS', 'V 26-50 LOSS', 'LUCK RATING',
       'CONSISTENCY TR RATING', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'Q1 W',
       'Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE', 'BADJ EM', 'FTR', 'FTRD',
       'BADJ T', 'TOV%', 'TOV%D', 'OREB%', 'OP OREB%', 'RAW T', 'WAB', '2PT%',
       '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%', 'AST%', 'OP AST%', '2PTR',
       '3PTR', '2PTRD', '3PTRD', 'ELITE SOS', 'PAKE', 'PASE','WIN%','R64', 'R32',
       'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2']]

In [205]:
parameter_grid = {
    'max_depth':np.arange(5, 11, 1),
    'min_child_weight':np.arange(0, 7, 1),
    'colsample_bytree':np.arange(0.5, 1.1, .1)
}

boost = XGBClassifier(n_estimators=200,learning_rate=.01,booster='gbtree',early_stopping_rounds=20,random_state=42)

ran_grid_search = RandomizedSearchCV(boost,param_distributions=parameter_grid,n_iter=50,cv=5,scoring='roc_auc')

ran_rest = ran_grid_search.fit(x_train,y_train,eval_set=[(x_test, y_test)],verbose=False)

35 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-

In [206]:
best_model_seed = ran_rest.best_estimator_
pred = best_model_seed.predict(x_test)
confusion_matrix(y_test.values,pred,labels=[1,0])

array([[135,  54],
       [ 51, 138]])

In [207]:
sim = sim_tourney(s24,best_model_seed)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [208]:
sim['2']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Connecticut,BE,2,1
32,Houston,B12,2,0


Adding seed predicted probas as a feature before testing on 2024 season

In [212]:
r64_data = diff_seed_matchup.loc[diff_seed_matchup['CURRENT ROUND'] == 64].copy()

r64_data['SEED PRED'] = best_model_seed.predict_proba(r64_data[predictors])[:, 1]  # Taking the win probability

seed_pred_df = r64_data[['YEAR', 'TEAM NO', 'SEED PRED']]

temp = comp_team_stats.copy()

temp = temp.merge(seed_pred_df, on=['YEAR', 'TEAM NO'], how='left')

#Fill missing values if any (for safety)
temp['SEED PRED'] = temp['SEED PRED'].fillna(0)

diff_team_n_seed = differenced_matchup(temp)

n_predictors = diff_team_n_seed.iloc[:,10:].columns
target = 'WIN'

#before 24 season
bf_24 = temp.loc[temp['YEAR']<2024]

#splitting on MATCHUP ID in order to ensure that the same matchups are split together
train_groups, test_groups = train_test_split(bf_24['MATCHUP ID'].unique(),test_size = .2, random_state=42)

x_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][n_predictors]
y_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][target]

x_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][n_predictors]
y_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][target]

In [213]:
boost = XGBClassifier(n_estimators=200,learning_rate=.01,booster='gbtree',early_stopping_rounds=20,random_state=42)

ran_grid_search = RandomizedSearchCV(boost,param_distributions=parameter_grid,n_iter=50,cv=5,scoring='roc_auc')

ran_rest = ran_grid_search.fit(x_train,y_train,eval_set=[(x_test, y_test)],verbose=False)

50 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-

In [214]:
best_model_seed = ran_rest.best_estimator_
pred = best_model_seed.predict(x_test)
confusion_matrix(y_test.values,pred,labels=[1,0])

array([[128,  61],
       [ 78, 111]])

In [None]:
sim = sim_tourney(s24,best_model_seed)

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Connecticut,BE,4,1
16,North Carolina,ACC,4,0
32,Houston,B12,4,1
48,Purdue,B10,4,0


In [None]:
sim['2']

## Coaching

Next what came out to be much more labor intensive due to the nature of the data, Coaching:

Starting with cleaning up the provided coach stats dataset, either due to typos or column values not being integers

In [18]:
coach_perf = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Coach Results.csv")

In [19]:
coach_perf = coach_perf.drop(['F4%','CHAMP%'],axis=1)

steve_prohm = {
    "COACH ID": min(coach_perf.iloc[72,0],coach_perf.iloc[187,0],coach_perf.iloc[291,0]),  # Keep the lower ID
    "COACH": "Steve Prohm",  # Corrected name
    "PAKE": coach_perf.loc[[72, 187, 291], "PAKE"].mean(),
    "PAKE RANK": coach_perf.loc[[72, 187, 291], "PAKE RANK"].mean(),
    "PASE": coach_perf.loc[[72, 187, 291], "PASE"].mean(),
    "PASE RANK": coach_perf.loc[[72, 187, 291], "PASE RANK"].mean(),
    "GAMES": coach_perf.loc[[72, 187, 291], "GAMES"].sum(),
    "W": coach_perf.loc[[72, 187, 291], "W"].sum(),
    "L": coach_perf.loc[[72, 187, 291], "L"].sum(),
    "WIN%": coach_perf.loc[[72, 187, 291], "WIN%"].mean(),
    "R64": coach_perf.loc[[72, 187, 291], "R64"].sum(),
    "R32": coach_perf.loc[[72, 187, 291], "R32"].sum(),
    "S16": coach_perf.loc[[72, 187, 291], "S16"].sum(),
    "E8": coach_perf.loc[[72, 187, 291], "E8"].sum(),
    "F4": coach_perf.loc[[72, 187, 291], "F4"].sum(),
    "F2": coach_perf.loc[[72, 187, 291], "F2"].sum(),
    "CHAMP": coach_perf.loc[[72, 187, 291], "CHAMP"].sum(),
    "TOP2": coach_perf.loc[[72, 187, 291], "TOP2"].sum()
}

bob_marlin = {
    "COACH ID": min(coach_perf.iloc[164,0],coach_perf.iloc[215,0]),  # Keep the lower ID
    "COACH": "Bob Marlin",  # Corrected name
    "PAKE": coach_perf.loc[[164, 215], "PAKE"].mean(),
    "PAKE RANK": coach_perf.loc[[164, 215], "PAKE RANK"].mean(),
    "PASE": coach_perf.loc[[164, 215], "PASE"].mean(),
    "PASE RANK": coach_perf.loc[[164, 215], "PASE RANK"].mean(),
    "GAMES": coach_perf.loc[[164, 215], "GAMES"].sum(),
    "W": coach_perf.loc[[164, 215], "W"].sum(),
    "L": coach_perf.loc[[164, 215], "L"].sum(),
    "WIN%": coach_perf.loc[[164, 215], "WIN%"].mean(),
    "R64": coach_perf.loc[[164, 215], "R64"].sum(),
    "R32": coach_perf.loc[[164, 215], "R32"].sum(),
    "S16": coach_perf.loc[[164, 215], "S16"].sum(),
    "E8": coach_perf.loc[[164, 215], "E8"].sum(),
    "F4": coach_perf.loc[[164, 215], "F4"].sum(),
    "F2": coach_perf.loc[[164, 215], "F2"].sum(),
    "CHAMP": coach_perf.loc[[164, 215], "CHAMP"].sum(),
    "TOP2": coach_perf.loc[[164, 215], "TOP2"].sum()
}

kermit_davis = {
    "COACH ID": min(coach_perf.iloc[54,0],coach_perf.iloc[193,0]),  # Keep the lower ID
    "COACH": "Kermit Davis",  # Corrected name
    "PAKE": coach_perf.loc[[54, 193], "PAKE"].mean(),
    "PAKE RANK": coach_perf.loc[[54, 193], "PAKE RANK"].mean(),
    "PASE": coach_perf.loc[[54, 193], "PASE"].mean(),
    "PASE RANK": coach_perf.loc[[54, 193], "PASE RANK"].mean(),
    "GAMES": coach_perf.loc[[54, 193], "GAMES"].sum(),
    "W": coach_perf.loc[[54, 193], "W"].sum(),
    "L": coach_perf.loc[[54, 193], "L"].sum(),
    "WIN%": coach_perf.loc[[54, 193], "WIN%"].mean(),
    "R64": coach_perf.loc[[54, 193], "R64"].sum(),
    "R32": coach_perf.loc[[54, 193], "R32"].sum(),
    "S16": coach_perf.loc[[54, 193], "S16"].sum(),
    "E8": coach_perf.loc[[54, 193], "E8"].sum(),
    "F4": coach_perf.loc[[54, 193], "F4"].sum(),
    "F2": coach_perf.loc[[54, 193], "F2"].sum(),
    "CHAMP": coach_perf.loc[[54, 193], "CHAMP"].sum(),
    "TOP2": coach_perf.loc[[54, 193], "TOP2"].sum()
}

edward_joyner = {
    "COACH ID": min(coach_perf.iloc[101,0],coach_perf.iloc[148,0]),  # Keep the lower ID
    "COACH": "Edward Joyner",  # Corrected name
    "PAKE": coach_perf.loc[[101, 148], "PAKE"].mean(),
    "PAKE RANK": coach_perf.loc[[101, 148], "PAKE RANK"].mean(),
    "PASE": coach_perf.loc[[101, 148], "PASE"].mean(),
    "PASE RANK": coach_perf.loc[[101, 148], "PASE RANK"].mean(),
    "GAMES": coach_perf.loc[[101, 148], "GAMES"].sum(),
    "W": coach_perf.loc[[101, 148], "W"].sum(),
    "L": coach_perf.loc[[101, 148], "L"].sum(),
    "WIN%": coach_perf.loc[[101, 148], "WIN%"].mean(),
    "R64": coach_perf.loc[[101, 148], "R64"].sum(),
    "R32": coach_perf.loc[[101, 148], "R32"].sum(),
    "S16": coach_perf.loc[[101, 148], "S16"].sum(),
    "E8": coach_perf.loc[[101, 148], "E8"].sum(),
    "F4": coach_perf.loc[[101, 148], "F4"].sum(),
    "F2": coach_perf.loc[[101, 148], "F2"].sum(),
    "CHAMP": coach_perf.loc[[101, 148], "CHAMP"].sum(),
    "TOP2": coach_perf.loc[[101, 148], "TOP2"].sum()
}

russell_turner = {
    "COACH ID": min(coach_perf.iloc[58,0],coach_perf.iloc[181,0]),  # Keep the lower ID
    "COACH": "Russell Turner",  # Corrected name
    "PAKE": coach_perf.loc[[58, 181], "PAKE"].mean(),
    "PAKE RANK": coach_perf.loc[[58, 181], "PAKE RANK"].mean(),
    "PASE": coach_perf.loc[[58, 181], "PASE"].mean(),
    "PASE RANK": coach_perf.loc[[58, 181], "PASE RANK"].mean(),
    "GAMES": coach_perf.loc[[58, 181], "GAMES"].sum(),
    "W": coach_perf.loc[[58, 181], "W"].sum(),
    "L": coach_perf.loc[[58, 181], "L"].sum(),
    "WIN%": coach_perf.loc[[58, 181], "WIN%"].mean(),
    "R64": coach_perf.loc[[58, 181], "R64"].sum(),
    "R32": coach_perf.loc[[58, 181], "R32"].sum(),
    "S16": coach_perf.loc[[58, 181], "S16"].sum(),
    "E8": coach_perf.loc[[58, 181], "E8"].sum(),
    "F4": coach_perf.loc[[58, 181], "F4"].sum(),
    "F2": coach_perf.loc[[58, 181], "F2"].sum(),
    "CHAMP": coach_perf.loc[[58, 181], "CHAMP"].sum(),
    "TOP2": coach_perf.loc[[58, 181], "TOP2"].sum()
}

david_richman = {
    "COACH ID": min(coach_perf.iloc[119,0],coach_perf.iloc[142,0]),  # Keep the lower ID
    "COACH": "David Richman",  # Corrected name
    "PAKE": coach_perf.loc[[119, 142], "PAKE"].mean(),
    "PAKE RANK": coach_perf.loc[[119, 142], "PAKE RANK"].mean(),
    "PASE": coach_perf.loc[[119, 142], "PASE"].mean(),
    "PASE RANK": coach_perf.loc[[119, 142], "PASE RANK"].mean(),
    "GAMES": coach_perf.loc[[119, 142], "GAMES"].sum(),
    "W": coach_perf.loc[[119, 142], "W"].sum(),
    "L": coach_perf.loc[[119, 142], "L"].sum(),
    "WIN%": coach_perf.loc[[119, 142], "WIN%"].mean(),
    "R64": coach_perf.loc[[119, 142], "R64"].sum(),
    "R32": coach_perf.loc[[119, 142], "R32"].sum(),
    "S16": coach_perf.loc[[119, 142], "S16"].sum(),
    "E8": coach_perf.loc[[119, 142], "E8"].sum(),
    "F4": coach_perf.loc[[119, 142], "F4"].sum(),
    "F2": coach_perf.loc[[119, 142], "F2"].sum(),
    "CHAMP": coach_perf.loc[[119, 142], "CHAMP"].sum(),
    "TOP2": coach_perf.loc[[119, 142], "TOP2"].sum()
}

coach_perf.loc[coach_perf['COACH']=="Murray Bartow",'COACH'] = 'Murry Bartow'

coach_perf = coach_perf.drop([54, 58, 72, 101, 119, 142, 148, 164, 181, 187, 193, 215, 291])

# Reset index
coach_perf = coach_perf.reset_index(drop=True)

# Add the new combined row
coach_perf = pd.concat([coach_perf, pd.DataFrame([steve_prohm,kermit_davis,bob_marlin,edward_joyner,russell_turner,david_richman])], ignore_index=True)

coach_perf

Unnamed: 0,COACH ID,COACH,PAKE,PAKE RANK,PASE,PASE RANK,GAMES,W,L,WIN%,R64,R32,S16,E8,F4,F2,CHAMP,TOP2
0,1,Roy Williams,8.20,1.0,5.50,6.0,44,34,10,0.7730,12,11,8,6,4,3,2,8
1,2,Tom Izzo,8.00,2.0,9.60,1.0,47,32,15,0.6810,15,13,9,5,4,1,0,4
2,3,Brad Stevens,6.50,3.0,7.60,3.0,17,12,5,0.7060,5,4,2,2,2,2,0,0
3,4,John Calipari,6.20,4.0,8.60,2.0,52,39,13,0.7500,14,12,10,8,5,3,1,8
4,5,Dana Altman,6.10,5.0,6.70,4.0,24,16,8,0.6670,8,8,5,2,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,55,Kermit Davis,0.25,124.5,0.35,118.5,5,2,3,0.4165,3,2,0,0,0,0,0,0
308,165,Bob Marlin,-0.30,190.5,-0.30,194.0,3,0,3,0.0000,3,0,0,0,0,0,0,0
309,102,Edward Joyner,-0.05,125.5,0.00,120.5,3,0,3,0.0000,3,0,0,0,0,0,0,0
310,59,Russell Turner,0.25,120.5,0.25,130.5,3,1,2,0.2500,2,1,0,0,0,0,0,0


This data is missing what years and teams a coach was coaching in order to merge onto the team statisitics dataset. Below is the painstaking process of manually entering the correct information for this dataset:

In [20]:
coach_perf['TEAMS'] = [['North Carolina'],['Michigan St.'],['Butler'],['Memphis','Kentucky','Arkansas'],['Creighton','Oregon'],['Syracuse'],['Michigan'],
                      ['Villanova'],['Nevada','Arkansas','USC'],['Connecticut'],['Loyola Chicago','Oklahoma'],['Kansas St.','South Carolina','Massachusetts'],
                      ['Wagner','Rhode Island','Connecticut'],['Kansas'],['George Mason','Miami FL'],['North Carolina'],['Gonzaga'],['Little Rock','Texas Tech','Texas','Missisippi'],
                      ['Florida'],['Connecticut',"Saint Joseph's"],["Saint Peter's",'Seton Hall'],['Florida Atlantic','Michigan'],['Louisville','Iona',"St. John's"],['Duke'],
                      ['Xavier','Arizona','Xavier'],['Wisconsin'],['Ohio','Illinois','Akron'],['Dayton','Indiana','Rhode Island'],['San Diego St.'],
                      ['Oklahoma','Pittsburgh'],['Florida Gulf Coast','USC','SMU'],['Oral Roberts','Wichita St.'],['Michigan'],['Stanford','UCF'],['Murray St.','Texas A&M'],['La Salle'],
                      ['Kansas St.'],['UNC Wilmington','North Carolina St.'],['Western Kentucky','South Carolina','Northern Kentucky'],['Princeton'],['Florida St.'],['Grand Canyon'],['Richmond'],
                      ['Montana','Oregon St.'],['Florida','Georgia'],['Buffalo','Alabama'],['Xavier','Louisville','College of Charleston'],['Alabama','North Carolina St.','Cal St. Northridge'],
                      ['Cornell','Boston College','Penn'],['Fairleigh Dickinson','Iona'],['Wright St.','Clemson'],['Northwestern'],['Norfolk St.'],['Mercer'],['UAB','Stanford'],
                      ['Yale'],['Marshall'],['Morehead St.','Southern Miss','Tennessee'],['Western Kentucky'],['Abilene Christian','UTEP'],['Dayton','Georgia Tech','South Florida'],
                      ['San Diego'],['Furman'],['Duke'],['James Madison','Vanderbilt'],['Iowa St.','Creighton'],['Murray St.','LSU'],['Stony Brook','Ohio'],['Stanford','LSU','TCU'],
                      ['Northern Iowa'],['Lehigh'],['Alabama'],['Cleveland St.','Missouri'],['Washington'],['Arkansas'],
                      ['Hawaii'],['Cleveland St.'],['Georgia St.'],['Liberty','Liberty'],['Washington'],['Penn St.','Notre Dame'],['Oakland'],['Marquette','Virginia Tech','Texas A&M'],
                      ['Fresno St.','UTEP','Texas'],['Washington St.','Stanford'],['Saint Louis'],['North Dakota St.','Ohio'],['Georgia Tech','George Mason'],['Harvard'],['Baylor'],
                      ['Missouri','Arkansas',"St. John's"],['Stony Brook','Rutgers'],['Montana St.','Utah St.','Washington'],['South Florida','Eastern Michigan'],['Utah'],
                      ['Chattanooga','Central Arkansas'],['UTSA'],['UT Arlington','Troy'],['Mississippi St.','Western Kentucky'],['Arkansas Pine Bluff','Mississippi Valley St.'],
                      ['Lafayette'],['Southern Miss','Colorado St.'],['Mississippi Valley St.'],['Northwestern','Holy Cross'],['North Carolina A&T'],['Robert Morris'],['Radford'],
                      ['Grambling St.'],['Wagner'],['Marshall','UCF','Stetson'],['Fairleigh Dickinson'],['Bradley','UC Davis'],['Gardner Webb'],['Hartford','Manhattan'],
                      ['Radford','UNC Greensboro'],["Mount St. Mary's",'Siena','George Washington'],['Wagner',"Saint Peter's"],['UNC Asheville'],['Bradley'],
                      ['Norfolk St.'],['Georgia St.','SMU','Rice'],['Howard'],['Texas Southern'],['Army','Drexel'],['Stetson','Lipscomb','Belmont'],['Milwaukee','Butler'],
                      ['UMBC','Utah St.','VCU'],['North Dakota'],['Texas A&M','Maryland'],['Southern'],['Troy'],['Cal Poly'],['UNC Asheville','Middle Tennessee'],['Detroit'],
                      ['Cal St. Northridge'],['Binghamton','Morgan St.'],['James Madison'],['UMBC'],['Boston University','Penn St.','Florida Gulf Coast'],['Northwestern St.'],
                      ['Boise St.'],['Austin Peay'],['Loyola MD'],['Coastal Carolina'],["Mount St. Mary's",'Holy Cross'],['Northern Colorado'],['Milwaukee'],['Longwood'],
                      ['Delaware'],['Northern Kentucky','Cincinnati'],['Cal St. Fullerton'],['Stephen F. Austin'],['Notre Dame'],['Akron','Duquesne'],['Texas A&M Corpus Chris','Western Kentucky','Oklahoma St.'],
                      ['East Tennessee St.','Wake Forest'],['Maryland'],['Eastern Kentucky'],['East Tennessee St.'],['Cal St. Fullerton'],['Pacific'],['Princeton','Fairfield'],['Manhattan'],
                      ['Winthrop'],['Florida Southern','Green Bay'],['Eastern Washington'],['Georgia St.','Cal St. Bakersfield'],['Chattanooga','Massachusetts'],['UNC Asheville'],['Western Michigan'],
                      ['Indiana St.'],['Delaware','North Carolina A&T'],['Wyoming'],['Robert Morris','Rutgers'],['Stephen F. Austin'],['Morgan St.'],["Saint Peter's"],['American'],
                      ['New Mexico St.','New Mexico'],['College of Charleston','Boston College'],['Kennesaw St.','South Florida'],['Oklahoma St.'],['Montana'],['Morehead St.'],['Tennessee','Auburn'],['Northeastern'],
                      ['Georgia','Cleveland St.'],['North Carolina Central'],['Army','Saint Louis'],['Houston'],['Tulsa','Wake Forest'],['Kentucky','Texas Tech','Ranger College','Tarleton St.'],['Portland St.','Washington St.'],
                      ['Weber St.'],['UC Santa Barbara'],['California'],['LIU Brooklyn','Dusquesne','UMBC'],['Samford'],['Western Kentucky','Jacksonville St.'],['Eastern Washington','Portland'],['Bucknell','New Hampshire'],
                      ['Florida Gulf Coast','East Carolina'],['Midland College','Arkansas St.','North Texas','Texas Tech'],['UNC Greensboro','Cincinnati'],['American','Old Dominion'],
                      ['Boston College','Kennesaw St.'],['Georgetown'],['Oral Roberts'],['South Alabama'],['Massachusetts','LIU Brooklyn'],['Albany'],['UNLV','Oklahoma'],['Old Dominion'],['South Dakota St.'],['New Mexico','UCLA','Nevada'],
                      ['Mississippi','UAB'],['Indiana'],['Texas Tech'],['UC Santa Barbara'],['Kent St.'],['Kent St.','TCU','Ohio','Boston College'],['UTEP','Auburn','Central Michigan'],['Oregon','Washington St.'],
                      ['Wake Forest'],['Wisconsin'],['Minnesota','Long Beach St.','Eastern Washington'],['Loyola Chicago'],['Vermont','George Washington'],['Arizona','USC'],['Penn St.','Navy'],['Marquette','Indiana','Georgia'],
                      ['Iona'],['South Dakota St.','UNLV','Iowa St.'],['North Texas','LSU','Texas Southern'],['Wichita St.'],['Bowling Green','New Mexico St.','Mississippi St.'],['Indiana'],['San Diego St.'],['UCLA','Mississippi St.'],
                      ['Furman','Drake','Colorado St.'],['Davidson'],['USC','UTEP'],["Saint Joseph's"],['Bucknell','George Mason','Holy Cross'],['Nevada','Georgia','California'],['Utah'],['Memphis'],['Rice','VCU','Penn St.'],
                      ['Colgate'],['SMU'],['Colorado St.','Nebraska','San Jose St.'],['New Mexico'],['Northen Colorado','Colorado'],['VCU','Alabama','Dayton'],['Chattanooga','South Carolina'],['South Dakota St.','Wright St.'],
                      ['Winthrop','College of Charleston','Louisville'],['FIU','Minnesota','New Mexico'],['Boise St.'],['Arizona St.','Santa Clara'],['West Virginia'],['Memphis','Georgia Tech'],['Fairfield','Providence','Georgetown'],['Valparaiso','Vanderbilt','Grand Canyon'],
                      ['South Dakota','Utah St.','Utah'],['Indiana','Houston'],["St. John's",'San Diego'],['Drake','Providence','Central Michigan'],['Butler','Ohio St.','Depaul'],['Chattanooga','VCU','LSU','McNeese St.'],
                      ['Cincinnati','UCLA'],['Drake','West Virginia'],['Stephen F. Austin','Oklahoma St.','Illinois'],['New Mexico St.','UNLV'],['Marquette'],['Buffalo','Arizona St.'],['SMU'],['Iowa St.','Nebraska'],
                      ['St. Bonaventure'],['Vermont'],['Minnesota','Texas Tech','Memphis'],['UNLV'],['San Francisco','Florida'],['Tennessee','California','Missouri'],['Iona','Seton Hall','Maryland'],
                      ['Utah St.'],['Ohio St.','Butler'],['BYU','Kentucky'],['Illinois','Kansas St.'],['Belmont'],['Miami FL','Missouri','Tulsa'],['BYU'],['Purdue'],['Temple','La Salle'],
                      ['Siena','Iowa'],['Vanderbilt','Pittsburgh'],['Wofford','Virginia Tech'],['VCU','Texas','Marquette'],['Arizona'],['Oklahoma St.','Saint Louis'],['Clemson'],["Saint Mary's"],['Pittsburgh','TCU'],['Georgetown'],['Texas','Tennessee'],['Washington St.','Virginia']
                      ,['Murray St.','Iowa St.','Murray St.'],['Middle Tennessee','Mississippi'],['Sam Houston St.','Louisiana Lafayette'],['Hampton'],['UC Irvine'],['North Dakota St.']]
coach_perf['YRS'] = [['2003-2021'],['1995-2025'],['2007-2013'],['2000-2009','2009-2024','2024-2025'],['1994-2010','2010-2025'],['1976-2023'],['2007-2019'],
                     ['2001-2022'],['2015-2019','2019-2024','2024-2025'],['2012-2018'],['2011-2021','2021-2025'],['2007-2012','2012-2022','2022-2025'],
                     ['2010-2012','2012-2018','2018-2025'],['2003-2025'],['1997-2011','2011-2024'],['2021-2025'],['1999-2025'],['2015-2016','2016-2021','2021-2022','2023-2025'],
                     ['1996-2015'],['1986-2012','2018-2021'],['2018-2022','2022-2025'],['2018-2024','2024-2025'],['2001-2017','2020-2023','2024-2025'],['1980-2022'],
                     ['2004-2009','2009-2021','2022-2025'],['2001-2015'],['2008-2012','2012-2017','2017-2025'],['2011-2017','2017-2021','2022-2025'],['2017-2025'],
                     ['2006-2011','2018-2025'],['2011-2013','2013-2024','2024-2025'],['2017-2023','2023-2025'],['2019-2024'],['2008-2016','2016-2025'],['2006-2011','2011-2019'],['2004-2018'],
                     ['2022-2025'],['2014-2017','2017-2025'],['2003-2008','2008-2012','2019-2025'],['2011-2025'],['2002-2025'],['2009-2013'],['2005-2025'],
                     ['2006-2014','2014-2025'],['2015-2022','2022-2025'],['2015-2019','2019-2025'],['2009-2018','2018-2022','2024-2025'],['1998-2009','2011-2017','2018-2021'],
                     ['2000-2010','2010-2014','2015-2025'],['2022-2023','2023-2025'],['2006-2010','2010-2025'],['2013-2025'],['2007-2013'],['2008-2019'],['2012-2016','2016-2024'],
                     ['1999-2025'],['2014-2024'],['2006-2012','2012-2014','2014-2015'],['2008-2012'],['2011-2021','2021-2025'],['2003-2011','2011-2016','2017-2023'],
                     ['2007-2015'],['2017-2025'],['2022-2025'],['2020-2024','2024-2025'],['2006-2010','2010-2025'],['2015-2022','2022-2025'],['2016-2019','2019-2025'],['2004-2008','2008-2012','2012-2016'],
                     ['2006-2025'],['2007-2025'],['2015-2019'],['2019-2022','2022-2025'],['2017-2024'],['2007-2011'],
                     ['2015-2025'],['2006-2017'],['2011-2019'],['2007-2009','2015-2025'],['2002-2017'],['2021-2023','2023-2025'],['1984-2025'],['2008-2014','2014-2019','2019-2025'],
                     ['2011-2018','2018-2021','2022-2025'],['2019-2024','2024-2025'],['2007-2012'],['2007-2014','2014-2019'],['2000-2011','2011-2015'],['2007-2025'],['2003-2025'],
                     ['2006-2011','2011-2019','2019-2023'],['2005-2016','2016-2025'],['2019-2023','2023-2024','2024-2025'],['2007-2014','2021-2025'],['2011-2021'],
                     ['2004-2013','2024-2025'],['2006-2016'],['2006-2018','2019-2025'],['1998-2012','2016-2023'],['2008-2021','2022-2025'],
                     ['1995-2022'],['2004-2012','2012-2018'],['2005-2008'],['2000-2013','2015-2019'],['2012-2016'],['2010-2025'],['2007-2011'],
                     ['2017-2025'],['2022-2025'],['2007-2010','2010-2016','2019-2025'],['2013-2022'],['2002-2011','2011-2025'],['2013-2024'],['2010-2022','2023-2025'],
                     ['2011-2021','2021-2025'],['2012-2018','2018-2019','2019-2022'],['2012-2022','2022-2025'],['2018-2025'],['2015-2025'],
                     ['2014-2025'],['2019-2022','2022-2024','2024-2025'],['2019-2025'],['2012-2018'],['2009-2016','2016-2025'],['2011-2013','2013-2019','2019-2025'],['2016-2017','2017-2022'],
                     ['2016-2021','2021-2023','2023-2025'],['2006-2019'],['2007-2011','2011-2021'],['2011-2017'],['2013-2019'],['2009-2019'],['2013-2018','2018-2025'],['2008-2016'],
                     ['1996-2013'],['2007-2009','2019-2025'],['2008-2016'],['2004-2012'],['2009-2011','2011-2020','2022-2025'],['1999-2022'],
                     ['2002-2010'],['1990-2017'],['2004-2013'],['2007-2023'],['2003-2010','2010-2015'],['2010-2016'],['2005-2016'],['2018-2025'],
                     ['2016-2025'],['2015-2019','2019-2021'],['2013-2025'],['2016-2025'],['2000-2023'],['2004-2017','2017-2024'],['2021-2023','2023-2024','2024-2025'],
                     ['2015-2020','2020-2025'],['1989-2011'],['2005-2015'],['2003-2015'],['2003-2012'],['1988-2013'],['2007-2011','2011-2019'],['2011-2022'],
                     ['2007-2012'],['2006-2015','2015-2020'],['2011-2017'],['2007-2011','2011-2025'],['2015-2017','2017-2022'],['1996-2013'],['2003-2020'],
                     ['2010-2021'],['2006-2016','2023-2025'],['2011-2016'],['2007-2010','2010-2013'],['2000-2013'],['2006-2019'],['2006-2018'],['2013-2023'],
                     ['2016-2017','2017-2021'],['2014-2021','2021-2025'],['2019-2023','2023-2024'],['2017-2024'],['2014-2025'],['2016-2024'],['2005-2011','2014-2025'],['2006-2025'],
                     ['2003-2009','2017-2019'],['2009-2025'],['2002-2009','2012-2016'],['2004-2010'],['2012-2014','2014-2020'],['2007-2009','2011-2012','2015-2020','2020-2025'],['2005-2009','2009-2014'],
                     ['2006-2022'],['1998-2017'],['2008-2014'],['2002-2012','2012-2017','2021-2025'],['2020-2025'],['2012-2016','2016-2025'],['2017-2021','2021-2025'],['2015-2023','2023-2025'],
                     ['2013-2018','2018-2022'],['2004-2009','2016-2017','2017-2023','2023-2025'],['2011-2021','2021-2025'],['2000-2013','2013-2024'],
                     ['1997-2010','2015-2019'],['2017-2023'],['1999-2017'],['2007-2012'],['2008-2017','2017-2019'],['2002-2021'],['2004-2011','2011-2021'],['2001-2013'],['2019-2025'],['2007-2013','2013-2018','2019-2025'],
                     ['2006-2018','2020-2025'],['2021-2025'],['2021-2023'],['2017-2025'],['2011-2025'],['2002-2008','2008-2012','2012-2014','2014-2021'],['2006-2010','2010-2014','2021-2025'],['1997-2010','2014-2019'],
                     ['2007-2010'],['2015-2025'],['1999-2006','2007-2024','2024-2025'],['2021-2025'],['2005-2011','2011-2016'],['2007-2008','2009-2013'],['2003-2011','2011-2025'],['1999-2008','2008-2017','2018-2022'],
                     ['2010-2019'],['2016-2019','2019-2021','2021-2025'],['2001-2012','2012-2017','2018-2025'],['2007-2020'],['2014-2015','2017-2022','2022-2025'],['2007-2008'],['1999-2017'],['2003-2013','2015-2022'],
                     ['2013-2017','2017-2018','2018-2025'],['1989-2022'],['2005-2009','2010-2017'],['1995-2019'],['2008-2015','2015-2021','2023-2025'],['2004-2009','2009-2018','2019-2023'],['2007-2011'],['2018-2025'],['2014-2017','2017-2023','2023-2025'],
                     ['2011-2025'],['2012-2016'],['2007-2012','2012-2019','2021-2025'],['2013-2017'],['2006-2010','2010-2025'],['2006-2009','2009-2015','2017-2025'],['2017-2022','2022-2025'],['1995-2016','2016-2024'],
                     ['2012-2021','2021-2024','2024-2025'],['2012-2013','2013-2021','2021-2025'],['2010-2025'],['2006-2015','2016-2025'],['2007-2023'],['2009-2016','2016-2023'],['2006-2011','2011-2023','2023-2025'],['2011-2016','2016-2019','2020-2025'],
                     ['2014-2018','2018-2021','2021-2025'],['2006-2007','2014-2025'],['2010-2015','2022-2025'],['2007-2008','2008-2011','2012-2021'],['2014-2017','2017-2024','2024-2025'],['2013-2015','2015-2017','2017-2022','2023-2025'],
                     ['2006-2019','2019-2025'],['2018-2024','2024-2025'],['2013-2016','2016-2017','2017-2025'],['2007-2016','2016-2019'],['2014-2021'],['2013-2015','2015-2025'],['2016-2022'],['2010-2015','2019-2025'],
                     ['2007-2025'],['2011-2025'],['2007-2013','2013-2016','2016-2018'],['2011-2016'],['2019-2022','2022-2025'],['2011-2014','2014-2017','2017-2022'],['2007-2010','2010-2022','2022-2025'],
                     ['1998-2015'],['2004-2017','2022-2025'],['2019-2024','2024-2025'],['2003-2012','2012-2022'],['1986-2019'],['2004-2011','2011-2014','2014-2022'],['2005-2019'],['2005-2025'],['2006-2019','2022-2025'],
                     ['2005-2010','2010-2025'],['1999-2016','2016-2018'],['2002-2019','2019-2025'],['2009-2015','2015-2021','2021-2025'],['2021-2025'],['2008-2016','2016-2024'],['2003-2010'],['2001-2025'],['2003-2016','2016-2025'],['2004-2017'],['1998-2015','2015-2025'],['2006-2009','2009-2024']
                     ,['2011-2015','2015-2021','2022-2025'],['2002-2018','2018-2023'],['1998-2010','2010-2024'],['2009-2024'],['2010-2025'],['2014-2025']]


Need to create a function that'll pull the correct TEAM NO depending on the newly provided columns

In [21]:
def team_no_finder(df, ids):
    # Convert lists in TEAMS and YRS into separate rows
    expanded_df = df.explode(['TEAMS', 'YRS'])

    # Extract start and end years
    expanded_df[['start_yr', 'end_yr']] = expanded_df['YRS'].str.extract(r"(\d{4})-(\d{4})")
    expanded_df['start_yr'] = expanded_df['start_yr'].fillna(expanded_df['YRS']).astype(int)
    expanded_df['end_yr'] = expanded_df['end_yr'].fillna(expanded_df['YRS']).astype(int)

    # Merge with team_ids using TEAM and YEAR conditions
    merged_df = expanded_df.merge(ids, left_on='TEAMS', right_on='TEAM')

    # Filter rows where season falls within the coaching period
    merged_df = merged_df[(merged_df['YEAR'] > merged_df['start_yr']) & (merged_df['YEAR'] <= merged_df['end_yr'])]
    
    
    #creating new var that'll store how long they've been at current team
    merged_df = merged_df.reset_index(drop=True)
    merged_df['TENURE'] = merged_df['YEAR'] - merged_df['start_yr']
    
    final_df = merged_df[['YEAR','COACH ID', 'COACH', 'TEAM NO', 'TEAM', 'TENURE','WIN%']]
    
    return final_df

#Getting rid of white space for Texas at 2023
team_ids.loc[team_ids['TEAM NO']==957,'TEAM'] = 'Texas'

coach_by_teamno = team_no_finder(coach_perf, team_ids)

coach_by_teamno

Unnamed: 0,YEAR,COACH ID,COACH,TEAM NO,TEAM,TENURE,WIN%
0,2021,1,Roy Williams,838,North Carolina,18,0.773
1,2019,1,Roy Williams,770,North Carolina,16,0.773
2,2018,1,Roy Williams,703,North Carolina,15,0.773
3,2017,1,Roy Williams,638,North Carolina,14,0.773
4,2016,1,Roy Williams,570,North Carolina,13,0.773
...,...,...,...,...,...,...,...
1017,2011,102,Edward Joyner,245,Hampton,2,0.000
1018,2019,59,Russell Turner,751,UC Irvine,9,0.250
1019,2015,59,Russell Turner,480,UC Irvine,5,0.250
1020,2019,120,David Richman,768,North Dakota St.,5,0.000


In [242]:
len(np.unique(team_ids['TEAM NO'])),len(np.unique(team_no_finder(coach_perf, team_ids)['TEAM NO']))

(1023, 1022)

Now can merge on these coach stats onto team matchup to then create the feature matrix we want

In [22]:
temp = comp_team_stats.copy()
temp = temp.merge(coach_by_teamno.loc[:,['COACH ID','COACH','WIN%','TENURE','TEAM NO']],on='TEAM NO',how='left')
comp_coach_stats = temp.loc[:,['YEAR','TEAM NO','TEAM','CONF','CONF ID','COACH','COACH ID','MATCHUP ID','SEED',
       'ROUND', 'CURRENT ROUND', 'WIN', 'K OFF', 'K DEF', 'AVG HGT', 'EFF HGT',
       'BADJ O_A', 'BADJ D_A', 'WIN%_A', 'EFG%_A', 'EFG%D_A', 'PPPO_A',
       'PPPD_A', 'BADJ O_H', 'BADJ D_H', 'WIN%_H', 'EFG%_H', 'EFG%D_H',
       'PPPO_H', 'PPPD_H', 'BADJ O_N', 'BADJ D_N', 'WIN%_N', 'EFG%_N',
       'EFG%D_N', 'PPPO_N', 'PPPD_N', 'TR RATING', 'V 1-25 WINS',
       'V 1-25 LOSS', 'V 26-50 WINS', 'V 26-50 LOSS', 'LUCK RATING',
       'CONSISTENCY TR RATING', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'Q1 W',
       'Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE', 'BADJ EM', 'FTR', 'FTRD',
       'BADJ T', 'TOV%', 'TOV%D', 'OREB%', 'OP OREB%', 'RAW T', 'WAB', '2PT%',
       '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%', 'AST%', 'OP AST%', '2PTR',
       '3PTR', '2PTRD', '3PTRD', 'ELITE SOS', 'PAKE', 'PASE', 'R64', 'R32',
       'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2','WIN%','TENURE']]

comp_coach_stats

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF,CONF ID,COACH,COACH ID,MATCHUP ID,SEED,ROUND,...,R64,R32,S16,E8,F4,F2,CHAMP,TOP2,WIN%,TENURE
0,2024,1067,Connecticut,BE,8,Dan Hurley,13.0,0,1,1,...,10.0,6.0,5.0,5.0,5.0,4.0,4.0,2.0,0.778,6.0
1,2024,1026,Stetson,ASun,5,Donnie Jones,114.0,0,16,64,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,5.0
2,2024,1060,Florida Atlantic,Amer,4,Dusty May,22.0,1,8,64,...,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.667,6.0
3,2024,1036,Northwestern,B10,6,Chris Collins,52.0,1,9,32,...,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500,11.0
4,2024,1029,San Diego St.,MWC,20,Brian Dutcher,29.0,2,5,16,...,11.0,6.0,4.0,1.0,1.0,1.0,0.0,1.0,0.583,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2009,2008,43,Kansas,B12,7,Bill Self,14.0,1004,1,1,...,16.0,16.0,9.0,7.0,4.0,3.0,2.0,12.0,0.745,5.0
2010,2008,37,Memphis,CUSA,13,John Calipari,4.0,1005,1,2,...,8.0,5.0,2.0,1.0,1.0,1.0,0.0,2.0,0.750,8.0
2011,2008,13,UCLA,P10,23,Ben Howland,251.0,1005,1,4,...,10.0,9.0,7.0,2.0,2.0,0.0,0.0,2.0,0.545,5.0
2012,2008,43,Kansas,B12,7,Bill Self,14.0,1006,1,1,...,16.0,16.0,9.0,7.0,4.0,3.0,2.0,12.0,0.745,5.0


In [244]:
len(team_matchups), len(comp_coach_stats)

(2014, 2014)

In [23]:
diff_comp_coach = differenced_matchup(comp_coach_stats)
diff_comp_coach.head()

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF,CONF ID,COACH,COACH ID,MATCHUP ID,SEED,ROUND,...,R64,R32,S16,E8,F4,F2,CHAMP,TOP2,WIN%,TENURE
0,2024,1067,Connecticut,BE,8,Dan Hurley,13.0,0,1,1,...,9.0,6.0,5.0,5.0,5.0,4.0,4.0,2.0,0.778,1.0
1,2024,1026,Stetson,ASun,5,Donnie Jones,114.0,0,16,64,...,-9.0,-6.0,-5.0,-5.0,-5.0,-4.0,-4.0,-2.0,-0.778,-1.0
2,2024,1036,Northwestern,B10,6,Chris Collins,52.0,1,9,32,...,1.0,2.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,-0.167,5.0
3,2024,1060,Florida Atlantic,Amer,4,Dusty May,22.0,1,8,64,...,-1.0,-2.0,1.0,1.0,1.0,-0.0,-0.0,-0.0,0.167,-5.0
4,2024,1029,San Diego St.,MWC,20,Brian Dutcher,29.0,2,5,16,...,8.0,5.0,4.0,1.0,1.0,1.0,0.0,1.0,0.383,3.0


In [246]:
predictors = diff_comp_coach.copy().loc[:, 'WIN':].drop('WIN', axis=1).columns
target = 'WIN'

#before 24 season
bf_24 = diff_comp_coach.loc[diff_comp_coach['YEAR']<2024]

#splitting on MATCHUP ID in order to ensure that the same matchups are split together
train_groups, test_groups = train_test_split(bf_24['MATCHUP ID'].unique(),test_size = .2, random_state=42)

x_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][predictors]
y_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][target]

x_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][predictors]
y_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][target]

In [247]:
#test on 2024 season 
#will use raw data and then take differences as matchups update during simulation
s24 = comp_coach_stats.loc[(comp_coach_stats['YEAR']==2024)&(comp_coach_stats['CURRENT ROUND']==64),:]
s24 = s24.loc[:,['YEAR', 'TEAM NO', 'TEAM', 'CONF','CURRENT ROUND', 'WIN', 'SEED',
       'ROUND','K OFF', 'K DEF', 'AVG HGT', 'EFF HGT',
       'BADJ O_A', 'BADJ D_A', 'WIN%_A', 'EFG%_A', 'EFG%D_A', 'PPPO_A',
       'PPPD_A', 'BADJ O_H', 'BADJ D_H', 'WIN%_H', 'EFG%_H', 'EFG%D_H',
       'PPPO_H', 'PPPD_H', 'BADJ O_N', 'BADJ D_N', 'WIN%_N', 'EFG%_N',
       'EFG%D_N', 'PPPO_N', 'PPPD_N', 'TR RATING', 'V 1-25 WINS',
       'V 1-25 LOSS', 'V 26-50 WINS', 'V 26-50 LOSS', 'LUCK RATING',
       'CONSISTENCY TR RATING', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'Q1 W',
       'Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE', 'BADJ EM', 'FTR', 'FTRD',
       'BADJ T', 'TOV%', 'TOV%D', 'OREB%', 'OP OREB%', 'RAW T', 'WAB', '2PT%',
       '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%', 'AST%', 'OP AST%', '2PTR',
       '3PTR', '2PTRD', '3PTRD', 'ELITE SOS', 'PAKE', 'PASE','WIN%','R64', 'R32',
       'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2']]

In [248]:
parameter_grid = {
    'max_depth':np.arange(5, 11, 1),
    'min_child_weight':np.arange(0, 7, 1),
    'colsample_bytree':np.arange(0.5, 1.1, .1)
}

boost = XGBClassifier(n_estimators=200,learning_rate=.01,booster='gbtree',early_stopping_rounds=20,random_state=42)

ran_grid_search = RandomizedSearchCV(boost,param_distributions=parameter_grid,n_iter=50,cv=5,scoring='roc_auc')

ran_rest = ran_grid_search.fit(x_train,y_train,eval_set=[(x_test, y_test)],verbose=False)

50 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-

In [None]:
best_model_coach = ran_rest.best_estimator_
pred = best_model_coach.predict(x_test)
confusion_matrix(y_test.values,pred,labels=[1,0])

array([[150,  39],
       [ 41, 148]])

In [252]:
(150+148)/(39+41+150+148)

0.7883597883597884

In [None]:
sim = sim_tourney(s24,best_model_coach)

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
30,Arizona,P12,2,0
62,Tennessee,SEC,2,1


In [None]:
sim['2']

## Combining
Trying just adding the features of interest onto the comp_team stats data frame and testing

In [24]:
temp_conf = conf_matchup[['PAKE','PASE','R64','R32', 'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2','WIN%']].copy()
temp_conf.columns = [col + "_CONF" for col in temp_conf.columns]

temp_seed = seed_matchup[['R64', 'R32', 'S16', 'E8', 'F4','F2', 'CHAMP', 'TOP2', 'WIN%']].copy()
temp_seed.columns = [col + "_SEED" for col in temp_seed.columns]

temp_coach = comp_coach_stats[['WIN%','TENURE']]
temp_coach.columns = [col + "_COACH" for col in temp_coach.columns]

In [25]:
temp = comp_team_stats.copy()

combined_comp_stats = pd.concat([temp,temp_conf,temp_seed,temp_coach],axis=1)

diff_combined_comp_stats = differenced_matchup(combined_comp_stats)
diff_combined_comp_stats.head()

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF,CONF ID,MATCHUP ID,ROUND,CURRENT ROUND,WIN,SEED,...,R32_SEED,S16_SEED,E8_SEED,F4_SEED,F2_SEED,CHAMP_SEED,TOP2_SEED,WIN%_SEED,WIN%_COACH,TENURE_COACH
0,2024,1067,Connecticut,BE,8,0,1,64,1,-15,...,8.75,6.75,4.0,3.0,2.25,1.5,4.25,0.768,0.778,1.0
1,2024,1026,Stetson,ASun,5,0,64,64,0,15,...,-8.75,-6.75,-4.0,-3.0,-2.25,-1.5,-4.25,-0.768,-0.778,-1.0
2,2024,1036,Northwestern,B10,6,1,32,64,1,1,...,5.25,2.5,1.0,0.75,0.25,0.0,1.0,-0.038,-0.167,5.0
3,2024,1060,Florida Atlantic,Amer,4,1,64,64,0,-1,...,-5.25,-2.5,-1.0,-0.75,-0.25,-0.0,-1.0,0.038,0.167,-5.0
4,2024,1029,San Diego St.,MWC,20,2,16,64,1,-7,...,8.25,5.5,2.0,1.25,1.0,0.0,2.25,0.175,0.383,3.0


In [26]:
predictors = diff_combined_comp_stats.copy().loc[:, 'WIN':].drop('WIN', axis=1).columns
target = 'WIN'

#before 24 season
bf_24 = diff_combined_comp_stats.loc[diff_combined_comp_stats['YEAR']<2024]

#splitting on MATCHUP ID in order to ensure that the same matchups are split together
train_groups, test_groups = train_test_split(bf_24['MATCHUP ID'].unique(),test_size = .2, random_state=42)

x_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][predictors]
y_train = bf_24[bf_24['MATCHUP ID'].isin(train_groups)][target]

x_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][predictors]
y_test = bf_24[bf_24['MATCHUP ID'].isin(test_groups)][target]

In [27]:
#test on 2024 season 
#will use raw data and then take differences as matchups update during simulation
s24 = combined_comp_stats.loc[(combined_comp_stats['YEAR']==2024)&(combined_comp_stats['CURRENT ROUND']==64),:]
s24 = s24.loc[:,['YEAR', 'TEAM NO', 'TEAM', 'CONF','CURRENT ROUND', 'WIN', 'SEED',
       'K OFF', 'K DEF', 'AVG HGT', 'EFF HGT',
       'BADJ O_A', 'BADJ D_A', 'WIN%_A', 'EFG%_A', 'EFG%D_A', 'PPPO_A',
       'PPPD_A', 'BADJ O_H', 'BADJ D_H', 'WIN%_H', 'EFG%_H', 'EFG%D_H',
       'PPPO_H', 'PPPD_H', 'BADJ O_N', 'BADJ D_N', 'WIN%_N', 'EFG%_N',
       'EFG%D_N', 'PPPO_N', 'PPPD_N', 'TR RATING', 'V 1-25 WINS',
       'V 1-25 LOSS', 'V 26-50 WINS', 'V 26-50 LOSS', 'LUCK RATING',
       'CONSISTENCY TR RATING', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'Q1 W',
       'Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE', 'BADJ EM', 'FTR', 'FTRD',
       'BADJ T', 'TOV%', 'TOV%D', 'OREB%', 'OP OREB%', 'RAW T', 'WAB', '2PT%',
       '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%', 'AST%', 'OP AST%', '2PTR',
       '3PTR', '2PTRD', '3PTRD', 'ELITE SOS', 'PAKE', 'PASE','R64', 'R32',
       'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2','PAKE_CONF', 'PASE_CONF', 'R64_CONF', 'R32_CONF', 'S16_CONF', 'E8_CONF',
       'F4_CONF', 'F2_CONF', 'CHAMP_CONF', 'TOP2_CONF', 'WIN%_CONF',
       'R64_SEED', 'R32_SEED', 'S16_SEED', 'E8_SEED', 'F4_SEED', 'F2_SEED',
       'CHAMP_SEED', 'TOP2_SEED', 'WIN%_SEED', 'WIN%_COACH', 'TENURE_COACH']]

In [28]:
parameter_grid = {
    'n_estimators':[200,500,1000],
    'max_depth':np.arange(5, 11, 1),
    'min_child_weight':np.arange(0, 7, 1),
    'colsample_bytree':np.arange(0.5, 1.1, .1)
}

boost = XGBClassifier(learning_rate=.01,booster='gbtree',early_stopping_rounds=20,reg_lambda=10,reg_alpha=5,random_state=42)

ran_grid_search = RandomizedSearchCV(boost,param_distributions=parameter_grid,n_iter=50,cv=5,scoring='roc_auc')

ran_rest = ran_grid_search.fit(x_train,y_train,eval_set=[(x_test, y_test)],verbose=False)

50 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-

In [29]:
best_model = ran_rest.best_estimator_
pred = best_model.predict(x_test)
confusion_matrix(y_test.values,pred,labels=[1,0])

array([[149,  40],
       [ 40, 149]])

In [30]:
sim = sim_tourney(s24,best_model)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [31]:
sim

{'64':                 TEAM  CONF  CURRENT ROUND  WINNER
 0        Connecticut    BE             64       1
 1            Stetson  ASun             64       0
 2   Florida Atlantic  Amer             64       1
 3       Northwestern   B10             64       0
 4      San Diego St.   MWC             64       1
 ..               ...   ...            ...     ...
 59             Akron   MAC             64       0
 60             Texas   B12             64       1
 61      Colorado St.   MWC             64       0
 62         Tennessee   SEC             64       1
 63     Saint Peter's  MAAC             64       0
 
 [64 rows x 4 columns],
 '32':                   TEAM  CONF  CURRENT ROUND  WINNER
 0          Connecticut    BE             32       1
 2     Florida Atlantic  Amer             32       0
 4        San Diego St.   MWC             32       1
 6               Auburn   SEC             32       0
 9             Duquesne   A10             32       0
 10            Illinois   B10   

In [32]:

imp = best_model.feature_importances_

sorted_idx = np.argsort(imp)

imp[sorted_idx],predictors[sorted_idx]

(array([0.        , 0.        , 0.00145517, 0.00204012, 0.0020889 ,
        0.00224303, 0.00336995, 0.00339516, 0.00362273, 0.00367641,
        0.00407784, 0.00451911, 0.00454202, 0.00461141, 0.00477028,
        0.00489669, 0.00530797, 0.00535085, 0.00545023, 0.00555679,
        0.00559248, 0.00573085, 0.00579662, 0.00579952, 0.0058469 ,
        0.00586377, 0.00605864, 0.00611378, 0.00618782, 0.00619411,
        0.00645136, 0.00647138, 0.00650463, 0.00654505, 0.00664227,
        0.0066917 , 0.00670877, 0.00707034, 0.00719288, 0.00736511,
        0.00751194, 0.00751308, 0.00756593, 0.00757725, 0.00758034,
        0.00764177, 0.00768927, 0.00772274, 0.00774371, 0.00785862,
        0.00787093, 0.00791877, 0.00802364, 0.00803962, 0.00823016,
        0.00840937, 0.00841214, 0.00846132, 0.00854216, 0.00883409,
        0.00894665, 0.00900551, 0.00903098, 0.00915415, 0.00919916,
        0.00922535, 0.00940796, 0.00950405, 0.0095089 , 0.00965735,
        0.00985239, 0.01000211, 0.01014088, 0.01

In [33]:
imp[sorted_idx],predictors[sorted_idx]

(array([0.        , 0.        , 0.00145517, 0.00204012, 0.0020889 ,
        0.00224303, 0.00336995, 0.00339516, 0.00362273, 0.00367641,
        0.00407784, 0.00451911, 0.00454202, 0.00461141, 0.00477028,
        0.00489669, 0.00530797, 0.00535085, 0.00545023, 0.00555679,
        0.00559248, 0.00573085, 0.00579662, 0.00579952, 0.0058469 ,
        0.00586377, 0.00605864, 0.00611378, 0.00618782, 0.00619411,
        0.00645136, 0.00647138, 0.00650463, 0.00654505, 0.00664227,
        0.0066917 , 0.00670877, 0.00707034, 0.00719288, 0.00736511,
        0.00751194, 0.00751308, 0.00756593, 0.00757725, 0.00758034,
        0.00764177, 0.00768927, 0.00772274, 0.00774371, 0.00785862,
        0.00787093, 0.00791877, 0.00802364, 0.00803962, 0.00823016,
        0.00840937, 0.00841214, 0.00846132, 0.00854216, 0.00883409,
        0.00894665, 0.00900551, 0.00903098, 0.00915415, 0.00919916,
        0.00922535, 0.00940796, 0.00950405, 0.0095089 , 0.00965735,
        0.00985239, 0.01000211, 0.01014088, 0.01

Since it seems the model is not optimized on getting the champion correctly we can priotize this by creating a custom loss function

In [370]:
best_model

In [34]:
round_weights = {
    64: 1.0,  # Round of 64 (Least Weight)
    32: 1.5,  # Round of 32
    16: 2.0,  # Sweet 16
    8: 3.0,   # Elite 8
    4: 5.0,   # Final Four
    2: 7.5,   # Championship Game
    1: 10.0   # Winner (Most Weight)
}

# Assign weights based on the round of the matchup
comp_team_stats["ROUND WEIGHT"] = comp_team_stats["ROUND"].map(round_weights)

# Train XGBoost with weighted importance for different rounds
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    colsample_bytree=0.8,
    subsample=0.8,
    reg_lambda=10,  # L2 Regularization
    reg_alpha=5,  # L1 Regularization
    random_state=42
)

# Fit with sample_weight
xgb_model.fit(
    x_train, y_train, 
    sample_weight=comp_team_stats.loc[x_train.index, "ROUND WEIGHT"]
)



In [35]:
def evaluate_bracket(predictions, actual_results):
    """
    Score model predictions based on actual tournament results, penalizing early mispredictions more.
    """
    scoring_weights = np.array([round_weights.get(r, 1.0) for r in actual_results['ROUND']])
    bracket_loss = np.sum(scoring_weights * (predictions != actual_results['WIN']))
    
    return bracket_loss


In [44]:
parameter_grid = {
    'n_estimators':[200,500,1000],
    'max_depth':np.arange(5, 11, 1),
    'min_child_weight':np.arange(0, 7, 1),
    'colsample_bytree':[0.5,0.6,0.7,0.8,0.9,1]
}

ran_grid_search = RandomizedSearchCV(xgb_model,param_distributions=parameter_grid,scoring=evaluate_bracket,cv=5)

ran_rest = ran_grid_search.fit(x_train,y_train,eval_set=[(x_test, y_test)],
    sample_weight=comp_team_stats.loc[x_train.index, "ROUND WEIGHT"],verbose=False)

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
TypeError: evaluate_bracket() takes 2 positional arguments but 3 were given

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
TypeError: evaluate_bracket() takes 2 positional arguments but 3 were given

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
TypeError: evaluate_bracket() takes 2 positional arguments but 3 were given

Traceback (most recent call last):

In [45]:
best_model_custloss = ran_rest.best_estimator_
pred = best_model_custloss.predict(x_test)
confusion_matrix(y_test.values,pred,labels=[1,0])

array([[147,  42],
       [ 42, 147]])

In [46]:
sim = sim_tourney(s24,best_model_custloss)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [47]:
sim

{'64':                 TEAM  CONF  CURRENT ROUND  WINNER
 0        Connecticut    BE             64       1
 1            Stetson  ASun             64       0
 2   Florida Atlantic  Amer             64       1
 3       Northwestern   B10             64       0
 4      San Diego St.   MWC             64       1
 ..               ...   ...            ...     ...
 59             Akron   MAC             64       0
 60             Texas   B12             64       1
 61      Colorado St.   MWC             64       0
 62         Tennessee   SEC             64       1
 63     Saint Peter's  MAAC             64       0
 
 [64 rows x 4 columns],
 '32':                   TEAM  CONF  CURRENT ROUND  WINNER
 0          Connecticut    BE             32       1
 2     Florida Atlantic  Amer             32       0
 4        San Diego St.   MWC             32       0
 6               Auburn   SEC             32       1
 9             Duquesne   A10             32       0
 10            Illinois   B10   

## Trying on 2025

In [48]:
matchup25 = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Tournament Matchups25.csv")
away_stats25 = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Barttorvik Away25.csv")
home_stats25 = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Barttorvik Home25.csv")
neutral_stats25 = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Barttorvik Neutral25.csv")
team_hist25 = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/Resumes25.csv")
team_v_ranked25 = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/TeamRankings25.csv")
team_mis25 = pd.read_csv("/Users/nicholaskim/Documents/Repositories/bracket challenge/bracketology/data/KenPom Barttorvik25.csv")

team_ids25 = matchup25[['YEAR','TEAM','TEAM NO']].drop_duplicates()

matchup25 = matchup25.loc[(matchup25['YEAR']==2025)&(matchup25['CURRENT ROUND']==64),].drop(['BY YEAR NO','BY ROUND NO','SCORE'],axis=1)
away_stats25 = away_stats25.loc[away_stats25['YEAR']==2025,]
home_stats25 = home_stats25.loc[home_stats25['YEAR']==2025,]
neutral_stats25 = neutral_stats25.loc[neutral_stats25['YEAR']==2025,]
team_hist25 = team_hist25.loc[team_hist25['YEAR']==2025,]
team_v_ranked25 = team_v_ranked25.loc[team_v_ranked25['YEAR']==2025,]
team_miss25 = team_mis25.loc[team_mis25['YEAR']==2025,]

In [49]:
temp_away = away_stats25.loc[:,['YEAR','TEAM NO','BADJ O','BADJ D','WIN%','EFG%','EFG%D','PPPO','PPPD']]
temp_home = home_stats25.loc[:,['YEAR','TEAM NO','BADJ O','BADJ D','WIN%','EFG%','EFG%D','PPPO','PPPD']]
temp_neutral = neutral_stats25.loc[:,['YEAR','TEAM NO','BADJ O','BADJ D','WIN%','EFG%','EFG%D','PPPO','PPPD']]

#adds _A to all columns to represent Away
temp_away.columns = list(temp_away.columns[:2]) + [col + "_A" for col in temp_away.columns[2:]]

#_H for Home
temp_home.columns = list(temp_home.columns[:2]) + [col + "_H" for col in temp_home.columns[2:]]

#_N for Neutral
temp_neutral.columns = list(temp_neutral.columns[:2]) + [col + "_N" for col in temp_neutral.columns[2:]]


vars = ['YEAR','TEAM NO','BADJ EM','FTR','FTRD',
        'BADJ T','TOV%','TOV%D','OREB%','OP OREB%','RAW T','WAB',
        '2PT%', '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%',
        'AST%', 'OP AST%', '2PTR', '3PTR', '2PTRD', '3PTRD','ELITE SOS']
dfs = [away_stats25,home_stats25,neutral_stats25]

team_stats = var_mean(dfs,vars)

temp_his = team_hist25.drop(['TEAM','SEED','ROUND','Q1 PLUS Q2 W','B POWER','BID TYPE'],axis=1)

temp_v_ranked = team_v_ranked25.loc[:,['YEAR','TEAM NO','TR RATING','V 1-25 WINS','V 1-25 LOSS',
                                     'V 26-50 WINS','V 26-50 LOSS','LUCK RATING','CONSISTENCY TR RATING']]

temp_mis = team_mis25.loc[:,['YEAR','CONF','CONF ID','TEAM NO','K OFF','K DEF','AVG HGT','EFF HGT','EXP','TALENT']]

These two teams not in the team_res dataset

In [50]:
alabama_st = {
    'TEAM':'Alabama St.',
    'PAKE': -0,
    'PASE': -0,
    'R64': 2,
    'R32': 0,
    'S16': 0,
    'E8': 0,
    'F4': 0,
    'F2': 0,
    'CHAMP': 0,
    'TOP2': 0
}

saint_francis = {
    'TEAM':'Saint Francis',
    'PAKE': 0,
    'PASE': 0,
    'R64': 1,
    'R32': 0,
    'S16': 0,
    'E8': 0,
    'F4': 0,
    'F2': 0,
    'CHAMP': 0,
    'TOP2': 0
}

uc_san_deigo = {
    'TEAM':'UC San Diego',
    'PAKE': 0,
    'PASE': 0,
    'R64': 0,
    'R32': 0,
    'S16': 0,
    'E8': 0,
    'F4': 0,
    'F2': 0,
    'CHAMP': 0,
    'TOP2': 0
}

bryant = {
    'TEAM':'Bryant',
    'PAKE': 0,
    'PASE': 0,
    'R64': 0,
    'R32': 0,
    'S16': 0,
    'E8': 0,
    'F4': 0,
    'F2': 0,
    'CHAMP': 0,
    'TOP2': 0
}

nebraska_omaha = {
    'TEAM':'Nebraska Omaha',
    'PAKE': 0,
    'PASE': 0,
    'R64': 0,
    'R32': 0,
    'S16': 0,
    'E8': 0,
    'F4': 0,
    'F2': 0,
    'CHAMP': 0,
    'TOP2': 0
}

siu_edwardsville = {
    'TEAM':'SIU Edwardsville',
    'PAKE': 0,
    'PASE': 0,
    'R64': 0,
    'R32': 0,
    'S16': 0,
    'E8': 0,
    'F4': 0,
    'F2': 0,
    'CHAMP': 0,
    'TOP2': 0
}

high_point = {
    'TEAM':'High Point',
    'PAKE': 0,
    'PASE': 0,
    'R64': 0,
    'R32': 0,
    'S16': 0,
    'E8': 0,
    'F4': 0,
    'F2': 0,
    'CHAMP': 0,
    'TOP2': 0
}

temp_res = team_res.loc[:,['TEAM','PAKE','PASE','R64','R32','S16','E8','F4',
                           'F2','CHAMP','TOP2']]

temp_res = pd.concat([temp_res, pd.DataFrame([alabama_st,saint_francis,uc_san_deigo,bryant,nebraska_omaha,siu_edwardsville,high_point])], ignore_index=True)
temp_res

Unnamed: 0,TEAM,PAKE,PASE,R64,R32,S16,E8,F4,F2,CHAMP,TOP2
0,Abilene Christian,0.7,0.7,2,1,0,0,0,0,0,0
1,Akron,-1.0,-1.3,5,0,0,0,0,0,0,0
2,Alabama,0.2,-0.5,6,4,3,1,1,0,0,2
3,Albany,-0.4,-0.3,3,0,0,0,0,0,0,0
4,American,-0.5,-0.4,3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
244,UC San Diego,0.0,0.0,0,0,0,0,0,0,0,0
245,Bryant,0.0,0.0,0,0,0,0,0,0,0,0
246,Nebraska Omaha,0.0,0.0,0,0,0,0,0,0,0,0
247,SIU Edwardsville,0.0,0.0,0,0,0,0,0,0,0,0


In [51]:
matchup25['WIN'] = [0]*len(matchup25)

matchup25.merge(temp_mis,on=['YEAR','TEAM NO'],how='left')

Unnamed: 0,YEAR,TEAM NO,TEAM,SEED,ROUND,CURRENT ROUND,WIN,CONF,CONF ID,K OFF,K DEF,AVG HGT,EFF HGT,EXP,TALENT
0,2025,1141,Auburn,1,1,64,0,SEC,28,121.014,100.7990,78.431,81.057,2.573,34.433
1,2025,1145,Alabama St.,16,64,64,0,SWAC,31,103.998,104.2010,76.500,79.666,2.346,4.049
2,2025,1141,Auburn,1,1,64,0,SEC,28,121.014,100.7990,78.431,81.057,2.573,34.433
3,2025,1099,Saint Francis,16,64,64,0,NEC,21,100.249,107.9270,76.305,78.644,1.173,0.200
4,2025,1119,Louisville,8,1,64,0,ACC,2,113.399,99.6724,78.008,81.168,2.555,20.225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,2025,1090,Troy,14,64,64,0,SB,26,108.553,98.9402,77.800,80.146,1.661,0.200
68,2025,1088,UCLA,7,1,64,0,B10,6,112.395,98.9106,78.101,81.377,1.849,70.640
69,2025,1086,Utah St.,10,64,64,0,MWC,20,118.700,104.9960,77.290,80.475,2.077,17.856
70,2025,1094,Tennessee,2,1,64,0,SEC,28,114.091,95.4042,77.149,81.352,2.650,51.743


In [52]:
comp_team_stats = matchup25.merge(temp_mis,on=['YEAR','TEAM NO'],how='left')

#rearranging so CONF and CONF ID are near front
comp_team_stats = comp_team_stats.iloc[:,[0,1,2,8,7,4,5,6,3,9,10,11,12,13]]

dfs = [comp_team_stats,temp_away,temp_home,temp_neutral,temp_v_ranked,temp_his,team_stats]

comp_team_stats = reduce(lambda left, right: pd.merge(left, right, on=['YEAR','TEAM NO']),dfs)

comp_team_stats = comp_team_stats.merge(temp_res,on='TEAM',how='left')

comp_team_stats

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF ID,CONF,ROUND,CURRENT ROUND,WIN,SEED,K OFF,...,PAKE,PASE,R64,R32,S16,E8,F4,F2,CHAMP,TOP2
0,2025,1141,Auburn,28,SEC,1,64,0,1,121.014,...,-1.5,-0.2,5,4,1,1,1,0,0,1
1,2025,1145,Alabama St.,31,SWAC,64,64,0,16,103.998,...,0.0,0.0,2,0,0,0,0,0,0,0
2,2025,1141,Auburn,28,SEC,1,64,0,1,121.014,...,-1.5,-0.2,5,4,1,1,1,0,0,1
3,2025,1099,Saint Francis,21,NEC,64,64,0,16,100.249,...,0.0,0.0,1,0,0,0,0,0,0,0
4,2025,1119,Louisville,2,ACC,1,64,0,8,113.399,...,1.8,3.5,10,7,6,5,2,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,2025,1090,Troy,26,SB,64,64,0,14,108.553,...,-0.1,-0.1,1,0,0,0,0,0,0,0
68,2025,1088,UCLA,6,B10,1,64,0,7,112.395,...,4.5,5.1,10,9,7,2,2,0,0,2
69,2025,1086,Utah St.,20,MWC,64,64,0,10,118.700,...,-3.9,-3.4,7,1,0,0,0,0,0,0
70,2025,1094,Tennessee,28,SEC,1,64,0,2,114.091,...,-2.0,-0.3,11,8,6,2,0,0,0,3


In [53]:
temp_conf = conf_perf[['CONF ID','PAKE','PASE','R64','R32', 'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2','WIN%']].copy()
temp_conf.columns = ['CONF ID'] + [col + "_CONF" for col in temp_conf.columns[1:]]

temp_seed = seed_res[['SEED','R64', 'R32', 'S16', 'E8', 'F4','F2', 'CHAMP', 'TOP2', 'WIN%']].copy()
temp_seed.columns = ['SEED'] + [col + "_SEED" for col in temp_seed.columns[1:]]


coach_by_teamno25 = team_no_finder(coach_perf, team_ids25)
temp_coach = coach_by_teamno25[['TEAM NO','WIN%','TENURE']]
temp_coach.columns = ['TEAM NO'] + [col + "_COACH" for col in temp_coach.columns[1:]]

In [54]:
comp_team_stats = comp_team_stats.merge(temp_conf,on='CONF ID',how='left')
comp_team_stats = comp_team_stats.merge(temp_seed,on='SEED',how='left')
comp_team_stats = comp_team_stats.merge(temp_coach,on='TEAM NO',how='left')

Filling in Coach stats manually where it's nan

In [55]:
updates = {
    1145: [0, 3],
    1099: [0,0],
    1089: [0, 12],  
    1112: [.688,2],
    1120: [0,6],
    1139: [0,2],
    1133: [0,1],
    1087: [0,5],
    1107: [0,3],
    1144: [0,2],
    1108: [0,1],
    1138: [0,1],
    1096: [0,6],
    1127: [0,2],
    1086: [0,1],
    1082:[0,3]
}

for teamno, values in updates.items():
    comp_team_stats.loc[comp_team_stats['TEAM NO'] == teamno, ['WIN%_COACH', 'TENURE_COACH']] = values
    
comp_team_stats.loc[comp_team_stats['TENURE_COACH'].isna()]

Unnamed: 0,YEAR,TEAM NO,TEAM,CONF ID,CONF,ROUND,CURRENT ROUND,WIN,SEED,K OFF,...,R32_SEED,S16_SEED,E8_SEED,F4_SEED,F2_SEED,CHAMP_SEED,TOP2_SEED,WIN%_SEED,WIN%_COACH,TENURE_COACH


In [56]:
comp_team_stats = comp_team_stats.loc[:,['YEAR', 'TEAM NO', 'TEAM', 'CONF','CURRENT ROUND', 'WIN', 'SEED',
       'K OFF', 'K DEF', 'AVG HGT', 'EFF HGT',
       'BADJ O_A', 'BADJ D_A', 'WIN%_A', 'EFG%_A', 'EFG%D_A', 'PPPO_A',
       'PPPD_A', 'BADJ O_H', 'BADJ D_H', 'WIN%_H', 'EFG%_H', 'EFG%D_H',
       'PPPO_H', 'PPPD_H', 'BADJ O_N', 'BADJ D_N', 'WIN%_N', 'EFG%_N',
       'EFG%D_N', 'PPPO_N', 'PPPD_N', 'TR RATING', 'V 1-25 WINS',
       'V 1-25 LOSS', 'V 26-50 WINS', 'V 26-50 LOSS', 'LUCK RATING',
       'CONSISTENCY TR RATING', 'NET RPI', 'RESUME', 'WAB RANK', 'ELO', 'Q1 W',
       'Q2 W', 'Q3 Q4 L', 'PLUS 500', 'R SCORE', 'BADJ EM', 'FTR', 'FTRD',
       'BADJ T', 'TOV%', 'TOV%D', 'OREB%', 'OP OREB%', 'RAW T', 'WAB', '2PT%',
       '2PT%D', '3PT%', '3PT%D', 'BLK%', 'BLKED%', 'AST%', 'OP AST%', '2PTR',
       '3PTR', '2PTRD', '3PTRD', 'ELITE SOS', 'PAKE', 'PASE','R64', 'R32',
       'S16', 'E8', 'F4', 'F2', 'CHAMP', 'TOP2','PAKE_CONF', 'PASE_CONF', 'R64_CONF', 'R32_CONF', 'S16_CONF', 'E8_CONF',
       'F4_CONF', 'F2_CONF', 'CHAMP_CONF', 'TOP2_CONF', 'WIN%_CONF',
       'R64_SEED', 'R32_SEED', 'S16_SEED', 'E8_SEED', 'F4_SEED', 'F2_SEED',
       'CHAMP_SEED', 'TOP2_SEED', 'WIN%_SEED', 'WIN%_COACH', 'TENURE_COACH']]

Since data was pulled before all the First Four games are finalized there are more rows in the matchups to account for all possible senarios. We'll then create different verisions of matchups to cover all basis. At time two of these games are finialized (Alabama St. and UNC). Will create different versions of matchup:
 - v1 where American and Texas get in
 - v2 American and Xavier
 - v3 Mount St. Mary's and Texas
 - v4 Mount St. Mary's and Xavier

In [57]:
matchups25_v1 = comp_team_stats.drop([2,3,12,13,38,39,64,65])
matchups25_v2 = comp_team_stats.drop([2,3,12,13,38,39,62,63])
matchups25_v3 = comp_team_stats.drop([2,3,12,13,36,37,64,65])
matchups25_v4 = comp_team_stats.drop([2,3,12,13,36,37,62,63])

## Version 1 Sim

In [378]:
sim_comp_regloss = sim_tourney(matchups25_v1,best_model)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [385]:
sim_comp_regloss['4']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Auburn,SEC,4,1
22,Connecticut,BE,4,0
36,Duke,ACC,4,0
54,Houston,B12,4,1


In [380]:
sim_comp_custloss = sim_tourney(matchups25_v1,best_model_custloss)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [384]:
sim_comp_custloss['4']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Auburn,SEC,4,1
22,Connecticut,BE,4,0
36,Duke,ACC,4,1
54,Houston,B12,4,0


## Version 2

In [386]:
sim_comp_reglossv2 = sim_tourney(matchups25_v2,best_model)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [389]:
sim_comp_reglossv2['2']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Auburn,SEC,2,0
54,Houston,B12,2,1


In [390]:
sim_comp_custlossv2 = sim_tourney(matchups25_v1,best_model_custloss)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [392]:
sim_comp_custlossv2['2']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Auburn,SEC,2,0
36,Duke,ACC,2,1


## Version 3

In [393]:
sim_comp_reglossv3 = sim_tourney(matchups25_v3,best_model)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [395]:
sim_comp_reglossv3['2']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Auburn,SEC,2,0
54,Houston,B12,2,1


In [396]:
sim_comp_custlossv3 = sim_tourney(matchups25_v3,best_model_custloss)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [408]:
sim_comp_custlossv3['16']

Unnamed: 0,TEAM,CONF,CURRENT ROUND,WINNER
0,Auburn,SEC,16,1
6,Michigan,B10,16,0
10,Mississippi,SEC,16,1
18,Michigan St.,B10,16,0
22,Connecticut,BE,16,1
26,Maryland,B10,16,0
30,Texas Tech,B12,16,0
33,Arkansas,SEC,16,1
38,Duke,ACC,16,1
42,Oregon,B10,16,0


## Version 4

In [58]:
sim_comp_reglossv4 = sim_tourney(matchups25_v4,best_model)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [59]:
sim_comp_reglossv4

{'64':            TEAM  CONF  CURRENT ROUND  WINNER
 0        Auburn   SEC             64       1
 1   Alabama St.  SWAC             64       0
 4    Louisville   ACC             64       0
 5     Creighton    BE             64       1
 6      Michigan   B10             64       1
 ..          ...   ...            ...     ...
 67         Troy    SB             64       0
 68         UCLA   B10             64       1
 69     Utah St.   MWC             64       0
 70    Tennessee   SEC             64       1
 71      Wofford    SC             64       0
 
 [64 rows x 4 columns],
 '32':             TEAM  CONF  CURRENT ROUND  WINNER
 0         Auburn   SEC             32       1
 5      Creighton    BE             32       0
 6       Michigan   B10             32       1
 8      Texas A&M   SEC             32       0
 10   Mississippi   SEC             32       1
 14      Iowa St.   B12             32       0
 16     Marquette    BE             32       0
 18  Michigan St.   B10           

In [61]:
sim_comp_custlossv4 = sim_tourney(matchups25_v4,best_model_custloss)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winner['CURRENT ROUND'] = rd // 2
A value is trying to be set on a

In [62]:
sim_comp_custlossv4

{'64':            TEAM  CONF  CURRENT ROUND  WINNER
 0        Auburn   SEC             64       1
 1   Alabama St.  SWAC             64       0
 4    Louisville   ACC             64       1
 5     Creighton    BE             64       0
 6      Michigan   B10             64       1
 ..          ...   ...            ...     ...
 67         Troy    SB             64       0
 68         UCLA   B10             64       1
 69     Utah St.   MWC             64       0
 70    Tennessee   SEC             64       1
 71      Wofford    SC             64       0
 
 [64 rows x 4 columns],
 '32':             TEAM  CONF  CURRENT ROUND  WINNER
 0         Auburn   SEC             32       1
 4     Louisville   ACC             32       0
 6       Michigan   B10             32       0
 8      Texas A&M   SEC             32       1
 10   Mississippi   SEC             32       1
 14      Iowa St.   B12             32       0
 16     Marquette    BE             32       0
 18  Michigan St.   B10           