In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the two input tables: the full transformed game-level data and the
# table holding the columns used for modelling.
all_features = pd.read_csv('../../data/transformed_data_men_21_to_24.csv')
modeling_features = pd.read_csv('../../data/model_features.csv')

# Preview the modelling table
modeling_features.head()

Unnamed: 0,won,underdog,underdog_opp,fan_favorite,fan_favorite_opp,de,oe,te,pace,physicality_score,...,luck_opp,choke_rate,choke_rate_opp,upset_rate,upset_rate_opp,3mean_plus_minus,3mean_plus_minus_opp,win_streak,win_streak_opp,matchup_win_rate
0,1,0,0,1,0,83.955939,105.172414,1.25271,72.0,9.405172,...,0.0,0.0,,0.0,,14.333333,-3.333333,0,0,
1,1,0,0,0,1,83.955939,105.172414,1.25271,72.0,9.405172,...,0.571429,0.0,,0.0,,14.333333,-0.333333,1,2,
2,1,0,0,1,0,83.955939,105.172414,1.25271,72.0,9.405172,...,0.444444,0.0,,0.0,0.0,14.333333,-5.666667,2,1,
3,1,0,0,1,0,83.955939,105.172414,1.25271,72.0,9.405172,...,0.0,0.0,,0.0,,14.333333,-10.333333,3,0,0.5
4,0,1,0,0,1,83.955939,105.172414,1.25271,72.0,9.405172,...,0.333333,0.0,0.3,0.0,0.444444,13.666667,21.333333,4,2,


In [3]:
# Preview the full table, which carries team names, scores and other
# descriptive columns in addition to the modelling columns.
all_features.head()

Unnamed: 0,team,opponent,won,team_score,opponent_score,team_rank,opponent_rank,game_round,season_type,date,...,luck_opp,choke_rate,choke_rate_opp,upset_rate,upset_rate_opp,3mean_plus_minus,3mean_plus_minus_opp,win_streak,win_streak_opp,matchup_win_rate
0,Abilene Christian,East Tennessee State,1,70.0,47.0,,,,regular_season,2020/11/25,...,0.0,0.0,,0.0,,14.333333,-3.333333,0,0,
1,Abilene Christian,Austin Peay,1,80.0,72.0,,,,regular_season,2020/11/27,...,0.571429,0.0,,0.0,,14.333333,-0.333333,1,2,
2,Abilene Christian,Nebraska Omaha,1,70.0,58.0,,,,regular_season,2020/11/28,...,0.444444,0.0,,0.0,0.0,14.333333,-5.666667,2,1,
3,Abilene Christian,Tarleton State,1,69.0,48.0,,,,regular_season,2020/12/05,...,0.0,0.0,,0.0,,14.333333,-10.333333,3,0,
4,Abilene Christian,Texas Tech,0,44.0,51.0,,17.0,,regular_season,2020/12/09,...,0.333333,0.0,0.3,0.0,0.444444,13.666667,21.333333,4,2,


#### Identify the qualitative fields to remove, keeping the fields needed for failure analysis (team names, seeds, ranks, scores, plus_minus, season_year, season_type, game_round)

In [4]:
# Non-modelling columns that are kept on purpose: team identity, seeding and
# ranking, scores, and season context.
exception_fields = [
    'team', 'seed', 'team_rank',
    'opponent', 'seed_opp', 'opponent_rank',
    'team_score', 'opponent_score', 'plus_minus',
    'season_year', 'season_type', 'game_round',
]

# Every remaining column that is neither a modelling column nor a retained
# field is treated as qualitative and listed for removal.
qualitative_fields = all_features.columns[
    ~all_features.columns.isin([*modeling_features.columns, *exception_fields])
].tolist()
qualitative_fields

['date',
 'home_game',
 'g',
 'w',
 'l',
 'g_opp',
 'w_opp',
 'l_opp',
 'conf',
 'conf_opp']

In [10]:
# Set baseline f1-score: fit a gradient-boosted classifier on all game rows and
# score it on a random 20% hold-out.

# Drop the qualitative columns. To restrict the baseline to tournament games,
# filter on `season_type == 'ncaa_tournament'` before dropping.
df = all_features.drop(columns=qualitative_fields)

# Separate the target from the remaining columns
X = df.drop(columns=['won'])
y = df['won']

# NOTE(review): the split is a plain random row split. If each game is stored
# once per team perspective, both copies can land on opposite sides of the
# split -- confirm against the data preparation step.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Keep the complete held-out rows (names, seeds, scores, ...) so predictions can
# be inspected next to them in the failure analysis.
df_pred = X_test.copy()

# The retained descriptive columns (names, ranks, scores, plus_minus, ...) must
# not reach the model. Use non-inplace drops: the frames returned by
# train_test_split are selections of X, and dropping columns on them in place
# can raise pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns=exception_fields)
X_test = X_test.drop(columns=exception_fields)

# NOTE: the scalers are applied one after the other (StandardScaler, then
# MinMaxScaler) -- they are chained, not compared. Each one is fitted on the
# training split only and then applied to the test split.
for scaler in [StandardScaler(), MinMaxScaler()]:
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

model = HistGradientBoostingClassifier(
    learning_rate=0.05,
    min_samples_leaf=25,
    random_state=42,
)

# Train the model on the training set
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred)
print(f1)

0.7920176234287934


In [11]:
# Show up to 100 columns when rendering wide frames
pd.set_option('display.max_columns', 100)

In [12]:
# Attach the true outcome and the model output to the held-out rows, then flag
# each row as a hit (1) or a miss (0).
df_pred['won'] = y_test
df_pred['prediction'] = y_pred
df_pred['correct_prediction'] = (df_pred['won'] == df_pred['prediction']).astype('int64')

# Work on a freshly indexed copy and add the opponent's view of the matchup win rate
pred = df_pred.reset_index(drop=True)
pred['matchup_win_rate_opp'] = 1 - pred['matchup_win_rate']

# Mispredicted rows; show the tournament ones ordered by score margin
failures = pred[pred['correct_prediction'] == 0]
failures[failures['season_type'] == 'ncaa_tournament'].sort_values(by='plus_minus')

Unnamed: 0,team,opponent,team_score,opponent_score,team_rank,opponent_rank,game_round,season_type,season_year,underdog,underdog_opp,fan_favorite,fan_favorite_opp,de,oe,te,pace,physicality_score,sos,srs,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,drb,trb,ast,stl,blk,pf,tov,tov%,poss,ast_per_poss,ast_per_fg,tov_per_poss,ast_to_tov,poss_per_game,game_win_rate,gpt_sent_score_avg,de_opp,oe_opp,te_opp,pace_opp,physicality_score_opp,sos_opp,srs_opp,fg_opp,fga_opp,fg%_opp,3p_opp,3pa_opp,3p%_opp,ft_opp,fta_opp,ft%_opp,orb_opp,drb_opp,trb_opp,ast_opp,stl_opp,blk_opp,pf_opp,tov_opp,tov%_opp,poss_opp,ast_per_poss_opp,ast_per_fg_opp,tov_per_poss_opp,ast_to_tov_opp,poss_per_game_opp,game_win_rate_opp,gpt_sent_score_avg_opp,seed,seed_opp,conf_rank,conf_rank_opp,plus_minus,luck,luck_opp,choke_rate,choke_rate_opp,upset_rate,upset_rate_opp,3mean_plus_minus,3mean_plus_minus_opp,win_streak,win_streak_opp,matchup_win_rate,won,prediction,correct_prediction,matchup_win_rate_opp
1788,San Diego State,Connecticut,59.0,76.0,5.0,4.0,CHAMPIONSHIP,ncaa_tournament,2023,1,0,1,0,95.541401,107.122177,1.121212,66.0,8.275641,8.92,15.81,25.282051,57.717949,0.438,6.948718,19.948718,0.348,13.641026,18.897436,0.722,10.923077,25.384615,36.307692,12.923077,6.717949,3.871795,16.25641,11.589744,14.8,34.925,0.370024,0.511156,0.331847,1.115044,0.895513,0.820513,89.0,94.306184,115.535445,1.22511,68.0,9.262821,8.51,22.95,27.74359,59.820513,0.464,9.051282,24.948718,0.363,14.025641,18.435897,0.761,13.0,26.333333,39.333333,17.538462,6.282051,4.871795,17.74359,12.897436,15.8,36.398077,0.481851,0.632163,0.354344,1.359841,0.933284,0.794872,88.333333,5.0,4.0,9.0,2.0,-17.0,0.875,0.0,0.15,0.230769,0.625,0.8,3.0,21.333333,16,5,,0,1,0,
349,Saint Mary's (CA),UCLA,56.0,72.0,5.0,4.0,SECOND ROUND,ncaa_tournament,2022,1,0,1,0,93.257919,107.239819,1.149927,65.0,7.323529,6.58,14.82,26.117647,56.323529,0.464,7.235294,20.470588,0.353,10.235294,13.5,0.758,8.617647,25.088235,33.705882,13.470588,6.911765,2.529412,15.558824,11.235294,15.2,35.147794,0.383256,0.515766,0.319659,1.198953,1.033759,0.764706,82.0,95.948963,112.238171,1.16977,66.0,7.621429,8.36,19.3,27.628571,61.371429,0.45,6.771429,19.171429,0.353,13.371429,18.057143,0.741,11.257143,25.342857,36.6,13.885714,6.971429,3.314286,15.828571,8.942857,11.3,33.891429,0.409712,0.502585,0.263868,1.552716,0.968327,0.771429,80.0,5.0,4.0,7.0,6.0,-16.0,1.0,0.25,0.25,0.192308,0.5,0.666667,6.333333,2.0,1,1,,0,1,0,
6772,Saint Bonaventure,Louisiana State,61.0,76.0,9.0,8.0,FIRST ROUND,ncaa_tournament,2021,1,0,1,0,93.992674,107.765568,1.146532,65.0,7.964286,4.49,13.44,25.809524,58.285714,0.443,6.380952,18.571429,0.344,12.047619,16.333333,0.738,11.333333,25.571429,36.904762,14.571429,5.857143,3.952381,15.333333,10.714286,14.0,32.94881,0.442244,0.564576,0.32518,1.36,1.568991,0.761905,72.0,104.005722,113.066285,1.087116,72.0,8.353448,9.14,15.7,28.827586,62.862069,0.459,8.068966,23.241379,0.347,16.034483,21.344828,0.751,11.37931,27.0,38.37931,12.103448,7.758621,2.931034,17.413793,11.344828,13.5,38.931897,0.310888,0.419856,0.291402,1.066869,1.342479,0.655172,74.333333,9.0,8.0,15.0,5.0,-15.0,0.6,0.4,,,0.0,0.25,14.333333,2.666667,3,0,,0,1,0,
4629,Houston,Miami (FL),75.0,89.0,1.0,5.0,SWEET 16,ncaa_tournament,2023,0,1,0,1,89.780405,116.976351,1.302916,64.0,8.912162,4.79,22.2,27.297297,59.540541,0.458,7.675676,22.567568,0.34,12.594595,17.189189,0.733,12.810811,26.135135,38.945946,14.702703,7.918919,4.918919,16.702703,10.0,12.9,32.651351,0.450294,0.538614,0.306266,1.47027,0.882469,0.891892,85.333333,105.407011,115.864528,1.099211,68.0,7.972973,6.84,13.98,28.594595,59.594595,0.48,7.513514,20.405405,0.368,14.351351,18.297297,0.784,10.378378,24.351351,34.72973,14.459459,7.297297,3.108108,14.243243,11.108108,14.0,38.015541,0.380357,0.505671,0.292199,1.301703,1.027447,0.783784,82.0,1.0,5.0,8.0,2.0,-14.0,0.5,0.636364,0.090909,0.315789,0.75,0.857143,6.0,5.333333,2,2,,0,1,0,
7309,Miami (FL),Connecticut,59.0,72.0,5.0,4.0,FINAL FOUR,ncaa_tournament,2023,1,0,1,0,105.407011,115.864528,1.099211,68.0,7.972973,6.84,13.98,28.594595,59.594595,0.48,7.513514,20.405405,0.368,14.351351,18.297297,0.784,10.378378,24.351351,34.72973,14.459459,7.297297,3.108108,14.243243,11.108108,14.0,38.015541,0.380357,0.505671,0.292199,1.301703,1.027447,0.783784,82.0,94.306184,115.535445,1.22511,68.0,9.262821,8.51,22.95,27.74359,59.820513,0.464,9.051282,24.948718,0.363,14.025641,18.435897,0.761,13.0,26.333333,39.333333,17.538462,6.282051,4.871795,17.74359,12.897436,15.8,36.398077,0.481851,0.632163,0.354344,1.359841,0.933284,0.794872,88.333333,5.0,4.0,2.0,2.0,-13.0,0.636364,0.0,0.315789,0.230769,0.857143,0.8,12.333333,22.0,4,4,,0,1,0,
2193,Duke,Tennessee,52.0,65.0,5.0,4.0,SECOND ROUND,ncaa_tournament,2023,1,0,1,0,97.524621,110.385946,1.131878,65.0,8.548611,7.44,15.83,26.0,57.555556,0.452,6.805556,20.305556,0.335,13.194444,17.222222,0.766,12.0,26.666667,38.666667,14.611111,5.666667,4.527778,15.25,12.0,15.4,34.180556,0.427469,0.561966,0.351077,1.217593,0.94946,0.75,85.666667,88.751664,108.469524,1.222169,65.0,9.180556,7.98,20.84,25.222222,58.25,0.433,7.666667,23.333333,0.329,12.638889,17.861111,0.708,12.888889,25.5,38.388889,16.694444,8.25,3.611111,16.555556,11.972222,15.2,32.789583,0.509139,0.661894,0.365123,1.394432,0.910822,0.694444,77.333333,5.0,4.0,2.0,5.0,-13.0,0.833333,0.666667,0.25,0.333333,0.5,0.8,13.333333,3.333333,9,1,,0,1,0,
5542,Georgia Tech,Loyola Chicago,60.0,71.0,9.0,8.0,FIRST ROUND,ncaa_tournament,2021,1,0,1,0,102.081144,109.200476,1.069742,67.0,7.894231,7.91,12.79,27.769231,58.230769,0.477,7.384615,21.192308,0.348,12.0,16.538462,0.726,7.884615,22.384615,30.269231,15.846154,9.076923,3.461538,15.769231,11.153846,14.4,38.894231,0.407417,0.570637,0.286774,1.42069,1.495932,0.653846,67.0,88.279365,111.847619,1.266974,63.0,7.129032,1.37,15.12,25.935484,52.064516,0.498,7.129032,19.548387,0.365,12.032258,16.741935,0.719,7.419355,25.096774,32.516129,15.806452,6.935484,2.419355,14.322581,11.741935,16.4,38.210484,0.413668,0.609453,0.307296,1.346154,1.232596,0.83871,76.666667,9.0,8.0,2.0,12.0,-11.0,0.5,0.333333,,0.125,0.5,0.5,6.666667,16.666667,7,6,,0,1,0,
5300,Utah State,Missouri,65.0,76.0,10.0,7.0,FIRST ROUND,ncaa_tournament,2023,1,0,1,0,100.923204,112.888751,1.118561,69.0,7.271429,6.81,14.23,27.057143,56.428571,0.479,9.285714,24.0,0.387,14.771429,19.371429,0.763,8.571429,26.942857,35.514286,16.714286,5.0,3.342857,16.828571,12.171429,15.6,39.858571,0.41934,0.61774,0.305365,1.373239,1.138816,0.742857,84.333333,105.531915,111.935157,1.060676,70.0,8.228571,6.89,11.41,28.228571,59.771429,0.472,9.285714,25.828571,0.36,13.171429,17.371429,0.758,8.857143,21.371429,30.228571,15.942857,10.2,2.714286,18.028571,11.142857,14.1,38.765714,0.411262,0.564777,0.287441,1.430769,1.107592,0.714286,76.0,10.0,7.0,9.0,5.0,-11.0,0.5,1.0,,0.666667,0.0,0.583333,6.666667,0.666667,1,0,,0,1,0,
2750,Houston,Villanova,44.0,50.0,5.0,2.0,ELITE 8,ncaa_tournament,2022,1,0,1,0,89.521423,114.016537,1.273623,65.0,9.407895,6.47,22.55,27.736842,59.657895,0.465,7.842105,23.184211,0.338,11.526316,17.289474,0.667,13.315789,25.684211,39.0,16.236842,8.026316,5.157895,17.157895,11.131579,14.1,33.765132,0.480876,0.585389,0.329677,1.458629,0.888556,0.842105,85.333333,99.214156,113.494666,1.143936,63.0,7.105263,10.29,19.31,24.263158,56.131579,0.432,9.342105,26.0,0.359,13.868421,16.710526,0.83,10.315789,24.473684,34.789474,11.868421,6.026316,2.184211,14.894737,9.894737,13.4,31.779605,0.37346,0.489154,0.311355,1.199468,0.836305,0.789474,71.333333,5.0,2.0,8.0,2.0,-6.0,0.4,0.666667,0.125,0.173913,0.6,0.727273,13.666667,12.666667,6,8,,0,1,0,
3610,Missouri,Oklahoma,68.0,72.0,9.0,8.0,FIRST ROUND,ncaa_tournament,2021,1,0,1,0,101.768707,103.891156,1.020856,70.0,8.326923,10.87,12.37,25.692308,57.576923,0.446,7.076923,21.807692,0.325,14.961538,21.346154,0.701,9.807692,25.884615,35.692308,13.269231,6.653846,3.538462,19.692308,13.307692,16.4,39.331731,0.337367,0.516467,0.338345,0.99711,1.512759,0.615385,76.0,99.847793,106.51843,1.066808,69.0,7.833333,8.86,13.52,26.740741,60.37037,0.443,7.851852,23.037037,0.341,13.185185,17.703704,0.745,9.592593,26.148148,35.740741,13.296296,7.148148,3.62963,14.740741,10.962963,13.7,36.52037,0.364079,0.49723,0.300188,1.212838,1.352606,0.592593,73.0,9.0,8.0,5.0,1.0,-4.0,0.857143,0.5,0.461538,0.5,0.571429,0.454545,-3.0,-1.666667,0,0,,0,1,0,


In [13]:
# Index labels (in `pred` / `failures`) of the mispredicted games picked for a closer look
failures_to_examine = [349, 4200, 1788]

# Move the descriptive columns into the index so that only the metrics remain as columns
df_f = (
    failures
    .loc[failures_to_examine, :]
    .set_index(keys=['team', 'opponent', 'seed', 'seed_opp', 'season_year',
                     'game_round', 'prediction', 'won', 'plus_minus'])
)

In [24]:
# Metrics to put side by side for the examined games
fields_to_compare = [
    'tov_per_poss',
    'game_win_rate',
    'fan_favorite',
    'ast_per_poss',
    'sos',
    'te',
    'matchup_win_rate',
]

# Each metric together with its opponent counterpart, in alphabetical order so
# that every `<metric>` / `<metric>_opp` pair ends up adjacent
fields_to_compare_complete = sorted(
    [*fields_to_compare, *(f'{name}_opp' for name in fields_to_compare)]
)

In [25]:
# Spot-check: every row in the full table where Houston is the team and Baylor the opponent
all_features[all_features['team'].eq('Houston') & all_features['opponent'].eq('Baylor')]

Unnamed: 0,team,opponent,won,team_score,opponent_score,team_rank,opponent_rank,game_round,season_type,date,season_year,home_game,underdog,underdog_opp,fan_favorite,fan_favorite_opp,g,w,l,de,oe,te,pace,physicality_score,sos,srs,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,drb,trb,ast,stl,blk,pf,tov,tov%,poss,ast_per_poss,ast_per_fg,tov_per_poss,ast_to_tov,poss_per_game,...,oe_opp,te_opp,pace_opp,physicality_score_opp,sos_opp,srs_opp,fg_opp,fga_opp,fg%_opp,3p_opp,3pa_opp,3p%_opp,ft_opp,fta_opp,ft%_opp,orb_opp,drb_opp,trb_opp,ast_opp,stl_opp,blk_opp,pf_opp,tov_opp,tov%_opp,poss_opp,ast_per_poss_opp,ast_per_fg_opp,tov_per_poss_opp,ast_to_tov_opp,poss_per_game_opp,game_win_rate_opp,gpt_sent_score_avg_opp,seed,conf,seed_opp,conf_opp,conf_rank,conf_rank_opp,plus_minus,luck,luck_opp,choke_rate,choke_rate_opp,upset_rate,upset_rate_opp,3mean_plus_minus,3mean_plus_minus_opp,win_streak,win_streak_opp,matchup_win_rate
12450,Houston,Baylor,0,59.0,78.0,2.0,1.0,FINAL FOUR,ncaa_tournament,2021/04/03,2021,0,1,0,1,0,32.0,28.0,4.0,88.210227,115.246212,1.306495,66.0,9.398438,5.37,21.66,26.6875,61.28125,0.435,9.09375,25.71875,0.354,13.59375,18.59375,0.731,14.3125,26.1875,40.5,13.9375,7.96875,4.65625,17.96875,10.65625,13.2,31.863281,0.437416,0.522248,0.334437,1.307918,0.995728,...,119.646401,1.266293,69.0,9.141667,7.4,24.83,30.566667,62.866667,0.486,9.966667,24.133333,0.413,11.8,16.666667,0.708,12.466667,23.5,35.966667,16.9,8.9,3.7,17.1,11.5,14.0,37.516667,0.450466,0.55289,0.30653,1.469565,1.250556,0.933333,82.0,2.0,Amer,1.0,B12,8.0,1.0,-19.0,0.666667,,0.125,0.083333,0.833333,1.0,8.333333,11.0,10,4,
12552,Houston,Baylor,1,82.0,76.0,2.0,11.0,,regular_season,2024/02/24,2024,0,0,1,0,1,28.0,25.0,3.0,87.057522,114.49115,1.315121,64.0,9.5625,8.99,26.71,26.571429,61.25,0.434,7.464286,21.821429,0.342,13.321429,19.357143,0.688,14.285714,24.5,38.785714,13.071429,10.428571,4.821429,17.964286,8.714286,11.0,30.194643,0.432906,0.491935,0.288604,1.5,1.07838,...,120.421607,1.153458,66.0,8.455357,10.31,20.6,28.107143,57.714286,0.487,8.857143,22.321429,0.397,16.535714,22.571429,0.733,11.535714,23.678571,35.214286,15.321429,6.892857,3.321429,16.785714,12.071429,15.0,39.364286,0.389222,0.545108,0.306659,1.269231,1.405867,0.714286,82.0,,B12,,B12,1.0,1.0,6.0,0.5,0.428571,0.111111,0.238095,,0.25,11.333333,7.666667,4,0,0.0


In [26]:
# Transposed comparison: one column per examined game, one row per metric.
# To export the untransposed selection instead, call
# .to_csv('../../data/failure_analysis.csv') on df_f[fields_to_compare_complete].
df_f.loc[:, fields_to_compare_complete].transpose()

team,Saint Mary's (CA),Arkansas,San Diego State
opponent,UCLA,Kansas,Connecticut
seed,5.0,8.0,5.0
seed_opp,4.0,1.0,4.0
season_year,2022,2023,2023
game_round,SECOND ROUND,SECOND ROUND,CHAMPIONSHIP
prediction,1,0,1
won,0,1,0
plus_minus,-16.0,1.0,-17.0
ast_per_poss,0.383256,0.322908,0.370024
ast_per_poss_opp,0.409712,0.422477,0.481851
fan_favorite,1.0,1.0,1.0
fan_favorite_opp,0.0,0.0,0.0
game_win_rate,0.764706,0.611111,0.820513
game_win_rate_opp,0.771429,0.777778,0.794872
matchup_win_rate,,,
matchup_win_rate_opp,,,
sos,6.58,9.87,8.92
sos_opp,8.36,11.84,8.51


1. From a seeding perspective, the only one of these matchups in which the team should possibly carry the `fan_favorite` (underdog) flag is Arkansas vs. Kansas. However, the simple seeding/ranking calculation may be too superficial, since San Diego State deserved an underdog rating vs. UConn by every other measure.
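2. Tournament data vs. regular-season data: tournament games are only a small share of the rows (400 of 39,518), and accuracy on the held-out tournament games is noticeably lower than on regular-season games (about 0.63 vs. 0.80; see the cells below).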
3. The matchup win rate calculation may be too superficial: win recency and/or frequency should be factored into the score somehow.

In [22]:
# Share of correct predictions per season type (mean of the 0/1 hit flag)
pred.groupby('season_type')[['correct_prediction']].mean()

Unnamed: 0_level_0,correct_prediction
season_type,Unnamed: 1_level_1
ncaa_tournament,0.630435
regular_season,0.798899


In [29]:
# Number of rows with a recorded outcome per season type in the full table
all_features.groupby('season_type')[['won']].count()

Unnamed: 0_level_0,won
season_type,Unnamed: 1_level_1
ncaa_tournament,400
regular_season,39118
