In [1]:
import os
import re
import sklearn
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from collections import Counter
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

pd.set_option('display.max_columns', None)

In [2]:
DATA_PATH = 'march-machine-learning-mania-2023/'

#for filename in sorted(os.listdir(DATA_PATH)):
    #print(filename)

# Data Preparation

## Seeds

In [3]:
df_seeds = pd.concat([
    pd.read_csv(DATA_PATH + "MNCAATourneySeeds.csv"),
    pd.read_csv(DATA_PATH + "WNCAATourneySeeds.csv"),
], ignore_index=True)
    
df_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


## Season results

In [4]:
df_season_results = pd.concat([
    pd.read_csv(DATA_PATH + "MRegularSeasonDetailedResults.csv"),
    pd.read_csv(DATA_PATH + "WRegularSeasonDetailedResults.csv"),
], ignore_index=True)
df_season_results.drop(['NumOT', 'WLoc'], axis=1, inplace=True)

In [5]:
df_season_results['ScoreGap'] = df_season_results['WScore'] - df_season_results['LScore']

In [6]:
df_season_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,ScoreGap
0,2003,10,1104,68,1328,62,27,58,3,14,11,18,14,24,13,23,7,1,22,22,53,2,10,16,22,10,22,8,18,9,2,20,6
1,2003,10,1272,70,1393,63,26,62,8,20,10,19,15,28,16,13,4,4,18,24,67,6,24,9,20,20,25,7,12,8,6,16,7
2,2003,11,1266,73,1437,61,24,58,8,18,17,29,17,26,15,10,5,2,25,22,73,3,26,14,23,31,22,9,12,2,5,23,12
3,2003,11,1296,56,1457,50,18,38,3,9,17,31,6,19,11,12,14,2,18,18,49,6,22,8,15,17,20,9,19,4,3,23,6
4,2003,11,1400,77,1208,71,30,61,6,14,11,13,17,22,12,14,4,4,20,24,62,6,16,17,27,21,15,12,10,7,1,14,6


### In Game Stats

In [7]:
#Calculate IN Game Stats Here
df_season_results['ScoreGap'] = df_season_results['WScore'] - df_season_results['LScore']
#Possessions  (FGA – OR) + TO + (Y * FTA)
df_season_results['WPOS'] = (df_season_results['WFGA'] - df_season_results['WOR']) + df_season_results['WTO'] + (.44 * df_season_results['WFTA'])
df_season_results['LPOS'] = (df_season_results['LFGA'] - df_season_results['LOR']) + df_season_results['LTO'] + (.44 * df_season_results['LFTA'])
#Points Per Possession Game['WPPP'] = Team['WScore'] / Team['WNumPos']
df_season_results['WPPP'] = df_season_results['WScore'] / df_season_results['WPOS']
df_season_results['LPPP'] = df_season_results['LScore'] / df_season_results['LPOS']
#Field Goal Percentage ((Team['WScore'] - Team['WFTM']) / 2) / Team['WFGA']
df_season_results['WFGP'] = ((df_season_results['WScore'] - df_season_results['WFTM']) / 2) / df_season_results['WFGA']
df_season_results['LFGP'] = ((df_season_results['LScore'] - df_season_results['LFTM']) / 2) / df_season_results['LFGA']
#Turn Over Rate Game['WTORate'] = Team['WTO'] / Team['WNumPos']
df_season_results['WTORate'] = df_season_results['WTO'] / df_season_results['WPOS']
df_season_results['LTORate'] = df_season_results['LTO'] / df_season_results['LPOS']
#Offensive Field Goal Efficency Game['WOff_eFG'] = (Team['WFGM'] + 0.5 * Team['WFGM3']) / Team['WFGA']
df_season_results['WOff_eFG'] = (df_season_results['WFGM'] + 0.5 * df_season_results['WFGM3']) / df_season_results['WFGA']
df_season_results['LOff_eFG'] = (df_season_results['LFGM'] + 0.5 * df_season_results['LFGM3']) / df_season_results['LFGA']
#Defensive Field Goal Efficency Game['WDef_eFG'] = (Team['LFGM'] + 0.5 * Team['LFGM3']) / Team['LFGA']
df_season_results['WDef_eFG'] = (df_season_results['LFGM'] + 0.5 * df_season_results['LFGM3']) / df_season_results['LFGA']
df_season_results['LDef_eFG'] = (df_season_results['WFGM'] + 0.5 * df_season_results['WFGM3']) / df_season_results['WFGA']
#Offensive Rebound Percentage Game['WORB'] = Team['WOR'] / (Team['WOR'] + Team['LDR'])
df_season_results['WORP'] = df_season_results['WOR'] / (df_season_results['WOR'] + df_season_results['LDR'])
df_season_results['LORP'] = df_season_results['LOR'] / (df_season_results['LOR'] + df_season_results['WDR'])
#Defensive Rebound Percentage Game['WDRB'] = Team['WDR'] / (Team['LOR'] + Team['WDR'])
df_season_results['WDRP'] = df_season_results['WDR'] / (df_season_results['LOR'] + df_season_results['WDR'])
df_season_results['LDRP'] = df_season_results['LDR'] / (df_season_results['WOR'] + df_season_results['LDR'])
#Percentage of Field Goals Assisted Game['WFGAst'] = Team['WAst'] / Team['WFGM']
df_season_results['WFGAst'] = df_season_results['WAst'] / df_season_results['WFGM']
df_season_results['LFGAst'] = df_season_results['LAst'] / df_season_results['LFGM']
#Assists to Turnover Ratio Game['WAstTO'] = Team['WAst'] / Team['WTO']
df_season_results['WAstTO'] = df_season_results['WAst'] / df_season_results['WTO']
df_season_results['LAstTO'] = df_season_results['LAst'] / df_season_results['LTO']
#Free Throw Percentage Game['WFTP'] = Team['WFTM'] / Team['WFTA']
df_season_results['WFTP'] = df_season_results['WFTM'] / df_season_results['WFTA']
df_season_results['LFTP'] = df_season_results['LFTM'] / df_season_results['LFTA']
#Free Throw Rate (Free Throws per FGAs) Game['WFT_Rate'] = Team['WFTA'] / Team['WFGA']
df_season_results['WFT_Rate'] = df_season_results['WFTA'] / df_season_results['WFGA']
df_season_results['LFT_Rate'] = df_season_results['LFTA'] / df_season_results['LFGA']
#True Shooting Attempts Game['WTSA'] = Team['WFGA'] + 0.4 * Team['WFTA']
df_season_results['WTSA'] = df_season_results['WFGA'] + 0.44 * df_season_results['WFTA']
df_season_results['LTSA'] = df_season_results['LFGA'] + 0.44 * df_season_results['LFTA']
#True Shooting Percentage Game['WTru'] = Team['WScore'] / (2*Team['WTSA'])
df_season_results['WTru'] = df_season_results['WScore'] / (2 * df_season_results['WTSA'])
df_season_results['LTru'] = df_season_results['LScore'] / (2 * df_season_results['LTSA'])

### Features

In [8]:
num_win = df_season_results.groupby(['Season', 'WTeamID']).count()
num_win = num_win.reset_index()[['Season', 'WTeamID', 'DayNum']].rename(columns={"DayNum": "NumWins", "WTeamID": "TeamID"})

In [9]:
num_win.head() #num losses should have the same thing so no need to run it

Unnamed: 0,Season,TeamID,NumWins
0,2003,1102,12
1,2003,1103,13
2,2003,1104,17
3,2003,1105,7
4,2003,1106,13


In [10]:
df_season_results.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WFGM',
       'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO',
       'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA',
       'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 'ScoreGap', 'WPOS',
       'LPOS', 'WPPP', 'LPPP', 'WFGP', 'LFGP', 'WTORate', 'LTORate',
       'WOff_eFG', 'LOff_eFG', 'WDef_eFG', 'LDef_eFG', 'WORP', 'LORP', 'WDRP',
       'LDRP', 'WFGAst', 'LFGAst', 'WAstTO', 'LAstTO', 'WFTP', 'LFTP',
       'WFT_Rate', 'LFT_Rate', 'WTSA', 'LTSA', 'WTru', 'LTru'],
      dtype='object')

In [11]:
num_loss = df_season_results.groupby(['Season', 'LTeamID']).count()
num_loss = num_loss.reset_index()[['Season', 'LTeamID', 'DayNum']].rename(columns={"DayNum": "NumLosses", "LTeamID": "TeamID"})

In [12]:
gap_win = df_season_results.groupby(['Season', 'WTeamID']).mean().reset_index()

gap_win = gap_win[['Season', 'WTeamID', 'ScoreGap', 'WPOS','WPPP', 'WFGP', 'WTORate','WOff_eFG',
                   'WDef_eFG','WORP','WDRP','WFGAst','WAstTO','WFTP','WFT_Rate','WTru']].rename(
    columns={
    "ScoreGap": "GapWins", "WTeamID": "TeamID", "WPOS": "POS", "WPPP":"PPP", "WFGP":"FGP", "WTORate":"TORate",
    "WOff_eFG":"Off_eFG", "WDef_eFG":"Def_eFG", "WORP":"ORP", "WDRP":"DRP", "WFGAst":"FGAst", "WAstTO":"AstTO",
        "WFTP":"FTP", "WFT_Rate":"FT_Rate","WTru":"Tru"})

In [13]:
gap_win.head()

Unnamed: 0,Season,TeamID,GapWins,POS,PPP,FGP,TORate,Off_eFG,Def_eFG,ORP,DRP,FGAst,AstTO,FTP,FT_Rate,Tru
0,2003,1102,15.583333,56.38,1.224741,0.692162,0.194352,0.692162,0.454576,0.172188,0.636526,0.746082,1.87577,0.631688,0.541714,0.702724
1,2003,1103,9.384615,72.144615,1.218303,0.592604,0.175252,0.592604,0.510207,0.31851,0.620109,0.593458,1.534233,0.722603,0.554488,0.638244
2,2003,1104,13.176471,67.795294,1.103197,0.506056,0.191923,0.506056,0.438473,0.367049,0.703299,0.551302,1.177279,0.709384,0.392488,0.549145
3,2003,1105,13.0,76.6,1.036343,0.4884,0.235009,0.4884,0.434355,0.383144,0.667596,0.618892,0.942261,0.743782,0.425332,0.541645
4,2003,1106,10.384615,67.907692,1.005881,0.517668,0.261162,0.517668,0.385205,0.378952,0.714061,0.530133,0.756597,0.591138,0.395797,0.544102


In [14]:
gap_loss = df_season_results.groupby(['Season', 'LTeamID']).mean().reset_index()
gap_loss = gap_loss[['Season', 'LTeamID', 'ScoreGap', 'LPOS', 'LPPP', 'LFGP', 'LTORate','LOff_eFG','LDef_eFG',
                     'LORP','LDRP','LFGAst','LAstTO','LFTP','LFT_Rate','LTru']].rename(
    columns={"ScoreGap": "GapLosses", "LTeamID": "TeamID", "LPOS":"POS","LPPP":"PPP","LFGP":"FGP","LTORate":"TORate","LOff_eFG":"Off_eFG",
            "LDef_eFG":"Def_eFG","LORP":"ORP","LDRP":"DRP","LFGAst":"FGAst","LAstTO":"AstTO","LFTP":"FTP","LFT_Rate":"FT_Rate","LTru":"Tru"})

In [15]:
gap_loss.head()

Unnamed: 0,Season,TeamID,GapLosses,POS,PPP,FGP,TORate,Off_eFG,Def_eFG,ORP,DRP,FGAst,AstTO,FTP,FT_Rate,Tru
0,2003,1102,11.25,53.2,0.91724,0.503591,0.216997,0.503591,0.558816,0.165271,0.625778,0.604671,0.999534,0.650437,0.375427,0.533891
1,2003,1103,7.5,68.16,1.02843,0.484527,0.187378,0.484527,0.585211,0.294003,0.633394,0.529464,1.129718,0.747034,0.382163,0.537125
2,2003,1104,9.454545,63.483636,0.955762,0.429002,0.215706,0.429002,0.54443,0.377756,0.66155,0.431948,0.706652,0.698651,0.341227,0.479359
3,2003,1105,11.473684,76.301053,0.903714,0.446777,0.246354,0.446777,0.536922,0.317489,0.631796,0.583636,0.832372,0.697004,0.335248,0.490594
4,2003,1106,9.266667,66.736,0.891659,0.450522,0.246039,0.450522,0.511333,0.323938,0.649253,0.467619,0.694235,0.650908,0.231093,0.479614


Merge

In [16]:
df_features_season_w = df_season_results.groupby(['Season', 'WTeamID']).count().reset_index()[['Season', 'WTeamID']].rename(columns={"WTeamID": "TeamID"})
df_features_season_l = df_season_results.groupby(['Season', 'LTeamID']).count().reset_index()[['Season', 'LTeamID']].rename(columns={"LTeamID": "TeamID"})

In [17]:
df_features_season_w.head()

Unnamed: 0,Season,TeamID
0,2003,1102
1,2003,1103
2,2003,1104
3,2003,1105
4,2003,1106


In [18]:
df_features_season = pd.concat([df_features_season_w, df_features_season_l], axis=0).drop_duplicates().sort_values(['Season', 'TeamID']).reset_index(drop=True)

In [19]:
df_gap_loss_subset = gap_loss[['Season', 'TeamID', 'GapLosses']] 
df_gap_win_subset = gap_win[['Season', 'TeamID', 'GapWins']]

In [20]:
df_features_season = df_features_season.merge(num_win, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(num_loss, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(df_gap_win_subset, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(df_gap_loss_subset, on=['Season', 'TeamID'], how='left')

In [21]:
df_features_season.head()

Unnamed: 0,Season,TeamID,NumWins,NumLosses,GapWins,GapLosses
0,2003,1102,12.0,16.0,15.583333,11.25
1,2003,1103,13.0,14.0,9.384615,7.5
2,2003,1104,17.0,11.0,13.176471,9.454545
3,2003,1105,7.0,19.0,13.0,11.473684
4,2003,1106,13.0,15.0,10.384615,9.266667


In [22]:
win_features = gap_loss.drop('GapLosses', axis=1) #dropping the gap loss
loss_features = gap_win.drop('GapWins', axis=1) #dropping the gap loss

In [23]:
avg_loss_features = loss_features.set_index(['Season', 'TeamID'])
avg_win_features = win_features.set_index(['Season', 'TeamID'])

features = (avg_loss_features + avg_win_features) / 2

features.reset_index(inplace=True)
features.head()

Unnamed: 0,Season,TeamID,POS,PPP,FGP,TORate,Off_eFG,Def_eFG,ORP,DRP,FGAst,AstTO,FTP,FT_Rate,Tru
0,2003,1102,54.79,1.070991,0.597877,0.205675,0.597877,0.506696,0.168729,0.631152,0.675376,1.437652,0.641063,0.45857,0.618308
1,2003,1103,70.152308,1.123367,0.538566,0.181315,0.538566,0.547709,0.306256,0.626752,0.561461,1.331976,0.734818,0.468326,0.587684
2,2003,1104,65.639465,1.02948,0.467529,0.203814,0.467529,0.491451,0.372403,0.682424,0.491625,0.941966,0.704018,0.366857,0.514252
3,2003,1105,76.450526,0.970029,0.467589,0.240682,0.467589,0.485638,0.350317,0.649696,0.601264,0.887316,0.720393,0.38029,0.51612
4,2003,1106,67.321846,0.94877,0.484095,0.253601,0.484095,0.448269,0.351445,0.681657,0.498876,0.725416,0.621023,0.313445,0.511858


In [24]:
df_features_season = df_features_season.merge(features, on=['Season', 'TeamID'], how='left')

In [25]:
df_features_season.fillna(0, inplace=True)  

In [26]:
df_features_season['Num_Games'] = df_features_season['NumWins'] + df_features_season['NumLosses']

In [27]:
df_features_season.head()

Unnamed: 0,Season,TeamID,NumWins,NumLosses,GapWins,GapLosses,POS,PPP,FGP,TORate,Off_eFG,Def_eFG,ORP,DRP,FGAst,AstTO,FTP,FT_Rate,Tru,Num_Games
0,2003,1102,12.0,16.0,15.583333,11.25,54.79,1.070991,0.597877,0.205675,0.597877,0.506696,0.168729,0.631152,0.675376,1.437652,0.641063,0.45857,0.618308,28.0
1,2003,1103,13.0,14.0,9.384615,7.5,70.152308,1.123367,0.538566,0.181315,0.538566,0.547709,0.306256,0.626752,0.561461,1.331976,0.734818,0.468326,0.587684,27.0
2,2003,1104,17.0,11.0,13.176471,9.454545,65.639465,1.02948,0.467529,0.203814,0.467529,0.491451,0.372403,0.682424,0.491625,0.941966,0.704018,0.366857,0.514252,28.0
3,2003,1105,7.0,19.0,13.0,11.473684,76.450526,0.970029,0.467589,0.240682,0.467589,0.485638,0.350317,0.649696,0.601264,0.887316,0.720393,0.38029,0.51612,26.0
4,2003,1106,13.0,15.0,10.384615,9.266667,67.321846,0.94877,0.484095,0.253601,0.484095,0.448269,0.351445,0.681657,0.498876,0.725416,0.621023,0.313445,0.511858,28.0


Compute Features

In [28]:
df_features_season['WinRatio'] = df_features_season['NumWins'] / (df_features_season['NumWins'] + df_features_season['NumLosses'])
df_features_season['GapAvg'] = (
    (df_features_season['NumWins'] * df_features_season['GapWins'] - 
    df_features_season['NumLosses'] * df_features_season['GapLosses'])
    / (df_features_season['NumWins'] + df_features_season['NumLosses'])
)

In [29]:
df_features_season.drop(['NumWins', 'NumLosses', 'GapWins', 'GapLosses', 'Num_Games'], axis=1, inplace=True)

At this point here we have all the features of the team's averages when they win / lose

In [30]:
df_features_season.head()

Unnamed: 0,Season,TeamID,POS,PPP,FGP,TORate,Off_eFG,Def_eFG,ORP,DRP,FGAst,AstTO,FTP,FT_Rate,Tru,WinRatio,GapAvg
0,2003,1102,54.79,1.070991,0.597877,0.205675,0.597877,0.506696,0.168729,0.631152,0.675376,1.437652,0.641063,0.45857,0.618308,0.428571,0.25
1,2003,1103,70.152308,1.123367,0.538566,0.181315,0.538566,0.547709,0.306256,0.626752,0.561461,1.331976,0.734818,0.468326,0.587684,0.481481,0.62963
2,2003,1104,65.639465,1.02948,0.467529,0.203814,0.467529,0.491451,0.372403,0.682424,0.491625,0.941966,0.704018,0.366857,0.514252,0.607143,4.285714
3,2003,1105,76.450526,0.970029,0.467589,0.240682,0.467589,0.485638,0.350317,0.649696,0.601264,0.887316,0.720393,0.38029,0.51612,0.269231,-4.884615
4,2003,1106,67.321846,0.94877,0.484095,0.253601,0.484095,0.448269,0.351445,0.681657,0.498876,0.725416,0.621023,0.313445,0.511858,0.464286,-0.142857


In [31]:
df_features_season.shape

(12135, 17)

## Tourney Results

In [32]:
df_tourney_results = pd.concat([
    pd.read_csv(DATA_PATH + "WNCAATourneyCompactResults.csv"),
    pd.read_csv(DATA_PATH + "MNCAATourneyCompactResults.csv"),
], ignore_index=True)
df_tourney_results.drop(['NumOT', 'WLoc'], axis=1, inplace=True)

In [33]:
df_tourney_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore
0,1998,137,3104,94,3422,46
1,1998,137,3112,75,3365,63
2,1998,137,3163,93,3193,52
3,1998,137,3198,59,3266,45
4,1998,137,3203,74,3208,72


## Feature Engineering

### Train Data

In [34]:
df = df_tourney_results.copy()
df = df[df['Season'] >= 2010].reset_index(drop=True)

df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore
0,2010,138,3124,69,3201,55
1,2010,138,3173,67,3395,66
2,2010,138,3181,72,3214,37
3,2010,138,3199,75,3256,61
4,2010,138,3207,62,3265,42


In [35]:
df = pd.merge(
    df, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'WTeamID'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedW'})

In [36]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW
0,2010,138,3124,69,3201,55,X04
1,2010,138,3173,67,3395,66,X08
2,2010,138,3181,72,3214,37,X02
3,2010,138,3199,75,3256,61,W03
4,2010,138,3207,62,3265,42,X05


In [37]:
df = pd.merge(
    df, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'LTeamID'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedL'})

In [38]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL
0,2010,138,3124,69,3201,55,X04,X13
1,2010,138,3173,67,3395,66,X08,X09
2,2010,138,3181,72,3214,37,X02,X15
3,2010,138,3199,75,3256,61,W03,W14
4,2010,138,3207,62,3265,42,X05,X12


In [39]:
def treat_seed(seed):
    return int(re.sub("[^0-9]", "", seed))

In [40]:
df['SeedW'] = df['SeedW'].apply(treat_seed)
df['SeedL'] = df['SeedL'].apply(treat_seed)

In [41]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL
0,2010,138,3124,69,3201,55,4,13
1,2010,138,3173,67,3395,66,8,9
2,2010,138,3181,72,3214,37,2,15
3,2010,138,3199,75,3256,61,3,14
4,2010,138,3207,62,3265,42,5,12


### Season Stats

In [42]:
df = pd.merge(
    df,
    df_features_season,
    how='left',
    left_on=['Season', 'WTeamID'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsW',
    'NumLosses': 'NumLossesW',
    'GapWins': 'GapWinsW',
    'GapLosses': 'GapLossesW',
    'WinRatio': 'WinRatioW',
    'GapAvg': 'GapAvgW',
    #NEW FEATURES
    'POS': 'POSW',
    'PPP':'PPPW',
    'FGP':'FGPW',
    'TORate':'TORateW',
    'Off_eFG': 'Off_eFGW',
    'Def_eFG':'Def_eFGW',
    'ORP':'ORPW',
    'DRP':'DRPW',
    'FGAst':'FGAstW',
    'AstTO':'AstTOW',
    'FTP': 'FTPW',
    'FT_Rate': 'FT_RateW',
    'Tru':'TruW'
}).drop(columns='TeamID', axis=1)

In [43]:
df = pd.merge(
    df,
    df_features_season,
    how='left',
    left_on=['Season', 'LTeamID'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsL',
    'NumLosses': 'NumLossesL',
    'GapWins': 'GapWinsL',
    'GapLosses': 'GapLossesL',
    'WinRatio': 'WinRatioL',
    'GapAvg': 'GapAvgL',
    #NEW FEATURES
    'POS': 'POSL',
    'PPP':'PPPL',
    'FGP':'FGPL',
    'TORate':'TORateL',
    'Off_eFG': 'Off_eFGL',
    'Def_eFG':'Def_eFGL',
    'ORP':'ORPL',
    'DRP':'DRPL',
    'FGAst':'FGAstL',
    'AstTO':'AstTOL',
    'FTP': 'FTPL',
    'FT_Rate': 'FT_RateL',
    'Tru':'TruL'
}).drop(columns='TeamID', axis=1)

In [44]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL,POSW,PPPW,FGPW,TORateW,Off_eFGW,Def_eFGW,ORPW,DRPW,FGAstW,AstTOW,FTPW,FT_RateW,TruW,WinRatioW,GapAvgW,POSL,PPPL,FGPL,TORateL,Off_eFGL,Def_eFGL,ORPL,DRPL,FGAstL,AstTOL,FTPL,FT_RateL,TruL,WinRatioL,GapAvgL
0,2010,138,3124,69,3201,55,4,13,71.389662,0.943135,0.450636,0.232878,0.450636,0.397767,0.339747,0.703923,0.545561,0.88734,0.733631,0.407474,0.505605,0.71875,15.25,70.888148,0.952554,0.452832,0.204399,0.452832,0.455427,0.32394,0.650504,0.523829,0.945212,0.695118,0.266067,0.48797,0.818182,12.878788
1,2010,138,3173,67,3395,66,8,9,72.061524,0.932544,0.444534,0.237103,0.444534,0.441518,0.367651,0.686852,0.493228,0.769111,0.701292,0.29968,0.486072,0.807692,11.269231,73.059545,0.941538,0.449193,0.229476,0.449193,0.435968,0.330679,0.636856,0.609427,0.981146,0.765098,0.296938,0.495891,0.733333,12.0
2,2010,138,3181,72,3214,37,2,15,72.881926,0.904706,0.414786,0.254444,0.414786,0.450766,0.440257,0.650455,0.489958,0.713278,0.665684,0.291757,0.452396,0.84375,16.53125,69.580478,0.890866,0.404425,0.233744,0.404425,0.421934,0.351524,0.646252,0.502414,0.798722,0.713828,0.317405,0.454595,0.633333,7.7
3,2010,138,3199,75,3256,61,3,14,73.3864,0.930726,0.447253,0.26029,0.447253,0.438582,0.391366,0.664062,0.585201,0.777135,0.706554,0.328566,0.489607,0.833333,14.366667,75.561087,0.952841,0.447306,0.227836,0.447306,0.433594,0.360982,0.684207,0.511793,0.833962,0.688641,0.374008,0.493961,0.741935,9.935484
4,2010,138,3207,62,3265,42,5,12,69.559167,0.921404,0.413695,0.23575,0.413695,0.458839,0.411665,0.59345,0.652369,0.982511,0.736504,0.303365,0.4591,0.8,9.666667,66.119341,0.978926,0.466927,0.21119,0.466927,0.428923,0.297306,0.643749,0.650662,1.088279,0.821386,0.309054,0.521809,0.787879,10.272727


In [45]:
df.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'SeedW',
       'SeedL', 'POSW', 'PPPW', 'FGPW', 'TORateW', 'Off_eFGW', 'Def_eFGW',
       'ORPW', 'DRPW', 'FGAstW', 'AstTOW', 'FTPW', 'FT_RateW', 'TruW',
       'WinRatioW', 'GapAvgW', 'POSL', 'PPPL', 'FGPL', 'TORateL', 'Off_eFGL',
       'Def_eFGL', 'ORPL', 'DRPL', 'FGAstL', 'AstTOL', 'FTPL', 'FT_RateL',
       'TruL', 'WinRatioL', 'GapAvgL'],
      dtype='object')

In [46]:
min_day_num = df['DayNum'].min()
print(min_day_num)
# df.shape

134


### Add Symetrical 
- Right now our data only consists of won matches
- We duplicate our data, get rid of the winner loser

In [47]:
def add_loosing_matches(df):
    win_rename = {
        "WTeamID": "TeamIdA", 
        "WScore" : "ScoreA", 
        "LTeamID" : "TeamIdB",
        "LScore": "ScoreB",
     }
    win_rename.update({c : c[:-1] + "A" for c in df.columns if c.endswith('W')})
    win_rename.update({c : c[:-1] + "B" for c in df.columns if c.endswith('L')})
    
    lose_rename = {
        "WTeamID": "TeamIdB", 
        "WScore" : "ScoreB", 
        "LTeamID" : "TeamIdA",
        "LScore": "ScoreA",
    }
    lose_rename.update({c : c[:-1] + "B" for c in df.columns if c.endswith('W')})
    lose_rename.update({c : c[:-1] + "A" for c in df.columns if c.endswith('L')})
    
    win_df = df.copy()
    lose_df = df.copy()
    
    win_df = win_df.rename(columns=win_rename)
    lose_df = lose_df.rename(columns=lose_rename)
    
    return pd.concat([win_df, lose_df], axis=0, sort=False)

In [48]:
df = add_loosing_matches(df)

### Differences
- We compute the difference between the team for each feature.
- This helps further assessing how better (or worse) team A is from team B

In [49]:
cols_to_diff = [ #15.283636
    'Seed', 'WinRatio', 'GapAvg', 'POS', 'PPP', 'FGP', 'TORate', 'Off_eFG', 'Def_eFG',
    'ORP', 'DRP', 'FGAst', 'AstTO', 'FTP', 'FT_Rate', 'Tru'
]

for col in cols_to_diff:
    df[col + 'Diff'] = df[col + 'A'] - df[col + 'B']

In [50]:
# min_day_num = df['DayNum'].min()
# print(min_day_num)
df.head()

Unnamed: 0,Season,DayNum,TeamIdA,ScoreA,TeamIdB,ScoreB,SeedA,SeedB,POSA,PPPA,FGPA,TORateA,Off_eFGA,Def_eFGA,ORPA,DRPA,FGAstA,AstTOA,FTPA,FT_RateA,TruA,WinRatioA,GapAvgA,POSB,PPPB,FGPB,TORateB,Off_eFGB,Def_eFGB,ORPB,DRPB,FGAstB,AstTOB,FTPB,FT_RateB,TruB,WinRatioB,GapAvgB,SeedDiff,WinRatioDiff,GapAvgDiff,POSDiff,PPPDiff,FGPDiff,TORateDiff,Off_eFGDiff,Def_eFGDiff,ORPDiff,DRPDiff,FGAstDiff,AstTODiff,FTPDiff,FT_RateDiff,TruDiff
0,2010,138,3124,69,3201,55,4,13,71.389662,0.943135,0.450636,0.232878,0.450636,0.397767,0.339747,0.703923,0.545561,0.88734,0.733631,0.407474,0.505605,0.71875,15.25,70.888148,0.952554,0.452832,0.204399,0.452832,0.455427,0.32394,0.650504,0.523829,0.945212,0.695118,0.266067,0.48797,0.818182,12.878788,-9,-0.099432,2.371212,0.501514,-0.009419,-0.002197,0.028479,-0.002197,-0.05766,0.015807,0.053418,0.021732,-0.057872,0.038513,0.141407,0.017635
1,2010,138,3173,67,3395,66,8,9,72.061524,0.932544,0.444534,0.237103,0.444534,0.441518,0.367651,0.686852,0.493228,0.769111,0.701292,0.29968,0.486072,0.807692,11.269231,73.059545,0.941538,0.449193,0.229476,0.449193,0.435968,0.330679,0.636856,0.609427,0.981146,0.765098,0.296938,0.495891,0.733333,12.0,-1,0.074359,-0.730769,-0.998022,-0.008994,-0.004659,0.007627,-0.004659,0.00555,0.036972,0.049996,-0.116199,-0.212034,-0.063806,0.002742,-0.009819
2,2010,138,3181,72,3214,37,2,15,72.881926,0.904706,0.414786,0.254444,0.414786,0.450766,0.440257,0.650455,0.489958,0.713278,0.665684,0.291757,0.452396,0.84375,16.53125,69.580478,0.890866,0.404425,0.233744,0.404425,0.421934,0.351524,0.646252,0.502414,0.798722,0.713828,0.317405,0.454595,0.633333,7.7,-13,0.210417,8.83125,3.301447,0.01384,0.010361,0.0207,0.010361,0.028832,0.088733,0.004204,-0.012456,-0.085443,-0.048144,-0.025648,-0.002199
3,2010,138,3199,75,3256,61,3,14,73.3864,0.930726,0.447253,0.26029,0.447253,0.438582,0.391366,0.664062,0.585201,0.777135,0.706554,0.328566,0.489607,0.833333,14.366667,75.561087,0.952841,0.447306,0.227836,0.447306,0.433594,0.360982,0.684207,0.511793,0.833962,0.688641,0.374008,0.493961,0.741935,9.935484,-11,0.091398,4.431183,-2.174687,-0.022115,-5.2e-05,0.032454,-5.2e-05,0.004987,0.030384,-0.020145,0.073408,-0.056827,0.017914,-0.045442,-0.004354
4,2010,138,3207,62,3265,42,5,12,69.559167,0.921404,0.413695,0.23575,0.413695,0.458839,0.411665,0.59345,0.652369,0.982511,0.736504,0.303365,0.4591,0.8,9.666667,66.119341,0.978926,0.466927,0.21119,0.466927,0.428923,0.297306,0.643749,0.650662,1.088279,0.821386,0.309054,0.521809,0.787879,10.272727,-7,0.012121,-0.606061,3.439826,-0.057522,-0.053233,0.024559,-0.053233,0.029917,0.114359,-0.050299,0.001707,-0.105768,-0.084882,-0.005689,-0.062709


## Test Data

### Preparing

In [51]:
df_test = pd.read_csv(DATA_PATH + "SampleSubmission2023.csv")

In [52]:
df_test['Season'] = df_test['ID'].apply(lambda x: int(x.split('_')[0]))
df_test['TeamIdA'] = df_test['ID'].apply(lambda x: int(x.split('_')[1]))
df_test['TeamIdB'] = df_test['ID'].apply(lambda x: int(x.split('_')[2]))

In [53]:
df_test.head()

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB
0,2023_1101_1102,0.5,2023,1101,1102
1,2023_1101_1103,0.5,2023,1101,1103
2,2023_1101_1104,0.5,2023,1101,1104
3,2023_1101_1105,0.5,2023,1101,1105
4,2023_1101_1106,0.5,2023,1101,1106


### Seeds
- Seeds are not released yet, so i filled missing values with "WO1"

In [54]:
df_test = pd.merge(
    df_test,
    df_seeds,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedA'}).fillna('W01')

In [55]:
df_test = pd.merge(
    df_test, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'TeamIdB'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedB'}).fillna('W01')

In [56]:
df_test['SeedA'] = df_test['SeedA'].apply(treat_seed)
df_test['SeedB'] = df_test['SeedB'].apply(treat_seed)

### Season Stats

In [57]:
df_test = pd.merge(
    df_test,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsA',
    'NumLosses': 'NumLossesA',
    'GapWins': 'GapWinsA',
    'GapLosses': 'GapLossesA',
    'WinRatio': 'WinRatioA',
    'GapAvg': 'GapAvgA',
    
    'POS': 'POSA',
    'PPP':'PPPA',
    'FGP':'FGPA',
    'TORate':'TORateA',
    'Off_eFG':'Off_eFGA',
    'Def_eFG':'Def_eFGA',
    'ORP':'ORPA',
    'DRP':'DRPA',
    'FGAst':'FGAstA',
    'AstTO':'AstTOA',
    'FTP':'FTPA',
    'FT_Rate':'FT_RateA',
    'Tru':'TruA'
}).drop(columns='TeamID', axis=1)

In [58]:
df_test = pd.merge(
    df_test,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdB'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'NumWins': 'NumWinsB',
    'NumLosses': 'NumLossesB',
    'GapWins': 'GapWinsB',
    'GapLosses': 'GapLossesB',
    'WinRatio': 'WinRatioB',
    'GapAvg': 'GapAvgB', 

    'POS': 'POSB',
    'PPP':'PPPB',
    'FGP':'FGPB',
    'TORate':'TORateB',
    'Off_eFG':'Off_eFGB',
    'Def_eFG':'Def_eFGB',
    'ORP':'ORPB',
    'DRP':'DRPB',
    'FGAst':'FGAstB',
    'AstTO':'AstTOB',
    'FTP':'FTPB',
    'FT_Rate':'FT_RateB',
    'Tru':'TruB'
}).drop(columns='TeamID', axis=1)

In [59]:
df_test.head()

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB,SeedA,SeedB,POSA,PPPA,FGPA,TORateA,Off_eFGA,Def_eFGA,ORPA,DRPA,FGAstA,AstTOA,FTPA,FT_RateA,TruA,WinRatioA,GapAvgA,POSB,PPPB,FGPB,TORateB,Off_eFGB,Def_eFGB,ORPB,DRPB,FGAstB,AstTOB,FTPB,FT_RateB,TruB,WinRatioB,GapAvgB
0,2023_1101_1102,0.5,2023,1101,1102,1,1,70.369402,1.044996,0.519349,0.167225,0.519349,0.560199,0.233766,0.716143,0.568745,1.450701,0.706358,0.343506,0.558316,0.409091,-2.863636,63.036381,1.074998,0.554375,0.168254,0.554375,0.493387,0.205383,0.730455,0.651418,1.634207,0.7119,0.327102,0.585144,0.482759,0.241379
1,2023_1101_1103,0.5,2023,1101,1103,1,1,70.369402,1.044996,0.519349,0.167225,0.519349,0.560199,0.233766,0.716143,0.568745,1.450701,0.706358,0.343506,0.558316,0.409091,-2.863636,67.611895,1.03274,0.491541,0.164004,0.491541,0.504538,0.29258,0.755935,0.489782,1.20035,0.715106,0.334507,0.534447,0.653846,5.653846
2,2023_1101_1104,0.5,2023,1101,1104,1,1,70.369402,1.044996,0.519349,0.167225,0.519349,0.560199,0.233766,0.716143,0.568745,1.450701,0.706358,0.343506,0.558316,0.409091,-2.863636,74.93087,1.046355,0.516288,0.206208,0.516288,0.469309,0.291406,0.77655,0.49549,0.969561,0.734499,0.401293,0.564103,0.851852,15.111111
3,2023_1101_1105,0.5,2023,1101,1105,1,1,70.369402,1.044996,0.519349,0.167225,0.519349,0.560199,0.233766,0.716143,0.568745,1.450701,0.706358,0.343506,0.558316,0.409091,-2.863636,71.213194,0.984732,0.499684,0.188523,0.499684,0.490304,0.275131,0.689697,0.495421,1.017891,0.64749,0.358612,0.530904,0.36,-3.56
4,2023_1101_1106,0.5,2023,1101,1106,1,1,70.369402,1.044996,0.519349,0.167225,0.519349,0.560199,0.233766,0.716143,0.568745,1.450701,0.706358,0.343506,0.558316,0.409091,-2.863636,68.759143,0.959792,0.442721,0.181671,0.442721,0.467496,0.309192,0.659406,0.466995,0.905496,0.73119,0.349493,0.494248,0.259259,-9.703704


### Differences

In [60]:
for col in cols_to_diff:
    df_test[col + 'Diff'] = df_test[col + 'A'] - df_test[col + 'B']

## Target

In [61]:
df['ScoreDiff'] = df['ScoreA'] - df['ScoreB']
df['WinA'] = (df['ScoreDiff'] > 0).astype(int)

# Modeling 

In [62]:
features = [
       'SeedA', 'SeedB', 'POSA',
       'PPPA', 'FGPA', 'TORateA', 'Off_eFGA', 'Def_eFGA', 'ORPA', 'DRPA',
       'FGAstA', 'AstTOA', 'FTPA', 'FT_RateA', 'TruA',
       'WinRatioA', 'GapAvgA', 'POSB', 'PPPB', 'FGPB', 'TORateB', 'Off_eFGB',
       'Def_eFGB', 'ORPB', 'DRPB', 'FGAstB', 'AstTOB', 'FTPB', 'FT_RateB',
       'TruB', 'WinRatioB', 'GapAvgB', 'SeedDiff',
       'WinRatioDiff', 'GapAvgDiff', 'POSDiff', 'PPPDiff', 'FGPDiff',
       'TORateDiff', 'Off_eFGDiff', 'Def_eFGDiff', 'ORPDiff', 'DRPDiff',
       'FGAstDiff', 'AstTODiff', 'FTPDiff', 'FT_RateDiff', 'TruDiff'
]

In [63]:
df.columns

Index(['Season', 'DayNum', 'TeamIdA', 'ScoreA', 'TeamIdB', 'ScoreB', 'SeedA',
       'SeedB', 'POSA', 'PPPA', 'FGPA', 'TORateA', 'Off_eFGA', 'Def_eFGA',
       'ORPA', 'DRPA', 'FGAstA', 'AstTOA', 'FTPA', 'FT_RateA', 'TruA',
       'WinRatioA', 'GapAvgA', 'POSB', 'PPPB', 'FGPB', 'TORateB', 'Off_eFGB',
       'Def_eFGB', 'ORPB', 'DRPB', 'FGAstB', 'AstTOB', 'FTPB', 'FT_RateB',
       'TruB', 'WinRatioB', 'GapAvgB', 'SeedDiff', 'WinRatioDiff',
       'GapAvgDiff', 'POSDiff', 'PPPDiff', 'FGPDiff', 'TORateDiff',
       'Off_eFGDiff', 'Def_eFGDiff', 'ORPDiff', 'DRPDiff', 'FGAstDiff',
       'AstTODiff', 'FTPDiff', 'FT_RateDiff', 'TruDiff', 'ScoreDiff', 'WinA'],
      dtype='object')

In [64]:
df_test.columns

Index(['ID', 'Pred', 'Season', 'TeamIdA', 'TeamIdB', 'SeedA', 'SeedB', 'POSA',
       'PPPA', 'FGPA', 'TORateA', 'Off_eFGA', 'Def_eFGA', 'ORPA', 'DRPA',
       'FGAstA', 'AstTOA', 'FTPA', 'FT_RateA', 'TruA', 'WinRatioA', 'GapAvgA',
       'POSB', 'PPPB', 'FGPB', 'TORateB', 'Off_eFGB', 'Def_eFGB', 'ORPB',
       'DRPB', 'FGAstB', 'AstTOB', 'FTPB', 'FT_RateB', 'TruB', 'WinRatioB',
       'GapAvgB', 'SeedDiff', 'WinRatioDiff', 'GapAvgDiff', 'POSDiff',
       'PPPDiff', 'FGPDiff', 'TORateDiff', 'Off_eFGDiff', 'Def_eFGDiff',
       'ORPDiff', 'DRPDiff', 'FGAstDiff', 'AstTODiff', 'FTPDiff',
       'FT_RateDiff', 'TruDiff'],
      dtype='object')

In [65]:
print(features)

['SeedA', 'SeedB', 'POSA', 'PPPA', 'FGPA', 'TORateA', 'Off_eFGA', 'Def_eFGA', 'ORPA', 'DRPA', 'FGAstA', 'AstTOA', 'FTPA', 'FT_RateA', 'TruA', 'WinRatioA', 'GapAvgA', 'POSB', 'PPPB', 'FGPB', 'TORateB', 'Off_eFGB', 'Def_eFGB', 'ORPB', 'DRPB', 'FGAstB', 'AstTOB', 'FTPB', 'FT_RateB', 'TruB', 'WinRatioB', 'GapAvgB', 'SeedDiff', 'WinRatioDiff', 'GapAvgDiff', 'POSDiff', 'PPPDiff', 'FGPDiff', 'TORateDiff', 'Off_eFGDiff', 'Def_eFGDiff', 'ORPDiff', 'DRPDiff', 'FGAstDiff', 'AstTODiff', 'FTPDiff', 'FT_RateDiff', 'TruDiff']


In [66]:
X = df[features]
Y = df['WinA']

In [67]:
def rescale(features, df_train, df_val, df_test=None):
    min_ = df_train[features].min()
    max_ = df_train[features].max()
    
    df_train[features] = (df_train[features] - min_) / (max_ - min_)
    df_val[features] = (df_val[features] - min_) / (max_ - min_)
    
    if df_test is not None:
        df_test[features] = (df_test[features] - min_) / (max_ - min_)
        
    return df_train, df_val, df_test

### Cross Validation 
-

#### Below is a version of my try at kflold with the model

In [72]:
# WITH PRED_TESTS 
def my_kfold2(df, features, df_test_=None, mode="", n_splits=5, start_season_index=7):
    cvs = []
    pred_tests = []
    scaler = StandardScaler() 
    kf = KFold(n_splits=n_splits)

    seasons = df['Season'].unique()
    
    for train_index, test_index in kf.split(seasons[start_season_index:]):
        train_seasons = seasons[start_season_index:][train_index]
        test_seasons = seasons[start_season_index:][test_index]
        
        print(f'\nTraining on seasons {train_seasons}, Validating on seasons {test_seasons}')
        X_train = df[df['Season'].isin(train_seasons)].reset_index(drop=True).copy()
        X_val = df[df['Season'].isin(test_seasons)].reset_index(drop=True).copy()
        df_test = df_test_.copy()
        
        # Convert DataFrames to NumPy arrays
        x_train, y_train = X_train[features].values, X_train["WinA"].values
        x_val, y_val = X_val[features].values, X_val["WinA"].values

        x_train = scaler.fit_transform(x_train)
        x_val = scaler.transform(x_val)
        
        if mode == "xgb":
            model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
            model.fit(x_train, y_train)
        elif mode == "random_forest":
            model = RandomForestClassifier(random_state=42, n_jobs=-1)
            model.fit(x_train, y_train)
        elif mode == "reg":
            model = LogisticRegression(random_state=42) 
            model.fit(x_train, y_train)
        
        pred = model.predict_proba(x_val)[:, 1]
        pred = np.clip(pred, 0, 1)
        
        if df_test is not None:
            df_test = df_test[features].values
            if mode == "reg":
                pred_test = model.predict(df_test)
                pred_test = (pred_test - pred_test.min()) / (pred_test.max() - pred_test.min())
            else:
                pred_test = model.predict_proba(df_test)[:, 1]
            pred_tests.append(pred_test)
            
        score = ((y_val - pred) ** 2).mean()
        accuracy =  (y_val == (pred > 0.5)).mean()
        precision = precision_score(y_val, (pred > 0.5))
        recall = recall_score(y_val, (pred > 0.5))
        f1 = f1_score(y_val, (pred > 0.5))
        auc_roc = roc_auc_score(y_val, pred)
        print(f'\t -> Score: {score:.3f}, Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, AUC-ROC: {auc_roc:.3f}')

        cvs.append(score)
    print(f'\n Local CV is {np.mean(cvs):.3f}')
    print(f'\t -> Average Accuracy: {np.mean(cvs):.3f}')

    return pred_tests

In [69]:
result = my_kfold2(df,features, df_test, mode="random_forest")


Training on seasons [2018 2019 2021 2022], Validating on seasons [2017]
	 -> Score: 0.181, Accuracy: 0.715, Precision: 0.733, Recall: 0.677, F1: 0.704, AUC-ROC: 0.803

Training on seasons [2017 2019 2021 2022], Validating on seasons [2018]
	 -> Score: 0.193, Accuracy: 0.681, Precision: 0.679, Recall: 0.685, F1: 0.682, AUC-ROC: 0.775

Training on seasons [2017 2018 2021 2022], Validating on seasons [2019]
	 -> Score: 0.166, Accuracy: 0.731, Precision: 0.727, Recall: 0.738, F1: 0.733, AUC-ROC: 0.843

Training on seasons [2017 2018 2019 2022], Validating on seasons [2021]
	 -> Score: 0.186, Accuracy: 0.717, Precision: 0.719, Recall: 0.713, F1: 0.716, AUC-ROC: 0.791

Training on seasons [2017 2018 2019 2021], Validating on seasons [2022]
	 -> Score: 0.190, Accuracy: 0.716, Precision: 0.713, Recall: 0.724, F1: 0.719, AUC-ROC: 0.783

 Local CV is 0.183
	 -> Average Accuracy: 0.183


In [70]:
result_2 =  my_kfold2(df,features, df_test, mode="xgb")


Training on seasons [2018 2019 2021 2022], Validating on seasons [2017]
	 -> Score: 0.242, Accuracy: 0.677, Precision: 0.680, Recall: 0.669, F1: 0.674, AUC-ROC: 0.787

Training on seasons [2017 2019 2021 2022], Validating on seasons [2018]
	 -> Score: 0.270, Accuracy: 0.669, Precision: 0.672, Recall: 0.662, F1: 0.667, AUC-ROC: 0.750

Training on seasons [2017 2018 2021 2022], Validating on seasons [2019]
	 -> Score: 0.191, Accuracy: 0.731, Precision: 0.738, Recall: 0.715, F1: 0.727, AUC-ROC: 0.837

Training on seasons [2017 2018 2019 2022], Validating on seasons [2021]
	 -> Score: 0.218, Accuracy: 0.705, Precision: 0.712, Recall: 0.690, F1: 0.701, AUC-ROC: 0.789

Training on seasons [2017 2018 2019 2021], Validating on seasons [2022]
	 -> Score: 0.246, Accuracy: 0.675, Precision: 0.679, Recall: 0.664, F1: 0.672, AUC-ROC: 0.748

 Local CV is 0.233
	 -> Average Accuracy: 0.233


In [73]:
result_3 =  my_kfold2(df,features, df_test, mode="reg")


Training on seasons [2018 2019 2021 2022], Validating on seasons [2017]
	 -> Score: 0.170, Accuracy: 0.754, Precision: 0.754, Recall: 0.754, F1: 0.754, AUC-ROC: 0.828

Training on seasons [2017 2019 2021 2022], Validating on seasons [2018]
	 -> Score: 0.199, Accuracy: 0.685, Precision: 0.685, Recall: 0.685, F1: 0.685, AUC-ROC: 0.773

Training on seasons [2017 2018 2021 2022], Validating on seasons [2019]
	 -> Score: 0.153, Accuracy: 0.762, Precision: 0.762, Recall: 0.762, F1: 0.762, AUC-ROC: 0.866

Training on seasons [2017 2018 2019 2022], Validating on seasons [2021]
	 -> Score: 0.187, Accuracy: 0.752, Precision: 0.752, Recall: 0.752, F1: 0.752, AUC-ROC: 0.792

Training on seasons [2017 2018 2019 2021], Validating on seasons [2022]
	 -> Score: 0.196, Accuracy: 0.724, Precision: 0.724, Recall: 0.724, F1: 0.724, AUC-ROC: 0.771

 Local CV is 0.181
	 -> Average Accuracy: 0.181


### Submission 

In [77]:
pred_test = np.mean(result, 0)
df_test['pred'] = pred_test

# _ = sns.displot(pred_test)

In [78]:
final_sub = df_test[['ID', 'pred']].copy()
final_sub.to_csv('submission.csv', index=False)

In [79]:
final_sub.head()

Unnamed: 0,ID,pred
0,2023_1101_1102,0.528
1,2023_1101_1103,0.512
2,2023_1101_1104,0.536
3,2023_1101_1105,0.53
4,2023_1101_1106,0.512
