In [2]:
import numpy as np
import sklearn
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import isotonic
import itertools
import pickle

In [3]:
seeds = pd.read_csv(r'../input/mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneySeeds.csv')
tourney_results = pd.read_csv(r'../input/mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneyCompactResults.csv')
regular_results = pd.read_csv(r'../input/mens-march-mania-2022/MDataFiles_Stage1/MRegularSeasonCompactResults.csv')
regular_results_detailed = pd.read_csv('../input/mens-march-mania-2022/MDataFiles_Stage1/MRegularSeasonDetailedResults.csv')
tourney_results_detailed = pd.read_csv('../input/mens-march-mania-2022/MDataFiles_Stage1/MNCAATourneyDetailedResults.csv')

H1_team_quality = pd.read_csv(r'../input/team-quality-rankings/H1_team_quality.csv')
H2_team_quality = pd.read_csv(r'../input/team-quality-rankings/H2_team_quality.csv')
Full_team_quality = pd.read_csv(r'../input/team-quality-rankings/Full_team_quality.csv')


In [4]:
H1_team_quality['TeamID'] = H1_team_quality['TeamID'].astype(str)
H2_team_quality['TeamID'] = H2_team_quality['TeamID'].astype(str)
Full_team_quality['TeamID'] = Full_team_quality['TeamID'].astype(str)

H1_team_quality['Season'] = H1_team_quality['Season'].astype(str)
H2_team_quality['Season'] = H2_team_quality['Season'].astype(str)
Full_team_quality['Season'] = Full_team_quality['Season'].astype(str)

In [5]:
H1_team_quality['quality'] = H1_team_quality.groupby('Season').rank()['quality']
H2_team_quality['quality'] = H2_team_quality.groupby('Season').rank()['quality']
Full_team_quality['quality'] = Full_team_quality.groupby('Season').rank()['quality']

# Generating Matchups for Stage 1

In [6]:
def matchupGenerator(season):
    matchups = list(itertools.combinations(np.array(seeds[seeds.Season == season].TeamID), 2))
    matchups = pd.DataFrame(matchups)
    matchups['Season'] = season
    matchups.columns = ['T1_TeamID', 'T2_TeamID', 'Season']
    
    matchups['IDLow'] = np.where(matchups['T1_TeamID'] < matchups['T2_TeamID'], matchups['T1_TeamID'], matchups['T2_TeamID'])
    matchups['IDHigh'] = np.where(matchups['T1_TeamID'] > matchups['T2_TeamID'], matchups['T1_TeamID'], matchups['T2_TeamID'])

    matchups['ID'] = matchups["Season"].astype(str) + '_' + matchups["IDLow"].astype(str) + '_' +matchups["IDHigh"].astype(str)
    matchups.drop(['IDLow', 'IDHigh'], axis = 1, inplace = True)
    matchups['T1_TeamID'] = matchups['T1_TeamID'].astype(str)
    matchups['T2_TeamID'] = matchups['T2_TeamID'].astype(str)
    matchups['Season'] = matchups['Season'].astype(str)

    return matchups

In [7]:
matchups2015 = matchupGenerator(2003)
matchups2015 = matchupGenerator(2004)
matchups2015 = matchupGenerator(2005)
matchups2015 = matchupGenerator(2006)
matchups2015 = matchupGenerator(2007)
matchups2015 = matchupGenerator(2008)
matchups2015 = matchupGenerator(2009)
matchups2015 = matchupGenerator(2010)
matchups2015 = matchupGenerator(2011)
matchups2015 = matchupGenerator(2012)
matchups2015 = matchupGenerator(2013)
matchups2015 = matchupGenerator(2014)
matchups2015 = matchupGenerator(2015)
matchups2016 = matchupGenerator(2016)
matchups2017 = matchupGenerator(2017)
matchups2018 = matchupGenerator(2018)
matchups2019 = matchupGenerator(2019)
matchups2019 = matchupGenerator(2021)

In [8]:
matchups2015.head()

Unnamed: 0,T1_TeamID,T2_TeamID,Season,ID
0,1437,1438,2015,2015_1437_1438
1,1437,1328,2015,2015_1328_1437
2,1437,1257,2015,2015_1257_1437
3,1437,1320,2015,2015_1320_1437
4,1437,1344,2015,2015_1344_1437


# Massey Ordinal Rankings

In [9]:
def masseyRankings(season):
    rankings = pd.read_csv('../input/mens-march-mania-2022/MDataFiles_Stage1/MMasseyOrdinals.csv')
    rankings = rankings.loc[rankings.Season == season]
    minRankDay = min(rankings.RankingDayNum)
    maxRankDay = max(rankings.RankingDayNum)
    medianRankDay = np.median(rankings.RankingDayNum)
    
    startRank = rankings.loc[rankings['RankingDayNum'] == minRankDay].groupby(['TeamID']).mean().reset_index()[['Season', 'TeamID', 'OrdinalRank']]
    startRank.rename(columns={'OrdinalRank': 'StartRank'}, inplace=True)

    endRank = rankings.loc[rankings['RankingDayNum'] == maxRankDay].groupby(['TeamID']).mean().reset_index()[['Season', 'TeamID', 'OrdinalRank']]
    endRank.rename(columns={'OrdinalRank': 'EndRank'}, inplace=True)

    midRank = rankings.loc[rankings['RankingDayNum'] == medianRankDay].groupby(['TeamID']).mean().reset_index()[['Season', 'TeamID', 'OrdinalRank']]
    midRank.rename(columns={'OrdinalRank': 'MidRank'}, inplace=True)
    
    bestRank = rankings.groupby(['TeamID']).min().reset_index()[['Season', 'TeamID', 'OrdinalRank']]
    bestRank.rename(columns={'OrdinalRank': 'BestRank'}, inplace=True)

    
    seasonRankings = startRank
    seasonRankings['MidRank'] = midRank['MidRank']
    seasonRankings['EndRank'] = endRank['EndRank']
    seasonRankings['BestRank'] = bestRank['BestRank']
    seasonRankings['RankDif'] = seasonRankings['BestRank'] - seasonRankings['EndRank']
    seasonRankings['Season'] = seasonRankings['Season'].astype(int)
    seasonRankings['TeamID'] = seasonRankings['TeamID'].astype(int)
    
    seasonRankings['Season'] = seasonRankings['Season'].astype(str)
    seasonRankings['TeamID'] = seasonRankings['TeamID'].astype(str)

    
    return seasonRankings


In [10]:
MasseyRankings = pd.concat([masseyRankings(2003),
                          masseyRankings(2004),
                          masseyRankings(2005),
                          masseyRankings(2006),
                          masseyRankings(2007),
                          masseyRankings(2008),
                          masseyRankings(2009),
                          masseyRankings(2010),
                          masseyRankings(2011),
                          masseyRankings(2012),
                          masseyRankings(2013),
                          masseyRankings(2014),
                          masseyRankings(2015),
                          masseyRankings(2016),
                          masseyRankings(2017),
                          masseyRankings(2018),
                          masseyRankings(2019), 
                           masseyRankings(2020),
                           masseyRankings(2021)]).reset_index(drop=True)


In [11]:
MasseyRankings['TeamID'] = MasseyRankings['TeamID'].astype(str)
MasseyRankings['Season'] = MasseyRankings['Season'].astype(str)

# Creating Modeling Data

## Regular Season Data

In [12]:
seeds['Seed'] = seeds['Seed'].replace('A','')
seeds['Seed'] = seeds['Seed'].replace('B','')
seeds['Seed'] = seeds['Seed'].str[1:3]
seeds['Seed'] = seeds['Seed'].astype(int)
seeds['TeamID'] = seeds['TeamID'].astype(str)
seeds['Season'] = seeds['Season'].astype(str)
seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,1,1207
1,1985,2,1210
2,1985,3,1228
3,1985,4,1260
4,1985,5,1374


In [13]:
def prepare_data_modified(df):
    
    dfswap = df[['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']]


    dfswap.loc[df['WLoc'] == 'H', 'WLoc'] = 'A'
    dfswap.loc[df['WLoc'] == 'A', 'WLoc'] = 'H'
    df.columns.values[6] = 'location'
    dfswap.columns.values[6] = 'location'         
    df.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(df.columns)]
    dfswap.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(dfswap.columns)]
    
    output = pd.concat([df, dfswap]).sort_index().reset_index(drop=True)

    output['T1_Possessions'] = output['T1_FGA'] - output['T1_OR'] + output['T1_TO'] + (0.475 * output['T1_FTA'])
    output['T2_Possessions'] = output['T2_FGA'] - output['T2_OR'] + output['T2_TO'] + (0.475 * output['T2_FTA'])

    output['T1_OER'] = output['T1_Score'] / ( output['T1_FGA'] +  ((output['T1_FTA'] * 0.9)/2 + output['T1_TO']))
    output['T1_DER'] = output['T2_Score'] / ( output['T2_FGA'] +  ((output['T2_FTA'] * 0.9)/2 + output['T2_TO']))

    output['T1_PAPP'] = output['T1_Score']/output['T1_Possessions']
    output['T2_PAPP'] = output['T2_Score']/output['T2_Possessions']

        
    return output

In [14]:
regular_results_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [15]:
regular_season = prepare_data_modified(regular_results_detailed)

In [16]:
# convert to str, so the model would treat TeamID them as factors
regular_season['T1_TeamID'] = regular_season['T1_TeamID'].astype(str)
regular_season['T2_TeamID'] = regular_season['T2_TeamID'].astype(str)
regular_season['Season'] = regular_season['Season'].astype(str)

# make it a binary task
regular_season['win'] = np.where(regular_season['T1_Score']>regular_season['T2_Score'], 1, 0)

In [17]:
regular_season['T1_winMargin'] = regular_season['T1_Score'] - regular_season['T2_Score']
regular_season['T2_winMargin'] = regular_season['T2_Score'] - regular_season['T1_Score']

#### Regular Season Summary Statistics

In [18]:
regular_season["T1_FGPercent"] = regular_season["T1_FGM"] /  regular_season["T1_FGA"]
regular_season["T1_FG3Percent"] = regular_season["T1_FGM3"] /  regular_season["T1_FGA3"]

regular_season["T2_FGPercent"] = regular_season["T2_FGM"] /  regular_season["T2_FGA"]
regular_season["T2_FG3Percent"] = regular_season["T2_FGM3"] /  regular_season["T2_FGA3"]

In [19]:
OffensiveStatsT1 = regular_season.groupby(['Season', 'T1_TeamID']).agg({"T1_Score": ["median", "min", "max"],
                                                                    "T1_FGPercent": ["median"],
                                                                    "T1_FG3Percent": ["median"],
                                                                   "T1_FGA3": ["median"], 
                                                                   "T1_FTA": ["median"], 
                                                                   "T1_OR": ["median"], 
                                                                   "T1_DR": ["median"], 
                                                                   "T1_Ast": ["median"], 
                                                                   "T1_TO": ["median", "max"], 
                                                                   "T1_Stl": ["median", "max"], 
                                                                   "T1_PF": ["median"],
                                                                   "T1_Possessions": ["median"],
                                                                   "T1_winMargin": ["median", "min", "max"]
                                                                  })
OffensiveStatsT1.columns = ['_'.join(col).strip() for col in OffensiveStatsT1.columns.values]
OffensiveStatsT1.reset_index(inplace = True)
OffensiveStatsT1.head()

Unnamed: 0,Season,T1_TeamID,T1_Score_median,T1_Score_min,T1_Score_max,T1_FGPercent_median,T1_FG3Percent_median,T1_FGA3_median,T1_FTA_median,T1_OR_median,...,T1_Ast_median,T1_TO_median,T1_TO_max,T1_Stl_median,T1_Stl_max,T1_PF_median,T1_Possessions_median,T1_winMargin_median,T1_winMargin_min,T1_winMargin_max
0,2003,1102,56.5,33,85,0.488636,0.333333,20.0,17.0,4.5,...,12.0,11.0,20,5.5,13,19.0,55.3125,-3.0,-32,38
1,2003,1103,79.0,52,105,0.492308,0.333333,14.0,25.0,10.0,...,16.0,13.0,18,8.0,19,19.0,68.075,-2.0,-16,33
2,2003,1104,69.0,46,89,0.412404,0.318841,20.0,20.0,13.0,...,13.0,13.0,23,6.0,12,18.0,66.45,6.0,-19,28
3,2003,1105,71.5,40,97,0.409463,0.367003,21.0,21.5,14.0,...,15.0,18.0,28,9.0,18,20.0,76.8625,-3.5,-42,34
4,2003,1106,60.5,43,85,0.414971,0.356471,18.0,16.5,11.5,...,11.0,16.5,24,8.5,13,18.0,67.7375,-1.0,-26,26


In [20]:
DefensiveStatsT1 = regular_season.groupby(['Season', 'T1_TeamID']).agg({"T2_Score": ["median", "min", "max"],
                                                                    "T2_FGPercent": ["median"],
                                                                    "T2_FG3Percent": ["median"],
                                                                   "T2_FGA3": ["median"], 
                                                                   "T2_FTA": ["median"], 
                                                                   "T2_OR": ["median"], 
                                                                   "T2_DR": ["median"], 
                                                                   "T2_Ast": ["median"], 
                                                                   "T2_TO": ["median", "max"], 
                                                                   "T2_Stl": ["median", "max"], 
                                                                   "T2_PF": ["median"],
                                                                   "T2_Possessions": ["median"],
                                                                  })
DefensiveStatsT1.columns = ['_'.join(col).replace("T2_", "T1_Opponent_") for col in DefensiveStatsT1.columns.values]
DefensiveStatsT1.reset_index(inplace = True)
DefensiveStatsT1.head()

Unnamed: 0,Season,T1_TeamID,T1_Opponent_Score_median,T1_Opponent_Score_min,T1_Opponent_Score_max,T1_Opponent_FGPercent_median,T1_Opponent_FG3Percent_median,T1_Opponent_FGA3_median,T1_Opponent_FTA_median,T1_Opponent_OR_median,T1_Opponent_DR_median,T1_Opponent_Ast_median,T1_Opponent_TO_median,T1_Opponent_TO_max,T1_Opponent_Stl_median,T1_Opponent_Stl_max,T1_Opponent_PF_median,T1_Opponent_Possessions_median
0,2003,1102,58.5,33,76,0.463333,0.380952,11.0,20.0,9.0,19.0,8.0,13.5,20,5.0,12,18.0,54.8625
1,2003,1103,77.0,55,112,0.5,0.380952,18.0,20.0,12.0,20.0,16.0,15.0,25,6.0,11,21.0,68.55
2,2003,1104,63.5,48,82,0.415881,0.324561,19.5,16.5,10.0,22.5,11.0,14.0,21,6.0,10,19.0,65.725
3,2003,1105,78.5,53,103,0.430857,0.368421,18.0,27.0,13.0,26.0,15.0,19.0,29,9.5,17,19.0,76.1875
4,2003,1106,63.5,33,82,0.424777,0.313725,13.0,21.5,12.0,21.5,11.5,15.0,22,8.0,19,16.0,66.7875


In [21]:
OffensiveStatsT2 = regular_season.groupby(['Season', 'T2_TeamID']).agg({"T2_Score": ["median", "min", "max"],
                                                                    "T2_FGPercent": ["median"],
                                                                    "T2_FG3Percent": ["median"],
                                                                   "T2_FGA3": ["median"], 
                                                                   "T2_FTA": ["median"], 
                                                                   "T2_OR": ["median"], 
                                                                   "T2_DR": ["median"], 
                                                                   "T2_Ast": ["median"], 
                                                                   "T2_TO": ["median", "max"], 
                                                                   "T2_Stl": ["median", "max"], 
                                                                   "T2_PF": ["median"],
                                                                   "T2_Possessions": ["median"],
                                                                   "T2_winMargin": ["median", "min", "max"]
                                                                  })
OffensiveStatsT2.columns = ['_'.join(col).strip() for col in OffensiveStatsT2.columns.values]
OffensiveStatsT2.reset_index(inplace = True)
OffensiveStatsT2.head()

Unnamed: 0,Season,T2_TeamID,T2_Score_median,T2_Score_min,T2_Score_max,T2_FGPercent_median,T2_FG3Percent_median,T2_FGA3_median,T2_FTA_median,T2_OR_median,...,T2_Ast_median,T2_TO_median,T2_TO_max,T2_Stl_median,T2_Stl_max,T2_PF_median,T2_Possessions_median,T2_winMargin_median,T2_winMargin_min,T2_winMargin_max
0,2003,1102,56.5,33,85,0.488636,0.333333,20.0,17.0,4.5,...,12.0,11.0,20,5.5,13,19.0,55.3125,-3.0,-32,38
1,2003,1103,79.0,52,105,0.492308,0.333333,14.0,25.0,10.0,...,16.0,13.0,18,8.0,19,19.0,68.075,-2.0,-16,33
2,2003,1104,69.0,46,89,0.412404,0.318841,20.0,20.0,13.0,...,13.0,13.0,23,6.0,12,18.0,66.45,6.0,-19,28
3,2003,1105,71.5,40,97,0.409463,0.367003,21.0,21.5,14.0,...,15.0,18.0,28,9.0,18,20.0,76.8625,-3.5,-42,34
4,2003,1106,60.5,43,85,0.414971,0.356471,18.0,16.5,11.5,...,11.0,16.5,24,8.5,13,18.0,67.7375,-1.0,-26,26


In [22]:
DefensiveStatsT2 = regular_season.groupby(['Season', 'T2_TeamID']).agg({"T1_Score": ["median", "min", "max"],
                                                                    "T1_FGPercent": ["median"],
                                                                    "T1_FG3Percent": ["median"],
                                                                   "T1_FGA3": ["median"], 
                                                                   "T1_FTA": ["median"], 
                                                                   "T1_OR": ["median"], 
                                                                   "T1_DR": ["median"], 
                                                                   "T1_Ast": ["median"], 
                                                                   "T1_TO": ["median", "max"], 
                                                                   "T1_Stl": ["median", "max"], 
                                                                   "T1_PF": ["median"],
                                                                   "T1_Possessions": ["median"],
                                                                  })
DefensiveStatsT2.columns = ['_'.join(col).replace("T1_", "T2_Opponent_") for col in DefensiveStatsT2.columns.values]
DefensiveStatsT2.reset_index(inplace = True)
DefensiveStatsT2.head()

Unnamed: 0,Season,T2_TeamID,T2_Opponent_Score_median,T2_Opponent_Score_min,T2_Opponent_Score_max,T2_Opponent_FGPercent_median,T2_Opponent_FG3Percent_median,T2_Opponent_FGA3_median,T2_Opponent_FTA_median,T2_Opponent_OR_median,T2_Opponent_DR_median,T2_Opponent_Ast_median,T2_Opponent_TO_median,T2_Opponent_TO_max,T2_Opponent_Stl_median,T2_Opponent_Stl_max,T2_Opponent_PF_median,T2_Opponent_Possessions_median
0,2003,1102,58.5,33,76,0.463333,0.380952,11.0,20.0,9.0,19.0,8.0,13.5,20,5.0,12,18.0,54.8625
1,2003,1103,77.0,55,112,0.5,0.380952,18.0,20.0,12.0,20.0,16.0,15.0,25,6.0,11,21.0,68.55
2,2003,1104,63.5,48,82,0.415881,0.324561,19.5,16.5,10.0,22.5,11.0,14.0,21,6.0,10,19.0,65.725
3,2003,1105,78.5,53,103,0.430857,0.368421,18.0,27.0,13.0,26.0,15.0,19.0,29,9.5,17,19.0,76.1875
4,2003,1106,63.5,33,82,0.424777,0.313725,13.0,21.5,12.0,21.5,11.5,15.0,22,8.0,19,16.0,66.7875


In [23]:
OffensiveStatsT1['T1_TeamID'] = OffensiveStatsT1['T1_TeamID'].astype(str)
OffensiveStatsT2['T2_TeamID'] = OffensiveStatsT2['T2_TeamID'].astype(str)
DefensiveStatsT1['T1_TeamID'] = DefensiveStatsT1['T1_TeamID'].astype(str)
DefensiveStatsT2['T2_TeamID'] = DefensiveStatsT2['T2_TeamID'].astype(str)

OffensiveStatsT1['Season'] = OffensiveStatsT1['Season'].astype(str)
OffensiveStatsT2['Season'] = OffensiveStatsT2['Season'].astype(str)
DefensiveStatsT1['Season'] = DefensiveStatsT1['Season'].astype(str)
DefensiveStatsT2['Season'] = DefensiveStatsT2['Season'].astype(str)

## Tournament Data

In [24]:
tournament_results = prepare_data_modified(tourney_results_detailed)

In [25]:
tournament_results['win'] = np.where(tournament_results['T1_Score']>tournament_results['T2_Score'], 1, 0)

In [26]:
tournament_results = tournament_results[['Season', 'T1_TeamID', 'T2_TeamID', 'win']]

In [27]:
tournament_results.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,win
0,2003,1421,1411,1
1,2003,1411,1421,0
2,2003,1436,1112,0
3,2003,1112,1436,1
4,2003,1113,1272,1


In [28]:
tournament_results['T1_TeamID'] = tournament_results['T1_TeamID'].astype(str)
tournament_results['T2_TeamID'] = tournament_results['T2_TeamID'].astype(str)
tournament_results['Season'] = tournament_results['Season'].astype(str)

In [29]:
tournament_results = tournament_results.merge(OffensiveStatsT1, 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','T1_TeamID'],
                               sort=False)
tournament_results.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,win,T1_Score_median,T1_Score_min,T1_Score_max,T1_FGPercent_median,T1_FG3Percent_median,T1_FGA3_median,...,T1_Ast_median,T1_TO_median,T1_TO_max,T1_Stl_median,T1_Stl_max,T1_PF_median,T1_Possessions_median,T1_winMargin_median,T1_winMargin_min,T1_winMargin_max
0,2003,1421,1411,1,71.0,50,99,0.433333,0.363636,17.0,...,13.0,16.0,26,7.0,12,19.0,71.5,-3.0,-52,14
1,2003,1411,1421,0,72.5,54,104,0.433699,0.333333,19.0,...,14.0,15.0,27,6.0,13,19.0,71.0,2.5,-23,43
2,2003,1436,1112,0,65.0,50,101,0.446429,0.352941,15.0,...,13.0,14.0,23,6.0,17,16.0,65.725,6.0,-26,26
3,2003,1112,1436,1,86.5,65,107,0.458447,0.355731,19.5,...,18.5,14.5,22,8.0,17,17.5,76.3,13.0,-7,39
4,2003,1113,1272,1,75.0,55,108,0.47541,0.3125,12.0,...,16.0,14.0,20,4.0,11,19.0,69.925,5.0,-23,41


In [30]:
tournament_results = tournament_results.merge(DefensiveStatsT1, 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','T1_TeamID'],
                               sort=False)
tournament_results.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,win,T1_Score_median,T1_Score_min,T1_Score_max,T1_FGPercent_median,T1_FG3Percent_median,T1_FGA3_median,...,T1_Opponent_FTA_median,T1_Opponent_OR_median,T1_Opponent_DR_median,T1_Opponent_Ast_median,T1_Opponent_TO_median,T1_Opponent_TO_max,T1_Opponent_Stl_median,T1_Opponent_Stl_max,T1_Opponent_PF_median,T1_Opponent_Possessions_median
0,2003,1421,1411,1,71.0,50,99,0.433333,0.363636,17.0,...,22.0,13.0,22.0,14.0,13.0,20,8.0,19,19.0,72.0
1,2003,1411,1421,0,72.5,54,104,0.433699,0.333333,19.0,...,19.0,11.5,23.5,13.5,14.5,22,7.5,18,21.5,70.925
2,2003,1436,1112,0,65.0,50,101,0.446429,0.352941,15.0,...,14.0,8.0,22.0,13.0,13.0,23,7.0,14,17.0,65.65
3,2003,1112,1436,1,86.5,65,107,0.458447,0.355731,19.5,...,19.0,11.5,23.5,16.0,16.5,28,5.0,13,22.5,76.4375
4,2003,1113,1272,1,75.0,55,108,0.47541,0.3125,12.0,...,20.0,12.0,20.0,14.0,15.0,21,6.0,12,22.0,70.2


In [31]:
tournament_results = tournament_results.merge(OffensiveStatsT2, 
                               how='left', 
                               left_on = ['Season','T2_TeamID'], 
                               right_on =['Season','T2_TeamID'],
                               sort=False)
tournament_results.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,win,T1_Score_median,T1_Score_min,T1_Score_max,T1_FGPercent_median,T1_FG3Percent_median,T1_FGA3_median,...,T2_Ast_median,T2_TO_median,T2_TO_max,T2_Stl_median,T2_Stl_max,T2_PF_median,T2_Possessions_median,T2_winMargin_median,T2_winMargin_min,T2_winMargin_max
0,2003,1421,1411,1,71.0,50,99,0.433333,0.363636,17.0,...,14.0,15.0,27,6.0,13,19.0,71.0,2.5,-23,43
1,2003,1411,1421,0,72.5,54,104,0.433699,0.333333,19.0,...,13.0,16.0,26,7.0,12,19.0,71.5,-3.0,-52,14
2,2003,1436,1112,0,65.0,50,101,0.446429,0.352941,15.0,...,18.5,14.5,22,8.0,17,17.5,76.3,13.0,-7,39
3,2003,1112,1436,1,86.5,65,107,0.458447,0.355731,19.5,...,13.0,14.0,23,6.0,17,16.0,65.725,6.0,-26,26
4,2003,1113,1272,1,75.0,55,108,0.47541,0.3125,12.0,...,17.0,13.0,22,7.0,14,18.0,70.7,7.0,-17,24


In [32]:
tournament_results = tournament_results.merge(DefensiveStatsT2, 
                               how='left', 
                               left_on = ['Season','T2_TeamID'], 
                               right_on =['Season','T2_TeamID'],
                               sort=False)
tournament_results.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,win,T1_Score_median,T1_Score_min,T1_Score_max,T1_FGPercent_median,T1_FG3Percent_median,T1_FGA3_median,...,T2_Opponent_FTA_median,T2_Opponent_OR_median,T2_Opponent_DR_median,T2_Opponent_Ast_median,T2_Opponent_TO_median,T2_Opponent_TO_max,T2_Opponent_Stl_median,T2_Opponent_Stl_max,T2_Opponent_PF_median,T2_Opponent_Possessions_median
0,2003,1421,1411,1,71.0,50,99,0.433333,0.363636,17.0,...,19.0,11.5,23.5,13.5,14.5,22,7.5,18,21.5,70.925
1,2003,1411,1421,0,72.5,54,104,0.433699,0.333333,19.0,...,22.0,13.0,22.0,14.0,13.0,20,8.0,19,19.0,72.0
2,2003,1436,1112,0,65.0,50,101,0.446429,0.352941,15.0,...,19.0,11.5,23.5,16.0,16.5,28,5.0,13,22.5,76.4375
3,2003,1112,1436,1,86.5,65,107,0.458447,0.355731,19.5,...,14.0,8.0,22.0,13.0,13.0,23,7.0,14,17.0,65.65
4,2003,1113,1272,1,75.0,55,108,0.47541,0.3125,12.0,...,20.0,13.0,24.0,14.0,14.0,28,7.0,13,20.0,69.975


In [33]:
tournament_results = tournament_results.merge(seeds, 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results = tournament_results.merge(seeds, 
                               how='left', 
                               left_on = ['Season','T2_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results.rename(columns={'Seed_x': 'T1_Seed', 
                         'Seed_y': 'T2_Seed'}, inplace=True)
tournament_results.drop(['TeamID_x', 'TeamID_y'], axis = 1, inplace = True)
tournament_results.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,win,T1_Score_median,T1_Score_min,T1_Score_max,T1_FGPercent_median,T1_FG3Percent_median,T1_FGA3_median,...,T2_Opponent_DR_median,T2_Opponent_Ast_median,T2_Opponent_TO_median,T2_Opponent_TO_max,T2_Opponent_Stl_median,T2_Opponent_Stl_max,T2_Opponent_PF_median,T2_Opponent_Possessions_median,T1_Seed,T2_Seed
0,2003,1421,1411,1,71.0,50,99,0.433333,0.363636,17.0,...,23.5,13.5,14.5,22,7.5,18,21.5,70.925,16,16
1,2003,1411,1421,0,72.5,54,104,0.433699,0.333333,19.0,...,22.0,14.0,13.0,20,8.0,19,19.0,72.0,16,16
2,2003,1436,1112,0,65.0,50,101,0.446429,0.352941,15.0,...,23.5,16.0,16.5,28,5.0,13,22.5,76.4375,16,1
3,2003,1112,1436,1,86.5,65,107,0.458447,0.355731,19.5,...,22.0,13.0,13.0,23,7.0,14,17.0,65.65,1,16
4,2003,1113,1272,1,75.0,55,108,0.47541,0.3125,12.0,...,24.0,14.0,14.0,28,7.0,13,20.0,69.975,10,7


In [34]:
tournament_results['seed_dif'] = tournament_results['T1_Seed'] - tournament_results['T2_Seed']

In [35]:
tournament_results = tournament_results.merge(H1_team_quality[['TeamID', 'Season', 'quality']], 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results.rename(columns={'quality': 'T1_H1_Quality'}, inplace=True)
tournament_results.drop(['TeamID'], axis = 1, inplace = True)

tournament_results = tournament_results.merge(H1_team_quality[['TeamID', 'Season', 'quality']], 
                               how='left', 
                               left_on = ['Season','T2_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results.rename(columns={'quality': 'T2_H1_Quality'}, inplace=True)
tournament_results.drop(['TeamID'], axis = 1, inplace = True)



tournament_results = tournament_results.merge(H2_team_quality[['TeamID', 'Season', 'quality']], 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results.rename(columns={'quality': 'T1_H2_Quality'}, inplace=True)
tournament_results.drop(['TeamID'], axis = 1, inplace = True)

tournament_results = tournament_results.merge(H2_team_quality[['TeamID', 'Season', 'quality']], 
                               how='left', 
                               left_on = ['Season','T2_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results.rename(columns={'quality': 'T2_H2_Quality'}, inplace=True)
tournament_results.drop(['TeamID'], axis = 1, inplace = True)

tournament_results = tournament_results.merge(Full_team_quality[['TeamID', 'Season', 'quality']], 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results.rename(columns={'quality': 'T1_Full_Quality'}, inplace=True)
tournament_results.drop(['TeamID'], axis = 1, inplace = True)

tournament_results = tournament_results.merge(Full_team_quality[['TeamID', 'Season', 'quality']], 
                               how='left', 
                               left_on = ['Season','T2_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results.rename(columns={'quality': 'T2_Full_Quality'}, inplace=True)
tournament_results.drop(['TeamID'], axis = 1, inplace = True)


tournament_results.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,win,T1_Score_median,T1_Score_min,T1_Score_max,T1_FGPercent_median,T1_FG3Percent_median,T1_FGA3_median,...,T2_Opponent_Possessions_median,T1_Seed,T2_Seed,seed_dif,T1_H1_Quality,T2_H1_Quality,T1_H2_Quality,T2_H2_Quality,T1_Full_Quality,T2_Full_Quality
0,2003,1421,1411,1,71.0,50,99,0.433333,0.363636,17.0,...,70.925,16,16,0,94.0,68.0,72.0,144.0,104.0,58.0
1,2003,1411,1421,0,72.5,54,104,0.433699,0.333333,19.0,...,72.0,16,16,0,68.0,94.0,144.0,72.0,58.0,104.0
2,2003,1436,1112,0,65.0,50,101,0.446429,0.352941,15.0,...,76.4375,16,1,15,158.0,323.0,156.0,324.0,148.0,326.0
3,2003,1112,1436,1,86.5,65,107,0.458447,0.355731,19.5,...,65.65,1,16,-15,323.0,158.0,324.0,156.0,326.0,148.0
4,2003,1113,1272,1,75.0,55,108,0.47541,0.3125,12.0,...,69.975,10,7,3,299.0,282.0,320.0,272.0,290.0,307.0


In [36]:
tournament_results.columns

Index(['Season', 'T1_TeamID', 'T2_TeamID', 'win', 'T1_Score_median',
       'T1_Score_min', 'T1_Score_max', 'T1_FGPercent_median',
       'T1_FG3Percent_median', 'T1_FGA3_median', 'T1_FTA_median',
       'T1_OR_median', 'T1_DR_median', 'T1_Ast_median', 'T1_TO_median',
       'T1_TO_max', 'T1_Stl_median', 'T1_Stl_max', 'T1_PF_median',
       'T1_Possessions_median', 'T1_winMargin_median', 'T1_winMargin_min',
       'T1_winMargin_max', 'T1_Opponent_Score_median', 'T1_Opponent_Score_min',
       'T1_Opponent_Score_max', 'T1_Opponent_FGPercent_median',
       'T1_Opponent_FG3Percent_median', 'T1_Opponent_FGA3_median',
       'T1_Opponent_FTA_median', 'T1_Opponent_OR_median',
       'T1_Opponent_DR_median', 'T1_Opponent_Ast_median',
       'T1_Opponent_TO_median', 'T1_Opponent_TO_max', 'T1_Opponent_Stl_median',
       'T1_Opponent_Stl_max', 'T1_Opponent_PF_median',
       'T1_Opponent_Possessions_median', 'T2_Score_median', 'T2_Score_min',
       'T2_Score_max', 'T2_FGPercent_median', 'T2_F

In [37]:
tournament_results = tournament_results.merge(MasseyRankings, 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results.rename(columns={'StartRank': 'T1_StartRank', 
                                  'MidRank': 'T1_MidRank', 
                                  'EndRank': 'T1_EndRank', 
                                  'BestRank': 'T1_BestRank', 
                                  'RankDif': 'T1_RankDif'}, inplace=True)
tournament_results.drop(['TeamID'], axis = 1, inplace = True)

tournament_results = tournament_results.merge(MasseyRankings, 
                               how='left', 
                               left_on = ['Season','T2_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
tournament_results.rename(columns={'StartRank': 'T2_StartRank', 
                                  'MidRank': 'T2_MidRank', 
                                  'EndRank': 'T2_EndRank', 
                                  'BestRank': 'T2_BestRank', 
                                  'RankDif': 'T2_RankDif'}, inplace=True)
tournament_results.drop(['TeamID'], axis = 1, inplace = True)
tournament_results.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,win,T1_Score_median,T1_Score_min,T1_Score_max,T1_FGPercent_median,T1_FG3Percent_median,T1_FGA3_median,...,T1_StartRank,T1_MidRank,T1_EndRank,T1_BestRank,T1_RankDif,T2_StartRank,T2_MidRank,T2_EndRank,T2_BestRank,T2_RankDif
0,2003,1421,1411,1,71.0,50,99,0.433333,0.363636,17.0,...,218.0,251.76,240.34375,128,-112.34375,231.0,207.72,239.28125,114,-125.28125
1,2003,1411,1421,0,72.5,54,104,0.433699,0.333333,19.0,...,231.0,207.72,239.28125,114,-125.28125,218.0,251.76,240.34375,128,-112.34375
2,2003,1436,1112,0,65.0,50,101,0.446429,0.352941,15.0,...,192.0,176.68,153.125,101,-52.125,26.0,3.074074,2.676471,1,-1.676471
3,2003,1112,1436,1,86.5,65,107,0.458447,0.355731,19.5,...,26.0,3.074074,2.676471,1,-1.676471,192.0,176.68,153.125,101,-52.125
4,2003,1113,1272,1,75.0,55,108,0.47541,0.3125,12.0,...,89.0,34.92,36.0,10,-26.0,67.0,59.4,21.705882,7,-14.705882


In [38]:
def featureDifferences(data):
    data['Score_Dif'] = data['T1_Score_median'] - data['T2_Score_median']
    data['FGPerc_Dif'] = data['T1_FGPercent_median'] - data['T2_FGPercent_median']
    data['FG3Perc_Dif'] = data['T1_FG3Percent_median'] - data['T2_FG3Percent_median']
    data['FGA3_Dif'] = data['T1_FGA3_median'] - data['T2_FGA3_median']
    data['FTA_Dif'] = data['T1_FTA_median'] - data['T2_FTA_median']
    data['OR_Dif'] = data['T1_OR_median'] - data['T2_OR_median']
    data['DF_Dif'] = data['T1_DR_median'] - data['T2_DR_median']
    data['Ast_Dif'] = data['T1_Ast_median'] - data['T2_Ast_median']
    data['TO_Dif'] = data['T1_TO_median'] - data['T2_TO_median']
    data['Stl_Dif'] = data['T1_Stl_median'] - data['T2_Stl_median']
    data['PF_Dif'] = data['T1_PF_median'] - data['T2_PF_median']
    data['Possessions_Dif'] = data['T1_Possessions_median'] - data['T2_Possessions_median']
    data['WinMargin_Dif'] = data['T1_winMargin_median'] - data['T2_winMargin_median']
    
    data['T1_Score_OppDif'] = data['T1_Score_median'] - data['T2_Opponent_Score_median']
    data['T1_FGPercent_OppDif'] = data['T1_FGPercent_median'] - data['T2_Opponent_FGPercent_median']
    data['T1_FG3Percent_OppDif'] = data['T1_FG3Percent_median'] - data['T2_Opponent_FG3Percent_median']
    data['T1_FGA3_OppDif'] = data['T1_FGA3_median'] - data['T2_Opponent_FGA3_median']
    data['T1_FTA_OppDif'] = data['T1_FTA_median'] - data['T2_Opponent_FTA_median']
    data['T1_OR_OppDif'] = data['T1_OR_median'] - data['T2_Opponent_OR_median']
    data['T1_DR_OppDif'] = data['T1_DR_median'] - data['T2_Opponent_DR_median']
    data['T1_Ast_OppDif'] = data['T1_Ast_median'] - data['T2_Opponent_Ast_median']
    data['T1_TO_OppDif'] = data['T1_TO_median'] - data['T2_Opponent_TO_median']
    data['T1_Stl_OppDif'] = data['T1_Stl_median'] - data['T2_Opponent_Stl_median']
    data['T1_PF_OppDif'] = data['T1_PF_median'] - data['T2_Opponent_PF_median']
    data['T1_Possessions_OppDif'] = data['T1_Possessions_median'] - data['T2_Opponent_Possessions_median']
    
    data['T2_Score_OppDif'] = data['T2_Score_median'] - data['T1_Opponent_Score_median']
    data['T2_FGPercent_OppDif'] = data['T2_FGPercent_median'] - data['T1_Opponent_FGPercent_median']
    data['T2_FG3Percent_OppDif'] = data['T2_FG3Percent_median'] - data['T1_Opponent_FG3Percent_median']
    data['T2_FGA3_OppDif'] = data['T2_FGA3_median'] - data['T1_Opponent_FGA3_median']
    data['T2_FTA_OppDif'] = data['T2_FTA_median'] - data['T1_Opponent_FTA_median']
    data['T2_OR_OppDif'] = data['T2_OR_median'] - data['T1_Opponent_OR_median']
    data['T2_DR_OppDif'] = data['T2_DR_median'] - data['T1_Opponent_DR_median']
    data['T2_Ast_OppDif'] = data['T2_Ast_median'] - data['T1_Opponent_Ast_median']
    data['T2_TO_OppDif'] = data['T2_TO_median'] - data['T1_Opponent_TO_median']
    data['T2_Stl_OppDif'] = data['T2_Stl_median'] - data['T1_Opponent_Stl_median']
    data['T2_PF_OppDif'] = data['T2_PF_median'] - data['T1_Opponent_PF_median']
    data['T2_Possessions_OppDif'] = data['T2_Possessions_median'] - data['T1_Opponent_Possessions_median']

    return data

In [39]:
tournament_results = featureDifferences(tournament_results)

In [40]:
tournament_results.tail()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,win,T1_Score_median,T1_Score_min,T1_Score_max,T1_FGPercent_median,T1_FG3Percent_median,T1_FGA3_median,...,T2_FG3Percent_OppDif,T2_FGA3_OppDif,T2_FTA_OppDif,T2_OR_OppDif,T2_DR_OppDif,T2_Ast_OppDif,T2_TO_OppDif,T2_Stl_OppDif,T2_PF_OppDif,T2_Possessions_OppDif
2357,2021,1124,1222,1,82.5,58,112,0.503731,0.418011,24.0,...,-0.027348,6.0,5.0,3.5,5.0,1.0,-6.0,2.0,0.0,-3.125
2358,2021,1417,1211,0,73.5,48,107,0.461538,0.379808,18.5,...,-0.016238,-2.5,6.5,2.0,8.0,8.0,1.5,3.0,0.0,9.5875
2359,2021,1211,1417,1,90.0,73,116,0.53536,0.34058,19.5,...,0.063574,-1.5,3.0,2.0,1.5,1.0,-4.5,-1.0,0.0,-8.775
2360,2021,1124,1211,1,82.5,58,112,0.503731,0.418011,24.0,...,-0.020101,0.5,6.5,-1.0,7.0,6.5,-3.5,2.0,0.0,6.375
2361,2021,1211,1124,0,90.0,73,116,0.53536,0.34058,19.5,...,0.101777,4.0,1.0,4.0,-0.5,5.0,-3.0,3.0,0.0,-3.3625


### Training and Validation Data for Modeling

In [41]:
tournament_results.Season = tournament_results.Season.astype(int)

In [42]:
trainData = tournament_results[tournament_results.Season.isin([2003, 2004, 2005, 2006, 
                                             2007, 2008, 2009, 2010, 
                                             2011, 2012, 2013, 2014, 2015, 2016])]

validData = tournament_results[tournament_results.Season.isin([2017, 2018, 2019, 2021])]

In [43]:
trainData['T1_TeamID'] = trainData['T1_TeamID'].astype(str)
trainData['T2_TeamID'] = trainData['T2_TeamID'].astype(str)
validData['T1_TeamID'] = validData['T1_TeamID'].astype(str)
validData['T2_TeamID'] = validData['T2_TeamID'].astype(str)
seeds['TeamID'] = seeds['TeamID'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [44]:
X_train, y_train = trainData.drop(['Season', 'T1_TeamID', 'T2_TeamID', 'win'], axis = 1), trainData['win']
X_test, y_test = validData.drop(['Season', 'T1_TeamID', 'T2_TeamID', 'win'], axis = 1), validData['win']

In [45]:
scaler = preprocessing.StandardScaler().fit(X_train)
#X_train_scaled = pd.DataFrame(scaler.transform(X_train))
#X_train_scaled.columns = X_train.columns

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#X_test_scaled = pd.DataFrame(scaler.transform(X_test))
#X_test.columns = X_test.columns

# Model Building

### Logistic Regressions

In [45]:
logreg = sklearn.linear_model.LogisticRegression(max_iter = 5000, C = 1)
logreg.fit(X_train_scaled, y_train)
print(sklearn.metrics.log_loss(y_train,logreg.predict_proba(X_train_scaled)))
print(sklearn.metrics.log_loss(y_test,logreg.predict_proba(X_test_scaled)))

0.5118611374365756
0.5975409632925489


### Decision Tree

In [46]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

dtClassifier = DecisionTreeClassifier(random_state=2020, 
                                      min_samples_leaf = .1)
dtClassifier.fit(X_train_scaled, y_train)

print(sklearn.metrics.log_loss(y_train,dtClassifier.predict_proba(X_train_scaled)))
print(sklearn.metrics.log_loss(y_test,dtClassifier.predict_proba(X_test_scaled)))

0.5509386185809488
0.5830472003591733


### SVM 

In [47]:
from sklearn import svm
svmClassifier = svm.SVC(probability = True, C = .05, kernel = "sigmoid")
svmClassifier.fit(X_train_scaled, y_train)

print(sklearn.metrics.log_loss(y_train,svmClassifier.predict_proba(X_train_scaled)))
print(sklearn.metrics.log_loss(y_test,svmClassifier.predict_proba(X_test_scaled)))

0.5433851893566118
0.5665061597153543


### Random Forest

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

rfClassifier = RandomForestClassifier(
                                      n_estimators = 2000, 
                                      random_state=2020, 
                                     )
rfClassifier.fit(X_train_scaled, y_train)

print(sklearn.metrics.log_loss(y_train,rfClassifier.predict_proba(X_train_scaled)))
print(sklearn.metrics.log_loss(y_test,rfClassifier.predict_proba(X_test_scaled)))

0.16255063432291328
0.5551287227150196


### XGBoost

In [49]:
import pandas as pd
import numpy as no
import xgboost as xgb
from sklearn.metrics import log_loss
from bayes_opt import BayesianOptimization
from sklearn.model_selection import GridSearchCV

In [50]:
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dvalid = xgb.DMatrix(X_test_scaled, label=y_test)

#### Hyperopt

In [48]:
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, anneal

In [52]:
xgbSpace={'max_depth': hp.quniform("max_depth", 2, 50, 1),
          'learning_rate' : hp.uniform('learning_rate', 0.0001, 0.5),
        'gamma': hp.quniform ('gamma', 0,20, 1),
        'reg_alpha' : hp.quniform('reg_alpha', 0,10, 0.5),
        'reg_lambda' : hp.quniform('reg_lambda', 0,10, 0.5),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 50, 1),
        'n_estimators': 5000
    }

In [53]:
# Classifier:
def xgb_hyperparameter_tuning(space):
    model = xgb.XGBClassifier(n_estimators = space['n_estimators'],
                              learning_rate = space['learning_rate'],
                              max_depth = int(space['max_depth']), 
                              gamma = space['gamma'],
                              reg_alpha = space['reg_alpha'],
                              reg_lambda = space['reg_lambda'],
                              min_child_weight=space['min_child_weight'],
                              colsample_bytree=space['colsample_bytree'], 
                             use_label_encoder=False)
    
    evaluation = [(X_train_scaled, y_train), ( X_test_scaled, y_test)]
    
    model.fit(X_train_scaled, y_train,
              eval_set=evaluation, 
              eval_metric="logloss",
              early_stopping_rounds=50,
              verbose=False)

    logLossTrain = sklearn.metrics.log_loss(y_train,model.predict_proba(X_train_scaled))
    logLossTest = sklearn.metrics.log_loss(y_test,model.predict_proba(X_test_scaled))

    #print ("SCORE:", logLossTrain)
    #print ("SCORE:", logLossTest)

    #change the metric if you like
    return {'loss': logLossTest, 'status': STATUS_OK, 'model': model}

In [54]:
trials = Trials()
bestXGBClassifier = fmin(fn=xgb_hyperparameter_tuning,
            space=xgbSpace,
            algo=anneal.suggest,
            max_evals=2500,
            trials=trials)

print(bestXGBClassifier)

100%|██████████| 2500/2500 [1:06:31<00:00,  1.60s/trial, best loss: 0.5379506228317259]
{'colsample_bytree': 0.6335811872761462, 'gamma': 2.0, 'learning_rate': 0.10336793213647967, 'max_depth': 22.0, 'min_child_weight': 26.0, 'reg_alpha': 3.0, 'reg_lambda': 4.5}


In [55]:
xgbClassifier = xgb.XGBClassifier(n_estimators = 5000, 
                                  max_depth = int(bestXGBClassifier['max_depth']), 
                                  learning_rate = bestXGBClassifier['learning_rate'],
                                  gamma = bestXGBClassifier['gamma'],
                                  reg_alpha = bestXGBClassifier['reg_alpha'],
                                  reg_lambda = bestXGBClassifier['reg_lambda'],
                                  min_child_weight = bestXGBClassifier['min_child_weight'],
                                  colsample_bytree = bestXGBClassifier['colsample_bytree'], 
                                  use_label_encoder=False)

evaluation = [(X_train_scaled, y_train), ( X_test_scaled, y_test)]

xgbClassifier.fit(X_train_scaled, y_train,
              eval_set=evaluation, 
              eval_metric="logloss",
              early_stopping_rounds=50,
              verbose=False)

print(sklearn.metrics.log_loss(y_train,xgbClassifier.predict_proba(X_train_scaled)))
print(sklearn.metrics.log_loss(y_test,xgbClassifier.predict_proba(X_test_scaled)))

0.4261382816673238
0.5379506228317259


In [56]:
pickle.dump(xgbClassifier, open("xgbClassifier.pickle.dat", "wb"))
#xgbClassifier = pickle.load(open("xgbClassifier.pickle.dat", "rb"))

### LightGBM

In [46]:
import lightgbm

In [49]:
lightGBMSpace = {'num_leaves': hp.quniform("num_leaves", 4, 400, 1),
                 'max_depth': hp.quniform("max_depth", 2, 30, 1),
        'learning_rate': hp.uniform ('learning_rate', 0.00001,0.5),
        'reg_alpha' : hp.quniform('reg_alpha', 0,20, 1),
        'reg_lambda' : hp.quniform('reg_lambda', 0,20, 1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.25,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 50, 1),
        'n_estimators': 5000
    }

In [56]:
# Classifier:
def lightGBM_hyperparameter_tuning(space):
    
    model = lightgbm.LGBMClassifier(max_depth = int(space['max_depth']), 
                                    num_leaves = int(space['num_leaves']),
                                    learning_rate = space['learning_rate'], 
                                    reg_alpha = space['reg_alpha'],
                                    reg_lambda = space['reg_lambda'],
                                    min_child_weight=space['min_child_weight'],
                                    colsample_bytree=space['colsample_bytree'])
    
    evaluation = [(X_train_scaled, y_train), ( X_test_scaled, y_test)]
    
    model.fit(X_train_scaled, y_train,
              eval_set=evaluation, 
              eval_metric="logloss",
              #early_stopping_rounds=50,
              #verbose=False, 
             callbacks=[lightgbm.early_stopping(stopping_rounds= 50)])

    logLossTrain = sklearn.metrics.log_loss(y_train,model.predict_proba(X_train_scaled))
    logLossTest = sklearn.metrics.log_loss(y_test,model.predict_proba(X_test_scaled))

    #print ("SCORE:", logLossTrain)
    #print ("SCORE:", logLossTest)

    #change the metric if you like
    return {'loss': logLossTest, 'status': STATUS_OK}

In [57]:
trials = Trials()
bestLightGBM = fmin(fn=lightGBM_hyperparameter_tuning,
                    space=lightGBMSpace,
                    algo=anneal.suggest,
                    max_evals=2500,
                    trials=trials)

print(bestLightGBM)

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:         
[63]	training's binary_logloss: 0.494863	valid_1's binary_logloss: 0.548831
Training until validation scores don't improve for 50 rounds                      
Early stopping, best iteration is:                                                
[25]	training's binary_logloss: 0.378019	valid_1's binary_logloss: 0.553883
Training until validation scores don't improve for 50 rounds                      
Early stopping, best iteration is:                                                
[22]	training's binary_logloss: 0.455067	valid_1's binary_logloss: 0.557376
Training until validation scores don't improve for 50 rounds                      
Early stopping, best iteration is:                                                
[14]	training's binary_logloss: 0.485405	valid_1's binary_logloss: 0.551501
Training until validation scores don't improve for 50 rounds                    

In [58]:
lightGBMClassifier = lightgbm.LGBMClassifier(n_estimators = 5000, 
                                  num_leaves = int(bestLightGBM['num_leaves']), 
                                  learning_rate = bestLightGBM['learning_rate'], 
                                    reg_alpha = bestLightGBM['reg_alpha'],
                                    reg_lambda = bestLightGBM['reg_lambda'],
                                    min_child_weight = bestLightGBM['min_child_weight'],
                                    colsample_bytree = bestLightGBM['colsample_bytree'])

evaluation = [(X_train_scaled, y_train), ( X_test_scaled, y_test)]
lightGBMClassifier.fit(X_train_scaled, y_train,
                  eval_set=evaluation, 
                  eval_metric="logloss",
                  #arly_stopping_rounds=50,verbose=False, 
                      callbacks=[lightgbm.early_stopping(stopping_rounds= 50)])

print(sklearn.metrics.log_loss(y_train,lightGBMClassifier.predict_proba(X_train_scaled)))
print(sklearn.metrics.log_loss(y_test,lightGBMClassifier.predict_proba(X_test_scaled)))

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[78]	training's binary_logloss: 0.459489	valid_1's binary_logloss: 0.542049
0.4594893174780938
0.5420493513288968


In [59]:
pickle.dump(lightGBMClassifier, open("lightGBMClassifier.pickle.dat", "wb"))
#lightGBMClassifier = pickle.load(open("lightGBMClassifier.pickle.dat", "rb"))

# CMDStanPy Model

In [None]:
!pip install --upgrade cmdstanpy

In [None]:
import cmdstanpy
from cmdstanpy import CmdStanModel
cmdstanpy.install_cmdstan()

In [None]:
def cmdStanData(dataSet, season):
    
    dataSet = dataSet[dataSet.Season == season]
    
    team_key = pd.read_csv('/Users/mitchfairweather/Desktop/NCAA Data/mens-march-mania-2022/MDataFiles_Stage1/MTeams.csv')[["TeamID", "TeamName"]]
    team_key['TeamID'] = team_key['TeamID'].astype(int)
    team_key = team_key[team_key['TeamID'].isin(np.unique(dataSet.T1_TeamID))]
    team_key["id"] = range(1, len(team_key.index) + 1)
    
    dataSet['homei'] = np.where(dataSet.location == "H", 1, 0)
    dataSet['homej'] = np.where(dataSet.location == "A", 1, 0)
    dataSet['margin'] = dataSet.T1_Score - dataSet.T2_Score
    dataSet = dataSet[["Season", "DayNum", "T1_TeamID", "T1_Score", "T2_TeamID", "T2_Score", "margin", "homei", "homej"]]
    dataSet = dataSet.rename(columns={#'T1_TeamID' : 'teami',
                          'T1_Score'  : 'scorei', 
                          #'T2_TeamID' : 'teamj', 
                          'T2_Score'  : 'scorej',
                          'DayNum'  : 'daynum',
                          'Season'  : 'season'})
    
    dataSet['T1_TeamID'] = dataSet['T1_TeamID'].astype(str)
    dataSet['T2_TeamID'] = dataSet['T2_TeamID'].astype(str)
    team_key['TeamID'] = team_key['TeamID'].astype(str)

    
    dataSet = dataSet.merge(team_key, left_on="T1_TeamID" , right_on="TeamID")
    dataSet = dataSet.rename(index = str, columns = {"id" : "teami"})
    dataSet.drop(['TeamID', 'TeamName'], axis = 1, inplace = True)
    dataSet = dataSet.merge(team_key, left_on="T2_TeamID" , right_on="TeamID")
    dataSet = dataSet.rename(index = str, columns = {"id" : "teamj"})
    dataSet.drop(['TeamID', 'TeamName'], axis = 1, inplace = True)
    
    names = ["N", "y", "h_i", "h_j", "team_i", "team_j", "N_g"]
    values = [len(dataSet.index), list(dataSet.margin), list(dataSet.homei), list(dataSet.homej), list(dataSet.teami), list(dataSet.teamj), dataSet.teamj.max()]
    train = dict(zip(names, values))
    
    return train, team_key

In [None]:
def cmdStanModel(dataSet, season):
    train, team_key = pyStanData(dataSet, season) 

    cmdModel = CmdStanModel(stan_file = "../input/stan-model-code/cmdstan.stan")

    fit = model.sample(data=cmdData, 
                        chains = 4, 
                       seed = 2020, 
                       iter_warmup = 500, 
                       iter_sampling = 2000,
                       adapt_delta = .9)
    return fit, cmdModel, team_key

In [49]:
cmdStan2003Fit,cmdStan2003Model, team_key2003 = pyStanModel(regular_results_detailed, 2003)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
In file included from /opt/conda/lib/python3.7/site-packages/numpy/core/inc

KeyboardInterrupt: 

In [None]:
cmdStan2003Fit,cmdStan2003Model, team_key2003 = cmdStanModel(regular_results_detailed, 2003)
cmdStan2004Fit,cmdStan2004Model, team_key2004 = cmdStanModel(regular_results_detailed, 2004)
cmdStan2005Fit,cmdStan2005Model, team_key2005 = cmdStanModel(regular_results_detailed, 2005)
cmdStan2006Fit,cmdStan2006Model, team_key2006 = cmdStanModel(regular_results_detailed, 2006)
cmdStan2007Fit,cmdStan2007Model, team_key2007 = cmdStanModel(regular_results_detailed, 2007)

cmdStan2008Fit,cmdStan2008Model, team_key2008 = cmdStanModel(regular_results_detailed, 2008)
cmdStan2009Fit,cmdStan2009Model, team_key2009 = cmdStanModel(regular_results_detailed, 2009)
cmdStan2010Fit,cmdStan2010Model, team_key2010 = cmdStanModel(regular_results_detailed, 2010)
cmdStan2011Fit,cmdStan2011Model, team_key2011 = cmdStanModel(regular_results_detailed, 2011)

cmdStan2012Fit,cmdStan2012Model, team_key2012 = cmdStanModel(regular_results_detailed, 2012)
cmdStan2013Fit,cmdStan2013Model, team_key2013 = cmdStanModel(regular_results_detailed, 2013)
cmdStan2014Fit,cmdStan2014Model, team_key2014 = cmdStanModel(regular_results_detailed, 2014)

cmdStan2015Fit,cmdStan2015Model, team_key2015 = cmdStanModel(regular_results_detailed, 2015)
cmdStan2016Fit,cmdStan2016Model, team_key2016 = cmdStanModel(regular_results_detailed, 2016)
cmdStan2017Fit,cmdStan2017Model, team_key2017 = cmdStanModel(regular_results_detailed, 2017)
cmdStan2018Fit,cmdStan2018Model, team_key2018 = cmdStanModel(regular_results_detailed, 2018)
cmdStan2019Fit,cmdStan2019Model, team_key2019 = cmdStanModel(regular_results_detailed, 2019)
cmdStan2021Fit,cmdStan2021Model, team_key2021 = cmdStanModel(regular_results_detailed, 2021)

In [None]:
def cmdStanPreds(fitObject, team_keys, tourneyData, season):
    tourney_matchups = tourneyData[tourneyData.Season== season][['Season', 'T1_TeamID', 'T2_TeamID', 'win']] 
    
    tourney_matchups = tourney_matchups.merge(team_keys, 
                       how = 'left', 
                       left_on = ['T1_TeamID'], 
                      right_on = ['TeamID'])
    tourney_matchups.rename(columns={'TeamName': 'T1_Name', 'id': 'T1_ID'}, inplace=True)
    tourney_matchups.drop('TeamID', axis = 1, inplace = True)

    tourney_matchups = tourney_matchups.merge(team_keys, 
                       how = 'left', 
                       left_on = ['T2_TeamID'], 
                      right_on = ['TeamID'])
    tourney_matchups.rename(columns={'TeamName': 'T2_Name', 'id': 'T2_ID'}, inplace=True)
    tourney_matchups.drop('TeamID', axis = 1, inplace = True)
    
    win_prob = []
    
    th = pd.DataFrame(fitObject.stan_variable("theta"))
    a = pd.DataFrame(fitObject.stan_variable("alpha"))
    sig = fitObject.stan_variable("sigma")
    
    def compare2(i, j, fitObject, homei = 0, homej = 0, reps = 750):
        win_prob = []

        th = pd.DataFrame(fitObject.stan_variable("theta"))
        a = pd.DataFrame(fitObject.stan_variable("alpha"))
        sig = fitObject.stan_variable("sigma")

        # Simulating specified number of games
        for r in range(1, reps):
            win_prob.append(
                np.mean(

                    # Ability difference
                    th[i] - th[j] +

                    # Adjusting for home court
                    a[i]*homei - a[j]*homej +

                    # Team performance variance
                    np.random.normal(0, sig[random.randrange(len(sig))],len(th.index)) > 0
                )
            )

        # Averaging game results
        win_prob = np.mean(win_prob)

        return(win_prob)
    
    preds = []

    for g in range(len(tourney_matchups.index)):
        preds.append(compare2(tourney_matchups["T1_ID"][g] - 1, 
                              tourney_matchups["T2_ID"][g] - 1,
                              fitObject = fitObject, 
                              homei = 0, 
                              homej = 0))
    tourney_matchups['pred'] = preds
    tourney_matchups = tourney_matchups[['Season', 'T1_TeamID', 'T2_TeamID', 'win', 'pred']]
    return tourney_matchups

In [None]:
cmdStanPreds2003 = cmdStanPredictions(fitObject = cmdStan2003Fit, 
                              team_keys = team_key2003, 
                              tourneyData = tournament_results, 
                              season = 2003)

cmdStanPreds2004 = cmdStanPredictions(fitObject = cmdStan2004Fit, 
                              team_keys = team_key2004, 
                              tourneyData = tournament_results, 
                              season = 2004)

cmdStanPreds2005 = cmdStanPredictions(fitObject = cmdStan2005Fit, 
                              team_keys = team_key2005, 
                              tourneyData = tournament_results, 
                              season = 2005)

cmdStanPreds2006 = cmdStanPredictions(fitObject = cmdStan2006Fit, 
                              team_keys = team_key2006, 
                              tourneyData = tournament_results, 
                              season = 2006)

cmdStanPreds2007 = cmdStanPredictions(fitObject = cmdStan2007Fit, 
                              team_keys = team_key2007, 
                              tourneyData = tournament_results, 
                              season = 2007)

cmdStanPreds2008 = cmdStanPredictions(fitObject = cmdStan2008Fit, 
                              team_keys = team_key2008, 
                              tourneyData = tournament_results, 
                              season = 2008)

cmdStanPreds2009 = cmdStanPredictions(fitObject = cmdStan2009Fit, 
                              team_keys = team_key2009, 
                              tourneyData = tournament_results, 
                              season = 2009)

cmdStanPreds2010 = cmdStanPredictions(fitObject = cmdStan2010Fit, 
                              team_keys = team_key2010, 
                              tourneyData = tournament_results, 
                              season = 2010)

cmdStanPreds2011 = cmdStanPredictions(fitObject = cmdStan2011Fit, 
                              team_keys = team_key2011, 
                              tourneyData = tournament_results, 
                              season = 2011)

cmdStanPreds2012 = cmdStanPredictions(fitObject = cmdStan2012Fit, 
                              team_keys = team_key2012, 
                              tourneyData = tournament_results, 
                              season = 2012)

cmdStanPreds2013 = cmdStanPredictions(fitObject = cmdStan2013Fit, 
                              team_keys = team_key2013, 
                              tourneyData = tournament_results, 
                              season = 2013)
cmdStanPreds2014 = cmdStanPredictions(fitObject = cmdStan2014Fit, 
                              team_keys = team_key2014, 
                              tourneyData = tournament_results, 
                              season = 2014)
cmdStanPreds2015 = cmdStanPredictions(fitObject = cmdStan2015Fit, 
                              team_keys = team_key2015, 
                              tourneyData = tournament_results, 
                              season = 2015)
cmdStanPreds2016 = cmdStanPredictions(fitObject = cmdStan2016Fit, 
                              team_keys = team_key2016, 
                              tourneyData = tournament_results, 
                              season = 2016)
cmdStanPreds2017 = cmdStanPredictions(fitObject = cmdStan2017Fit, 
                              team_keys = team_key2017, 
                              tourneyData = tournament_results, 
                              season = 2017)
cmdStanPreds2018 = cmdStanPredictions(fitObject = cmdStan2018Fit, 
                              team_keys = team_key2018, 
                              tourneyData = tournament_results, 
                              season = 2018)
cmdStanPreds2019 = cmdStanPredictions(fitObject = cmdStan2019Fit, 
                              team_keys = team_key2019, 
                              tourneyData = tournament_results, 
                              season = 2019)
cmdStanPreds2021 = cmdStanPredictions(fitObject = cmdStan2021Fit, 
                              team_keys = team_key2021, 
                              tourneyData = tournament_results, 
                              season = 2021)

In [None]:
#2003
with open("cmdStan2003Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2003Model, file)
with open("cmdStan2003Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2003Fit, file)
with open("team_key2003.pkl"  , 'wb') as file:  
    pickle.dump(team_key2003, file)
with open("cmdStanPreds2003.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2003, file)
    
#2004
with open("cmdStan2004Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2004Model, file)
with open("cmdStan2004Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2004Fit, file)
with open("team_key2004.pkl"  , 'wb') as file:  
    pickle.dump(team_key2004, file)
with open("cmdStanPreds2004.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2004, file)
    
#2005
with open("cmdStan2005Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2005Model, file)
with open("cmdStan2005Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2005Fit, file)
with open("team_key2005.pkl"  , 'wb') as file:  
    pickle.dump(team_key2005, file)
with open("cmdStanPreds2005.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2005, file)
    
#2006
with open("cmdStan2006Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2006Model, file)
with open("cmdStan2006Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2006Fit, file)
with open("team_key2006.pkl"  , 'wb') as file:  
    pickle.dump(team_key2006, file)
with open("cmdStanPreds2006.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2006, file)
    
#2007
with open("cmdStan2007Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2007Model, file)
with open("cmdStan2007Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2007Fit, file)
with open("team_key2007.pkl"  , 'wb') as file:  
    pickle.dump(team_key2007, file)
with open("cmdStanPreds2007.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2007, file)
    
#2008
with open("cmdStan2008Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2008Model, file)
with open("cmdStan2008Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2008Fit, file)
with open("team_key2008.pkl"  , 'wb') as file:  
    pickle.dump(team_key2008, file)
with open("cmdStanPreds2008.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2008, file)
    
#2009
with open("cmdStan2009Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2009Model, file)
with open("cmdStan2009Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2009Fit, file)
with open("team_key2009.pkl"  , 'wb') as file:  
    pickle.dump(team_key2009, file)
with open("cmdStanPreds2009.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2009, file)
    
#2010
with open("cmdStan2010Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2010Model, file)
with open("cmdStan2010Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2010Fit, file)
with open("team_key2010.pkl"  , 'wb') as file:  
    pickle.dump(team_key2010, file)
with open("cmdStanPreds2010.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2010, file)
    
#2011
with open("cmdStan2011Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2011Model, file)
with open("cmdStan2011Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2011Fit, file)
with open("team_key2011.pkl"  , 'wb') as file:  
    pickle.dump(team_key2011, file)
with open("cmdStanPreds2011.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2011, file)
    
#2012
with open("cmdStan2012Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2012Model, file)
with open("cmdStan2012Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2012Fit, file)
with open("team_key2012.pkl"  , 'wb') as file:  
    pickle.dump(team_key2012, file)
with open("cmdStanPreds2012.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2012, file)
    
#2013
with open("cmdStan2013Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2013Model, file)
with open("cmdStan2013Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2013Fit, file)
with open("team_key2013.pkl"  , 'wb') as file:  
    pickle.dump(team_key2013, file)
with open("cmdStanPreds2013.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2013, file)
    
#2014
with open("cmdStan2014Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2014Model, file)
with open("cmdStan2014Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2014Fit, file)
with open("team_key2014.pkl"  , 'wb') as file:  
    pickle.dump(team_key2014, file)
with open("cmdStanPreds2014.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2014, file)

#2015
with open("cmdStan2015Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2015Model, file)
with open("cmdStan2015Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2015Fit, file)
with open("team_key2015.pkl"  , 'wb') as file:  
    pickle.dump(team_key2015, file)
with open("cmdStanPreds2015.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2015, file)
    
#2016
with open("cmdStan2016Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2016Model, file)
with open("cmdStan2016Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2016Fit, file)
with open("team_key2016.pkl"  , 'wb') as file:  
    pickle.dump(team_key2016, file)
with open("cmdStanPreds2016.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2016, file)

#2017
with open("cmdStan2017Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2017Model, file)
with open("cmdStan2017Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2017Fit, file)
with open("team_key2017.pkl"  , 'wb') as file:  
    pickle.dump(team_key2017, file)
with open("cmdStanPreds2017.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2017, file)
    
#2018
with open("cmdStan2018Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2018Model, file)
with open("cmdStan2018Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2018Fit, file)
with open("team_key2018.pkl"  , 'wb') as file:  
    pickle.dump(team_key2018, file)
with open("cmdStanPreds2018.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2018, file)
    
#2019
with open("cmdStan2019Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2019Model, file)
with open("cmdStan2019Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2019Fit, file)
with open("team_key2019.pkl"  , 'wb') as file:  
    pickle.dump(team_key2019, file)
with open("cmdStanPreds2019.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2019, file)
#2021
with open("cmdStan2021Model.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2021Model, file)
with open("cmdStan2021Fit.pkl"  , 'wb') as file:  
    pickle.dump(cmdStan2021Fit, file)
with open("team_key2021.pkl"  , 'wb') as file:  
    pickle.dump(team_key2021, file)
with open("cmdStanPreds2021.pkl"  , 'wb') as file:  
    pickle.dump(cmdStanPreds2021, file)

### Reading in pystan models

#2003
with open('../input/stage-1-predictions/pyStan2003Model.pkl', 'rb') as file:  
    pystan2003Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2003Fit.pkl', 'rb') as file:  
    pystan2003Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2003.pkl', 'rb') as file:  
    pystanPreds2003 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2003.pkl', 'rb') as file:  
    team_key2003 = pickle.load(file)
    
#2004
with open('../input/stage-1-predictions/pyStan2004Model.pkl', 'rb') as file:  
    pystan2004Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2004Fit.pkl', 'rb') as file:  
    pystan2004Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2004.pkl', 'rb') as file:  
    pystanPreds2004 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2004.pkl', 'rb') as file:  
    team_key2004 = pickle.load(file)
    
#2005
with open('../input/stage-1-predictions/pyStan2005Model.pkl', 'rb') as file:  
    pystan2005Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2005Fit.pkl', 'rb') as file:  
    pystan2005Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2005.pkl', 'rb') as file:  
    pystanPreds2005 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2005.pkl', 'rb') as file:  
    team_key2005 = pickle.load(file)
    
#2006
with open('../input/stage-1-predictions/pyStan2006Model.pkl', 'rb') as file:  
    pystan2006Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2006Fit.pkl', 'rb') as file:  
    pystan2006Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2006.pkl', 'rb') as file:  
    pystanPreds2006 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2006.pkl', 'rb') as file:  
    team_key2006 = pickle.load(file)
    
#2007
with open('../input/stage-1-predictions/pyStan2007Model.pkl', 'rb') as file:  
    pystan2007Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2007Fit.pkl', 'rb') as file:  
    pystan2007Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2007.pkl', 'rb') as file:  
    pystanPreds2007 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2007.pkl', 'rb') as file:  
    team_key2007 = pickle.load(file)
    
#2008
with open('../input/stage-1-predictions/pyStan2008Model.pkl', 'rb') as file:  
    pystan2008Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2008Fit.pkl', 'rb') as file:  
    pystan2008Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2008.pkl', 'rb') as file:  
    pystanPreds2008 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2008.pkl', 'rb') as file:  
    team_key2008 = pickle.load(file)
    
#2009
with open('../input/stage-1-predictions/pyStan2009Model.pkl', 'rb') as file:  
    pystan2009Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2009Fit.pkl', 'rb') as file:  
    pystan2009Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2009.pkl', 'rb') as file:  
    pystanPreds2009 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2009.pkl', 'rb') as file:  
    team_key2009 = pickle.load(file)
    
#2010
with open('../input/stage-1-predictions/pyStan2010Model.pkl', 'rb') as file:  
    pystan2010Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2010Fit.pkl', 'rb') as file:  
    pystan2010Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2010.pkl', 'rb') as file:  
    pystanPreds2010 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2010.pkl', 'rb') as file:  
    team_key2010= pickle.load(file)
    
#2011
with open('../input/stage-1-predictions/pyStan2011Model.pkl', 'rb') as file:  
    pystan2011Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2011Fit.pkl', 'rb') as file:  
    pystan2011Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2011.pkl', 'rb') as file:  
    pystanPreds2011 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2011.pkl', 'rb') as file:  
    team_key2011= pickle.load(file)
    
#2012
with open('../input/stage-1-predictions/pyStan2012Model.pkl', 'rb') as file:  
    pystan2012Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2012Fit.pkl', 'rb') as file:  
    pystan2012Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2012.pkl', 'rb') as file:  
    pystanPreds2012 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2012.pkl', 'rb') as file:  
    team_key2012= pickle.load(file)
    
#2013
with open('../input/stage-1-predictions/pyStan2013Model.pkl', 'rb') as file:  
    pystan2013Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2013Fit.pkl', 'rb') as file:  
    pystan2013Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2013.pkl', 'rb') as file:  
    pystanPreds2013 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2013.pkl', 'rb') as file:  
    team_key2010= pickle.load(file)

        
#2014
with open('../input/stage-1-predictions/pyStan2014Model.pkl', 'rb') as file:  
    pystan2014Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2014Fit.pkl', 'rb') as file:  
    pystan2014Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2014.pkl', 'rb') as file:  
    pystanPreds2014 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2014.pkl', 'rb') as file:  
    team_key2014= pickle.load(file)

#2015
with open('../input/stage-1-predictions/pyStan2015Model.pkl', 'rb') as file:  
    pystan2015Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2015Fit.pkl', 'rb') as file:  
    pystan2015Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2015.pkl', 'rb') as file:  
    pystanPreds2015 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2015.pkl', 'rb') as file:  
    team_key2015 = pickle.load(file)
    
#2016
with open('../input/stage-1-predictions/pyStan2016Model.pkl', 'rb') as file:  
    pystan2016Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2016Fit.pkl', 'rb') as file:  
    pystan2016Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2016.pkl', 'rb') as file:  
    pystanPreds2016 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2016.pkl', 'rb') as file:  
    team_key2016 = pickle.load(file)

#2017
with open('../input/stage-1-predictions/pyStan2017Model.pkl', 'rb') as file:  
    pystan2017Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2017Fit.pkl', 'rb') as file:  
    pystan2017Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2017.pkl', 'rb') as file:  
    pystanPreds2017 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2017.pkl', 'rb') as file:  
    team_key2017 = pickle.load(file)
    
#2018
with open('../input/stage-1-predictions/pyStan2018Model.pkl', 'rb') as file:  
    pystan2018Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2018Fit.pkl', 'rb') as file:  
    pystan2018Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2018.pkl', 'rb') as file:  
    pystanPreds2018 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2018.pkl', 'rb') as file:  
    team_key2018 = pickle.load(file)
    
#2019
with open('../input/stage-1-predictions/pyStan2019Model.pkl', 'rb') as file:  
    pystan2019Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2019Fit.pkl', 'rb') as file:  
    pystan2019Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2019.pkl', 'rb') as file:  
    pystanPreds2019 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2019.pkl', 'rb') as file:  
    team_key2019 = pickle.load(file)
    
 #2021
with open('../input/stage-1-predictions/pyStan2021Model.pkl', 'rb') as file:  
    pystan2021Model = pickle.load(file)
with open('../input/stage-1-predictions/pyStan2021Fit.pkl', 'rb') as file:  
    pystan2021Fit = pickle.load(file)
with open('../input/stage-1-predictions/preds2021.pkl', 'rb') as file:  
    pystanPreds2021 = pickle.load(file)
with open('../input/stage-1-predictions/team_key2021.pkl', 'rb') as file:  
    team_key2021 = pickle.load(file)



### Average of Predicted Probabilities

In [None]:
trainPreds['AveragePred'] = (trainPreds.pystanPreds + trainPreds.xgbPreds + trainPreds.lightGBMPreds) / 3
validPreds['AveragePred'] = (validPreds.pystanPreds + validPreds.xgbPreds + validPreds.lightGBMPreds) / 3

trainPreds['AveragePredClip'] = np.clip(trainPreds['AveragePred'], 0.05, 0.95)
validPreds['AveragePredClip'] = np.clip(validPreds['AveragePred'], 0.05, 0.95)

In [None]:
trainPreds.head()

In [None]:
validPreds.head()

In [None]:
print('No manipulation train: ', sklearn.metrics.log_loss(trainPreds['win'], trainPreds['AveragePred']))
print('No manipulation valid: ', sklearn.metrics.log_loss(validPreds['win'], validPreds['AveragePred']))

print('Clip train: ', sklearn.metrics.log_loss(trainPreds['win'], trainPreds['AveragePredClip']))
print('Clip valid: ', sklearn.metrics.log_loss(validPreds['win'], validPreds['AveragePredClip']))

### Conclusion: 

The clear and obvious winner here is actually a simple arithmetic average of the predicted probabilities and then clipped at values of (0.05, 0.95). For making the submission file, the steps will be to: 

1. Predictions on pystan (will take quite some time)
2. Predictions on XGBoost
3. Predictions on lightGBM
4. Average predictions from each model
5. Clip predicted probabilites using np.clip(prob, 0.05, 0.95)

# Predictions by Season

In [None]:
def matchupData(matchupData):
    
    matchupData = matchupData.merge(OffensiveStatsT1, 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','T1_TeamID'],
                               sort=False)
    matchupData = matchupData.merge(DefensiveStatsT1, 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','T1_TeamID'],
                               sort=False)
    matchupData = matchupData.merge(OffensiveStatsT2, 
                               how='left', 
                               left_on = ['Season','T2_TeamID'], 
                               right_on =['Season','T2_TeamID'],
                               sort=False)
    matchupData = matchupData.merge(DefensiveStatsT2, 
                               how='left', 
                               left_on = ['Season','T2_TeamID'], 
                               right_on =['Season','T2_TeamID'],
                               sort=False)
    
    matchupData = matchupData.merge(seeds, 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
    matchupData = matchupData.merge(seeds, 
                                   how='left', 
                                   left_on = ['Season','T2_TeamID'], 
                                   right_on =['Season','TeamID'],
                                   sort=False)
    matchupData.rename(columns={'Seed_x': 'T1_Seed', 
                             'Seed_y': 'T2_Seed'}, inplace=True)
    matchupData.drop(['TeamID_x', 'TeamID_y'], axis = 1, inplace = True)
    matchupData['seed_dif'] = matchupData['T1_Seed'] - matchupData['T2_Seed']

    matchupData = matchupData.merge(H1_team_quality[['TeamID', 'Season', 'quality']], 
                                   how='left', 
                                   left_on = ['Season','T1_TeamID'], 
                                   right_on =['Season','TeamID'],
                                   sort=False)
    matchupData.rename(columns={'quality': 'T1_H1_Quality'}, inplace=True)
    matchupData.drop(['TeamID'], axis = 1, inplace = True)

    matchupData = matchupData.merge(H1_team_quality[['TeamID', 'Season', 'quality']], 
                                   how='left', 
                                   left_on = ['Season','T2_TeamID'], 
                                   right_on =['Season','TeamID'],
                                   sort=False)
    matchupData.rename(columns={'quality': 'T2_H1_Quality'}, inplace=True)
    matchupData.drop(['TeamID'], axis = 1, inplace = True)



    matchupData = matchupData.merge(H2_team_quality[['TeamID', 'Season', 'quality']], 
                                   how='left', 
                                   left_on = ['Season','T1_TeamID'], 
                                   right_on =['Season','TeamID'],
                                   sort=False)
    matchupData.rename(columns={'quality': 'T1_H2_Quality'}, inplace=True)
    matchupData.drop(['TeamID'], axis = 1, inplace = True)

    matchupData = matchupData.merge(H2_team_quality[['TeamID', 'Season', 'quality']], 
                                   how='left', 
                                   left_on = ['Season','T2_TeamID'], 
                                   right_on =['Season','TeamID'],
                                   sort=False)
    matchupData.rename(columns={'quality': 'T2_H2_Quality'}, inplace=True)
    matchupData.drop(['TeamID'], axis = 1, inplace = True)

    matchupData = matchupData.merge(Full_team_quality[['TeamID', 'Season', 'quality']], 
                                   how='left', 
                                   left_on = ['Season','T1_TeamID'], 
                                   right_on =['Season','TeamID'],
                                   sort=False)
    matchupData.rename(columns={'quality': 'T1_Full_Quality'}, inplace=True)
    matchupData.drop(['TeamID'], axis = 1, inplace = True)

    matchupData = matchupData.merge(Full_team_quality[['TeamID', 'Season', 'quality']], 
                                   how='left', 
                                   left_on = ['Season','T2_TeamID'], 
                                   right_on =['Season','TeamID'],
                                   sort=False)
    matchupData.rename(columns={'quality': 'T2_Full_Quality'}, inplace=True)
    matchupData.drop(['TeamID'], axis = 1, inplace = True)
    
    matchupData = featureDifferences(matchupData)
    
    matchupData = matchupData.merge(MasseyRankings, 
                               how='left', 
                               left_on = ['Season','T1_TeamID'], 
                               right_on =['Season','TeamID'],
                               sort=False)
    matchupData.rename(columns={'StartRank': 'T1_StartRank', 
                                      'MidRank': 'T1_MidRank', 
                                      'EndRank': 'T1_EndRank', 
                                      'BestRank': 'T1_BestRank', 
                                      'RankDif': 'T1_RankDif'}, inplace=True)
    matchupData.drop(['TeamID'], axis = 1, inplace = True)

    matchupData = matchupData.merge(MasseyRankings, 
                                   how='left', 
                                   left_on = ['Season','T2_TeamID'], 
                                   right_on =['Season','TeamID'],
                                   sort=False)
    matchupData.rename(columns={'StartRank': 'T2_StartRank', 
                                      'MidRank': 'T2_MidRank', 
                                      'EndRank': 'T2_EndRank', 
                                      'BestRank': 'T2_BestRank', 
                                      'RankDif': 'T2_RankDif'}, inplace=True)
    matchupData.drop(['TeamID'], axis = 1, inplace = True)

        
    return matchupData


In [None]:
def pyStanStage1Predictions(fitObject, team_keys, matchupData):
    matchupData = matchupData[['Season', 'T1_TeamID', 'T2_TeamID']] 
    
    matchupData = matchupData.merge(team_keys, 
                       how = 'left', 
                       left_on = ['T1_TeamID'], 
                      right_on = ['TeamID'])
    matchupData.rename(columns={'TeamName': 'T1_Name', 'id': 'T1_ID'}, inplace=True)
    matchupData.drop('TeamID', axis = 1, inplace = True)

    matchupData = matchupData.merge(team_keys, 
                       how = 'left', 
                       left_on = ['T2_TeamID'], 
                      right_on = ['TeamID'])
    matchupData.rename(columns={'TeamName': 'T2_Name', 'id': 'T2_ID'}, inplace=True)
    matchupData.drop('TeamID', axis = 1, inplace = True)
    
    win_prob = []
    
    th = pd.DataFrame(fitObject.extract()["theta"])
    a = pd.DataFrame(fitObject.extract()["alpha"])
    sig = fitObject.extract()["sigma"]
    
    def compare2(i, j, fitObject, homei = 0, homej = 0, reps = 1000):
        win_prob = []

        th = pd.DataFrame(fitObject.extract()["theta"])
        a = pd.DataFrame(fitObject.extract()["alpha"])
        sig = fitObject.extract()["sigma"]

        # Simulating specified number of games
        for r in range(1, reps):
            win_prob.append(
                np.mean(

                    # Ability difference
                    th[i] - th[j] +

                    # Adjusting for home court
                    a[i]*homei - a[j]*homej +

                    # Team performance variance
                    np.random.normal(0, 
                                     sig[random.randrange(len(sig))], 
                                     len(th.index)
                    ) > 0
                )
            )

        # Averaging game results
        win_prob = np.mean(win_prob)

        return(win_prob)
    
    preds = []

    for g in range(len(matchupData.index)):
        preds.append(compare2(matchupData["T1_ID"][g] - 1, 
                              matchupData["T2_ID"][g] - 1,
                              fitObject = fitObject, 
                              homei = 0, 
                              homej = 0))
    return preds

In [None]:
def matchupDataPredictions(fitObject, team_keys, matchupData):
    matchupData_scaled = scaler.transform(matchupData.drop(['T1_TeamID', 'T2_TeamID', 'Season', 'ID'], axis = 1))
    xgbPreds = xgbClassifier.predict_proba(matchupData_scaled)[:,1]
    lightGBMPreds = lightGBMClassifier.predict_proba(matchupData_scaled)[:,1]
    pyStanPreds = pyStanStage1Predictions(fitObject, team_keys, matchupData)
    
    predDF = pd.concat([pd.DataFrame(xgbPreds), pd.DataFrame(lightGBMPreds),pd.DataFrame(pyStanPreds)], axis = 1)
    predDF.columns = ['xgbPreds', 'lightGBMPreds', 'pyStanPreds']
    predDF = pd.concat([matchupData[['T1_TeamID', 'T2_TeamID', 'Season', 'ID']],predDF], axis = 1)
    
    predDF = predDF.merge(team_keys[['TeamID', 'TeamName']], 
                 how = 'left',
                 left_on = 'T1_TeamID', 
                 right_on = 'TeamID')
    predDF.rename(columns={'TeamName': 'T1_Name'}, inplace=True)
    predDF.drop('TeamID', axis = 1, inplace = True)
    
    predDF = predDF.merge(team_key2015[['TeamID', 'TeamName']], 
                          how = 'left',
                          left_on = 'T2_TeamID', 
                          right_on = 'TeamID')
    predDF.rename(columns={'TeamName': 'T2_Name'}, inplace=True)
    predDF.drop('TeamID', axis = 1, inplace = True)  
    predDF = predDF[['T1_Name', 'T2_Name', 'T1_TeamID', 'T2_TeamID', 'Season', 'ID', 'xgbPreds', 'lightGBMPreds', 'pyStanPreds']]

    predDF['AveragePreds'] = (predDF['xgbPreds'] + predDF['lightGBMPreds'] + predDF['pyStanPreds'])/3    
    
    return predDF

    

In [None]:
submissionPreds2015 = matchupDataPredictions(fitObject = pyStan2015Fit, team_keys = team_key2015, matchupData = matchupData(matchups2015))
submissionPreds2016 = matchupDataPredictions(fitObject = pyStan2016Fit, team_keys = team_key2016, matchupData = matchupData(matchups2016))
submissionPreds2017 = matchupDataPredictions(fitObject = pyStan2017Fit, team_keys = team_key2017, matchupData = matchupData(matchups2017))
submissionPreds2018 = matchupDataPredictions(fitObject = pyStan2018Fit, team_keys = team_key2018, matchupData = matchupData(matchups2018))
submissionPreds2019 = matchupDataPredictions(fitObject = pyStan2019Fit, team_keys = team_key2019, matchupData = matchupData(matchups2019))

In [None]:
submissionPreds = pd.concat([submissionPreds2015, submissionPreds2016, submissionPreds2017, 
                            submissionPreds2018, submissionPreds2019], axis = 0)

In [None]:
submissionPreds2015.to_csv('submissionPreds2015.csv',index=False)
submissionPreds2016.to_csv('submissionPreds2016.csv',index=False)
submissionPreds2017.to_csv('submissionPreds2017.csv',index=False)
submissionPreds2018.to_csv('submissionPreds2018.csv',index=False)
submissionPreds2019.to_csv('submissionPreds2019.csv',index=False)
submissionPreds.to_csv('submissionPreds.csv',index=False)

In [None]:
finalSubmissionPreds = submissionPreds[["ID", "AveragePreds"]]
finalSubmissionPreds.columns = ['ID', 'Pred']
finalSubmissionPreds.to_csv('finalSubmissionPreds.csv',index=False)
