# 02 Create Predictions GBM
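Train a gradient-boosted (XGBoost) classifier on historical tournament results to predict the winner of each potential game, then save the fitted model to disk.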

In [1]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
import pickle

## Read in data

In [2]:
# Load the combined tournament results table and preview its first rows.
comb_results = pd.read_csv(filepath_or_buffer='../data/comb_results.csv')
comb_results.head(n=5)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner
0,1985,R1W1,W01,W16,1,1207,Georgetown,1250,Lehigh,68,0,43,0,1207
1,1985,R1W2,W02,W15,1,1210,Georgia Tech,1273,Mercer,65,0,58,0,1210
2,1985,R1W3,W03,W14,1,1228,Illinois,1318,Northeastern,76,0,57,0,1228
3,1985,R1W4,W04,W13,1,1260,Loyola-Chicago,1233,Iona,59,0,58,0,1260
4,1985,R1W5,W05,W12,1,1374,SMU,1330,Old Dominion,85,0,68,0,1374


In [3]:
# Load the tournament seeds and replace the raw 'Seed' string with its numeric part.
# Characters 1-2 of the seed string are parsed as the integer seed
# (presumably strings look like 'W01' — region letter followed by a two-digit seed; verify).
seeds = (
    pd.read_csv('../data/Stage2DataFiles//NCAATourneySeeds.csv')
    .assign(Seed_v2=lambda frame: frame['Seed'].map(lambda raw: int(raw[1:3])))
    .drop(columns='Seed')
)
seeds.head(n=5)

Unnamed: 0,Season,TeamID,Seed_v2
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [4]:
# Load the Massey ordinal rankings and keep only the rows of the 'RPI' system.
ordinals = pd.read_csv('../data/MasseyOrdinals_thru_2019_day_128/MasseyOrdinals_thru_2019_day_128.csv')
# keep='last' retains the final row per (Season, SystemName, TeamID) in file order.
# NOTE(review): presumably the file is ordered by RankingDayNum, making this the
# latest available ranking for each team and season — confirm.
rpi = (
    ordinals[ordinals['SystemName'].eq('RPI')]
    .drop_duplicates(subset=['Season', 'SystemName', 'TeamID'], keep='last')
)
rpi.head(n=5)

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
127586,2003,133,RPI,1102,158
127587,2003,133,RPI,1103,182
127588,2003,133,RPI,1104,38
127589,2003,133,RPI,1105,313
127590,2003,133,RPI,1106,248


## Create features

In [5]:
# Attach the numeric tournament seed of both teams to every game.
# Each join matches on (Season, team id); validate='m:1' makes pandas raise if
# `seeds` holds more than one row for a given season/team pair. The helper
# 'TeamID' key column brought in from `seeds` is dropped after each join.
comb_results_seeds = (
    comb_results
    # seed of the strong team
    .merge(seeds,
           left_on=['Season', 'TeamID_Strong'],
           right_on=['Season', 'TeamID'],
           validate='m:1')
    .rename(columns={'Seed_v2': 'Orig_Seed_Strong'})
    .drop(columns='TeamID')
    # seed of the weak team
    .merge(seeds,
           left_on=['Season', 'TeamID_Weak'],
           right_on=['Season', 'TeamID'],
           validate='m:1')
    .rename(columns={'Seed_v2': 'Orig_Seed_Weak'})
    .drop(columns='TeamID')
)

comb_results_seeds.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner,Orig_Seed_Strong,Orig_Seed_Weak
0,1985,R1W1,W01,W16,1,1207,Georgetown,1250,Lehigh,68,0,43,0,1207,1,16
1,1985,R2W1,R1W1,R1W8,2,1207,Georgetown,1396,Temple,63,0,46,0,1207,1,8
2,1985,R3W1,R2W1,R2W4,3,1207,Georgetown,1260,Loyola-Chicago,65,0,53,0,1207,1,4
3,1985,R4W1,R3W1,R3W2,4,1207,Georgetown,1210,Georgia Tech,60,0,54,0,1207,1,2
4,1985,R5WX,R4W1,R4X1,5,1207,Georgetown,1385,St John's,77,0,59,0,1207,1,1


In [6]:
# add win indicator for strong team
# Work on a copy so the seed-joined frame from the previous step stays untouched.
comb_results_seeds_v2 = comb_results_seeds.copy()
# Strong_Win is 1 when the winner is the strong team and 0 otherwise.
# The boolean comparison is cast explicitly to an integer column instead of
# multiplying a one-element list by the Series, which only worked through
# implicit broadcasting of the list against the Series.
comb_results_seeds_v2['Strong_Win'] = (
    comb_results_seeds_v2['TeamID_Winner'] == comb_results_seeds_v2['TeamID_Strong']
).astype('int64')
comb_results_seeds_v2.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner,Orig_Seed_Strong,Orig_Seed_Weak,Strong_Win
0,1985,R1W1,W01,W16,1,1207,Georgetown,1250,Lehigh,68,0,43,0,1207,1,16,1
1,1985,R2W1,R1W1,R1W8,2,1207,Georgetown,1396,Temple,63,0,46,0,1207,1,8,1
2,1985,R3W1,R2W1,R2W4,3,1207,Georgetown,1260,Loyola-Chicago,65,0,53,0,1207,1,4,1
3,1985,R4W1,R3W1,R3W2,4,1207,Georgetown,1210,Georgia Tech,60,0,54,0,1207,1,2,1
4,1985,R5WX,R4W1,R4X1,5,1207,Georgetown,1385,St John's,77,0,59,0,1207,1,1,1


In [7]:
# Attach the RPI ordinal rank of both teams to every game.
# how='left' keeps games for which no RPI row exists (their rank becomes NaN);
# validate='m:1' makes pandas raise if `rpi` has duplicate (Season, TeamID) rows.
comb_results_seeds_v3 = (
    comb_results_seeds_v2
    # RPI rank of the strong team
    .merge(rpi[['Season', 'TeamID', 'OrdinalRank']],
           left_on=['Season', 'TeamID_Strong'],
           right_on=['Season', 'TeamID'],
           how='left', validate='m:1')
    .rename(columns={'OrdinalRank': 'RPI_Strong'})
    .drop(columns=['TeamID'])
    # RPI rank of the weak team
    .merge(rpi[['Season', 'TeamID', 'OrdinalRank']],
           left_on=['Season', 'TeamID_Weak'],
           right_on=['Season', 'TeamID'],
           how='left', validate='m:1')
    .rename(columns={'OrdinalRank': 'RPI_Weak'})
    .drop(columns=['TeamID'])
)

In [8]:
# Flag whether an RPI rank is available for each side (1 = present, 0 = missing).
# The flags must be derived before the missing ranks are overwritten below.
comb_results_seeds_v3['RPI_Strong_Dummy'] = comb_results_seeds_v3['RPI_Strong'].notnull().astype('int64')
comb_results_seeds_v3['RPI_Weak_Dummy'] = comb_results_seeds_v3['RPI_Weak'].notnull().astype('int64')

# Replace missing ranks with 0; the *_Dummy columns record which rows were filled.
comb_results_seeds_v3['RPI_Strong'] = comb_results_seeds_v3['RPI_Strong'].fillna(value=0)
comb_results_seeds_v3['RPI_Weak'] = comb_results_seeds_v3['RPI_Weak'].fillna(value=0)

## Build Model

In [9]:
# Target: single-column frame holding the strong-team win indicator.
y = comb_results_seeds_v3.loc[:, ['Strong_Win']]
# Features: seeds, RPI ranks and RPI-availability flags of both teams.
X = comb_results_seeds_v3.loc[:, [
    'Orig_Seed_Strong', 'Orig_Seed_Weak',
    'RPI_Strong', 'RPI_Weak',
    'RPI_Strong_Dummy', 'RPI_Weak_Dummy',
]]

In [10]:
# Mirror every game: the strong team's features are relabelled as the weak
# team's and vice versa, giving a second view of each row with the roles swapped.
Xt = X.rename(columns={
    'Orig_Seed_Strong': 'Orig_Seed_Weak',
    'Orig_Seed_Weak': 'Orig_Seed_Strong',
    'RPI_Strong': 'RPI_Weak',
    'RPI_Weak': 'RPI_Strong',
    'RPI_Strong_Dummy': 'RPI_Weak_Dummy',
    'RPI_Weak_Dummy': 'RPI_Strong_Dummy',
})[['Orig_Seed_Weak', 'Orig_Seed_Strong',
    'RPI_Strong', 'RPI_Strong_Dummy',
    'RPI_Weak', 'RPI_Weak_Dummy']]

Xt.head()

Unnamed: 0,Orig_Seed_Weak,Orig_Seed_Strong,RPI_Strong,RPI_Strong_Dummy,RPI_Weak,RPI_Weak_Dummy
0,1,16,0.0,0,0.0,0
1,1,8,0.0,0,0.0,0
2,1,4,0.0,0,0.0,0
3,1,2,0.0,0,0.0,0
4,1,1,0.0,0,0.0,0


In [11]:
# Flipped target (1 - y) for the mirrored rows, wrapped in a one-element list
# because the next step appends it to `y` as a list of frames.
yt = [y.rsub(1)]

In [12]:
# Stack the original and mirrored rows into a single training set.
# DataFrame.append is deprecated (and removed in pandas 2.0), so pd.concat is
# used instead. X and Xt list their columns in different orders; sort=True
# aligns them by name and yields alphabetically ordered columns, which is the
# order append's implicit sort produced and the order the prediction step
# selects. ignore_index=True replaces the former reset_index(drop=True).
X_all = pd.concat([X, Xt], ignore_index=True, sort=True)
# `yt` is a list of frames, so it is unpacked next to `y`.
y_all = pd.concat([y, *yt], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [13]:
# Fit a gradient-boosted classifier with default hyper-parameters.
# y_all is a single-column DataFrame; it is flattened to a 1-D array so the
# label is passed in the shape scikit-learn expects (a column vector triggers
# a DataConversionWarning from column_or_1d).
reg = XGBClassifier().fit(X_all, y_all.values.ravel())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [14]:
# Score the original (un-mirrored) rows, selecting the features in the column
# order written out below.
pred = X[['Orig_Seed_Strong', 'Orig_Seed_Weak',
          'RPI_Strong', 'RPI_Strong_Dummy',
          'RPI_Weak', 'RPI_Weak_Dummy']].copy()
# Second column of predict_proba: by scikit-learn convention the probability
# of the positive class, i.e. Strong_Win == 1.
pred['pred'] = list(reg.predict_proba(pred)[:, 1])
pred.head()

Unnamed: 0,Orig_Seed_Strong,Orig_Seed_Weak,RPI_Strong,RPI_Strong_Dummy,RPI_Weak,RPI_Weak_Dummy,pred
0,1,16,0.0,0,0.0,0,0.98744
1,1,8,0.0,0,0.0,0,0.81111
2,1,4,0.0,0,0.0,0,0.72545
3,1,2,0.0,0,0.0,0,0.5423
4,1,1,0.0,0,0.0,0,0.508319


### Save model to file

In [15]:
# Serialize the fitted classifier to disk with pickle.
with open('../data/xgb.p', mode='wb') as handle:
    handle.write(pickle.dumps(reg))