# 02 Create Predictions GLM
Create predictions for each potential game

In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

## Read in data

In [8]:
comb_results = pd.read_csv('../data/comb_results.csv')
comb_results.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner
0,1985,R1W1,W01,W16,1,1207,Georgetown,1250,Lehigh,68,0,43,0,1207
1,1985,R1W2,W02,W15,1,1210,Georgia Tech,1273,Mercer,65,0,58,0,1210
2,1985,R1W3,W03,W14,1,1228,Illinois,1318,Northeastern,76,0,57,0,1228
3,1985,R1W4,W04,W13,1,1260,Loyola-Chicago,1233,Iona,59,0,58,0,1260
4,1985,R1W5,W05,W12,1,1374,SMU,1330,Old Dominion,85,0,68,0,1374


In [9]:
seeds = pd.read_csv('../data/Stage2DataFiles//NCAATourneySeeds.csv')
seeds['Seed_v2'] = [int(x[1:3]) for x in seeds['Seed']]
seeds.drop('Seed', axis=1, inplace=True)
seeds.head()

Unnamed: 0,Season,TeamID,Seed_v2
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


## Create features

In [10]:
# join to get original seed for strong and weak teams
comb_results_seeds = pd.merge(comb_results, seeds, 
                              left_on=['Season', 'TeamID_Strong'], 
                              right_on=['Season', 'TeamID'], 
                              validate='m:1')
comb_results_seeds.rename(columns={'Seed_v2':'Orig_Seed_Strong'}, inplace=True)
comb_results_seeds.drop('TeamID', axis=1, inplace=True)

comb_results_seeds = pd.merge(comb_results_seeds, seeds, 
                              left_on=['Season', 'TeamID_Weak'], 
                              right_on=['Season', 'TeamID'], 
                              validate='m:1')
comb_results_seeds.rename(columns={'Seed_v2':'Orig_Seed_Weak'}, inplace=True)
comb_results_seeds.drop('TeamID', axis=1, inplace=True)

comb_results_seeds.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner,Orig_Seed_Strong,Orig_Seed_Weak
0,1985,R1W1,W01,W16,1,1207,Georgetown,1250,Lehigh,68,0,43,0,1207,1,16
1,1985,R2W1,R1W1,R1W8,2,1207,Georgetown,1396,Temple,63,0,46,0,1207,1,8
2,1985,R3W1,R2W1,R2W4,3,1207,Georgetown,1260,Loyola-Chicago,65,0,53,0,1207,1,4
3,1985,R4W1,R3W1,R3W2,4,1207,Georgetown,1210,Georgia Tech,60,0,54,0,1207,1,2
4,1985,R5WX,R4W1,R4X1,5,1207,Georgetown,1385,St John's,77,0,59,0,1207,1,1


In [11]:
# add win indicator for strong team
comb_results_seeds_v2 = comb_results_seeds.copy()
comb_results_seeds_v2['Strong_Win'] = [1] * (comb_results_seeds_v2['TeamID_Winner']
                                             ==comb_results_seeds_v2['TeamID_Strong'])
comb_results_seeds_v2.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,Round,TeamID_Strong,TeamName_Strong,TeamID_Weak,TeamName_Weak,Score_Strong,NumOT_Strong,Score_Weak,NumOT_Weak,TeamID_Winner,Orig_Seed_Strong,Orig_Seed_Weak,Strong_Win
0,1985,R1W1,W01,W16,1,1207,Georgetown,1250,Lehigh,68,0,43,0,1207,1,16,1
1,1985,R2W1,R1W1,R1W8,2,1207,Georgetown,1396,Temple,63,0,46,0,1207,1,8,1
2,1985,R3W1,R2W1,R2W4,3,1207,Georgetown,1260,Loyola-Chicago,65,0,53,0,1207,1,4,1
3,1985,R4W1,R3W1,R3W2,4,1207,Georgetown,1210,Georgia Tech,60,0,54,0,1207,1,2,1
4,1985,R5WX,R4W1,R4X1,5,1207,Georgetown,1385,St John's,77,0,59,0,1207,1,1,1


## Build Model

In [45]:
y = comb_results_seeds_v2[['Strong_Win']]
X = comb_results_seeds_v2[['Orig_Seed_Strong', 'Orig_Seed_Weak']]

In [46]:
reg = LogisticRegression().fit(X, y)

  y = column_or_1d(y, warn=True)


In [47]:
reg.intercept_

array([0.36192009])

In [48]:
reg.coef_

array([[-0.22232054,  0.14819187]])

In [49]:
X.head()

Unnamed: 0,Orig_Seed_Strong,Orig_Seed_Weak
0,1,16
1,1,8
2,1,4
3,1,2
4,1,1


In [50]:
@np.vectorize
def apply_glm(oss, osw):
    b0 = 0.36192009
    b1 = -0.22232054
    b2 = 0.14819187
    return (1/(1+np.e**(-1*(b0+b1*oss+b2*osw))))

In [51]:
pred = X.copy()
pred['pred'] = apply_glm(pred['Orig_Seed_Strong'], pred['Orig_Seed_Weak'])
pred.head()

Unnamed: 0,Orig_Seed_Strong,Orig_Seed_Weak,pred
0,1,16,0.924886
1,1,8,0.790035
2,1,4,0.675324
3,1,2,0.607302
4,1,1,0.571455
5,1,8,0.790035
6,2,8,0.750787
7,1,8,0.790035
8,2,15,0.894745
9,2,7,0.722044


In [52]:
Xt = pd.DataFrame()
Xt['Orig_Seed_Weak'] = X['Orig_Seed_Strong']
Xt['Orig_Seed_Strong'] = X['Orig_Seed_Weak']
Xt.head()

Unnamed: 0,Orig_Seed_Weak,Orig_Seed_Strong
0,1,16
1,1,8
2,1,4
3,1,2
4,1,1


In [53]:
yt = [1-y]

In [54]:
X_all = X.append(Xt).reset_index(drop=True)
y_all = y.append(yt).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [55]:
reg = LogisticRegression().fit(X_all, y_all)

  y = column_or_1d(y, warn=True)


In [57]:
reg.intercept_

array([-7.73792442e-15])

In [58]:
reg.coef_

array([[-0.16661885,  0.16661885]])

In [59]:
@np.vectorize
def apply_glm(oss, osw):
    b0 = -7.73792442e-15
    b1 = -0.16661885
    b2 = 0.16661885
    return (1/(1+np.e**(-1*(b0+b1*oss+b2*osw))))

In [60]:
pred = X.copy()
pred['pred'] = apply_glm(pred['Orig_Seed_Strong'], pred['Orig_Seed_Weak'])
pred.head()

Unnamed: 0,Orig_Seed_Strong,Orig_Seed_Weak,pred
0,1,16,0.924092
1,1,8,0.762481
2,1,4,0.622426
3,1,2,0.541559
4,1,1,0.5
