In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as sm
from sklearn.model_selection import KFold
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
import xgboost as xgb

In [57]:
# Restore the three input tables from IPython's %store persistence
# (`%store -r NAME` reloads a variable previously saved with `%store NAME`).
# NOTE(review): the notebook/session that saved these is not visible here;
# this cell fails on a fresh machine unless that step has been run first.
%store -r awayMatchup
%store -r homeMatchup
%store -r gameTable

In [58]:
# Column order for both matchup tables with the target 'pf' moved to the end,
# so that a positional `iloc[:, -1:]` picks up the target.
#
# 'pf' is located by name rather than by popping position 21 (presumably where
# 'pf' sat in the stored table -- TODO confirm). The by-name form does not
# depend on the table layout and gives the same list when the cell is re-run
# after awayMatchup has already been reordered.
cols = [col for col in awayMatchup.columns if col != 'pf'] + ['pf']

In [59]:
# Put both tables into the `cols` order and keep only games from week 3
# onwards (week >= 3); dvoa is typically inaccurate before week 3.
awayMatchup = awayMatchup[cols].loc[lambda frame: frame['week'] >= 3]
homeMatchup = homeMatchup[cols].loc[lambda frame: frame['week'] >= 3]

In [401]:
# 80/20 train/test split of the away table with a fixed seed.
# The last column is the target; every column before it goes to the X frames.
away_inputs = awayMatchup.iloc[:, :-1]
away_target = awayMatchup.iloc[:, -1:]
X_train_away, X_test_away, y_train_away, y_test_away = train_test_split(
    away_inputs,
    away_target,
    test_size=0.2,
    random_state=166,
)

In [402]:
# Goal: predict points for ('pf') from the homeMatchup / awayMatchup tables.
# The column names are grouped by the role they play in the modelling cells.

# Row identifier.
ID_col = ['gameId']

# Regression target.
target_col = ['pf']

# Categorical columns that get label-encoded.
cat_cols = ['roof', 'surface']

# Numeric matchup columns.
num_cols = [
    'ptsMatchup', 'offMatchup', 'ovrMatchup', 'passMatchup', 'pblkMatchup',
    'runMatchup', 'totalDvoaMatchup', 'offDvoaMatchup', 'avgRecv', 'overUnder',
]

# Remaining columns; these are removed from the model's feature set.
other_col = [
    'date', 'season', 'week', 'stadium', 'team_x', 'team_y', 'homeAway', 'avgPf',
    'avgOvr_x', 'avgOff', 'avgPass', 'avgPblk', 'avgRun', 'avgRblk',
    'totalDvoa_x', 'offDvoa', 'pa', 'totalScore', 'avgPa', 'avgOvr_y',
    'avgDef', 'avgRdef', 'avgPrsh', 'avgCov', 'totalDvoa_y', 'defDvoa',
]

In [403]:
# Label-encode the categorical columns of the away train/test frames, then
# reduce both frames to the columns that are fed to the model.

# Work on explicit copies so the column assignments below do not write into a
# slice of awayMatchup (avoids pandas' chained-assignment warning).
X_train_away = X_train_away.copy()
X_test_away = X_test_away.copy()

# One encoder per categorical column, fitted on the train and test values
# together. LabelEncoder numbers the sorted distinct values it was fitted on,
# so fitting train and test separately can give the same category different
# codes whenever the two splits do not contain the same set of values.
for var in cat_cols:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([X_train_away[var], X_test_away[var]]).astype('str'))
    X_train_away[var] = encoder.transform(X_train_away[var].astype('str'))
    X_test_away[var] = encoder.transform(X_test_away[var].astype('str'))

# Model inputs: every column that is not the ID, the target or one of the
# columns listed in other_col.
# NOTE: a set difference does not guarantee any particular column order.
features = list(set(X_train_away.columns) - set(ID_col) - set(target_col) - set(other_col))

X_train_away = X_train_away[features]
X_test_away = X_test_away[features]


In [404]:
# Preview the first rows of the away training features (after encoding and
# feature selection) as a sanity check.
X_train_away.head()

Unnamed: 0,ptsMatchup,stDvoa,pblkMatchup,passMatchup,roof,surface,overUnder,offMatchup,totalDvoaMatchup,ovrMatchup,runMatchup,avgRecv,offDvoaMatchup
2325,44.571429,0.05,12.157143,6.971429,1,0,37.0,2.571429,0.023,1.542857,-15.014286,64.557143,0.031
1626,51.428571,0.012,12.571429,-0.328571,1,0,46.0,13.371429,0.229,3.528571,7.457143,63.542857,0.253
916,33.428571,0.001,12.9,-3.414286,1,0,37.5,-2.685714,-0.439,-5.614286,-11.442857,62.785714,-0.218
1074,42.714286,0.076,11.457143,-13.728571,0,0,48.0,-4.257143,-0.418,-8.557143,-1.814286,64.028571,-0.261
1751,50.857143,0.004,15.885714,0.114286,1,0,46.0,10.657143,0.007,-1.471429,3.928571,64.9,-0.044


In [405]:
# Fit a gradient-boosted regressor with default hyper-parameters on the away
# training split; the trailing expression shows the fitted estimator's settings.
model = xgb.XGBRegressor().fit(X_train_away, y_train_away)
model



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [406]:
# Predict the held-out away rows and report the mean squared error.
y_pred_away = model.predict(X_test_away)
mse_away = mean_squared_error(y_test_away, y_pred_away)
print(mse_away)

97.1417619293


In [407]:
# Repeat the split / encode / select steps for the homeMatchup table.

# 80/20 split with the same seed as the away table; the last column is the
# target, every column before it goes to the X frames.
X_train_home, X_test_home, y_train_home, y_test_home = train_test_split(
    homeMatchup.iloc[:, :-1],
    homeMatchup.iloc[:, -1:],
    test_size=0.2,
    random_state=166,
)

# Work on explicit copies so the column assignments below do not write into a
# slice of homeMatchup (avoids pandas' chained-assignment warning).
X_train_home = X_train_home.copy()
X_test_home = X_test_home.copy()

# One encoder per categorical column, fitted on the train and test values
# together. LabelEncoder numbers the sorted distinct values it was fitted on,
# so fitting train and test separately can give the same category different
# codes whenever the two splits do not contain the same set of values.
for var in cat_cols:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([X_train_home[var], X_test_home[var]]).astype('str'))
    X_train_home[var] = encoder.transform(X_train_home[var].astype('str'))
    X_test_home[var] = encoder.transform(X_test_home[var].astype('str'))

# Model inputs: every column that is not the ID, the target or one of the
# columns listed in other_col.
# NOTE: a set difference does not guarantee any particular column order.
features = list(set(X_train_home.columns) - set(ID_col) - set(target_col) - set(other_col))

X_train_home = X_train_home[features]
X_test_home = X_test_home[features]

In [408]:
# Preview the first rows of the home training features (after encoding and
# feature selection) as a sanity check.
X_train_home.head()

Unnamed: 0,ptsMatchup,stDvoa,pblkMatchup,passMatchup,roof,surface,overUnder,offMatchup,totalDvoaMatchup,ovrMatchup,runMatchup,avgRecv,offDvoaMatchup
2325,36.857143,0.044,6.328571,4.228571,1,0,37.0,0.114286,-0.023,-1.542857,-19.442857,66.928571,0.014
1626,43.0,-0.001,2.757143,2.785714,1,0,46.0,8.128571,-0.229,-3.528571,6.114286,60.114286,0.037
916,46.0,-0.026,13.771429,5.757143,1,0,37.5,5.314286,0.439,5.614286,-6.385714,64.971429,0.248
1074,63.285714,-0.096,3.228571,7.171429,0,0,48.0,9.528571,0.418,8.557143,5.028571,72.357143,0.329
1751,53.428571,-0.03,16.585714,6.971429,1,0,46.0,13.842857,-0.007,1.471429,1.014286,71.914286,-0.018


In [409]:
# Fit a gradient-boosted regressor with default hyper-parameters on the home
# training split. This rebinds `model`, replacing whatever it held before.
model = xgb.XGBRegressor().fit(X_train_home, y_train_home)
model



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [410]:
# Predict the held-out home rows and report the mean squared error.
y_pred_home = model.predict(X_test_home)
mse_home = mean_squared_error(y_test_home, y_pred_home)
print(mse_home)

94.7145631656


In [411]:
# Build one frame per side holding the test features together with the actual
# score and the model's prediction.
#
# Explicit copies are used because pd.DataFrame(frame) wraps an existing frame
# without copying its data, so columns added through the wrapper can end up on
# X_test_away / X_test_home as well (NOTE(review): the exact behaviour depends
# on the pandas version). Leaving the X_test_* frames untouched keeps the
# predict cells re-runnable.
testDf = X_test_away.copy()
testDf['awayScore'] = y_test_away
testDf['awayPrediction'] = y_pred_away

testDf2 = X_test_home.copy()
testDf2['homeScore'] = y_test_home
testDf2['homePrediction'] = y_pred_home

In [412]:
# Columns shared by the test frames and the matchup tables: every column of
# testDf / testDf2 except
#   * the categorical columns, which are label-encoded in the test frames, and
#   * the score / prediction columns.
# The exclusions are made by name rather than by list position, so the result
# does not depend on the order of the feature columns.
excluded_away = set(cat_cols) | {'awayScore', 'awayPrediction'}
colsAway = [col for col in testDf.columns if col not in excluded_away]

excluded_home = set(cat_cols) | {'homeScore', 'homePrediction'}
colsHome = [col for col in testDf2.columns if col not in excluded_home]

'homeScore'

In [413]:
# Attach the full matchup rows to the test rows by joining on the columns in
# colsAway / colsHome.
awayPredDf = pd.merge(awayMatchup, testDf, on=colsAway)
homePredDf = pd.merge(homeMatchup, testDf2, on=colsHome)

# Pair the away and home rows on the game keys and keep only those keys plus
# the two predictions.
game_keys = ['gameId', 'date', 'season', 'week']
awayHomeMerge = pd.merge(awayPredDf, homePredDf, on=game_keys)
awayHomeMerge = awayHomeMerge[game_keys + ['awayPrediction', 'homePrediction']]

awayHomeMerge.tail()

Unnamed: 0,gameId,date,season,week,awayPrediction,homePrediction
543,201309290rai,2013-09-29,2013,4,20.82975,22.904663
544,200912130rai,2009-12-13,2009,14,27.342749,18.43745
545,200811230sea,2008-11-23,2008,12,24.881224,13.832779
546,201310270den,2013-10-27,2013,8,24.137657,35.120605
547,201111130mia,2011-11-13,2011,10,20.546524,20.716974


In [414]:
# Join the predictions onto the game table and add the predicted total
# (away prediction + home prediction).
finalDf = pd.merge(gameTable, awayHomeMerge, on=['gameId', 'date'])
finalDf['totalPrediction'] = finalDf['awayPrediction'].add(finalDf['homePrediction'])

# Keep only the columns used in the comparisons that follow, in this order.
report_columns = [
    'gameId', 'date', 'season', 'week',
    'awayTeam', 'awayPrediction', 'awayScore',
    'homeTeam', 'homePrediction', 'homeScore',
    'overUnder', 'result', 'totalPrediction', 'totalScore',
]
finalDf = finalDf[report_columns]

In [415]:
# Rows whose predicted total is at least 5 points below `overUnder`,
# counted by the value of the `result` column.
under_by_5 = finalDf['totalPrediction'].le(finalDf['overUnder'] - 5)
undersDf = finalDf.loc[under_by_5]
undersDf['result'].value_counts()

under    9
over     7
Name: result, dtype: int64

In [416]:
# Rows whose predicted total is at least 3 points below `overUnder`,
# counted by the value of the `result` column.
under_by_3 = finalDf['totalPrediction'].le(finalDf['overUnder'] - 3)
undersDf = finalDf.loc[under_by_3]
undersDf['result'].value_counts()

under    27
over     25
Name: result, dtype: int64

In [417]:
# Rows whose predicted total is at least 1 point below `overUnder`,
# counted by the value of the `result` column.
under_by_1 = finalDf['totalPrediction'].le(finalDf['overUnder'] - 1)
undersDf = finalDf.loc[under_by_1]
undersDf['result'].value_counts()

under    90
over     76
push      2
Name: result, dtype: int64

In [418]:
# Rows whose predicted total is between 3 and 5 points (both inclusive) below
# `overUnder`, counted by the value of the `result` column.
at_most_minus_3 = finalDf['totalPrediction'].le(finalDf['overUnder'] - 3)
at_least_minus_5 = finalDf['totalPrediction'].ge(finalDf['overUnder'] - 5)
undersDf = finalDf.loc[at_most_minus_3 & at_least_minus_5]
undersDf['result'].value_counts()

over     18
under    18
Name: result, dtype: int64

In [419]:
# Rows whose predicted total is at least 5 points above `overUnder`,
# counted by the value of the `result` column.
over_by_5 = finalDf['totalPrediction'].ge(finalDf['overUnder'] + 5)
oversDf = finalDf.loc[over_by_5]
oversDf['result'].value_counts()

over     18
under    17
push      1
Name: result, dtype: int64

In [420]:
# Rows whose predicted total is at least 3 points above `overUnder`,
# counted by the value of the `result` column.
over_by_3 = finalDf['totalPrediction'].ge(finalDf['overUnder'] + 3)
oversDf = finalDf.loc[over_by_3]
oversDf['result'].value_counts()

over     53
under    46
push      1
Name: result, dtype: int64

In [421]:
# Rows whose predicted total is at least 1 point above `overUnder`,
# counted by the value of the `result` column.
over_by_1 = finalDf['totalPrediction'].ge(finalDf['overUnder'] + 1)
oversDf = finalDf.loc[over_by_1]
oversDf['result'].value_counts()

over     117
under    107
push       4
Name: result, dtype: int64