In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as sm
from sklearn.model_selection import KFold
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
import xgboost as xgb

In [56]:
# Restore variables previously persisted with IPython's %store magic.
# These three names are not created anywhere in this notebook; they must have
# been stored by an earlier session/notebook before this cell can succeed.
# NOTE(review): presumably all three are pandas DataFrames — they are indexed
# and merged as such below.
%store -r awayMatchup
%store -r homeMatchup
%store -r gameTable

In [57]:
# Column order shared by both matchup tables: remove the entry at position 21
# and place 'pf' last, so the target is always the trailing column.
# NOTE(review): presumably position 21 is 'pf' itself (i.e. this only moves
# it to the end) — confirm against awayMatchup.columns.
cols = awayMatchup.columns.tolist()
del cols[21]
cols.append('pf')

In [58]:
# Keep only the columns listed in `cols` (target 'pf' last) and only games
# from week 3 onwards (week >= 3): dvoa is typically inaccurate before week 3.
awayMatchup = awayMatchup.loc[awayMatchup['week'] >= 3, cols]
homeMatchup = homeMatchup.loc[homeMatchup['week'] >= 3, cols]

In [819]:
# 80/20 train/test split of the away table with a fixed seed.
(X_train_away, X_test_away,
 y_train_away, y_test_away) = train_test_split(
    awayMatchup.iloc[:, :-1],   # every column except the trailing one
    awayMatchup.iloc[:, -1:],   # trailing column ('pf'), kept as a one-column frame
    test_size=0.2,
    random_state=2790,
)

In [820]:
# Goal: predict points for ('pf') from the homeMatchup / awayMatchup tables.
# The lists below assign a role to each column name.

# Identifier column; excluded from the model features.
ID_col = ['gameId']

# Regression target.
target_col = ['pf']

# Categorical columns; label-encoded before modelling.
cat_cols = ['roof', 'surface']

# Numeric columns (this list is not referenced by the other cells shown here).
num_cols = [
    'ptsMatchup', 'offMatchup', 'ovrMatchup', 'passMatchup',
    'pblkMatchup', 'runMatchup', 'totalDvoaMatchup', 'offDvoaMatchup',
    'avgRecv', 'overUnder',
]

# Remaining columns that are excluded from the model features.
other_col = [
    'date', 'season', 'week', 'stadium',
    'team_x', 'team_y', 'homeAway', 'avgPf',
    'avgOvr_x', 'avgOff', 'avgPass', 'avgPblk',
    'avgRun', 'avgRblk', 'totalDvoa_x', 'offDvoa',
    'pa', 'totalScore', 'avgPa', 'avgOvr_y',
    'avgDef', 'avgRdef', 'avgPrsh', 'avgCov',
    'totalDvoa_y', 'defDvoa',
]

In [821]:
# Label-encode the categorical columns of the away train/test splits, then
# reduce both splits to the model feature columns.

# Work on explicit copies so the column assignments below modify these frames
# only (and do not trigger pandas' SettingWithCopyWarning).
X_train_away = X_train_away.copy()
X_test_away = X_test_away.copy()

# One encoder per categorical column, fitted on the union of the train and
# test values. Fitting a separate encoder on each split (as was done before)
# can give the same category different integer codes in train and test
# whenever the two splits do not contain exactly the same set of categories.
for var in cat_cols:
    number = LabelEncoder()
    number.fit(pd.concat([X_train_away[var], X_test_away[var]]).astype('str'))
    X_train_away[var] = number.transform(X_train_away[var].astype('str'))
    X_test_away[var] = number.transform(X_test_away[var].astype('str'))

# Model features = every column that is not an ID, the target, or listed in
# other_col. The model fitted on these is an XGBoost regressor.
# NOTE: a set difference gives no guaranteed column order, so nothing that
# consumes these frames should rely on column positions.
features=list(set(list(X_train_away.columns))-set(ID_col)-set(target_col)-set(other_col))

X_train_away = X_train_away[features]
X_test_away = X_test_away[features]


In [822]:
# Preview the first five rows of the away training features.
X_train_away.head(n=5)

Unnamed: 0,offMatchup,overUnder,runMatchup,pblkMatchup,offDvoaMatchup,roof,avgRecv,surface,passMatchup,ptsMatchup,ovrMatchup,totalDvoaMatchup,stDvoa
897,3.085714,42.0,-7.828571,5.271429,-0.207,0,64.171429,1,9.142857,42.142857,3.328571,0.095,-0.039
2340,8.014286,45.0,2.071429,16.442857,-0.056,1,62.314286,1,-10.842857,38.428571,-2.6,-0.385,0.012
2921,7.557143,42.5,5.385714,5.028571,-0.221,1,66.314286,0,3.785714,45.0,0.442857,-0.218,-0.005
2217,11.057143,52.0,-5.414286,15.285714,0.002,0,64.371429,1,14.014286,49.857143,1.771429,-0.096,-0.036
1731,2.6,45.5,-25.385714,6.728571,-0.055,1,72.085714,0,14.628571,44.571429,-3.442857,-0.214,0.071


In [823]:
# Fit an XGBoost regressor (library defaults) on the away training split.
# `model` is rebound by the home-table fit further down, so the away
# predictions must be taken before that cell runs.
model = xgb.XGBRegressor()
model.fit(X=X_train_away, y=y_train_away)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [824]:
# Predict on the away test split and report the mean squared error.
y_pred_away = model.predict(X_test_away)
print(mean_squared_error(y_true=y_test_away, y_pred=y_pred_away))

100.376642892


In [825]:
# Repeat the split / encode / feature-selection steps for the homeMatchup table.
# Same test_size and random_state as the away split.
X_train_home, X_test_home, y_train_home, y_test_home = train_test_split(homeMatchup.iloc[:,:-1],homeMatchup.iloc[:,-1:], test_size=0.2,random_state=2790)

# Work on explicit copies so the column assignments below modify these frames
# only (and do not trigger pandas' SettingWithCopyWarning).
X_train_home = X_train_home.copy()
X_test_home = X_test_home.copy()

# One encoder per categorical column, fitted on the union of the train and
# test values. Fitting a separate encoder on each split (as was done before)
# can give the same category different integer codes in train and test
# whenever the two splits do not contain exactly the same set of categories.
for var in cat_cols:
    number = LabelEncoder()
    number.fit(pd.concat([X_train_home[var], X_test_home[var]]).astype('str'))
    X_train_home[var] = number.transform(X_train_home[var].astype('str'))
    X_test_home[var] = number.transform(X_test_home[var].astype('str'))

# Model features = every column that is not an ID, the target, or listed in
# other_col.
# NOTE: a set difference gives no guaranteed column order, so nothing that
# consumes these frames should rely on column positions.
features=list(set(list(X_train_home.columns))-set(ID_col)-set(target_col)-set(other_col))

X_train_home = X_train_home[features]
X_test_home = X_test_home[features]

In [826]:
# Preview the first five rows of the home training features.
X_train_home.head(n=5)

Unnamed: 0,offMatchup,overUnder,runMatchup,pblkMatchup,offDvoaMatchup,roof,avgRecv,surface,passMatchup,ptsMatchup,ovrMatchup,totalDvoaMatchup,stDvoa
897,-1.242857,42.0,-11.757143,4.614286,-0.216,0,60.8,1,2.885714,39.285714,-3.328571,-0.095,-0.065
2340,10.114286,45.0,-10.928571,11.914286,0.414,1,70.414286,1,6.957143,52.428571,2.6,0.385,0.04
2921,8.442857,42.5,4.514286,3.514286,-0.055,1,65.9,0,3.514286,46.428571,-0.442857,0.218,0.007
2217,9.657143,52.0,-2.6,11.585714,-0.205,0,67.185714,1,3.242857,45.285714,-1.771429,0.096,0.013
1731,9.271429,45.5,4.142857,15.528571,-0.018,1,72.171429,0,3.542857,47.428571,3.442857,0.214,0.065


In [827]:
# Fit a fresh XGBoost regressor (library defaults) on the home training split.
# This rebinds `model`; the regressor fitted on the away split is no longer
# reachable through that name after this cell.
model = xgb.XGBRegressor()
model.fit(X=X_train_home, y=y_train_home)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [828]:
# Predict on the home test split and report the mean squared error.
y_pred_home = model.predict(X_test_home)
print(mean_squared_error(y_true=y_test_home, y_pred=y_pred_home))

100.275381529


In [829]:
# Per-split result tables: test features + actual score + predicted score.
#
# .copy() is used instead of pd.DataFrame(X_test_*): constructing a DataFrame
# from an existing DataFrame does not copy by default, so adding columns to
# the result could also add them to X_test_* and break a re-run of the
# predict cells (feature mismatch).
testDf = X_test_away.copy()
# y_test_* are one-column DataFrames (they were sliced with iloc[:, -1:]);
# take that column as a Series so the assignment is an index-aligned column.
testDf['awayScore'] = y_test_away.iloc[:, 0]
testDf['awayPrediction'] = y_pred_away

testDf2 = X_test_home.copy()
testDf2['homeScore'] = y_test_home.iloc[:, 0]
testDf2['homePrediction'] = y_pred_home

In [830]:
# Columns used as merge keys when re-attaching the predictions to the
# matchup tables: every column of the result frame EXCEPT
#   * the categorical columns (cat_cols) — they were label-encoded in the
#     test frames, so their values no longer equal the raw matchup values;
#   * the actual-score and prediction columns added to the result frame.
#
# The columns are excluded by name. The previous positional pops (7 and 5,
# then the last two) only removed the right columns for one particular
# feature order, and that order comes from a set difference, which is not
# guaranteed to be stable.
colsAway = [c for c in testDf.columns
            if c not in cat_cols and c not in ('awayScore', 'awayPrediction')]
colsHome = [c for c in testDf2.columns
            if c not in cat_cols and c not in ('homeScore', 'homePrediction')]

'homeScore'

In [831]:
# Re-attach the test-set predictions to the full matchup rows. The join keys
# are the shared feature columns (colsAway / colsHome), i.e. rows are matched
# on equal feature values.
awayPredDf = pd.merge(awayMatchup, testDf, on=colsAway)
homePredDf = pd.merge(homeMatchup, testDf2, on=colsHome)

# Pair the away and home rows of the same game and keep only the game
# identifiers plus the two predictions.
awayHomeMerge = pd.merge(
    awayPredDf,
    homePredDf,
    on=['gameId', 'date', 'season', 'week'],
)[['gameId', 'date', 'season', 'week', 'awayPrediction', 'homePrediction']]

awayHomeMerge.tail()

Unnamed: 0,gameId,date,season,week,awayPrediction,homePrediction
543,201112180nyg,2011-12-18,2011,15,21.424566,26.270615
544,201509240nyg,2015-09-24,2015,3,21.880686,24.056004
545,200801050sea,2008-01-05,2007,18,17.138498,22.728909
546,201512130chi,2015-12-13,2015,14,20.217297,19.908104
547,200711180dal,2007-11-18,2007,11,19.530952,29.287565


In [832]:
# Join the predictions onto the game table, add the predicted total
# (away + home), and keep the columns used by the evaluation cells below.
finalDf = (
    gameTable
    .merge(awayHomeMerge, on=['gameId', 'date'])
    .assign(totalPrediction=lambda games: games['awayPrediction'] + games['homePrediction'])
)[[
    'gameId', 'date', 'season', 'week',
    'awayTeam', 'awayPrediction', 'awayScore',
    'homeTeam', 'homePrediction', 'homeScore',
    'overUnder', 'result', 'totalPrediction', 'totalScore',
]]

In [833]:
# Games whose predicted total lies between (overUnder - 5) and (overUnder - 3),
# both bounds inclusive; tally the 'result' values for them.
undersDf = finalDf.loc[
    finalDf['totalPrediction'].between(finalDf['overUnder'] - 5, finalDf['overUnder'] - 3)
]
undersDf['result'].value_counts()

over     32
under    30
push      2
Name: result, dtype: int64

In [834]:
# Games whose predicted total is at most (overUnder - 5); tally 'result'.
undersDf = finalDf.loc[finalDf['totalPrediction'].le(finalDf['overUnder'] - 5)]
undersDf['result'].value_counts()

over     14
under     3
Name: result, dtype: int64

In [786]:
# Games whose predicted total is at most (overUnder - 3); tally 'result'.
undersDf = finalDf.loc[finalDf['totalPrediction'].le(finalDf['overUnder'] - 3)]
undersDf['result'].value_counts()

under    33
over     25
Name: result, dtype: int64

In [558]:
# Games whose predicted total is at most (overUnder - 1); tally 'result'.
undersDf = finalDf.loc[finalDf['totalPrediction'].le(finalDf['overUnder'] - 1)]
undersDf['result'].value_counts()

under    94
over     82
push      1
Name: result, dtype: int64

In [559]:
# Games whose predicted total is at least (overUnder + 5); tally 'result'.
oversDf = finalDf.loc[finalDf['totalPrediction'].ge(finalDf['overUnder'] + 5)]
oversDf['result'].value_counts()

over     35
under    27
push      2
Name: result, dtype: int64

In [560]:
# Games whose predicted total is at least (overUnder + 3); tally 'result'.
oversDf = finalDf.loc[finalDf['totalPrediction'].ge(finalDf['overUnder'] + 3)]
oversDf['result'].value_counts()

over     58
under    50
push      3
Name: result, dtype: int64

In [561]:
# Games whose predicted total is at least (overUnder + 1); tally 'result'.
oversDf = finalDf.loc[finalDf['totalPrediction'].ge(finalDf['overUnder'] + 1)]
oversDf['result'].value_counts()

under    121
over     117
push       4
Name: result, dtype: int64