In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
import datetime
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics



In [6]:
%store -r awayMatchup
%store -r homeMatchup
%store -r gameTable

In [61]:
X_train_away, X_test_away, y_train_away, y_test_away = train_test_split(awayMatchup.iloc[:,:-1],awayMatchup.iloc[:,-1:], test_size=0.2,random_state=1020)

In [62]:
X_train_away.shape

(2494, 40)

In [63]:
# Now we want to predict points for (pf) for the homeMatchup/awayMatchup tables
# Setup table in way to do so:
ID_col = ['gameId']
target_col = ['pf']
cat_cols = ['roof','surface']
num_cols= ['ptsMatchup', 'offMatchup', 'ovrMatchup', 'passMatchup', 'pblkMatchup',
           'runMatchup', 'totalDvoaMatchup', 'offDvoaMatchup', 'avgRecv']
other_col= ['date','season','week','stadium','team_x','team_y','homeAway','avgPf',
            'avgOvr_x', 'avgOff', 'avgPass', 'avgPblk','avgRun', 'avgRblk',
            'totalDvoa_x', 'offDvoa','pa', 'totalScore','avgPa', 'avgOvr_y',
            'avgDef', 'avgRdef', 'avgPrsh', 'avgCov', 'totalDvoa_y', 'defDvoa','overUnder']

In [64]:
# Create label encoders for categorical variables and split the data set
# to train & test, further split the train data set to Train and Validate

#create label encoders for categorical features
for var in cat_cols:
    number = LabelEncoder()
    X_train_away[var] = number.fit_transform(X_train_away[var].astype('str'))

for var in cat_cols:
    number = LabelEncoder()
    X_test_away[var] = number.fit_transform(X_test_away[var].astype('str'))

In [65]:
# Pass the imputed and dummy (missing values flags) variables into the
# modeling process. Use random forest to predict the class
features=list(set(list(X_train_away.columns))-set(ID_col)-set(target_col)-set(other_col))

In [66]:
X_train_away = X_train_away[list(features)]
X_test_away= X_test_away[list(features)]

In [67]:
X_train_away.head()

Unnamed: 0,ovrMatchup,pblkMatchup,roof,totalDvoaMatchup,passMatchup,avgRecv,surface,offDvoaMatchup,ptsMatchup,stDvoa,runMatchup,offMatchup
2587,10.228571,20.5,1,0.312,20.571429,71.071429,1,0.175,59.857143,-0.003,-17.057143,8.614286
2807,2.985714,-3.114286,1,0.349,12.942857,72.342857,1,0.121,49.714286,0.035,-0.228571,8.357143
345,-3.342857,13.285714,0,-0.024,-5.571429,64.714286,1,-0.348,41.714286,-0.024,-2.014286,3.428571
1082,-4.3,-0.942857,1,-0.092,-3.471429,63.5,0,0.015,37.285714,0.062,-13.685714,-7.728571
317,-3.485714,0.557143,1,-0.411,5.314286,66.185714,1,-0.454,44.428571,-0.02,-11.714286,1.471429


In [68]:
rf = RandomForestRegressor(n_estimators=600, max_depth=6, min_impurity_decrease=0.6)
rf.fit(X_train_away, y_train_away)
y_pred_away = rf.predict(X_test_away)
print(mean_squared_error(y_test_away, y_pred_away))

  


93.1285356564


In [69]:
# Now repeat the process for homeMatchup table:
X_train_home, X_test_home, y_train_home, y_test_home = train_test_split(homeMatchup.iloc[:,:-1],homeMatchup.iloc[:,-1:], test_size=0.2,random_state=1020)

for var in cat_cols:
    number = LabelEncoder()
    X_train_home[var] = number.fit_transform(X_train_home[var].astype('str'))

for var in cat_cols:
    number = LabelEncoder()
    X_test_home[var] = number.fit_transform(X_test_home[var].astype('str'))
    
features=list(set(list(X_train_home.columns))-set(ID_col)-set(target_col)-set(other_col))

X_train_home = X_train_home[list(features)]
X_test_home= X_test_home[list(features)]

In [70]:
X_train_home.head()

Unnamed: 0,ovrMatchup,pblkMatchup,roof,totalDvoaMatchup,passMatchup,avgRecv,surface,offDvoaMatchup,ptsMatchup,stDvoa,runMatchup,offMatchup
2587,-10.228571,10.871429,1,-0.312,-9.685714,57.642857,1,-0.241,39.571429,0.027,-7.5,-4.585714
2807,-2.985714,7.485714,1,-0.349,-7.328571,73.114286,1,0.051,40.0,0.064,-0.342857,2.685714
345,3.342857,9.4,0,0.024,4.9,67.671429,1,0.011,46.428571,0.05,-6.757143,3.828571
1082,4.3,0.428571,1,0.092,0.9,69.814286,0,0.187,48.0,-0.046,-17.771429,-3.742857
317,3.485714,-11.9,1,0.411,11.085714,68.714286,1,0.014,45.857143,-0.017,1.3,5.014286


In [71]:
rf = RandomForestRegressor(n_estimators=600, max_depth=6, min_impurity_decrease=0.6)
rf.fit(X_train_home, y_train_home)
y_pred_home = rf.predict(X_test_home)
print(mean_squared_error(y_test_home, y_pred_home))

  


104.999120724


In [72]:
testDf = pd.DataFrame(X_test_away)
testDf['awayScore'] = y_test_away
testDf['awayPrediction'] = y_pred_away
testDf2 = pd.DataFrame(X_test_home)
testDf2['homeScore'] = y_test_home
testDf2['homePrediction'] = y_pred_home

In [73]:
colsAway = list(testDf.columns)
colsAway.pop(-8)
colsAway.pop(2)
colsAway.pop(-1)
colsAway.pop(-1)
# colsAway
colsHome = list(testDf2.columns)
colsHome.pop(-8)
colsHome.pop(2)
colsHome.pop(-1)
colsHome.pop(-1)
# colsHome

'homeScore'

In [74]:
awayPredDf = awayMatchup.merge(testDf, on=colsAway)
homePredDf = homeMatchup.merge(testDf2, on=colsHome)

In [75]:
awayHomeMerge = awayPredDf.merge(homePredDf, on=['gameId','date','season','week'])
#awayHomeMerge.columns

In [76]:
awayHomeMerge = awayHomeMerge[['gameId', 'date', 'season', 'week','awayPrediction', 'homePrediction']]
awayHomeMerge.head()

Unnamed: 0,gameId,date,season,week,awayPrediction,homePrediction
0,201709100det,2017-09-10,2017,1,21.951066,24.391565
1,201709170clt,2017-09-17,2017,2,24.200141,24.746882
2,201812160atl,2018-12-16,2018,15,16.696198,27.29746
3,201801060ram,2018-01-06,2017,18,21.205552,23.925811
4,201609180rai,2016-09-18,2016,2,18.195464,26.582297


In [77]:
finalDf = gameTable.merge(awayHomeMerge, on=['gameId', 'date'])
finalDf['totalPrediction'] = finalDf['awayPrediction'] + finalDf['homePrediction']
#finalDf.columns

In [78]:
finalDf = finalDf[['gameId', 'date', 'season', 'week', 'awayTeam', 'awayPrediction', 'awayScore',
                   'homeTeam','homePrediction', 'homeScore', 'overUnder', 'result', 'totalPrediction', 'totalScore']]

In [79]:
undersDf = finalDf[finalDf['totalPrediction'] <= finalDf['overUnder'] - 5]
undersDf['result'].value_counts()

under    20
over     16
push      1
Name: result, dtype: int64

In [80]:
undersDf = finalDf[finalDf['totalPrediction'] <= finalDf['overUnder'] - 3]
undersDf['result'].value_counts()

under    51
over     39
push      1
Name: result, dtype: int64

In [81]:
undersDf = finalDf[finalDf['totalPrediction'] <= finalDf['overUnder'] - 1]
undersDf['result'].value_counts()

under    113
over      99
push       2
Name: result, dtype: int64

In [82]:
oversDf = finalDf[finalDf['totalPrediction'] >= finalDf['overUnder'] + 5]
oversDf['result'].value_counts()

over     25
under    18
push      2
Name: result, dtype: int64

In [83]:
oversDf = finalDf[finalDf['totalPrediction'] >= finalDf['overUnder'] + 3]
oversDf['result'].value_counts()

over     68
under    60
push      2
Name: result, dtype: int64

In [84]:
oversDf = finalDf[finalDf['totalPrediction'] >= finalDf['overUnder'] + 1]
oversDf['result'].value_counts()

over     144
under    124
push       4
Name: result, dtype: int64