In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
import datetime
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics



In [2]:
%store -r awayMatchup
%store -r homeMatchup
%store -r gameTable

In [110]:
X_train_away, X_test_away, y_train_away, y_test_away = train_test_split(awayMatchup.iloc[:,:-1],awayMatchup.iloc[:,-1:], test_size=0.2,random_state=2107)

In [111]:
X_train_away.shape

(2494, 40)

In [112]:
# Now we want to predict points for (pf) for the homeMatchup/awayMatchup tables
# Setup table in way to do so:
ID_col = ['gameId']
target_col = ['pf']
cat_cols = ['roof','surface']
num_cols= ['ptsMatchup', 'offMatchup', 'ovrMatchup', 'passMatchup', 'pblkMatchup',
           'runMatchup', 'totalDvoaMatchup', 'offDvoaMatchup', 'avgRecv']
other_col= ['date','season','week','stadium','team_x','team_y','homeAway','avgPf',
            'avgOvr_x', 'avgOff', 'avgPass', 'avgPblk','avgRun', 'avgRblk',
            'totalDvoa_x', 'offDvoa','pa', 'totalScore','avgPa', 'avgOvr_y',
            'avgDef', 'avgRdef', 'avgPrsh', 'avgCov', 'totalDvoa_y', 'defDvoa','overUnder']

In [113]:
# Create label encoders for categorical variables and split the data set
# to train & test, further split the train data set to Train and Validate

#create label encoders for categorical features
for var in cat_cols:
    number = LabelEncoder()
    X_train_away[var] = number.fit_transform(X_train_away[var].astype('str'))

for var in cat_cols:
    number = LabelEncoder()
    X_test_away[var] = number.fit_transform(X_test_away[var].astype('str'))

In [114]:
# Pass the imputed and dummy (missing values flags) variables into the
# modeling process. Use random forest to predict the class
features=list(set(list(X_train_away.columns))-set(ID_col)-set(target_col)-set(other_col))

In [115]:
X_train_away = X_train_away[list(features)]
X_test_away= X_test_away[list(features)]

In [116]:
X_train_away.head()

Unnamed: 0,avgRecv,totalDvoaMatchup,offMatchup,roof,ovrMatchup,offDvoaMatchup,passMatchup,surface,ptsMatchup,stDvoa,pblkMatchup,runMatchup
1788,71.714286,0.631,12.457143,1,10.657143,0.397,16.8,0,49.428571,-0.062,7.742857,-2.957143
1490,60.814286,-0.223,0.028571,1,-3.357143,-0.038,-2.042857,0,44.428571,0.007,1.785714,-8.4
2366,64.7,0.329,1.328571,0,-2.485714,0.032,-4.8,1,46.571429,-0.034,9.2,-4.957143
2205,72.328571,0.374,16.542857,0,10.271429,0.35,21.142857,1,62.285714,-0.016,17.885714,-2.771429
2521,65.214286,0.172,2.328571,0,0.771429,-0.062,-1.828571,0,51.857143,0.101,7.257143,-9.371429


In [117]:
rf = RandomForestRegressor(n_estimators=600, max_depth=6, min_impurity_decrease=0.6)
rf.fit(X_train_away, y_train_away)
y_pred_away = rf.predict(X_test_away)
print(mean_squared_error(y_test_away, y_pred_away))

  


87.7576821829


In [118]:
# Now repeat the process for homeMatchup table:
X_train_home, X_test_home, y_train_home, y_test_home = train_test_split(homeMatchup.iloc[:,:-1],homeMatchup.iloc[:,-1:], test_size=0.2,random_state=2107)

for var in cat_cols:
    number = LabelEncoder()
    X_train_home[var] = number.fit_transform(X_train_home[var].astype('str'))

for var in cat_cols:
    number = LabelEncoder()
    X_test_home[var] = number.fit_transform(X_test_home[var].astype('str'))
    
features=list(set(list(X_train_home.columns))-set(ID_col)-set(target_col)-set(other_col))

X_train_home = X_train_home[list(features)]
X_test_home= X_test_home[list(features)]

In [119]:
X_train_home.head()

Unnamed: 0,avgRecv,totalDvoaMatchup,offMatchup,roof,ovrMatchup,offDvoaMatchup,passMatchup,surface,ptsMatchup,stDvoa,pblkMatchup,runMatchup
1788,64.4,-0.631,-5.157143,1,-10.657143,-0.261,-13.471429,0,33.857143,-0.036,8.185714,-2.671429
1490,64.714286,0.223,4.228571,1,3.357143,0.179,3.271429,0,51.428571,0.012,6.357143,-5.871429
2366,64.885714,-0.329,4.914286,0,2.485714,-0.283,10.471429,1,44.857143,-0.048,9.342857,-8.057143
2205,66.242857,-0.374,3.371429,0,-10.271429,-0.027,-0.271429,1,41.142857,-0.012,12.128571,-3.028571
2521,66.528571,-0.172,3.328571,0,-0.771429,-0.085,7.085714,0,46.285714,-0.048,0.642857,-10.942857


In [120]:
rf = RandomForestRegressor(n_estimators=600, max_depth=6, min_impurity_decrease=0.6)
rf.fit(X_train_home, y_train_home)
y_pred_home = rf.predict(X_test_home)
print(mean_squared_error(y_test_home, y_pred_home))

  


99.184282222


In [121]:
testDf = pd.DataFrame(X_test_away)
testDf['awayScore'] = y_test_away
testDf['awayPrediction'] = y_pred_away
testDf2 = pd.DataFrame(X_test_home)
testDf2['homeScore'] = y_test_home
testDf2['homePrediction'] = y_pred_home

In [122]:
colsAway = list(testDf.columns)
colsAway.pop(-7)
colsAway.pop(3)
colsAway.pop(-1)
colsAway.pop(-1)
#colsAway
colsHome = list(testDf2.columns)
colsHome.pop(-7)
colsHome.pop(3)
colsHome.pop(-1)
colsHome.pop(-1)
# colsHome

'homeScore'

In [123]:
awayPredDf = awayMatchup.merge(testDf, on=colsAway)
homePredDf = homeMatchup.merge(testDf2, on=colsHome)

In [124]:
awayHomeMerge = awayPredDf.merge(homePredDf, on=['gameId','date','season','week'])
#awayHomeMerge.columns

In [125]:
awayHomeMerge = awayHomeMerge[['gameId', 'date', 'season', 'week','awayPrediction', 'homePrediction']]
awayHomeMerge.head()

Unnamed: 0,gameId,date,season,week,awayPrediction,homePrediction
0,201812300sea,2018-12-30,2018,17,15.800897,28.438475
1,201712170was,2017-12-17,2017,15,19.396151,21.874671
2,201709100det,2017-09-10,2017,1,21.380432,24.275178
3,201710080phi,2017-10-08,2017,5,19.855067,27.508813
4,201701010ram,2017-01-01,2016,17,21.580924,19.524006


In [126]:
finalDf = gameTable.merge(awayHomeMerge, on=['gameId', 'date'])
finalDf['totalPrediction'] = finalDf['awayPrediction'] + finalDf['homePrediction']
#finalDf.columns

In [127]:
finalDf = finalDf[['gameId', 'date', 'season', 'week', 'awayTeam', 'awayPrediction', 'awayScore',
                   'homeTeam','homePrediction', 'homeScore', 'overUnder', 'result', 'totalPrediction', 'totalScore']]

In [128]:
undersDf = finalDf[finalDf['totalPrediction'] <= finalDf['overUnder'] - 5]
undersDf['result'].value_counts()

under    13
over     10
push      1
Name: result, dtype: int64

In [129]:
undersDf = finalDf[finalDf['totalPrediction'] <= finalDf['overUnder'] - 3]
undersDf['result'].value_counts()

under    43
over     37
push      1
Name: result, dtype: int64

In [130]:
undersDf = finalDf[finalDf['totalPrediction'] <= finalDf['overUnder'] - 1]
undersDf['result'].value_counts()

over     100
under     97
push       2
Name: result, dtype: int64

In [131]:
oversDf = finalDf[finalDf['totalPrediction'] >= finalDf['overUnder'] + 5]
oversDf['result'].value_counts()

under    26
over     24
push      1
Name: result, dtype: int64

In [132]:
oversDf = finalDf[finalDf['totalPrediction'] >= finalDf['overUnder'] + 3]
oversDf['result'].value_counts()

under    63
over     62
push      5
Name: result, dtype: int64

In [133]:
oversDf = finalDf[finalDf['totalPrediction'] >= finalDf['overUnder'] + 1]
oversDf['result'].value_counts()

over     133
under    128
push       9
Name: result, dtype: int64