# Data importing and cleaning

In [266]:
import pandas as pd
import numpy as np
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [267]:
pd.set_option('display.max_columns', 500)

In [270]:
df = pd.read_csv("data/data_cumulative.csv")

In [271]:
df = df.drop(columns="Unnamed: 0")

In [272]:
df.head(2)

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_home_id,team_away_id,team_underdog_id,total_score,over_under_result,actual_spread,spread_result,game_winner_id,game_id_h,game_id_a,stadium_name,stadium_type,stadium_weather_type,stadium_surface,LATITUDE,LONGITUDE,ELEVATION,h_game_id_team,h_travel_distance,h_season_wins,h_season_losses,h_season_ties,h_season_pct,h_days_last_game,h_avg_season_score,a_game_id_team,a_travel_distance,a_season_wins,a_season_losses,a_season_ties,a_season_pct,a_days_last_game,a_avg_season_score
0,2011-09-08,2011,1,False,Green Bay Packers,42,34,New Orleans Saints,GB,-5.0,48.0,Lambeau Field,False,68.0,5.0,67.0,Fair,GB,NO,NO,76,over,-8,favorite,GB,2011-1-GB,2011-1-NO,Lambeau Field,outdoor,cold,Grass,44.4794,-88.1366,209.4,2011-1-GB,0.0,0.0,0.0,0.0,0.5,250.0,22.5,2011-1-NO,0.0,0.0,0.0,0.0,0.5,250.0,22.5
1,2011-09-11,2011,1,False,Arizona Cardinals,28,21,Carolina Panthers,ARI,-7.0,37.5,University of Phoenix Stadium,False,72.0,0.0,,DOME,ARI,CAR,CAR,49,over,-7,push,ARI,2011-1-ARI,2011-1-CAR,University of Phoenix Stadium,retractable,dome,Grass,33.4552,-111.9316,375.2,2011-1-ARI,0.0,0.0,0.0,0.0,0.5,250.0,22.5,2011-1-CAR,0.0,0.0,0.0,0.0,0.5,250.0,22.5


In [273]:
df["home_win"] = df.apply(lambda x: x["game_winner_id"] == x["team_home_id"], axis=1)

# Define X, y

In [274]:
y, X = dmatrices('home_win ~ schedule_season + a_season_pct + h_season_pct + C(team_favorite_id) \
                         + a_avg_season_score + h_avg_season_score \
                         + C(team_underdog_id) + schedule_week + ELEVATION \
                         + weather_detail + h_travel_distance + a_travel_distance + h_days_last_game + a_days_last_game', 
                         df, return_type = "dataframe" )

In [275]:
X.tail()

Unnamed: 0,Intercept,C(team_favorite_id)[T.ATL],C(team_favorite_id)[T.BAL],C(team_favorite_id)[T.BUF],C(team_favorite_id)[T.CAR],C(team_favorite_id)[T.CHI],C(team_favorite_id)[T.CIN],C(team_favorite_id)[T.CLE],C(team_favorite_id)[T.DAL],C(team_favorite_id)[T.DEN],C(team_favorite_id)[T.DET],C(team_favorite_id)[T.GB],C(team_favorite_id)[T.HOU],C(team_favorite_id)[T.IND],C(team_favorite_id)[T.JAX],C(team_favorite_id)[T.KC],C(team_favorite_id)[T.LAC],C(team_favorite_id)[T.LAR],C(team_favorite_id)[T.MIA],C(team_favorite_id)[T.MIN],C(team_favorite_id)[T.NE],C(team_favorite_id)[T.NO],C(team_favorite_id)[T.NYG],C(team_favorite_id)[T.NYJ],C(team_favorite_id)[T.OAK],C(team_favorite_id)[T.PHI],C(team_favorite_id)[T.PICK],C(team_favorite_id)[T.PIT],C(team_favorite_id)[T.SEA],C(team_favorite_id)[T.SF],C(team_favorite_id)[T.TB],C(team_favorite_id)[T.TEN],C(team_favorite_id)[T.WAS],C(team_underdog_id)[T.ATL],C(team_underdog_id)[T.BAL],C(team_underdog_id)[T.BUF],C(team_underdog_id)[T.CAR],C(team_underdog_id)[T.CHI],C(team_underdog_id)[T.CIN],C(team_underdog_id)[T.CLE],C(team_underdog_id)[T.DAL],C(team_underdog_id)[T.DEN],C(team_underdog_id)[T.DET],C(team_underdog_id)[T.GB],C(team_underdog_id)[T.HOU],C(team_underdog_id)[T.IND],C(team_underdog_id)[T.JAX],C(team_underdog_id)[T.KC],C(team_underdog_id)[T.LAC],C(team_underdog_id)[T.LAR],C(team_underdog_id)[T.MIA],C(team_underdog_id)[T.MIN],C(team_underdog_id)[T.NE],C(team_underdog_id)[T.NO],C(team_underdog_id)[T.NYG],C(team_underdog_id)[T.NYJ],C(team_underdog_id)[T.OAK],C(team_underdog_id)[T.PHI],C(team_underdog_id)[T.PICK],C(team_underdog_id)[T.PIT],C(team_underdog_id)[T.SEA],C(team_underdog_id)[T.SF],C(team_underdog_id)[T.TB],C(team_underdog_id)[T.TEN],C(team_underdog_id)[T.WAS],weather_detail[T.DOME (Open Roof)],weather_detail[T.Fair],weather_detail[T.Fog],weather_detail[T.Rain],weather_detail[T.Rain | Fog],weather_detail[T.Snow],weather_detail[T.Snow | Fog],weather_detail[T.Snow | Freezing Rain],schedule_season,a_season_pct,h_season_pct,a_avg_season_score,h_avg_season_score,schedule_week,ELEVATION,h_travel_distance,a_travel_distance,h_days_last_game,a_days_last_game
2131,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.588235,0.8125,22.529412,31.5,19.0,32.0,0.0,1339.496163,14.0,7.0
2132,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.764706,0.6875,26.529412,27.25,19.0,24.4,0.0,576.751879,14.0,7.0
2133,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.823529,0.823529,32.764706,30.823529,20.0,32.0,0.0,2703.956221,7.0,8.0
2134,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.705882,0.764706,28.058824,35.058824,20.0,264.9,0.0,2009.546077,8.0,7.0
2135,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.833333,0.722222,32.388889,28.555556,21.0,309.0,1078.812466,683.109522,14.0,14.0


## Define a moving window for training data, where test data is the following season

In [276]:
yr = 2017
window=4

In [277]:
X_train = X[(X["schedule_season"] < yr) & (X["schedule_season"] >= yr - window)]
y_train = y[(X["schedule_season"] < yr) & (X["schedule_season"] >= yr - window)].squeeze()

In [278]:
X_test = X[(X["schedule_season"] == yr)]
y_test = y[(X["schedule_season"] == yr)].squeeze()

# Random Forest

In [279]:
forest = RandomForestClassifier(n_estimators=30, random_state=33)
y_pred = forest.fit(X_train, y_train).predict(X_test)

In [280]:
accuracy_score(y_test, y_pred)

0.5805243445692884

## Get the best number of estimators

In [281]:
acc = []
for n in [50, 100, 200]:
    for k in range(20):
        forest = RandomForestClassifier(n_estimators=n)
        y_pred = forest.fit(X_train, y_train).predict(X_test)
        acc.append(accuracy_score(y_test, y_pred))
    print(n, np.mean(acc))

50 0.5801498127340825
100 0.5875468164794008
200 0.5902621722846443


In [282]:
acc = []

for k in range(20):
    forest = RandomForestClassifier(bootstrap=False, max_depth=60,max_features="auto",min_samples_leaf=4,min_samples_split=5,n_estimators=100)

    y_pred = forest.fit(X_train, y_train).predict(X_test)
    acc.append(accuracy_score(y_test, y_pred))
np.mean(acc)

0.600936329588015

## Get the feature importances to see if we can remove any variables

In [283]:
for a,b in zip(X.columns, forest.feature_importances_):
    print(a,b)

Intercept 0.0
C(team_favorite_id)[T.ATL] 0.004817342891309782
C(team_favorite_id)[T.BAL] 0.004248881640859381
C(team_favorite_id)[T.BUF] 0.00576819741037809
C(team_favorite_id)[T.CAR] 0.004637360244789563
C(team_favorite_id)[T.CHI] 0.0005834732300924712
C(team_favorite_id)[T.CIN] 0.0014110332742768529
C(team_favorite_id)[T.CLE] 0.0017673402074286774
C(team_favorite_id)[T.DAL] 0.0023317279477403923
C(team_favorite_id)[T.DEN] 0.00381540535088701
C(team_favorite_id)[T.DET] 0.004495744821372515
C(team_favorite_id)[T.GB] 0.003573216548495912
C(team_favorite_id)[T.HOU] 0.002483532026802611
C(team_favorite_id)[T.IND] 0.0023344835613340565
C(team_favorite_id)[T.JAX] 0.00020352658495627427
C(team_favorite_id)[T.KC] 0.0034953208236544265
C(team_favorite_id)[T.LAC] 0.0005080096465128795
C(team_favorite_id)[T.LAR] 0.00020694384668942938
C(team_favorite_id)[T.MIA] 0.0014122800908264981
C(team_favorite_id)[T.MIN] 0.0009088108635788643
C(team_favorite_id)[T.NE] 0.0033804040807059953
C(team_favorite_i

# Grid Search

In [195]:
n_estimators = [50,] #[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [196]:
n_folds = 3
gs = GridSearchCV(forest,random_grid, cv=n_folds)



In [197]:
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=60, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [198]:
gs.best_params_

{'bootstrap': True,
 'max_depth': 30,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 50}

In [200]:
y_pred = gs.predict(X_test)
accuracy_score(y_test, y_pred)

0.6142322097378277

# Random Forest Regressor

In [284]:
forest_r = RandomForestRegressor(bootstrap=True, max_depth = 30, max_features="auto",
min_samples_leaf=4, min_samples_split=10, n_estimators=50)

In [285]:
y_pred = forest_r.fit(X_train, y_train).predict(X_test)


In [286]:
y_pred_2 = 1* (y_pred[:,1] > .5 )

In [287]:
accuracy_score(y_test.iloc[:,1], y_pred_2)

0.5917602996254682

## Dealing with Close Calls

In [295]:
ys = pd.DataFrame(y_pred[:,1],y_test.iloc[:,1] )

In [296]:
ys = ys.reset_index()

## Defining a Close Call as estimates near .5 (.1 threshold)

In [298]:
ys["close_call"] = ys.apply(lambda x: (x[0] > .4) and (x[0] < .6) , axis=1)

In [None]:
## If a close call, guess a home team victory

In [330]:
def strategy(row):
    if row.close_call:
        return row["home_win[True]"]
    else:
        return row["home_win[True]"] == round(row[0])
            

ys["pred"] = ys.apply(strategy, axis=1)

In [331]:
ys["pred"].sum()/len(ys)

0.5955056179775281

In [294]:
ys.head()

Unnamed: 0,home_win[True],0,home_win,close_call,pred
0,0.0,0.611322,True,False,False
1,1.0,0.425103,True,True,True
2,0.0,0.337266,True,False,True
3,0.0,0.456582,True,True,True
4,0.0,0.353299,False,False,True


# Ignoring Close Calls

In [242]:
ys_betted = ys[(ys[0] > .6) | (ys[0] < .4)]

In [237]:
ys_betted.head()

Unnamed: 0,home_win[True],0
0,0.0,0.700118
1,1.0,0.401775
2,0.0,0.278912
3,0.0,0.415041
4,0.0,0.337303


In [243]:
ys_betted["pred"] = ys_betted.apply(lambda x: x["home_win[True]"] == round(x[0]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [244]:
ys_betted["pred"].sum()/len(ys_betted)

0.6242774566473989

In [245]:
len(ys_betted)

173

In [246]:
len(X)

2136

# Test on all Data

In [312]:
y_pred_all = pd.DataFrame(forest_r.fit(X_train, y_train).predict(X))

In [323]:
y_pred_all["real"] = y.iloc[:,0]

In [320]:
y_pred_all["close_call"] = y_pred_all.apply(lambda x: (x[0] > .4) and (x[0] < .6) , axis=1)

In [332]:
def strategy(row):
    if row.close_call:
        return row["real"]
    else:
        return row["real"] == round(row[0])

y_pred_all["pred"] = y_pred_all.apply(strategy, axis=1)

In [333]:
y_pred_all["pred"].sum()/len(y_pred_all)

0.7626404494382022