# Data importing and cleaning

In [1]:
import pandas as pd
import numpy as np
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [2]:
pd.set_option('display.max_columns', 500)

In [3]:
df = pd.read_csv("data/data_cumulative.csv", index_col=0)

In [4]:
df.head(2)

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_home_id,team_away_id,team_underdog_id,total_score,over_under_result,actual_spread,spread_result,game_winner_id,home_win,game_id_h,game_id_a,stadium_name,stadium_type,stadium_weather_type,stadium_surface,LATITUDE,LONGITUDE,ELEVATION,h_game_id_team,h_travel_distance,h_season_wins,h_season_losses,h_season_ties,h_season_pct,h_days_last_game,h_avg_season_score,a_game_id_team,a_travel_distance,a_season_wins,a_season_losses,a_season_ties,a_season_pct,a_days_last_game,a_avg_season_score
0,2011-09-08,2011,1,False,Green Bay Packers,42,34,New Orleans Saints,GB,-5.0,48.0,Lambeau Field,False,68.0,5.0,67.0,Fair,GB,NO,NO,76,over,-8,favorite,GB,1,2011-1-GB,2011-1-NO,Lambeau Field,outdoor,cold,Grass,44.4794,-88.1366,209.4,2011-1-GB,0.0,0.0,0.0,0.0,0.5,250.0,22.5,2011-1-NO,0.0,0.0,0.0,0.0,0.5,250.0,22.5
1,2011-09-11,2011,1,False,Arizona Cardinals,28,21,Carolina Panthers,ARI,-7.0,37.5,University of Phoenix Stadium,False,72.0,0.0,,DOME,ARI,CAR,CAR,49,over,-7,push,ARI,1,2011-1-ARI,2011-1-CAR,University of Phoenix Stadium,retractable,dome,Grass,33.4552,-111.9316,375.2,2011-1-ARI,0.0,0.0,0.0,0.0,0.5,250.0,22.5,2011-1-CAR,0.0,0.0,0.0,0.0,0.5,250.0,22.5


In [5]:
# 1 = Home Win
# 0 = Away Win
# -1 = Tie

df.home_win.value_counts()

 1    1230
 0     899
-1       7
Name: home_win, dtype: int64

In [6]:
# Import 538 elo

elo = pd.read_csv('./data/nfl_elo.csv')
elo.head()
elo.replace('WSH', 'WAS', inplace=True)

In [7]:
def keep_notna_elo(row):
    if np.isnan(row.elo_prob1):
        return row.elo_prob2
    else:
        return row.elo_prob1

In [8]:
elo_merge_fields = ['date', 'team1']
elo_keep_fields = elo_merge_fields + ['elo_prob1']

df_elo = df.merge(elo[elo_keep_fields], how='left', left_on=[
                  'schedule_date', 'team_home_id'], right_on=elo_merge_fields)


# repeat for some neutral field games where 538 has the other team as home
elo_merge_fields = ['date', 'team2']
elo_keep_fields = elo_merge_fields + ['elo_prob2']

df_elo = df_elo.merge(elo[elo_keep_fields], how='left', left_on=[
                  'schedule_date', 'team_home_id'], right_on=elo_merge_fields)


df_elo['elo_prob_home'] = df_elo.apply(keep_notna_elo, axis=1)

# remove unused merge fields
df_elo.drop(columns=['date_x', 'team1', 'elo_prob1', 'date_y', 'team2', 'elo_prob2'], inplace=True)

#check for missing
df_elo[df_elo.elo_prob_home.isna()]

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_home_id,team_away_id,team_underdog_id,total_score,over_under_result,actual_spread,spread_result,game_winner_id,home_win,game_id_h,game_id_a,stadium_name,stadium_type,stadium_weather_type,stadium_surface,LATITUDE,LONGITUDE,ELEVATION,h_game_id_team,h_travel_distance,h_season_wins,h_season_losses,h_season_ties,h_season_pct,h_days_last_game,h_avg_season_score,a_game_id_team,a_travel_distance,a_season_wins,a_season_losses,a_season_ties,a_season_pct,a_days_last_game,a_avg_season_score,elo_prob_home


In [9]:
df = df_elo.copy()

# Define Baseline accuracy

In [10]:
# Minimum Baseline (Naive)
# accuracy if picking home team win in every game
df.home_win.sum()/len(df)

0.572565543071161

In [11]:
# Target Baseline
# accuracy if picking the vegas favorite team in each game
(df["home_win"] == (df["actual_spread"] < 0)).sum()/len(df)

0.651685393258427

# Define X, y

In [12]:
y, X = dmatrices('home_win ~ schedule_season + a_season_pct + h_season_pct \
                 + C(team_home_id) + a_avg_season_score \
                 + h_avg_season_score + C(team_away_id) + schedule_week \
                 + ELEVATION + weather_detail + h_travel_distance \
                 + a_travel_distance + h_days_last_game + a_days_last_game \
                 + elo_prob_home', 
                 df, return_type = "dataframe" )

In [13]:
X.tail()

Unnamed: 0,Intercept,C(team_home_id)[T.ATL],C(team_home_id)[T.BAL],C(team_home_id)[T.BUF],C(team_home_id)[T.CAR],C(team_home_id)[T.CHI],C(team_home_id)[T.CIN],C(team_home_id)[T.CLE],C(team_home_id)[T.DAL],C(team_home_id)[T.DEN],C(team_home_id)[T.DET],C(team_home_id)[T.GB],C(team_home_id)[T.HOU],C(team_home_id)[T.IND],C(team_home_id)[T.JAX],C(team_home_id)[T.KC],C(team_home_id)[T.LAC],C(team_home_id)[T.LAR],C(team_home_id)[T.MIA],C(team_home_id)[T.MIN],C(team_home_id)[T.NE],C(team_home_id)[T.NO],C(team_home_id)[T.NYG],C(team_home_id)[T.NYJ],C(team_home_id)[T.OAK],C(team_home_id)[T.PHI],C(team_home_id)[T.PIT],C(team_home_id)[T.SEA],C(team_home_id)[T.SF],C(team_home_id)[T.TB],C(team_home_id)[T.TEN],C(team_home_id)[T.WAS],C(team_away_id)[T.ATL],C(team_away_id)[T.BAL],C(team_away_id)[T.BUF],C(team_away_id)[T.CAR],C(team_away_id)[T.CHI],C(team_away_id)[T.CIN],C(team_away_id)[T.CLE],C(team_away_id)[T.DAL],C(team_away_id)[T.DEN],C(team_away_id)[T.DET],C(team_away_id)[T.GB],C(team_away_id)[T.HOU],C(team_away_id)[T.IND],C(team_away_id)[T.JAX],C(team_away_id)[T.KC],C(team_away_id)[T.LAC],C(team_away_id)[T.LAR],C(team_away_id)[T.MIA],C(team_away_id)[T.MIN],C(team_away_id)[T.NE],C(team_away_id)[T.NO],C(team_away_id)[T.NYG],C(team_away_id)[T.NYJ],C(team_away_id)[T.OAK],C(team_away_id)[T.PHI],C(team_away_id)[T.PIT],C(team_away_id)[T.SEA],C(team_away_id)[T.SF],C(team_away_id)[T.TB],C(team_away_id)[T.TEN],C(team_away_id)[T.WAS],weather_detail[T.DOME (Open Roof)],weather_detail[T.Fair],weather_detail[T.Fog],weather_detail[T.Rain],weather_detail[T.Rain | Fog],weather_detail[T.Snow],weather_detail[T.Snow | Fog],weather_detail[T.Snow | Freezing Rain],schedule_season,a_season_pct,h_season_pct,a_avg_season_score,h_avg_season_score,schedule_week,ELEVATION,h_travel_distance,a_travel_distance,h_days_last_game,a_days_last_game,elo_prob_home
2131,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.588235,0.8125,22.529412,31.5,19.0,32.0,0.0,1339.496163,14.0,7.0,0.641378
2132,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.764706,0.6875,26.529412,27.25,19.0,24.4,0.0,576.751879,14.0,7.0,0.582068
2133,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.823529,0.823529,32.764706,30.823529,20.0,32.0,0.0,2703.956221,7.0,8.0,0.638772
2134,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.705882,0.764706,28.058824,35.058824,20.0,264.9,0.0,2009.546077,8.0,7.0,0.611248
2135,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.833333,0.722222,32.388889,28.555556,21.0,309.0,1078.812466,683.109522,14.0,14.0,0.527846


In [14]:
y.tail()

Unnamed: 0,home_win
2131,1.0
2132,1.0
2133,0.0
2134,0.0
2135,1.0


## Define a moving window for training data, where test data is the following season

In [15]:
yr = 2017
window=4

In [16]:
X_train = X[(X["schedule_season"] < yr) & (X["schedule_season"] >= yr - window)]
y_train = y[(X["schedule_season"] < yr) & (X["schedule_season"] >= yr - window)].squeeze()

In [17]:
X_test = X[(X["schedule_season"] == yr)]
y_test = y[(X["schedule_season"] == yr)].squeeze()

# Random Forest

In [18]:
forest = RandomForestClassifier(n_estimators=30, random_state=33)
y_pred = forest.fit(X_train, y_train).predict(X_test)

In [19]:
accuracy_score(y_test, y_pred)

0.6479400749063671

## Get the best number of estimators

In [20]:
acc = []
for n in [50, 100, 200]:
    for k in range(20):
        forest = RandomForestClassifier(n_estimators=n)
        y_pred = forest.fit(X_train, y_train).predict(X_test)
        acc.append(accuracy_score(y_test, y_pred))
    print(n, np.mean(acc))

50 0.650374531835206
100 0.6562734082397004
200 0.6580524344569286


In [21]:
acc = []

for k in range(20):
    forest = RandomForestClassifier(bootstrap=False, max_depth=60,max_features="auto",min_samples_leaf=4,min_samples_split=5,n_estimators=100)

    y_pred = forest.fit(X_train, y_train).predict(X_test)
    acc.append(accuracy_score(y_test, y_pred))
np.mean(acc)

0.6481273408239702

## Get the feature importances to see if we can remove any variables

In [22]:
feature_importance = pd.DataFrame(list(zip(X.columns, forest.feature_importances_)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False)

print('Most Important Features')
print('-'*60)
print(feature_importance.head(10))
print('-'*60)

print('\t')
print('Least Important Features')
print('-'*60)
print(feature_importance.tail(10))
print('-'*60)

Most Important Features
------------------------------------------------------------
               Feature  Importance
82       elo_prob_home    0.205526
73        h_season_pct    0.089985
74  a_avg_season_score    0.085535
75  h_avg_season_score    0.071038
72        a_season_pct    0.068240
77           ELEVATION    0.066797
79   a_travel_distance    0.066093
78   h_travel_distance    0.059566
76       schedule_week    0.052059
71     schedule_season    0.024469
------------------------------------------------------------
	
Least Important Features
------------------------------------------------------------
                                   Feature  Importance
12                  C(team_home_id)[T.HOU]    0.000850
24                  C(team_home_id)[T.OAK]    0.000632
55                  C(team_away_id)[T.OAK]    0.000584
35                  C(team_away_id)[T.CAR]    0.000563
67            weather_detail[T.Rain | Fog]    0.000206
65                   weather_detail[T.Fog]    0.000

# Grid Search

In [23]:
n_estimators = [50,] #[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [24]:
n_folds = 3
forest_r = RandomForestRegressor(bootstrap=True, max_depth = 10, max_features="sqrt",
min_samples_leaf=4, min_samples_split=2, n_estimators=50, random_state=42)
gs = GridSearchCV(forest_r,random_grid, cv=n_folds)



In [25]:
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [26]:
gs.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 50}

In [27]:
y_pred = gs.predict(X_test)
accuracy_score(y_test, (y_pred>0.5)*1)

0.6404494382022472

# Random Forest Regressor

In [28]:
forest_r = RandomForestRegressor(bootstrap=True, max_depth = 10, max_features="sqrt",
min_samples_leaf=4, min_samples_split=2, n_estimators=50, random_state=42)

In [29]:
y_pred = forest_r.fit(X_train, y_train).predict(X_test)


In [30]:
feature_importance = pd.DataFrame(list(zip(X.columns, forest_r.feature_importances_)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False)

print('Most Important Features')
print('-'*60)
print(feature_importance.head(10))
print('-'*60)

print('\t')
print('Least Important Features')
print('-'*60)
print(feature_importance.tail(10))
print('-'*60)

Most Important Features
------------------------------------------------------------
                  Feature  Importance
82          elo_prob_home    0.220013
74     a_avg_season_score    0.094466
73           h_season_pct    0.083884
72           a_season_pct    0.071484
79      a_travel_distance    0.065236
75     h_avg_season_score    0.064703
77              ELEVATION    0.054410
78      h_travel_distance    0.048330
76          schedule_week    0.039335
20  C(team_home_id)[T.NE]    0.019800
------------------------------------------------------------
	
Least Important Features
------------------------------------------------------------
                                   Feature  Importance
67            weather_detail[T.Rain | Fog]    0.000388
12                  C(team_home_id)[T.HOU]    0.000335
63      weather_detail[T.DOME (Open Roof)]    0.000324
49                  C(team_away_id)[T.MIA]    0.000322
32                  C(team_away_id)[T.ATL]    0.000007
68                

In [31]:
y_pred_2 = 1* (y_pred > .5 )

In [32]:
accuracy_score(y_test, y_pred_2)

0.6404494382022472

## Dealing with Close Calls (40% - 60%)

In [33]:
# Start with Actual Results and Predicted Probabilities
bets_df = pd.DataFrame(list(zip(y_test, y_pred)), columns=['home_win_actual', 'home_win_predicted_prob'])

In [34]:
bets_df.head()

Unnamed: 0,home_win_actual,home_win_predicted_prob
0,0.0,0.691547
1,1.0,0.572446
2,0.0,0.355665
3,0.0,0.609637
4,0.0,0.337666


In [35]:
# Define threshold for close call predictions
# In this case, threshold=10% so close calls are 40% - 60%
thresh = 0.10

bets_df['close_call'] = np.abs(bets_df.home_win_predicted_prob - 0.5) < thresh
bets_df.head()

Unnamed: 0,home_win_actual,home_win_predicted_prob,close_call
0,0.0,0.691547,False
1,1.0,0.572446,True
2,0.0,0.355665,False
3,0.0,0.609637,False
4,0.0,0.337666,False


## Strategy 1: Bet on the Home team if Close Call

In [36]:
## If a close call, guess a home team victor
def strategy_close_home(row):
    if row.close_call:
        return 1 # bet on home
    else:
        return round(row.home_win_predicted_prob, 0)
            

bets_df["s1_bet"] = bets_df.apply(strategy_close_home, axis=1)
bets_df['s1_win'] = 1 * (bets_df.home_win_actual == bets_df.s1_bet)

In [37]:
bets_df.head()

Unnamed: 0,home_win_actual,home_win_predicted_prob,close_call,s1_bet,s1_win
0,0.0,0.691547,False,1.0,0
1,1.0,0.572446,True,1.0,1
2,0.0,0.355665,False,0.0,1
3,0.0,0.609637,False,1.0,0
4,0.0,0.337666,False,0.0,1


In [38]:
# Percentage of Correct bets based on Strategy 1
bets_df["s1_win"].sum()/len(bets_df)

0.6217228464419475

# Strategy 2: Don't bet on Close Calls

In [39]:
## If a close call, guess a home team victor
def strategy_close_abstain(row):
    if row.close_call:
        return np.nan # indicate no bet made
    else:
        return round(row.home_win_predicted_prob, 0)
            

bets_df["s2_bet"] = bets_df.apply(strategy_close_abstain, axis=1)
bets_df['s2_win'] = 1 * (bets_df.home_win_actual == bets_df.s2_bet)

In [40]:
bets_df.head()

Unnamed: 0,home_win_actual,home_win_predicted_prob,close_call,s1_bet,s1_win,s2_bet,s2_win
0,0.0,0.691547,False,1.0,0,1.0,0
1,1.0,0.572446,True,1.0,1,,0
2,0.0,0.355665,False,0.0,1,0.0,1
3,0.0,0.609637,False,1.0,0,1.0,0
4,0.0,0.337666,False,0.0,1,0.0,1


In [41]:
# Percentage of Correct bets based on Strategy 2
bets_df["s2_win"].sum()/np.sum(bets_df.s2_bet.notna())

0.6839378238341969

# Test on all Data

In [42]:
y_pred_all = pd.DataFrame(forest_r.fit(X_train, y_train).predict(X))

In [43]:
y_pred_all["real"] = y.iloc[:,0]

In [44]:
y_pred_all["close_call"] = y_pred_all.apply(lambda x: (x[0] > .4) and (x[0] < .6) , axis=1)

In [45]:
def strategy(row):
    if row.close_call:
        return row["real"]
    else:
        return row["real"] == round(row[0])

y_pred_all["pred"] = y_pred_all.apply(strategy, axis=1)

In [46]:
y_pred_all["pred"].sum()/len(y_pred_all)

0.6713483146067416

# Test on Each Season

In [47]:
def model_nfl_predictions(df, season, training_window=4, threshold=0.10):
    """Random Forest Regressor Model to Predict winner of NFL games in a selected season"""
    
    # Define X features and y target
    y, X = dmatrices('home_win ~ schedule_season + a_season_pct + h_season_pct \
                 + C(team_home_id) + a_avg_season_score \
                 + h_avg_season_score + C(team_away_id) + schedule_week \
                 + ELEVATION + weather_detail + h_travel_distance \
                 + a_travel_distance + h_days_last_game + a_days_last_game \
                 + elo_prob_home', 
                 df, return_type = "dataframe" )
    
    # Keep vegas actual spread for comparison, but not in model
    vegas_actual = df['actual_spread']
    
    # Define training seasons and test season
    X_train = X[(X["schedule_season"] < season) & (
        X["schedule_season"] >= season - training_window)]
    y_train = y[(X["schedule_season"] < season) & (
        X["schedule_season"] >= season - training_window)].squeeze()
    
    X_test = X[(X["schedule_season"] == season)]
    y_test = y[(X["schedule_season"] == season)].squeeze()
    vegas_test_spread = vegas_actual[(X["schedule_season"] == season)]
    
    print('-'*40)
    print('Predictions for {} Season'.format(season))
    min_train = int(X_train.schedule_season.min())
    max_train = int(X_train.schedule_season.max())
    print('Model Based on {}-{} Seasons\n'.format(min_train, max_train))
    
    # Naive baseline accuracy (always pick home)
    naive_acc = y_test.sum()/len(y_test)
    print('Always Pick Home Accuracy: {:.2%}'.format(naive_acc))
    
    # Target Vegas Favorite accuracy
    vegas_acc = (y_test == (vegas_test_spread < 0)).sum()/len(y_test)
    print('Target Vegas Accuracy: {:.2%}'.format(vegas_acc))
    print('\t')
    
    # Define Random Forest Regressor Model
    forest_r = RandomForestRegressor(bootstrap=True, max_depth = 10, max_features="sqrt",
                                     min_samples_leaf=4, min_samples_split=2, n_estimators=50, random_state=42)
    # Make predictions from Random Forest Regressor model
    y_pred = forest_r.fit(X_train, y_train).predict(X_test)
    
    # Print model Accuracy for 0.5 cutoff
    model_acc = accuracy_score(y_test, (y_pred > 0.5))
    print('Random Forest Accuracy: {:.2%}'.format(model_acc))
    
    # Start with Actual Results and Predicted Probabilities
    bets_df = pd.DataFrame(list(zip(y_test, y_pred)), columns=['home_win_actual', 'home_win_predicted_prob'])
    
    bets_df['close_call'] = np.abs(bets_df.home_win_predicted_prob - 0.5) < threshold
    
    # Strategy 1 - Bet on home if close
    bets_df["s1_bet"] = bets_df.apply(strategy_close_home, axis=1)
    bets_df['s1_win'] = 1 * (bets_df.home_win_actual == bets_df.s1_bet)
    strat1_acc = bets_df["s1_win"].sum()/len(bets_df)
    
    # Strategy 2 - Abstain from bet if close
    bets_df["s2_bet"] = bets_df.apply(strategy_close_abstain, axis=1)
    bets_df['s2_win'] = 1 * (bets_df.home_win_actual == bets_df.s2_bet)
    strat2_acc = bets_df["s2_win"].sum()/np.sum(bets_df.s2_bet.notna())
    
    print('Strategy 1 Accuracy: {:.2%}'.format(strat1_acc))
    print('Strategy 2 Accuracy: {:.2%}'.format(strat2_acc))
    print('-'*40)
    print('\t')

In [48]:
for season in range(2012, 2019):
    model_nfl_predictions(df, season)

----------------------------------------
Predictions for 2012 Season
Model Based on 2011-2011 Seasons

Always Pick Home Accuracy: 56.55%
Target Vegas Accuracy: 63.67%
	
Random Forest Accuracy: 62.17%
Strategy 1 Accuracy: 61.42%
Strategy 2 Accuracy: 69.43%
----------------------------------------
	
----------------------------------------
Predictions for 2013 Season
Model Based on 2011-2012 Seasons

Always Pick Home Accuracy: 59.18%
Target Vegas Accuracy: 64.42%
	
Random Forest Accuracy: 60.30%
Strategy 1 Accuracy: 58.43%
Strategy 2 Accuracy: 69.60%
----------------------------------------
	
----------------------------------------
Predictions for 2014 Season
Model Based on 2011-2013 Seasons

Always Pick Home Accuracy: 57.30%
Target Vegas Accuracy: 68.54%
	
Random Forest Accuracy: 66.29%
Strategy 1 Accuracy: 62.55%
Strategy 2 Accuracy: 70.00%
----------------------------------------
	
----------------------------------------
Predictions for 2015 Season
Model Based on 2011-2014 Seasons

