# Data importing and cleaning

In [1]:
import pandas as pd
import numpy as np
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [2]:
pd.set_option('display.max_columns', 500)

In [3]:
df = pd.read_csv("data/data_cumulative.csv", index_col=0)

In [4]:
df.head(2)

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_home_id,team_away_id,team_underdog_id,total_score,over_under_result,actual_spread,spread_result,game_winner_id,home_win,game_id_h,game_id_a,stadium_name,stadium_type,stadium_weather_type,stadium_surface,LATITUDE,LONGITUDE,ELEVATION,h_game_id_team,h_travel_distance,h_season_wins,h_season_losses,h_season_ties,h_season_pct,h_days_last_game,h_avg_season_score,a_game_id_team,a_travel_distance,a_season_wins,a_season_losses,a_season_ties,a_season_pct,a_days_last_game,a_avg_season_score
0,2011-09-08,2011,1,False,Green Bay Packers,42,34,New Orleans Saints,GB,-5.0,48.0,Lambeau Field,False,68.0,5.0,67.0,Fair,GB,NO,NO,76,over,-8,favorite,GB,1,2011-1-GB,2011-1-NO,Lambeau Field,outdoor,cold,Grass,44.4794,-88.1366,209.4,2011-1-GB,0.0,0.0,0.0,0.0,0.5,250.0,22.5,2011-1-NO,0.0,0.0,0.0,0.0,0.5,250.0,22.5
1,2011-09-11,2011,1,False,Arizona Cardinals,28,21,Carolina Panthers,ARI,-7.0,37.5,University of Phoenix Stadium,False,72.0,0.0,,DOME,ARI,CAR,CAR,49,over,-7,push,ARI,1,2011-1-ARI,2011-1-CAR,University of Phoenix Stadium,retractable,dome,Grass,33.4552,-111.9316,375.2,2011-1-ARI,0.0,0.0,0.0,0.0,0.5,250.0,22.5,2011-1-CAR,0.0,0.0,0.0,0.0,0.5,250.0,22.5


In [5]:
# 1 = Home Win
# 0 = Away Win
# -1 = Tie

df.home_win.value_counts()

 1    1230
 0     899
-1       7
Name: home_win, dtype: int64

# Define Baseline accuracy

In [6]:
# Minimum Baseline (Naive)
# accuracy if picking home team win in every game
df.home_win.sum()/len(df)

0.572565543071161

In [7]:
# Target Baseline
# accuracy if picking the vegas favorite team in each game
(df["home_win"] == (df["actual_spread"] < 0)).sum()/len(df)

0.651685393258427

# Define X, y

In [8]:
y, X = dmatrices('home_win ~ schedule_season + a_season_pct + h_season_pct \
                 + C(team_favorite_id) + a_avg_season_score \
                 + h_avg_season_score + C(team_underdog_id) + schedule_week \
                 + ELEVATION + weather_detail + h_travel_distance \
                 + a_travel_distance + h_days_last_game + a_days_last_game', 
                 df, return_type = "dataframe" )

In [9]:
X.tail()

Unnamed: 0,Intercept,C(team_favorite_id)[T.ATL],C(team_favorite_id)[T.BAL],C(team_favorite_id)[T.BUF],C(team_favorite_id)[T.CAR],C(team_favorite_id)[T.CHI],C(team_favorite_id)[T.CIN],C(team_favorite_id)[T.CLE],C(team_favorite_id)[T.DAL],C(team_favorite_id)[T.DEN],C(team_favorite_id)[T.DET],C(team_favorite_id)[T.GB],C(team_favorite_id)[T.HOU],C(team_favorite_id)[T.IND],C(team_favorite_id)[T.JAX],C(team_favorite_id)[T.KC],C(team_favorite_id)[T.LAC],C(team_favorite_id)[T.LAR],C(team_favorite_id)[T.MIA],C(team_favorite_id)[T.MIN],C(team_favorite_id)[T.NE],C(team_favorite_id)[T.NO],C(team_favorite_id)[T.NYG],C(team_favorite_id)[T.NYJ],C(team_favorite_id)[T.OAK],C(team_favorite_id)[T.PHI],C(team_favorite_id)[T.PICK],C(team_favorite_id)[T.PIT],C(team_favorite_id)[T.SEA],C(team_favorite_id)[T.SF],C(team_favorite_id)[T.TB],C(team_favorite_id)[T.TEN],C(team_favorite_id)[T.WAS],C(team_underdog_id)[T.ATL],C(team_underdog_id)[T.BAL],C(team_underdog_id)[T.BUF],C(team_underdog_id)[T.CAR],C(team_underdog_id)[T.CHI],C(team_underdog_id)[T.CIN],C(team_underdog_id)[T.CLE],C(team_underdog_id)[T.DAL],C(team_underdog_id)[T.DEN],C(team_underdog_id)[T.DET],C(team_underdog_id)[T.GB],C(team_underdog_id)[T.HOU],C(team_underdog_id)[T.IND],C(team_underdog_id)[T.JAX],C(team_underdog_id)[T.KC],C(team_underdog_id)[T.LAC],C(team_underdog_id)[T.LAR],C(team_underdog_id)[T.MIA],C(team_underdog_id)[T.MIN],C(team_underdog_id)[T.NE],C(team_underdog_id)[T.NO],C(team_underdog_id)[T.NYG],C(team_underdog_id)[T.NYJ],C(team_underdog_id)[T.OAK],C(team_underdog_id)[T.PHI],C(team_underdog_id)[T.PICK],C(team_underdog_id)[T.PIT],C(team_underdog_id)[T.SEA],C(team_underdog_id)[T.SF],C(team_underdog_id)[T.TB],C(team_underdog_id)[T.TEN],C(team_underdog_id)[T.WAS],weather_detail[T.DOME (Open Roof)],weather_detail[T.Fair],weather_detail[T.Fog],weather_detail[T.Rain],weather_detail[T.Rain | Fog],weather_detail[T.Snow],weather_detail[T.Snow | Fog],weather_detail[T.Snow | Freezing Rain],schedule_season,a_season_pct,h_season_pct,a_avg_season_score,h_avg_season_score,schedule_week,ELEVATION,h_travel_distance,a_travel_distance,h_days_last_game,a_days_last_game
2131,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.588235,0.8125,22.529412,31.5,19.0,32.0,0.0,1339.496163,14.0,7.0
2132,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.764706,0.6875,26.529412,27.25,19.0,24.4,0.0,576.751879,14.0,7.0
2133,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.823529,0.823529,32.764706,30.823529,20.0,32.0,0.0,2703.956221,7.0,8.0
2134,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.705882,0.764706,28.058824,35.058824,20.0,264.9,0.0,2009.546077,8.0,7.0
2135,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,0.833333,0.722222,32.388889,28.555556,21.0,309.0,1078.812466,683.109522,14.0,14.0


In [10]:
y.tail()

Unnamed: 0,home_win
2131,1.0
2132,1.0
2133,0.0
2134,0.0
2135,1.0


## Define a moving window for training data, where test data is the following season

In [11]:
yr = 2017
window=4

In [12]:
X_train = X[(X["schedule_season"] < yr) & (X["schedule_season"] >= yr - window)]
y_train = y[(X["schedule_season"] < yr) & (X["schedule_season"] >= yr - window)].squeeze()

In [13]:
X_test = X[(X["schedule_season"] == yr)]
y_test = y[(X["schedule_season"] == yr)].squeeze()

# Random Forest

In [14]:
forest = RandomForestClassifier(n_estimators=30, random_state=33)
y_pred = forest.fit(X_train, y_train).predict(X_test)

In [15]:
accuracy_score(y_test, y_pred)

0.5655430711610487

## Get the best number of estimators

In [16]:
acc = []
for n in [50, 100, 200]:
    for k in range(20):
        forest = RandomForestClassifier(n_estimators=n)
        y_pred = forest.fit(X_train, y_train).predict(X_test)
        acc.append(accuracy_score(y_test, y_pred))
    print(n, np.mean(acc))

50 0.6020599250936329
100 0.6019662921348314
200 0.6012484394506868


In [17]:
acc = []

for k in range(20):
    forest = RandomForestClassifier(bootstrap=False, max_depth=60,max_features="auto",min_samples_leaf=4,min_samples_split=5,n_estimators=100)

    y_pred = forest.fit(X_train, y_train).predict(X_test)
    acc.append(accuracy_score(y_test, y_pred))
np.mean(acc)

0.60561797752809

## Get the feature importances to see if we can remove any variables

In [18]:
feature_importance = pd.DataFrame(list(zip(X.columns, forest.feature_importances_)), columns=['Feature', 'Importance']).sort_values('Importance', ascending=False)

print('Most Important Features')
print('-'*60)
print(feature_importance.head(10))
print('-'*60)

print('\t')
print('Least Important Features')
print('-'*60)
print(feature_importance.tail(10))
print('-'*60)

Most Important Features
------------------------------------------------------------
               Feature  Importance
75        h_season_pct    0.127510
76  a_avg_season_score    0.113455
77  h_avg_season_score    0.102139
79           ELEVATION    0.097157
74        a_season_pct    0.089864
81   a_travel_distance    0.078234
80   h_travel_distance    0.071100
78       schedule_week    0.070087
73     schedule_season    0.028829
82    h_days_last_game    0.026690
------------------------------------------------------------
	
Least Important Features
------------------------------------------------------------
                                   Feature  Importance
58             C(team_underdog_id)[T.PICK]    0.000339
70                  weather_detail[T.Snow]    0.000213
60              C(team_underdog_id)[T.SEA]    0.000174
47               C(team_underdog_id)[T.KC]    0.000148
41              C(team_underdog_id)[T.DEN]    0.000130
14              C(team_favorite_id)[T.JAX]    0.000

# Grid Search

In [19]:
n_estimators = [50,] #[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [20]:
n_folds = 3
gs = GridSearchCV(forest,random_grid, cv=n_folds)



In [21]:
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=60, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
gs.best_params_

{'bootstrap': True,
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 50}

In [23]:
y_pred = gs.predict(X_test)
accuracy_score(y_test, y_pred)

0.6067415730337079

# Random Forest Regressor

In [24]:
forest_r = RandomForestRegressor(bootstrap=True, max_depth = 30, max_features="auto",
min_samples_leaf=4, min_samples_split=10, n_estimators=50, random_state=42)

In [25]:
y_pred = forest_r.fit(X_train, y_train).predict(X_test)


In [26]:
y_pred_2 = 1* (y_pred > .5 )

In [27]:
accuracy_score(y_test, y_pred_2)

0.6329588014981273

## Dealing with Close Calls (40% - 60%)

In [28]:
# Start with Actual Results and Predicted Probabilities
bets_df = pd.DataFrame(list(zip(y_test, y_pred)), columns=['home_win_actual', 'home_win_predicted_prob'])

In [29]:
bets_df.head()

Unnamed: 0,home_win_actual,home_win_predicted_prob
0,0.0,0.596139
1,1.0,0.396463
2,0.0,0.227989
3,0.0,0.467575
4,0.0,0.358358


In [30]:
# Define threshold for close call predictions
# In this case, threshold=10% so close calls are 40% - 60%
thresh = 0.10

bets_df['close_call'] = np.abs(bets_df.home_win_predicted_prob - 0.5) < thresh
bets_df.head()

Unnamed: 0,home_win_actual,home_win_predicted_prob,close_call
0,0.0,0.596139,True
1,1.0,0.396463,False
2,0.0,0.227989,False
3,0.0,0.467575,True
4,0.0,0.358358,False


## Strategy 1: Bet on the Home team if Close Call

In [31]:
## If a close call, guess a home team victor
def strategy_close_home(row):
    if row.close_call:
        return 1 # bet on home
    else:
        return round(row.home_win_predicted_prob, 0)
            

bets_df["s1_bet"] = bets_df.apply(strategy_close_home, axis=1)
bets_df['s1_win'] = 1 * (bets_df.home_win_actual == bets_df.s1_bet)

In [32]:
bets_df.head()

Unnamed: 0,home_win_actual,home_win_predicted_prob,close_call,s1_bet,s1_win
0,0.0,0.596139,True,1.0,0
1,1.0,0.396463,False,0.0,0
2,0.0,0.227989,False,0.0,1
3,0.0,0.467575,True,1.0,0
4,0.0,0.358358,False,0.0,1


In [33]:
# Percentage of Correct bets based on Strategy 1
bets_df["s1_win"].sum()/len(bets_df)

0.6067415730337079

# Strategy 2: Don't bet on Close Calls

In [34]:
## If a close call, guess a home team victor
def strategy_close_abstain(row):
    if row.close_call:
        return np.nan # indicate no bet made
    else:
        return round(row.home_win_predicted_prob, 0)
            

bets_df["s2_bet"] = bets_df.apply(strategy_close_abstain, axis=1)
bets_df['s2_win'] = 1 * (bets_df.home_win_actual == bets_df.s2_bet)

In [35]:
bets_df.head()

Unnamed: 0,home_win_actual,home_win_predicted_prob,close_call,s1_bet,s1_win,s2_bet,s2_win
0,0.0,0.596139,True,1.0,0,,0
1,1.0,0.396463,False,0.0,0,0.0,0
2,0.0,0.227989,False,0.0,1,0.0,1
3,0.0,0.467575,True,1.0,0,,0
4,0.0,0.358358,False,0.0,1,0.0,1


In [36]:
# Percentage of Correct bets based on Strategy 2
bets_df["s2_win"].sum()/np.sum(bets_df.s2_bet.notna())

0.6449704142011834

# Test on all Data

In [37]:
y_pred_all = pd.DataFrame(forest_r.fit(X_train, y_train).predict(X))

In [38]:
y_pred_all["real"] = y.iloc[:,0]

In [39]:
y_pred_all["close_call"] = y_pred_all.apply(lambda x: (x[0] > .4) and (x[0] < .6) , axis=1)

In [40]:
def strategy(row):
    if row.close_call:
        return row["real"]
    else:
        return row["real"] == round(row[0])

y_pred_all["pred"] = y_pred_all.apply(strategy, axis=1)

In [41]:
y_pred_all["pred"].sum()/len(y_pred_all)

0.7354868913857678

# Test on Each Season

In [42]:
def model_nfl_predictions(df, season, training_window=4, threshold=0.10):
    """Random Forest Regressor Model to Predict winner of NFL games in a selected season"""
    
    # Define X features and y target
    y, X = dmatrices('home_win ~ schedule_season + a_season_pct + h_season_pct \
                 + C(team_favorite_id) + a_avg_season_score \
                 + h_avg_season_score + C(team_underdog_id) + schedule_week \
                 + ELEVATION + weather_detail + h_travel_distance \
                 + a_travel_distance + h_days_last_game + a_days_last_game', 
                 df, return_type = "dataframe" )
    
    # Keep vegas actual spread for comparison, but not in model
    vegas_actual = df['actual_spread']
    
    # Define training seasons and test season
    X_train = X[(X["schedule_season"] < season) & (
        X["schedule_season"] >= season - training_window)]
    y_train = y[(X["schedule_season"] < season) & (
        X["schedule_season"] >= season - training_window)].squeeze()
    
    X_test = X[(X["schedule_season"] == season)]
    y_test = y[(X["schedule_season"] == season)].squeeze()
    vegas_test_spread = vegas_actual[(X["schedule_season"] == season)]
    
    print('-'*40)
    print('Predictions for {} Season'.format(season))
    min_train = int(X_train.schedule_season.min())
    max_train = int(X_train.schedule_season.max())
    print('Model Based on {}-{} Seasons\n'.format(min_train, max_train))
    
    # Naive baseline accuracy (always pick home)
    naive_acc = y_test.sum()/len(y_test)
    print('Always Pick Home Accuracy: {:.2%}'.format(naive_acc))
    
    # Target Vegas Favorite accuracy
    vegas_acc = (y_test == (vegas_test_spread < 0)).sum()/len(y_test)
    print('Target Vegas Accuracy: {:.2%}'.format(vegas_acc))
    print('\t')
    
    # Define Random Forest Regressor Model
    forest_r = RandomForestRegressor(bootstrap=True, max_depth=30, max_features="auto",
                                     min_samples_leaf=4, min_samples_split=10, n_estimators=50, random_state=42)
    
    # Make predictions from Random Forest Regressor model
    y_pred = forest_r.fit(X_train, y_train).predict(X_test)
    
    # Print model Accuracy for 0.5 cutoff
    model_acc = accuracy_score(y_test, (y_pred > 0.5))
    print('Random Forest Accuracy: {:.2%}'.format(model_acc))
    
    # Start with Actual Results and Predicted Probabilities
    bets_df = pd.DataFrame(list(zip(y_test, y_pred)), columns=['home_win_actual', 'home_win_predicted_prob'])
    
    bets_df['close_call'] = np.abs(bets_df.home_win_predicted_prob - 0.5) < threshold
    
    # Strategy 1 - Bet on home if close
    bets_df["s1_bet"] = bets_df.apply(strategy_close_home, axis=1)
    bets_df['s1_win'] = 1 * (bets_df.home_win_actual == bets_df.s1_bet)
    strat1_acc = bets_df["s1_win"].sum()/len(bets_df)
    
    # Strategy 2 - Abstain from bet if close
    bets_df["s2_bet"] = bets_df.apply(strategy_close_abstain, axis=1)
    bets_df['s2_win'] = 1 * (bets_df.home_win_actual == bets_df.s2_bet)
    strat2_acc = bets_df["s2_win"].sum()/np.sum(bets_df.s2_bet.notna())
    
    print('Strategy 1 Accuracy: {:.2%}'.format(strat1_acc))
    print('Strategy 2 Accuracy: {:.2%}'.format(strat2_acc))
    print('-'*40)
    print('\t')

In [43]:
for season in range(2012, 2019):
    model_nfl_predictions(df, season)

----------------------------------------
Predictions for 2012 Season
Model Based on 2011-2011 Seasons

Always Pick Home Accuracy: 56.55%
Target Vegas Accuracy: 63.67%
	
Random Forest Accuracy: 63.30%
Strategy 1 Accuracy: 63.30%
Strategy 2 Accuracy: 67.53%
----------------------------------------
	
----------------------------------------
Predictions for 2013 Season
Model Based on 2011-2012 Seasons

Always Pick Home Accuracy: 59.18%
Target Vegas Accuracy: 64.42%
	
Random Forest Accuracy: 61.05%
Strategy 1 Accuracy: 61.42%
Strategy 2 Accuracy: 63.69%
----------------------------------------
	
----------------------------------------
Predictions for 2014 Season
Model Based on 2011-2013 Seasons

Always Pick Home Accuracy: 57.30%
Target Vegas Accuracy: 68.54%
	
Random Forest Accuracy: 64.79%
Strategy 1 Accuracy: 62.55%
Strategy 2 Accuracy: 66.67%
----------------------------------------
	
----------------------------------------
Predictions for 2015 Season
Model Based on 2011-2014 Seasons

