In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [3]:
# Per-game table (scores, betting columns, weather) used as the base of the analysis.
spread = pd.read_csv('../Data/spreadspoke_scores.csv')
#stadiums = pd.read_csv('https://raw.githubusercontent.com/mauzeyj/machine_learning_club/master/Data/nfl_stadiums.csv')
# Team reference table, indexed by full team name so it can be joined onto the games.
teams = pd.read_csv('../Data/nfl_teams.csv').set_index('team_name')

In [4]:
# Attach the team reference columns twice: first for the home team, then for
# the away team. The second join sees the same column names on both sides, so
# the already-attached (home) copies get '_home' and the new ones get '_away'.
with_home_team = spread.join(teams, on='team_home')
joined = with_home_team.join(teams, on='team_away', lsuffix='_home', rsuffix='_away')



In [5]:
# For seasons before 2002 use the '*_pre2002_*' division/conference columns,
# otherwise use the current ones. Each tuple is (target, pre-2002 source, current source).
is_pre_2002 = joined['schedule_season'] < 2002
season_column_sources = [
    ('team_division_away_season', 'team_division_pre2002_away', 'team_division_away'),
    ('team_conference_away_season', 'team_conference_pre2002_away', 'team_conference_away'),
    ('team_division_home_season', 'team_division_pre2002_home', 'team_division_home'),
    ('team_conference_home_season', 'team_conference_pre2002_home', 'team_conference_home'),
]
for target_column, pre_2002_column, current_column in season_column_sources:
    joined[target_column] = np.where(is_pre_2002, joined[pre_2002_column], joined[current_column])


In [6]:
# Realised margin expressed like a spread for the home team: away score minus
# home score, so a negative value means the home team outscored the visitors.
joined['home_spread_actual'] = joined['score_away'].sub(joined['score_home'])
# Strictly negative margin == home win (a tie counts as "not a home win").
joined['home_wins_actual'] = joined['home_spread_actual'].lt(0)

def home_spread(game):
    """Return the favorite's spread re-expressed from the home team's side.

    Parameters
    ----------
    game : mapping or pandas.Series
        A single game row providing 'team_favorite_id', 'team_id_home' and
        'spread_favorite'.

    Returns
    -------
    number or None
        None when no favorite is recorded; 'spread_favorite' unchanged when the
        favorite is the home team; otherwise 'spread_favorite' with its sign
        flipped.
    """
    favorite = game['team_favorite_id']
    # A missing favorite is a float NaN (or None), which never compares equal
    # to the string 'NaN', so test for missingness explicitly. The literal
    # 'NaN' string is still accepted for backward compatibility.
    if pd.isna(favorite) or favorite == 'NaN':
        return None
    if favorite == game['team_id_home']:
        return game['spread_favorite']
    return game['spread_favorite'] * -1
    
# Evaluate home_spread once per game (row-wise) and store the result as a column.
home_spread_per_game = joined.apply(home_spread, axis=1)
joined['home_spread'] = home_spread_per_game

In [7]:
# Realised total points of the game (home + away), the counterpart of 'over_under_line'.
joined['over_under_line_actual'] = joined['score_home'].add(joined['score_away'])

In [91]:
# Keep only games with an over/under line: drop rows where it is the single
# blank string ' ' or missing. The explicit .copy() makes `cleaned` an
# independent frame, so the column assignments below do not write into a slice
# of `joined` (which would raise SettingWithCopyWarning).
has_over_under_line = joined['over_under_line'] != ' '
cleaned = joined[has_over_under_line].dropna(subset=['over_under_line']).copy()
cleaned['over_under_line'] = cleaned['over_under_line'].astype('float')

# Playoff rounds are stored as labels (in two capitalisations); map them onto
# week numbers 19-22 so 'schedule_week' can be cast to an integer.
playoff_week_numbers = {
    'Wildcard': 19,
    'WildCard': 19,
    'Division': 20,
    'Conference': 21,
    'Superbowl': 22,
    'SuperBowl': 22,
}
cleaned['schedule_week'] = cleaned['schedule_week'].replace(playoff_week_numbers).astype('int')


In [92]:
# Columns excluded from the model matrix.
not_important = ['stadium','weather_humidity']

# Candidate columns: model inputs plus the three '*_actual' target columns,
# which the modelling cells split off later.
candidate_columns = [
    'schedule_season', 'schedule_week', 'schedule_playoff', 'team_home',
    'team_away', 'over_under_line', 'stadium', 'stadium_neutral',
    'weather_temperature', 'weather_wind_mph', 'weather_humidity',
    'weather_detail', 'team_division_away_season',
    'team_conference_away_season', 'team_division_home_season',
    'team_conference_home_season', 'home_spread_actual', 'home_wins_actual',
    'home_spread', 'over_under_line_actual',
]

# Select the columns, drop the excluded ones, one-hot encode the categorical
# columns (with an extra indicator column for missing values), then discard
# any row that still contains a NaN.
selected = cleaned[candidate_columns].drop(columns=not_important)
encoded = pd.get_dummies(selected, dummy_na=True)
features = encoded.dropna()

In [110]:
# Inputs: every feature column except the three realised-outcome columns
# (they are computed from the final score and would leak the answer).
X = features.drop(columns=['over_under_line_actual', 'home_spread_actual', 'home_wins_actual'])
# Target: did the home team win?
y = features['home_wins_actual']

# Split dataset into training set and test set.
# random_state is fixed so the split (and the reported score) is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Seed the forest as well; otherwise accuracy and importances change on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Feature importances of the fitted forest, largest first.
feature_imp = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)
feature_imp

Accuracy: 0.628383067314365


home_spread                              0.175868
schedule_season                          0.097259
over_under_line                          0.095099
schedule_week                            0.085644
weather_temperature                      0.077967
weather_wind_mph                         0.066529
team_division_away_season_AFC East       0.007975
weather_detail_nan                       0.007025
team_division_away_season_AFC West       0.006849
team_division_away_season_NFC East       0.006825
team_division_away_season_NFC West       0.006700
team_conference_away_season_AFC          0.006680
team_division_home_season_AFC East       0.006632
team_conference_away_season_NFC          0.006544
team_division_home_season_NFC West       0.006390
team_division_home_season_AFC West       0.006341
team_division_home_season_NFC East       0.005838
weather_detail_DOME                      0.005487
team_division_home_season_NFC Central    0.005412
team_division_away_season_NFC Central    0.005390


In [111]:
# Inputs: every feature column except the three realised-outcome columns
# (they are computed from the final score and would leak the answer).
X = features.drop(columns=['over_under_line_actual', 'home_spread_actual', 'home_wins_actual'])
# Target: realised margin (away score minus home score).
y = features['home_spread_actual']

# Split dataset into training set and test set.
# random_state is fixed so the split (and the reported score) is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
regr = RandomForestRegressor(max_depth=15, random_state=0,n_estimators=100)
regr.fit(X_train, y_train)
y_pred=regr.predict(X_test)
# r2_score is the coefficient of determination, not an accuracy, so label it as such.
print("R^2:",metrics.r2_score(y_test, y_pred, multioutput='variance_weighted'))
# Feature importances of the fitted forest, largest first.
feature_imp = pd.Series(regr.feature_importances_,index=X.columns).sort_values(ascending=False)
feature_imp

Accuracy: 0.1366718415864816


home_spread                            3.096473e-01
over_under_line                        7.931254e-02
weather_temperature                    7.296029e-02
schedule_season                        7.059930e-02
schedule_week                          6.216671e-02
weather_wind_mph                       4.755265e-02
team_home_Kansas City Chiefs           6.671670e-03
team_division_away_season_NFC West     6.384410e-03
team_home_Seattle Seahawks             5.930462e-03
team_away_San Francisco 49ers          5.845905e-03
team_away_Green Bay Packers            5.604627e-03
team_division_home_season_NFC West     5.490389e-03
team_away_New England Patriots         5.340588e-03
team_away_New Orleans Saints           5.302914e-03
team_away_Chicago Bears                5.161509e-03
team_division_away_season_AFC West     5.130533e-03
team_division_home_season_AFC West     4.908231e-03
team_home_San Francisco 49ers          4.864344e-03
team_division_away_season_AFC East     4.775646e-03
team_home_Ne