In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Per-game scores and betting lines.
spread = pd.read_csv('../Data/spreadspoke_scores.csv')
#stadiums = pd.read_csv('https://raw.githubusercontent.com/mauzeyj/machine_learning_club/master/Data/nfl_stadiums.csv')

# Team metadata, keyed by team name so it can be joined onto the
# `team_home` / `team_away` columns of `spread`.
teams = pd.read_csv('../Data/nfl_teams.csv').set_index('team_name')

Aggregate datasets, add new data points
----

In [3]:
# Attach the team metadata twice: once keyed on the home team, once on the
# away team. On the second join every team column collides with the copy
# added by the first, so the suffixes label which side each copy belongs to.
joined = (
    spread
    .join(teams, on='team_home')
    .join(teams, on='team_away', lsuffix='_home', rsuffix='_away')
)

In [4]:
# The teams table carries both a current and a "pre2002" division/conference.
# For each side and attribute, pick the one that applies to the game's season.
is_pre_2002 = joined['schedule_season'] < 2002
for side in ('away', 'home'):
    for attribute in ('division', 'conference'):
        joined[f'team_{attribute}_{side}_season'] = np.where(
            is_pre_2002,
            joined[f'team_{attribute}_pre2002_{side}'],
            joined[f'team_{attribute}_{side}'],
        )

In [5]:
# Realised margin as away score minus home score, so a negative value means
# the home team outscored the away team.
joined['home_spread_actual'] = joined['score_away'].sub(joined['score_home'])
# True when the home team won; ties and rows with missing scores are False.
joined['home_wins_actual'] = joined['home_spread_actual'].lt(0)

def home_spread(game):
    """Return the betting spread from the home team's point of view.

    Parameters
    ----------
    game : mapping or pandas.Series
        One game row providing ``team_favorite_id``, ``team_id_home`` and
        ``spread_favorite``.

    Returns
    -------
    The favourite's spread unchanged when the home team is the favourite,
    the spread with its sign flipped when it is not, or ``None`` when the
    favourite is unknown.
    """
    favorite = game['team_favorite_id']

    # A missing favourite arrives from read_csv as a real NaN (or None), which
    # never compares equal to the string 'NaN'. Test for missingness
    # explicitly, and keep accepting the literal string for backward
    # compatibility.
    if pd.isna(favorite) or favorite == 'NaN':
        return None
    if favorite == game['team_id_home']:
        return game['spread_favorite']
    # The favourite is not the home team: express the line relative to home.
    return game['spread_favorite'] * -1
    
# Evaluate home_spread once per game row (axis='columns' hands each row to the function).
joined['home_spread'] = joined.apply(home_spread, axis='columns')

In [6]:
# Realised total points: the value the over/under line is compared against.
joined['over_under_line_actual'] = joined['score_home'].add(joined['score_away'])

In [7]:
# Treat over_under_line and schedule_week as continuous values.

# Keep only games that have an over/under line: a missing line shows up either
# as NaN or as a single-space string.
line_present = joined['over_under_line'].notna() & (joined['over_under_line'] != ' ')
# Take an explicit copy so the column assignments below write to an independent
# frame instead of a filtered slice of `joined` (avoids SettingWithCopyWarning).
cleaned = joined.loc[line_present].copy()
cleaned['over_under_line'] = cleaned['over_under_line'].astype('float')

# Playoff rounds are labelled by name, with inconsistent capitalisation.
# Map them to 19-22 so the whole column can be cast to int.
playoff_week_numbers = {
    'Wildcard': 19, 'WildCard': 19,
    'Division': 20,
    'Conference': 21,
    'Superbowl': 22, 'SuperBowl': 22,
}
cleaned['schedule_week'] = cleaned['schedule_week']\
                    .replace(playoff_week_numbers)\
                    .astype('int')


Prep dataset for ML
---

In [8]:
# Columns carried over from `cleaned`: model inputs plus the three realised
# outcomes ('home_spread_actual', 'home_wins_actual', 'over_under_line_actual')
# that later cells use as targets.
candidate_columns = [
    'schedule_season', 'schedule_week', 'schedule_playoff', 'team_home',
    'team_away', 'over_under_line', 'stadium', 'stadium_neutral',
    'weather_temperature', 'weather_wind_mph', 'weather_humidity',
    'weather_detail', 'team_division_away_season',
    'team_conference_away_season', 'team_division_home_season',
    'team_conference_home_season', 'home_spread_actual', 'home_wins_actual',
    'home_spread', 'over_under_line_actual',
]
# Columns deliberately left out of the model matrix.
not_important = ['stadium','weather_humidity']

features = (
    cleaned[candidate_columns]
    .drop(columns=not_important)
    # One-hot encode the text columns; dummy_na=True adds an indicator column
    # for missing values in each encoded column.
    .pipe(pd.get_dummies, dummy_na=True)
    # Rows still holding a NaN in a non-encoded column are discarded.
    .dropna()
)



Classifier
------

In [13]:
def runClassifier(c, X, y, test_size=0.3, random_state=0):
    """Fit a classifier on a train/test split and print its accuracy.

    Parameters
    ----------
    c : tuple
        ``(estimator, display_name)`` pair; the estimator must provide
        ``fit`` and ``predict``.
    X : pandas.DataFrame
        Feature matrix; its column names label the feature importances.
    y : array-like
        Target labels aligned with ``X``.
    test_size : float, default 0.3
        Fraction of rows held out for evaluation.
    random_state : int or None, default 0
        Seed for the train/test split so the printed accuracy is reproducible
        between runs. Pass ``None`` for a different random split on each call.

    Returns
    -------
    None. Results are printed.
    """
    clf, name = c
    # Split dataset into training set and test set (seeded for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("=======================")
    print(name)
    print("=======================")
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    # Only some estimators (e.g. tree ensembles) expose feature importances.
    if hasattr(clf, 'feature_importances_'):
        feature_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
        print("Features:\n", feature_imp.head(10))

In [14]:
# Realised outcomes must not be fed to the model as inputs.
outcome_columns = ['over_under_line_actual', 'home_spread_actual', 'home_wins_actual']
X = features.drop(outcome_columns, axis=1)
# Target: did the home team win?
y = features.loc[:, 'home_wins_actual']

classifiers = [
    (RandomForestClassifier(n_estimators=100), "Random Forest"),
    (GaussianNB(), "Naive Bayes"),
]

for model_and_label in classifiers:
    runClassifier(model_and_label, X, y)


Random Forest
Accuracy: 0.6415683553088133
Features:
 home_spread                           0.170822
schedule_season                       0.094685
over_under_line                       0.094332
schedule_week                         0.084609
weather_temperature                   0.077250
weather_wind_mph                      0.063245
team_division_away_season_AFC East    0.007896
team_conference_away_season_NFC       0.007395
team_conference_away_season_AFC       0.007358
team_division_away_season_NFC West    0.007093
dtype: float64
Naive Bayes
Accuracy: 0.5392088827203331


Regressor
------

In [15]:
def runRegressor(X, y, test_size=0.3, random_state=0, max_depth=8, n_estimators=100):
    """Fit a random-forest regressor on a train/test split and print its R^2.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the feature importances.
    y : array-like
        Continuous target aligned with ``X``.
    test_size : float, default 0.3
        Fraction of rows held out for evaluation.
    random_state : int or None, default 0
        Seed used for both the train/test split and the forest. Previously
        only the forest was seeded, so the score still changed on every run.
    max_depth : int, default 8
        Maximum depth of each tree.
    n_estimators : int, default 100
        Number of trees in the forest.

    Returns
    -------
    None. Results are printed.
    """
    # Split dataset into training set and test set (seeded for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    regr = RandomForestRegressor(max_depth=max_depth, random_state=random_state,
                                 n_estimators=n_estimators)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    print("r2:", metrics.r2_score(y_test, y_pred))
    feature_imp = pd.Series(regr.feature_importances_, index=X.columns).sort_values(ascending=False)
    print("features:\n", feature_imp.head(10))

In [16]:
# Predict the realised home spread; the realised outcomes are excluded from the inputs.
outcome_columns = ['over_under_line_actual', 'home_spread_actual', 'home_wins_actual']
X = features.drop(outcome_columns, axis=1)
y = features.loc[:, 'home_spread_actual']
runRegressor(X, y)

r2: 0.14809131556249244
features:
 home_spread                      0.580275
over_under_line                  0.052191
weather_temperature              0.047006
schedule_season                  0.041404
schedule_week                    0.035850
weather_wind_mph                 0.029098
team_home_Kansas City Chiefs     0.008262
team_away_New Orleans Saints     0.005481
weather_detail_Rain | Fog        0.004385
team_home_Pittsburgh Steelers    0.003962
dtype: float64


In [17]:
# Predict the realised total points; the realised outcomes are excluded from the inputs.
outcome_columns = ['over_under_line_actual', 'home_spread_actual', 'home_wins_actual']
X = features.drop(outcome_columns, axis=1)
y = features.loc[:, 'over_under_line_actual']
runRegressor(X, y)

r2: 0.07592947380264825
features:
 over_under_line                 0.408695
weather_temperature             0.066390
weather_wind_mph                0.064372
home_spread                     0.064022
schedule_season                 0.052622
schedule_week                   0.049474
team_away_Kansas City Chiefs    0.011344
team_away_Indianapolis Colts    0.006619
team_away_Green Bay Packers     0.006489
team_home_Cincinnati Bengals    0.006267
dtype: float64
