# Predicting NHL Wins

### IMPORTS

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [184]:
import itertools
from mlxtend.plotting import plot_decision_regions
from mlxtend.classifier import StackingClassifier # <-- note: this is not from sklearn!


from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor, GradientBoostingClassifier)

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import svm

### Importing Data

In [3]:
nhl_data = pd.read_csv('nhl_win_loss.csv')

In [5]:
nhl_data.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
nhl_data.head()

Unnamed: 0,homedivision,hometeam,date,awayteam,homeresult,datetime,homefivegoalsfor,homefivegoalsagainst,hometengoalsfor,hometengoalsagainst,...,tengamegoalfordiff,fivegamegoalfordiff,tengamegoalagainstdiff,fivegamegoalagainstdiff,homefivegoaldiff,hometengoaldiff,awayfivegoaldiff,awaytengoaldiff,samedivision,win
0,Atlantic,Tampa Bay,12/27/18,Philadelphia,Win,2018-12-27,24.0,15.0,51.0,27.0,...,23.0,12.0,-10.0,1.0,9.0,24.0,-2.0,-9.0,0,1
1,Atlantic,Tampa Bay,12/29/18,Montreal,Win,2018-12-29,26.0,19.0,52.0,31.0,...,20.0,14.0,1.0,6.0,7.0,21.0,-1.0,2.0,1,1
2,Atlantic,Tampa Bay,1/8/19,Columbus,Win,2019-01-08,22.0,18.0,46.0,33.0,...,17.0,4.0,10.0,1.0,4.0,13.0,1.0,6.0,0,1
3,Atlantic,Tampa Bay,1/10/19,Carolina,Win,2019-01-10,20.0,13.0,46.0,32.0,...,18.0,-1.0,4.0,0.0,7.0,14.0,8.0,0.0,0,1
4,Atlantic,Tampa Bay,1/17/19,Toronto,Loss,2019-01-17,15.0,9.0,37.0,27.0,...,6.0,1.0,-5.0,-6.0,6.0,10.0,-1.0,-1.0,1,0


### Creating a Numeric DataFrame

In [7]:
nhl_num = nhl_data.drop(['homedivision', 'awaydivision', 'hometeam', 'awayteam', 'date', 'homeresult', 'datetime', 'awayresult'], axis=1)

In [8]:
nhl_num.drop(['homelastfiveloss', 'homelastfiveties', 'homelasttenloss', 'homelasttenties', 
             'awaylastfiveloss', 'awaylastfiveties', 'awaylasttenloss', 'awaylasttenties'], axis=1, inplace=True)

In [9]:
nhl_num_ten = nhl_num.copy()

In [10]:
nhl_num_ten.drop(['homefivegoalsfor', 'homefivegoalsagainst', 'homelastfivewins', 'homefivewinpct', 
                  'awayfivegoalsfor', 'awayfivegoalsagainst', 'awaylastfivewins', 'awayfivewinpct', 
                  'fivegamegoalfordiff', 'fivegamegoalagainstdiff', 'homefivegoaldiff', 'awayfivegoaldiff'], axis=1, inplace=True)

In [11]:
nhl_num_ten['tengoaldiff'] = nhl_num_ten['hometengoaldiff'] - nhl_num_ten['awaytengoaldiff']

In [12]:
nhl_num_ten

Unnamed: 0,hometengoalsfor,hometengoalsagainst,homelasttenwins,hometenwinpct,awaytengoalsfor,awaytengoalsagainst,awaylasttenwins,awaytenwinpct,tenwinpctdiff,tengamegoalfordiff,tengamegoalagainstdiff,hometengoaldiff,awaytengoaldiff,samedivision,win,tengoaldiff
0,51.0,27.0,9.0,0.9,28.0,37.0,4.0,0.4,0.5,23.0,-10.0,24.0,-9.0,0,1,33.0
1,52.0,31.0,9.0,0.9,32.0,30.0,7.0,0.7,0.2,20.0,1.0,21.0,2.0,1,1,19.0
2,46.0,33.0,8.0,0.8,29.0,23.0,7.0,0.7,0.1,17.0,10.0,13.0,6.0,0,1,7.0
3,46.0,32.0,8.0,0.8,28.0,28.0,6.0,0.6,0.2,18.0,4.0,14.0,0.0,0,1,14.0
4,37.0,27.0,8.0,0.8,31.0,32.0,5.0,0.5,0.3,6.0,-5.0,10.0,-1.0,1,0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,22.0,36.0,2.0,0.2,33.0,32.0,6.0,0.6,-0.4,-11.0,4.0,-14.0,1.0,1,1,-15.0
772,25.0,32.0,3.0,0.3,33.0,30.0,6.0,0.6,-0.3,-8.0,2.0,-7.0,3.0,1,1,-10.0
773,28.0,32.0,4.0,0.4,28.0,22.0,6.0,0.6,-0.2,0.0,10.0,-4.0,6.0,0,1,-10.0
774,29.0,30.0,5.0,0.5,39.0,20.0,7.0,0.7,-0.2,-10.0,10.0,-1.0,19.0,1,0,-20.0


### Model Prep

In [257]:
X = nhl_num_ten.drop(['win', 'homelasttenwins', 'awaylasttenwins', 'samedivision', 'hometenwinpct', 'awaytenwinpct', 
                     'hometengoalsfor', 'hometengoalsagainst', 'awaytengoalsfor', 'awaytengoalsagainst'], axis=1)
y = nhl_num_ten['win']

In [258]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [254]:
X = nhl_num_ten.drop('win', axis=1)

### Baseline Model

Using Accuracy as Main Scoring Metric

In [88]:
y_train.shape

(620,)

In [256]:
lr = LogisticRegression()

lr.fit(X_train, y_train)
lr.score(X_train, [1]*620)

0.5967741935483871

In [241]:
roc_auc_score(y_train, [1]*620)

0.5

Baseline Accuracy Score (the score to beat): .596

### Testing Multiple Models

In [39]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
kf = KFold(n_splits=5, shuffle=True)


In [259]:
knn = KNeighborsClassifier()
lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gnb = GaussianNB()
sv = svm.SVC()
xgb = GradientBoostingClassifier()


models = [knn, lr, dt, rf, gnb, sv, xgb]
for model in models:
    print(model, cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc'), 
          'mean=', cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean())

KNeighborsClassifier() [0.52672751 0.47327249 0.53585398 0.47431551 0.44934896] mean= 0.4919036899717514
LogisticRegression() [0.57809648 0.45736636 0.63389831 0.56036506 0.49817708] mean= 0.5455806578661452
DecisionTreeClassifier() [0.4988266  0.4065189  0.52503259 0.44185137 0.49114583] mean= 0.46624497501086487
RandomForestClassifier() [0.49243807 0.42933507 0.50704042 0.46049544 0.43372396] mean= 0.4452333564211212
GaussianNB() [0.61147327 0.51473272 0.63780965 0.56297262 0.51380208] mean= 0.5681580698609301
SVC() [0.57340287 0.47353325 0.64667536 0.57079531 0.46692708] mean= 0.54626677259887
GradientBoostingClassifier() [0.47574967 0.46558018 0.50495437 0.5041721  0.42513021] mean= 0.47787894529552366


In [260]:
knn = KNeighborsClassifier()
lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gnb = GaussianNB()
sv = svm.SVC()
xgb = GradientBoostingClassifier(n_estimators=100)

models = [knn, lr, dt, rf, gnb, sv, xgb]
for model in models:
    print(model, cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy'), 
          'mean=', cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean())

KNeighborsClassifier() [0.53225806 0.45967742 0.5483871  0.53225806 0.46774194] mean= 0.5080645161290323
LogisticRegression() [0.57258065 0.4516129  0.62096774 0.55645161 0.51612903] mean= 0.5435483870967741
DecisionTreeClassifier() [0.49193548 0.4516129  0.5        0.43548387 0.46774194] mean= 0.4725806451612904
RandomForestClassifier() [0.47580645 0.4516129  0.5        0.46774194 0.45967742] mean= 0.4629032258064516
GaussianNB() [0.57258065 0.54032258 0.60483871 0.56451613 0.53225806] mean= 0.5629032258064516
SVC() [0.58064516 0.48387097 0.60483871 0.50806452 0.49193548] mean= 0.5338709677419355
GradientBoostingClassifier() [0.49193548 0.48387097 0.47580645 0.51612903 0.47580645] mean= 0.4935483870967742


At first glance, none of the models are better than the baseline in any form

Trying both logistic regression and random forests with hyperparameter tuning involved

In [261]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=30)

In [212]:
lr = LogisticRegression()
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
lr.fit(X_train_scaled, y_train)

LogisticRegression()

In [215]:
lr.predict_proba(X_val_scaled)

array([[0.36879143, 0.63120857],
       [0.5197494 , 0.4802506 ],
       [0.38257227, 0.61742773],
       [0.42347668, 0.57652332],
       [0.40646439, 0.59353561],
       [0.60404349, 0.39595651],
       [0.34482894, 0.65517106],
       [0.33996504, 0.66003496],
       [0.51393972, 0.48606028],
       [0.55304414, 0.44695586],
       [0.39981495, 0.60018505],
       [0.55994711, 0.44005289],
       [0.4964427 , 0.5035573 ],
       [0.4293287 , 0.5706713 ],
       [0.30630403, 0.69369597],
       [0.51978537, 0.48021463],
       [0.33172524, 0.66827476],
       [0.44502362, 0.55497638],
       [0.34424165, 0.65575835],
       [0.33870015, 0.66129985],
       [0.41554978, 0.58445022],
       [0.4152737 , 0.5847263 ],
       [0.52636588, 0.47363412],
       [0.30574726, 0.69425274],
       [0.34890693, 0.65109307],
       [0.41707562, 0.58292438],
       [0.36427322, 0.63572678],
       [0.34872001, 0.65127999],
       [0.3279789 , 0.6720211 ],
       [0.5484739 , 0.4515261 ],
       [0.

In [262]:
rf = RandomForestClassifier(n_estimators=300, min_samples_leaf=18, max_features=2, max_depth=12)
rf.fit(X_train, y_train)

rf.feature_importances_


array([0.10704517, 0.18346227, 0.16750762, 0.12923898, 0.22649978,
       0.18624619])

In [263]:
X_train.columns

Index(['tenwinpctdiff', 'tengamegoalfordiff', 'tengamegoalagainstdiff',
       'hometengoaldiff', 'awaytengoaldiff', 'tengoaldiff'],
      dtype='object')

In [264]:
for i in range(6):
    col=X_train.columns[i]
    importance=rf.feature_importances_[i]
    print(col, importance)

tenwinpctdiff 0.10704516749738129
tengamegoalfordiff 0.1834622743280888
tengamegoalagainstdiff 0.16750761501410935
hometengoaldiff 0.1292389783925344
awaytengoaldiff 0.22649977957276074
tengoaldiff 0.1862461851951254


In [217]:
y_val.shape

(124,)

In [267]:
accuracy_score(y_val, rf.predict(X_val))

0.4838709677419355

In [265]:
confusion_matrix(y_val, rf.predict(X_val))

array([[27, 37],
       [27, 33]])

##### Even after trying to tune lots of parameters, these models are still struggling
##### Going to try a new prediction next