In [2]:
from nhlpy import NHLClient
import csv
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
from ydata_profiling import ProfileReport
import seaborn as sns

In [3]:
%config InteractiveShell.ast_node_interactivity = "all"

In [4]:
df = pd.read_csv('nhl_shot_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,assist1,assist2,goalie,shot_type,shot_class
0,0,2022020023,52,1,giveaway,0,1,5,5,77,6,8480289,0,0,8470860,wrist,shot-on-goal
1,1,2022020023,52,1,hit,0,0,5,5,30,30,8480145,0,0,8470860,slap,shot-on-goal
2,2,2022020023,3,0,shot-on-goal,0,0,5,5,35,30,8479333,0,0,8476945,wrist,shot-on-goal
3,3,2022020023,52,1,hit,0,0,5,5,41,14,8471218,0,0,8470860,wrist,shot-on-goal
4,4,2022020023,52,1,hit,0,0,5,5,46,17,8471218,0,0,8470860,wrist,shot-on-goal


In [5]:
df.drop('Unnamed: 0', axis = 1, inplace=True)
df.head()

Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,assist1,assist2,goalie,shot_type,shot_class
0,2022020023,52,1,giveaway,0,1,5,5,77,6,8480289,0,0,8470860,wrist,shot-on-goal
1,2022020023,52,1,hit,0,0,5,5,30,30,8480145,0,0,8470860,slap,shot-on-goal
2,2022020023,3,0,shot-on-goal,0,0,5,5,35,30,8479333,0,0,8476945,wrist,shot-on-goal
3,2022020023,52,1,hit,0,0,5,5,41,14,8471218,0,0,8470860,wrist,shot-on-goal
4,2022020023,52,1,hit,0,0,5,5,46,17,8471218,0,0,8470860,wrist,shot-on-goal


In [7]:
shots = df.drop(['game_id', 'team_id', 'assist1', 'assist2'], axis=1)
shots.head()

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,goalie,shot_type,shot_class
0,1,giveaway,0,1,5,5,77,6,8480289,8470860,wrist,shot-on-goal
1,1,hit,0,0,5,5,30,30,8480145,8470860,slap,shot-on-goal
2,0,shot-on-goal,0,0,5,5,35,30,8479333,8476945,wrist,shot-on-goal
3,1,hit,0,0,5,5,41,14,8471218,8470860,wrist,shot-on-goal
4,1,hit,0,0,5,5,46,17,8471218,8470860,wrist,shot-on-goal


In [8]:
client = NHLClient()
teams = client.teams.teams_info()
team_abbrs = []
for item in teams:
    abbr = item['abbr']
    team_abbrs.append(abbr)

In [9]:
player_dict = {}

for season in [20222023, 20232024, 20242025]:
    for abbreviation in team_abbrs:
    
        if abbreviation == 'UTA' and season != 20242025:
            abbreviation = 'ARI'

        roster = client.teams.roster(team_abbr=abbreviation, season=season)
        for player in roster['forwards']:
            if player['id'] in player_dict:
                continue
            player_dict[player['id']] = player['firstName']['default'] + ' ' + player['lastName']['default']

        for player in roster['defensemen']:
            if player['id'] in player_dict:
                continue
            player_dict[player['id']] = player['firstName']['default'] + ' ' + player['lastName']['default']

        for player in roster['goalies']:
            if player['id'] in player_dict:
                continue
            player_dict[player['id']] = player['firstName']['default'] + ' ' + player['lastName']['default']

In [10]:
shots['shooter'] = shots['shooter'].replace(player_dict)
shots['goalie'] = shots['goalie'].replace(player_dict)

In [11]:
home_mapping = {}
home_mapping[0] ='Away'
home_mapping[1] = 'Home'

rebound_mapping = {}
rebound_mapping[0] = 'No rebound'
rebound_mapping[1] ='Rebound'

rush_mapping = {}
rush_mapping[0] = 'No rush'
rush_mapping[1] ='Rush'

In [12]:
shots['home'] = shots['home'].replace(home_mapping)
shots['rebound'] = shots['rebound'].replace(rebound_mapping)
shots['rush'] = shots['rush'].replace(rush_mapping)

In [13]:
shots['shooter'] = shots['shooter'].apply(lambda x: 'unknown' if isinstance(x, int) else x)
shots['goalie'] = shots['goalie'].apply(lambda x: 'unknown' if isinstance(x, int) else x)

In [14]:
shots = shots[shots['shooter'] != 'unknown']
shots = shots[shots['goalie'] != 'unknown']

In [15]:
shots.head(30)

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,goalie,shot_type,shot_class
0,Home,giveaway,No rebound,Rush,5,5,77,6,Morgan Barron,Jaroslav Halak,wrist,shot-on-goal
1,Home,hit,No rebound,No rush,5,5,30,30,Neal Pionk,Jaroslav Halak,slap,shot-on-goal
2,Away,shot-on-goal,No rebound,No rush,5,5,35,30,Libor Hajek,Connor Hellebuyck,wrist,shot-on-goal
3,Home,hit,No rebound,No rush,5,5,41,14,Blake Wheeler,Jaroslav Halak,wrist,shot-on-goal
4,Home,hit,No rebound,No rush,5,5,46,17,Blake Wheeler,Jaroslav Halak,wrist,shot-on-goal
5,Home,hit,No rebound,No rush,5,5,81,-27,Nikolaj Ehlers,Jaroslav Halak,wrist,missed-shot
6,Home,missed-shot,No rebound,No rush,5,5,34,30,Neal Pionk,Jaroslav Halak,wrist,shot-on-goal
7,Away,faceoff,No rebound,No rush,4,5,69,2,Vincent Trocheck,Connor Hellebuyck,deflected,shot-on-goal
8,Away,shot-on-goal,Rebound,No rush,4,5,78,5,Chris Kreider,Connor Hellebuyck,wrist,shot-on-goal
9,Away,shot-on-goal,No rebound,No rush,4,5,45,-6,Artemi Panarin,Connor Hellebuyck,wrist,shot-on-goal


In [16]:
# Fix scraper to convert the home and away ID to find the team abbreviation via a dictionary

In [17]:
shots.info()
shots.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 578664 entries, 0 to 586564
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   home          578664 non-null  object
 1   last_play     578664 non-null  object
 2   rebound       578664 non-null  object
 3   rush          578664 non-null  object
 4   home_skaters  578664 non-null  int64 
 5   away_skaters  578664 non-null  int64 
 6   x_coord       578664 non-null  int64 
 7   y_coord       578664 non-null  int64 
 8   shooter       578664 non-null  object
 9   goalie        578664 non-null  object
 10  shot_type     578664 non-null  object
 11  shot_class    578664 non-null  object
dtypes: int64(4), object(8)
memory usage: 57.4+ MB


Unnamed: 0,home_skaters,away_skaters,x_coord,y_coord
count,578664.0,578664.0,578664.0,578664.0
mean,4.869619,4.860102,62.01065,-0.179615
std,0.494607,0.504697,19.031132,19.723196
min,0.0,0.0,0.0,-42.0
25%,5.0,5.0,49.0,-15.0
50%,5.0,5.0,65.0,0.0
75%,5.0,5.0,78.0,14.0
max,6.0,6.0,100.0,42.0


In [18]:
df['shot_class'].value_counts()
shots['shooter'].value_counts()

shot_class
shot-on-goal    369817
missed-shot     176428
goal             40320
Name: count, dtype: int64

shooter
David Pastrnak      2756
Nathan MacKinnon    2628
Auston Matthews     2464
Brady Tkachuk       2320
Matthew Tkachuk     2278
                    ... 
Filip Kral             2
Steven Fogarty         2
Darcy Kuemper          2
Sergei Bobrovsky       2
Cole Bardreau          2
Name: count, Length: 1059, dtype: int64

In [19]:
# Every shot is plotted on one half of the ice, the red line is at x = 0, the blue line is at x = 25, goal line at x = 89, all measurements are in feet.
# Use this information to create an angle to the net feature
def angle(x_coord, y_coord):
    x_centered = 89 - x_coord
    return round(np.degrees(np.arctan(y_coord/x_centered)), 2)

shots['angles'] = angle(shots['x_coord'], shots['y_coord'])
shots['goal'] = np.where(shots['shot_class'] == 'goal', 1, 0)
shots.drop('shot_class', axis=1, inplace=True)
shots.head()
shots['rebound'].value_counts()
shots['rush'].value_counts()
shots['home_skaters'].value_counts()
shots['away_skaters'].value_counts()
shots['goal'].value_counts()

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,goalie,shot_type,angles,goal
0,Home,giveaway,No rebound,Rush,5,5,77,6,Morgan Barron,Jaroslav Halak,wrist,26.57,0
1,Home,hit,No rebound,No rush,5,5,30,30,Neal Pionk,Jaroslav Halak,slap,26.95,0
2,Away,shot-on-goal,No rebound,No rush,5,5,35,30,Libor Hajek,Connor Hellebuyck,wrist,29.05,0
3,Home,hit,No rebound,No rush,5,5,41,14,Blake Wheeler,Jaroslav Halak,wrist,16.26,0
4,Home,hit,No rebound,No rush,5,5,46,17,Blake Wheeler,Jaroslav Halak,wrist,21.57,0


rebound
No rebound    524609
Rebound        54055
Name: count, dtype: int64

rush
No rush    559008
Rush        19656
Name: count, dtype: int64

home_skaters
5    508367
4     53324
3      7488
6      6450
1      1578
0      1457
Name: count, dtype: int64

away_skaters
5    503762
4     57505
3      8031
6      6331
0      1578
1      1457
Name: count, dtype: int64

goal
0    538879
1     39785
Name: count, dtype: int64

In [20]:
shots = shots[shots['home_skaters'] >= 3]
shots = shots[shots['away_skaters'] >= 3]

In [21]:
shots.info()

<class 'pandas.core.frame.DataFrame'>
Index: 575629 entries, 0 to 586564
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   home          575629 non-null  object 
 1   last_play     575629 non-null  object 
 2   rebound       575629 non-null  object 
 3   rush          575629 non-null  object 
 4   home_skaters  575629 non-null  int64  
 5   away_skaters  575629 non-null  int64  
 6   x_coord       575629 non-null  int64  
 7   y_coord       575629 non-null  int64  
 8   shooter       575629 non-null  object 
 9   goalie        575629 non-null  object 
 10  shot_type     575629 non-null  object 
 11  angles        575621 non-null  float64
 12  goal          575629 non-null  int32  
dtypes: float64(1), int32(1), int64(4), object(7)
memory usage: 59.3+ MB


In [22]:
shots.dropna(inplace=True)

In [23]:
numerical_cols = shots[['home_skaters','away_skaters','x_coord','y_coord', 'angles', 'goal']]
numerical_cols.head()
numerical_cols.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,home_skaters,away_skaters,x_coord,y_coord,angles,goal
0,5,5,77,6,26.57,0
1,5,5,30,30,26.95,0
2,5,5,35,30,29.05,0
3,5,5,41,14,16.26,0
4,5,5,46,17,21.57,0


Unnamed: 0,home_skaters,away_skaters,x_coord,y_coord,angles,goal
home_skaters,1.0,0.276981,-0.054162,-0.002851,-0.002806,-0.043774
away_skaters,0.276981,1.0,-0.060692,-0.006117,-0.006725,-0.046585
x_coord,-0.054162,-0.060692,1.0,0.004561,-0.001685,0.136777
y_coord,-0.002851,-0.006117,0.004561,1.0,0.794831,0.002424
angles,-0.002806,-0.006725,-0.001685,0.794831,1.0,0.003165
goal,-0.043774,-0.046585,0.136777,0.002424,0.003165,1.0


In [24]:
# None of the numerical features above are too correlated, shot angle and y coordinate are which makes sense.
profile = ProfileReport(shots, title='Shots Profiling Report')
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [25]:
# Perform 80/20 training/test split and stratify based on loan approval
from sklearn.model_selection import train_test_split
strat_train_set, strat_test_set = train_test_split(shots, test_size=0.20, stratify=shots['goal'], random_state=42)
strat_train_set["goal"].value_counts() / len(strat_train_set)
strat_test_set["goal"].value_counts() / len(strat_test_set)
shots_train = strat_train_set.drop(['goal'], axis=1)
shots_test = strat_test_set.drop(['goal'], axis=1)
property = strat_train_set["goal"].copy()
property_test = strat_test_set['goal'].copy()

goal
0    0.932634
1    0.067366
Name: count, dtype: float64

goal
0    0.932638
1    0.067362
Name: count, dtype: float64

In [26]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
num_pipeline = make_pipeline(StandardScaler(), MinMaxScaler((-1,1)))

preprocessing = ColumnTransformer([
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=num_pipeline)

In [27]:
shots_prepped = preprocessing.fit_transform(shots_train)

In [29]:
from sklearn.linear_model import LogisticRegression

log_reg = make_pipeline(preprocessing, LogisticRegression())
log_reg.fit(shots_train, property)
goal_predictions = log_reg.predict(shots_train)
goal_predictions[:5].round(-2) 
property.iloc[:5].values

array([0, 0, 0, 0, 0])

array([0, 0, 0, 0, 0])

In [30]:
probabilities_log = log_reg.predict_proba(shots_test)
pred_test = log_reg.predict(shots_test)

In [31]:
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

auc_test = roc_auc_score(property_test, probabilities_log[:,1])
auc_train = roc_auc_score(property, log_reg.predict_proba(shots_train)[:,1])

confusion_matrix(property_test, pred_test)
print(classification_report(property_test, pred_test))

array([[107363,      7],
       [  7741,     14]], dtype=int64)

              precision    recall  f1-score   support

           0       0.93      1.00      0.97    107370
           1       0.67      0.00      0.00      7755

    accuracy                           0.93    115125
   macro avg       0.80      0.50      0.48    115125
weighted avg       0.91      0.93      0.90    115125



In [32]:
auc_test
auc_train

0.6982809169254927

0.7056299685907703

In [None]:
# Low AUC, try XG Boost classifier to see if that works better

In [33]:
from xgboost import XGBClassifier

xgb = make_pipeline(preprocessing, XGBClassifier())
xgb.fit(shots_train, property)
goal_predictions_xgb = xgb.predict(shots_train)
goal_predictions_xgb[:5].round(-2) 
property.iloc[:5].values

array([0, 0, 0, 0, 0])

array([0, 0, 0, 0, 0])

In [34]:
probabilities_xgb = xgb.predict_proba(shots_test)
pred_test_xgb = xgb.predict(shots_test)

In [35]:
auc_test = roc_auc_score(property_test, probabilities_xgb[:,1])
auc_train = roc_auc_score(property, xgb.predict_proba(shots_train)[:,1])

confusion_matrix(property_test, pred_test_xgb)
print(classification_report(property_test, pred_test_xgb))

array([[107343,     27],
       [  7702,     53]], dtype=int64)

              precision    recall  f1-score   support

           0       0.93      1.00      0.97    107370
           1       0.66      0.01      0.01      7755

    accuracy                           0.93    115125
   macro avg       0.80      0.50      0.49    115125
weighted avg       0.91      0.93      0.90    115125



In [36]:
auc_test
auc_train

0.7749999210356615

0.8035616949497044

In [37]:
import joblib

joblib.dump(xgb, 'xgb_base_v1.pkl')

['xgb_base_v1.pkl']

In [None]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01,0.05,0.1],
    'booster': ['gbtree', 'gblinear'],
    'gamma': [0, 0.5, 1],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [0.5, 1, 5],
    'base_score': [0.2, 0.5, 1]
}

xgb_v2 = GridSearchCV(XGBClassifier(), parameters, scoring='roc_auc')
xgb_v2.fit(shots_train, property)
goal_predictions_xgb_v2 = xgb_v2.predict(shots_train)
goal_predictions_xgb_v2[:5].round(-2) 
property.iloc[:5].values

print('Best score:', xgb_v2.best_score_)
print('Best score:', xgb_v2.best_params_)