# xG Model Notebook

In [2]:
from nhlpy import NHLClient
import csv
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
from ydata_profiling import ProfileReport
import seaborn as sns

In [3]:
%config InteractiveShell.ast_node_interactivity = "all"

Read the data from the scraper into a Pandas data frame and clean the data.

In [4]:
df = pd.read_csv('nhl_shot_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,assist1,assist2,goalie,shot_type,shot_class
0,0,2022020023,52,1,giveaway,0,1,5,5,77,6,8480289,0,0,8470860,wrist,shot-on-goal
1,1,2022020023,52,1,hit,0,0,5,5,30,30,8480145,0,0,8470860,slap,shot-on-goal
2,2,2022020023,3,0,shot-on-goal,0,0,5,5,35,30,8479333,0,0,8476945,wrist,shot-on-goal
3,3,2022020023,52,1,hit,0,0,5,5,41,14,8471218,0,0,8470860,wrist,shot-on-goal
4,4,2022020023,52,1,hit,0,0,5,5,46,17,8471218,0,0,8470860,wrist,shot-on-goal


In [5]:
df.drop('Unnamed: 0', axis = 1, inplace=True)
df.head()

Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,assist1,assist2,goalie,shot_type,shot_class
0,2022020023,52,1,giveaway,0,1,5,5,77,6,8480289,0,0,8470860,wrist,shot-on-goal
1,2022020023,52,1,hit,0,0,5,5,30,30,8480145,0,0,8470860,slap,shot-on-goal
2,2022020023,3,0,shot-on-goal,0,0,5,5,35,30,8479333,0,0,8476945,wrist,shot-on-goal
3,2022020023,52,1,hit,0,0,5,5,41,14,8471218,0,0,8470860,wrist,shot-on-goal
4,2022020023,52,1,hit,0,0,5,5,46,17,8471218,0,0,8470860,wrist,shot-on-goal


In [6]:
shots = df.drop(['game_id', 'team_id', 'assist1', 'assist2'], axis=1)
shots.head()

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,goalie,shot_type,shot_class
0,1,giveaway,0,1,5,5,77,6,8480289,8470860,wrist,shot-on-goal
1,1,hit,0,0,5,5,30,30,8480145,8470860,slap,shot-on-goal
2,0,shot-on-goal,0,0,5,5,35,30,8479333,8476945,wrist,shot-on-goal
3,1,hit,0,0,5,5,41,14,8471218,8470860,wrist,shot-on-goal
4,1,hit,0,0,5,5,46,17,8471218,8470860,wrist,shot-on-goal


The code below adds every team abbreviation into a list. These abbreviatons were then used to gather the rosters for each NHL team and add their players into a dictionary mapping a player's name to their player ID.

In [7]:
client = NHLClient()
teams = client.teams.teams_info()
team_abbrs = []
for item in teams:
    abbr = item['abbr']
    team_abbrs.append(abbr)

In [8]:
player_dict = {}

for season in [20222023, 20232024, 20242025]:
    for abbreviation in team_abbrs:
    
        if abbreviation == 'UTA' and season != 20242025:
            abbreviation = 'ARI'

        roster = client.teams.roster(team_abbr=abbreviation, season=season)
        for player in roster['forwards']:
            if player['id'] in player_dict:
                continue
            player_dict[player['id']] = player['firstName']['default'] + ' ' + player['lastName']['default']

        for player in roster['defensemen']:
            if player['id'] in player_dict:
                continue
            player_dict[player['id']] = player['firstName']['default'] + ' ' + player['lastName']['default']

        for player in roster['goalies']:
            if player['id'] in player_dict:
                continue
            player_dict[player['id']] = player['firstName']['default'] + ' ' + player['lastName']['default']

In [9]:
shots['shooter'] = shots['shooter'].replace(player_dict)
shots['goalie'] = shots['goalie'].replace(player_dict)

Numerical data that would not have made much sense to remain as numerical data types are mapped to categorical types. Rebounds and rush chances were determined by the time difference between events.

In [10]:
home_mapping = {}
home_mapping[0] ='Away'
home_mapping[1] = 'Home'

rebound_mapping = {}
rebound_mapping[0] = 'No rebound'
rebound_mapping[1] ='Rebound'

rush_mapping = {}
rush_mapping[0] = 'No rush'
rush_mapping[1] ='Rush'

In [11]:
shots['home'] = shots['home'].replace(home_mapping)
shots['rebound'] = shots['rebound'].replace(rebound_mapping)
shots['rush'] = shots['rush'].replace(rush_mapping)

In [12]:
shots['shooter'] = shots['shooter'].apply(lambda x: 'unknown' if isinstance(x, int) else x)
shots['goalie'] = shots['goalie'].apply(lambda x: 'unknown' if isinstance(x, int) else x)

In [13]:
shots = shots[shots['shooter'] != 'unknown']
shots = shots[shots['goalie'] != 'unknown']

In [14]:
shots.head(30)

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,goalie,shot_type,shot_class
0,Home,giveaway,No rebound,Rush,5,5,77,6,Morgan Barron,Jaroslav Halak,wrist,shot-on-goal
1,Home,hit,No rebound,No rush,5,5,30,30,Neal Pionk,Jaroslav Halak,slap,shot-on-goal
2,Away,shot-on-goal,No rebound,No rush,5,5,35,30,Libor Hajek,Connor Hellebuyck,wrist,shot-on-goal
3,Home,hit,No rebound,No rush,5,5,41,14,Blake Wheeler,Jaroslav Halak,wrist,shot-on-goal
4,Home,hit,No rebound,No rush,5,5,46,17,Blake Wheeler,Jaroslav Halak,wrist,shot-on-goal
5,Home,hit,No rebound,No rush,5,5,81,-27,Nikolaj Ehlers,Jaroslav Halak,wrist,missed-shot
6,Home,missed-shot,No rebound,No rush,5,5,34,30,Neal Pionk,Jaroslav Halak,wrist,shot-on-goal
7,Away,faceoff,No rebound,No rush,4,5,69,2,Vincent Trocheck,Connor Hellebuyck,deflected,shot-on-goal
8,Away,shot-on-goal,Rebound,No rush,4,5,78,5,Chris Kreider,Connor Hellebuyck,wrist,shot-on-goal
9,Away,shot-on-goal,No rebound,No rush,4,5,45,-6,Artemi Panarin,Connor Hellebuyck,wrist,shot-on-goal


In [15]:
# Fix scraper to convert the home and away ID to find the team abbreviation via a dictionary

In [16]:
shots.info()
shots.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 579375 entries, 0 to 586564
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   home          579375 non-null  object
 1   last_play     579375 non-null  object
 2   rebound       579375 non-null  object
 3   rush          579375 non-null  object
 4   home_skaters  579375 non-null  int64 
 5   away_skaters  579375 non-null  int64 
 6   x_coord       579375 non-null  int64 
 7   y_coord       579375 non-null  int64 
 8   shooter       579375 non-null  object
 9   goalie        579375 non-null  object
 10  shot_type     579375 non-null  object
 11  shot_class    579375 non-null  object
dtypes: int64(4), object(8)
memory usage: 57.5+ MB


Unnamed: 0,home_skaters,away_skaters,x_coord,y_coord
count,579375.0,579375.0,579375.0,579375.0
mean,4.86971,4.860098,62.010991,-0.178903
std,0.494412,0.504575,19.031434,19.723686
min,0.0,0.0,0.0,-42.0
25%,5.0,5.0,49.0,-15.0
50%,5.0,5.0,65.0,0.0
75%,5.0,5.0,78.0,14.0
max,6.0,6.0,100.0,42.0


In [17]:
df['shot_class'].value_counts()
shots['shooter'].value_counts()

shot_class
shot-on-goal    369817
missed-shot     176428
goal             40320
Name: count, dtype: int64

shooter
David Pastrnak      2756
Nathan MacKinnon    2628
Auston Matthews     2480
Brady Tkachuk       2320
Matthew Tkachuk     2278
                    ... 
Arvid Soderblom        2
Anton Levtchi          2
Anthony Stolarz        2
Jonathan Quick         2
Cole Bardreau          2
Name: count, Length: 1060, dtype: int64

The code below creates a new feature called shot angle.

In [18]:
# Every shot is plotted on one half of the ice, the red line is at x = 0, the blue line is at x = 25, goal line at x = 89, all measurements are in feet.
# Use this information to create an angle to the net feature
def angle(x_coord, y_coord):
    x_centered = 89 - x_coord
    return round(np.degrees(np.arctan(y_coord/x_centered)), 2)

shots['angles'] = angle(shots['x_coord'], shots['y_coord'])
shots['goal'] = np.where(shots['shot_class'] == 'goal', 1, 0)
shots.drop('shot_class', axis=1, inplace=True)
shots.head()
shots['rebound'].value_counts()
shots['rush'].value_counts()
shots['home_skaters'].value_counts()
shots['away_skaters'].value_counts()
shots['goal'].value_counts()

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter,goalie,shot_type,angles,goal
0,Home,giveaway,No rebound,Rush,5,5,77,6,Morgan Barron,Jaroslav Halak,wrist,26.57,0
1,Home,hit,No rebound,No rush,5,5,30,30,Neal Pionk,Jaroslav Halak,slap,26.95,0
2,Away,shot-on-goal,No rebound,No rush,5,5,35,30,Libor Hajek,Connor Hellebuyck,wrist,29.05,0
3,Home,hit,No rebound,No rush,5,5,41,14,Blake Wheeler,Jaroslav Halak,wrist,16.26,0
4,Home,hit,No rebound,No rush,5,5,46,17,Blake Wheeler,Jaroslav Halak,wrist,21.57,0


rebound
No rebound    525248
Rebound        54127
Name: count, dtype: int64

rush
No rush    559691
Rush        19684
Name: count, dtype: int64

home_skaters
5    509036
4     53356
3      7494
6      6454
1      1578
0      1457
Name: count, dtype: int64

away_skaters
5    504382
4     57585
3      8042
6      6331
0      1578
1      1457
Name: count, dtype: int64

goal
0    539538
1     39837
Name: count, dtype: int64

In [19]:
shots = shots[shots['home_skaters'] >= 3]
shots = shots[shots['away_skaters'] >= 3]

In [20]:
shots.info()

<class 'pandas.core.frame.DataFrame'>
Index: 576340 entries, 0 to 586564
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   home          576340 non-null  object 
 1   last_play     576340 non-null  object 
 2   rebound       576340 non-null  object 
 3   rush          576340 non-null  object 
 4   home_skaters  576340 non-null  int64  
 5   away_skaters  576340 non-null  int64  
 6   x_coord       576340 non-null  int64  
 7   y_coord       576340 non-null  int64  
 8   shooter       576340 non-null  object 
 9   goalie        576340 non-null  object 
 10  shot_type     576340 non-null  object 
 11  angles        576332 non-null  float64
 12  goal          576340 non-null  int32  
dtypes: float64(1), int32(1), int64(4), object(7)
memory usage: 59.4+ MB


In [21]:
shots.dropna(inplace=True)

In [22]:
numerical_cols = shots[['home_skaters','away_skaters','x_coord','y_coord', 'angles', 'goal']]
numerical_cols.head()
numerical_cols.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,home_skaters,away_skaters,x_coord,y_coord,angles,goal
0,5,5,77,6,26.57,0
1,5,5,30,30,26.95,0
2,5,5,35,30,29.05,0
3,5,5,41,14,16.26,0
4,5,5,46,17,21.57,0


Unnamed: 0,home_skaters,away_skaters,x_coord,y_coord,angles,goal
home_skaters,1.0,0.276631,-0.054122,-0.00287,-0.002875,-0.043745
away_skaters,0.276631,1.0,-0.060721,-0.006117,-0.006737,-0.046501
x_coord,-0.054122,-0.060721,1.0,0.004365,-0.001764,0.136724
y_coord,-0.00287,-0.006117,0.004365,1.0,0.794822,0.002464
angles,-0.002875,-0.006737,-0.001764,0.794822,1.0,0.003236
goal,-0.043745,-0.046501,0.136724,0.002464,0.003236,1.0


The data is split into training and testing data.

In [24]:
# Perform 80/20 training/test split and stratify based on loan approval
from sklearn.model_selection import train_test_split
strat_train_set, strat_test_set = train_test_split(shots, test_size=0.20, stratify=shots['goal'], random_state=42)
strat_train_set["goal"].value_counts() / len(strat_train_set)
strat_test_set["goal"].value_counts() / len(strat_test_set)
shots_train = strat_train_set.drop(['goal'], axis=1)
shots_test = strat_test_set.drop(['goal'], axis=1)
property = strat_train_set["goal"].copy()
property_test = strat_test_set['goal'].copy()

goal
0    0.932628
1    0.067372
Name: count, dtype: float64

goal
0    0.932626
1    0.067374
Name: count, dtype: float64

A preprocessing pipeline is created to convert the data into a usable data type for the model. It encodes categorical data to a numerical data type and scales the numerical data type.

In [25]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
num_pipeline = make_pipeline(StandardScaler(), MinMaxScaler((-1,1)))

preprocessing = ColumnTransformer([
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=num_pipeline)

In [26]:
shots_prepped = preprocessing.fit_transform(shots_train)

In [27]:
from sklearn.linear_model import LogisticRegression

log_reg = make_pipeline(preprocessing, LogisticRegression())
log_reg.fit(shots_train, property)
goal_predictions = log_reg.predict(shots_train)
goal_predictions[:5].round(-2) 
property.iloc[:5].values

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0, 0, 0, 0, 0])

array([0, 0, 1, 0, 0])

In [28]:
probabilities_log = log_reg.predict_proba(shots_test)
pred_test = log_reg.predict(shots_test)

In [29]:
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

auc_test = roc_auc_score(property_test, probabilities_log[:,1])
auc_train = roc_auc_score(property, log_reg.predict_proba(shots_train)[:,1])

confusion_matrix(property_test, pred_test)
print(classification_report(property_test, pred_test))

array([[107497,      4],
       [  7752,     14]], dtype=int64)

              precision    recall  f1-score   support

           0       0.93      1.00      0.97    107501
           1       0.78      0.00      0.00      7766

    accuracy                           0.93    115267
   macro avg       0.86      0.50      0.48    115267
weighted avg       0.92      0.93      0.90    115267



The logistic regression AUC  was low. It was decided that an XGBoost model should be built and compared to the logistic model.

In [30]:
auc_test
auc_train

0.6963099239465178

0.7073842972565992

In [31]:
# Low AUC, try XG Boost classifier to see if that works better

In [32]:
from xgboost import XGBClassifier

xgb = make_pipeline(preprocessing, XGBClassifier())
xgb.fit(shots_train, property)
goal_predictions_xgb = xgb.predict(shots_train)
goal_predictions_xgb[:5].round(-2) 
property.iloc[:5].values

array([0, 0, 0, 0, 0])

array([0, 0, 1, 0, 0])

In [33]:
probabilities_xgb = xgb.predict_proba(shots_test)
pred_test_xgb = xgb.predict(shots_test)

In [34]:
auc_test = roc_auc_score(property_test, probabilities_xgb[:,1])
auc_train = roc_auc_score(property, xgb.predict_proba(shots_train)[:,1])

confusion_matrix(property_test, pred_test_xgb)
print(classification_report(property_test, pred_test_xgb))

array([[107490,     11],
       [  7729,     37]], dtype=int64)

              precision    recall  f1-score   support

           0       0.93      1.00      0.97    107501
           1       0.77      0.00      0.01      7766

    accuracy                           0.93    115267
   macro avg       0.85      0.50      0.49    115267
weighted avg       0.92      0.93      0.90    115267



The XGBoost model AUC was greatly improved from the logistic regression model. The XGBoost model should be fine tuned to increase AUC even more.

In [35]:
auc_test
auc_train

0.7738028114768203

0.8011812064092082

In [36]:
import joblib

joblib.dump(xgb, 'xgb_base_v1.pkl')

['xgb_base_v1.pkl']

In [48]:
from sklearn.model_selection import GridSearchCV
import cupy as cp

parameters = {
    'device' : ['cuda'],
    'n_estimators': [100, 250, 375, 500],
    'learning_rate': [0.01,0.05,0.1],
    'booster': ['gbtree', 'gblinear'],
    'gamma': [0, 0.5, 1],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [0.5, 1, 5],   
}

xgb_v2 = make_pipeline(preprocessing, GridSearchCV(XGBClassifier(n_jobs=-1), parameters, scoring='roc_auc'))
xgb_v2.fit(shots_train, property)

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } are not used.

Parameters: { "gamma" } a

In [61]:
goal_predictions_xgb_v2 = xgb_v2.predict(shots_train)
goal_predictions_xgb_v2[:5].round(-2) 
property.iloc[:5].values

array([0, 0, 0, 0, 0])

array([0, 0, 1, 0, 0])

In [62]:
probabilities_xgb_v2 = xgb_v2.predict_proba(shots_test)
pred_test_xgb_v2 = xgb_v2.predict(shots_test)

In [63]:
auc_test = roc_auc_score(property_test, probabilities_xgb_v2[:,1])
auc_train = roc_auc_score(property, xgb_v2.predict_proba(shots_train)[:,1])

confusion_matrix(property_test, pred_test_xgb_v2)
print(classification_report(property_test, pred_test_xgb_v2))

array([[107484,     17],
       [  7714,     52]], dtype=int64)

              precision    recall  f1-score   support

           0       0.93      1.00      0.97    107501
           1       0.75      0.01      0.01      7766

    accuracy                           0.93    115267
   macro avg       0.84      0.50      0.49    115267
weighted avg       0.92      0.93      0.90    115267



The AUC was slightly increased compared to the base XGBoost model.

In [64]:
auc_test
auc_train

0.7836202012415684

0.8204006516605874

In [65]:
joblib.dump(xgb_v2, 'xgb_v2.pkl')

['xgb_v2.pkl']