In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Put the parent of the current working directory on the import path so that
# project packages living there (e.g. the `src` package imported later in this
# notebook) can be resolved.
sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), os.pardir))
)

In [51]:
import numpy as np
import pandas as pd
from nhlpy import NHLClient
from pathlib import Path
import pyarrow

In [4]:
# Single nhlpy API client instance, reused by every per-player career-stats
# lookup made further down in this notebook.
client = NHLClient()

In [5]:
# Locations of the raw per-season shot tables, relative to the notebook's
# working directory.
path2025 = Path('..', 'data', 'raw', 'shots_2025.parquet')
path2024 = Path('..', 'data', 'raw', 'shots_2024.parquet')
path2023 = Path('..', 'data', 'raw', 'shots_2023.parquet')

In [7]:
# Load the three raw season files with the pyarrow engine, newest season first.
shots_2025, shots_2024, shots_2023 = (
    pd.read_parquet(raw_path, engine="pyarrow")
    for raw_path in (path2025, path2024, path2023)
)

In [8]:
# Preview the raw 2025 shot table (pandas elides the middle rows when rendering).
shots_2025

Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter_id,shooter,goalie_id,goalie,shot_type,zone,shot_class
0,2024020008,52,0,hit,0,0,5,5,70,4,8476460,Mark Scheifele,8479973,Stuart Skinner,wrist,O,shot-on-goal
1,2024020008,52,0,faceoff,0,0,5,5,54,-21,8477504,Josh Morrissey,8479973,Stuart Skinner,snap,O,missed-shot
2,2024020008,22,1,hit,0,0,5,5,68,-8,8477015,Connor Brown,8476945,Connor Hellebuyck,tip-in,O,shot-on-goal
3,2024020008,52,0,shot-on-goal,0,0,5,5,53,-31,8482149,Cole Perfetti,8479973,Stuart Skinner,snap,O,shot-on-goal
4,2024020008,22,1,faceoff,0,0,5,5,58,-22,8477934,Leon Draisaitl,8476945,Connor Hellebuyck,snap,O,shot-on-goal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198626,2024021168,28,1,shot-on-goal,0,0,5,4,62,-16,8483481,Cam Lund,8471734,Jonathan Quick,wrist,O,goal
198627,2024021168,3,0,hit,0,0,5,5,47,29,8479323,Adam Fox,8480382,Alexandar Georgiev,slap,O,shot-on-goal
198628,2024021168,3,0,blocked-shot,0,0,5,5,88,29,8479323,Adam Fox,8480382,Alexandar Georgiev,wrist,O,missed-shot
198629,2024021168,3,0,giveaway,0,0,5,5,71,-42,8482132,Brett Berard,8480382,Alexandar Georgiev,wrist,O,shot-on-goal


In [9]:
# One shooter id per shot row, in row order, as a plain Python list.
shooter_ids2025 = shots_2025.shooter_id.tolist()

In [34]:
# Probe the career-stats payload for a single player to see which fields it exposes.
# 8479973 is the goalie_id listed for Stuart Skinner in the shots preview above.
test_dict = client.stats.player_career_stats(8479973)
# The next two lookups are evaluated but not rendered (only the last expression
# of a cell is displayed); they still fail loudly if a key is missing.
test_dict["position"]
test_dict["shootsCatches"]
# Career regular-season aggregates; this is the value shown as the cell output.
test_dict["featuredStats"]["regularSeason"]["career"]

{'gamesPlayed': 172,
 'goalsAgainstAvg': 2.756668,
 'losses': 54,
 'otLosses': 14,
 'savePctg': 0.90529,
 'shutouts': 6,
 'wins': 96}

In [24]:
# Build one (position, shooting hand, career shooting %) tuple per shot row.
#
# stats_dict caches the API response fields per shooter id so each player is
# fetched from the API only once, no matter how many shots they took.
stats2025 = []
stats_dict = {}
for shooter_id in shooter_ids2025:  # avoid shadowing the builtin `id`
    if shooter_id not in stats_dict:
        stats = client.stats.player_career_stats(shooter_id)
        # The career shooting percentage is optional: the nested block may be
        # missing (KeyError) or null (TypeError on subscripting None). Only
        # those two cases fall back to None; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        try:
            career_pct = stats["featuredStats"]["regularSeason"]["career"]["shootingPctg"]
        except (KeyError, TypeError):
            career_pct = None
        # "position" and "shootsCatches" are required; a missing key raises
        # KeyError here, exactly as it did before.
        stats_dict[shooter_id] = {
            "position": stats["position"],
            "hand": stats["shootsCatches"],
            "pct": career_pct,
        }

    shooter_entry = stats_dict[shooter_id]
    stats2025.append(
        (shooter_entry["position"], shooter_entry["hand"], shooter_entry["pct"])
    )

In [25]:
# Sanity check: the loop appends exactly one tuple per entry of shooter_ids2025,
# so this should equal the number of shot rows.
len(stats2025)

198631

In [26]:
# Column labels for the per-shot shooter attributes, in tuple order.
header = [
    "position",
    "shooter_hand",
    "shooting_pct",
]
# One row per shot, aligned positionally with the rows of the shots table.
stats2025_df = pd.DataFrame(data=stats2025, columns=header)

In [33]:
# One goalie id per shot row, in row order, as a plain Python list.
goalie_ids2025 = shots_2025.goalie_id.tolist()

In [36]:
# Build one (catching hand, career save %) tuple per shot row.
#
# goalie_stats_dict caches the API response fields per goalie id so each goalie
# is fetched from the API only once.
goalie_stats2025 = []
goalie_stats_dict = {}
for goalie_id in goalie_ids2025:  # avoid shadowing the builtin `id`
    if goalie_id not in goalie_stats_dict:
        stats = client.stats.player_career_stats(goalie_id)
        # The career save percentage is optional: the nested block may be
        # missing (KeyError) or null (TypeError on subscripting None). Only
        # those two cases fall back to None; other errors propagate.
        try:
            career_save_pct = stats["featuredStats"]["regularSeason"]["career"]["savePctg"]
        except (KeyError, TypeError):
            career_save_pct = None
        # "shootsCatches" is required; a missing key raises KeyError as before.
        goalie_stats_dict[goalie_id] = {
            "hand": stats["shootsCatches"],
            "pct": career_save_pct,
        }

    goalie_entry = goalie_stats_dict[goalie_id]
    # The first element is the goalie's catching hand (it becomes the
    # glove_hand column), not a shooter attribute.
    goalie_stats2025.append((goalie_entry["hand"], goalie_entry["pct"]))

In [37]:
# Column labels for the per-shot goalie attributes, in tuple order.
goalie_header = [
    "glove_hand",
    "save_pct",
]
# One row per shot, aligned positionally with the rows of the shots table.
goalie_stats2025_df = pd.DataFrame(data=goalie_stats2025, columns=goalie_header)

In [38]:
# Preview the per-shot goalie attributes (catching hand and career save percentage).
goalie_stats2025_df

Unnamed: 0,glove_hand,save_pct
0,L,0.905290
1,L,0.905290
2,L,0.917540
3,L,0.905290
4,L,0.917540
...,...,...
198626,L,0.910698
198627,L,0.903763
198628,L,0.903763
198629,L,0.903763


In [39]:
# Attach the shooter and goalie attribute columns to the shots table.
# The concat is column-wise and aligns on the index, so it assumes shots_2025
# carries a default 0..n-1 index matching the freshly built frames — TODO confirm.
complete_df2025 = pd.concat(
    [shots_2025, stats2025_df, goalie_stats2025_df],
    axis="columns",
)
complete_df2025.head(5)

Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,...,goalie_id,goalie,shot_type,zone,shot_class,position,shooter_hand,shooting_pct,glove_hand,save_pct
0,2024020008,52,0,hit,0,0,5,5,70,4,...,8479973,Stuart Skinner,wrist,O,shot-on-goal,C,R,0.1728,L,0.90529
1,2024020008,52,0,faceoff,0,0,5,5,54,-21,...,8479973,Stuart Skinner,snap,O,missed-shot,D,L,0.0634,L,0.90529
2,2024020008,22,1,hit,0,0,5,5,68,-8,...,8476945,Connor Hellebuyck,tip-in,O,shot-on-goal,R,R,0.1037,L,0.91754
3,2024020008,52,0,shot-on-goal,0,0,5,5,53,-31,...,8479973,Stuart Skinner,snap,O,shot-on-goal,C,L,0.1164,L,0.90529
4,2024020008,22,1,faceoff,0,0,5,5,58,-22,...,8476945,Connor Hellebuyck,snap,O,shot-on-goal,C,L,0.1856,L,0.91754


In [40]:
from src.scraper import get_skater_stats

In [42]:
# Enrich the raw 2025 shots through the project helper.
# NOTE(review): presumably get_skater_stats packages the manual lookup-and-concat
# steps from the cells above (the first rows of its output match theirs) —
# confirm against src/scraper.py.
complete_df = get_skater_stats(shots_2025)

In [45]:
# Peek at the first rows of the helper's output.
complete_df.head()

Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,...,goalie_id,goalie,shot_type,zone,shot_class,position,shooter_hand,shooting_pct,glove_hand,save_pct
0,2024020008,52,0,hit,0,0,5,5,70,4,...,8479973,Stuart Skinner,wrist,O,shot-on-goal,C,R,0.1728,L,0.90529
1,2024020008,52,0,faceoff,0,0,5,5,54,-21,...,8479973,Stuart Skinner,snap,O,missed-shot,D,L,0.0634,L,0.90529
2,2024020008,22,1,hit,0,0,5,5,68,-8,...,8476945,Connor Hellebuyck,tip-in,O,shot-on-goal,R,R,0.1037,L,0.91754
3,2024020008,52,0,shot-on-goal,0,0,5,5,53,-31,...,8479973,Stuart Skinner,snap,O,shot-on-goal,C,L,0.1164,L,0.90529
4,2024020008,22,1,faceoff,0,0,5,5,58,-22,...,8476945,Connor Hellebuyck,snap,O,shot-on-goal,C,L,0.1856,L,0.91754


In [48]:
# Frequency of each home_skaters value across all shots; used to spot unusual counts.
complete_df["home_skaters"].value_counts()

home_skaters
5    176528
4     16244
3      2569
6      2402
1       456
0       432
Name: count, dtype: int64

In [49]:
# Lift the column display limit so wide frames render without elided columns.
pd.options.display.max_columns = None

In [50]:
# Re-display the head now that the column display limit has been lifted.
complete_df.head()

Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter_id,shooter,goalie_id,goalie,shot_type,zone,shot_class,position,shooter_hand,shooting_pct,glove_hand,save_pct
0,2024020008,52,0,hit,0,0,5,5,70,4,8476460,Mark Scheifele,8479973,Stuart Skinner,wrist,O,shot-on-goal,C,R,0.1728,L,0.90529
1,2024020008,52,0,faceoff,0,0,5,5,54,-21,8477504,Josh Morrissey,8479973,Stuart Skinner,snap,O,missed-shot,D,L,0.0634,L,0.90529
2,2024020008,22,1,hit,0,0,5,5,68,-8,8477015,Connor Brown,8476945,Connor Hellebuyck,tip-in,O,shot-on-goal,R,R,0.1037,L,0.91754
3,2024020008,52,0,shot-on-goal,0,0,5,5,53,-31,8482149,Cole Perfetti,8479973,Stuart Skinner,snap,O,shot-on-goal,C,L,0.1164,L,0.90529
4,2024020008,22,1,faceoff,0,0,5,5,58,-22,8477934,Leon Draisaitl,8476945,Connor Hellebuyck,snap,O,shot-on-goal,C,L,0.1856,L,0.91754


In [None]:
# Every shot is plotted on one half of the ice, the red line is at x = 0, the blue line is at x = 25, goal line at x = 89, all measurements are in feet.
# Use this information to create an angle to the net feature
def angle(x_coord, y_coord):
    x_centered = 89 - x_coord
    return round(np.degrees(np.arctan(y_coord/x_centered)), 2)

# Compute the angle feature for every shot in one vectorized call and store it
# as a new column, then show the first rows.
complete_df['angles'] = angle(
    x_coord=complete_df['x_coord'],
    y_coord=complete_df['y_coord'],
)
complete_df.head()

Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter_id,shooter,goalie_id,goalie,shot_type,zone,shot_class,position,shooter_hand,shooting_pct,glove_hand,save_pct,angles
0,2024020008,52,0,hit,0,0,5,5,70,4,8476460,Mark Scheifele,8479973,Stuart Skinner,wrist,O,shot-on-goal,C,R,0.1728,L,0.90529,11.89
1,2024020008,52,0,faceoff,0,0,5,5,54,-21,8477504,Josh Morrissey,8479973,Stuart Skinner,snap,O,missed-shot,D,L,0.0634,L,0.90529,-30.96
2,2024020008,22,1,hit,0,0,5,5,68,-8,8477015,Connor Brown,8476945,Connor Hellebuyck,tip-in,O,shot-on-goal,R,R,0.1037,L,0.91754,-20.85
3,2024020008,52,0,shot-on-goal,0,0,5,5,53,-31,8482149,Cole Perfetti,8479973,Stuart Skinner,snap,O,shot-on-goal,C,L,0.1164,L,0.90529,-40.73
4,2024020008,22,1,faceoff,0,0,5,5,58,-22,8477934,Leon Draisaitl,8476945,Connor Hellebuyck,snap,O,shot-on-goal,C,L,0.1856,L,0.91754,-35.36


In [53]:
# Two-letter code: shooter's hand followed by the goalie's catching hand (e.g. "RL").
complete_df["shot_on_glove"] = complete_df.shooter_hand + complete_df.glove_hand

In [60]:
# Cast both skater-count columns to int without mutating the incoming frame
# (DataFrame.astype returns a new frame).
complete_df = complete_df.astype({"home_skaters": int, "away_skaters": int})

# Keep only shots where both sides have at least 3 skaters on the ice.
# NOTE(review): presumably counts below 3 are treated as bad data — confirm.
# The explicit .copy() makes the result an independent frame, so adding columns
# to it later does not trigger SettingWithCopyWarning.
complete_df = complete_df.loc[
    (complete_df["home_skaters"] >= 3) & (complete_df["away_skaters"] >= 3)
].copy()

In [62]:
# Verify the filter: no home_skaters value below 3 should remain.
complete_df["home_skaters"].value_counts()

home_skaters
5    176528
4     16244
3      2569
6      2402
Name: count, dtype: int64

In [63]:
# Manpower situation label, computed column-wise instead of with a Python-level
# row-by-row apply:
#   "EV" - both sides have the same number of skaters
#   "PP" - the home side has more skaters
#   "SH" - the home side has fewer skaters
# NOTE(review): PP/SH are relative to the home team, not to the team taking the
# shot (the `home` column is not consulted) — confirm this is the intended meaning.
complete_df["situation"] = np.select(
    [
        complete_df["home_skaters"] == complete_df["away_skaters"],
        complete_df["home_skaters"] > complete_df["away_skaters"],
    ],
    ["EV", "PP"],
    default="SH",
)

In [65]:
# Class balance of the derived situation label.
complete_df["situation"].value_counts()

situation
EV    162247
PP     18544
SH     16952
Name: count, dtype: int64

In [66]:
# Inspect the frame with the engineered columns (angles, shot_on_glove, situation) appended.
complete_df.head()

Unnamed: 0,game_id,team_id,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter_id,shooter,goalie_id,goalie,shot_type,zone,shot_class,position,shooter_hand,shooting_pct,glove_hand,save_pct,angles,shot_on_glove,situation
0,2024020008,52,0,hit,0,0,5,5,70,4,8476460,Mark Scheifele,8479973,Stuart Skinner,wrist,O,shot-on-goal,C,R,0.1728,L,0.90529,11.89,RL,EV
1,2024020008,52,0,faceoff,0,0,5,5,54,-21,8477504,Josh Morrissey,8479973,Stuart Skinner,snap,O,missed-shot,D,L,0.0634,L,0.90529,-30.96,LL,EV
2,2024020008,22,1,hit,0,0,5,5,68,-8,8477015,Connor Brown,8476945,Connor Hellebuyck,tip-in,O,shot-on-goal,R,R,0.1037,L,0.91754,-20.85,RL,EV
3,2024020008,52,0,shot-on-goal,0,0,5,5,53,-31,8482149,Cole Perfetti,8479973,Stuart Skinner,snap,O,shot-on-goal,C,L,0.1164,L,0.90529,-40.73,LL,EV
4,2024020008,22,1,faceoff,0,0,5,5,58,-22,8477934,Leon Draisaitl,8476945,Connor Hellebuyck,snap,O,shot-on-goal,C,L,0.1856,L,0.91754,-35.36,LL,EV


In [67]:
# Column dtypes and non-null counts after filtering and feature engineering.
complete_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 197743 entries, 0 to 198630
Data columns (total 25 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   game_id        197743 non-null  int64  
 1   team_id        197743 non-null  int64  
 2   home           197743 non-null  int64  
 3   last_play      197743 non-null  object 
 4   rebound        197743 non-null  int64  
 5   rush           197743 non-null  int64  
 6   home_skaters   197743 non-null  int64  
 7   away_skaters   197743 non-null  int64  
 8   x_coord        197743 non-null  int64  
 9   y_coord        197743 non-null  int64  
 10  shooter_id     197743 non-null  int64  
 11  shooter        197743 non-null  object 
 12  goalie_id      197743 non-null  int64  
 13  goalie         197743 non-null  object 
 14  shot_type      197743 non-null  object 
 15  zone           197743 non-null  object 
 16  shot_class     197743 non-null  object 
 17  position       197743 non-null  ob

In [75]:
from src.scraper import get_processed_data

In [72]:
# Build a fresh enriched frame from the raw 2025 shots to feed get_processed_data.
# NOTE(review): this repeats the get_skater_stats(shots_2025) call already made
# for complete_df; if the helper queries the API per player (not verifiable
# from this notebook), reusing or caching that earlier result would be cheaper.
test_df = get_skater_stats(shots_2025)

In [76]:
# Run the project's processing step on the enriched 2025 frame.
# NOTE(review): judging by the displayed result it drops the id/name columns and
# adds angle, shot_on_glove, situation and target — confirm in src/scraper.py.
processed_df = get_processed_data(test_df)

In [77]:
# Preview the processed 2025 frame (pandas elides the middle rows when rendering).
processed_df

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shot_type,zone,shot_class,position,shooter_hand,shooting_pct,glove_hand,save_pct,angle,shot_on_glove,situation,target
0,Away,hit,No rebound,No rush,5,5,70,4,wrist,O,shot-on-goal,C,R,0.1726,L,0.905290,11.89,RL,EV,0
1,Away,faceoff,No rebound,No rush,5,5,54,-21,snap,O,missed-shot,D,L,0.0633,L,0.905290,-30.96,LL,EV,0
2,Home,hit,No rebound,No rush,5,5,68,-8,tip-in,O,shot-on-goal,R,R,0.1037,L,0.917598,-20.85,RL,EV,0
3,Away,shot-on-goal,No rebound,No rush,5,5,53,-31,snap,O,shot-on-goal,C,L,0.1187,L,0.905290,-40.73,LL,EV,0
4,Home,faceoff,No rebound,No rush,5,5,58,-22,snap,O,shot-on-goal,C,L,0.1856,L,0.917598,-35.36,LL,EV,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198626,Home,shot-on-goal,No rebound,No rush,5,4,62,-16,wrist,O,goal,C,R,0.5000,L,0.910698,-30.65,RL,PP,1
198627,Away,hit,No rebound,No rush,5,5,47,29,slap,O,shot-on-goal,D,R,0.0778,L,0.903763,34.62,RL,EV,0
198628,Away,blocked-shot,No rebound,No rush,5,5,88,29,wrist,O,missed-shot,D,R,0.0778,L,0.903763,88.03,RL,EV,0
198629,Away,giveaway,No rebound,No rush,5,5,71,-42,wrist,O,shot-on-goal,L,L,0.0869,L,0.903763,-66.80,LL,EV,0


In [78]:
# Enrich the raw 2024 shots with the same project helper used for 2025.
df_2024 = get_skater_stats(shots_2024)

In [79]:
# Apply the same processing step to the enriched 2024 frame.
processed_2024 = get_processed_data(df_2024)

In [80]:
# Preview the processed 2024 frame.
processed_2024

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shot_type,zone,shot_class,position,shooter_hand,shooting_pct,glove_hand,save_pct,angle,shot_on_glove,situation,target
0,Away,faceoff,No rebound,No rush,5,5,81,-26,snap,O,shot-on-goal,C,R,0.1821,L,0.908291,-72.90,RL,EV,0
1,Away,shot-on-goal,Rebound,No rush,5,5,83,-5,snap,O,shot-on-goal,C,R,0.1726,L,0.908291,-39.81,RL,EV,0
2,Away,blocked-shot,No rebound,No rush,5,5,77,-5,wrist,O,shot-on-goal,R,L,0.1250,L,0.908291,-22.62,LL,EV,0
3,Away,faceoff,No rebound,No rush,5,5,56,-4,deflected,O,missed-shot,L,L,0.1191,L,0.908291,-6.91,LL,EV,0
4,Home,hit,No rebound,No rush,5,5,75,-6,wrist,O,shot-on-goal,L,L,0.0785,L,0.917598,-23.20,LL,EV,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244553,Home,faceoff,No rebound,No rush,5,5,82,-1,tip-in,O,shot-on-goal,C,L,0.0560,L,0.914286,-8.13,LL,EV,0
244554,Home,shot-on-goal,Rebound,No rush,5,5,84,2,poke,O,shot-on-goal,L,R,0.2053,L,0.914286,21.80,RL,EV,0
244555,Home,shot-on-goal,No rebound,No rush,5,5,88,3,wrap-around,O,missed-shot,L,R,0.2053,L,0.914286,71.57,RL,EV,0
244556,Away,faceoff,No rebound,No rush,4,5,47,-19,slap,O,missed-shot,L,R,0.0974,L,0.905420,-24.34,RL,SH,0


In [81]:
# Enrich the raw 2023 shots with the same project helper used for 2025.
df_2023 = get_skater_stats(shots_2023)

In [82]:
# Apply the same processing step to the enriched 2023 frame.
processed_2023 = get_processed_data(df_2023)

In [83]:
# Preview the processed 2023 frame.
processed_2023

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shot_type,zone,shot_class,position,shooter_hand,shooting_pct,glove_hand,save_pct,angle,shot_on_glove,situation,target
0,Home,giveaway,No rebound,Rush,5,5,77,6,wrist,O,shot-on-goal,C,L,0.0880,L,0.915213,26.57,LL,EV,0
1,Home,hit,No rebound,No rush,5,5,30,30,slap,O,shot-on-goal,D,R,0.0427,L,0.915213,26.95,RL,EV,0
2,Away,shot-on-goal,No rebound,No rush,5,5,35,30,wrist,O,shot-on-goal,D,L,0.0439,L,0.917598,29.05,LL,EV,0
3,Home,hit,No rebound,No rush,5,5,41,14,wrist,O,shot-on-goal,R,R,0.1086,L,0.915213,16.26,RL,EV,0
4,Home,hit,No rebound,No rush,5,5,46,17,wrist,O,shot-on-goal,R,R,0.1086,L,0.915213,21.57,RL,EV,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243811,Home,giveaway,No rebound,No rush,5,5,37,-27,wrist,D,shot-on-goal,D,R,0.0116,L,0.909857,-27.44,RL,EV,0
243812,Away,shot-on-goal,No rebound,No rush,5,5,57,20,wrist,O,missed-shot,D,R,0.0344,L,0.905290,32.01,RL,EV,0
243813,Away,missed-shot,No rebound,No rush,5,5,82,11,backhand,O,missed-shot,C,L,0.0683,L,0.905290,57.53,LL,EV,0
243814,Away,hit,No rebound,No rush,5,5,73,-13,snap,O,shot-on-goal,L,R,0.0974,L,0.905290,-39.09,RL,EV,0


In [84]:
# Output locations for the processed per-season frames.
# NOTE: this rebinds path2025 / path2024 / path2023, which earlier in the
# notebook point at the raw input files; re-running the load cell after this
# one would therefore read the processed files instead.
path2025 = Path('..', 'data', 'processed', 'processed_shots_2025.parquet')
path2024 = Path('..', 'data', 'processed', 'processed_shots_2024.parquet')
path2023 = Path('..', 'data', 'processed', 'processed_shots_2023.parquet')

In [85]:
# Persist each processed season to its parquet file using the pyarrow engine.
# processed_df holds the 2025 season.
processed_df.to_parquet(path2025, engine="pyarrow")
processed_2024.to_parquet(path2024, engine="pyarrow")
processed_2023.to_parquet(path2023, engine="pyarrow")