In [1]:
import os
import sys

PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from src.config import PROCESSED_DIR, RAW_DIR

In [3]:
import pandas as pd

pd.set_option("display.max_columns", None)

path = RAW_DIR / "fenwick_2325.parquet"
df = pd.read_parquet(path=path, engine="fastparquet")
df.head()

Unnamed: 0,game_id,team_id,home,home_def_side,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter_id,shooter,position,shoots,career_shooting_pct,goalie_id,goalie,goalie_catches,career_save_pct,shot_type,zone,shot_class,danger_zone
0,2023020001,14,1,left,giveaway,0,0,5,5,58,-25,8478178,Darren Raddysh,D,R,0.0563,8477424,Juuse Saros,L,0.913287,wrist,O,shot-on-goal,low
1,2023020001,14,1,left,shot-on-goal,0,0,5,5,81,8,8478010,Brayden Point,C,R,0.1846,8477424,Juuse Saros,L,0.913287,tip-in,O,shot-on-goal,med
2,2023020001,14,1,left,hit,0,0,5,5,55,30,8479661,Tanner Jeannot,L,L,0.1271,8477424,Juuse Saros,L,0.913287,snap,O,shot-on-goal,low
3,2023020001,14,1,left,takeaway,0,0,5,5,58,-30,8479591,Michael Eyssimont,C,L,0.0639,8477424,Juuse Saros,L,0.913287,wrist,O,shot-on-goal,low
4,2023020001,18,0,left,hit,0,0,5,5,-63,33,8476887,Filip Forsberg,L,R,0.1279,8477992,Jonas Johansson,L,0.890301,wrist,O,missed-shot,low


In [4]:
df.columns

Index(['game_id', 'team_id', 'home', 'home_def_side', 'last_play', 'rebound',
       'rush', 'home_skaters', 'away_skaters', 'x_coord', 'y_coord',
       'shooter_id', 'shooter', 'position', 'shoots', 'career_shooting_pct',
       'goalie_id', 'goalie', 'goalie_catches', 'career_save_pct', 'shot_type',
       'zone', 'shot_class', 'danger_zone'],
      dtype='object')

In [5]:
import numpy as np

def add_shot_angle(df):
    """
    Adds a vectorized shot_angle column.
    """

    x = np.abs(df["x_coord"].values)
    y = df["y_coord"].values

    dx = 89 - x

    behind = dx <= 0

    angle = np.degrees(np.arctan2(np.abs(y), np.abs(dx)))
    angle = np.where(behind, 90.0, angle)

    df["shot_angle"] = angle.round(3)
    return df

In [6]:
import numpy as np

# Need to engineer features to improve performance of models
features = df.copy()
features["shot_distance"] = np.sqrt(features["x_coord"]**2 + features["y_coord"]**2)
features = add_shot_angle(features)


In [7]:
features

Unnamed: 0,game_id,team_id,home,home_def_side,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter_id,shooter,position,shoots,career_shooting_pct,goalie_id,goalie,goalie_catches,career_save_pct,shot_type,zone,shot_class,danger_zone,shot_distance,shot_angle
0,2023020001,14,1,left,giveaway,0,0,5,5,58,-25,8478178,Darren Raddysh,D,R,0.0563,8477424,Juuse Saros,L,0.913287,wrist,O,shot-on-goal,low,63.158531,38.884
1,2023020001,14,1,left,shot-on-goal,0,0,5,5,81,8,8478010,Brayden Point,C,R,0.1846,8477424,Juuse Saros,L,0.913287,tip-in,O,shot-on-goal,med,81.394103,45.000
2,2023020001,14,1,left,hit,0,0,5,5,55,30,8479661,Tanner Jeannot,L,L,0.1271,8477424,Juuse Saros,L,0.913287,snap,O,shot-on-goal,low,62.649820,41.424
3,2023020001,14,1,left,takeaway,0,0,5,5,58,-30,8479591,Michael Eyssimont,C,L,0.0639,8477424,Juuse Saros,L,0.913287,wrist,O,shot-on-goal,low,65.299311,44.061
4,2023020001,18,0,left,hit,0,0,5,5,-63,33,8476887,Filip Forsberg,L,R,0.1279,8477992,Jonas Johansson,L,0.890301,wrist,O,missed-shot,low,71.119618,51.766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272202,2025020359,9,0,right,faceoff,0,0,5,5,42,-12,8481596,Shane Pinto,C,R,0.1309,8475311,Darcy Kuemper,L,0.914684,slap,O,missed-shot,low,43.680659,14.323
272203,2025020359,26,1,right,blocked-shot,0,0,5,5,-83,6,8479675,Trevor Moore,L,L,0.0989,8482447,Leevi Meriläinen,L,0.906780,poke,O,missed-shot,high,83.216585,45.000
272204,2025020359,9,0,right,faceoff,0,0,5,6,48,16,8482105,Jake Sanderson,D,L,0.0526,8475311,Darcy Kuemper,L,0.914684,slap,O,missed-shot,med,50.596443,21.318
272205,2025020359,9,0,right,giveaway,0,0,5,6,76,-16,8482116,Tim Stützle,C,L,0.1357,8475311,Darcy Kuemper,L,0.914684,wrist,O,shot-on-goal,med,77.665951,50.906


In [8]:
danger_map = {
    "low": 1,
    "med": 2,
    "high": 3
}

features["danger_numeric"] = features["danger_zone"].map(danger_map)
features["shot_value"] = features["danger_numeric"] + features["rebound"] + features["rush"]
features

Unnamed: 0,game_id,team_id,home,home_def_side,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter_id,shooter,position,shoots,career_shooting_pct,goalie_id,goalie,goalie_catches,career_save_pct,shot_type,zone,shot_class,danger_zone,shot_distance,shot_angle,danger_numeric,shot_value
0,2023020001,14,1,left,giveaway,0,0,5,5,58,-25,8478178,Darren Raddysh,D,R,0.0563,8477424,Juuse Saros,L,0.913287,wrist,O,shot-on-goal,low,63.158531,38.884,1,1
1,2023020001,14,1,left,shot-on-goal,0,0,5,5,81,8,8478010,Brayden Point,C,R,0.1846,8477424,Juuse Saros,L,0.913287,tip-in,O,shot-on-goal,med,81.394103,45.000,2,2
2,2023020001,14,1,left,hit,0,0,5,5,55,30,8479661,Tanner Jeannot,L,L,0.1271,8477424,Juuse Saros,L,0.913287,snap,O,shot-on-goal,low,62.649820,41.424,1,1
3,2023020001,14,1,left,takeaway,0,0,5,5,58,-30,8479591,Michael Eyssimont,C,L,0.0639,8477424,Juuse Saros,L,0.913287,wrist,O,shot-on-goal,low,65.299311,44.061,1,1
4,2023020001,18,0,left,hit,0,0,5,5,-63,33,8476887,Filip Forsberg,L,R,0.1279,8477992,Jonas Johansson,L,0.890301,wrist,O,missed-shot,low,71.119618,51.766,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272202,2025020359,9,0,right,faceoff,0,0,5,5,42,-12,8481596,Shane Pinto,C,R,0.1309,8475311,Darcy Kuemper,L,0.914684,slap,O,missed-shot,low,43.680659,14.323,1,1
272203,2025020359,26,1,right,blocked-shot,0,0,5,5,-83,6,8479675,Trevor Moore,L,L,0.0989,8482447,Leevi Meriläinen,L,0.906780,poke,O,missed-shot,high,83.216585,45.000,3,3
272204,2025020359,9,0,right,faceoff,0,0,5,6,48,16,8482105,Jake Sanderson,D,L,0.0526,8475311,Darcy Kuemper,L,0.914684,slap,O,missed-shot,med,50.596443,21.318,2,2
272205,2025020359,9,0,right,giveaway,0,0,5,6,76,-16,8482116,Tim Stützle,C,L,0.1357,8475311,Darcy Kuemper,L,0.914684,wrist,O,shot-on-goal,med,77.665951,50.906,2,2


In [9]:
features["shot_on_glove"] = features["shoots"] + features["goalie_catches"]
features["home_skaters"] = features["home_skaters"].astype(int)
features["away_skaters"] = features["away_skaters"].astype(int)
features = features[features["home_skaters"] >= 3]
features = features[features["away_skaters"] >= 3]
features = features[features["goalie_id"].notnull()]

In [10]:
def add_situation(df):
    """ 
    Adds a 'situation' column based on home/away skaters.
    """

    df["shooting_team_skaters"] = np.where(df["home"] == 1, df["home_skaters"], df["away_skaters"])
    df["defending_team_skaters"] = np.where(df["home"] == 1, df["away_skaters"], df["home_skaters"])

    df["situation"] = "EV" 

    df.loc[df["shooting_team_skaters"] > df["defending_team_skaters"], "situation"] = "PP"

    df.loc[df["shooting_team_skaters"] < df["defending_team_skaters"], "situation"] = "SH"

    df.drop(columns=["shooting_team_skaters", "defending_team_skaters"], inplace=True)

    return df

In [11]:
features = add_situation(features)
features

Unnamed: 0,game_id,team_id,home,home_def_side,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shooter_id,shooter,position,shoots,career_shooting_pct,goalie_id,goalie,goalie_catches,career_save_pct,shot_type,zone,shot_class,danger_zone,shot_distance,shot_angle,danger_numeric,shot_value,shot_on_glove,situation
0,2023020001,14,1,left,giveaway,0,0,5,5,58,-25,8478178,Darren Raddysh,D,R,0.0563,8477424,Juuse Saros,L,0.913287,wrist,O,shot-on-goal,low,63.158531,38.884,1,1,RL,EV
1,2023020001,14,1,left,shot-on-goal,0,0,5,5,81,8,8478010,Brayden Point,C,R,0.1846,8477424,Juuse Saros,L,0.913287,tip-in,O,shot-on-goal,med,81.394103,45.000,2,2,RL,EV
2,2023020001,14,1,left,hit,0,0,5,5,55,30,8479661,Tanner Jeannot,L,L,0.1271,8477424,Juuse Saros,L,0.913287,snap,O,shot-on-goal,low,62.649820,41.424,1,1,LL,EV
3,2023020001,14,1,left,takeaway,0,0,5,5,58,-30,8479591,Michael Eyssimont,C,L,0.0639,8477424,Juuse Saros,L,0.913287,wrist,O,shot-on-goal,low,65.299311,44.061,1,1,LL,EV
4,2023020001,18,0,left,hit,0,0,5,5,-63,33,8476887,Filip Forsberg,L,R,0.1279,8477992,Jonas Johansson,L,0.890301,wrist,O,missed-shot,low,71.119618,51.766,1,1,RL,EV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272202,2025020359,9,0,right,faceoff,0,0,5,5,42,-12,8481596,Shane Pinto,C,R,0.1309,8475311,Darcy Kuemper,L,0.914684,slap,O,missed-shot,low,43.680659,14.323,1,1,RL,EV
272203,2025020359,26,1,right,blocked-shot,0,0,5,5,-83,6,8479675,Trevor Moore,L,L,0.0989,8482447,Leevi Meriläinen,L,0.906780,poke,O,missed-shot,high,83.216585,45.000,3,3,LL,EV
272204,2025020359,9,0,right,faceoff,0,0,5,6,48,16,8482105,Jake Sanderson,D,L,0.0526,8475311,Darcy Kuemper,L,0.914684,slap,O,missed-shot,med,50.596443,21.318,2,2,LL,PP
272205,2025020359,9,0,right,giveaway,0,0,5,6,76,-16,8482116,Tim Stützle,C,L,0.1357,8475311,Darcy Kuemper,L,0.914684,wrist,O,shot-on-goal,med,77.665951,50.906,2,2,LL,PP


In [12]:
features.columns

Index(['game_id', 'team_id', 'home', 'home_def_side', 'last_play', 'rebound',
       'rush', 'home_skaters', 'away_skaters', 'x_coord', 'y_coord',
       'shooter_id', 'shooter', 'position', 'shoots', 'career_shooting_pct',
       'goalie_id', 'goalie', 'goalie_catches', 'career_save_pct', 'shot_type',
       'zone', 'shot_class', 'danger_zone', 'shot_distance', 'shot_angle',
       'danger_numeric', 'shot_value', 'shot_on_glove', 'situation'],
      dtype='object')

In [13]:
processed_features = features.drop(["game_id", "team_id", "home_def_side", "x_coord", "y_coord", "shooter_id", "shooter", "goalie_id", "goalie", "goalie_catches", "shoots", "zone"], axis=1)

In [14]:
processed_features

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,position,career_shooting_pct,career_save_pct,shot_type,shot_class,danger_zone,shot_distance,shot_angle,danger_numeric,shot_value,shot_on_glove,situation
0,1,giveaway,0,0,5,5,D,0.0563,0.913287,wrist,shot-on-goal,low,63.158531,38.884,1,1,RL,EV
1,1,shot-on-goal,0,0,5,5,C,0.1846,0.913287,tip-in,shot-on-goal,med,81.394103,45.000,2,2,RL,EV
2,1,hit,0,0,5,5,L,0.1271,0.913287,snap,shot-on-goal,low,62.649820,41.424,1,1,LL,EV
3,1,takeaway,0,0,5,5,C,0.0639,0.913287,wrist,shot-on-goal,low,65.299311,44.061,1,1,LL,EV
4,0,hit,0,0,5,5,L,0.1279,0.890301,wrist,missed-shot,low,71.119618,51.766,1,1,RL,EV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272202,0,faceoff,0,0,5,5,C,0.1309,0.914684,slap,missed-shot,low,43.680659,14.323,1,1,RL,EV
272203,1,blocked-shot,0,0,5,5,L,0.0989,0.906780,poke,missed-shot,high,83.216585,45.000,3,3,LL,EV
272204,0,faceoff,0,0,5,6,D,0.0526,0.914684,slap,missed-shot,med,50.596443,21.318,2,2,LL,PP
272205,0,giveaway,0,0,5,6,C,0.1357,0.914684,wrist,shot-on-goal,med,77.665951,50.906,2,2,LL,PP


In [15]:
from src.feature_processor import XGProcessor

processor = XGProcessor()
test = processor.processFenwick(df)
test

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,position,career_shooting_pct,career_save_pct,shot_type,shot_class,danger_zone,shot_on_glove,distance,shot_angle,danger_numeric,shot_value,situation
0,1,giveaway,0,0,5,5,D,0.0563,0.913287,wrist,shot-on-goal,low,RL,63.158531,38.884,1,1,EV
1,1,shot-on-goal,0,0,5,5,C,0.1846,0.913287,tip-in,shot-on-goal,med,RL,81.394103,45.000,2,2,EV
2,1,hit,0,0,5,5,L,0.1271,0.913287,snap,shot-on-goal,low,LL,62.649820,41.424,1,1,EV
3,1,takeaway,0,0,5,5,C,0.0639,0.913287,wrist,shot-on-goal,low,LL,65.299311,44.061,1,1,EV
4,0,hit,0,0,5,5,L,0.1279,0.890301,wrist,missed-shot,low,RL,71.119618,51.766,1,1,EV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272202,0,faceoff,0,0,5,5,C,0.1309,0.914684,slap,missed-shot,low,RL,43.680659,14.323,1,1,EV
272203,1,blocked-shot,0,0,5,5,L,0.0989,0.906780,poke,missed-shot,high,LL,83.216585,45.000,3,3,EV
272204,0,faceoff,0,0,5,6,D,0.0526,0.914684,slap,missed-shot,med,LL,50.596443,21.318,2,2,PP
272205,0,giveaway,0,0,5,6,C,0.1357,0.914684,wrist,shot-on-goal,med,LL,77.665951,50.906,2,2,PP


In [16]:
output_path = PROCESSED_DIR / "processed_fenwick_2325.parquet" 

test.to_parquet(output_path, engine="fastparquet", index=False)

print(f"Saved to {output_path}")

Saved to /home/nolan/nhl/nhl_predictors/data/processed/processed_fenwick_2325.parquet
