In [1]:
import sys
from pathlib import Path

print("Python:", sys.executable)
print("CWD:", Path.cwd())



Python: /Users/macski/Projects/nhlscorer/.venv/bin/python
CWD: /Users/macski/Projects/nhlscorer/notebooks


In [2]:
from pathlib import Path
import pandas as pd

# Notebook is in nhlscorer/notebooks, so repo root is one level up
PROJECT_ROOT = Path.cwd().parent

CSV_PATH = PROJECT_ROOT / "data" / "processed" / "player_goal_rates.csv"

print("CSV:", CSV_PATH)
print("Exists:", CSV_PATH.exists())

df = pd.read_csv(CSV_PATH)

print("Rows:", len(df))
print("Cols:", len(df.columns))

df.head(3)


CSV: /Users/macski/Projects/nhlscorer/data/processed/player_goal_rates.csv
Exists: True
Rows: 924
Cols: 12


Unnamed: 0,playerId,name,team,position,games_played,I_F_goals,I_F_shotsOnGoal,I_F_xGoals,icetime,goals_per_game,shots_per_game,shooting_pct
0,8480950,Ilya Lyubushkin,TOR,D,74,0.0,47.0,2.71,76034.0,0.0,0.635135,0.0
1,8478438,Tommy Novak,NSH,C,71,18.0,113.0,13.95,60495.0,0.253521,1.591549,0.159292
2,8476925,Colton Sissons,NSH,C,81,15.0,131.0,20.86,80548.0,0.185185,1.617284,0.114504


In [3]:
df.columns.tolist()


['playerId',
 'name',
 'team',
 'position',
 'games_played',
 'I_F_goals',
 'I_F_shotsOnGoal',
 'I_F_xGoals',
 'icetime',
 'goals_per_game',
 'shots_per_game',
 'shooting_pct']

In [4]:
sig = df.copy()

# Basic filters: avoid tiny sample sizes and weird rows
sig = sig[sig["games_played"] >= 20].copy()

# A simple scoring signal: goals per game with a small shot-volume boost
# (shots matter because goal rate alone can be noisy)
sig["signal"] = sig["goals_per_game"] + 0.05 * sig["shots_per_game"]

sig = sig.sort_values("signal", ascending=False)

sig_cols = [
    "name", "team", "position",
    "games_played", "I_F_goals", "I_F_shotsOnGoal",
    "goals_per_game", "shots_per_game", "shooting_pct",
    "signal"
]

sig[sig_cols].head(30)


Unnamed: 0,name,team,position,games_played,I_F_goals,I_F_shotsOnGoal,goals_per_game,shots_per_game,shooting_pct,signal
618,Auston Matthews,TOR,C,81,69.0,369.0,0.851852,4.555556,0.186992,1.07963
598,Nathan MacKinnon,COL,C,82,51.0,405.0,0.621951,4.939024,0.125926,0.868902
148,Zach Hyman,EDM,L,80,54.0,290.0,0.675,3.625,0.186207,0.85625
37,Sam Reinhart,FLA,C,82,57.0,233.0,0.695122,2.841463,0.244635,0.837195
829,David Pastrnak,BOS,R,82,47.0,382.0,0.573171,4.658537,0.123037,0.806098
819,Kirill Kaprizov,MIN,L,75,46.0,277.0,0.613333,3.693333,0.166065,0.798
807,Filip Forsberg,NSH,L,82,48.0,347.0,0.585366,4.231707,0.138329,0.796951
456,Artemi Panarin,NYR,L,82,49.0,303.0,0.597561,3.695122,0.161716,0.782317
42,Nikita Kucherov,TBL,R,81,44.0,306.0,0.54321,3.777778,0.143791,0.732099
774,Jack Eichel,VGK,C,63,31.0,278.0,0.492063,4.412698,0.111511,0.712698


In [5]:
out_path = PROJECT_ROOT / "data" / "processed" / "player_signal_table.csv"
sig[sig_cols].to_csv(out_path, index=False)

print("Saved:", out_path)


Saved: /Users/macski/Projects/nhlscorer/data/processed/player_signal_table.csv


In [6]:
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path.cwd().parent
stats = pd.read_csv(PROJECT_ROOT/"data/processed/player_signal_table.csv")
odds  = pd.read_csv(PROJECT_ROOT/"data/processed/odds_anytime_goalscorer.csv")

print("stats rows:", len(stats))
print("odds rows:", len(odds))
print("odds market_key unique:", odds["market_key"].dropna().unique()[:10])
print("\nSample odds player_name:", odds["player_name"].dropna().head(20).tolist())
print("\nSample stats name:", stats["name"].dropna().head(20).tolist())


stats rows: 721
odds rows: 874
odds market_key unique: ['h2h' 'h2h_lay']

Sample odds player_name: ['Carolina Hurricanes', 'Florida Panthers', 'Draw', 'Carolina Hurricanes', 'Florida Panthers', 'Carolina Hurricanes', 'Florida Panthers', 'Draw', 'Carolina Hurricanes', 'Florida Panthers', 'Draw', 'Carolina Hurricanes', 'Florida Panthers', 'Draw', 'Carolina Hurricanes', 'Florida Panthers', 'Draw', 'Carolina Hurricanes', 'Florida Panthers', 'Draw']

Sample stats name: ['Auston Matthews', 'Nathan MacKinnon', 'Zach Hyman', 'Sam Reinhart', 'David Pastrnak', 'Kirill Kaprizov', 'Filip Forsberg', 'Artemi Panarin', 'Nikita Kucherov', 'Jack Eichel', 'Brayden Point', 'Mikko Rantanen', 'Kyle Connor', 'Sidney Crosby', 'William Nylander', 'Brady Tkachuk', 'Jonathan Marchessault', 'Steven Stamkos', 'Valeri Nichushkin', 'Jack Hughes']


In [7]:
def norm_basic(s: str) -> str:
    if not isinstance(s, str): 
        return ""
    return (
        s.lower()
         .replace(".", "")
         .replace("-", " ")
         .replace("'", "")
         .strip()
    )

stats_keys = set(stats["name"].map(norm_basic))
odds_keys  = set(odds["player_name"].map(norm_basic))

print("Unique stats keys:", len(stats_keys))
print("Unique odds keys:", len(odds_keys))
print("Intersection:", len(stats_keys & odds_keys))

# Show a few to see format
print("\nExample stats keys:", list(stats_keys)[:10])
print("\nExample odds keys:", list(odds_keys)[:10])


Unique stats keys: 720
Unique odds keys: 32
Intersection: 0

Example stats keys: ['chris tanev', 'tyler myers', 'matt dumba', 'andrei kuzmenko', 'erik johnson', 'pierre engvall', 'ryan shea', 'jordan kyrou', 'seth jones', 'klim kostin']

Example odds keys: ['new jersey devils', 'draw', 'philadelphia flyers', 'winnipeg jets', 'washington capitals', 'colorado avalanche', 'vancouver canucks', 'florida panthers', 'st louis blues', 'new york rangers']
