Transformation pipeline testing environment

In [27]:
import pandas as pd
from pathlib import Path
import ingest as ing
import os

from nba_api.stats.endpoints import ShotChartLeagueWide
from nba_api.stats.endpoints import ShotChartLineupDetail

# Read in parquet knicks file for testing
cwd = Path.cwd()
# Go to parent directory and then Data
data_dir = cwd.parent / "data"
knicks = pd.read_parquet(Path(data_dir, "knicks_playoff_shots.parquet"))

In [36]:
# ----------------------------
# TEAM-LEVEL TRANSFORMATIONS
# ----------------------------
def summarize_team_shots(df_shots):
    """
    Summarize team shot selection:
    - count of shots per zone
    - FG% per zone
    """
    summary = (
        df_shots.groupby("SHOT_ZONE_BASIC")
        .agg(attempts=("SHOT_MADE_FLAG", "count"), makes=("SHOT_MADE_FLAG", "sum"))
        .reset_index()
    )
    summary["fg_pct"] = summary["makes"] / summary["attempts"]
    return summary

def summarize_player_shots(df_shots: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize shots for each player by zone.
    """
    summary = (
        df_shots.groupby(["PLAYER_NAME", "SHOT_ZONE_BASIC"])
        .agg(attempts=("SHOT_MADE_FLAG", "count"), makes=("SHOT_MADE_FLAG", "sum"))
        .reset_index()
    )
    summary["fg_pct"] = summary["makes"] / summary["attempts"]
    return summary

def summarize_league_avg(league_avg: pd.DataFrame) -> pd.DataFrame:
    summary = (
        league_avg.groupby("SHOT_ZONE_BASIC", as_index=False)
        .agg(
            attempts=("FGA", "sum"),
            makes=("FGM", "sum")
        )
    )
    summary["fg_pct"] = summary["makes"] / summary["attempts"]
    return summary

def compare_stats(team_shots, opponent_shots, league_y_n=True, season="2024-25"):
    """
    Compare a team's shots against either league averages or opponent shots.
    """
    team_summary = summarize_team_shots(team_shots)
    if league_y_n == True:
        oppo_summary = summarize_league_avg(opponent_shots)
    else:
        oppo_summary = summarize_team_shots(opponent_shots)

    # ---- Merge results for comparison ----
    comparison = pd.merge(
        team_summary,
        oppo_summary,
        on="SHOT_ZONE_BASIC",
        suffixes=("_team", "_opponent"),
        how="outer",
    ).fillna(0)

    return comparison


def compare_to_league(
    team_shots, season="2024-25", opponent_team_name=None, season_type="Playoffs"
):
    # Identifying and creating the data_dir if necessary
    data_dir = Path.cwd()
    data_dir = data_dir.parent / "data"
    opponents_dir = data_dir / "shotcharts"
    data_dir.mkdir(exist_ok=True)
    opponents_dir.mkdir(exist_ok=True)
    # Checking if the opponent team name has been submitted
    if opponent_team_name is None:
        # League average mode
        league_path = data_dir / f"league_avg_{season}.parquet"
        if league_path.exists():
            league_avg = pd.read_parquet(league_path)
        else:
            # Fetch from API
            league_avg = ShotChartLeagueWide(season=season).get_data_frames()[0]
            league_avg.to_parquet(league_path, index=False)

        comparison = compare_stats(team_shots, league_avg, league_y_n=True, season=season)
        return comparison

    # Opponent mode
    oppo_path = opponents_dir / f"opponent_shots_{opponent_team_name}_{season}.parquet"
    if oppo_path.exists():
        opponent_shots = pd.read_parquet(oppo_path)
    else:
        # Run ingestion pipeline for opponent
        opponent_shots = ing.ingest_data(
            team_name=opponent_team_name,
            num_players=-1,
            season=season,
            season_type=season_type,
        )
        opponent_shots.to_parquet(oppo_path, index=False)

    comparison = compare_stats(team_shots, opponent_shots, league_y_n=False, season=season)
    return comparison

def prepare_shot_chart_data(df_shots: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare raw shot chart data for plotting (scatter on a court).
    Only keep necessary cols: LOC_X, LOC_Y, SHOT_MADE_FLAG.
    """
    return df_shots[["LOC_X", "LOC_Y", "SHOT_MADE_FLAG", "PLAYER_NAME"]]

In [29]:
knicks_summary = summarize_team_shots(knicks)
knicks_summary

Unnamed: 0,SHOT_ZONE_BASIC,attempts,makes,fg_pct
0,Above the Break 3,330,118,0.357576
1,Backcourt,5,0,0.0
2,In The Paint (Non-RA),318,152,0.477987
3,Left Corner 3,71,22,0.309859
4,Mid-Range,205,79,0.385366
5,Restricted Area,342,217,0.634503
6,Right Corner 3,61,23,0.377049


In [30]:
knicks_player_summary = summarize_player_shots(knicks)
knicks_player_summary.head()

Unnamed: 0,PLAYER_NAME,SHOT_ZONE_BASIC,attempts,makes,fg_pct
0,Jalen Brunson,Above the Break 3,117,38,0.324786
1,Jalen Brunson,In The Paint (Non-RA),122,67,0.54918
2,Jalen Brunson,Left Corner 3,9,3,0.333333
3,Jalen Brunson,Mid-Range,69,30,0.434783
4,Jalen Brunson,Restricted Area,68,36,0.529412


In [37]:
knicks_vs_league = compare_to_league(knicks, season="2024-25", opponent_team_name=None, season_type="Playoffs")
knicks_vs_league

Unnamed: 0,SHOT_ZONE_BASIC,attempts_team,makes_team,fg_pct_team,attempts_opponent,makes_opponent,fg_pct_opponent
0,Above the Break 3,330,118,0.357576,72630,25660,0.353298
1,Backcourt,5,0,0.0,585,13,0.022222
2,In The Paint (Non-RA),318,152,0.477987,47477,21053,0.443436
3,Left Corner 3,71,22,0.309859,12997,4989,0.383858
4,Mid-Range,205,79,0.385366,23143,9626,0.415936
5,Restricted Area,342,217,0.634503,64916,43036,0.662949
6,Right Corner 3,61,23,0.377049,12156,4732,0.389273


In [40]:
knicks_vs_boston = compare_to_league(knicks, season="2024-25", opponent_team_name="Boston Celtics", season_type="Playoffs")
knicks_vs_boston

Collected data for 0042400216
Collected data for 0042400215
Collected data for 0042400214
Collected data for 0042400213
Collected data for 0042400212
Collected data for 0042400211
Collected data for 0042400115
Collected data for 0042400114
Collected data for 0042400113
Collected data for 0042400112
Collected data for 0042400111


Unnamed: 0,SHOT_ZONE_BASIC,attempts_team,makes_team,fg_pct_team,attempts_opponent,makes_opponent,fg_pct_opponent
0,Above the Break 3,330,118,0.357576,0.0,0.0,0.0
1,Backcourt,5,0,0.0,0.0,0.0,0.0
2,In The Paint (Non-RA),318,152,0.477987,0.0,0.0,0.0
3,Left Corner 3,71,22,0.309859,0.0,0.0,0.0
4,Mid-Range,205,79,0.385366,0.0,0.0,0.0
5,Restricted Area,342,217,0.634503,0.0,0.0,0.0
6,Right Corner 3,61,23,0.377049,0.0,0.0,0.0


In [50]:
bost_path = Path.cwd()
bost_path = bost_path.parent / "data" / "shotcharts"
print(bost_path)
opponent_shots = ing.ingest_data(
    team_name="Boston Celtics",
    num_players=5,
    season="2024-25",
    season_type="Playoffs",
)
opponent_shots.to_parquet(bost_path, index=False)

c:\Users\parke\OneDrive\Desktop\Personal Projects\NBA Shot Selection\nba-shot-selection-llm\data\shotcharts
Collected data for 0042400216
Collected data for 0042400215
Collected data for 0042400214
Collected data for 0042400213
Collected data for 0042400212
Collected data for 0042400211
Collected data for 0042400115
Collected data for 0042400114
Collected data for 0042400113
Collected data for 0042400112
Collected data for 0042400111


PermissionError: [WinError 5] Failed to open local file 'c:/Users/parke/OneDrive/Desktop/Personal Projects/NBA Shot Selection/nba-shot-selection-llm/data/shotcharts'. Detail: [Windows error 5] Access is denied.


In [54]:
compare_stats(knicks, opponent_shots, league_y_n=False, season="2024-25")

Unnamed: 0,SHOT_ZONE_BASIC,attempts_team,makes_team,fg_pct_team,attempts_opponent,makes_opponent,fg_pct_opponent
0,Above the Break 3,330,118,0.357576,247,93,0.376518
1,Backcourt,5,0,0.0,4,0,0.0
2,In The Paint (Non-RA),318,152,0.477987,119,39,0.327731
3,Left Corner 3,71,22,0.309859,28,9,0.321429
4,Mid-Range,205,79,0.385366,86,35,0.406977
5,Restricted Area,342,217,0.634503,138,104,0.753623
6,Right Corner 3,61,23,0.377049,27,11,0.407407


In [48]:
pd.read_parquet(Path(Path.cwd().parent / "data" / "shotcharts" / "opponent_shots_Boston Celtics_2024-25.parquet"))

Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,...,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG,GAME_DATE,HTM,VTM


In [49]:
pd.read_parquet(Path(Path.cwd().parent / "data" / "knicks_playoff_shots.parquet"))

Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,...,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG,GAME_DATE,HTM,VTM
0,Shot Chart Detail,0042400121,9,1628973,Jalen Brunson,1610612752,New York Knicks,1,11,32,...,Center(C),Less Than 8 ft.,2,25,2,1,0,20250419,NYK,DET
1,Shot Chart Detail,0042400121,19,1628973,Jalen Brunson,1610612752,New York Knicks,1,10,50,...,Center(C),Less Than 8 ft.,4,-10,41,1,0,20250419,NYK,DET
2,Shot Chart Detail,0042400121,21,1628973,Jalen Brunson,1610612752,New York Knicks,1,10,45,...,Left Side(L),24+ ft.,22,-228,5,1,1,20250419,NYK,DET
3,Shot Chart Detail,0042400121,25,1628384,OG Anunoby,1610612752,New York Knicks,1,9,57,...,Center(C),Less Than 8 ft.,2,13,22,1,0,20250419,NYK,DET
4,Shot Chart Detail,0042400121,30,1628404,Josh Hart,1610612752,New York Knicks,1,9,27,...,Center(C),8-16 ft.,12,-47,111,1,0,20250419,NYK,DET
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1327,Shot Chart Detail,0042400306,634,1628384,OG Anunoby,1610612752,New York Knicks,4,3,34,...,Right Side(R),24+ ft.,24,233,75,1,0,20250531,IND,NYK
1328,Shot Chart Detail,0042400306,637,1626157,Karl-Anthony Towns,1610612752,New York Knicks,4,3,30,...,Center(C),Less Than 8 ft.,2,28,7,1,1,20250531,IND,NYK
1329,Shot Chart Detail,0042400306,644,1628973,Jalen Brunson,1610612752,New York Knicks,4,3,15,...,Right Side Center(RC),24+ ft.,28,156,235,1,0,20250531,IND,NYK
1330,Shot Chart Detail,0042400306,650,1628969,Mikal Bridges,1610612752,New York Knicks,4,2,50,...,Right Side(R),24+ ft.,24,241,18,1,0,20250531,IND,NYK
