Transformation pipeline testing environment

In [None]:
import pandas as pd
from pathlib import Path
import ingest as ing
import os

from nba_api.stats.endpoints import ShotChartLeagueWide
from nba_api.stats.endpoints import ShotChartLineupDetail

# Load the Knicks playoff shots used as the test fixture for this notebook.
# The data folder lives next to the current working directory: <cwd>/../data
cwd = Path.cwd()
data_dir = cwd.parent / "data"
knicks = pd.read_parquet(data_dir / "knicks_playoff_shots.parquet")

In [None]:
import pandas as pd

# ----------------------------
# TEAM-LEVEL TRANSFORMATIONS
# ----------------------------
def summarize_team_shots(df_shots):
    """
    Build a per-zone shooting table for a team.

    Groups the shots by SHOT_ZONE_BASIC and returns one row per zone with:
    - attempts: number of non-null SHOT_MADE_FLAG entries
    - makes:    sum of SHOT_MADE_FLAG
    - fg_pct:   makes / attempts
    """
    made_flags = df_shots.groupby("SHOT_ZONE_BASIC")["SHOT_MADE_FLAG"]
    zone_table = made_flags.agg(attempts="count", makes="sum").reset_index()
    zone_table["fg_pct"] = zone_table["makes"] / zone_table["attempts"]
    return zone_table


# ----------------------------
# PLAYER-LEVEL TRANSFORMATIONS
# ----------------------------
def summarize_player_shots(df_shots: pd.DataFrame) -> pd.DataFrame:
    """
    Build a per-player, per-zone shooting table.

    Groups the shots by (PLAYER_NAME, SHOT_ZONE_BASIC) and returns one row per
    pair with attempts (count of SHOT_MADE_FLAG), makes (sum of
    SHOT_MADE_FLAG) and fg_pct (makes / attempts).
    """
    group_keys = ["PLAYER_NAME", "SHOT_ZONE_BASIC"]
    aggregations = {
        "attempts": pd.NamedAgg(column="SHOT_MADE_FLAG", aggfunc="count"),
        "makes": pd.NamedAgg(column="SHOT_MADE_FLAG", aggfunc="sum"),
    }
    per_player_zone = df_shots.groupby(group_keys).agg(**aggregations).reset_index()
    return per_player_zone.assign(
        fg_pct=per_player_zone["makes"] / per_player_zone["attempts"]
    )


# ----------------------------
# LEAGUE COMPARISONS
# ----------------------------
def compare_to_league(team_shots_df, season="2024-25", opponent_team_name=None, season_type="Playoffs"):
    """
    Compare a team's shots with league averages or with one opponent.

    Parameters
    ----------
    team_shots_df : DataFrame of the team's shots, passed through to
        ``compare_stats``.
    season : season string used in the cache file names and the ingestion call.
    opponent_team_name : when None, compare against the cached league
        averages; otherwise compare against that opponent's shots.
    season_type : forwarded to the ingestion call and included in the opponent
        cache file name so different season types do not share a cache file.

    Returns
    -------
    Whatever ``compare_stats`` returns for the selected comparison dataset.

    Notes
    -----
    League mode only reads ``data/league_avg_{season}.parquet``; it does not
    fetch the file if it is missing.
    """
    if opponent_team_name is None:
        # League mode: load the pre-computed league averages from disk.
        league_avg = pd.read_parquet(f"data/league_avg_{season}.parquet")
        return compare_stats(team_shots_df, league_avg)

    # Opponent mode: reuse the cached parquet when present, otherwise ingest.
    fname = (
        f"data/opponents/opponent_shots_{opponent_team_name}_{season}_{season_type}.parquet"
    )
    if os.path.exists(fname):
        opponent_shots = pd.read_parquet(fname)
    else:
        # Same ingestion call as the later revision of this function;
        # num_players=-1 presumably means "all players" -- TODO confirm in ingest.
        opponent_shots = ing.ingest_data(
            team_name=opponent_team_name,
            num_players=-1,
            season=season,
            season_type=season_type,
        )
        # The cache directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        opponent_shots.to_parquet(fname, index=False)

    return compare_stats(team_shots_df, opponent_shots)



# ----------------------------
# VISUALIZATION PREP
# ----------------------------
def prepare_shot_chart_data(df_shots: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce raw shot chart data to the columns needed for a court scatter plot.

    Keeps exactly these columns, in this order:
    LOC_X, LOC_Y, SHOT_MADE_FLAG, PLAYER_NAME.
    """
    plot_columns = ["LOC_X", "LOC_Y", "SHOT_MADE_FLAG", "PLAYER_NAME"]
    return df_shots[plot_columns]


In [4]:
summarize_team_shots(knicks)

Unnamed: 0,SHOT_ZONE_BASIC,attempts,makes,fg_pct
0,Above the Break 3,330,118,0.357576
1,Backcourt,5,0,0.0
2,In The Paint (Non-RA),318,152,0.477987
3,Left Corner 3,71,22,0.309859
4,Mid-Range,205,79,0.385366
5,Restricted Area,342,217,0.634503
6,Right Corner 3,61,23,0.377049


In [7]:
# NOTE(review): summarize_player_shots returns the columns PLAYER_NAME,
# SHOT_ZONE_BASIC, attempts, makes and fg_pct, so indexing with '' raises
# KeyError. The table shown below presumably comes from a run without the
# trailing [''] -- fill in a real column name or drop the index.
summarize_player_shots(knicks)['']

Unnamed: 0,PLAYER_NAME,SHOT_ZONE_BASIC,attempts,makes,fg_pct
0,Jalen Brunson,Above the Break 3,117,38,0.324786
1,Jalen Brunson,In The Paint (Non-RA),122,67,0.54918
2,Jalen Brunson,Left Corner 3,9,3,0.333333
3,Jalen Brunson,Mid-Range,69,30,0.434783
4,Jalen Brunson,Restricted Area,68,36,0.529412
5,Jalen Brunson,Right Corner 3,8,7,0.875
6,Josh Hart,Above the Break 3,38,14,0.368421
7,Josh Hart,In The Paint (Non-RA),24,10,0.416667
8,Josh Hart,Left Corner 3,9,5,0.555556
9,Josh Hart,Mid-Range,8,1,0.125


In [None]:
import pandas as pd

def _zone_summary(df, attempts, makes):
    """Return one row per SHOT_ZONE_BASIC with attempts, makes and fg_pct.

    ``attempts`` and ``makes`` are ``(column, aggfunc)`` pairs handed to
    pandas named aggregation.
    """
    summary = (
        df.groupby("SHOT_ZONE_BASIC")
        .agg(attempts=attempts, makes=makes)
        .reset_index()
    )
    summary["fg_pct"] = summary["makes"] / summary["attempts"]
    return summary


def compare_stats(team_shots_df, comparison_df):
    """
    Compare a team's shots against either league averages or opponent shots.

    Parameters
    ----------
    team_shots_df : raw shot rows with SHOT_ZONE_BASIC and SHOT_MADE_FLAG
        columns.
    comparison_df : either an already-counted table with SHOT_ZONE_BASIC, FGA
        and FGM columns (league averages), or raw shot rows shaped like
        ``team_shots_df`` (an opponent).

    Returns
    -------
    DataFrame with one row per SHOT_ZONE_BASIC and the columns attempts, makes
    and fg_pct, each suffixed ``_team`` / ``_comparison``. Zones present on
    only one side are kept (outer join) with the missing side filled with 0.
    """
    shot_attempts = ("SHOT_MADE_FLAG", "count")
    shot_makes = ("SHOT_MADE_FLAG", "sum")

    # ---- Aggregate TEAM shots ----
    team_summary = _zone_summary(team_shots_df, shot_attempts, shot_makes)

    # ---- Handle comparison dataset ----
    if "FGA" in comparison_df.columns:
        # Case 1: league averages. The league table can hold several rows per
        # SHOT_ZONE_BASIC (split by zone area / range), so collapse it to one
        # row per zone; merging it as-is would duplicate the team rows.
        # FG% is recomputed from the summed makes and attempts.
        comp_summary = _zone_summary(comparison_df, ("FGA", "sum"), ("FGM", "sum"))
    else:
        # Case 2: opponent/team raw shots -- aggregate the same way as the team.
        comp_summary = _zone_summary(comparison_df, shot_attempts, shot_makes)

    # ---- Merge results for comparison ----
    comparison = pd.merge(
        team_summary,
        comp_summary,
        on="SHOT_ZONE_BASIC",
        suffixes=("_team", "_comparison"),
        how="outer",
    ).fillna(0)

    return comparison


In [None]:
from pathlib import Path
import pandas as pd
from nba_api.stats.endpoints import ShotChartLeagueWide

def compare_to_league(team_shots_df, season="2024-25", opponent_team_name=None, season_type="Playoffs"):
    """
    Compare a team's shots with league averages or with one opponent.

    Parameters
    ----------
    team_shots_df : DataFrame of the team's shots, passed through to
        ``compare_stats``.
    season : season string used for the API call, the ingestion call and the
        cache file names.
    opponent_team_name : when None, compare against league averages; otherwise
        compare against that opponent's shots.
    season_type : forwarded to the opponent ingestion call and included in the
        opponent cache file name.

    Returns
    -------
    Whatever ``compare_stats`` returns for the selected comparison dataset.

    Side effects
    ------------
    Creates ``data/`` and ``data/shotcharts/`` relative to the current working
    directory and writes a parquet cache file on a cache miss.
    """
    # Make sure the cache directories exist before reading or writing.
    data_dir = Path("data")
    opponents_dir = data_dir / "shotcharts"
    opponents_dir.mkdir(parents=True, exist_ok=True)

    if opponent_team_name is None:
        # League average mode. The fetch below takes only the season, so the
        # cache is keyed by season alone.
        league_path = data_dir / f"league_avg_{season}.parquet"
        if league_path.exists():
            league_avg = pd.read_parquet(league_path)
        else:
            # Cache miss: fetch from the API and store for next time.
            league_avg = ShotChartLeagueWide(season=season).get_data_frames()[0]
            league_avg.to_parquet(league_path, index=False)

        return compare_stats(team_shots_df, league_avg)

    # Opponent mode. season_type is part of the cache key: without it, shots
    # cached for one season type would be silently reused for another.
    oppo_path = (
        opponents_dir
        / f"opponent_shots_{opponent_team_name}_{season}_{season_type}.parquet"
    )
    if oppo_path.exists():
        opponent_shots = pd.read_parquet(oppo_path)
    else:
        # Run the ingestion pipeline for the opponent;
        # num_players=-1 presumably means "all players" -- TODO confirm in ingest.
        opponent_shots = ing.ingest_data(
            team_name=opponent_team_name,
            num_players=-1,
            season=season,
            season_type=season_type,
        )
        opponent_shots.to_parquet(oppo_path, index=False)

    return compare_stats(team_shots_df, opponent_shots)




In [None]:
def get_opponent_average(opponent_name="League", season="2024-25"):
    """Get shooting averages to compare against.

    Parameters
    ----------
    opponent_name : "League" (the default) selects league-wide averages; any
        other value is treated as a team name.
    season : season string passed to the league-wide endpoint.

    Returns
    -------
    The first data frame returned by ``ShotChartLeagueWide`` for ``season``
    when ``opponent_name`` is "League".

    Raises
    ------
    NotImplementedError
        For a specific team: only the team-id lookup exists so far, so fail
        loudly instead of silently returning None.
    """
    if opponent_name == "League":
        # League-wide averages straight from the API.
        return ShotChartLeagueWide(season=season).get_data_frames()[0]

    oppo_id = ing.get_team_id(team_name=opponent_name)
    # TODO: fetch and aggregate this team's shots once a working endpoint is chosen.
    raise NotImplementedError(
        f"Team-specific averages are not implemented yet "
        f"(team {opponent_name!r}, id {oppo_id})."
    )

In [17]:
ShotChartLeagueWide(season="2024-25").get_data_frames()[0].head()

Unnamed: 0,GRID_TYPE,SHOT_ZONE_BASIC,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,FGA,FGM,FG_PCT
0,League Averages,Above the Break 3,Back Court(BC),Back Court Shot,42,3,0.071
1,League Averages,Above the Break 3,Center(C),24+ ft.,18214,6390,0.351
2,League Averages,Above the Break 3,Left Side Center(LC),24+ ft.,28322,10109,0.357
3,League Averages,Above the Break 3,Right Side Center(RC),24+ ft.,26052,9158,0.352
4,League Averages,Backcourt,Back Court(BC),Back Court Shot,585,13,0.022


In [24]:
# Experiment: look up the opponent's team id and request its lineup shot chart.
# NOTE(review): when this cell was run it raised KeyError: 'resultSet' (see the
# output below), so this request does not work as written -- confirm the
# required parameters against the nba_api endpoint documentation.
oppo_team_id = ing.get_team_id(team_name="Charlotte Hornets")
ShotChartLineupDetail(team_id_nullable=oppo_team_id, season_segment_nullable="Playoffs", season="2024-25")

KeyError: 'resultSet'

In [8]:
compare_to_league(knicks)

TypeError: compare_to_league() missing 1 required positional argument: 'league_summary'

In [9]:
prepare_shot_chart_data(knicks)

Unnamed: 0,LOC_X,LOC_Y,SHOT_MADE_FLAG,PLAYER_NAME
0,25,2,0,Jalen Brunson
1,-10,41,0,Jalen Brunson
2,-228,5,1,Jalen Brunson
3,13,22,0,OG Anunoby
4,-47,111,0,Josh Hart
...,...,...,...,...
1327,233,75,0,OG Anunoby
1328,28,7,1,Karl-Anthony Towns
1329,156,235,0,Jalen Brunson
1330,241,18,0,Mikal Bridges
