Transformation pipeline testing environment

In [52]:
import pandas as pd
from pathlib import Path
import ingest as ing
import os

from nba_api.stats.endpoints import ShotChartLeagueWide
from nba_api.stats.endpoints import ShotChartLineupDetail

# Read in parquet knicks file for testing
cwd = Path.cwd()
# Go to parent directory and then Data
data_dir = cwd.parent / "data"
knicks = pd.read_parquet(Path(data_dir, "knicks_playoff_shots.parquet"))

In [53]:
# ----------------------------
# TEAM-LEVEL TRANSFORMATIONS
# ----------------------------
def summarize_team_shots(df_shots):
    """
    Summarize team shot selection:
    - count of shots per zone
    - FG% per zone
    """
    summary = (
        df_shots.groupby("SHOT_ZONE_BASIC")
        .agg(attempts=("SHOT_MADE_FLAG", "count"), makes=("SHOT_MADE_FLAG", "sum"))
        .reset_index()
    )
    summary["fg_pct"] = summary["makes"] / summary["attempts"]
    return summary


# ----------------------------
# PLAYER-LEVEL TRANSFORMATIONS
# ----------------------------
def summarize_player_shots(df_shots: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize shots for each player by zone.
    """
    summary = (
        df_shots.groupby(["PLAYER_NAME", "SHOT_ZONE_BASIC"])
        .agg(attempts=("SHOT_MADE_FLAG", "count"), makes=("SHOT_MADE_FLAG", "sum"))
        .reset_index()
    )
    summary["fg_pct"] = summary["makes"] / summary["attempts"]
    return summary

# ----------------------------
# VISUALIZATION PREP
# ----------------------------
def prepare_shot_chart_data(df_shots: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare raw shot chart data for plotting (scatter on a court).
    Only keep necessary cols: LOC_X, LOC_Y, SHOT_MADE_FLAG.
    """
    return df_shots[["LOC_X", "LOC_Y", "SHOT_MADE_FLAG", "PLAYER_NAME"]]


import pandas as pd


def compare_stats(team_shots_df, comparison_df):
    """
    Compare a team's shots against either league averages or opponent shots.
    """

    # ---- Aggregate TEAM shots ----
    team_summary = (
        team_shots_df.groupby("SHOT_ZONE_BASIC")
        .agg(attempts=("SHOT_MADE_FLAG", "count"), makes=("SHOT_MADE_FLAG", "sum"))
        .reset_index()
    )
    team_summary["fg_pct"] = team_summary["makes"] / team_summary["attempts"]

    # ---- Handle comparison dataset ----
    if "FGA" in comparison_df.columns:
        # Case 1: League averages (already aggregated)
        comp_summary = comparison_df.rename(
            columns={
                "SHOT_ZONE_BASIC": "SHOT_ZONE_BASIC",
                "FGA": "attempts",
                "FGM": "makes",
                "FG_PCT": "fg_pct",
            }
        )[["SHOT_ZONE_BASIC", "attempts", "makes", "fg_pct"]]

    else:
        # Case 2: Opponent/team raw shots — aggregate first
        comp_summary = (
            comparison_df.groupby("SHOT_ZONE_BASIC")
            .agg(attempts=("SHOT_MADE_FLAG", "count"), makes=("SHOT_MADE_FLAG", "sum"))
            .reset_index()
        )
        comp_summary["fg_pct"] = comp_summary["makes"] / comp_summary["attempts"]

    # ---- Merge results for comparison ----
    comparison = pd.merge(
        team_summary,
        comp_summary,
        on="SHOT_ZONE_BASIC",
        suffixes=("_team", "_comparison"),
        how="outer",
    ).fillna(0)

    return comparison


def compare_to_league(
    team_shots_df, season="2024-25", opponent_team_name=None, season_type="Playoffs"
):
    # Identifying and creating the data_dir if necessary
    data_dir = Path.cwd()
    data_dir = data_dir.parent / "data"
    opponents_dir = data_dir / "shotcharts"
    data_dir.mkdir(exist_ok=True)
    opponents_dir.mkdir(exist_ok=True)
    # Checking if the opponent team name has been submitted
    if opponent_team_name is None:
        # League average mode
        league_path = data_dir / f"league_avg_{season}.parquet"
        if league_path.exists():
            league_avg = pd.read_parquet(league_path)
        else:
            # Fetch from API
            league_avg = ShotChartLeagueWide(season=season).get_data_frames()[0]
            league_avg.to_parquet(league_path, index=False)

        comparison = compare_stats(team_shots_df, league_avg)
        return comparison

    # Opponent mode
    oppo_path = opponents_dir / f"opponent_shots_{opponent_team_name}_{season}.parquet"
    if oppo_path.exists():
        opponent_shots = pd.read_parquet(oppo_path)
    else:
        # Run ingestion pipeline for opponent
        opponent_shots = ing.ingest_data(
            team_name=opponent_team_name,
            num_players=-1,
            season=season,
            season_type=season_type,
        )
        opponent_shots.to_parquet(oppo_path, index=False)

    comparison = compare_stats(team_shots_df, opponent_shots)
    return comparison

In [54]:
summarize_team_shots(knicks)

Unnamed: 0,SHOT_ZONE_BASIC,attempts,makes,fg_pct
0,Above the Break 3,330,118,0.357576
1,Backcourt,5,0,0.0
2,In The Paint (Non-RA),318,152,0.477987
3,Left Corner 3,71,22,0.309859
4,Mid-Range,205,79,0.385366
5,Restricted Area,342,217,0.634503
6,Right Corner 3,61,23,0.377049


In [64]:
from nba_api.stats.static import teams
teams.get_teams()
boston_id = ing.get_team_id(team_name="Boston Celtics")
print(boston_id)
boston_gameids = ing.get_game_ids(boston_id, season="2024-25", season_type="Playoffs")
print(boston_gameids)
boston_cum_stats = ing.get_cum_team_stats(boston_id, boston_gameids)
print(boston_cum_stats)
boston_avg_time = ing.get_average_playtime(boston_cum_stats)

1610612738
['0042400216', '0042400215', '0042400214', '0042400213', '0042400212', '0042400211', '0042400115', '0042400114', '0042400113', '0042400112', '0042400111']
Collected data for 0042400216
Collected data for 0042400215
Collected data for 0042400214
Collected data for 0042400213
Collected data for 0042400212
Collected data for 0042400211
Collected data for 0042400115
Collected data for 0042400114
Collected data for 0042400113
Collected data for 0042400112
Collected data for 0042400111
{'0042400216':    JERSEY_NUM         PLAYER  PERSON_ID     TEAM_ID  GP  GS  ACTUAL_MINUTES  \
0           7       Brown, J    1627759  1610612738   1   1              31   
1          11   Pritchard, P    1630202  1610612738   1   0              27   
2          42     Horford, A     201143  1610612738   1   1              26   
3           9       White, D    1628401  1610612738   1   1              30   
4          30      Hauser, S    1630573  1610612738   1   0              14   
5          40  

In [69]:
boston_player_ids = ing.top_x_players_by_min(boston_avg_time, num_players=None)
print(boston_player_ids)
# boston = ing.get_team_shots(boston_id, boston_playerids, season="2024-25", season_type="Playoffs")

TypeError: '<=' not supported between instances of 'NoneType' and 'int'

In [55]:
summarize_player_shots(knicks)

Unnamed: 0,PLAYER_NAME,SHOT_ZONE_BASIC,attempts,makes,fg_pct
0,Jalen Brunson,Above the Break 3,117,38,0.324786
1,Jalen Brunson,In The Paint (Non-RA),122,67,0.54918
2,Jalen Brunson,Left Corner 3,9,3,0.333333
3,Jalen Brunson,Mid-Range,69,30,0.434783
4,Jalen Brunson,Restricted Area,68,36,0.529412
5,Jalen Brunson,Right Corner 3,8,7,0.875
6,Josh Hart,Above the Break 3,38,14,0.368421
7,Josh Hart,In The Paint (Non-RA),24,10,0.416667
8,Josh Hart,Left Corner 3,9,5,0.555556
9,Josh Hart,Mid-Range,8,1,0.125


In [56]:
knicks_vs_league = compare_to_league(team_shots_df=knicks, season="2024-25", season_type="Playoffs")
knicks_vs_league.head()

Unnamed: 0,SHOT_ZONE_BASIC,attempts_team,makes_team,fg_pct_team,attempts_comparison,makes_comparison,fg_pct_comparison
0,Above the Break 3,330,118,0.357576,42,3,0.071
1,Above the Break 3,330,118,0.357576,18214,6390,0.351
2,Above the Break 3,330,118,0.357576,28322,10109,0.357
3,Above the Break 3,330,118,0.357576,26052,9158,0.352
4,Backcourt,5,0,0.0,585,13,0.022


In [57]:
knicks_vs_celtics = compare_to_league(team_shots_df=knicks, opponent_team_name="Boston Celtics", season="2024-25", season_type="Playoffs")
knicks_vs_celtics.head()

Collected data for 0042400216
Collected data for 0042400215
Collected data for 0042400214
Collected data for 0042400213
Collected data for 0042400212
Collected data for 0042400211
Collected data for 0042400115
Collected data for 0042400114
Collected data for 0042400113
Collected data for 0042400112
Collected data for 0042400111


Unnamed: 0,SHOT_ZONE_BASIC,attempts_team,makes_team,fg_pct_team,attempts_comparison,makes_comparison,fg_pct_comparison
0,Above the Break 3,330,118,0.357576,0.0,0.0,0.0
1,Backcourt,5,0,0.0,0.0,0.0,0.0
2,In The Paint (Non-RA),318,152,0.477987,0.0,0.0,0.0
3,Left Corner 3,71,22,0.309859,0.0,0.0,0.0
4,Mid-Range,205,79,0.385366,0.0,0.0,0.0


In [40]:
knicks.columns

Index(['GRID_TYPE', 'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME',
       'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING',
       'SECONDS_REMAINING', 'EVENT_TYPE', 'ACTION_TYPE', 'SHOT_TYPE',
       'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE',
       'LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE',
       'HTM', 'VTM'],
      dtype='object')

In [43]:
summary = (
        knicks.groupby("SHOT_ZONE_BASIC", "SHOT_ZONE_AREA")
        .agg(attempts=("SHOT_MADE_FLAG", "count"), makes=("SHOT_MADE_FLAG", "sum"))
        .reset_index()
    )
summary["fg_pct"] = summary["makes"] / summary["attempts"]

ValueError: No axis named SHOT_ZONE_AREA for object type DataFrame

In [36]:
ShotChartLeagueWide(season="2024-25").get_data_frames()

[          GRID_TYPE        SHOT_ZONE_BASIC         SHOT_ZONE_AREA  \
 0   League Averages      Above the Break 3         Back Court(BC)   
 1   League Averages      Above the Break 3              Center(C)   
 2   League Averages      Above the Break 3   Left Side Center(LC)   
 3   League Averages      Above the Break 3  Right Side Center(RC)   
 4   League Averages              Backcourt         Back Court(BC)   
 5   League Averages  In The Paint (Non-RA)              Center(C)   
 6   League Averages  In The Paint (Non-RA)              Center(C)   
 7   League Averages  In The Paint (Non-RA)           Left Side(L)   
 8   League Averages  In The Paint (Non-RA)          Right Side(R)   
 9   League Averages          Left Corner 3           Left Side(L)   
 10  League Averages              Mid-Range              Center(C)   
 11  League Averages              Mid-Range              Center(C)   
 12  League Averages              Mid-Range   Left Side Center(LC)   
 13  League Averages

In [34]:
knicks_vs_league

Unnamed: 0,SHOT_ZONE_BASIC,attempts_team,makes_team,fg_pct_team,attempts_comparison,makes_comparison,fg_pct_comparison
0,Above the Break 3,330,118,0.357576,42,3,0.071
1,Above the Break 3,330,118,0.357576,18214,6390,0.351
2,Above the Break 3,330,118,0.357576,28322,10109,0.357
3,Above the Break 3,330,118,0.357576,26052,9158,0.352
4,Backcourt,5,0,0.0,585,13,0.022
5,In The Paint (Non-RA),318,152,0.477987,13890,6314,0.455
6,In The Paint (Non-RA),318,152,0.477987,28433,12474,0.439
7,In The Paint (Non-RA),318,152,0.477987,2469,1081,0.438
8,In The Paint (Non-RA),318,152,0.477987,2685,1184,0.441
9,Left Corner 3,71,22,0.309859,12997,4989,0.384
