In [80]:
from nba_api.stats.endpoints import playbyplayv3, leaguegamefinder
import pandas as pd
import pyarrow
import os
# Display settings for wide play-by-play frames.
pd.set_option('display.max_columns', 100)
# None disables row truncation, so displayed frames are printed in full.
# (Setting max_rows to 100 first has no effect when it is overridden right away,
# so only the effective setting is kept.)
pd.set_option('display.max_rows', None)

___________
### Raw Play-By-Play Data

In [None]:
# Retrieve all games from 2023-24 season
# `season` is passed to the endpoint as its `season_nullable` filter.
season = "2023-24"  # Modify this as needed
gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable=season)
# Keep the first DataFrame the endpoint returns; later cells filter it on
# TEAM_ABBREVIATION and read GAME_ID from it.
games_df = gamefinder.get_data_frames()[0]

In [None]:
# Keep only the chosen team's rows, then collect their game IDs
team = "DAL"
is_team_game = games_df['TEAM_ABBREVIATION'] == team
team_games = games_df.loc[is_team_game]
game_ids = team_games['GAME_ID'].tolist()

In [None]:
# Make sure the team's raw-data directory exists before any files are written
directory = f'../data/raw/{team}'
# Record the state first so the message below still says whether we created it.
# isdir (rather than exists) avoids reporting "exists" when the path is a regular file.
directory_existed = os.path.isdir(directory)
# exist_ok=True removes the check-then-create race; makedirs still raises if
# the path exists but is not a directory.
os.makedirs(directory, exist_ok=True)
if directory_existed:
    print(f"Directory exists: {directory}")
else:
    print(f"new directory made: {directory}")

In [None]:
# Retrieve and save parquet file for each play-by-play game data.
# Games whose file already exists are skipped; games that fail are retried on
# the next pass, up to max_attempts passes in total.
max_attempts = 3
unprocessed = game_ids.copy()
attempt = 0

while attempt < max_attempts and unprocessed:
    attempt += 1
    # Iterate over a snapshot so finished games can be removed from the live list
    for game_id in unprocessed[:]:
        file_name = f"{team}_{game_id}.parquet"
        file_path = os.path.join(directory, file_name)
        if os.path.exists(file_path):
            print(f"Path already found: {file_path}")
            unprocessed.remove(game_id)
            continue
        try:
            data = playbyplayv3.PlayByPlayV3(game_id=f"{game_id}")
            df = data.get_data_frames()[0]
            df.to_parquet(file_path, index=False)
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt still
            # stops the loop, and report the reason so failures can be diagnosed.
            print(f"FAILED: Unable to process data from game_id: {game_id}. "
                  f"({type(exc).__name__}: {exc})")
        else:
            print(f"Game Successfully Processed: {game_id}")
            unprocessed.remove(game_id)

if unprocessed:
    for game_id in unprocessed:
        print(f"Failed to process {game_id}")
else:
    print("All games processed successfully.")


In [None]:
# Spot-check the description of the row labelled 21.
# NOTE(review): `df` here is whatever frame the download loop last assigned; if
# every file already existed, that loop never assigns `df` at all.
df.loc[21, 'description']

_________
### Data modification pipeline


#### 1. Change in Possession

From the play-by-play data we can generate historical statistics for each player. These statistics can be used to build statistical probability distributions and as input data for XGBoost models that produce softmax outputs for shot type or foul likelihood. Here are the data cleaning tasks required to prepare the data for player-level aggregation.


Identify when a new possession starts. This indicator will be used to standardize player stats to "per possession", allowing quick probability distributions to be generated for game decisions such as "who got the rebound?" or "who committed the foul?". A new possession starts when the defense reacquires the ball from the offense. This means that the same possession is extended by an offensive rebound or a defensive foul. Only when the teams switch roles of offense and defense will we mark a change in possession. Here are the factors that constitute a change of possession:

- Made Shot
- Defensive Rebound
- Turnover

From the play-by-play dataset we look for very specific sequential events that indicate any of the above changes in possession:

- "Made Shot" detected in the `actionType` column.
- "Turnover" detected in the `actionType` column.
- "Rebound" detected in the `actionType` column. The `teamTricode` of the current row cannot equal the `teamTricode` of the previous row, and `subType` cannot equal "Normal Rebound". Normal Rebound is the subtype classifier for a dead-ball rebound, such as after a missed technical foul free throw: the play is not live and possession does not change.
- "Free Throw" detected within the `actionType` column. Current Free Throw count == Total Free Throw Count. `description` does not contain "MISS". The play-by-play data does not have an easy identifier for ending a possession with a "made free throw". This collection of contextual circumstances indicates that the final free throw was shot for this possession (Free Throw **2** of **2**), and that it was not logged as a MISS. If the second free throw was missed then we would detect the live ball rebound and default to the previous possession identifier. 

In [61]:
import pandas as pd
import numpy as np

# Show up to 100 columns when displaying wide play-by-play frames.
pd.set_option("display.max_columns", 100)

# Load one saved game as the working sample for the pipeline below.
# NOTE(review): the path is hard-coded to a BOS file, while the download cells
# above use team = "DAL" — confirm this file exists before running.
df = pd.read_parquet("../data/raw/BOS/BOS_0012300006.parquet")
# Start with no possession changes flagged; the pipeline functions set this to 1.
df.loc[:, 'possession_change'] = 0


In [None]:
def made_shot_turnover(df):
    """Flag made shots and turnovers as possession changes.

    Works on a copy of ``df``. Rows whose ``actionType`` is 'Made Shot' or
    'Turnover' get ``possession_change`` set to 1; every other row keeps its
    existing ``possession_change`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``actionType`` and ``possession_change`` columns.

    Returns
    -------
    pandas.DataFrame
        The updated copy; the input frame is not modified.
    """
    flagged = df.copy()
    ends_possession = flagged['actionType'].isin(['Made Shot', 'Turnover'])
    updated_flags = np.where(ends_possession, 1, flagged['possession_change'])
    flagged.loc[:, 'possession_change'] = updated_flags
    return flagged

def defensive_rebound(df):
    """Flag rebounds taken by a different team than the previous row's team.

    Works on a copy of ``df`` and adds a ``prev_team_abv`` column holding the
    previous row's ``teamTricode``. A row gets ``possession_change`` set to 1
    when all of the following hold:

    * ``actionType`` is 'Rebound',
    * ``teamTricode`` differs from ``prev_team_abv``,
    * ``subType`` is not 'Normal Rebound'.

    All other rows keep their existing ``possession_change`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``actionType``, ``teamTricode``, ``subType`` and
        ``possession_change`` columns.

    Returns
    -------
    pandas.DataFrame
        The updated copy; the input frame is not modified.
    """
    out = df.copy()
    out.loc[:, 'prev_team_abv'] = out['teamTricode'].shift(1)

    is_rebound = out['actionType'] == 'Rebound'
    # NOTE(review): rows with a missing/empty teamTricode are compared as-is, so
    # they always count as a team change — confirm this is intended for team rebounds.
    team_changed = out['prev_team_abv'] != out['teamTricode']
    not_normal_rebound = out['subType'] != 'Normal Rebound'

    out.loc[:, 'possession_change'] = np.where(
        is_rebound & team_changed & not_normal_rebound,
        1,
        out['possession_change'],
    )
    return out

def free_throw(df):
    """Flag possession changes that end on a made final free throw.

    Works on a copy of ``df`` and:

    * parses ``subType`` values of the form 'Free Throw X of Y' into
      ``curr_FT`` (X) and ``total_FT`` (Y), both kept as strings (NaN when the
      pattern does not match);
    * sets ``final_FT`` to True where ``curr_FT`` equals ``total_FT``;
    * fills ``shotResult`` for free-throw rows: 'Missed' when ``description``
      contains 'MISS', otherwise 'Made';
    * sets ``possession_change`` to 1 for free-throw rows that are both made
      and final; other rows keep their existing value.

    Rows with a null ``description`` are left untouched instead of raising,
    since no make/miss decision can be derived from them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``actionType``, ``subType``, ``description``,
        ``shotResult`` and ``possession_change`` columns.

    Returns
    -------
    pandas.DataFrame
        The updated copy; the input frame is not modified.
    """
    df_copy = df.copy()

    # One extraction with two groups: column 0 is the current attempt, 1 the total.
    ft_counts = df_copy['subType'].str.extract(r'Free Throw (\d) of (\d)')
    df_copy['curr_FT'] = ft_counts[0]
    df_copy['total_FT'] = ft_counts[1]
    # NaN never equals NaN, so non-free-throw rows come out False here.
    df_copy['final_FT'] = df_copy['curr_FT'] == df_copy['total_FT']

    is_free_throw = df_copy['actionType'] == 'Free Throw'
    has_description = df_copy['description'].notna()
    # na=False keeps the mask strictly boolean; without it a null description
    # yields NaN and the `~` below raises TypeError.
    is_miss = df_copy['description'].str.contains('MISS', na=False)

    made_ft = is_free_throw & has_description & ~is_miss
    missed_ft = is_free_throw & is_miss
    df_copy['shotResult'] = np.where(
        made_ft, 'Made',
        np.where(missed_ft, 'Missed', df_copy['shotResult']),
    )

    ends_possession = (
        (df_copy['shotResult'] == 'Made')
        & is_free_throw
        & df_copy['final_FT']
    )
    df_copy['possession_change'] = np.where(
        ends_possession, 1, df_copy['possession_change']
    )
    return df_copy


In [71]:
# Run the three possession-change steps in order; each returns a new frame
df = (
    df
    .pipe(made_shot_turnover)
    .pipe(defensive_rebound)
    .pipe(free_throw)
)

In [93]:
# Compare the row count with the number of distinct actionNumber values.
# The recorded output (520 rows vs 490 distinct values) shows that actionNumber
# repeats, so it cannot serve as a unique row key on its own.
print(len(df))
print(len(set(df['actionNumber'])))

520
490


In [None]:
# Inspect every rebound row. Alternative filters used while exploring:
#   df[df['description'].str.contains('REBOUND')]
#   df[df['description'].str.contains('BLOCK')]
df[df['actionType'] == 'Rebound']
# TODO: label each rebound as offensive or defensive.
#   1. Filter for the shot-related actionTypes ("Made Shot", "Missed Shot",
#      "Free Throw") plus rebounds (subType "Unknown").
#   2. If teamId == 0, use personId as the teamId.
#   3. Convert teamId to str.
#   4. Apply a shift on teamId to get the previous row's team.
#   5. If teamId != shifted teamId it is a defensive rebound, else offensive.
#   6. Merge back into the original dataframe on a unique identifier (open
#      question: which column is unique?).


Unnamed: 0,gameId,actionNumber,clock,period,teamId,teamTricode,personId,playerName,playerNameI,xLegacy,yLegacy,shotDistance,shotResult,isFieldGoal,scoreHome,scoreAway,pointsTotal,location,description,actionType,subType,videoAvailable,shotValue,actionId,possession_change,prev_team_abv,curr_FT,total_FT,final_FT
5,12300006,11,PT11M06.00S,1,1610612755,PHI,202699,Harris,T. Harris,0,0,0,,0,,,0,v,Harris REBOUND (Off:0 Def:1),Rebound,Unknown,1,0,6,1,BOS,,,False
13,12300006,21,PT10M17.00S,1,1610612755,PHI,1630178,Maxey,T. Maxey,0,0,0,,0,,,0,v,Maxey REBOUND (Off:0 Def:1),Rebound,Unknown,1,0,14,1,BOS,,,False
15,12300006,23,PT10M09.00S,1,1610612738,BOS,204001,Porziņģis,K. Porziņģis,0,0,0,,0,,,0,h,Porzingis REBOUND (Off:0 Def:1),Rebound,Unknown,1,0,16,1,PHI,,,False
17,12300006,25,PT09M59.00S,1,1610612755,PHI,202699,Harris,T. Harris,0,0,0,,0,,,0,v,Harris REBOUND (Off:0 Def:2),Rebound,Unknown,1,0,18,1,BOS,,,False
19,12300006,27,PT09M52.00S,1,1610612738,BOS,201143,Horford,A. Horford,0,0,0,,0,,,0,h,Horford REBOUND (Off:0 Def:1),Rebound,Unknown,1,0,20,1,PHI,,,False
21,12300006,34,PT09M41.00S,1,1610612755,PHI,202699,Harris,T. Harris,0,0,0,,0,,,0,v,Harris REBOUND (Off:0 Def:3),Rebound,Unknown,1,0,22,1,BOS,,,False
23,12300006,36,PT09M30.00S,1,1610612755,PHI,1630194,Reed,P. Reed,0,0,0,,0,,,0,v,Reed REBOUND (Off:1 Def:0),Rebound,Unknown,1,0,24,0,PHI,,,False
25,12300006,41,PT09M27.00S,1,0,,1610612738,,,0,0,0,,0,,,0,h,CELTICS Rebound,Rebound,Unknown,1,0,26,1,PHI,,,False
31,12300006,49,PT09M10.00S,1,1610612755,PHI,200782,Tucker,P. Tucker,0,0,0,,0,,,0,v,Tucker REBOUND (Off:0 Def:1),Rebound,Unknown,1,0,32,1,BOS,,,False
34,12300006,52,PT08M34.00S,1,1610612755,PHI,200782,Tucker,P. Tucker,0,0,0,,0,,,0,v,Tucker REBOUND (Off:0 Def:2),Rebound,Unknown,1,0,35,1,BOS,,,False
