# NBA Play-by-Play Possessions

For poor free throw shooters, is it a better strategy to *intentionally* miss the second free throw and aim for the offensive rebound and new possession? Analysis of play-by-play data using [nba_api](https://github.com/swar/nba_api).

### TODO

- [ ] Use consistent methods for creating new DF columns 
- [ ] Reduce use of list comprehensions for `pandas` / `numpy` methods

## 1. Setup and get play-by-play DataFrame

In [None]:
import numpy as np
import pandas as pd
import urllib
import requests

from nba_api.stats.endpoints import playbyplayv2

In [None]:
# Get the static list of NBA teams (one dict per team)
from nba_api.stats.static import teams
nba_teams = teams.get_teams()

# Collect the ID of every team in the list
team_ids = [team['id'] for team in nba_teams]

# Query for the regular season games of the team at index 2 of team_ids
# (the Cavs according to the original note - TODO confirm the ordering of get_teams())
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.library.parameters import Season
from nba_api.stats.library.parameters import SeasonType

gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team_ids[2], 
                            season_nullable=Season.default,
                            season_type_nullable=SeasonType.regular)  

# Flatten the response into a list of per-game dicts and keep only the game IDs
games_dict = gamefinder.get_normalized_dict()
games = games_dict['LeagueGameFinderResults']
game_IDs = [game['GAME_ID'] for game in games]

In [None]:
# Query for the play by play of that most recent regular season game
from nba_api.stats.endpoints import playbyplayv2
df = playbyplayv2.PlayByPlayV2(game_IDs[0]).get_data_frames()[0]

## 2. DataFrame Formatting

In [None]:
#Convert the game clock time to seconds and re-order
def pbpNewRows1(df):
  """Return a sorted copy of a raw play-by-play frame with helper columns added.

  The input must contain the columns PCTIMESTRING ("MM:SS"), PERIOD, EVENTNUM,
  SCORE, EVENTMSGTYPE, EVENTMSGACTIONTYPE and PLAYER1_TEAM_ID.

  The returned frame is ordered by period, then by descending game clock, then
  by EVENTNUM, has a fresh 0..n-1 index and gains these columns:
    PCTIME_SECONDS        - game clock converted to seconds
    EVENTMSGTYPE_1        - EVENTMSGTYPE of the next play
    EVENTMSGACTIONTYPE_1  - EVENTMSGACTIONTYPE of the next play
    PCTIME_SECONDS_1      - PCTIME_SECONDS of the next play
    PLAYER1_TEAM_ID_1     - PLAYER1_TEAM_ID of the next play
    SCORE_1               - SCORE of the previous play ("0 - 0" for the first)

  The caller's DataFrame is not modified.
  """
  def clockToSeconds(clock):
    # Split once instead of once per component
    parts = clock.split(":")
    return int(parts[0]) * 60 + int(parts[1])

  # assign() builds a new frame, so the input DataFrame is left untouched
  df = df.assign(PCTIME_SECONDS=df['PCTIMESTRING'].map(clockToSeconds))
  df = df.sort_values(['PERIOD','PCTIME_SECONDS','EVENTNUM'], ascending=[True,False,True])
  df = df.reset_index(drop=True)

  # Update the SCORE column to fill in blanks: seed the first play with 0 - 0
  # and carry the last known score forward.
  # Series.ffill() replaces fillna(method="ffill"), which is deprecated.
  df.at[0,"SCORE"] = "0 - 0"
  df["SCORE"] = df["SCORE"].ffill()

  # Look-ahead columns (values of the following play) for the possession formulae
  df['EVENTMSGTYPE_1'] = df['EVENTMSGTYPE'].shift(-1)
  df['EVENTMSGACTIONTYPE_1'] = df['EVENTMSGACTIONTYPE'].shift(-1)
  df['PCTIME_SECONDS_1'] = df['PCTIME_SECONDS'].shift(-1)
  df['PLAYER1_TEAM_ID_1'] = df['PLAYER1_TEAM_ID'].shift(-1)

  # Look-behind column: the score before this play happened
  df['SCORE_1'] = df['SCORE'].shift(1)
  df.at[0,'SCORE_1'] = "0 - 0"

  return df

## 3. Extracting Further Info

There are four main ways a possession can end:
1. Made FG / FT
2. Missed FG / FT followed by a defensive rebound
3. Turnover
4. Quarter end

We need a formula for each of the outcomes to check if and when each possession ends.

In [None]:
def possEndFG(loc, df):
    """Return True when the made field goal at row `loc` ends the possession.

    The possession is treated as continuing (False) only when the next play is
    a foul (EVENTMSGTYPE 6) recorded at the same game-clock time, i.e. the
    basket came with a shooting foul.
    """
    play = df.iloc[loc]
    nextIsFoul = play['EVENTMSGTYPE_1'] == 6
    sameClock = play['PCTIME_SECONDS'] == play['PCTIME_SECONDS_1']
    return not (nextIsFoul and sameClock)
    
def possEndRebound(loc, df):
    """Return True when the play after row `loc` belongs to the other team.

    Used after a miss: if the next play (the rebound) has the same
    PLAYER1_TEAM_ID as this play it is an offensive rebound and the possession
    continues (False); otherwise the possession has ended (True).
    """
    play = df.iloc[loc]
    sameTeam = play['PLAYER1_TEAM_ID'] == play['PLAYER1_TEAM_ID_1']
    return not sameTeam

def possEndFT(loc, df):
    """Return True when the free throw at row `loc` ends the possession.

    Only the last free throw of a trip can end a possession
    (EVENTMSGACTIONTYPE 10 is 1st of 1; 12 is 2nd of 2; 15 is 3rd of 3).
    If that last free throw is followed by a rebound (EVENTMSGTYPE 4) the
    answer is delegated to possEndRebound; otherwise the possession ends.
    """
    play = df.iloc[loc]

    # Not the final attempt of the trip: more free throws are still to come
    if play['EVENTMSGACTIONTYPE'] not in [10,12,15]:
        return False

    # Final attempt with no rebound recorded next: possession is over
    if not (play['EVENTMSGTYPE_1'] == 4):
        return True

    # Final attempt followed by a rebound: depends on who got the rebound
    return possEndRebound(loc, df)

# Dispatch table from EVENTMSGTYPE (as a string) to the function that decides
# whether that kind of play ends a possession
# (1 - FG make, 2 - FG miss, 3 - FT attempt)
possOutcomesDict = {
    '1': possEndFG,
    '2': possEndRebound,
    '3': possEndFT,
}


def possEndCheck(loc, df):
    """Return True when the play at row `loc` of `df` ends a possession.

    Shots and free throws (EVENTMSGTYPE 1, 2, 3) are decided by the matching
    function in possOutcomesDict. EVENTMSGTYPE 5 and 13 always end the
    possession (presumably turnover and end of period - confirm against the
    NBA event codes). Every other event type does not.
    """
    eventType = df.iloc[loc]['EVENTMSGTYPE']

    if eventType in [1,2,3]:
        decide = possOutcomesDict[str(eventType)]
        return decide(loc, df)

    return eventType in [5,13]

With these formulae we can run through the plays in the game and determine on which of them a possession ended, and which team was in possession for each play.
We can also calculate the change in score for each play, to use in analysing points per possession later on.

In [None]:
def pbpNewRows2(df):
    """Add possession and scoring columns to a frame produced by pbpNewRows1.

    The columns are added to `df` in place and the same DataFrame is returned:
      POSSESSION_END                - result of possEndCheck for each row
      POSSESSION_TEAM_ABBREVIATION  - team treated as being in possession on each row
      SCORE_CHANGE                  - largest per-team increase between SCORE_1 and SCORE

    Row 1 is assumed to be the opening jump ball, with PLAYER3 being the player
    the ball is tipped to. Possession is then handed to the other team after
    every row flagged as a possession end.
    """

    # Add new column for possession end True / False
    df['POSSESSION_END'] = [possEndCheck(loc, df) for loc in range(len(df))]

    # Gets team that wins the tip - index 0 is the start of game play
    # Index 1 is the jump ball row and Player 3 is who it gets tipped to
    currentTeam = df.iloc[1]["PLAYER3_TEAM_ABBREVIATION"]

    # Get the abbreviations of the two teams; dropna() removes both None and NaN
    teamNames = df["PLAYER1_TEAM_ABBREVIATION"].dropna().unique().tolist()

    teamInPoss = []
    for possessionEnded in df["POSSESSION_END"]:

        teamInPoss.append(currentTeam)

        # When the possession ends, the *other* team has the ball on the next play.
        # Choosing relative to the current team (rather than toggling an index that
        # starts at a fixed position) keeps this correct whichever team won the tip.
        if possessionEnded:
            currentTeam = teamNames[1] if currentTeam == teamNames[0] else teamNames[0]

    df["POSSESSION_TEAM_ABBREVIATION"] = teamInPoss

    # Calculates the change in score between two plays - they don't have to be sequential
    def eventScore(before, after):
        start_score = str(before).split(" - ")
        end_score = str(after).split(" - ")

        # Only one team can score on a play, so the larger difference is the points scored
        return max(int(end) - int(start) for end, start in zip(end_score, start_score))

    df['SCORE_CHANGE'] = df.apply(lambda row: eventScore(row['SCORE_1'], row['SCORE']), axis=1)

    return df

In [None]:
df.head()

### Points Per Possession

As an example, using the new columns we've added to the pbp data, we can calculate the average points per possession on OKC offensive rebounds.

This example could be rewritten to use more `pandas` or `numpy` tools in place of the list comprehensions used below.

In [None]:
def pointsPerPossession(team, df):
  """Average points scored by `team` on possessions extended by an offensive rebound.

  `df` must have a default 0..n-1 index (row labels are used as positions) and
  the columns EVENTMSGTYPE, PLAYER1_TEAM_ABBREVIATION, POSSESSION_END and
  SCORE_CHANGE.

  An offensive rebound is a rebound row (EVENTMSGTYPE 4) by `team` directly
  after a missed shot (EVENTMSGTYPE 2) by `team`. Each one starts a window that
  runs up to and including the next row flagged POSSESSION_END, or to the end
  of the frame when no such row follows.

  Returns [points per possession, number of possessions]. With no offensive
  rebounds the result is [nan, 0] rather than a ZeroDivisionError.
  """
  eventType = df['EVENTMSGTYPE']
  teamAbbr = df['PLAYER1_TEAM_ABBREVIATION']

  # Compare every row with the row before it; a missed shot on the very last
  # row has no following play and therefore simply never matches
  isORB = ((eventType == 4) & (teamAbbr == team) &
           (eventType.shift(1) == 2) & (teamAbbr.shift(1) == team))
  ORB = df.index[isORB].tolist()

  num_poss = len(ORB)
  if num_poss == 0:
    return [float('nan'), 0]

  # Get the plays where possession changed
  possessionChanges = np.array(df.index[df['POSSESSION_END'] == True].tolist(), dtype=int)

  # Find the total of the points for each possession after a ORB
  points = 0
  for reb in ORB:
    laterEnds = possessionChanges[possessionChanges > reb]
    end = laterEnds.min() + 1 if laterEnds.size else len(df)
    points += df.iloc[reb:end]['SCORE_CHANGE'].sum()

  return [points / num_poss, num_poss]

# Points per possession is the total number of points divided by the number of possessions
# NOTE(review): `df` needs the POSSESSION_END and SCORE_CHANGE columns before this
# cell runs; no call that adds them to `df` is visible above - confirm the cell order.
team = "OKC"

# ORB is [points per possession, number of possessions]
ORB = pointsPerPossession(team, df)
print(f'Points Per Possession ({team}): {ORB[0]: .2f} (on {ORB[1]} possessions)') 

## Getting all play-by-play data for a season

In [None]:
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.library.parameters import Season
from nba_api.stats.library.parameters import SeasonType

gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable=Season.default, season_type_nullable=SeasonType.regular) 

In [None]:
games_dict = gamefinder.get_normalized_dict()
games = games_dict['LeagueGameFinderResults']
game_IDs = [game['GAME_ID'] for game in games]

In [None]:
game_IDs = list(set(game_IDs))

In [None]:
def getPBPdf(game_ID):
  """Fetch one game's play-by-play and return it with possession info added.

  Takes the first DataFrame returned by the PlayByPlayV2 endpoint for
  `game_ID` and passes it through pbpNewRows1 and then pbpNewRows2.
  """
  rawPlays = playbyplayv2.PlayByPlayV2(game_ID).get_data_frames()[0]
  return pbpNewRows2(pbpNewRows1(rawPlays))

In [None]:
df_test = getPBPdf("0022200552")

In [None]:
df_test[(df_test['EVENTMSGTYPE'] == 3) &
        (df_test['EVENTMSGACTIONTYPE'].isin([10,12,15])) &
        (df_test['PLAYER1_TEAM_ID'] == df_test['PLAYER1_TEAM_ID_1']) &
        (df_test['EVENTMSGTYPE_1'] == 4)
        ][['GAME_ID','EVENTNUM']]

In [None]:
df_test[df_test['PERIOD'] == 4].tail()

In [None]:
# Download and process every game in game_IDs.
# The per-game frames are collected in a list and concatenated once at the end;
# calling pd.concat inside the loop re-copies all previously loaded rows each time.
pbp_data = pd.DataFrame()
game_frames = []
bad_games = []
last_reported = None

for i, game_id in enumerate(game_IDs):

  # Report progress once per even percentage point, not on every game
  percent = round(i / len(game_IDs) * 100)
  if percent % 2 == 0 and percent != last_reported:
    print(f'{percent}% complete ({i}/{len(game_IDs)})')
    last_reported = percent

  try:
    game_frames.append(getPBPdf(game_id))

  # Catch Exception instead of using a bare except, so Ctrl-C (KeyboardInterrupt)
  # can still stop this long-running download. Failed games are recorded and skipped.
  except Exception as err:
    print(f'Skipping game {game_id}: {err!r}')
    bad_games.append(game_id)

# pd.concat raises on an empty list, so keep the empty frame when nothing loaded
if game_frames:
  pbp_data = pd.concat(game_frames, ignore_index=True)

print(f'Found {len(bad_games)} games with issues...')

In [None]:
pbp_data.to_csv('20230130 NBA PBP Data 22-23.csv', index=False)

In [None]:
len(pbp_data.groupby('GAME_ID'))

In [None]:
df_check = getPBPdf('0022200753')

In [None]:
pbp_data[(pbp_data['EVENTMSGTYPE'] == 3) &
        (pbp_data['EVENTMSGACTIONTYPE'].isin([10,12,15])) &
        (pbp_data['PLAYER1_TEAM_ID'] == pbp_data['PLAYER1_TEAM_ID_1']) &
        (pbp_data['EVENTMSGTYPE_1'] == 4)
        ]

In [None]:
pbp_data.iloc[103:106][['PLAYER1_ID','PLAYER1_ID_1']]

In [None]:
# Row indices of last free throws (1 of 1, 2 of 2, 3 of 3) that are followed by
# a rebound whose PLAYER1_TEAM_ID matches the shooter's team
orb_22_23 = pbp_data[(pbp_data['EVENTMSGTYPE'] == 3) &
        (pbp_data['EVENTMSGACTIONTYPE'].isin([10,12,15])) &
        (pbp_data['PLAYER1_TEAM_ID'] == pbp_data['PLAYER1_TEAM_ID_1']) &
        (pbp_data['EVENTMSGTYPE_1'] == 4)
        ].index.to_list()

In [None]:
# Row indices of last free throws that the shooter rebounded themselves.
# NOTE(review): PLAYER1_ID_1 is only assigned in a later cell of this notebook;
# that cell has to be run before this one - confirm the intended cell order.
orb_solo_22_23 = pbp_data[(pbp_data['EVENTMSGTYPE'] == 3) &
        (pbp_data['EVENTMSGACTIONTYPE'].isin([10,12,15])) &              # last FT attempt
        (pbp_data['EVENTMSGTYPE_1'] == 4) &                              # next play is a rebound
        (pbp_data['PLAYER1_ID'] == pbp_data['PLAYER1_ID_1'])          # next play is by the same player
        ].index.to_list()

In [None]:
def pointsPerPossession(possessions, df):
  """Average points scored on the possessions that start at the given rows.

  `possessions` is a sequence of row positions in `df`; each one starts a
  window that runs up to and including the next row flagged POSSESSION_END,
  or to the end of the frame when no such row follows. `df` must have a
  default 0..n-1 index (row labels are used as positions) and the columns
  POSSESSION_END and SCORE_CHANGE.

  Returns [points per possession, number of possessions]. With an empty
  `possessions` the result is [nan, 0] rather than a ZeroDivisionError.
  """
  num_poss = len(possessions)
  if num_poss == 0:
    return [float('nan'), 0]

  # Get the plays where possession changed
  possessionChanges = np.array(df.index[df['POSSESSION_END'] == True].tolist(), dtype=int)

  # Sum the score changes inside each possession window
  points = 0
  for play in possessions:
    laterEnds = possessionChanges[possessionChanges > play]
    end = laterEnds.min() + 1 if laterEnds.size else len(df)
    points += df.iloc[play:end]['SCORE_CHANGE'].sum()

  return [points / num_poss, num_poss]

# Points per possession is the total number of points divided by the number of possessions
ppp_missedFT = pointsPerPossession(orb_solo_22_23, pbp_data)
print(f'Points Per Possession: {ppp_missedFT[0]: .2f} (on {ppp_missedFT[1]} possessions)') 

In [None]:
# Look-ahead columns for the concatenated season frame.
# Shift within each game: a plain shift(-1) would pair the last play of one game
# with the first play of the next game stored below it.
next_play = pbp_data.groupby('GAME_ID')[['PLAYER1_ID', 'PCTIME_SECONDS']].shift(-1)
pbp_data['PLAYER1_ID_1'] = next_play['PLAYER1_ID']
pbp_data['PCTIME_SECONDS_1'] = next_play['PCTIME_SECONDS']

In [None]:
len(orb_solo_22_23)

In [None]:
def eventDescription(df, INDEX=False, EVENTNUM=False, URL=True):
  """Return the text description of one play in a play-by-play DataFrame.

  The play is selected either by its row position (`INDEX`) or by its
  NBA-provided `EVENTNUM`; when both are given EVENTNUM wins. With EVENTNUM the
  first matching row of `df` is used, and its index label is then used as a row
  position, so `df` is expected to have a default 0..n-1 index.

  The description is the first string found in HOMEDESCRIPTION,
  NEUTRALDESCRIPTION, VISITORDESCRIPTION, or 'No Description' if none is set.
  With URL=True (the default) it is returned percent-encoded for use in a URL,
  otherwise as readable text.

  Prints 'No location entered' and returns None when neither INDEX nor
  EVENTNUM is supplied. Raises IndexError when EVENTNUM matches no row.
  """
  # Imported here because `import urllib` alone does not load the parse submodule
  from urllib.parse import quote

  # Compare against the False/None sentinels explicitly: 0 is a valid EVENTNUM
  # and a valid row position, but is falsy
  if EVENTNUM is not False and EVENTNUM is not None:
    matches = df.index[df['EVENTNUM'] == int(EVENTNUM)].tolist()
    if not matches:
      raise IndexError(f'No play with EVENTNUM {EVENTNUM} in the DataFrame')
    INDEX = matches[0]

  if INDEX is False or INDEX is None:
    print('No location entered')
    return None

  # Get all three possible description locations and find the first that is set.
  # Testing for str skips both None and NaN placeholders.
  descRows = ['HOMEDESCRIPTION','NEUTRALDESCRIPTION','VISITORDESCRIPTION']
  descValues = df.iloc[INDEX][descRows].tolist()
  descEvent = next((item for item in descValues if isinstance(item, str)), 'No Description')

  # Optionally return a readable description instead of a URL-parsed version
  return quote(descEvent) if URL else descEvent



def getEventVidPage(df, loc, season):
  """Build the NBA.com video page URL for the play at row `loc` of `df`.

  `season` is entered manually as a string of the season years in the form
  'YYYY-YY' (e.g. '2022-23'). The play's EVENTNUM and GAME_ID are read from
  the row, and its description from eventDescription.
  """
  play = df.iloc[loc]
  eventAndGame = (play['EVENTNUM'], play['GAME_ID'])

  # The description turned out not to be required for the URL to work, but is kept as the title
  title = eventDescription(df, loc)

  template = 'https://www.nba.com/stats/events?CFID=&CFPARAMS=&GameEventID={}&GameID={}&Season={}&flag=1&title={}'
  return template.format(*eventAndGame, season, title)


In [None]:

season = '2022-23'
loc = 403001
getEventVidPage(pbp_data, loc, season)

In [None]:
pbp_data.iloc[orb_solo_22_23]

In [None]:
def getEventVidURL(df, loc, resolution='LARGE', timeout=30):
  """Return the direct video URL for the play at row `loc` of `df`.

  The play's EVENTNUM and GAME_ID are read from the row and sent to the
  stats.nba.com videoeventsasset endpoint.

  resolution: 'SMALL', 'MEDIUM' or 'LARGE' (case-insensitive).
  timeout: seconds to wait for the server before giving up.

  Raises KeyError for an unknown resolution (before any request is made),
  requests.HTTPError for a non-success HTTP status, and IndexError when the
  response lists no videos for the play.
  """
  vidRes = {
    'SMALL': 'surl',
    'MEDIUM': 'murl',
    'LARGE': 'lurl'
  }

  # Validate the cheap argument first so a typo does not cost a network round trip
  resolutionKey = vidRes[str(resolution).upper()]

  event_id = df.iloc[loc]['EVENTNUM']
  game_id = df.iloc[loc]['GAME_ID']

  headers = {
    'Host': 'stats.nba.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true',
    'Connection': 'keep-alive',
    'Referer': 'https://stats.nba.com/',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
  }

  vidURL = 'https://stats.nba.com/stats/videoeventsasset?GameEventID={}&GameID={}'.format(
    event_id, 
    game_id)

  # Without a timeout requests waits indefinitely on an unresponsive server
  r = requests.get(vidURL, headers=headers, timeout=timeout)
  # Fail on an HTTP error status instead of trying to parse an error page as JSON
  r.raise_for_status()

  payload = r.json()
  videoUrls = payload['resultSets']['Meta']['videoUrls']
  if not videoUrls:
    raise IndexError(f'No video listed for event {event_id} of game {game_id}')

  return videoUrls[0][resolutionKey]

In [None]:
getEventVidURL(pbp_data, 403001)

In [None]:
def save_NBA_vid(df, loc, filename):
  """Download the video of the play at row `loc` of `df` and save it to `filename`.

  The video URL is obtained from getEventVidURL (default resolution). The
  file is written in binary mode; an existing file is overwritten.
  """
  # `import urllib` alone does not load the request submodule, so import it explicitly
  import urllib.request

  url = getEventVidURL(df, loc)

  # Download from URL
  with urllib.request.urlopen(url) as response:
      content = response.read()

  # Save to file
  with open(filename, 'wb') as download:
      download.write(content)

In [None]:
for i, loc in enumerate(orb_solo_22_23):

  filename = f'NBA_VID/FT_REB_{i} - {pbp_data.iloc[loc]["GAME_ID"]}_{pbp_data.iloc[loc]["EVENTNUM"]}.mp4'
  print(filename)

In [None]:
game_teams = [game['TEAM_NAME'] for game in games]

In [None]:
gamesDF = pd.DataFrame(games)

In [None]:
gamesDF

In [118]:
%reset

Nothing done.
