# Missed Second Rebounds

For poor free throw shooters, is it a better strategy to *intentionally* miss the second free throw and aim for the offensive rebound and new possession? Analysis of play-by-play data using [nba_api](https://github.com/swar/nba_api).

In [1]:
import numpy as np
import pandas as pd
import urllib
import requests

from nba_api.stats.endpoints import playbyplayv2
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamelog

## Getting Data

In [129]:
# Parameters for the league game-log request: 2021-22 regular season,
# sorted by DATE in ascending direction.
gamelog_params = {
    'counter': 0,
    'direction': 'ASC',
    'league_id': '00',
    'player_or_team_abbreviation': 'T',
    'season': '2021-22',
    'season_type_all_star': 'Regular Season',
    'sorter': 'DATE',
}
gamelog = leaguegamelog.LeagueGameLog(**gamelog_params)

In [130]:
# Rows of the game log; each row carries a GAME_ID, GAME_DATE and MATCHUP.
games_dict = gamelog.get_normalized_dict()['LeagueGameLog']
game_IDs_raw = [(game['GAME_ID'], game['GAME_DATE'], game['MATCHUP']) for game in games_dict]

# The same GAME_ID can appear on more than one log row, so de-duplicate.
# dict.fromkeys keeps the order of first appearance, whereas set() ordering
# changes between runs and would make the download order (and bad_games)
# irreproducible.
game_IDs = list(dict.fromkeys(game_id for game_id, _, _ in game_IDs_raw))

# Lookup tables from GAME_ID to date / matchup; when an ID repeats, the last
# row seen wins.
game_IDs_dict = {game_id: game_date for game_id, game_date, _ in game_IDs_raw}
game_matchups = {game_id: matchup for game_id, _, matchup in game_IDs_raw}

In [5]:
#Convert the game clock time to seconds and re-order
def pbpNewRows1(df):
  """Sort one game's play-by-play and add clock, score and look-ahead columns.

  Adds ``PCTIME_SECONDS`` (period clock in seconds), orders the rows by
  period, descending clock and event number, forward-fills ``SCORE``, and
  adds ``*_1`` columns holding the NEXT row's event type, action type, clock,
  team and player, plus ``SCORE_1`` holding the PREVIOUS row's score.
  ``GAME_DATE`` and ``MATCHUP`` are looked up in the module-level
  ``game_IDs_dict`` / ``game_matchups`` tables.

  Note: ``PCTIME_SECONDS`` is also written onto the frame passed in; all other
  columns exist only on the returned (sorted, re-indexed) frame.

  Raises:
    KeyError: if a ``GAME_ID`` is missing from the lookup tables.
  """
  def clock_to_seconds(clock):
    # "MM:SS" -> seconds; split once instead of once per component
    parts = clock.split(":")
    return int(parts[0])*60 + int(parts[1])

  df['PCTIME_SECONDS'] = df['PCTIMESTRING'].map(clock_to_seconds)
  df = df.sort_values(['PERIOD','PCTIME_SECONDS','EVENTNUM'], ascending=[True,False,True])
  df = df.reset_index(drop=True)

  #Update the SCORE column to fill in blanks
  df.at[0,"SCORE"] = "0 - 0"
  # Series.ffill() replaces the deprecated fillna(method="ffill")
  df["SCORE"] = df["SCORE"].ffill()

  #New columns for possession formulae: values taken from the following row
  df['EVENTMSGTYPE_1'] = df['EVENTMSGTYPE'].shift(-1)
  df['EVENTMSGACTIONTYPE_1'] = df['EVENTMSGACTIONTYPE'].shift(-1)
  df['PCTIME_SECONDS_1'] = df['PCTIME_SECONDS'].shift(-1)
  df['PLAYER1_TEAM_ID_1'] = df['PLAYER1_TEAM_ID'].shift(-1)
  df['PLAYER1_ID_1'] = df['PLAYER1_ID'].shift(-1)

  # Score before this row (shifted the other way), seeded for the first row
  df['SCORE_1'] = df['SCORE'].shift(1)
  df.at[0,'SCORE_1'] = "0 - 0"

  #Adding game date and matchup (column-wise lookup; still raises KeyError
  #for an unknown GAME_ID, like the original row-wise apply)
  df['GAME_DATE'] = df['GAME_ID'].map(lambda game_id: game_IDs_dict[game_id])
  df['MATCHUP'] = df['GAME_ID'].map(lambda game_id: game_matchups[game_id])

  return df

def possEndFG(loc, df):
    """Return True when the field-goal event at position ``loc`` ends the possession.

    The possession continues (False) only when the following event is a foul
    (``EVENTMSGTYPE_1 == 6``) recorded at the same clock second, which the
    notebook treats as a shooting foul on the attempt.
    """
    play = df.iloc[loc]

    # Anything other than a foul on the next row ends the possession
    if play['EVENTMSGTYPE_1'] != 6:
        return True

    # A foul at a different clock time is unrelated to this shot
    return bool(play['PCTIME_SECONDS'] != play['PCTIME_SECONDS_1'])
    
def possEndRebound(loc, df):
    """Return True when the event after position ``loc`` belongs to a different team.

    Used for misses: if the next row (the rebound) has the same
    ``PLAYER1_TEAM_ID`` as this row it is an offensive rebound and the
    possession continues (False).
    """
    play = df.iloc[loc]
    same_team_next = play['PLAYER1_TEAM_ID'] == play['PLAYER1_TEAM_ID_1']
    return not same_team_next

def possEndFT(loc, df):
    """Return True when the free throw at position ``loc`` ends the possession.

    Only the last attempt of a trip can end it (action type 10 is 1st of 1;
    12 is 2nd of 2; 15 is 3rd of 3). If that attempt is followed by a rebound
    (``EVENTMSGTYPE_1 == 4``) the answer depends on who rebounded it;
    otherwise the possession ends.
    """
    play = df.iloc[loc]

    # Not the final attempt of the trip: more free throws follow
    if play['EVENTMSGACTIONTYPE'] not in (10, 12, 15):
        return False

    # Final attempt with no rebound on the next row
    if play['EVENTMSGTYPE_1'] != 4:
        return True

    # Final attempt followed by a rebound: offensive rebound keeps possession
    return possEndRebound(loc, df)

# Dispatch table keyed by the string form of the relevant EVENTMSGTYPE
# ('1' - FG make, '2' - FG miss, '3' - FT attempt), in that order
possOutcomesDict = {
    str(event_type): handler
    for event_type, handler in enumerate((possEndFG, possEndRebound, possEndFT), start=1)
}


def possEndCheck(loc, df):
    """Return True when the event at position ``loc`` ends the current possession.

    Event types 5 and 13 always end it; types 1, 2 and 3 are delegated to the
    matching handler in ``possOutcomesDict``; every other type does not.
    """
    event_type = df.iloc[loc]['EVENTMSGTYPE']

    # NOTE(review): 5 / 13 are presumably turnover / end of period - confirm
    if event_type in [5,13]:
        return True

    if event_type not in [1,2,3]:
        return False

    handler = possOutcomesDict[str(event_type)]
    return handler(loc, df)


def pbpNewRows2(df):
    """Add possession and scoring columns to a prepared play-by-play frame.

    Expects the columns produced by ``pbpNewRows1``. Adds, in place:

    * ``POSSESSION_END`` - result of ``possEndCheck`` for every row.
    * ``POSSESSION_TEAM_ABBREVIATION`` - team credited with the possession on
      each row, starting from the team that received the opening tip and
      flipping to the other team after every row flagged ``POSSESSION_END``.
    * ``SCORE_CHANGE`` - points scored on the row (change from ``SCORE_1`` to
      ``SCORE``).

    Returns the same frame.
    """
    # Add new column for possession end True / False
    df['POSSESSION_END'] = [possEndCheck(loc, df) for loc in range(len(df))]

    # Gets team that wins the tip - index 0 is the start of game play
    # Index 1 is the jump ball row and Player 3 is who it gets tipped to
    currentTeam = df.iloc[1]["PLAYER3_TEAM_ABBREVIATION"]

    # Get the abbreviations of the two teams (dropna removes None and NaN)
    teamNames = df["PLAYER1_TEAM_ABBREVIATION"].dropna().unique().tolist()

    # Map each team to its opponent so a change of possession always hands the
    # ball to the team that did NOT just have it. The previous True/False
    # toggle always switched to teamNames[0] first, which was wrong whenever
    # teamNames[0] had won the tip.
    otherTeam = {teamNames[0]: teamNames[1], teamNames[1]: teamNames[0]}

    teamInPoss = []
    for possessionEnded in df["POSSESSION_END"]:

        teamInPoss.append(currentTeam)

        # If POSSESSION_END == True, switch the team in possession for the next play
        if possessionEnded:
            # If the tip winner could not be read (missing value), fall back
            # to the first listed team, as the original alternation did.
            currentTeam = otherTeam.get(currentTeam, teamNames[0])

    df["POSSESSION_TEAM_ABBREVIATION"] = teamInPoss

    # Calculates the change in score between two plays - they don't have to be sequential
    def eventScore(before,after):

        start_score = str(before).split(" - ")
        end_score = str(after).split(" - ")

        diff_score = [int(x)-int(y) for x, y in zip(end_score, start_score)]

        # Only one side scores on a single event, so the larger delta is the
        # number of points scored
        return max(diff_score)

    df['SCORE_CHANGE'] = [eventScore(before, after)
                          for before, after in zip(df['SCORE_1'], df['SCORE'])]

    return df

In [6]:
def getPBPdf(game_ID):
  """Fetch the play-by-play for ``game_ID`` and return it fully processed.

  The first data frame returned by the PlayByPlayV2 endpoint is passed through
  ``pbpNewRows1`` and then ``pbpNewRows2``, which add the possession info.
  """
  raw_pbp = playbyplayv2.PlayByPlayV2(game_ID).get_data_frames()[0]
  return pbpNewRows2(pbpNewRows1(raw_pbp))

In [48]:
# Download and process every game, collecting the per-game frames and
# concatenating once at the end (concatenating inside the loop re-copies all
# previously downloaded rows on every iteration).
game_frames = []
bad_games = []
bad_game_errors = {}   # game_id -> repr of the exception, for diagnosis
total_games = len(game_IDs)

for i, game_id in enumerate(game_IDs):

  percent = round(i / total_games * 100,1)
  if i % 20 == 0:
    print(f'{percent}% complete ({i} / {total_games})')

  try:
    # game_IDs holds plain ID strings, so pass the whole ID; indexing with
    # [0] would send only its first character.
    game_frames.append(getPBPdf(game_id))

  # Exception (not a bare except) so Ctrl-C can still stop the long download
  except Exception as err:
    bad_games.append(game_id)
    bad_game_errors[game_id] = repr(err)

# pd.concat raises on an empty list, so fall back to an empty frame
pbp_data = pd.concat(game_frames, ignore_index=True) if game_frames else pd.DataFrame()

print(f'Found {len(bad_games)} games with issues...')

0.0% complete (0/2118)
0.9% complete (20/2118)
1.9% complete (40/2118)
2.8% complete (60/2118)
3.8% complete (80/2118)
4.7% complete (100/2118)
5.7% complete (120/2118)
6.6% complete (140/2118)
7.6% complete (160/2118)
8.5% complete (180/2118)
9.4% complete (200/2118)
10.4% complete (220/2118)
11.3% complete (240/2118)
12.3% complete (260/2118)
13.2% complete (280/2118)
14.2% complete (300/2118)
15.1% complete (320/2118)
16.1% complete (340/2118)
17.0% complete (360/2118)
17.9% complete (380/2118)
18.9% complete (400/2118)
19.8% complete (420/2118)
20.8% complete (440/2118)
21.7% complete (460/2118)
22.7% complete (480/2118)
23.6% complete (500/2118)
24.6% complete (520/2118)
25.5% complete (540/2118)
26.4% complete (560/2118)
27.4% complete (580/2118)
28.3% complete (600/2118)
29.3% complete (620/2118)
30.2% complete (640/2118)
31.2% complete (660/2118)
32.1% complete (680/2118)
33.1% complete (700/2118)
34.0% complete (720/2118)
34.9% complete (740/2118)
35.9% complete (760/2118)
36.

In [49]:
# Save the downloaded play-by-play (without the index column).
# NOTE(review): the file name says "19-20" while the game log above requests
# season '2021-22', and the analysis reloads 'NBA_PBP_21_22.csv' - confirm
# which file this cell was meant to write.
pbp_data.to_csv('20230201 NBA PBP Data 19-20.csv', index=False)

In [128]:
#pbp_data.to_csv('20230130 NBA PBP Data 22-23.csv', index=False)

## Analysis

In [22]:
# Re-load data from CSV
# GAME_ID is read as text and left-padded with zeros to ten characters. This
# restores the leading zeros that are lost when the IDs are parsed as numbers,
# gives the same result as prefixing '00' to an eight-digit value, and avoids
# a slow row-by-row apply.
pbp_data = pd.read_csv('NBA_PBP_21_22.csv', dtype={'GAME_ID': str})
pbp_data['GAME_ID'] = pbp_data['GAME_ID'].str.zfill(10)

# Below rows are no longer required––leftover from cleaning up the first pass
#pbp_data['GAME_DATE'] = pbp_data.apply(lambda row: game_IDs_dict[row['GAME_ID']], axis=1)
#pbp_data['MATCHUP'] = pbp_data.apply(lambda row: game_matchups[row['GAME_ID']], axis=1)

In [23]:
# Row labels of final free-throw attempts that are immediately followed by a
# rebound credited to the same player who shot the free throw
is_free_throw = pbp_data['EVENTMSGTYPE'] == 3
is_last_attempt = pbp_data['EVENTMSGACTIONTYPE'].isin([10,12,15])          # last FT attempt
rebound_follows = pbp_data['EVENTMSGTYPE_1'] == 4                          # next play is a rebound
same_player_next = pbp_data['PLAYER1_ID'] == pbp_data['PLAYER1_ID_1']      # next play is by the same player

orb_solo_mask = is_free_throw & is_last_attempt & rebound_follows & same_player_next
orb_solo = pbp_data[orb_solo_mask].index.to_list()

In [26]:
# For each free throw in orb_solo, look at the rebound row right after it and
# keep that row when the rebounder's next event is their own field-goal
# attempt (make or miss)
orb_solo_shot = []
for loc in orb_solo:
  rebound = pbp_data.iloc[loc + 1]
  shot_follows = rebound['EVENTMSGTYPE_1'] in [1,2]
  if shot_follows and rebound['PLAYER1_ID'] == rebound['PLAYER1_ID_1']:
    orb_solo_shot.append(loc + 1)

In [27]:
# Number of self-rebounded final free throws, then how many of those rebounds
# were followed by a shot from the same player
print(len(orb_solo))
print(len(orb_solo_shot))

107
44


In [143]:
def pointsPerPossession(possessions, df):
  """Average the points scored from each given play to the end of its possession.

  Args:
    possessions: positions (as used with ``df.iloc``) of the plays that start
      the stretches of interest, e.g. the entries of ``orb_solo``.
    df: play-by-play frame with ``POSSESSION_END`` and ``SCORE_CHANGE`` columns.

  Returns:
    ``[points_per_possession, number_of_possessions]``. For every start the
    ``SCORE_CHANGE`` values are summed from that row up to and including the
    first later row flagged ``POSSESSION_END``. If no later row is flagged the
    sum runs to the end of ``df``. An empty ``possessions`` yields
    ``[nan, 0]`` instead of dividing by zero.
  """
  starts = np.asarray(list(possessions), dtype=int)
  num_poss = len(starts)
  if num_poss == 0:
    return [float('nan'), 0]

  # Positions of the plays where possession changed (already ascending)
  possessionChanges = np.flatnonzero(df['POSSESSION_END'].eq(True).to_numpy())
  score_change = df['SCORE_CHANGE'].to_numpy()

  # side='right' finds, for each start, the first change strictly after it
  next_change = np.searchsorted(possessionChanges, starts, side='right')

  points = 0
  for start, k in zip(starts, next_change):
    # +1 so the row that ends the possession is included in the slice
    stop = possessionChanges[k] + 1 if k < len(possessionChanges) else len(df)
    points += score_change[start:stop].sum()

  return [float(points) / num_poss, num_poss]

# Points per possession is the total number of points divided by the number of possessions.
# Here the possessions are those that start at the self-rebounded free throws in orb_solo.
ppp_missedFT = pointsPerPossession(orb_solo, pbp_data)
print(f'Points Per Possession: {ppp_missedFT[0]: .2f} (on {ppp_missedFT[1]} possessions)') 

Points Per Possession:  1.10 (on 107 possessions)


In [29]:
def getEventVidURL(df, loc, resolution='LARGE', timeout=30):
  """Return the video URL for the play at position ``loc`` of ``df``.

  Args:
    df: play-by-play frame with ``EVENTNUM`` and ``GAME_ID`` columns.
    loc: row position (as used with ``df.iloc``) of the play.
    resolution: 'SMALL', 'MEDIUM' or 'LARGE' (case-insensitive).
    timeout: seconds to wait for stats.nba.com before giving up, so a stalled
      request cannot hang the notebook.

  Raises:
    KeyError: unknown ``resolution``.
    requests.HTTPError: the server answered with an error status.
    IndexError: the response lists no video for this event.
  """
  event_id = df.iloc[loc]['EVENTNUM']
  game_id = df.iloc[loc]['GAME_ID']

  headers = {
    'Host': 'stats.nba.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true',
    'Connection': 'keep-alive',
    'Referer': 'https://stats.nba.com/',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
  }

  # Response field that holds the URL for each resolution
  vidRes = {
    'SMALL': 'surl',
    'MEDIUM': 'murl',
    'LARGE': 'lurl'
  }

  # Resolve the resolution first so a bad value fails before any network call
  url_field = vidRes[str(resolution).upper()]

  vidURL = 'https://stats.nba.com/stats/videoeventsasset?GameEventID={}&GameID={}'.format(
    event_id,
    game_id)

  r = requests.get(vidURL, headers=headers, timeout=timeout)
  r.raise_for_status()
  payload = r.json()
  videoUrls = payload['resultSets']['Meta']['videoUrls']

  if not videoUrls:
    # Same exception type as indexing the empty list, with a usable message
    raise IndexError(f'No video listed for GameEventID={event_id}, GameID={game_id}')

  return videoUrls[0][url_field]

In [136]:
# Exploratory lookup of a single event's video URL.
# NOTE: orb_solo has 107 entries (see the count printed above), so valid
# positions are 0-106; index 107 raises the IndexError recorded below.
# `season` is assigned here but not used by the call that follows.
season = '2021-22'
loc = orb_solo[107]
getEventVidURL(pbp_data, loc)

IndexError: list index out of range

In [137]:
# Hand-maintained lists of positions within orb_solo (not row positions in
# pbp_data); no_tip is used below to index orb_solo.
# NOTE(review): presumably the result of reviewing the videos by hand -
# no_tip = rebound was not a tip, no_vid = no video available,
# check_locs = needs another look; confirm the intended meaning.
no_tip = [3,7,8,11,12,14,15,17,21,24,26,29,33,40,41,46,47,48,49,54,57,59,63,65,66,68,69,71,72,73,75,76,86,87,93,94,95,97,103,104,105]
no_vid = [4,9,35,52,61,64,78,89]
check_locs = [51,88]

In [139]:
# Translate the positions within orb_solo into the corresponding pbp_data row labels
no_tip_locs = [orb_solo[num] for num in no_tip]

In [144]:
# Same filter as for orb_solo_shot, restricted to the no_tip subset: keep the
# rebound row when the rebounder's next event is their own field-goal attempt
no_tip_solo = []
for loc in no_tip_locs:
  rebound_row = pbp_data.iloc[loc + 1]
  next_is_shot = rebound_row['EVENTMSGTYPE_1'] in [1,2]
  if next_is_shot and rebound_row['PLAYER1_ID'] == rebound_row['PLAYER1_ID_1']:
    no_tip_solo.append(loc + 1)

In [146]:
# How many of the no_tip rebounds were followed by the rebounder's own shot
len(no_tip_solo)

25

In [147]:
# Points per possession for the no_tip subset.
# NOTE(review): the recorded output reports 25 possessions, which matches
# len(no_tip_solo) rather than the 41 entries of no_tip / no_tip_locs -
# confirm which list this cell was last run with.
pointsPerPossession(no_tip_locs, pbp_data)

[1.12, 25]

In [15]:
def checkGame(df, loc=False, event_id=False, game_id=False):
  """Print the matchup, date, game ID, event ID and position of one play.

  The play is chosen either by ``loc`` (a row position as used with
  ``df.iloc``; 0 is valid) or by ``event_id`` together with ``game_id``
  (compared as ``int(event_id)`` against ``EVENTNUM`` and ``str(game_id)``
  against ``GAME_ID``, so ``game_id`` must be given in the same textual form
  as the column). When ``event_id`` is given it takes precedence over ``loc``.
  With neither given the function returns without printing.

  Raises:
    ValueError: no row matches ``event_id`` / ``game_id``.
  """
  def _given(value):
    # False is the "not supplied" default; 0 must still count as supplied
    return value is not False and value is not None

  if _given(event_id):
    is_match = (df['GAME_ID'] == str(game_id)) & (df['EVENTNUM'] == int(event_id))
    matches = np.flatnonzero(is_match.to_numpy())
    if len(matches) == 0:
      raise ValueError(f'No play with EVENTNUM {event_id} in game {game_id}')
    # Use the positional index of the first match, since iloc is used below
    loc = int(matches[0])
  elif not _given(loc):
    return

  game_id = df.iloc[loc]['GAME_ID']
  event_id = df.iloc[loc]['EVENTNUM']
  game_date = df.iloc[loc]['GAME_DATE']
  matchup = df.iloc[loc]['MATCHUP']

  print(f'{matchup} - {game_date}')
  print(f'GAME_ID: {game_id}')
  print(f'EVENT_ID: {event_id}')
  print(f'LOC: {loc}')


In [17]:
# Look up a play by row position; per the recorded IndexError below, 571976
# is past the last row of the pbp_data loaded at the time.
checkGame(pbp_data, 571976)

IndexError: single positional indexer is out-of-bounds

In [128]:
# Number of distinct games in the loaded data (the name was garbled as
# `datapbp_data`, which is not defined anywhere)
pbp_data['GAME_ID'].unique().size

1230

In [20]:
def save_NBA_vid(df, loc, filename):
  """Download the video of the play at position ``loc`` and save it as ``filename``.

  The URL comes from ``getEventVidURL(df, loc)``. The whole file is read
  before anything is written, so a failed download leaves no partial file.
  Prints a confirmation line when done.
  """
  # Imported explicitly: a plain `import urllib` does not guarantee that the
  # `urllib.request` submodule has been loaded.
  import urllib.request

  url = getEventVidURL(df, loc)
  save_as = filename

  # Download from URL (with a timeout so a stalled connection cannot hang)
  with urllib.request.urlopen(url, timeout=60) as file:
    content = file.read()

  # Save to file
  with open(save_as, 'wb') as download:
    download.write(content)

  print(f'Saved {filename}')

In [9]:
# Request the video URL for every play in orb_solo; the returned URLs are
# discarded, so this only exercises the requests. The recorded run was
# stopped by hand (KeyboardInterrupt below).
for loc in orb_solo:
  #print(loc)
  getEventVidURL(pbp_data, loc)

KeyboardInterrupt: 

In [21]:
# Row positions in pbp_data of plays chosen as good examples to download.
# NOTE(review): this list has 13 entries, but the recorded download output
# below shows 15 files (including 55577 and 198980) - the list appears to
# have been edited after that run; confirm.
good_example_locs = [15098, 5736, 38342, 93464, 152107, 170134, 203556, 206561, 235851, 236411, 253552, 315144, 317232]

In [39]:
# Download each example play to NBA_VID/, naming the file after its position
# in the list, its GAME_ID and its EVENTNUM, then echo the row position.
# (The commented-out lines collected the URLs instead of downloading.)
#urls = []
for i, loc in enumerate(good_example_locs):

  filename = f'NBA_VID/FT_REB_{i} - {pbp_data.iloc[loc]["GAME_ID"]}_{pbp_data.iloc[loc]["EVENTNUM"]}.mp4'
  save_NBA_vid(pbp_data, loc, filename)
  #urls.append(getEventVidURL(pbp_data, loc))
  print(loc)

Saved NBA_VID/FT_REB_0 - 0022200037_100.mp4
15098
Saved NBA_VID/FT_REB_1 - 0022200333_238.mp4
5736
Saved NBA_VID/FT_REB_2 - 0022200388_639.mp4
38342
Saved NBA_VID/FT_REB_3 - 0022200669_322.mp4
55577
Saved NBA_VID/FT_REB_4 - 0022200542_655.mp4
93464
Saved NBA_VID/FT_REB_5 - 0022200312_410.mp4
152107
Saved NBA_VID/FT_REB_6 - 0022200516_617.mp4
170134
Saved NBA_VID/FT_REB_7 - 0022200464_15.mp4
198980
Saved NBA_VID/FT_REB_8 - 0022200116_568.mp4
203556
Saved NBA_VID/FT_REB_9 - 0022200241_110.mp4
206561
Saved NBA_VID/FT_REB_10 - 0022200512_617.mp4
235851
Saved NBA_VID/FT_REB_11 - 0022200552_701.mp4
236411
Saved NBA_VID/FT_REB_12 - 0022200663_391.mp4
253552
Saved NBA_VID/FT_REB_13 - 0022200303_122.mp4
315144
Saved NBA_VID/FT_REB_14 - 0022200510_561.mp4
317232


In [23]:
# Display the collected video URLs.
# NOTE(review): `urls` is only assigned in the commented-out lines of the
# download loop above, so those lines must be re-enabled for this to run.
urls

['https://videos.nba.com/nba/pbp/media/2022/10/23/0022200037/100/cd5091ec-fc57-16f6-ec22-94717a8b6b55_1280x720.mp4',
 'https://videos.nba.com/nba/pbp/media/2022/12/02/0022200333/238/de3e9dae-087b-0352-7453-b6f8713f7762_1280x720.mp4',
 'https://videos.nba.com/nba/pbp/media/2022/12/10/0022200388/639/d6b37dbd-3677-d128-51a1-61441b43a49b_1280x720.mp4',
 'https://videos.nba.com/nba/pbp/media/2023/01/18/0022200669/322/8db5ef85-52d9-8c46-84aa-681a92dc4166_1280x720.mp4',
 'https://videos.nba.com/nba/pbp/media/2022/12/31/0022200542/655/0dcc8d9d-ed28-e9b7-5d0b-abeacc476d2a_1280x720.mp4',
 'https://videos.nba.com/nba/pbp/media/2022/11/30/0022200312/410/d7a3d112-918e-ed96-a4d8-17370ff29360_1280x720.mp4',
 'https://videos.nba.com/nba/pbp/media/2022/12/28/0022200516/617/c9e72df0-38d9-c22f-6abd-9e3a0155149e_1280x720.mp4',
 'https://videos.nba.com/nba/pbp/media/2022/12/20/0022200464/15/efa86340-a2c2-08b3-fb83-8cae20c07f8b_1280x720.mp4',
 'https://videos.nba.com/nba/pbp/media/2022/11/02/0022200116/568/

### Fixing Game Data
The original function for getting game data accidentally captured every game twice. There should be 1,230 total games in an NBA regular season. The cells below remove the duplicate rows from the existing saved data instead of downloading it again.

In [3]:
# Check that there are 1,230 unique games in the dataset (counts distinct GAME_ID values)
pbp_data['GAME_ID'].unique().size

1230

In [9]:

# Remove rows that are exact duplicates, then renumber the index: the analysis
# cells take row labels from `.index` and feed them to `.iloc`, which only
# agree when the index is a gap-free 0..n-1 range.
pbp_data = pbp_data.drop_duplicates().reset_index(drop=True)


In [10]:
# Write the de-duplicated data to the file that the Analysis section reloads
pbp_data.to_csv('NBA_PBP_21_22.csv', index=False)