# PlayByPlay data exploration
In this notebook we'll explore, for a single game, the data returned by the `PlayByPlay`, `PlayByPlayV2` and `PlayByPlayV3` endpoints of `nba_api`.

In [2]:
import pandas as pd

In [3]:
# Identifier of the game passed to every PlayByPlay endpoint below.
# Kept as a string so the leading zeros are preserved.
GAME_ID = '0022400630'

In [4]:
from nba_api.stats.endpoints import PlayByPlay, PlayByPlayV2, PlayByPlayV3

# Build one endpoint object per PlayByPlay version, all for the same game,
# so that their outputs can be compared side by side.
pbp, pbpv2, pbpv3 = (
    PlayByPlay(GAME_ID),
    PlayByPlayV2(GAME_ID),
    PlayByPlayV3(GAME_ID),
)

In [5]:
# Summarise the first data frame of the original PlayByPlay endpoint
# (columns, non-null counts and dtypes).
pbp.get_data_frames()[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   GAME_ID             506 non-null    object
 1   EVENTNUM            506 non-null    int64 
 2   EVENTMSGTYPE        506 non-null    int64 
 3   EVENTMSGACTIONTYPE  506 non-null    int64 
 4   PERIOD              506 non-null    int64 
 5   WCTIMESTRING        506 non-null    object
 6   PCTIMESTRING        506 non-null    object
 7   HOMEDESCRIPTION     261 non-null    object
 8   NEUTRALDESCRIPTION  9 non-null      object
 9   VISITORDESCRIPTION  269 non-null    object
 10  SCORE               123 non-null    object
 11  SCOREMARGIN         123 non-null    object
dtypes: int64(4), object(8)
memory usage: 47.6+ KB


In [6]:
# Same summary for the first data frame of the PlayByPlayV2 endpoint.
pbpv2.get_data_frames()[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   GAME_ID                    506 non-null    object 
 1   EVENTNUM                   506 non-null    int64  
 2   EVENTMSGTYPE               506 non-null    int64  
 3   EVENTMSGACTIONTYPE         506 non-null    int64  
 4   PERIOD                     506 non-null    int64  
 5   WCTIMESTRING               506 non-null    object 
 6   PCTIMESTRING               506 non-null    object 
 7   HOMEDESCRIPTION            261 non-null    object 
 8   NEUTRALDESCRIPTION         9 non-null      object 
 9   VISITORDESCRIPTION         269 non-null    object 
 10  SCORE                      123 non-null    object 
 11  SCOREMARGIN                123 non-null    object 
 12  PERSON1TYPE                506 non-null    int64  
 13  PLAYER1_ID                 506 non-null    int64  

In [7]:
# Same summary for the first data frame of the PlayByPlayV3 endpoint.
pbpv3.get_data_frames()[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539 entries, 0 to 538
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   gameId          539 non-null    object
 1   actionNumber    539 non-null    int64 
 2   clock           539 non-null    object
 3   period          539 non-null    int64 
 4   teamId          539 non-null    int64 
 5   teamTricode     539 non-null    object
 6   personId        539 non-null    int64 
 7   playerName      539 non-null    object
 8   playerNameI     539 non-null    object
 9   xLegacy         539 non-null    int64 
 10  yLegacy         539 non-null    int64 
 11  shotDistance    539 non-null    int64 
 12  shotResult      539 non-null    object
 13  isFieldGoal     539 non-null    int64 
 14  scoreHome       539 non-null    object
 15  scoreAway       539 non-null    object
 16  pointsTotal     539 non-null    int64 
 17  location        539 non-null    object
 18  descriptio

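We'll go with the `PlayByPlayV3` endpoint because it seems the most complete and interesting: it returns more rows for this game (539 vs. 506) and includes shot-location fields such as `xLegacy`, `yLegacy` and `shotDistance`.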

In [9]:
# Work with the first data frame returned by the PlayByPlayV3 endpoint.
df = pbpv3.get_data_frames()[0]

# List the available columns before deciding which ones to keep.
df.columns

Index(['gameId', 'actionNumber', 'clock', 'period', 'teamId', 'teamTricode',
       'personId', 'playerName', 'playerNameI', 'xLegacy', 'yLegacy',
       'shotDistance', 'shotResult', 'isFieldGoal', 'scoreHome', 'scoreAway',
       'pointsTotal', 'location', 'description', 'actionType', 'subType',
       'videoAvailable', 'shotValue', 'actionId'],
      dtype='object')

In [10]:
# Columns that are not used in the rest of this exploration.
cols2drop = [
    'gameId', 'actionNumber', 'teamTricode', 'playerName', 'playerNameI',
    'isFieldGoal', 'pointsTotal', 'location', 'videoAvailable', 'actionId'
]

# Derive the trimmed frame from the endpoint output with a non-inplace drop.
# An in-place drop on `df` makes this cell fail with KeyError when it is run a
# second time (the columns are already gone); rebuilding from `pbpv3` keeps the
# cell idempotent while still raising on a misspelled column name.
df = pbpv3.get_data_frames()[0].drop(columns=cols2drop)

In [11]:
# Preview the frame after removing the unused columns.
df

Unnamed: 0,clock,period,teamId,personId,xLegacy,yLegacy,shotDistance,shotResult,scoreHome,scoreAway,description,actionType,subType,shotValue
0,PT12M00.00S,1,0,0,0,0,0,,0,0,Start of 1st Period (7:11 PM EST),period,start,0
1,PT12M00.00S,1,1610612766,1631109,0,0,0,,,,Jump Ball M. Williams vs. Clingan: Tip to Micić,Jump Ball,,0
2,PT11M37.00S,1,1610612766,1631109,-10,31,3,Missed,,,MISS M. Williams 3' Cutting Dunk Shot,Missed Shot,Cutting Dunk Shot,2
3,PT11M37.00S,1,1610612757,1642270,0,0,0,,,,Clingan BLOCK (1 BLK),,,2
4,PT11M34.00S,1,1610612757,1642270,0,0,0,,,,Clingan REBOUND (Off:0 Def:1),Rebound,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,PT00M11.30S,4,1610612766,1630182,0,0,0,,,,Green REBOUND (Off:1 Def:4),Rebound,Unknown,0
535,PT00M10.70S,4,1610612766,1631109,9,4,1,Made,95,100,M. Williams 1' Cutting Dunk Shot (13 PTS) (Gre...,Made Shot,Cutting Dunk Shot,2
536,PT00M03.30S,4,1610612757,1641739,-13,17,2,Made,95,102,Camara 2' Driving Dunk (11 PTS) (Simons 2 AST),Made Shot,Driving Dunk Shot,2
537,PT00M00.00S,4,1610612766,1631109,-15,0,2,Made,97,102,M. Williams 2' Cutting Layup Shot (15 PTS) (Sm...,Made Shot,Cutting Layup Shot,2


In [12]:
# Per-column count of entries that are either missing or an empty string.
# The two conditions cannot both hold for one cell, so counting the combined
# mask gives the same totals as adding the two separate counts.
(df.isna() | df.eq('')).sum()

clock             0
period            0
teamId            0
personId          0
xLegacy           0
yLegacy           0
shotDistance      0
shotResult      375
scoreHome       415
scoreAway       415
description       0
actionType       33
subType          84
shotValue         0
dtype: int64

In [92]:
# The score columns are blank on most rows: turn the blanks into missing
# values, then carry the last known score forward.
for score_col in ('scoreHome', 'scoreAway'):
    df[score_col] = df[score_col].replace('', pd.NA).ffill()

In [94]:
# Inspect the column dtypes before converting them.
# NOTE(review): `shotResult` is absent from the recorded output of this cell, yet
# no cell visible here drops it, and the execution counts are non-sequential —
# confirm the notebook still reproduces with Restart & Run All.
df.dtypes

clock           object
period           int64
teamId           int64
personId         int64
xLegacy          int64
yLegacy          int64
shotDistance     int64
scoreHome       object
scoreAway       object
description     object
actionType      object
subType         object
shotValue        int64
dtype: object

In [95]:
# The clock is stored as duration strings such as 'PT11M37.00S'.
df['clock'] = pd.to_timedelta(df['clock'])

# Ordered downcast plan, applied one column at a time:
#   1. every column that is int64 right now -> smallest unsigned type that fits,
#   2. the two score columns (still object dtype at this point) -> unsigned,
#   3. xLegacy / yLegacy, which contain negative values -> signed.
# The order matters: step 3 runs on whatever step 1 left behind.
downcast_plan = [
    *((int_col, 'unsigned') for int_col in df.select_dtypes(include=['int64']).columns),
    ('scoreHome', 'unsigned'),
    ('scoreAway', 'unsigned'),
    ('xLegacy', 'signed'),
    ('yLegacy', 'signed'),
]
for col, target in downcast_plan:
    df[col] = pd.to_numeric(df[col], downcast=target)

# Move to pandas' nullable extension dtypes and display the outcome.
df = df.convert_dtypes()
df.dtypes

clock           timedelta64[ns]
period                    UInt8
teamId                   UInt32
personId                 UInt32
xLegacy                   Int16
yLegacy                   Int16
shotDistance              UInt8
scoreHome                 UInt8
scoreAway                 UInt8
description      string[python]
actionType       string[python]
subType          string[python]
shotValue                 UInt8
dtype: object

In [96]:
# Preview the frame after the dtype conversions.
df

Unnamed: 0,clock,period,teamId,personId,xLegacy,yLegacy,shotDistance,scoreHome,scoreAway,description,actionType,subType,shotValue
0,0 days 00:12:00,1,0,0,0,0,0,0,0,Start of 1st Period (7:11 PM EST),period,start,0
1,0 days 00:12:00,1,1610612766,1631109,0,0,0,0,0,Jump Ball M. Williams vs. Clingan: Tip to Micić,Jump Ball,,0
2,0 days 00:11:37,1,1610612766,1631109,-10,31,3,0,0,MISS M. Williams 3' Cutting Dunk Shot,Missed Shot,Cutting Dunk Shot,2
3,0 days 00:11:37,1,1610612757,1642270,0,0,0,0,0,Clingan BLOCK (1 BLK),,,2
4,0 days 00:11:34,1,1610612757,1642270,0,0,0,0,0,Clingan REBOUND (Off:0 Def:1),Rebound,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,0 days 00:00:11.300000,4,1610612766,1630182,0,0,0,93,100,Green REBOUND (Off:1 Def:4),Rebound,Unknown,0
535,0 days 00:00:10.700000,4,1610612766,1631109,9,4,1,95,100,M. Williams 1' Cutting Dunk Shot (13 PTS) (Gre...,Made Shot,Cutting Dunk Shot,2
536,0 days 00:00:03.300000,4,1610612757,1641739,-13,17,2,95,102,Camara 2' Driving Dunk (11 PTS) (Simons 2 AST),Made Shot,Driving Dunk Shot,2
537,0 days 00:00:00,4,1610612766,1631109,-15,0,2,97,102,M. Williams 2' Cutting Layup Shot (15 PTS) (Sm...,Made Shot,Cutting Layup Shot,2


In [17]:
# Distinct action types present in this game (includes the empty string).
# NOTE(review): the recorded output reports dtype=object, which looks like it was
# captured before the convert_dtypes step — re-run to refresh it.
df['actionType'].unique()

array(['period', 'Jump Ball', 'Missed Shot', '', 'Rebound', 'Made Shot',
       'Foul', 'Substitution', 'Timeout', 'Turnover', 'Free Throw',
       'Violation', 'Instant Replay'], dtype=object)

In [16]:
# Named action types only: rows with an empty actionType are left out.
action_types = [name for name in df['actionType'].unique() if name != '']

# For each named action type, the distinct subType values seen with it.
# Iterating the subType group-by directly yields (action type, subType series).
action_subtypes = {
    name: subtypes.unique().tolist()
    for name, subtypes in df.groupby('actionType')['subType']
    if name != ''
}

In [66]:
# Show the action type -> subtypes mapping built above.
action_subtypes

{'Foul': ['Personal',
  'Shooting',
  'Offensive Charge',
  'Loose Ball',
  'Offensive',
  'Defense 3 Second',
  'Personal Take'],
 'Free Throw': ['Free Throw 1 of 2',
  'Free Throw 2 of 2',
  'Free Throw 1 of 1',
  'Free Throw Technical'],
 'Instant Replay': ['Coach Challenge Support Ruling'],
 'Jump Ball': [''],
 'Made Shot': ['Pullup Jump shot',
  'Running Layup Shot',
  'Jump Shot',
  'Running Jump Shot',
  'Step Back Jump shot',
  'Running Alley Oop Layup Shot',
  'Running Pull-Up Jump Shot',
  'Driving Finger Roll Layup Shot',
  'Putback Dunk Shot',
  'Driving Layup Shot',
  'Running Dunk Shot',
  'Driving Floating Jump Shot',
  'Turnaround Fadeaway shot',
  'Tip Layup Shot',
  'Dunk Shot',
  'Cutting Dunk Shot',
  'Driving Dunk Shot',
  'Floating Jump shot',
  'Running Finger Roll Layup Shot',
  'Cutting Layup Shot'],
 'Missed Shot': ['Cutting Dunk Shot',
  'Pullup Jump shot',
  'Jump Shot',
  'Turnaround Hook Shot',
  'Driving Floating Bank Jump Shot',
  'Running Layup Shot',
 