# Live PlayByPlay data exploration
In this notebook we'll explore the data obtained from the live `PlayByPlay` endpoint (https://github.com/swar/nba_api/blob/master/docs/nba_api/live/endpoints/playbyplay.md).

In [1]:
import pandas as pd

In [2]:
# Identifier of the game whose play-by-play feed is explored in this notebook.
# NOTE(review): presumably an NBA game id in the format nba_api expects — confirm.
GAME_ID = '0022400630'

In [3]:
from nba_api.live.nba.endpoints import PlayByPlay

# IPython introspection: show the constructor signature and docstring of PlayByPlay.
PlayByPlay?

[31mInit signature:[39m PlayByPlay(game_id, proxy=[38;5;28;01mNone[39;00m, headers=[38;5;28;01mNone[39;00m, timeout=[32m30[39m, get_request=[38;5;28;01mTrue[39;00m)
[31mDocstring:[39m      <no docstring>
[31mFile:[39m           ~/anaconda3/envs/MBAI/lib/python3.13/site-packages/nba_api/live/nba/endpoints/playbyplay.py
[31mType:[39m           type
[31mSubclasses:[39m     

In [4]:
# Create the live play-by-play endpoint object for the selected game.
# NOTE(review): `get_request` defaults to True in the signature shown above, so the
# HTTP request is presumably issued right here — confirm.
pbp = PlayByPlay(GAME_ID)

In [5]:
# Top-level keys of the response payload.
pbp.get_dict().keys()

dict_keys(['meta', 'game'])

In [6]:
# Keys nested under the 'game' entry of the payload.
pbp.get_dict()['game'].keys()

dict_keys(['gameId', 'actions'])

In [7]:
# Build the working frame from the list stored under game -> actions
# (one row per action).
df = pd.DataFrame(data=pbp.get_dict()['game']['actions'])

In [8]:
# List every column produced from the actions payload.
df.columns

Index(['actionNumber', 'clock', 'timeActual', 'period', 'periodType',
       'actionType', 'subType', 'qualifiers', 'personId', 'x', 'y',
       'possession', 'scoreHome', 'scoreAway', 'edited', 'orderNumber',
       'isTargetScoreLastPeriod', 'xLegacy', 'yLegacy', 'isFieldGoal', 'side',
       'description', 'personIdsFilter', 'teamId', 'teamTricode', 'descriptor',
       'jumpBallRecoveredName', 'jumpBallRecoverdPersonId', 'playerName',
       'playerNameI', 'jumpBallWonPlayerName', 'jumpBallWonPersonId',
       'jumpBallLostPlayerName', 'jumpBallLostPersonId', 'area', 'areaDetail',
       'shotDistance', 'shotResult', 'blockPlayerName', 'blockPersonId',
       'shotActionNumber', 'reboundTotal', 'reboundDefensiveTotal',
       'reboundOffensiveTotal', 'pointsTotal', 'assistPlayerNameInitial',
       'assistPersonId', 'assistTotal', 'officialId', 'foulPersonalTotal',
       'foulTechnicalTotal', 'foulDrawnPlayerName', 'foulDrawnPersonId',
       'turnoverTotal', 'stealPlayerName', 's

In [9]:
# Columns removed before the per-column exploration below (judging by their
# names: bookkeeping fields, display names and running totals).
cols2drop = [
    'actionNumber',
    'periodType',
    'edited',
    'orderNumber',
    'isTargetScoreLastPeriod',
    'isFieldGoal',
    'side',
    'personIdsFilter',
    'teamTricode',
    'jumpBallRecoveredName',
    'playerName',
    'playerNameI',
    'jumpBallWonPlayerName',
    'jumpBallLostPlayerName',
    'blockPlayerName',
    'shotActionNumber',
    'reboundTotal',
    'reboundDefensiveTotal',
    'reboundOffensiveTotal',
    'pointsTotal',
    'assistPlayerNameInitial',
    'assistTotal',
    'foulPersonalTotal',
    'foulTechnicalTotal',
    'foulDrawnPlayerName',
    'turnoverTotal',
    'stealPlayerName',
]

# Rebind `df` to the trimmed frame instead of mutating it in place.
df = df.drop(columns=cols2drop)

In [10]:
# Summarise the remaining columns: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   clock                     611 non-null    object 
 1   timeActual                611 non-null    object 
 2   period                    611 non-null    int64  
 3   actionType                611 non-null    object 
 4   subType                   611 non-null    object 
 5   qualifiers                611 non-null    object 
 6   personId                  611 non-null    int64  
 7   x                         164 non-null    float64
 8   y                         164 non-null    float64
 9   possession                611 non-null    int64  
 10  scoreHome                 611 non-null    object 
 11  scoreAway                 611 non-null    object 
 12  xLegacy                   164 non-null    float64
 13  yLegacy                   164 non-null    float64
 14  descriptio

In [11]:
# Peek at the two time columns while they are still raw strings.
df[['clock', 'timeActual']]

Unnamed: 0,clock,timeActual
0,PT12M00.00S,2025-01-25T00:10:48.5Z
1,PT11M57.00S,2025-01-25T00:10:50.5Z
2,PT11M37.00S,2025-01-25T00:11:10.2Z
3,PT11M37.00S,2025-01-25T00:11:10.2Z
4,PT11M34.00S,2025-01-25T00:11:13.2Z
...,...,...
606,PT00M10.70S,2025-01-25T02:30:23.3Z
607,PT00M03.30S,2025-01-25T02:30:35.8Z
608,PT00M00.00S,2025-01-25T02:30:44.7Z
609,PT00M00.00S,2025-01-25T02:30:53.7Z


In [12]:
# Parse the raw strings into proper time dtypes: `clock` becomes a timedelta
# and `timeActual` a datetime.
df = df.assign(
    clock=pd.to_timedelta(df['clock']),
    timeActual=pd.to_datetime(df['timeActual']),
)

In [13]:
# Distinct period numbers present in this game.
df['period'].unique()

array([1, 2, 3, 4])

In [14]:
# The period number is a small non-negative integer, so store it as uint8.
df = df.astype({'period': 'uint8'})

In [15]:
# Distinct action types recorded in the feed.
df['actionType'].unique()

array(['period', 'jumpball', '2pt', 'block', 'rebound', '3pt', 'foul',
       'substitution', 'timeout', 'turnover', 'steal', 'freethrow',
       'violation', 'game'], dtype=object)

In [16]:
# Store the action type with pandas' dedicated string dtype instead of object.
df = df.astype({'actionType': 'string'})

In [17]:
# Map each action type to the list of sub-types observed for it,
# in order of first appearance within that action type.
action_subtypes = {
    action: subtypes.tolist()
    for action, subtypes in df.groupby('actionType')['subType'].unique().items()
}

action_subtypes

{'2pt': ['DUNK', 'Jump Shot', 'Layup', 'Hook'],
 '3pt': ['Jump Shot'],
 'block': [''],
 'foul': ['personal', 'offensive', 'technical'],
 'freethrow': ['1 of 2', '2 of 2', '1 of 1'],
 'game': ['end'],
 'jumpball': ['recovered'],
 'period': ['start', 'end'],
 'rebound': ['defensive', 'offensive'],
 'steal': [''],
 'substitution': ['out', 'in'],
 'timeout': ['full'],
 'turnover': ['out-of-bounds',
  'traveling',
  'lost ball',
  'bad pass',
  'offensive foul',
  'shot clock'],
 'violation': ['kicked ball',
  'defensive goaltending',
  'lane',
  'delay-of-game']}

In [18]:
# Store the sub-type with pandas' dedicated string dtype instead of object.
df = df.astype({'subType': 'string'})

In [19]:
# Compare the descriptive columns side by side to see how they relate to each other.
df[['actionType', 'subType', 'qualifiers', 'description', 'descriptor']]

Unnamed: 0,actionType,subType,qualifiers,description,descriptor
0,period,start,[],Period Start,
1,jumpball,recovered,[],Jump Ball M. Williams vs. D. Clingan: Tip to V...,startperiod
2,2pt,DUNK,[pointsinthepaint],MISS M. Williams cutting DUNK - blocked,cutting
3,block,,[],D. Clingan BLOCK (1 BLK),
4,rebound,defensive,[],D. Clingan REBOUND (Off:0 Def:1),
...,...,...,...,...,...
606,2pt,DUNK,"[pointsinthepaint, 2ndchance]",M. Williams cutting DUNK (13 PTS) (J. Green 2 ...,cutting
607,2pt,DUNK,[pointsinthepaint],T. Camara driving DUNK (11 PTS) (A. Simons 2 AST),driving
608,2pt,Layup,[pointsinthepaint],M. Williams cutting Layup (15 PTS) (N. Smith J...,cutting
609,period,end,[],Period End,


In [20]:
# Collect every distinct tag that appears in any row's `qualifiers` list.
qualifiers_set = {tag for tags in df['qualifiers'] for tag in tags}

qualifiers_set

{'1freethrow',
 '2freethrow',
 '2ndchance',
 'deadball',
 'defensivegoaltending',
 'fastbreak',
 'fromturnover',
 'inpenalty',
 'mandatory',
 'pointsinthepaint',
 'startperiod',
 'team'}

In [21]:
# Discard the list-valued `qualifiers` and the free-text `description` columns.
df = df.drop(columns=['qualifiers', 'description'])

In [22]:
# Distinct descriptor values (NaN where an action has none).
df['descriptor'].unique()

array([nan, 'startperiod', 'cutting', 'pullup', 'running', 'turnaround',
       'driving floating bank', 'putback', 'step back',
       'running alley-oop', 'driving floating', 'running pullup',
       'bad pass', 'tip', 'driving finger roll', 'driving', 'shooting',
       'turnaround fadeaway', 'charge', 'alley-oop', 'loose ball',
       'lost ball', 'floating', 'driving reverse', 'defensive-3-second',
       'technical', 'running finger roll', 'take'], dtype=object)

In [23]:
# Store the descriptor with pandas' dedicated string dtype instead of object.
df = df.astype({'descriptor': 'string'})

In [24]:
# Rows that carry coordinates, shown in both the current and the legacy columns.
df[['x', 'y', 'xLegacy', 'yLegacy']].dropna()

Unnamed: 0,x,y,xLegacy,yLegacy
2,8.886334,51.960784,-10.0,31.0
5,91.672142,75.490196,127.0,26.0
6,17.690539,46.323529,18.0,114.0
9,93.117608,50.000000,0.0,12.0
10,27.677398,19.362745,153.0,208.0
...,...,...,...,...
593,94.410000,50.000000,0.0,0.0
604,86.678712,3.676471,-232.0,73.0
606,94.037451,51.715686,9.0,4.0
607,7.440867,52.696078,-13.0,17.0


In [25]:
# Sanity check: every non-null legacy coordinate is a whole number,
# i.e. casting it to int leaves the value unchanged.
assert df['xLegacy'].dropna().pipe(lambda s: s.eq(s.astype('int')).all())
assert df['yLegacy'].dropna().pipe(lambda s: s.eq(s.astype('int')).all())

In [26]:
# Discard the legacy coordinate columns; `x` and `y` are kept.
df = df.drop(columns=['xLegacy', 'yLegacy'])

In [27]:
# Summary statistics of the coordinates, to see their value range.
df[['x', 'y']].dropna().describe()

Unnamed: 0,x,y
count,164.0,164.0
mean,48.080786,50.10611
std,36.124591,23.107214
min,3.892904,2.696078
25%,8.722076,43.321078
50%,31.685283,49.509804
75%,87.434297,59.191176
max,94.957293,97.54902


In [28]:
# Downcast both coordinates to half precision to save memory.
# NOTE(review): float16 keeps only about 3 significant decimal digits, so the
# stored coordinates are rounded — confirm that precision is sufficient.
df = df.astype({'x': 'float16', 'y': 'float16'})

In [29]:
# Inspect the running score columns before converting them to integers.
df[['scoreHome', 'scoreAway']]

Unnamed: 0,scoreHome,scoreAway
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
606,95,100
607,95,102
608,97,102
609,97,102


In [30]:
# Convert both running scores to uint8.
# NOTE(review): uint8 tops out at 255 — assumes no team score exceeds that.
df = df.astype({'scoreHome': 'uint8', 'scoreAway': 'uint8'})

In [31]:
# Distinct court areas (NaN where no area is recorded).
df['area'].unique()

array([nan, 'Restricted Area', 'Mid-Range', 'In The Paint (Non-RA)',
       'Above the Break 3', 'Left Corner 3', 'Right Corner 3'],
      dtype=object)

In [32]:
# Distinct area-detail labels (NaN where none is recorded).
df['areaDetail'].unique()

array([nan, '0-8 Center', '8-16 Right', '8-16 Center', '24+ Right Center',
       '24+ Left', '24+ Right', '16-24 Right Center', '8-16 Left',
       '24+ Left Center', '16-24 Center', '24+ Center', '16-24 Right',
       '16-24 Left Center'], dtype=object)

In [33]:
# Discard the two textual shot-area columns.
df = df.drop(columns=['area', 'areaDetail'])

In [34]:
# Distinct shot outcomes (NaN where no result is recorded).
df['shotResult'].unique()

array([nan, 'Missed', 'Made'], dtype=object)

In [35]:
# Encode the shot outcome as a nullable boolean: 'Made' -> True,
# 'Missed' -> False, missing values stay missing.
df = df.assign(
    shotResult=df['shotResult']
    .replace({'Made': True, 'Missed': False})
    .astype('boolean')
)

In [36]:
# Inspect the shot distances; rows without a shot distance hold NaN.
df['shotDistance']

0       NaN
1       NaN
2      3.25
3       NaN
4       NaN
       ... 
606    0.93
607    2.20
608    1.47
609     NaN
610     NaN
Name: shotDistance, Length: 611, dtype: float64

In [37]:
# Downcast the shot distance to half precision to save memory.
# NOTE(review): float16 rounds to about 3 significant digits — confirm that is acceptable.
df = df.astype({'shotDistance': 'float16'})

In [38]:
# Show `possession` and `teamId` next to the action they belong to.
df[['actionType', 'subType', 'possession', 'teamId']]

Unnamed: 0,actionType,subType,possession,teamId
0,period,start,0,
1,jumpball,recovered,1610612766,1.610613e+09
2,2pt,DUNK,1610612766,1.610613e+09
3,block,,1610612766,1.610613e+09
4,rebound,defensive,1610612757,1.610613e+09
...,...,...,...,...
606,2pt,DUNK,1610612766,1.610613e+09
607,2pt,DUNK,1610612757,1.610613e+09
608,2pt,Layup,1610612766,1.610613e+09
609,period,end,1610612757,


In [39]:
# Which action types appear on rows where `possession` is 0?
df.loc[df['possession'].eq(0), 'actionType'].unique()

<StringArray>
['period', 'game']
Length: 2, dtype: string

In [40]:
# Distinct team ids (stored as floats because of the NaN entries).
df['teamId'].unique()

array([           nan, 1.61061277e+09, 1.61061276e+09])

In [41]:
# Store both team references as nullable UInt32. `possession` uses 0 as its
# "no team" placeholder, so that value is turned into a missing value first.
df = df.assign(
    possession=df['possession'].replace(0, pd.NA).astype('UInt32'),
    teamId=df['teamId'].astype('UInt32'),
)

In [42]:
# Distinct person ids; note the 0 entry alongside the real ids.
df['personId'].unique()

array([      0,  203995, 1631109, 1642270, 1629014, 1630166, 1628998,
       1641733, 1630182, 1641739,  203924, 1629006, 1630703, 1631133,
       1631101, 1642354, 1631209, 1630625, 1631217, 1629610])

In [43]:
# Which action types appear on rows where `personId` is 0?
df.loc[df['personId'].eq(0), 'actionType'].unique()

<StringArray>
['period', 'rebound', 'timeout', 'turnover', 'violation', 'game']
Length: 6, dtype: string

In [44]:
# Turn the 0 placeholder into a missing value and store the id as nullable UInt32.
df = df.assign(personId=df['personId'].replace(0, pd.NA).astype('UInt32'))

In [45]:
# Distinct official ids (floats because of the NaN entries).
df['officialId'].unique()

array([     nan, 1626301., 1628487.,  204059.])

In [46]:
# Store the official id as nullable UInt32 so missing values survive the integer cast.
df = df.astype({'officialId': 'UInt32'})

In [47]:
# Select every column whose name contains 'PersonId' (capital P, so the plain
# `personId` column is not included).
df.filter(regex='PersonId')

Unnamed: 0,jumpBallRecoverdPersonId,jumpBallWonPersonId,jumpBallLostPersonId,blockPersonId,assistPersonId,foulDrawnPersonId,stealPersonId
0,,,,,,,
1,203995.0,1631109.0,1642270.0,,,,
2,,,,1642270.0,,,
3,,,,,,,
4,,,,,,,
...,...,...,...,...,...,...,...
606,,,,,1630182.0,,
607,,,,,1629014.0,,
608,,,,,1641733.0,,
609,,,,,,,


In [52]:
# Fix the misspelled column name that comes from the feed ("Recoverd" -> "Recovered").
df = df.rename(columns={'jumpBallRecoverdPersonId': 'jumpBallRecoveredPersonId'})

In [53]:
# Check, column by column, whether any of the '...PersonId' columns contains a 0.
(df.filter(regex='PersonId') == 0).any()

jumpBallRecoveredPersonId    False
jumpBallWonPersonId          False
jumpBallLostPersonId         False
blockPersonId                False
assistPersonId               False
foulDrawnPersonId            False
stealPersonId                False
dtype: boolean

In [54]:
# Store every secondary person-id column as nullable UInt32 in a single cast.
df = df.astype({
    column: 'UInt32'
    for column in (
        'jumpBallRecoveredPersonId',
        'jumpBallWonPersonId',
        'jumpBallLostPersonId',
        'blockPersonId',
        'assistPersonId',
        'foulDrawnPersonId',
        'stealPersonId',
    )
})

In [55]:
# Final dtypes after all conversions.
df.dtypes

clock                            timedelta64[ns]
timeActual                   datetime64[ns, UTC]
period                                     uint8
actionType                        string[python]
subType                           string[python]
personId                                  UInt32
x                                        float16
y                                        float16
possession                                UInt32
scoreHome                                  uint8
scoreAway                                  uint8
teamId                                    UInt32
descriptor                        string[python]
jumpBallRecoveredPersonId                 UInt32
jumpBallWonPersonId                       UInt32
jumpBallLostPersonId                      UInt32
shotDistance                             float16
shotResult                               boolean
blockPersonId                             UInt32
assistPersonId                            UInt32
officialId          

In [56]:
# Compare the memory footprint of the untouched actions frame with the
# optimized `df`. The saving reflects both the dropped columns and the
# narrower dtypes, since `df` has had columns removed along the way.
original_memory = pd.DataFrame(pbp.get_dict()['game']['actions']).memory_usage(deep=True).sum()
optimized_memory = df.memory_usage(deep=True).sum()

# Labels name the play-by-play frame: this notebook measures the actions
# data, not a teams table.
print(f"Original play-by-play df memory: {original_memory / 1024**2:.2f} MB")
print(f"Optimized play-by-play df memory: {optimized_memory / 1024**2:.2f} MB")
print(f"Memory saved: {(original_memory - optimized_memory) / 1024**2:.2f} MB ({(1 - optimized_memory/original_memory)*100:.1f}%)")

Original teams df memory: 0.91 MB
Optimized teams df memory: 0.15 MB
Memory saved: 0.77 MB (84.0%)
