#### Imports

In [2]:
import os
import time
import datetime
import pandas as pd

from tqdm import tqdm

from nba_api.stats.static import players, teams

#### Fetch Data from Local Directory

In [3]:
def _read_csv_directory(dirpath, rename_columns=None):
    '''
    returns one DataFrame made of every file found directly inside `dirpath`

    dirpath        : directory to read; a trailing path separator is optional
    rename_columns : optional {old: new} mapping applied to each file BEFORE
                     stacking, so files that already use the new column name
                     line up with files that still use the old one

    An empty DataFrame is returned when the directory holds no readable files.
    '''
    # sort the listing: os.listdir gives no ordering guarantee, and the row
    # order of the result should be reproducible across machines / re-runs
    candidate_paths = [os.path.join(dirpath, name) for name in sorted(os.listdir(dirpath))]

    # skip hidden entries (.DS_Store, .ipynb_checkpoints, ...) and sub-directories,
    # which would otherwise be handed to read_csv
    data_paths = [
        path for path in candidate_paths
        if os.path.isfile(path) and not os.path.basename(path).startswith('.')
    ]

    frames = []
    for path in tqdm(data_paths):
        frame = pd.read_csv(path)
        if rename_columns:
            frame = frame.rename(columns=rename_columns)
        frames.append(frame)

    # pd.concat raises on an empty list, so keep returning an empty frame instead
    if not frames:
        return pd.DataFrame()

    # one concat at the end instead of re-copying the growing frame on every file
    return pd.concat(frames, ignore_index=True)

def _get_seconds_per_possession_data(filepath='data/seconds-per-possession/regular-season/'):
    '''
    returns DataFrame of Seconds Per Possession - Offense Data for each Player every Season & Team

    filepath : directory holding the seconds-per-possession files
    '''
    return _read_csv_directory(filepath)

def _get_seasonal_touches_data(filepath='data/touches/season-totals/'):
    '''
    returns DataFrame of Tracking Touches Data for each season

    filepath : directory holding the season-total touches files
    '''
    return _read_csv_directory(filepath)

def _get_traded_touches_data(filepath='data/touches/traded-totals/'):
    '''
    returns DataFrame of Traded Tracking Touches Data for each season

    filepath : directory holding the traded-total touches files
    '''
    return _read_csv_directory(filepath)

def _get_possessions_data(filepath='data/possessions/'):
    '''
    returns DataFrame of Player's Offensive Possession Totals by Season

    filepath : directory holding the possessions files

    The 'OffPoss' column of every file is renamed to 'OFF_POSS'.
    '''
    return _read_csv_directory(filepath, rename_columns={'OffPoss': 'OFF_POSS'})

def _get_ids_data(filepath='data/ids/players_modern_database.csv'):
    '''
    returns DataFrame of Player IDs

    filepath : CSV file to read; defaults to the location this notebook has
               always used, so calling with no arguments behaves as before
    '''
    return pd.read_csv(filepath)

#### Call the Functions and Get the Data

In [4]:
# Load every input table once, up front; the cells below only work on these in-memory frames.
# The directory loaders each print one tqdm progress bar, in the order they are called here.
ids_df = _get_ids_data()  # player IDs table (read from a single CSV, no progress bar)
spp_df = _get_seconds_per_possession_data()  # seconds-per-possession (offense) rows
tracking_df = _get_seasonal_touches_data()  # tracking touches, season totals
traded_tracking_df = _get_traded_touches_data()  # tracking touches for traded players
possessions_df = _get_possessions_data()  # offensive possession totals ('OffPoss' renamed to 'OFF_POSS')

100%|███████████████████████████████████████████████████████████████████████████████| 300/300 [00:01<00:00, 157.07it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 119.51it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 84.15it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 54.59it/s]


In [5]:
# Set every 'Mike James' row aside in its own frame and remove it from the working frame.
# NOTE(review): presumably because this name is ambiguous for the name -> PLAYER_ID
# lookup performed further down — confirm.
is_mike_james = spp_df['PLAYER_NAME'] == 'Mike James'
mikeJames_spp_df = spp_df.loc[is_mike_james].reset_index(drop=True)
spp_df = spp_df.loc[~is_mike_james].reset_index(drop=True)

In [6]:
# one row per distinct (season, player id, player name) found in the traded-totals data
traded_names_df = traded_tracking_df.loc[:, ['SEASON', 'PLAYER_ID', 'PLAYER_NAME']].drop_duplicates()
print('The Number of Traded Players:', traded_names_df.shape[0])

# flatten to plain (SEASON, PLAYER_NAME) tuples for the membership test in the next cell
traded_names = list(
    traded_names_df[['SEASON', 'PLAYER_NAME']].itertuples(index=False, name=None)
)

traded_names[:5]

The Number of Traded Players: 681


[('2013-14', 'Andre Miller'),
 ('2013-14', 'Caron Butler'),
 ('2013-14', 'John Salmons'),
 ('2013-14', 'Reggie Evans'),
 ('2013-14', 'Luke Ridnour')]

In [7]:
# Keep only the season-total rows whose (SEASON, PLAYER_NAME) pair is NOT in traded_names.
# NOTE(review): the match is on player name, not PLAYER_ID, so two different players
# sharing a name within a season would be treated as the same player — confirm this is acceptable.
season_name_pairs = tracking_df[['SEASON', 'PLAYER_NAME']].apply(tuple, axis=1)
was_traded = season_name_pairs.isin(traded_names)
singular_tracking_df = tracking_df.loc[~was_traded].copy()
print('The Number of Untraded Player Totals: ', len(singular_tracking_df))

# Stack the untraded season totals on top of the traded rows, then expose the
# team abbreviation column under the shorter name 'TEAM'.
final_tracking_df = (
    pd.concat([singular_tracking_df, traded_tracking_df])
    .reset_index(drop=True)
    .rename(columns={'TEAM_ABBREVIATION': 'TEAM'})
)
print('The Number of Unique Player Totals by team AND season:', len(final_tracking_df))

final_tracking_df.head()

The Number of Untraded Player Totals:  4538
The Number of Unique Player Totals by team AND season: 5947


Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM,TIME_OF_POSS,TOUCHES,FRONT_CT_TOUCHES,AVG_DRIB_PER_TOUCH,SEASON
0,201985,AJ Price,MIN,20.6,244,90,4.78,2013-14
1,1733,Al Harrington,WAS,20.2,823,499,0.58,2013-14
2,201143,Al Horford,ATL,53.3,1826,1176,0.49,2013-14
3,2744,Al Jefferson,CHA,139.2,3575,2776,0.59,2013-14
4,202329,Al-Farouq Aminu,NOP,88.4,2520,1709,1.03,2013-14


In [8]:
# Build the team abbreviation -> team ID lookup from nba_api's static team list.
# (Despite its name, id_to_abb_map is keyed by abbreviation and holds IDs.)
id_to_abb_map = {team['abbreviation']: team['id'] for team in teams.get_teams()}

# Add a TEAM_ID column to both frames that carry a 'TEAM' abbreviation column.
# Abbreviations missing from the lookup come out as NaN.
for frame in (final_tracking_df, possessions_df):
    frame['TEAM_ID'] = frame['TEAM'].map(id_to_abb_map)

In [9]:
# name -> PLAYER_ID lookup built from the ids table.
# NOTE(review): if ids_df holds the same PLAYER_NAME more than once, dict() keeps
# only the last PLAYER_ID for that name — confirm names are unique there.
playerID_map = dict(zip(ids_df['PLAYER_NAME'], ids_df['PLAYER_ID']))

print('The number of  player ids to identify:', len(spp_df))

# map player IDs to player names (names missing from the lookup become NaN)
spp_df['PLAYER_ID'] = spp_df['PLAYER_NAME'].map(playerID_map)

# separate the rows that have been identified with player ids
has_player_id = spp_df['PLAYER_ID'].notna()
identified_spp_df = spp_df[has_player_id].reset_index(drop=True)
unidentified_spp_df = spp_df[~has_player_id].reset_index(drop=True)

print('The Number of Identified Player IDs:', len(identified_spp_df))
print('The Number of Unidentified Player IDs:', len(unidentified_spp_df))

# spelling used in the seconds-per-possession files -> spelling used in the ids table
# NOTE(review): 'Michael Frazier' -> 'Melvin Frazier Jr.' maps one first name onto a
# different one; verify these are really the same player.
pbp_to_nba_map = {
     'CJ Wilcox': 'C.J. Wilcox',
     'Cameron Reynolds': 'Cam Reynolds',
     'Charles Brown Jr.': 'Charlie Brown Jr.',
     'Danuel House': 'Danuel House Jr.',
     'Enes Kanter': 'Enes Freedom',
     'Frank Mason': 'Frank Mason III',
     'Jeff Dowtin': 'Jeff Dowtin Jr.',
     'Juan Hernangomez': 'Juancho Hernangomez',
     'Kevin Knox': 'Kevin Knox II',
     'Marcus Morris': 'Marcus Morris Sr.',
     'Michael Frazier': 'Melvin Frazier Jr.',
     'Nicolas Claxton': 'Nic Claxton',
     'OG Anunoby': 'O.G. Anunoby',
     'P.J. Dozier': 'PJ Dozier',
     'PJ Tucker': 'P.J. Tucker',
     'TJ Leaf': 'T.J. Leaf',
     'TJ Warren': 'T.J. Warren',
     'Walter Lemon Jr.': 'Walt Lemon Jr.'
}

# change the pbp names to NBA names and now identify their player IDs.
# Series.map yields NaN for any name that is not a key of the dict, which used to
# erase the original name; fall back to the original so those rows stay traceable.
original_names = unidentified_spp_df['PLAYER_NAME']
unidentified_spp_df['PLAYER_NAME'] = original_names.map(pbp_to_nba_map).fillna(original_names)
unidentified_spp_df['PLAYER_ID'] = unidentified_spp_df['PLAYER_NAME'].map(playerID_map)

spp_df = pd.concat([identified_spp_df, unidentified_spp_df]).reset_index(drop=True)
# report the rows that actually carry an ID; len(spp_df) is the total row count and
# would always equal the starting count regardless of how many lookups succeeded
print('The Final Number of Identified Player IDs:', int(spp_df['PLAYER_ID'].notna().sum()))

The number of  player ids to identify: 6009
The Number of Identified Player IDs: 5951
The Number of Unidentified Player IDs: 58
The Final Number of Identified Player IDs: 6009


In [10]:
# All three tables are joined on the same (season, player, team) key.
merge_keys = ['SEASON', 'PLAYER_ID', 'TEAM_ID']
spp_columns = ['SEASON', 'MINUTES_ON', 'PLAYER_NAME', 'PLAYER_ID', 'TEAM_ID', 'SECONDS_PER_POSS_OFFENSE_PLAYER_ON']
touches_columns = merge_keys + ['TIME_OF_POSS']

# Left joins keep every seconds-per-possession row; rows with no match in the
# touches or possessions tables get NaN in the columns coming from those tables.
merged_df = (
    spp_df[spp_columns]
    .merge(final_tracking_df[touches_columns], on=merge_keys, how='left')
    .merge(possessions_df, on=merge_keys, how='left')
)

In [11]:
# Split the merged rows by whether the possessions join found an OFF_POSS value.
has_possessions = merged_df['OFF_POSS'].notna()

no_possessions_found_df = merged_df.loc[~has_possessions].reset_index(drop=True)
final_df = merged_df.loc[has_possessions].reset_index(drop=True)

print("The Number of Players who don't have any possession data, but only seconds-per-poss data:", no_possessions_found_df.shape[0])
print("The Number of Players with the data:", final_df.shape[0])

The Number of Players who don't have any possession data, but only seconds-per-poss data: 207
The Number of Players with the data: 5802


In [12]:
# BALL_HOG% = (TIME_OF_POSS * 60) / (OFF_POSS * SECONDS_PER_POSS_OFFENSE_PLAYER_ON) * 100,
# rounded to one decimal.
# NOTE(review): the * 60 presumably converts TIME_OF_POSS from minutes to seconds — confirm units.
seconds_with_ball = final_df['TIME_OF_POSS'] * 60
offense_seconds_on_court = final_df['OFF_POSS'] * final_df['SECONDS_PER_POSS_OFFENSE_PLAYER_ON']
final_df['BALL_HOG%'] = round((seconds_with_ball / offense_seconds_on_court) * 100, 1)
final_df = final_df.sort_values(by=['PLAYER_NAME','SEASON']).reset_index(drop=True)

print("The Number of Players with the data:", len(final_df))

# Louis King's 2022-23 rows are pulled out as "weird" data and excluded from the result;
# build the mask once so the kept and dropped sets cannot drift apart.
is_weird_row = (final_df['PLAYER_NAME'] == 'Louis King') & (final_df['SEASON'] == '2022-23')
weird_df = final_df[is_weird_row].reset_index(drop=True)
print("The Number of Players with weird data:", len(weird_df))
final_df = final_df[~is_weird_row].reset_index(drop=True)
print("The Final Number of Players with the data:", len(final_df))

# save the data; create the output folder first because to_csv does not create
# missing parent directories and would fail on a fresh checkout
output_path = 'data/ball-hog-rate/ball-hog-rates_regular_season_2013_23.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_df.to_csv(output_path, index=False)

final_df.head()

The Number of Players with the data: 5802
The Number of Players with weird data: 1
The Final Number of Players with the data: 5801


Unnamed: 0,SEASON,MINUTES_ON,PLAYER_NAME,PLAYER_ID,TEAM_ID,SECONDS_PER_POSS_OFFENSE_PLAYER_ON,TIME_OF_POSS,TEAM,OFF_POSS,Points,...,FG2A,FG3M,FG3A,FtPoints,PtsAssisted2s,PtsUnassisted2s,PtsAssisted3s,PtsUnassisted3s,PtsPutbacks,BALL_HOG%
0,2022-23,83,A.J. Lawson,1630639.0,1610612742,14.477,3.5,DAL,230.0,54.0,...,18.0,10.0,25.0,2.0,10.0,12.0,30.0,0.0,10.0,6.3
1,2022-23,2,A.J. Lawson,1630639.0,1610612750,12.0,0.0,MIN,5.0,2.0,...,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2,2022-23,346,AJ Green,1631260.0,1610612749,14.634,17.8,MIL,717.0,154.0,...,20.0,44.0,105.0,4.0,14.0,4.0,120.0,12.0,0.0,10.2
3,2022-23,1372,AJ Griffin,1631100.0,1610612737,14.134,52.8,ATL,2916.0,639.0,...,274.0,101.0,259.0,42.0,164.0,130.0,270.0,33.0,20.0,7.7
4,2016-17,163,AJ Hammons,1627773.0,1610612742,15.291,5.0,DAL,320.0,48.0,...,32.0,5.0,10.0,9.0,22.0,2.0,15.0,0.0,2.0,6.1
