In [80]:
import os
import time
import random
import difflib
import numpy as np
import pandas as pd

from tqdm import tqdm

import requests
from bs4 import BeautifulSoup

from unidecode import unidecode
from nba_api.stats.static import teams

In [3]:
def _scrape_bball_ref_pbp(year):
    '''
    returns nothing, but scrapes the bball-ref pbp page for one season and saves
    the table to the local directory as a csv

    year: int, the year used in the page URL (the file is labelled with the
          season string built from year-1 and year, e.g. 2023 -> "2022-23")

    raises requests.HTTPError if the page request comes back with a 4xx/5xx status
    raises ValueError if no table with id "pbp_stats" is found in the page
    '''
    # fetch response from URL
    url = "https://www.basketball-reference.com/leagues/NBA_{}_play-by-play.html#pbp_stats".format(year)
    # a timeout keeps one stalled connection from hanging a multi-year scrape
    response = requests.get(url, timeout=30)
    # fail loudly on an error response instead of parsing an error page
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", {"id": "pbp_stats"})
    if table is None:
        raise ValueError("table 'pbp_stats' not found at {}".format(url))

    # define column names
    headers = ['PLAYER_NAME','POS','AGE','TEAM','GP','MP',
               'PG%','SG%','SF%','PF%','C%','ON-COURT','ON_OFF',
               'BAD_PASS','LOST_BALL','FOUL_SHOOT','OFF_FOUL',
               'FOUL_SHOOT_DRAWN','OFF_FOUL_DRAWN','PGA','AND1','BLKD']

    rows = []

    # scrape rows on page
    for tr in table.select("tbody tr"):
        row = [td.text for td in tr.select("td")]

        # a row with no <td> cells has no values to store
        # (presumably a repeated header row inside the table body)
        if row:
            rows.append(row)

    # create dataframe and save the data to local directory
    df = pd.DataFrame(rows, columns=headers)

    season_param = str(year-1) + '-' + str(year)[2:]

    # make sure the target folder exists so to_csv does not fail on a fresh checkout
    os.makedirs('data/bballref-pbp', exist_ok=True)
    df.to_csv('data/bballref-pbp/bball_ref_pbp_data_{}.csv'.format(season_param), index=False)
    
def _scrape_custom_bball_ref_pbp(start_year, end_year):
    '''
    returns nothing, but runs _scrape_bball_ref_pbp once for every year from
    start_year through end_year (both ends included), wrapped in a tqdm progress bar
    '''
    # +1 because range() excludes its upper bound
    inclusive_years = range(start_year, end_year + 1)

    for season_year in tqdm(inclusive_years):
        _scrape_bball_ref_pbp(season_year)

In [19]:
# scrape every year from 2014 through 2023 (one csv is written per year)
_scrape_custom_bball_ref_pbp(start_year=2014, end_year=2023)

In [138]:
# read the data in
filepath = 'data/bballref-pbp/'

# collect one frame per season file and concatenate once at the end
# (growing a DataFrame with concat inside the loop re-copies it every iteration)
season_frames = []

# sorted() makes the row order reproducible; os.listdir returns entries in arbitrary order
for filename in sorted(os.listdir(filepath)):
    # skip anything that is not a csv so stray files/folders are not fed to read_csv
    if not filename.endswith('.csv'):
        continue

    temp_df = pd.read_csv(os.path.join(filepath, filename))
    # the season label is the last "_"-separated piece of the file name, minus ".csv"
    season = filename.split('_')[-1].replace('.csv','')
    # drop rows that have no player name
    temp_df = temp_df[~temp_df['PLAYER_NAME'].isna()].reset_index(drop=True)
    temp_df['SEASON'] = season
    season_frames.append(temp_df)

if not season_frames:
    raise FileNotFoundError('no csv files found in {}'.format(filepath))

pbp_df = pd.concat(season_frames).reset_index(drop=True)

# replace accents in player names and create player name mapping
# name_map starts empty here so this cell runs on a fresh kernel
name_map = dict()

for name in sorted(list(pbp_df['PLAYER_NAME'].unique())):
    # strip the "*" marker and transliterate accented characters to plain ASCII
    name_map[name] = unidecode(name.replace('*',''))

# convert percentage columns from strings like "45%" to fractions like 0.45
for col in ['PG%','SG%','SF%','PF%','C%']:
    pbp_df[col] = pbp_df[col].str.replace('%', '').astype(float) / 100

# get list of teams and sort them
team_lst = teams.get_teams()
sorted_team_lst = sorted(team_lst, key=lambda x: x['full_name'])

abb_to_id_map = dict()

# three nba_api abbreviations get an additional alias key
# (CHO/PHO/BRK; presumably the spellings used in the scraped TEAM column)
alias_for_abbreviation = {'CHA': 'CHO', 'PHX': 'PHO', 'BKN': 'BRK'}

# map values to each other
for team in sorted_team_lst:
    abbreviation = team['abbreviation']
    if abbreviation in alias_for_abbreviation:
        abb_to_id_map[alias_for_abbreviation[abbreviation]] = team['id']
    abb_to_id_map[abbreviation] = team['id']

# replace blank values with 0s
# (done before TEAM_ID is created, so unmapped teams stay NaN in that column)
pbp_df = pbp_df.replace(np.nan, 0.0)

# create new columns and fix names
pbp_df['PLAYER_NAME'] = pbp_df['PLAYER_NAME'].map(name_map)
# POS_EST is the share-weighted average position number (PG=1, SG=2, SF=3, PF=4, C=5)
pbp_df['POS_EST'] = round(((pbp_df['PG%'] * 1) + (pbp_df['SG%'] * 2) + (pbp_df['SF%'] * 3) + (pbp_df['PF%'] * 4) + (pbp_df['C%'] * 5)), 2)
pbp_df['TEAM_ID'] = pbp_df['TEAM'].map(abb_to_id_map)

In [139]:
print(pbp_df.shape)

# rows whose estimate exceeds 5.0 are pulled out, set to exactly 5.00,
# and appended after the remaining rows
over_cap = pbp_df['POS_EST'] > 5.0
cs_df = pbp_df.loc[over_cap].reset_index(drop=True).assign(POS_EST=5.00)
pbp_df = pd.concat([pbp_df.loc[~over_cap], cs_df], ignore_index=True)

# the row count should be unchanged: rows are only moved, never dropped
print(pbp_df.shape)

(6654, 25)
(6654, 25)


In [140]:
# save only the season / player / team / position-estimate columns, without the index
pbp_df.to_csv('data/ids/players_bbref_database_w_posest.csv',
              columns=['SEASON','PLAYER_NAME','TEAM_ID','POS_EST'], index=False)

In [141]:
# load the ball-hog-rate csv into a DataFrame
ballhog_df = pd.read_csv('data/ball-hog-rate/ball-hog-rates_regular_season_2013_23_w_age_exp.csv')

In [142]:
# attach POS_EST to the ball-hog rows; how='left' keeps every ball-hog row and
# leaves POS_EST as NaN wherever no season/name/team match exists
merge_keys = ['SEASON','PLAYER_NAME','TEAM_ID']
pos_est_lookup = pbp_df[merge_keys + ['POS_EST']]
updated_ballhog_df = ballhog_df.merge(pos_est_lookup, on=merge_keys, how='left')

updated_ballhog_df.head()

Unnamed: 0,SEASON,MINUTES_ON,PLAYER_NAME,PLAYER_ID,TEAM_ID,SECONDS_PER_POSS_OFFENSE_PLAYER_ON,TIME_OF_POSS,TEAM,OFF_POSS,BALL_HOG%,AGE,EXP,POS_EST
0,2022-23,83,A.J. Lawson,1630639,1610612742,14.477,3.5,DAL,230,6.3,22,0,2.75
1,2022-23,2,A.J. Lawson,1630639,1610612750,12.0,0.0,MIN,5,0.0,22,0,2.99
2,2021-22,94,Bruno Fernando,1628981,1610612745,14.121,4.5,HOU,198,9.7,23,2,5.0
3,2021-22,58,Bruno Fernando,1628981,1610612738,13.843,2.7,BOS,134,8.7,23,2,4.88
4,2020-21,226,Bruno Fernando,1628981,1610612737,14.698,9.3,ATL,480,7.9,22,1,5.0


In [143]:
# maps a pbp_df player name to the closest ball-hog player name that is still
# missing a position estimate
name_map = dict()

missing_pos_est = updated_ballhog_df['POS_EST'].isna()

# pbp_df rows whose name never matched a ball-hog row that received a POS_EST
matched_names = updated_ballhog_df[~missing_pos_est]['PLAYER_NAME'].to_list()
unused_names_df = pbp_df[~pbp_df['PLAYER_NAME'].isin(matched_names)].reset_index(drop=True)

# candidate names do not change inside the loops, so compute them once
unmatched_nba_names = updated_ballhog_df[missing_pos_est]['PLAYER_NAME'].unique()

for name in sorted(list(unused_names_df['PLAYER_NAME'].unique())):
    # only accept similarity scores strictly above 80, and keep the single best
    # candidate rather than whichever passing candidate happens to be iterated last
    best_score = 80.0

    for nba_name in unmatched_nba_names:
        seq = difflib.SequenceMatcher(None, name, nba_name)
        score = seq.ratio()*100

        if score > best_score:
            best_score = score
            name_map[name] = nba_name

In [144]:
# display the name mapping for manual inspection
name_map

{'A.J. Green': 'AJ Green',
 'A.J. Hammons': 'AJ Hammons',
 'Andrew White': 'Andrew White III',
 'B.J. Johnson': 'BJ Johnson',
 'Brian Bowen': 'Brian Bowen II',
 'C.J. Miles': 'CJ Miles',
 'Cameron Reynolds': 'Cam Reynolds',
 'D.J. White': 'DJ White',
 'Derrick Walton': 'Derrick Walton Jr.',
 'Glen Rice Jr.': 'Glen Rice',
 'Harry Giles': 'Harry Giles III',
 'J.J. Hickson': 'JJ Hickson',
 'J.J. Redick': 'JJ Redick',
 'J.R. Smith': 'JR Smith',
 'Jeff Dowtin': 'Jeff Dowtin Jr.',
 'Jeff Taylor': 'Jeffery Taylor',
 'John Butler': 'John Butler Jr.',
 "Johnny O'Bryant": "Johnny O'Bryant III",
 'K.J. McDaniels': 'KJ McDaniels',
 'Kevin Knox': 'Kevin Knox II',
 'Marcus Morris': 'Marcus Morris Sr.',
 'Melvin Frazier': 'Melvin Frazier Jr.',
 'Mitch Creek': 'Mitchell Creek',
 'OG Anunoby': 'O.G. Anunoby',
 'P.J. Hairston': 'PJ Hairston',
 'Perry Jones': 'Perry Jones III',
 'R.J. Hunter': 'RJ Hunter',
 'Robert Williams': 'Robert Williams III',
 'Vince Edwards': 'Vincent Edwards',
 'Vitor Luiz Favera

In [145]:
# split the merged table by whether a position estimate was found
has_pos_est = updated_ballhog_df['POS_EST'].notna()

identified_df = updated_ballhog_df[has_pos_est].reset_index(drop=True)
# the empty POS_EST column is removed from the unmatched rows
unidentified_df = (
    updated_ballhog_df[~has_pos_est]
    .reset_index(drop=True)
    .drop(columns=['POS_EST'])
)

In [146]:
# second attempt at attaching POS_EST to the rows that found no match.
# The pbp_df names are first rewritten through name_map so they use the ball-hog
# spelling; merging the untouched pbp_df names again on the same keys would just
# reproduce the same misses and leave POS_EST empty for every row.
pos_est_renamed_df = pbp_df[['SEASON','PLAYER_NAME','TEAM_ID','POS_EST']].copy()
# names without an entry in name_map are kept unchanged
pos_est_renamed_df['PLAYER_NAME'] = pos_est_renamed_df['PLAYER_NAME'].map(lambda n: name_map.get(n, n))

unidentified_df = pd.merge(unidentified_df, pos_est_renamed_df,
                           on=['SEASON','PLAYER_NAME','TEAM_ID'], how='left')

In [147]:
# (rows, columns) of the matched split followed by the unmatched split
print(identified_df.shape, unidentified_df.shape)

(5681, 13) (120, 13)


In [148]:
# stack the two splits back into one table (matched rows first) with a fresh 0..n-1 index
recombined_parts = [identified_df, unidentified_df]
updated_ballhog_df = pd.concat(recombined_parts, ignore_index=True)
updated_ballhog_df.head()

Unnamed: 0,SEASON,MINUTES_ON,PLAYER_NAME,PLAYER_ID,TEAM_ID,SECONDS_PER_POSS_OFFENSE_PLAYER_ON,TIME_OF_POSS,TEAM,OFF_POSS,BALL_HOG%,AGE,EXP,POS_EST
0,2022-23,83,A.J. Lawson,1630639,1610612742,14.477,3.5,DAL,230,6.3,22,0,2.75
1,2022-23,2,A.J. Lawson,1630639,1610612750,12.0,0.0,MIN,5,0.0,22,0,2.99
2,2021-22,94,Bruno Fernando,1628981,1610612745,14.121,4.5,HOU,198,9.7,23,2,5.0
3,2021-22,58,Bruno Fernando,1628981,1610612738,13.843,2.7,BOS,134,8.7,23,2,4.88
4,2020-21,226,Bruno Fernando,1628981,1610612737,14.698,9.3,ATL,480,7.9,22,1,5.0


In [149]:
# write the recombined table to a new csv; index=False leaves the row index out of the file
updated_ballhog_df.to_csv('data/ball-hog-rate/ball-hog-rates_regular_season_2013_23_w_age_exp_pos.csv', index=False)