#### Imports

In [105]:
import os
import math
import time
import json
import random
import datetime
import selenium
import numpy as np
import pandas as pd

from tqdm import tqdm
from datetime import date
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import commonteamroster

#### Functions

In [38]:
def _get_modern_players(start_yr=2013, end_yr=2023, max_retries=10):
    '''
    returns a list of unique player IDs found on any team roster for the requested seasons

    start_yr: first season start year to fetch (inclusive), e.g. 2013 -> '2013-14'
    end_yr: season start year to stop at (exclusive), e.g. 2023 -> last season fetched is '2022-23'
    max_retries: attempts per (team, season) request; the last error is re-raised once they are used up
    '''
    team_lst = teams.get_teams()
    max_retries = max(1, int(max_retries))

    # collect one roster frame per (team, season) and concatenate once at the end
    roster_frames = []

    # iterate through seasons and teams
    for season in tqdm(range(start_yr, end_yr)):
        season_param = str(season) + '-' + str(season+1)[2:]

        for team in team_lst:
            # fetch roster for that team during that specific year
            team_id = team['id']
            for attempt in range(1, max_retries + 1):
                try:
                    temp_df = commonteamroster.CommonTeamRoster(team_id=team_id,
                                                                season=season_param
                                                               ).get_data_frames()[0]
                    roster_frames.append(temp_df)
                    break
                except Exception as e:
                    print('An error occurred for the following params {}, {}:'.format(team_id, season_param), e)
                    if attempt == max_retries:
                        # give up instead of looping forever on a permanent failure
                        raise
                    # back off a little before retrying so the API is not hammered
                    time.sleep(min(2 * attempt, 30))
            time.sleep(0.2)

    # nothing fetched (e.g. empty season range or no teams): no players to report
    if not roster_frames:
        return []

    players_df = pd.concat(roster_frames).reset_index(drop=True)
    return list(players_df['PLAYER_ID'].unique())

def _create_ids_database(player_ids):
    '''
    returns a dataframe of player_ids and their names, as well as saves it to a local directory

    player_ids: iterable of player IDs; IDs with no entry in players.get_players() are left out
    output columns: PLAYER_NAME, FIRST_NAME, LAST_NAME, PLAYER_ID
    side effect: writes the dataframe to data/ids/players_modern_database.csv
    '''
    # index the static player list by id once instead of rescanning it for every player id
    # (setdefault keeps the first entry per id, matching a first-match linear search)
    players_by_id = {}
    for player in players.get_players():
        players_by_id.setdefault(player['id'], player)

    player_info = []
    for pid in tqdm(player_ids):
        player = players_by_id.get(pid)
        if player is not None:
            player_info.append([player['full_name'], player['first_name'], player['last_name'], pid])

    ids_df = pd.DataFrame(player_info, columns=['PLAYER_NAME','FIRST_NAME','LAST_NAME','PLAYER_ID'])

    # make sure the target folder exists so the save does not fail on a fresh checkout
    out_path = 'data/ids/players_modern_database.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    ids_df.to_csv(out_path, index=False)
    return ids_df

def _open_chromedriver(download_directory=r'C:\Users\lukar\Desktop\Sports Analytics\NBA Ball Handle Rate\data\gamelogs'):
    '''
    launches Chrome through selenium and returns the browser object

    download_directory: folder set as Chrome's "download.default_directory" preference
    '''
    opts = Options()
    # point Chrome's default download location at the requested folder
    opts.add_experimental_option("prefs", {"download.default_directory": download_directory})
    return webdriver.Chrome(options=opts)

def _scrape_player_rs_gamelogs(browser, pid, folderpath="C:/Users/lukar/Desktop/Sports Analytics/NBA Ball Handle Rate/data/gamelogs/", start_yr=2013, end_yr=2023):
    '''
    returns nothing, but downloads a player's regular season gamelog .csv from pbpstats.com and renames it

    browser: selenium browser object (its download directory is expected to be folderpath)
    pid: player ID placed in the EntityId parameter of the URL
    folderpath: folder where the browser saves 'pbpstats_export.csv'
    start_yr, end_yr: only used to build the saved file name; the seasons requested in the URL
                      are fixed to 2013-14 through 2022-23
    raises: FileNotFoundError if the export never shows up in folderpath after the link was clicked
    '''
    url = "https://www.pbpstats.com/game-logs/nba/player?Season=2022-23,2020-21,2021-22,2019-20,2017-18,2018-19,2016-17,2015-16,2014-15,2013-14&SeasonType=Regular%2BSeason&EntityId={}".format(pid)

    old_filename = 'pbpstats_export.csv'
    new_filename = '{}_rs_pbp_scoring_gamelog_{}-{}.csv'.format(str(pid), str(start_yr), str(end_yr)[2:])
    old_file_path = os.path.join(folderpath, old_filename)
    new_file_path = os.path.join(folderpath, new_filename)

    # a stale export left by an earlier failed run would otherwise be renamed to this player
    if os.path.exists(old_file_path):
        os.remove(old_file_path)

    browser.get(url)
    time.sleep(8.5)

    # candidate locations of the .csv download link, tried in order
    # (presumably the second one applies to pages with small gamelogs - verify against the site)
    link_xpaths = ('/html/body/div/div/main/div[4]/div[2]/div[4]/a',
                   '/html/body/div/div/main/div[4]/div[2]/div[3]/a')

    found_link = False
    for _ in range(51):
        for xpath in link_xpaths:
            try:
                browser.find_element(By.XPATH, xpath).click()
                found_link = True
                break
            except Exception:
                # link not present / not clickable yet; Exception (not a bare except) so Ctrl+C still works
                continue
        time.sleep(1.2)
        if found_link:
            break
    else:
        print('Could not find download link for Player ID: {}'.format(pid))

    if not found_link:
        return

    # wait for the browser to finish writing the export before renaming it
    time.sleep(0.2)
    deadline = time.time() + 30
    while not os.path.exists(old_file_path) and time.time() < deadline:
        time.sleep(0.2)

    # os.replace overwrites a gamelog saved by a previous run instead of failing on Windows
    os.replace(old_file_path, new_file_path)
    time.sleep(.5)

    return

#### Step 1) Get List of Unique Player IDs that played in the Modern Tracking Era (2013-Present)

In [3]:
# fetch every team roster for the default season range and keep the unique player IDs
# (network-bound: failed requests are retried, so this cell can take several minutes)
modern_players = _get_modern_players()

 40%|█████████████████████████████████▏                                                 | 4/10 [00:49<01:13, 12.27s/it]

An error occurred for the following params 1610612738, 2017-18: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred for the following params 1610612738, 2017-18: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred for the following params 1610612738, 2017-18: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
An error occurred for the following params 1610612738, 2017-18: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/commonteamroster?LeagueID=&Season=2017-18&TeamID=1610612738 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002A8AA1CC190>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
An error occurred for the following params 1610612738, 2017-18: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)


 50%|█████████████████████████████████████████▌                                         | 5/10 [02:43<04:04, 48.89s/it]

An error occurred for the following params 1610612747, 2018-19: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred for the following params 1610612747, 2018-19: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
An error occurred for the following params 1610612747, 2018-19: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/commonteamroster?LeagueID=&Season=2018-19&TeamID=1610612747 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002A8AA233F50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [03:43<03:30, 52.64s/it]

An error occurred for the following params 1610612746, 2019-20: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred for the following params 1610612746, 2019-20: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
An error occurred for the following params 1610612746, 2019-20: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/commonteamroster?LeagueID=&Season=2019-20&TeamID=1610612746 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002A8AA28B310>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
An error occurred for the following params 1610612761, 2019-20: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred for the following params 1610612761, 2019-20: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An 

 70%|██████████████████████████████████████████████████████████                         | 7/10 [06:14<04:14, 84.91s/it]

An error occurred for the following params 1610612756, 2020-21: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred for the following params 1610612756, 2020-21: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
An error occurred for the following params 1610612756, 2020-21: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
An error occurred for the following params 1610612756, 2020-21: HTTPSConnectionPool(host='stats.nba.com', port=443): Max retries exceeded with url: /stats/commonteamroster?LeagueID=&Season=2020-21&TeamID=1610612756 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002A8AA1767D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [08:04<00:00, 48.41s/it]


#### Step 2) Create ID Database of Players in the Modern Tracking Era (2013-Present)

In [12]:
# build the id -> name table and save it to data/ids/players_modern_database.csv
# NOTE: IDs without a match in players.get_players() are dropped, so the row count can be
# smaller than len(modern_players)
ids_df = _create_ids_database(modern_players)
print(ids_df.shape)
ids_df.head()

100%|████████████████████████████████████████████████████████████████████████████| 1286/1286 [00:00<00:00, 7865.60it/s]

(1283, 4)





Unnamed: 0,PLAYER_NAME,FIRST_NAME,LAST_NAME,PLAYER_ID
0,Jeff Teague,Jeff,Teague,201952
1,Lou Williams,Lou,Williams,101150
2,Paul Millsap,Paul,Millsap,200794
3,DeMarre Carroll,DeMarre,Carroll,201960
4,Pero Antic,Pero,Antic,203544


#### Step 3) Scrape Every Player's Entire Gamelog in the Modern Tracking Era (2013-Present)

In [3]:
# Steps 1 & 2 can be skipped if the IDs were already saved to the .csv file below
# reload the saved player id table and preview it
ids_df = pd.read_csv('data/ids/players_modern_database.csv')
print(ids_df.shape)
ids_df.head(2)

(1283, 4)


Unnamed: 0,PLAYER_NAME,FIRST_NAME,LAST_NAME,PLAYER_ID
0,Jeff Teague,Jeff,Teague,201952
1,Lou Williams,Lou,Williams,101150


In [32]:
# open a Chrome window whose downloads go to the default gamelogs folder
browser = _open_chromedriver()

In [41]:
print('Scraping Regular Season Gamelogs...')

# player IDs whose download failed during the main run and were re-scraped separately
# (the xpath to the .csv download link was different for small gamelogs)
error_ids = [
    2568, 1885, 201591, 1627868, 1627741, 202918, 1628500, 1628571, 1629129,
    1629055, 1629341, 1629739, 1629624, 1630492, 1630258, 1630624, 1630597,
    1630525, 1629083, 1630604, 1631171, 1631221, 1630641, 1631096,
]

# download the gamelog .csv file for every player in the id table
player_ids = ids_df['PLAYER_ID'].to_list()
for pid in tqdm(player_ids):
    _scrape_player_rs_gamelogs(browser, pid)

print('Finished Regular Season Gamelogs!')

Scraping Regular Season Gamelogs...


100%|██████████████████████████████████████████████████████████████████████████████████████| 1283/1283 [00:00<?, ?it/s]

Finished Regular Season Gamelogs!





#### Step 4) Clean Gamelog Data & Get Possession Data by Season & Team for each Player

In [44]:
poss_filepath = "C:/Users/lukar/Desktop/Sports Analytics/NBA Ball Handle Rate/data/gamelogs/"
poss_files = os.listdir(poss_filepath)

# collect one dataframe per gamelog file and concatenate once at the end
# (growing a dataframe with concat inside the loop re-copies every row each iteration)
gamelog_frames = []

# iterate through all the gamelog files we have
for filename in tqdm(poss_files):
    # file names start with the player id, e.g. '<pid>_rs_pbp_scoring_gamelog_2013-23.csv'
    pid = filename.split('_')[0].replace('.csv','')
    # skip anything that is not a player gamelog (e.g. a leftover 'pbpstats_export.csv')
    if not (filename.endswith('.csv') and pid.isdigit()):
        continue

    temp_df = pd.read_csv(os.path.join(poss_filepath, filename))
    temp_df['PLAYER_ID'] = int(pid)
    # check if the gamelog had no games available for that player, if not, add to dataframe
    # (presumably an empty export contains a row whose Date is 0 - verify against a sample file)
    if (0 not in temp_df['Date'].to_list()):
        gamelog_frames.append(temp_df)

# create dataframe of all the games
if gamelog_frames:
    possessions_df = pd.concat(gamelog_frames).reset_index(drop=True)
else:
    possessions_df = pd.DataFrame()

print(possessions_df.shape)

possessions_df['Date'] = pd.to_datetime(possessions_df['Date'])

possessions_df.head()

100%|██████████████████████████████████████████████████████████████████████████████| 1283/1283 [01:19<00:00, 16.18it/s]


(252309, 32)


Unnamed: 0,Date,Team,Opponent,Minutes,OffPoss,Points,FG2M,FG2A,Fg2Pct,FG3M,...,ShotQualityAvg,EfgPct,TsPct,PtsPutbacks,Fg2aBlocked,FG2APctBlocked,Fg3aBlocked,FG3APctBlocked,Usage,PLAYER_ID
0,2013-10-30,GSW,LAL,17:59,38,2,1,2,0.5,0,...,0.6982,0.5,0.5,0,0,0.0,0,0.0,4.651163,101106
1,2013-10-31,GSW,LAC,28:09,61,17,6,7,0.857143,0,...,0.556986,0.857143,0.85,2,0,0.0,0,0.0,16.666667,101106
2,2013-11-02,GSW,SAC,20:28,41,6,3,6,0.5,0,...,0.494983,0.5,0.428571,0,0,0.0,0,0.0,19.512195,101106
3,2013-11-04,GSW,PHI,22:57,52,4,1,1,1.0,0,...,0.8625,1.0,0.666667,0,0,0.0,0,0.0,6.557377,101106
4,2013-11-06,GSW,MIN,16:18,34,2,1,2,0.5,0,...,0.4265,0.5,0.5,0,0,0.0,0,0.0,13.513514,101106


In [138]:
# load .json file of season dates and convert it to a dataframe
# (one row per season key; the context manager closes the file handle after reading)
with open('assets/seasons.json') as r:
    season_dates = json.load(r)
dates_df = pd.DataFrame.from_dict(season_dates, orient='index')

def map_season(game_date, season_table=None):
    '''
    returns the NBA season (YYYY-YY+1 format) whose regular season window contains the game date,
    or '' when the date falls outside every window

    game_date: date-like object supporting strftime (e.g. a pandas Timestamp)
    season_table: optional dataframe indexed by season start year with the columns
                  regular_start_month, regular_start_day, regular_end_month, regular_end_day;
                  defaults to the module-level dates_df
    '''
    if season_table is None:
        season_table = dates_df

    # convert date to YYYY-MM-DD format (zero-padded, so string comparison matches date order)
    game_date = game_date.strftime("%Y-%m-%d")

    # a single pass over the seasons is enough: every window is derived from the row's own year
    for year in list(season_table.index.values):
        season_start_month = season_table.loc[year, 'regular_start_month']
        season_start_day = season_table.loc[year, 'regular_start_day']
        season_end_month = season_table.loc[year, 'regular_end_month']
        season_end_day = season_table.loc[year, 'regular_end_day']

        # the regular season starts in `year` and ends in the following calendar year
        start_date = date(int(year), int(season_start_month), int(season_start_day)).strftime("%Y-%m-%d")
        end_date = date(int(year)+1, int(season_end_month), int(season_end_day)).strftime("%Y-%m-%d")

        # check if game date lies between the start and end date, if so, the game was played for the season
        if start_date <= game_date <= end_date:
            return str(year) + '-' + str(int(year)+1)[2:]

    return ''

# label every game with its season; dates outside all regular season windows get ''
possessions_df['SEASON'] = possessions_df['Date'].apply(map_season)
possessions_df.head(2)

Unnamed: 0,Date,Team,Opponent,Minutes,OffPoss,Points,FG2M,FG2A,Fg2Pct,FG3M,...,EfgPct,TsPct,PtsPutbacks,Fg2aBlocked,FG2APctBlocked,Fg3aBlocked,FG3APctBlocked,Usage,PLAYER_ID,SEASON
0,2013-10-30,GSW,LAL,17:59,38,2,1,2,0.5,0,...,0.5,0.5,0,0,0.0,0,0.0,4.651163,101106,2013-14
1,2013-10-31,GSW,LAC,28:09,61,17,6,7,0.857143,0,...,0.857143,0.85,2,0,0.0,0,0.0,16.666667,101106,2013-14


In [141]:
imprt_cols = ['SEASON','PLAYER_ID','Team','OffPoss']
group_keys = ['SEASON','PLAYER_ID','Team']

# total offensive possessions per (season, player, team), with the team column renamed to TEAM
player_possessions_gb = (
    possessions_df[imprt_cols]
    .groupby(by=group_keys)
    .sum()
    .reset_index()
    .rename(columns={'Team': 'TEAM'})
)

player_possessions_gb.to_csv('data/possessions/player_possessions_by_team_season_2013-23.csv', index=False)

player_possessions_gb.head(2)

Unnamed: 0,SEASON,PLAYER_ID,TEAM,OffPoss
0,2013-14,708,BKN,2072
1,2013-14,951,MIA,3637


#### Step 5) Get Traded Players Database

In [145]:
# create dictionary mapping of season to traded players
traded_players = {}

# iterate through each season
# (the season label is used as-is; parsing it into an int was unused and would raise on the
#  '' label that map_season gives to dates outside every regular season window)
for season in player_possessions_gb['SEASON'].unique():
    # subset the dataframe by season and look at the SEASON, PLAYER_ID column only
    season_df = player_possessions_gb[player_possessions_gb['SEASON'] == season].reset_index(drop=True)[['SEASON','PLAYER_ID']]
    # if there are duplicate PLAYER_IDs, then they played for multiple teams that season
    multi_team_df = season_df[season_df.duplicated(keep=False)].reset_index(drop=True)
    # match season to a list of unique Player IDs
    traded_players[season] = list(multi_team_df['PLAYER_ID'].unique())
    
def get_team_dates(season_param, gamelog_df, player_id):
    '''
    returns a dataframe with one row per team the player appeared for in the given season,
    holding the first and last game date with that team

    season_param: season label matched against gamelog_df['SEASON'] (e.g. '2013-14')
    gamelog_df: dataframe of games with the columns SEASON, PLAYER_ID, Team and Date
    player_id: player ID matched against gamelog_df['PLAYER_ID']
    output columns: SEASON, PLAYER_ID, TEAM, START_DATE, END_DATE (teams in order of first appearance)
    '''
    out_cols = ['SEASON','PLAYER_ID','TEAM','START_DATE','END_DATE']

    p_gamelog_df = gamelog_df[(gamelog_df['SEASON'] == season_param) &
                              (gamelog_df['PLAYER_ID'] == player_id)]
    # no games for this player/season: return an empty frame with the expected columns
    if p_gamelog_df.empty:
        return pd.DataFrame(columns=out_cols)

    # earliest and latest game per team in one pass; sort=False keeps first-appearance order
    team_dates = (p_gamelog_df
                  .groupby('Team', sort=False)['Date']
                  .agg(START_DATE='min', END_DATE='max')
                  .reset_index()
                  .rename(columns={'Team': 'TEAM'}))
    team_dates.insert(0, 'PLAYER_ID', player_id)
    team_dates.insert(0, 'SEASON', season_param)

    return team_dates[out_cols]

# collect one small dataframe per traded player and concatenate once at the end
traded_columns = ['SEASON','PLAYER_ID','TEAM','START_DATE','END_DATE']
traded_frames = []

# iterate through each season and then iterate through the player-ids of each season
for season_param in traded_players:
    for player_id in tqdm(traded_players[season_param]):
        # use the per-game dataframe (possessions_df), which carries the SEASON, PLAYER_ID,
        # Team and Date columns that get_team_dates reads
        traded_frames.append(get_team_dates(season_param, possessions_df, player_id))

# create dataframe
if traded_frames:
    traded_df = pd.concat(traded_frames).reset_index(drop=True)
else:
    traded_df = pd.DataFrame(columns=traded_columns)

print(traded_df.shape)

# convert date columns to string
traded_df['START_DATE'] = traded_df['START_DATE'].astype(str)
traded_df['END_DATE'] = traded_df['END_DATE'].astype(str)

traded_df.to_csv('data/ids/traded_players_database.csv', index=False)

traded_df.head()

100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 67.22it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [00:01<00:00, 66.95it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [00:00<00:00, 68.05it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 53/53 [00:00<00:00, 65.10it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 59/59 [00:00<00:00, 64.44it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 85/85 [00:01<00:00, 64.49it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 59/59 [00:00<00:00, 63.71it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [00:01<00:00, 66.27it/s]
100%|███████████████████████████████████

(1409, 5)





Unnamed: 0,SEASON,PLAYER_ID,TEAM,START_DATE,END_DATE
0,2013-14,1889,DEN,2013-10-30,2013-12-30
1,2013-14,1889,WAS,2014-02-22,2014-04-16
2,2013-14,2406,MIL,2013-10-30,2014-02-20
3,2013-14,2406,OKC,2014-03-04,2014-04-16
4,2013-14,2422,SAC,2013-10-30,2013-12-07
