In [8]:
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# Create a function to scrape the data from a basketball-reference page
def _fetch_page_source(url, max_wait_time):
    """Load ``url`` in a fresh Chrome session and return the rendered HTML.

    Raises ``TimeoutError`` if ``document.readyState`` has not reached
    ``"complete"`` within ``max_wait_time`` seconds; Selenium errors propagate
    unchanged. The browser is closed exactly once, on success or failure.
    """
    driver = webdriver.Chrome()
    try:
        deadline = time.time() + max_wait_time
        # Bound the blocking navigation as well, so a hung load cannot stall
        # far beyond the wait budget before the readiness check even starts.
        driver.set_page_load_timeout(max_wait_time)
        driver.get(url)

        # Poll until the browser reports the document as fully loaded
        while driver.execute_script("return document.readyState;") != "complete":
            if time.time() > deadline:
                raise TimeoutError(f"Timeout reached for {url}.")
            time.sleep(1)  # Wait a bit before checking again

        return driver.page_source
    finally:
        driver.quit()


def _clean_bballref_table(df, data_type):
    """Flatten two-level headers, drop 'Unnamed' columns, keep only 'PF' rows."""
    # The shooting table is parsed with a two-level header; join both levels
    # into one label, then strip the placeholder top level where it is 'Unnamed'.
    if data_type == 'shooting_stats' and isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.map(' - '.join)
        df = df.rename(columns={
            column: column.split(' - ')[1]
            for column in df.columns
            if 'Unnamed' in column
        })

    # Whatever is still labelled 'Unnamed' carries no header and is discarded
    unnamed_columns = [
        column for column in df.columns
        if isinstance(column, str) and 'Unnamed' in column
    ]
    df = df.drop(columns=unnamed_columns)

    if 'Pos' not in df.columns:
        raise KeyError(f"'Pos' column not found; available columns: {list(df.columns)}")

    # Return an independent copy so callers can assign columns on the result
    # without pandas warning about writing to a slice of the parsed table.
    return df[df['Pos'] == 'PF'].copy()


def get_bballref_df(url, data_type, max_retries=3, max_wait_time=30):
    """Scrape one basketball-reference table and return its power-forward rows.

    Parameters
    ----------
    url : str
        Page to open in Chrome.
    data_type : str
        ``id`` attribute of the ``<table>`` element to extract. The value
        ``'shooting_stats'`` additionally triggers flattening of the
        two-level column header.
    max_retries : int, default 3
        Maximum number of attempts to load the page before giving up.
    max_wait_time : int or float, default 30
        Seconds allowed for a single page load.

    Returns
    -------
    pandas.DataFrame
        The parsed table without 'Unnamed' columns, restricted to rows whose
        ``Pos`` value equals ``'PF'``.

    Raises
    ------
    RuntimeError
        If the page could not be loaded within ``max_retries`` attempts.
    ValueError
        If the page contains no table with id ``data_type``.
    KeyError
        If the parsed table has no ``Pos`` column.
    """
    page_source = None
    last_error = None

    # Retry in a bounded loop: a URL that keeps failing must end in an error
    # rather than retrying forever.
    for attempt in range(1, max_retries + 1):
        try:
            page_source = _fetch_page_source(url, max_wait_time)
            break
        except Exception as e:
            last_error = e
            print(f"Error for {url}: {e}")
            if attempt < max_retries:
                print(f"Retrying ({attempt + 1}/{max_retries})...")

    if page_source is None:
        raise RuntimeError(
            f"Could not load {url} after {max_retries} attempt(s)"
        ) from last_error

    # Parse with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html')

    # Find the table
    table = soup.find('table', {'id': data_type})
    if table is None:
        raise ValueError(f"No table with id {data_type!r} found at {url}")

    # Converts HTML table to DataFrame; wrapped in StringIO because passing a
    # literal HTML string to read_html is deprecated in recent pandas.
    df = pd.read_html(StringIO(str(table)))[0]

    return _clean_bballref_table(df, data_type)

In [9]:
# Seasons to scrape (2014 through 2024), stored as the year strings used in the URLs
seasons = [str(year) for year in range(2014, 2025)]

# Create a helper function to keep only the TOT (season total) row if the player was on multiple teams that season
def keep_tot_or_first(group):
    """Reduce one player's rows to the 'TOT' row(s), or to the first row.

    ``group`` is a DataFrame with a 'Tm' column. When any row has
    ``Tm == 'TOT'`` those rows are returned; otherwise a one-row DataFrame
    holding the first row of ``group`` is returned.
    """
    tot_rows = group.loc[group['Tm'] == 'TOT']
    if not tot_rows.empty:
        return tot_rows
    # No 'TOT' row: fall back to the first entry of the group
    return group.iloc[[0]]

# One cleaned dataframe per season and category, keyed like '2014advanced'
bballref_dfs = {}

# Columns discarded from each of the 2 desired categories
columns_to_drop = {
    'advanced': ['Rk', 'Tm', 'Age', 'Pos', 'PER', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
    'shooting': ['Rk', 'Tm', 'Age', 'Pos', 'Dunks - %FGA', 'Dunks - #', 'Heaves - Att.', 'Heaves - #'],
}

# Scrape every season of one category before moving on to the next category
for data, unwanted in columns_to_drop.items():
    for season in seasons:
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_{data}.html'
        df = get_bballref_df(url, data + '_stats')
        # Strip the trailing asterisk that marks some player names
        df['Player'] = df['Player'].str.replace(r'\*$', '', regex=True)
        # Collapse each player to a single row (the 'TOT' row when present)
        df = df.groupby('Player', group_keys=False).apply(keep_tot_or_first)
        df = df.drop(columns=unwanted)
        if data == 'shooting':
            df = df.rename(columns={'Dist.': 'Avg FG Distance'})
        bballref_dfs[season + data] = df

Timeout reached for https://www.basketball-reference.com/leagues/NBA_2015_advanced.html. Retrying...
Error for https://www.basketball-reference.com/leagues/NBA_2016_advanced.html: Message: unknown error: cannot determine loading status
from disconnected: unable to send message to renderer
  (Session info: chrome=128.0.6613.139)
Stacktrace:
	GetHandleVerifier [0x00007FF6C3359412+29090]
	(No symbol) [0x00007FF6C32CE239]
	(No symbol) [0x00007FF6C318B1DA]
	(No symbol) [0x00007FF6C3172EBC]
	(No symbol) [0x00007FF6C3172E2C]
	(No symbol) [0x00007FF6C31715CB]
	(No symbol) [0x00007FF6C317231F]
	(No symbol) [0x00007FF6C31811BD]
	(No symbol) [0x00007FF6C31995DD]
	(No symbol) [0x00007FF6C319F17A]
	(No symbol) [0x00007FF6C3172A6A]
	(No symbol) [0x00007FF6C3199176]
	(No symbol) [0x00007FF6C3226907]
	(No symbol) [0x00007FF6C3206493]
	(No symbol) [0x00007FF6C31D09D1]
	(No symbol) [0x00007FF6C31D1B31]
	GetHandleVerifier [0x00007FF6C367871D+3302573]
	GetHandleVerifier [0x00007FF6C36C4243+3612627]
	GetHa

In [10]:
# Inspect which columns remain in the 2014 advanced-stats dataframe
bballref_dfs['2014advanced'].columns

Index(['Player', 'G', 'MP', 'TS%', '3PAr', 'FTr', 'USG%'], dtype='object')

In [11]:
# Inspect which columns remain in the 2018 shooting-stats dataframe
bballref_dfs['2018shooting'].columns

Index(['Player', 'G', 'MP', 'FG%', 'Avg FG Distance',
       '% of FGA by Distance - 2P', '% of FGA by Distance - 0-3',
       '% of FGA by Distance - 3-10', '% of FGA by Distance - 10-16',
       '% of FGA by Distance - 16-3P', '% of FGA by Distance - 3P',
       'FG% by Distance - 2P', 'FG% by Distance - 0-3',
       'FG% by Distance - 3-10', 'FG% by Distance - 10-16',
       'FG% by Distance - 16-3P', 'FG% by Distance - 3P', '% of FG Ast'd - 2P',
       '% of FG Ast'd - 3P', 'Corner 3s - %3PA', 'Corner 3s - 3P%'],
      dtype='object')

In [13]:
# Save each dataframe as '<key>.csv' (e.g. '2014advanced.csv') using a relative path
for name, frame in bballref_dfs.items():
    frame.to_csv(f'{name}.csv')