In [8]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Create a function to scrape the data from a basketball-reference page
def get_bballref_df(url, data_type, max_retries=3, max_wait_time=60):
    """Scrape one stats table from a basketball-reference page into a DataFrame.

    A fresh Chrome session is opened for every attempt and is always closed
    again, whether the attempt succeeds or fails.

    Parameters
    ----------
    url : str
        Address of the page to load.
    data_type : str
        HTML ``id`` of the ``<table>`` element to extract (e.g. 'advanced_stats').
    max_retries : int, optional
        How many times to try loading the page before giving up.
    max_wait_time : int or float, optional
        Seconds allowed per attempt (including the initial navigation) for the
        document to reach the "complete" ready state.

    Returns
    -------
    pandas.DataFrame
        The parsed table with every column whose label contains 'Unnamed'
        removed. For 'shooting_stats' and 'pbp_stats' the two header levels are
        flattened to '<upper> - <lower>' labels first.

    Raises
    ------
    RuntimeError
        If the page could not be loaded within ``max_retries`` attempts.
    ValueError
        If the page contains no table whose id is ``data_type``.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from io import StringIO

    last_error = None
    soup = None

    # A bounded loop replaces unbounded recursive retries, which would start a
    # new browser per stack frame and never terminate for a dead URL.
    for attempt in range(1, max_retries + 1):
        driver = webdriver.Chrome()
        deadline = time.time() + max_wait_time
        try:
            # Open the specified page
            driver.get(url)

            # Test the ready state *before* the deadline: a page that finished
            # loading during a slow driver.get() should be used, not thrown away.
            while driver.execute_script("return document.readyState;") != "complete":
                if time.time() > deadline:
                    raise TimeoutError(f"page not loaded after {max_wait_time} seconds")
                time.sleep(1)  # Wait a bit before checking again

            # Parse with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html')
            break
        except TimeoutError as e:
            last_error = e
            print(f"Timeout reached for {url}. Retrying...")
        except Exception as e:
            last_error = e
            print(f"Error for {url}: {e}")
        finally:
            # Single point of cleanup: runs on success, on failure and on break.
            driver.quit()
    else:
        # Reached only when no attempt hit the `break` above.
        raise RuntimeError(
            f"Could not load {url} after {max_retries} attempts"
        ) from last_error

    # Find the table
    table = soup.find('table', {'id': data_type})
    if table is None:
        raise ValueError(f"No table with id '{data_type}' found at {url}")

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    df = pd.read_html(StringIO(str(table)))[0]

    # Additional cleaning only needed for the shooting stats and play-by-play stats,
    # whose headers span two rows.
    if data_type in ('shooting_stats', 'pbp_stats'):
        df.columns = df.columns.map(' - '.join)
        # Where a header level was blank pandas inserts an 'Unnamed…' placeholder;
        # keep only the second level's label for those columns.
        df = df.rename(columns={
            column: column.split(' - ')[1]
            for column in df.columns
            if 'Unnamed' in column
        })

    # Whatever is still labelled 'Unnamed' is a spacer column with no data of interest.
    df = df.drop(columns=[column for column in df.columns if 'Unnamed' in column])

    return df

In [9]:
# Seasons to scrape, as the year strings used in the page URLs (2001 through 2019)
seasons = [str(year) for year in range(2001, 2020)]

# Table types to scrape for every season
tables = [
    'advanced',
    'shooting',
    'play-by-play',
    'per_game',
]

# Helper that reduces one player's rows for a season to a single entry
def keep_tot_or_first(group, d_type):
    """Pick the row(s) that represent a player's whole season.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows belonging to one player.
    d_type : str
        Table type; 'per_game' frames name the team column 'Team', every other
        type names it 'Tm'.

    Returns
    -------
    pandas.DataFrame
        The rows whose team is 'TOT' when any exist, otherwise a one-row frame
        holding the group's first row.
    """
    team_col = 'Team' if d_type == 'per_game' else 'Tm'
    is_total = group[team_col] == 'TOT'
    if is_total.any():
        # A 'TOT' row is present for this player, so keep it
        return group[is_total]
    # No 'TOT' row: fall back to the first entry
    return group.iloc[[0]]

# Scraped frames, keyed by season followed by table type (e.g. '2001advanced')
bballref_dfs = {}

# Table types whose HTML table id is not simply '<type>_stats'
table_ids = {'play-by-play': 'pbp_stats'}

# Columns removed from each table type once players have been de-duplicated
unwanted_columns = {
    'advanced': ['Rk', 'Tm', 'Age', 'TS%', '3PAr', 'PER', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
    'shooting': ['G', 'MP', 'Dist.', 'Rk', 'Tm', 'Age', 'Pos', 'FG% by Distance - 2P', 'FG% by Distance - 0-3', 'FG% by Distance - 3-10', 'FG% by Distance - 10-16', 'FG% by Distance - 16-3P', 'FG% by Distance - 3P', 'FG%', 'Dunks - #', 'Heaves - Att.', 'Heaves - #', 'Corner 3s - 3P%'],
    'play-by-play': ['Totals - G', 'Totals - MP', 'Rk', 'Tm', 'Age', 'Pos', 'Position Estimate - PG%', 'Position Estimate - SG%', 'Position Estimate - SF%', 'Position Estimate - PF%', 'Position Estimate - C%', '+/- Per 100 Poss. - OnCourt', '+/- Per 100 Poss. - On-Off', 'Turnovers - BadPass', 'Turnovers - LostBall', 'Fouls Committed - Shoot', 'Fouls Committed - Off.', 'Fouls Drawn - Off.', 'Misc. - PGA', 'Misc. - And1', 'Misc. - Blkd'],
    'per_game': ['G', 'MP', 'Rk', 'Team', 'Age', 'Pos', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'TRB', 'PTS', 'Awards'],
}

# Go through and extract the data from every season for each of the table types
for data in tables:
    for season in seasons:
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_{data}.html'
        df = get_bballref_df(url, table_ids.get(data, data + '_stats'))

        # Strip the trailing asterisk that some player names carry
        df['Player'] = df['Player'].str.replace(r'\*$', '', regex=True)

        # Collapse every player to a single row for the season
        df = df.groupby('Player', group_keys=False).apply(lambda group: keep_tot_or_first(group, data))

        # Any table type without its own entry is treated like 'per_game'
        df = df.drop(columns=unwanted_columns.get(data, unwanted_columns['per_game']))
        bballref_dfs[season + data] = df

Timeout reached for https://www.basketball-reference.com/leagues/NBA_2015_advanced.html. Retrying...
Error for https://www.basketball-reference.com/leagues/NBA_2016_advanced.html: Message: unknown error: cannot determine loading status
from disconnected: unable to send message to renderer
  (Session info: chrome=128.0.6613.139)
Stacktrace:
	GetHandleVerifier [0x00007FF6C3359412+29090]
	(No symbol) [0x00007FF6C32CE239]
	(No symbol) [0x00007FF6C318B1DA]
	(No symbol) [0x00007FF6C3172EBC]
	(No symbol) [0x00007FF6C3172E2C]
	(No symbol) [0x00007FF6C31715CB]
	(No symbol) [0x00007FF6C317231F]
	(No symbol) [0x00007FF6C31811BD]
	(No symbol) [0x00007FF6C31995DD]
	(No symbol) [0x00007FF6C319F17A]
	(No symbol) [0x00007FF6C3172A6A]
	(No symbol) [0x00007FF6C3199176]
	(No symbol) [0x00007FF6C3226907]
	(No symbol) [0x00007FF6C3206493]
	(No symbol) [0x00007FF6C31D09D1]
	(No symbol) [0x00007FF6C31D1B31]
	GetHandleVerifier [0x00007FF6C367871D+3302573]
	GetHandleVerifier [0x00007FF6C36C4243+3612627]
	GetHa

In [1]:
# Regroup the scraped frames as {season: {'<season><table>': frame}}.
# A frame belongs to a season when the season string occurs in its key.
separated_dfs_by_season = {
    season: {
        key: frame
        for key, frame in bballref_dfs.items()
        if season in key
    }
    for season in seasons
}

SyntaxError: incomplete input (2987636711.py, line 2)

In [None]:
# Strip every column whose label contains 'Unnamed' from each scraped frame
# (done in place, so the frames held in the source dictionary change as well)
for season_frames in separated_dfs_by_season.values():
    for frame in season_frames.values():
        placeholder_cols = [col for col in frame.columns if 'Unnamed' in col]
        frame.drop(columns=placeholder_cols, inplace=True)

# One merged frame per season, keyed by season
dfs_by_season = {}

# Tables are inner-joined on 'Player' in exactly this order
merge_order = ['advanced', 'shooting', 'per_game', 'play-by-play']

for season, season_frames in separated_dfs_by_season.items():
    first_table, *remaining_tables = merge_order
    merged_df = season_frames[season + first_table]
    for table_name in remaining_tables:
        merged_df = pd.merge(merged_df, season_frames[season + table_name], on='Player', how='inner')
    dfs_by_season[season] = merged_df

In [None]:
# Tag a copy of every season's frame with a 'Season' column
df_list = [
    season_df.assign(Season=season)
    for season, season_df in dfs_by_season.items()
]

# Stack all seasons into one frame indexed by (Season, Player)
combined_df = pd.concat(df_list, ignore_index=True).set_index(['Season', 'Player'])

In [None]:
# Stats that get rescaled to a per-48-minute rate
per_48_stats = ['FGA', 'PF', 'BLK', 'STL', 'DRB', 'ORB', 'TOV', 'AST', 'Fouls Drawn - Shoot']

# Set the position labels aside before coercing to numbers. 'Pos' holds strings
# such as 'PF'; pd.to_numeric(errors='coerce') would turn them into NaN (then 0),
# and the position filter at the end of this cell would match no rows at all.
positions = combined_df['Pos']

# Changing all the remaining values in the df to be floats so that I can apply PCA
player_stats = (
    combined_df
    .drop(columns=['Pos'])
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)  # Filling NaNs with 0
    .astype(float)
)

# Minutes per game is the same for every stat, so compute it once
minutes_per_game = player_stats['MP'] / player_stats['G']
for stat in per_48_stats:
    player_stats[stat] = player_stats[stat] / minutes_per_game * 48

# 'Fouls Drawn - Shoot' is additionally divided by games played
# (presumably because that table reports it as a season total - TODO confirm)
player_stats['Fouls Drawn - Shoot'] = player_stats['Fouls Drawn - Shoot'] / player_stats['G']

# The divisions above yield inf/NaN for rows with zero games or minutes, so the
# clean-up has to happen after them to have any effect.
player_stats = player_stats.replace([np.inf, -np.inf], 0).fillna(0)

# Cleaning up the dataframe
player_stats = player_stats.rename(columns={
    'PF': 'Personal Fouls Committed',
    'Fouls Drawn - Shoot': 'Shooting Fouls Drawn',
})
player_stats = player_stats.drop(columns=['G', 'MP', '% of FGA by Distance - 2P', 'GS', 'FG'])

# Keep power forwards only. The mask is applied positionally; `positions` and
# `player_stats` share the same row order. 'Pos' itself is not carried forward,
# so the result is purely numeric.
combined_df = player_stats[(positions == 'PF').to_numpy()]

In [None]:
# Create a real copy of the dataframe to apply PCA on. A plain assignment would
# only alias combined_df, so the clean-up below would also modify combined_df.
final_df = combined_df.copy()

# Changing all the values in the df to be floats so that I can apply PCA:
# NaNs become 0, everything is cast to float, and infinite values become 0.
final_df = (
    final_df
    .fillna(0)
    .astype(float)
    .replace([np.inf, -np.inf], 0)
)

# Keep only the PCA features, in this fixed order
ordered_cols = ['USG%', 'FGA', 'Personal Fouls Committed', 'BLK', 'STL', 'DRB', 'ORB', 'TOV', 'AST', "% of FG Ast'd - 3P", "% of FG Ast'd - 2P", 'FTr', 'Shooting Fouls Drawn', 'Corner 3s - %3PA', '% of FGA by Distance - 3P', '% of FGA by Distance - 16-3P', '% of FGA by Distance - 10-16', '% of FGA by Distance - 3-10', '% of FGA by Distance - 0-3', 'Dunks - %FGA']
final_df = final_df.reindex(columns=ordered_cols)

# Removing any players named "Player"
# (presumably header rows repeated inside the scraped tables - TODO confirm)
final_df = final_df[final_df.index.get_level_values('Player') != 'Player']

In [None]:
# Bare last expression: the notebook renders the prepared frame as this cell's output
final_df

In [13]:
# Write every scraped table to '<key>.csv' in the current working directory
for key, frame in bballref_dfs.items():
    frame.to_csv(f'{key}.csv')