In [59]:
import time
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

def get_bballref_df(url, data_type, max_retries=3):
    """Scrape one stats table from a basketball-reference page into a DataFrame.

    Each attempt opens a fresh Selenium Chrome session, loads ``url``, waits
    until ``document.readyState`` reports ``"complete"`` and parses the page
    source with BeautifulSoup. The ``<table>`` whose ``id`` equals
    ``data_type`` is then converted with ``pandas.read_html`` and tidied.

    Parameters
    ----------
    url : str
        Address of the page to load.
    data_type : str
        HTML ``id`` of the table to extract (e.g. ``'advanced_stats'``).
        For ``'shooting_stats'`` and ``'pbp_stats'`` the column labels are
        joined with ``' - '`` (these tables are expected to parse with
        multi-level headers) and placeholder ``Unnamed`` parts are stripped.
    max_retries : int, default 3
        Maximum number of page-load attempts before giving up. Values below 1
        are treated as 1.

    Returns
    -------
    pandas.DataFrame
        The parsed table with every column whose label contains ``'Unnamed'``
        removed.

    Raises
    ------
    RuntimeError
        If the page could not be loaded within ``max_retries`` attempts.
    ValueError
        If the page contains no table with the requested ``id``.
    """
    max_wait_time = 60  # Maximum wait time in seconds for a single attempt
    attempts = max(1, max_retries)
    soup = None
    last_error = None

    # Bounded retry loop: a permanently failing URL must not retry forever
    for attempt in range(1, attempts + 1):
        driver = webdriver.Chrome()
        start_time = time.time()
        try:
            driver.get(url)

            # Poll once per second until the browser reports a finished load;
            # a page that is already complete is accepted immediately.
            while driver.execute_script("return document.readyState;") != "complete":
                if time.time() - start_time > max_wait_time:
                    raise TimeoutError(f"Timeout reached for {url}.")
                time.sleep(1)

            soup = BeautifulSoup(driver.page_source, 'html')
            break
        except Exception as e:
            last_error = e
            print(f"Error for {url} (attempt {attempt}/{attempts}): {e}")
        finally:
            # The only place the browser is closed, so it happens exactly once
            driver.quit()

    if soup is None:
        raise RuntimeError(
            f"Could not load {url} after {attempts} attempt(s)"
        ) from last_error

    # Find the table
    table = soup.find('table', {'id': data_type})
    if table is None:
        raise ValueError(f"No table with id '{data_type}' found at {url}")

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
    df = pd.read_html(StringIO(str(table)))[0]

    # Additional cleaning only needed for the shooting stats and play-by-play stats
    if data_type in ('shooting_stats', 'pbp_stats'):
        # Flatten the multi-level header into "<upper> - <lower>" labels
        df.columns = df.columns.map(' - '.join)
        # Where either part is a pandas "Unnamed" placeholder, keep only the lower label
        df = df.rename(columns={
            column: column.split(' - ')[1]
            for column in df.columns
            if 'Unnamed' in column
        })

    # Drop whatever placeholder columns are still left
    df = df.drop(columns=[column for column in df.columns if 'Unnamed' in column])

    return df

In [60]:
# Seasons to pull, kept as strings because they are spliced into URLs and dictionary keys
seasons = [str(year) for year in range(2001, 2025)]
# Page name suffixes, one per stats table requested for every season
tables = ['advanced', 'shooting', 'play-by-play', 'per_game']

def keep_tot_or_first(group, d_type):
    """Reduce one player's rows to a single row.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows belonging to one player.
    d_type : str
        Table name; ``'per_game'`` tables label the team column ``'Team'``,
        every other table uses ``'Tm'``.

    Returns
    -------
    pandas.DataFrame
        The row(s) whose team is ``'TOT'`` when at least one exists,
        otherwise a one-row frame holding the group's first row.
    """
    team_col = 'Team' if d_type == 'per_game' else 'Tm'
    tot_rows = group[group[team_col] == 'TOT']
    if len(tot_rows) > 0:
        return tot_rows
    # No 'TOT' row: fall back to the first row of the group
    return group.iloc[[0]]

# Scraped tables, keyed by season + table name (e.g. '2001advanced')
bballref_dfs = {}

# Columns discarded from each table after scraping; any table name not listed
# here falls back to the 'per_game' list
columns_to_drop = {
    'advanced': ['Rk', 'Tm', 'Age', 'TS%', '3PAr', 'PER', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
    'shooting': ['G', 'MP', 'Dist.', 'Rk', 'Tm', 'Age', 'Pos', 'FG% by Distance - 2P', 'FG% by Distance - 0-3', 'FG% by Distance - 3-10', 'FG% by Distance - 10-16', 'FG% by Distance - 16-3P', 'FG% by Distance - 3P', 'FG%', 'Dunks - #', 'Heaves - Att.', 'Heaves - #', 'Corner 3s - 3P%'],
    'play-by-play': ['Totals - G', 'Totals - MP', 'Rk', 'Tm', 'Age', 'Pos', 'Position Estimate - PG%', 'Position Estimate - SG%', 'Position Estimate - SF%', 'Position Estimate - PF%', 'Position Estimate - C%', '+/- Per 100 Poss. - OnCourt', '+/- Per 100 Poss. - On-Off', 'Turnovers - BadPass', 'Turnovers - LostBall', 'Fouls Committed - Shoot', 'Fouls Committed - Off.', 'Fouls Drawn - Off.', 'Misc. - PGA', 'Misc. - And1', 'Misc. - Blkd'],
    'per_game': ['G', 'MP', 'Rk', 'Team', 'Age', 'Pos', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'TRB', 'PTS', 'Awards'],
}

# Scrape every table for every season
for data in tables:
    # The play-by-play page names its table 'pbp_stats'; the others use '<name>_stats'
    table_id = 'pbp_stats' if data == 'play-by-play' else data + '_stats'
    unwanted = columns_to_drop.get(data, columns_to_drop['per_game'])
    for season in seasons:
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_{data}.html'
        df = get_bballref_df(url, table_id)
        # Strip a trailing '*' from player names so the same player matches across tables
        df['Player'] = df['Player'].str.replace(r'\*$', '', regex=True)
        # Collapse each player to a single row ('TOT' row if present, else the first)
        df = df.groupby('Player', group_keys=False).apply(lambda group: keep_tot_or_first(group, data))
        df = df.drop(columns=unwanted)
        bballref_dfs[season + data] = df

In [61]:
# Regroup the scraped tables as {season: {key: dataframe}}; a table belongs to
# a season when the season string occurs in its key
separated_dfs_by_season = {
    season: {key: frame for key, frame in bballref_dfs.items() if season in key}
    for season in seasons
}

In [62]:
# Strip every column whose label contains 'Unnamed' from each stored dataframe (in place)
for season, season_frames in separated_dfs_by_season.items():
    for key, frame in season_frames.items():
        unnamed_cols = [col for col in frame.columns if 'Unnamed' in col]
        frame.drop(columns=unnamed_cols, inplace=True)

# One merged dataframe per season: start from the advanced table and inner-join
# the remaining tables on 'Player', in this order
dfs_by_season = {}

for season, season_frames in separated_dfs_by_season.items():
    merged_df = season_frames[season + 'advanced']
    for table_name in ('shooting', 'per_game', 'play-by-play'):
        merged_df = pd.merge(merged_df, season_frames[season + table_name], on='Player', how='inner')
    dfs_by_season[season] = merged_df

In [63]:
# Tag a copy of each season's dataframe with a 'Season' column
df_list = [df.assign(Season=season) for season, df in dfs_by_season.items()]

# Stack all seasons, keep only the rows listed as power forwards, and index by (Season, Player)
combined_df = (
    pd.concat(df_list, ignore_index=True)
    .loc[lambda frame: frame['Pos'] == 'PF']
    .drop(columns=['Pos'])
    .set_index(['Season', 'Player'])
)

In [64]:
# Coerce every column to float (unparseable entries become NaN), then replace
# NaNs and infinities with 0
combined_df = (
    combined_df
    .apply(pd.to_numeric, errors='coerce')
    .astype(float)
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

per_48_stats = ['FGA', 'PF', 'BLK', 'STL', 'DRB', 'ORB', 'TOV', 'AST', 'Fouls Drawn - Shoot']

# Rescale each listed stat: divide by (MP / G) and multiply by 48.
# MP and G are not in per_48_stats, so the divisor is the same for every stat.
minutes_per_game = combined_df['MP'] / combined_df['G']
for stat in per_48_stats:
    combined_df[stat] = combined_df[stat] / minutes_per_game * 48

In [65]:
# Divide the shooting-fouls-drawn column by G, give the two foul columns
# readable names, and drop the helper columns that are no longer needed
combined_df = (
    combined_df
    .assign(**{'Fouls Drawn - Shoot': lambda frame: frame['Fouls Drawn - Shoot'] / frame['G']})
    .rename(columns={
        'PF': 'Personal Fouls Committed',
        'Fouls Drawn - Shoot': 'Shooting Fouls Drawn',
    })
    .drop(columns=['G', 'MP', '% of FGA by Distance - 2P', 'GS', 'FG'])
)

In [66]:
# Work on a real copy so the PCA preparation below never mutates combined_df
# (a plain assignment would only alias the same object)
final_df = combined_df.copy()

# PCA needs a purely numeric, finite matrix: fill NaNs with 0, cast to float,
# then replace any infinite values with 0
final_df = final_df.fillna(0)
final_df = final_df.astype(float)
final_df = final_df.replace([np.inf, -np.inf], 0)

# Select and order the feature columns; reindex leaves an all-NaN column for
# any label that is not present in the dataframe
ordered_cols = ['USG%', 'FGA', 'Personal Fouls Committed', 'BLK', 'STL', 'DRB', 'ORB', 'TOV', 'AST', "% of FG Ast'd - 3P", "% of FG Ast'd - 2P", 'FTr', 'Shooting Fouls Drawn', 'Corner 3s - %3PA', '% of FGA by Distance - 3P', '% of FGA by Distance - 16-3P', '% of FGA by Distance - 10-16', '% of FGA by Distance - 3-10', '% of FGA by Distance - 0-3', 'Dunks - %FGA']
final_df = final_df.reindex(columns = ordered_cols)

# Removing any players named "Player" (presumably repeated header rows picked
# up while scraping), without calling in-place methods on a filtered slice
final_df = (
    final_df
    .reset_index()
    .loc[lambda frame: frame['Player'] != 'Player']
    .set_index(['Season', 'Player'])
)

In [67]:
# Display the prepared feature table, indexed by (Season, Player), for inspection
final_df

Unnamed: 0_level_0,Unnamed: 1_level_0,USG%,FGA,Personal Fouls Committed,BLK,STL,DRB,ORB,TOV,AST,% of FG Ast'd - 3P,% of FG Ast'd - 2P,FTr,Shooting Fouls Drawn,Corner 3s - %3PA,% of FGA by Distance - 3P,% of FGA by Distance - 16-3P,% of FGA by Distance - 10-16,% of FGA by Distance - 3-10,% of FGA by Distance - 0-3,Dunks - %FGA
Season,Player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001,A.C. Green,14.4,11.158044,4.184266,0.278951,1.115804,6.973777,3.626364,1.394755,1.394755,0.000,0.764,0.343,1.496811,0.167,0.019,0.438,0.164,0.102,0.278,0.012
2001,Aaron Williams,17.8,13.310959,6.571233,2.358904,1.179452,7.750685,4.380822,2.695890,1.853425,0.000,0.623,0.477,2.691781,0.000,0.003,0.103,0.131,0.262,0.502,0.129
2001,Adam Keefe,11.5,9.232536,5.770335,1.154067,1.538756,6.924402,5.000957,2.308134,1.923445,1.000,0.683,0.396,1.435407,0.333,0.019,0.258,0.170,0.176,0.377,0.038
2001,Al Harrington,18.8,13.852008,5.738689,0.395772,1.583087,6.728118,2.968288,3.759831,3.364059,0.000,0.675,0.289,1.649049,0.143,0.013,0.210,0.193,0.250,0.333,0.068
2001,Alan Henderson,22.6,17.810387,4.259006,0.774365,1.355138,6.001326,4.839779,3.291050,1.355138,0.000,0.537,0.404,3.288398,0.000,0.001,0.051,0.210,0.304,0.434,0.046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024,Trey Lyles,14.4,13.222798,3.606218,0.721244,0.721244,8.174093,2.644560,1.682902,2.884974,1.000,0.729,0.218,1.409326,0.384,0.682,0.019,0.022,0.100,0.178,0.025
2024,Usman Garuba,20.8,16.000000,4.800000,8.000000,3.200000,11.200000,8.000000,4.800000,3.200000,0.000,0.000,0.333,2.666667,0.000,0.167,0.000,0.000,0.500,0.333,0.000
2024,Wenyen Gabriel,17.0,13.037037,5.925926,1.185185,1.185185,10.666667,4.148148,4.740741,1.777778,1.000,0.714,0.227,1.777778,0.500,0.273,0.000,0.091,0.091,0.545,0.136
2024,Zeke Nnaji,15.4,12.566667,6.766667,3.383333,1.450000,5.316667,5.316667,2.416667,2.900000,1.000,0.619,0.416,2.166667,0.391,0.154,0.007,0.027,0.215,0.597,0.181
