In [17]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
import html5lib

nba_teams = {
    'Miami Heat': 'MIA',
    'Chicago Bulls': 'CHI',
    'Philadelphia 76ers': 'PHI',
    'New Jersey Nets': 'NJN',
    'Golden State Warriors': 'GSW',
    'Boston Celtics': 'BOS',
    'Indiana Pacers': 'IND',
    'Atlanta Hawks': 'ATL',
    'New York Knicks': 'NYK',
    'Toronto Raptors': 'TOR',
    'Cleveland Cavaliers': 'CLE',
    'Orlando Magic': 'ORL',
    'Phoenix Suns': 'PHO',
    'Denver Nuggets': 'DEN',
    'Houston Rockets': 'HOU',
    'Minnesota Timberwolves': 'MIN',
    'San Antonio Spurs': 'SAS',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'Charlotte Hornets': 'CHH',
    'Detroit Pistons': 'DET',
    'Dallas Mavericks': 'DAL',
    'Seattle SuperSonics': 'SEA',
    'Vancouver Grizzlies': 'VAN',
    'Los Angeles Lakers': 'LAL',
    'Los Angeles Clippers': 'LAC',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
    'Milwaukee Bucks': 'MIL',
    'Memphis Grizzlies': 'MEM',
    'New Orleans Hornets': 'NOH',
    'New Orleans/Oklahoma City Hornets': 'NOK',
    'Oklahoma City Thunder': 'OKC',
    'Brooklyn Nets': 'BRK',
    'New Orleans Pelicans': 'NOP',
    'Charlotte Bobcats': 'CHO'
}

# Create a function to scrape the data from a basketball-reference page
def get_bballref_records_df(url):
    # Initialize the selenium driver
    driver = webdriver.Chrome()
    
    start_time = time.time()  # Record start time
    max_wait_time = 60  # Maximum wait time in seconds
    
    try:
        # Open the specified page
        driver.get(url)
        
        # Continuously check the time until the page is fully loaded
        while True:
            elapsed_time = time.time() - start_time
            if elapsed_time > max_wait_time:
                print(f"Timeout reached for {url}. Retrying...")
                driver.quit()
                return get_bballref_df(url)
            
            # Check if page is loaded
            if driver.execute_script("return document.readyState;") == "complete":
                break
            
            time.sleep(1)  # Wait a bit before checking again
            
        # Parse with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html')
    
    except Exception as e:
        print(f"Error for {url}: {e}")
        driver.quit()
        return get_bballref_df(url)

    finally:
        driver.quit()
    
    # Find the table
    table = soup.find('table', {'id': 'expanded_standings'})
    
    # Parse the table data
    df = pd.read_html(str(table))[0]  # Converts HTML table to DataFrame
    
    df.columns = df.columns.map(' - '.join)
    for column in df.columns:
        if 'Unnamed' in column:
            new_column_name = column.split(' - ')[1]
            df.rename(columns={column: new_column_name}, inplace=True)
    
    # Cleaning up the dataframes
    for column in df.columns:
        if 'Unnamed' in column:
            df.drop(columns=[column], inplace=True)
    
    df = df.drop(columns = [column for column in df.columns if column not in ['Team', 'Overall']])
    
    df['Tm'] = df['Team'].map(nba_teams)
    
    return df

In [18]:
seasons = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']
season_records_dfs = {}
for season in seasons:
    if season in ['2001', '2002']:
        nba_teams['Charlotte Hornets'] = 'CHH'
    else:
        nba_teams['Charlotte Hornets'] = 'CHA'
    season_records_dfs[season] = get_bballref_records_df(f'https://www.basketball-reference.com/leagues/NBA_{season}_standings.html')

In [20]:
processed_dfs = []

for season, df in season_records_dfs.items():
    # Create the new index as 'Rank-Statistic'
    df['Rank-Statistic'] = season + '-' + df['Tm']
    # Keep only 'Rank-Statistic' as the index and 'Overall' column
    processed_dfs.append(df.set_index('Rank-Statistic')[['Overall']])

# Concatenate all the processed DataFrames
result_df = pd.concat(processed_dfs)

In [26]:
result_df.to_csv(r"C:\Users\vaugh\Desktop\basketball-pf-research\Basketball-reference data\team_records_2001-2024.csv")

In [19]:
season_records_dfs

{'2001':                       Team Overall   Tm
 0        San Antonio Spurs   58-24  SAS
 1       Los Angeles Lakers   56-26  LAL
 2       Philadelphia 76ers   56-26  PHI
 3         Sacramento Kings   55-27  SAC
 4         Dallas Mavericks   53-29  DAL
 5                Utah Jazz   53-29  UTA
 6          Milwaukee Bucks   52-30  MIL
 7             Phoenix Suns   51-31  PHO
 8               Miami Heat   50-32  MIA
 9   Portland Trail Blazers   50-32  POR
 10         New York Knicks   48-34  NYK
 11  Minnesota Timberwolves   47-35  MIN
 12         Toronto Raptors   47-35  TOR
 13       Charlotte Hornets   46-36  CHH
 14         Houston Rockets   45-37  HOU
 15     Seattle SuperSonics   44-38  SEA
 16           Orlando Magic   43-39  ORL
 17          Indiana Pacers   41-41  IND
 18          Denver Nuggets   40-42  DEN
 19          Boston Celtics   36-46  BOS
 20         Detroit Pistons   32-50  DET
 21    Los Angeles Clippers   31-51  LAC
 22     Cleveland Cavaliers   30-52  CLE
 23     

In [28]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
import html5lib

# Create a function to scrape the data from a basketball-reference page
def get_bballref_df(url, data_type):
    # Initialize the selenium driver
    driver = webdriver.Chrome()
    
    start_time = time.time()  # Record start time
    max_wait_time = 60  # Maximum wait time in seconds
    
    try:
        # Open the specified page
        driver.get(url)
        
        # Continuously check the time until the page is fully loaded
        while True:
            elapsed_time = time.time() - start_time
            if elapsed_time > max_wait_time:
                print(f"Timeout reached for {url}. Retrying...")
                driver.quit()
                return get_bballref_df(url, data_type)
            
            # Check if page is loaded
            if driver.execute_script("return document.readyState;") == "complete":
                break
            
            time.sleep(1)  # Wait a bit before checking again
            
        # Parse with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html')
    
    except Exception as e:
        print(f"Error for {url}: {e}")
        driver.quit()
        return get_bballref_df(url, data_type)

    finally:
        driver.quit()
    
    if (data_type in ['advanced', 'shooting']):
        table=soup.find('table', {'id': data_type})
    else:
        # Find the table's container div by ID
        div_id = "switcher_" + data_type  # Adjusting based on your `data_type`
        table_div = soup.find('div', {'id': div_id})
        if table_div is None:
            print(f"Could not find table div with id {div_id}.")
            return None
    
        # Find the table
        table = table_div.find('table', {'id': data_type})
    
    # Parse the table data
    df = pd.read_html(str(table))[0]  # Converts HTML table to DataFrame
    
    # Additional cleaning only needed for the shooting stats and play-by-play stats
    if data_type == 'shooting' or data_type == 'pbp_stats':
        df.columns = df.columns.map(' - '.join)
        for column in df.columns:
            if 'Unnamed' in column:
                new_column_name = column.split(' - ')[1]
                df.rename(columns={column: new_column_name}, inplace=True)
    
    # Cleaning up the dataframes
    for column in df.columns:
        if 'Unnamed' in column:
            df.drop(columns=[column], inplace=True)
    
    return df

In [29]:
# List out all the seasons we want data from
seasons = ['2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']
tables = ['advanced', 'shooting', 'play-by-play', 'per_game']

# Create a helper function to keep players' stats from each team they played for in a single season
def drop_tot_rows(group):
    group['Team'] = group['Team'].astype(str)
    group = group[(group['Team'] != 'nan') & (group['Team'] != 'Team')]
    return group[~group['Team'].str.endswith('TM')]

# Create a dictionary to store all the dataframes
bballref_dfs = {}

# Go through and extract the data from every season for the 4 desired categories
for data in tables:
    for season in seasons:
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_{data}.html'
        if data == 'play-by-play':
            df = get_bballref_df(url, 'pbp_stats')
        elif data in ['advanced', 'shooting']:
            df = get_bballref_df(url, data)
        else:
            df = get_bballref_df(url, data + '_stats')
        df['Player'] = df['Player'].str.replace(r'\*$', '', regex=True)
        df = df.groupby('Player', group_keys=False).apply(drop_tot_rows)
        if data == 'advanced':
            df.drop(columns = ['Rk', 'Age', 'TS%', '3PAr', 'PER', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'], inplace=True)
        elif data == 'shooting':
            df.drop(columns = ['G', 'MP', 'Team', 'Dist.', 'Rk', 'Age', 'Pos',  'FG% by Distance - 2P', 'FG% by Distance - 0-3', 'FG% by Distance - 3-10', 'FG% by Distance - 10-16', 'FG% by Distance - 16-3P', 'FG% by Distance - 3P', 'FG%', 'Dunks - #', 'Heaves - Att.', 'Heaves - Md.', 'Corner 3s - 3P%'], inplace=True)
        elif data == 'play-by-play':
            df.drop(columns = ['G', 'MP', 'Rk', 'Team', 'Age', 'Pos', '+/- Per 100 Poss - OnCourt', '+/- Per 100 Poss - On-Off', 'Turnovers - BadPass', 'Turnovers - LostBall', 'Fouls Committed - Shoot', 'Fouls Committed - Off.', 'Misc. - PGA', 'Misc. - And1', 'Misc. - Blkd', 'Awards'], inplace=True)
        else:
            df.drop(columns = ['G', 'MP', 'Rk', 'Team', 'Age', 'Pos', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'TRB', 'PTS', 'Awards'], inplace=True)
        bballref_dfs[season + data] = df

Timeout reached for https://www.basketball-reference.com/leagues/NBA_2011_advanced.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2015_advanced.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2017_advanced.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2017_advanced.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2017_advanced.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2006_play-by-play.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2015_play-by-play.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2021_play-by-play.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2002_per_game.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2020_per_g

In [135]:
for season in seasons:
    for data in tables:
        if data != 'advanced':
            bballref_dfs[season + data] = bballref_dfs[season + data].drop(columns=['Team'])

In [174]:
separated_dfs_by_season['2024']['2024per_game'][separated_dfs_by_season['2024']['2024per_game']['Player'] == 'Spencer Dinwiddie']

Unnamed: 0,Player,GS,FG,FGA,ORB,DRB,AST,STL,BLK,TOV,PF
740,Spencer Dinwiddie,48,4.1,10.4,0.5,2.8,6.0,0.8,0.2,1.3,1.8
742,Spencer Dinwiddie,4,2.0,5.0,0.1,1.5,2.4,0.5,0.5,1.0,1.9


In [188]:
separated_dfs_by_season = {}
for season in seasons:
    separated_dfs_by_season[season] = {}
    for df in bballref_dfs:
        if season in df:
            separated_dfs_by_season[season][df] = bballref_dfs[df]

In [189]:
for season in seasons:
    for df in separated_dfs_by_season[season]:
        if 'advanced' not in df:
            separated_dfs_by_season[season][df] = separated_dfs_by_season[season][df].drop_duplicates(subset=['Player', 'GS'], keep='first')

In [190]:
# Remove 'Unnamed: 0' columns from each dataframe in the dictionary
for season in separated_dfs_by_season:
    for df in separated_dfs_by_season[season]:
        separated_dfs_by_season[season][df] = separated_dfs_by_season[season][df].drop(columns=[col for col in separated_dfs_by_season[season][df].columns if 'Unnamed' in col])
        separated_dfs_by_season[season][df]['GS'] = separated_dfs_by_season[season][df]['GS'].astype(str)

In [185]:
separated_dfs_by_season['2024']

{'2024advanced':               Player Team Pos   G  GS    MP   FTr  USG% Awards
 449       A.J. Green  MIL  SG  56   0   614  .097  15.2    NaN
 565      A.J. Lawson  DAL  SG  42   0   311  .190  20.0    NaN
 615       AJ Griffin  ATL  SF  20   0   171  .032  17.3    NaN
 60      Aaron Gordon  DEN  PF  73  73  2297  .376  17.8    NaN
 273    Aaron Holiday  HOU  PG  78   1  1269  .151  16.7    NaN
 ..               ...  ...  ..  ..  ..   ...   ...   ...    ...
 378      Zach LaVine  CHI  SG  25  23   872  .274  23.8    NaN
 618   Zavier Simpson  MEM  PG   7   0   161  .074  17.5    NaN
 468       Zeke Nnaji  DEN  PF  58   0   576  .416  15.4    NaN
 328  Ziaire Williams  MEM  SF  51  15  1038  .198  19.7    NaN
 77   Zion Williamson  NOP  PF  70  70  2207  .452  29.7    NaN
 
 [657 rows x 9 columns],
 '2024shooting':               Player  GS % of FGA by Distance - 2P % of FGA by Distance - 0-3  \
 0         A.J. Green   0                      .138                       .026   
 1       

In [186]:
dfs_by_season['2024'][dfs_by_season['2024']['Player'] == 'Spencer Dinwiddie']

Unnamed: 0,Player,Team,Pos,G,MP,FTr,USG%,% of FGA by Distance - 2P,% of FGA by Distance - 0-3,% of FGA by Distance - 3-10,...,Corner 3s - %3PA,FG,FGA,ORB,DRB,AST,STL,BLK,TOV,PF
1288,Spencer Dinwiddie,BRK,PG,48,1474,0.321,18.6,0.443,0.14,0.222,...,0.212,4.1,10.4,0.5,2.8,6.0,0.8,0.2,1.3,1.8
1289,Spencer Dinwiddie,BRK,PG,48,1474,0.321,18.6,0.443,0.14,0.222,...,0.212,4.1,10.4,0.5,2.8,6.0,0.8,0.2,1.3,1.8
1290,Spencer Dinwiddie,BRK,PG,48,1474,0.321,18.6,0.443,0.14,0.222,...,0.212,2.0,5.0,0.1,1.5,2.4,0.5,0.5,1.0,1.9
1291,Spencer Dinwiddie,BRK,PG,48,1474,0.321,18.6,0.443,0.14,0.222,...,0.212,2.0,5.0,0.1,1.5,2.4,0.5,0.5,1.0,1.9
1292,Spencer Dinwiddie,BRK,PG,48,1474,0.321,18.6,0.362,0.078,0.177,...,0.367,4.1,10.4,0.5,2.8,6.0,0.8,0.2,1.3,1.8
1293,Spencer Dinwiddie,BRK,PG,48,1474,0.321,18.6,0.362,0.078,0.177,...,0.367,4.1,10.4,0.5,2.8,6.0,0.8,0.2,1.3,1.8
1294,Spencer Dinwiddie,BRK,PG,48,1474,0.321,18.6,0.362,0.078,0.177,...,0.367,2.0,5.0,0.1,1.5,2.4,0.5,0.5,1.0,1.9
1295,Spencer Dinwiddie,BRK,PG,48,1474,0.321,18.6,0.362,0.078,0.177,...,0.367,2.0,5.0,0.1,1.5,2.4,0.5,0.5,1.0,1.9
1296,Spencer Dinwiddie,LAL,PG,28,678,0.355,12.2,0.443,0.14,0.222,...,0.212,4.1,10.4,0.5,2.8,6.0,0.8,0.2,1.3,1.8
1297,Spencer Dinwiddie,LAL,PG,28,678,0.355,12.2,0.443,0.14,0.222,...,0.212,4.1,10.4,0.5,2.8,6.0,0.8,0.2,1.3,1.8


In [191]:
# Create a new dictionary to hold the merged dataframes by season
dfs_by_season = {}

for season in separated_dfs_by_season:
    merged_df = pd.merge(separated_dfs_by_season[season][season + 'advanced'], separated_dfs_by_season[season][season + 'shooting'], on=['Player', 'GS'], how='inner')
    merged_df = pd.merge(merged_df, separated_dfs_by_season[season][season + 'per_game'], on=['Player', 'GS'], how='inner')
    merged_df = pd.merge(merged_df, separated_dfs_by_season[season][season + 'play-by-play'], on=['Player', 'GS'], how='inner')
    dfs_by_season[season] = merged_df

In [192]:
for df in dfs_by_season:
    cols_to_drop = []
    for col in dfs_by_season[df].columns:
        if 'GS' in col or 'Awards' in col or 'Foul' in col or 'Position Estimate' in col or '_x' in col or '_y' in col:
            cols_to_drop.append(col)
    dfs_by_season[df].drop(columns=cols_to_drop, inplace=True)

In [193]:
# Create an empty list to store each season's dataframe with the season column added
df_list = []

# Loop over each season and its corresponding dataframe in dfs_by_season
for season, df in dfs_by_season.items():
    # Create a copy of the dataframe and add a 'Season' column with the current season
    df_copy = df.copy()
    df_copy['Season'] = season
    # Append the modified dataframe to the list
    df_list.append(df_copy)

# Concatenate all dataframes in the list into a single dataframe and set the index appropriately
combined_df = pd.concat(df_list, ignore_index=True)
combined_df.set_index(['Season', 'Player'], inplace=True)

In [194]:
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,Pos,G,MP,FTr,USG%,% of FGA by Distance - 2P,% of FGA by Distance - 0-3,% of FGA by Distance - 3-10,% of FGA by Distance - 10-16,...,FG,FGA,ORB,DRB,AST,STL,BLK,TOV,PF,Shoot
Season,Player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2001,A.C. Green,MIA,PF,82,1411,.343,14.4,.981,.278,.102,.164,...,1.8,4.0,1.3,2.5,0.5,0.4,0.1,0.5,1.5,44
2001,A.J. Guyton,CHI,PG,33,630,.094,16.5,.641,.078,.042,.094,...,2.4,5.8,0.3,0.8,1.9,0.3,0.2,0.7,1.1,5
2001,Aaron McKie,PHI,SG,76,2394,.272,18.9,.762,.232,.137,.183,...,4.4,9.4,0.4,3.7,5.0,1.4,0.1,2.7,2.3,73
2001,Aaron Williams,NJN,PF,82,2336,.477,17.8,.997,.502,.262,.131,...,3.6,7.9,2.6,4.6,1.1,0.7,1.4,1.6,3.9,131
2001,Adam Keefe,GSW,PF,67,836,.396,11.5,.981,.377,.176,.170,...,1.0,2.4,1.3,1.8,0.5,0.4,0.3,0.6,1.5,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024,Zach LaVine,CHI,SG,25,872,.274,23.8,.551,.237,.136,.053,...,6.8,15.0,0.3,4.8,3.9,0.8,0.3,2.1,2.3,
2024,Zavier Simpson,MEM,PG,7,161,.074,17.5,.685,.167,.333,.093,...,2.4,7.7,0.6,2.3,3.6,1.0,0.4,1.4,1.6,
2024,Zeke Nnaji,DEN,PF,58,576,.416,15.4,.846,.597,.215,.027,...,1.2,2.6,1.1,1.1,0.6,0.3,0.7,0.5,1.4,
2024,Ziaire Williams,MEM,SF,51,1038,.198,19.7,.500,.220,.127,.085,...,2.9,7.4,0.7,2.8,1.5,0.7,0.2,1.3,1.7,


In [195]:
combined_df.reset_index()[(combined_df.reset_index()['Player'] == 'Spencer Dinwiddie') & (combined_df.reset_index()['Season'] == '2024')]

Unnamed: 0,Season,Player,Team,Pos,G,MP,FTr,USG%,% of FGA by Distance - 2P,% of FGA by Distance - 0-3,...,FG,FGA,ORB,DRB,AST,STL,BLK,TOV,PF,Shoot
12079,2024,Spencer Dinwiddie,BRK,PG,48,1474,0.321,18.6,0.443,0.14,...,4.1,10.4,0.5,2.8,6.0,0.8,0.2,1.3,1.8,
12080,2024,Spencer Dinwiddie,LAL,PG,28,678,0.355,12.2,0.362,0.078,...,2.0,5.0,0.1,1.5,2.4,0.5,0.5,1.0,1.9,


In [196]:
combined_df.to_csv(r"C:\Users\vaugh\Desktop\basketball-pf-research\Basketball-reference data\df_with_all_positions_and_teams(2001-2024).csv")

In [41]:
# Filter for only PFs
combined_df = combined_df[combined_df['Pos'] == 'PF']
combined_df.drop(columns = ['Pos'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df.drop(columns = ['Pos'], inplace=True)


In [44]:
combined_df = combined_df.apply(pd.to_numeric, errors='coerce')

# Changing all the values in the df to be floats
combined_df = combined_df.astype(float)

combined_df.fillna(0, inplace=True)  # Filling NaNs with 0

# Replacing any infinite values with 0
combined_df.replace([np.inf, -np.inf], 0, inplace=True)

per_48_stats = ['FGA', 'PF', 'BLK', 'STL', 'DRB', 'ORB', 'TOV', 'AST']

for stat in per_48_stats:
    combined_df[stat] = combined_df[stat] / (combined_df['MP'] / combined_df['G']) * 48

In [46]:
# Cleaning up the dataframe
combined_df.rename(columns = {'PF': 'Personal Fouls Committed'}, inplace=True)
combined_df.rename(columns = {'Fouls Drawn - Shoot': 'Shooting Fouls Drawn'}, inplace=True)
combined_df.drop(columns = ['% of FGA by Distance - 2P', 'FG'], inplace=True)

In [49]:
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,G,MP,FTr,USG%,% of FGA by Distance - 0-3,% of FGA by Distance - 3-10,% of FGA by Distance - 10-16,% of FGA by Distance - 16-3P,% of FGA by Distance - 3P,% of FG Ast'd - 2P,...,STL,BLK,TOV,Personal Fouls Committed,Position Estimate - PG%,Position Estimate - SG%,Position Estimate - SF%,Position Estimate - PF%,Position Estimate - C%,Shoot
Season,Player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2001,A.C. Green,82.0,1411.0,0.343,14.4,0.278,0.102,0.164,0.438,0.019,0.764,...,3.112549,0.778137,3.890686,11.672057,0.0,0.0,0.0,47.0,53.0,44.0
2001,Aaron Williams,82.0,2336.0,0.477,17.8,0.502,0.262,0.131,0.103,0.003,0.623,...,1.987296,3.974592,4.542391,11.072077,0.0,0.0,6.0,68.0,26.0,131.0
2001,Adam Keefe,67.0,836.0,0.396,11.5,0.377,0.176,0.170,0.258,0.019,0.683,...,5.919425,4.439569,8.879137,22.197843,0.0,0.0,17.0,70.0,13.0,25.0
2001,Al Harrington,78.0,1892.0,0.289,18.8,0.333,0.250,0.193,0.210,0.013,0.675,...,3.132704,0.783176,7.440173,11.356053,0.0,1.0,70.0,28.0,1.0,65.0
2001,Alan Henderson,73.0,1810.0,0.404,22.6,0.434,0.304,0.210,0.051,0.001,0.537,...,2.623428,1.499101,6.371181,8.245058,0.0,0.0,0.0,79.0,21.0,124.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024,Usman Garuba,6.0,18.0,0.333,20.8,0.333,0.500,0.000,0.000,0.167,0.000,...,51.200000,128.000000,76.800000,76.800000,0.0,0.0,0.0,10.0,90.0,0.0
2024,Wenyen Gabriel,5.0,81.0,0.227,17.0,0.545,0.091,0.091,0.000,0.273,0.714,...,3.511660,3.511660,14.046639,17.558299,0.0,8.0,75.0,16.0,0.0,0.0
2024,Xavier Tillman Sr.,54.0,974.0,0.179,14.8,0.337,0.294,0.061,0.018,0.290,0.812,...,6.373744,5.665550,4.249162,9.914712,0.0,0.0,0.0,43.0,57.0,0.0
2024,Zeke Nnaji,58.0,576.0,0.416,15.4,0.597,0.215,0.027,0.007,0.154,0.619,...,7.008333,16.352778,11.680556,32.705556,0.0,0.0,0.0,9.0,91.0,0.0


In [47]:
# Create a copy of the dataframe to apply PCA on
final_df = combined_df

# Changing all the values in the df to be floats so that I can apply PCA
final_df.fillna(0, inplace=True)  # Filling NaNs with 0
final_df = final_df.astype(float)

# Replacing any infinite values with 0
final_df.replace([np.inf, -np.inf], 0, inplace=True)

ordered_cols = ['G', 'MP', 'USG%', 'FGA', 'Personal Fouls Committed', 'BLK', 'STL', 'DRB', 'ORB', 'TOV', 'AST', "% of FG Ast'd - 3P", "% of FG Ast'd - 2P", 'FTr', 'Shooting Fouls Drawn', 'Corner 3s - %3PA', '% of FGA by Distance - 3P', '% of FGA by Distance - 16-3P', '% of FGA by Distance - 10-16', '% of FGA by Distance - 3-10', '% of FGA by Distance - 0-3', 'Dunks - %FGA', ]
final_df = final_df.reindex(columns = ordered_cols)

# Removing any players named "Player"
final_df.reset_index(inplace=True)
final_df = final_df[final_df['Player'] != 'Player']
final_df.set_index(['Season', 'Player'], inplace=True)

In [48]:
final_df

Unnamed: 0_level_0,Unnamed: 1_level_0,G,MP,USG%,FGA,Personal Fouls Committed,BLK,STL,DRB,ORB,TOV,...,% of FG Ast'd - 2P,FTr,Shooting Fouls Drawn,Corner 3s - %3PA,% of FGA by Distance - 3P,% of FGA by Distance - 16-3P,% of FGA by Distance - 10-16,% of FGA by Distance - 3-10,% of FGA by Distance - 0-3,Dunks - %FGA
Season,Player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2001,A.C. Green,82.0,1411.0,14.4,31.125486,11.672057,0.778137,3.112549,19.453429,10.115783,3.890686,...,0.764,0.343,,0.167,0.019,0.438,0.164,0.102,0.278,0.012
2001,Aaron Williams,82.0,2336.0,17.8,22.428054,11.072077,3.974592,1.987296,13.059373,7.381385,4.542391,...,0.623,0.477,,0.000,0.003,0.103,0.131,0.262,0.502,0.129
2001,Adam Keefe,67.0,836.0,11.5,35.516550,22.197843,4.439569,5.919425,26.637412,19.238131,8.879137,...,0.683,0.396,,0.333,0.019,0.258,0.170,0.176,0.377,0.038
2001,Al Harrington,78.0,1892.0,18.8,27.411163,11.356053,0.783176,3.132704,13.313993,5.873821,7.440173,...,0.675,0.289,,0.143,0.013,0.210,0.193,0.250,0.333,0.068
2001,Alan Henderson,73.0,1810.0,22.6,34.479334,8.245058,1.499101,2.623428,11.618037,9.369384,6.371181,...,0.537,0.404,,0.000,0.001,0.051,0.210,0.304,0.434,0.046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024,Usman Garuba,6.0,18.0,20.8,256.000000,76.800000,128.000000,51.200000,179.200000,128.000000,76.800000,...,0.000,0.333,,0.000,0.167,0.000,0.000,0.500,0.333,0.000
2024,Wenyen Gabriel,5.0,81.0,17.0,38.628258,17.558299,3.511660,3.511660,31.604938,12.290809,14.046639,...,0.714,0.227,,0.500,0.273,0.000,0.091,0.091,0.545,0.136
2024,Xavier Tillman Sr.,54.0,974.0,14.8,36.826074,9.914712,5.665550,6.373744,18.413037,9.206519,4.249162,...,0.812,0.179,,0.617,0.290,0.018,0.061,0.294,0.337,0.065
2024,Zeke Nnaji,58.0,576.0,15.4,60.738889,32.705556,16.352778,7.008333,25.697222,25.697222,11.680556,...,0.619,0.416,,0.391,0.154,0.007,0.027,0.215,0.597,0.181
