In [42]:
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
import html5lib
from fake_useragent import UserAgent

# Maps a franchise name to the three-letter team code used in the scraped
# 'Team' column. Relocated/renamed franchises appear once per identity
# (e.g. Seattle SuperSonics -> Oklahoma City Thunder).
#
# Charlotte has three distinct codes on Basketball-Reference:
#   CHH - original Charlotte Hornets
#   CHA - Charlotte Bobcats
#   CHO - current Charlotte Hornets (keyed here as 'New Charlotte Hornets')
# The Bobcats / new Hornets codes were previously swapped.
nba_teams = {
    'Miami Heat': 'MIA',
    'Chicago Bulls': 'CHI',
    'Philadelphia 76ers': 'PHI',
    'New Jersey Nets': 'NJN',
    'Golden State Warriors': 'GSW',
    'Boston Celtics': 'BOS',
    'Indiana Pacers': 'IND',
    'Atlanta Hawks': 'ATL',
    'New York Knicks': 'NYK',
    'Toronto Raptors': 'TOR',
    'Cleveland Cavaliers': 'CLE',
    'Orlando Magic': 'ORL',
    'Phoenix Suns': 'PHO',
    'Denver Nuggets': 'DEN',
    'Houston Rockets': 'HOU',
    'Minnesota Timberwolves': 'MIN',
    'San Antonio Spurs': 'SAS',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'Charlotte Hornets': 'CHH',
    'Detroit Pistons': 'DET',
    'Dallas Mavericks': 'DAL',
    'Seattle SuperSonics': 'SEA',
    'Vancouver Grizzlies': 'VAN',
    'Los Angeles Lakers': 'LAL',
    'Los Angeles Clippers': 'LAC',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
    'Milwaukee Bucks': 'MIL',
    'Memphis Grizzlies': 'MEM',
    'New Orleans Hornets': 'NOH',
    'New Orleans/Oklahoma City Hornets': 'NOK',
    'Oklahoma City Thunder': 'OKC',
    'Brooklyn Nets': 'BRK',
    'New Orleans Pelicans': 'NOP',
    'Charlotte Bobcats': 'CHA',
    'New Charlotte Hornets': 'CHO'
}

def page_not_exists(url, timeout=30):
    """Return True when a GET request to ``url`` answers with HTTP 404.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    bool
        True only for a 404 status; any other status code (including other
        error codes) yields False.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    return response.status_code == 404

def get_bballref_df(url, data_type, season, retry_count = 0, max_retries = 3, delay = 3.68, timeout = 30):
    """Download one stats page and return the requested HTML table as a DataFrame.

    Parameters
    ----------
    url : str
        Page to download.
    data_type : str
        HTML ``id`` of the wanted ``<table>``. For 'advanced' and 'shooting'
        the table is searched on the whole page; for any other value it is
        searched inside the ``<div id="switcher_<data_type>">`` container.
    season : str
        Only used in the progress messages.
    retry_count : int, optional
        Number of retries already consumed (kept for backward compatibility
        with the previous recursive implementation).
    max_retries : int, optional
        Maximum number of additional downloads when the table is missing.
    delay : float, optional
        Seconds to sleep before every request.
        NOTE(review): the 3.68 s default is presumably there to stay under
        the site's rate limit - confirm before lowering it.
    timeout : float, optional
        Seconds to wait for the server; prevents a stalled connection from
        hanging the whole scrape.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table with every 'Unnamed' column removed, or None when
        the container div is missing, the table is still missing after all
        retries, or pandas cannot parse it. Callers must handle None.
    """
    # Local import so the function does not depend on a notebook-level import.
    from io import StringIO

    attempt = retry_count
    while True:
        # Throttle every request, including retries.
        time.sleep(delay)

        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        if data_type in ('advanced', 'shooting'):
            table = soup.find('table', {'id': data_type})
        else:
            # The table lives inside a "switcher_<id>" container div.
            div_id = "switcher_" + data_type
            table_div = soup.find('div', {'id': div_id})
            if table_div is None:
                print(f"Could not find table div with id {div_id}.")
                return None
            table = table_div.find('table', {'id': data_type})

        if table is not None:
            break

        if attempt >= max_retries:
            print(f"Max retries reached for {data_type} in {season}. Moving to the next data type.")
            return None

        print(f"No '{data_type}' table found for {season}. Retrying...")
        attempt += 1

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    try:
        df = pd.read_html(StringIO(str(table)))[0]
    except ValueError:
        print(f"Error: Could not parse the table for {data_type} in {season}.")
        return None

    # The shooting and play-by-play tables have a two-row header. Flatten it to
    # "<group> - <column>", and for ungrouped columns (whose group label is an
    # auto-generated "Unnamed: ..." placeholder) keep only the second part.
    if data_type in ('shooting', 'pbp_stats') and isinstance(df.columns, pd.MultiIndex):
        flattened = [' - '.join(map(str, levels)) for levels in df.columns]
        df.columns = [
            name.split(' - ')[1] if 'Unnamed' in name else name
            for name in flattened
        ]

    # Whatever is still called "Unnamed..." is an empty spacer column.
    unnamed_columns = list(dict.fromkeys(
        column for column in df.columns
        if isinstance(column, str) and 'Unnamed' in column
    ))
    if unnamed_columns:
        df = df.drop(columns=unnamed_columns)

    print(f"Found table for {data_type} in {season}.")

    return df

In [43]:
# Seasons to scrape, as the four-digit strings used in the page URLs
# ('2001' through '2024' inclusive).
seasons = [str(year) for year in range(2001, 2025)]

# Stats pages to pull for every season (URL suffixes).
tables = [
    'advanced',
    'shooting',
    'play-by-play',
    'per_game',
]

def drop_tot_rows(group):
    """Keep only the per-team rows of a stats frame.

    Removes rows whose 'Team' value is missing, is the literal 'Team'
    (a header row repeated inside the table body), or ends with 'TM'
    (presumably the combined multi-team total rows such as '2TM' -
    confirm against the scraped data).

    The input frame is not modified: ``groupby.apply`` callbacks must not
    mutate the group they receive.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame (or groupby group) containing a 'Team' column.

    Returns
    -------
    pandas.DataFrame
        New frame with the surviving rows; 'Team' is returned as strings.
    """
    team = group['Team']
    team_str = team.astype(str)

    # notna() is checked explicitly because whether a missing value turns into
    # the text 'nan' under astype(str) depends on the column dtype.
    keep = (
        team.notna()
        & ~team_str.isin(['nan', 'Team'])
        & ~team_str.str.endswith('TM', na=False)
    )
    return group.loc[keep].assign(Team=team_str[keep])

# Scraped tables, keyed by season + page name (e.g. '2024shooting').
bballref_dfs = {}

# (season, page) pairs for which no table could be scraped.
failed_scrapes = []

# HTML table id to request for each page; pages not listed here fall back to
# "<page>_stats".
TABLE_IDS = {
    'advanced': 'advanced',
    'shooting': 'shooting',
    'play-by-play': 'pbp_stats',
}

# Columns discarded from each page's table; pages not listed here use the
# per-game list.
PER_GAME_DROP_COLUMNS = ['G', 'GS', 'MP', 'Rk', 'Age', 'Pos', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'TRB', 'PTS', 'Awards']
DROP_COLUMNS = {
    'advanced': ['Rk', 'GS', 'Age', 'TS%', '3PAr', 'PER', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
    'shooting': ['G', 'GS', 'MP', 'Dist.', 'Rk', 'Age', 'Pos',  'FG% by Distance - 2P', 'FG% by Distance - 0-3', 'FG% by Distance - 3-10', 'FG% by Distance - 10-16', 'FG% by Distance - 16-3P', 'FG% by Distance - 3P', 'FG%', 'Dunks - #', 'Heaves - Att.', 'Heaves - Md.', 'Corner 3s - 3P%'],
    'play-by-play': ['G', 'GS', 'MP', 'Rk', 'Age', 'Pos', '+/- Per 100 Poss - OnCourt', '+/- Per 100 Poss - On-Off', 'Turnovers - BadPass', 'Turnovers - LostBall', 'Fouls Committed - Shoot', 'Fouls Committed - Off.', 'Misc. - PGA', 'Misc. - And1', 'Misc. - Blkd', 'Awards'],
}

# Go through and extract the data from every season for the 4 desired categories
for data in tables:
    for season in seasons:
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_{data}.html'
        df = get_bballref_df(url, TABLE_IDS.get(data, data + '_stats'), season)

        # get_bballref_df returns None when the table cannot be found or
        # parsed; skip it instead of crashing and losing the rest of the run.
        if df is None:
            print(f"Skipping {data} for {season}: no table could be scraped.")
            failed_scrapes.append((season, data))
            continue

        # Strip the trailing '*' marker that follows some player names.
        df['Player'] = df['Player'].str.replace(r'\*$', '', regex=True)

        # Drop header/total rows player by player. As a side effect the rows
        # come back ordered by player name.
        # NOTE(review): recent pandas releases deprecate groupby.apply
        # operating on the grouping column - confirm 'Player' is still present
        # in the result on the pandas version in use.
        df = df.groupby('Player', group_keys=False).apply(drop_tot_rows)

        df = df.drop(columns=DROP_COLUMNS.get(data, PER_GAME_DROP_COLUMNS))
        bballref_dfs[season + data] = df

Found table for advanced in 2001.
Found table for advanced in 2002.
Found table for advanced in 2003.
Found table for advanced in 2004.
Found table for advanced in 2005.
Found table for advanced in 2006.
Found table for advanced in 2007.
Found table for advanced in 2008.
Found table for advanced in 2009.
Found table for advanced in 2010.
Found table for advanced in 2011.
Found table for advanced in 2012.
Found table for advanced in 2013.
Found table for advanced in 2014.
Found table for advanced in 2015.
Found table for advanced in 2016.
Found table for advanced in 2017.
Found table for advanced in 2018.
Found table for advanced in 2019.
Found table for advanced in 2020.
Found table for advanced in 2021.
Found table for advanced in 2022.
Found table for advanced in 2023.
Found table for advanced in 2024.
Found table for shooting in 2001.
Found table for shooting in 2002.
Found table for shooting in 2003.
Found table for shooting in 2004.
Found table for shooting in 2005.
Found table fo

In [45]:
# Spot-check one scraped table: the cleaned 2024 shooting stats.
bballref_dfs['2024shooting']

Unnamed: 0,Player,Team,% of FGA by Distance - 2P,% of FGA by Distance - 0-3,% of FGA by Distance - 3-10,% of FGA by Distance - 10-16,% of FGA by Distance - 16-3P,% of FGA by Distance - 3P,% of FG Ast'd - 2P,% of FG Ast'd - 3P,Dunks - %FGA,Corner 3s - %3PA,Awards
428,A.J. Green,MIL,0.138,0.026,0.020,0.031,0.061,0.862,0.857,0.942,0.000,0.225,
539,A.J. Lawson,DAL,0.587,0.364,0.198,0.008,0.017,0.413,0.610,1.000,0.107,0.640,
586,AJ Griffin,ATL,0.371,0.048,0.194,0.065,0.065,0.629,0.750,0.800,0.016,0.205,
58,Aaron Gordon,DEN,0.807,0.554,0.183,0.053,0.017,0.193,0.642,0.850,0.256,0.391,
260,Aaron Holiday,HOU,0.480,0.122,0.170,0.139,0.048,0.520,0.255,0.810,0.005,0.212,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,Zach LaVine,CHI,0.551,0.237,0.136,0.053,0.125,0.449,0.396,0.525,0.040,0.183,
589,Zavier Simpson,MEM,0.685,0.167,0.333,0.093,0.093,0.315,0.250,0.800,0.019,0.118,
446,Zeke Nnaji,DEN,0.846,0.597,0.215,0.027,0.007,0.154,0.619,1.000,0.181,0.391,
313,Ziaire Williams,MEM,0.500,0.220,0.127,0.085,0.069,0.500,0.641,0.897,0.095,0.370,


In [46]:
# Regroup the scraped tables by season: for every season, collect each entry of
# bballref_dfs whose key contains that season string.
separated_dfs_by_season = {}
for season in seasons:
    separated_dfs_by_season[season] = {
        table_key: table_df
        for table_key, table_df in bballref_dfs.items()
        if season in table_key
    }

In [47]:
# Inspect every table collected for the 2001 season.
separated_dfs_by_season['2001']

{'2001advanced':                  Player Team Pos     G      MP    FTr  USG%  Awards
 217          A.C. Green  MIA  PF  82.0  1411.0  0.343  14.4     NaN
 356         A.J. Guyton  CHI  PG  33.0   630.0  0.094  16.5     NaN
 84          Aaron McKie  PHI  SG  76.0  2394.0  0.272  18.9  6MOY-1
 94       Aaron Williams  NJN  PF  82.0  2336.0  0.477  17.8     NaN
 320          Adam Keefe  GSW  PF  67.0   836.0  0.396  11.5     NaN
 ..                  ...  ...  ..   ...     ...    ...   ...     ...
 499         Will Perdue  POR   C  13.0    58.0  0.444   8.6     NaN
 401       William Avery  MIN  PG  55.0   463.0  0.250  20.0     NaN
 521     Zendon Hamilton  LAC   C   3.0    19.0  0.889  35.3     NaN
 358  Zydrunas Ilgauskas  CLE   C  24.0   616.0  0.333  24.1     NaN
 329           Žan Tabak  IND   C  55.0   777.0  0.253  15.8     NaN
 
 [490 rows x 8 columns],
 '2001shooting':                  Player Team  % of FGA by Distance - 2P  \
 217          A.C. Green  MIA                      0.

In [48]:
# One merged frame per season: start from the advanced table and inner-join the
# remaining tables onto it, one after another, on (Player, Team).
dfs_by_season = {}

merge_order = ['advanced', 'shooting', 'per_game', 'play-by-play']

for season, season_tables in separated_dfs_by_season.items():
    merged_df = season_tables[season + merge_order[0]]
    for table_name in merge_order[1:]:
        merged_df = pd.merge(
            merged_df,
            season_tables[season + table_name],
            on=['Player', 'Team'],
            how='inner',
        )
    dfs_by_season[season] = merged_df

In [50]:
# Remove, in place, every column whose name contains one of these fragments
# ('_x' / '_y' are the suffixes pandas adds to clashing columns when merging).
unwanted_fragments = ('GS', 'Awards', 'Foul', 'Position Estimate', '_x', '_y')

for season_df in dfs_by_season.values():
    cols_to_drop = [
        col for col in season_df.columns
        if any(fragment in col for fragment in unwanted_fragments)
    ]
    season_df.drop(columns=cols_to_drop, inplace=True)

In [59]:
# Inspect the merged frame for the 2011 season.
dfs_by_season['2011']

Unnamed: 0,Player,Team,Pos,G,MP,FTr,USG%,% of FGA by Distance - 2P,% of FGA by Distance - 0-3,% of FGA by Distance - 3-10,...,Corner 3s - %3PA,FG,FGA,ORB,DRB,AST,STL,BLK,TOV,PF
0,A.J. Price,IND,PG,50.0,795.0,0.253,22.7,0.534,0.069,0.128,...,0.040,2.3,6.4,0.3,1.1,2.2,0.6,0.0,1.1,1.2
1,Aaron Brooks,HOU,PG,34.0,811.0,0.214,26.1,0.563,0.199,0.137,...,0.183,3.9,11.4,0.3,1.2,3.8,0.6,0.1,1.6,2.0
2,Aaron Brooks,PHO,PG,25.0,473.0,0.285,25.7,0.665,0.205,0.130,...,0.164,3.4,8.0,0.4,0.7,4.2,0.5,0.0,1.8,1.8
3,Aaron Gray,NOH,C,41.0,531.0,0.343,13.1,1.000,0.657,0.293,...,,1.4,2.4,1.4,2.7,0.4,0.3,0.3,0.8,2.3
4,Acie Law,MEM,PG,11.0,94.0,0.526,16.0,0.684,0.158,0.211,...,0.667,0.3,1.7,0.3,0.7,1.3,0.4,0.0,0.9,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,Zabian Dowdell,PHO,PG,24.0,292.0,0.136,25.0,0.920,0.264,0.096,...,0.300,2.1,5.2,0.2,0.6,2.1,0.8,0.1,1.2,1.3
538,Zach Randolph,MEM,PF,75.0,2724.0,0.333,25.0,0.964,0.394,0.329,...,0.116,8.0,15.8,4.3,7.8,2.2,0.8,0.3,2.0,2.3
539,Zaza Pachulia,ATL,C,79.0,1244.0,0.772,14.5,1.000,0.591,0.220,...,,1.4,2.9,1.5,2.7,0.7,0.4,0.3,0.9,2.3
540,Zydrunas Ilgauskas,MIA,C,72.0,1145.0,0.144,16.0,0.997,0.310,0.103,...,0.000,2.3,4.4,1.5,2.5,0.4,0.3,0.8,0.7,2.6


In [60]:
# Tag a copy of every season's frame with its season label; the frames stored
# in dfs_by_season are left unchanged.
df_list = [
    season_df.assign(Season=season_label)
    for season_label, season_df in dfs_by_season.items()
]

# Stack all seasons into one frame and index it by (Season, Player).
combined_df = pd.concat(df_list, ignore_index=True).set_index(['Season', 'Player'])

In [65]:
# Display the combined frame, indexed by (Season, Player).
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Team,Pos,G,MP,FTr,USG%,% of FGA by Distance - 2P,% of FGA by Distance - 0-3,% of FGA by Distance - 3-10,% of FGA by Distance - 10-16,...,FG,FGA,ORB,DRB,AST,STL,BLK,TOV,PF,Shoot
Season,Player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2001,A.C. Green,MIA,PF,82.0,1411.0,0.343,14.4,0.981,0.278,0.102,0.164,...,1.8,4.0,1.3,2.5,0.5,0.4,0.1,0.5,1.5,44.0
2001,A.J. Guyton,CHI,PG,33.0,630.0,0.094,16.5,0.641,0.078,0.042,0.094,...,2.4,5.8,0.3,0.8,1.9,0.3,0.2,0.7,1.1,5.0
2001,Aaron McKie,PHI,SG,76.0,2394.0,0.272,18.9,0.762,0.232,0.137,0.183,...,4.4,9.4,0.4,3.7,5.0,1.4,0.1,2.7,2.3,73.0
2001,Aaron Williams,NJN,PF,82.0,2336.0,0.477,17.8,0.997,0.502,0.262,0.131,...,3.6,7.9,2.6,4.6,1.1,0.7,1.4,1.6,3.9,131.0
2001,Adam Keefe,GSW,PF,67.0,836.0,0.396,11.5,0.981,0.377,0.176,0.170,...,1.0,2.4,1.3,1.8,0.5,0.4,0.3,0.6,1.5,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024,Zach LaVine,CHI,SG,25.0,872.0,0.274,23.8,0.551,0.237,0.136,0.053,...,6.8,15.0,0.3,4.8,3.9,0.8,0.3,2.1,2.3,
2024,Zavier Simpson,MEM,PG,7.0,161.0,0.074,17.5,0.685,0.167,0.333,0.093,...,2.4,7.7,0.6,2.3,3.6,1.0,0.4,1.4,1.6,
2024,Zeke Nnaji,DEN,PF,58.0,576.0,0.416,15.4,0.846,0.597,0.215,0.027,...,1.2,2.6,1.1,1.1,0.6,0.3,0.7,0.5,1.4,
2024,Ziaire Williams,MEM,SF,51.0,1038.0,0.198,19.7,0.500,0.220,0.127,0.085,...,2.9,7.4,0.7,2.8,1.5,0.7,0.2,1.3,1.7,


In [67]:
# Write the combined frame to disk as CSV.
# NOTE(review): this is an absolute, machine-specific path - consider a
# configurable output directory so the notebook runs elsewhere.
output_csv_path = r"C:\Users\vaugh\Desktop\basketball-pf-research\Basketball-reference data\df_with_all_positions_and_teams(2001-2024).csv"
combined_df.to_csv(output_csv_path)