In [1]:
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# Create a function to scrape the data from a basketball-reference page
def get_bballref_df(url, data_type, max_attempts=5, max_wait_time=60):
    """Load a page in Chrome and return one of its HTML tables as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to open.
    data_type : str
        Value of the ``id`` attribute of the ``<table>`` element to extract.
        For ``'shooting_stats'`` and ``'pbp_stats'`` the two header levels are
        flattened into single ``'<top> - <bottom>'`` column names.
    max_attempts : int, optional
        How many times to try loading the page before giving up.
    max_wait_time : int or float, optional
        Seconds, measured from just before the page is requested, after which
        an attempt is abandoned and the page is requested again.

    Returns
    -------
    pandas.DataFrame
        The first table parsed from the matching element, without any column
        whose name contains ``'Unnamed'``.

    Raises
    ------
    RuntimeError
        If the page could not be loaded within ``max_attempts`` attempts.
    ValueError
        If the loaded page has no ``<table>`` with ``id == data_type``.
    """
    soup = None
    last_error = None

    # Retry with a bounded loop rather than recursion. Only page loading is
    # retried; parsing problems further down are raised to the caller instead
    # of triggering another browser launch.
    for _attempt in range(max_attempts):
        driver = webdriver.Chrome()
        start_time = time.time()
        try:
            driver.get(url)

            # Poll until the document reports it is fully loaded or the time
            # budget for this attempt is used up.
            timed_out = False
            while True:
                if time.time() - start_time > max_wait_time:
                    timed_out = True
                    break
                if driver.execute_script("return document.readyState;") == "complete":
                    break
                time.sleep(1)  # Wait a bit before checking again

            if timed_out:
                print(f"Timeout reached for {url}. Retrying...")
                last_error = TimeoutError(
                    f"page not loaded within {max_wait_time} seconds"
                )
                continue

            soup = BeautifulSoup(driver.page_source, 'html')
            break
        except Exception as e:
            print(f"Error for {url}: {e}")
            last_error = e
        finally:
            # The single place where the browser is closed, so each driver is
            # quit exactly once whatever path left the try block.
            driver.quit()

    if soup is None:
        raise RuntimeError(
            f"Could not load {url} after {max_attempts} attempts"
        ) from last_error

    # Find the table
    table = soup.find('table', {'id': data_type})
    if table is None:
        raise ValueError(f"No table with id '{data_type}' found at {url}")

    # Wrap the markup in a file-like object: passing a literal HTML string to
    # read_html is deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(table)))[0]

    # Additional cleaning only needed for the shooting stats and play-by-play stats
    if data_type in ('shooting_stats', 'pbp_stats'):
        flattened = df.columns.map(' - '.join)
        # Where one header level is an auto-generated "Unnamed" placeholder,
        # keep only the second level of the joined name.
        df.columns = [
            name.split(' - ')[1] if 'Unnamed' in name else name
            for name in flattened
        ]

    # Discard every column that is still an "Unnamed" placeholder
    keep = ['Unnamed' not in column for column in df.columns]
    return df.loc[:, keep]

In [2]:
# Season labels (the strings '2001' through '2019') and the table names to scrape
seasons = [str(year) for year in range(2001, 2020)]
tables = ['advanced', 'shooting', 'play-by-play', 'per_game']

# Create a helper function to sort by TOT if the player was on multiple teams that season
def keep_tot_or_first(group, d_type):
    """Reduce one player's rows to the 'TOT' row(s), or else to the first row.

    The team column is looked up as 'Team' when ``d_type`` is 'per_game' and
    as 'Tm' for every other ``d_type``. The result is always a DataFrame.
    """
    team_col = 'Team' if d_type == 'per_game' else 'Tm'
    is_tot = group[team_col] == 'TOT'
    if is_tot.any():
        # At least one row carries the 'TOT' team label: keep only those
        return group[is_tot]
    # No 'TOT' row, so fall back to the first row of the group
    return group.iloc[[0]]

# Every scraped table ends up here, keyed by season label followed by table name
bballref_dfs = {}

# Columns discarded from each kind of table after it has been scraped
advanced_drops = ['Rk', 'Tm', 'Age', 'TS%', '3PAr', 'Pos', 'PER', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
shooting_drops = ['G', 'MP', 'Dist.', 'Rk', 'Tm', 'Age', 'Pos', 'FG% by Distance - 2P', 'FG% by Distance - 0-3', 'FG% by Distance - 3-10', 'FG% by Distance - 10-16', 'FG% by Distance - 16-3P', 'FG% by Distance - 3P', 'FG%', 'Dunks - #', 'Heaves - Att.', 'Heaves - #', 'Corner 3s - 3P%']
pbp_drops = ['Totals - G', 'Totals - MP', 'Rk', 'Tm', 'Age', 'Pos', 'Position Estimate - PG%', 'Position Estimate - SG%', 'Position Estimate - SF%', 'Position Estimate - PF%', 'Position Estimate - C%', '+/- Per 100 Poss. - OnCourt', '+/- Per 100 Poss. - On-Off', 'Turnovers - BadPass', 'Turnovers - LostBall', 'Fouls Committed - Shoot', 'Fouls Committed - Off.', 'Fouls Drawn - Off.', 'Misc. - PGA', 'Misc. - And1', 'Misc. - Blkd']
per_game_drops = ['G', 'MP', 'Rk', 'Team', 'Age', 'Pos', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'TRB', 'PTS', 'Awards']
drops_by_table = {
    'advanced': advanced_drops,
    'shooting': shooting_drops,
    'play-by-play': pbp_drops,
}

# Scrape every (table, season) combination and keep one trimmed row set per player
for data in tables:
    # The play-by-play page uses a table id that does not follow the '<name>_stats' pattern
    table_id = 'pbp_stats' if data == 'play-by-play' else data + '_stats'
    # Any table name without its own entry gets the per-game column list
    unwanted = drops_by_table.get(data, per_game_drops)
    for season in seasons:
        url = f'https://www.basketball-reference.com/leagues/NBA_{season}_{data}.html'
        df = get_bballref_df(url, table_id)
        # Strip a trailing asterisk from player names
        df['Player'] = df['Player'].str.replace(r'\*$', '', regex=True)
        # Collapse each player's rows via keep_tot_or_first
        df = df.groupby('Player', group_keys=False).apply(lambda group: keep_tot_or_first(group, data))
        df = df.drop(columns=unwanted)
        bballref_dfs[season + data] = df

Timeout reached for https://www.basketball-reference.com/leagues/NBA_2015_advanced.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2015_advanced.html. Retrying...
Timeout reached for https://www.basketball-reference.com/leagues/NBA_2006_per_game.html. Retrying...


In [3]:
# Display the table stored under the key '2003shooting' (season label + table name)
bballref_dfs['2003shooting']

Unnamed: 0,Player,% of FGA by Distance - 2P,% of FGA by Distance - 0-3,% of FGA by Distance - 3-10,% of FGA by Distance - 10-16,% of FGA by Distance - 16-3P,% of FGA by Distance - 3P,% of FG Ast'd - 2P,% of FG Ast'd - 3P,Dunks - %FGA,Corner 3s - %3PA
173,A.J. Guyton,.750,.000,.250,.000,.500,.250,,,.000,.000
294,Aaron McKie,.832,.314,.105,.183,.230,.168,.590,1.000,.000,.580
480,Aaron Williams,.998,.483,.273,.134,.107,.002,.623,,.089,.000
181,Adam Harrington,.622,.216,.054,.000,.351,.378,.833,1.000,.027,.143
147,Adonal Foyle,.997,.455,.383,.116,.043,.003,.605,,.128,.000
...,...,...,...,...,...,...,...,...,...,...,...
373,Zach Randolph,.990,.507,.266,.118,.099,.010,.606,,.043,.200
177,Zendon Hamilton,1.000,1.000,.000,.000,.000,.000,.000,,.000,
214,Zydrunas Ilgauskas,.996,.285,.277,.289,.143,.004,.600,,.054,.200
444,Óscar Torres,.639,.194,.111,.139,.194,.361,.444,.857,.000,.385


In [4]:
# Write each stored DataFrame to '<key>.csv'
for table_key, table_df in bballref_dfs.items():
    table_df.to_csv(table_key + '.csv')