In [14]:
import re
import time
from io import StringIO

import html5lib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Create a function to scrape the data from a basketball-reference page
def get_height_df(url):
    # Create a list to hold all of the dataframes
    list_dfs = []
    
    # Initialize the selenium driver
    driver = webdriver.Chrome()
    
    start_time = time.time()  # Record start time
    max_wait_time = 60  # Maximum wait time in seconds
    
    try:
        # Open the specified page
        driver.get(url)
        
        # Continuously check the time until the page is fully loaded
        while True:
            elapsed_time = time.time() - start_time
            if elapsed_time > max_wait_time:
                print(f"Timeout reached for {url}. Retrying...")
                driver.quit()
                return get_bballref_df(url)
            
            # Check if page is loaded
            if driver.execute_script("return document.readyState;") == "complete":
                break
            
            time.sleep(1)  # Wait a bit before checking again
        
        # Retrieve the number of pages needed to go through
        num_pages_text = driver.find_element(By.XPATH, '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[4]')
        num_pages = int(num_pages_text.text[-2:])
        
        for i in range(num_pages):
            # Parse with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html')
        
            # Find the table
            table = soup.find('table', {'class': 'Crom_table__p1iZz'})
    
            # Parse the table data
            df = pd.read_html(str(table))[0]  # Converts HTML table to DataFrame
        
            # Add the df to the list
            list_dfs.append(df)
        
            # Click the "Next" button to go to the next page
            next_button = driver.find_element(By.XPATH, '//*[@id="__next"]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[5]/button[2]')
            driver.execute_script("arguments[0].click();", next_button)
    
    except Exception as e:
        print(f"Error for {url}: {e}")
        driver.quit()
        return get_bballref_df(url)

    finally:
        driver.quit()
    
    heights_df = pd.concat(list_dfs, ignore_index=True)
    
    return heights_df.drop(columns=[column for column in heights_df.columns if column not in ['Player', 'Team', 'Height']])

In [15]:
# Season labels in 'YYYY-YY' form, from '2000-01' through '2023-24'.
seasons = [f"{start_year}-{(start_year + 1) % 100:02d}" for start_year in range(2000, 2024)]

In [17]:
# Scrape one table per season, keyed by the season label.
# Each call opens its own Chrome session, so this cell is slow to run.
height_dfs = {}
for season in seasons:
    season_url = f'https://www.nba.com/stats/players/bio?Season={season}'
    height_dfs[season] = get_height_df(season_url)

In [18]:
# Tag every per-season frame (in place) with the calendar year in which the
# season ends, e.g. '2000-01' -> 2001, then stack all seasons into one frame.
for season_key, df in height_dfs.items():
    end_suffix = season_key.split("-")[1]  # two-digit second year of the label
    year = 2000 + int(end_suffix)
    df["Season"] = year

season_frames = height_dfs.values()
combined_df = pd.concat(season_frames, ignore_index=True)

In [19]:
# Preview the combined Player / Team / Height / Season table.
combined_df

Unnamed: 0,Player,Team,Height,Season
0,A.C. Green,MIA,6-9,2001
1,A.J. Guyton,CHI,6-1,2001
2,Aaron McKie,PHI,6-5,2001
3,Aaron Williams,NJN,6-9,2001
4,Adam Keefe,GSW,6-9,2001
...,...,...,...,...
11654,Zach LaVine,CHI,6-5,2024
11655,Zavier Simpson,MEM,6-0,2024
11656,Zeke Nnaji,DEN,6-9,2024
11657,Ziaire Williams,MEM,6-9,2024


In [20]:
# Spot-check one season: 2013 is the label assigned to the '2012-13' season.
combined_df.loc[combined_df['Season'].eq(2013)]

Unnamed: 0,Player,Team,Height,Season
5399,AJ Price,WAS,6-2,2013
5400,Aaron Brooks,HOU,6-0,2013
5401,Aaron Gray,TOR,7-0,2013
5402,Al Harrington,ORL,6-9,2013
5403,Al Horford,ATL,6-10,2013
...,...,...,...,...
5863,Willie Green,LAC,6-3,2013
5864,Wilson Chandler,DEN,6-8,2013
5865,Xavier Henry,NOH,6-6,2013
5866,Zach Randolph,MEM,6-9,2013


In [21]:
# Persist the combined table as CSV (the DataFrame index is written too).
# NOTE(review): the destination is an absolute, machine-specific Windows path;
# adjust it before running this cell on another machine.
heights_csv_path = r"C:\Users\vaugh\Desktop\basketball-pf-research\Basketball-reference data\heights_df(2001-2024).csv"
combined_df.to_csv(heights_csv_path)