In [5]:
import time
from io import StringIO

import html5lib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver

# Franchise name -> Basketball-Reference team code. Only the codes (the dict
# values) are used when building team-page URLs; a code is only valid for the
# seasons in which that franchise name existed, so some season/code pairs 404.
nba_teams = {
    'Miami Heat': 'MIA',
    'Chicago Bulls': 'CHI',
    'Philadelphia 76ers': 'PHI',
    'New Jersey Nets': 'NJN',
    'Golden State Warriors': 'GSW',
    'Boston Celtics': 'BOS',
    'Indiana Pacers': 'IND',
    'Atlanta Hawks': 'ATL',
    'New York Knicks': 'NYK',
    'Toronto Raptors': 'TOR',
    'Cleveland Cavaliers': 'CLE',
    'Orlando Magic': 'ORL',
    'Phoenix Suns': 'PHO',
    'Denver Nuggets': 'DEN',
    'Houston Rockets': 'HOU',
    'Minnesota Timberwolves': 'MIN',
    'San Antonio Spurs': 'SAS',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'Charlotte Hornets': 'CHH',
    'Detroit Pistons': 'DET',
    'Dallas Mavericks': 'DAL',
    'Seattle SuperSonics': 'SEA',
    'Vancouver Grizzlies': 'VAN',
    'Los Angeles Lakers': 'LAL',
    'Los Angeles Clippers': 'LAC',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
    'Milwaukee Bucks': 'MIL',
    'Memphis Grizzlies': 'MEM',
    'New Orleans Hornets': 'NOH',
    'New Orleans/Oklahoma City Hornets': 'NOK',
    'Oklahoma City Thunder': 'OKC',
    'Brooklyn Nets': 'BRK',
    'New Orleans Pelicans': 'NOP',
    # The two Charlotte labels were previously swapped: the scrape log shows
    # CHA pages exist for 2007/2014 (Bobcats era) and CHO pages for 2021
    # (current Hornets). The value order (CHO, then CHA) is kept unchanged so
    # the scraping order stays the same.
    'New Charlotte Hornets': 'CHO',
    'Charlotte Bobcats': 'CHA'
}

# Function to check if the webpage exists
def page_not_exists(url, timeout=30):
    """Return True when a GET request for ``url`` is answered with HTTP 404.

    Any other status code (including other error codes) yields False, so a
    False result does not guarantee the page content is usable.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` on connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    return response.status_code == 404

def get_bballref_records_df(url, team, season, retry_count = 0, max_retries = 3, delay = 3.68, timeout = 30):
    """Fetch a team page and return a trimmed copy of its per-game stats table.

    Every attempt waits ``delay`` seconds, downloads ``url`` and searches the
    HTML for ``<table id="per_game_stats">``. When the table is missing the
    download is retried until ``retry_count`` reaches ``max_retries``.

    Parameters
    ----------
    url : str
        Page to download.
    team, season : str
        Only used in the progress messages printed by this function.
    retry_count : int, optional
        Number of retries already consumed (kept for backward compatibility
        with the earlier recursive implementation).
    max_retries : int, optional
        Retries allowed after the first attempt.
    delay : float, optional
        Seconds to sleep before each request.
    timeout : float, optional
        Seconds to wait for the server; prevents an attempt from hanging
        forever on a stalled connection.

    Returns
    -------
    pandas.DataFrame or None
        The table restricted to whichever of the columns ``Player``, ``Pos``,
        ``G`` and ``MP`` it contains, or ``None`` when the table was never
        found or could not be parsed.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` on connection failures or timeouts.
    """
    wanted_columns = ['Player', 'Pos', 'G', 'MP']

    # Retry with a loop instead of recursion; the number of requests and the
    # pause before each one are the same as before.
    while True:
        time.sleep(delay)

        response = requests.get(url, timeout=timeout)

        # Parse the page and look the table up by its id.
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'per_game_stats'})
        if table is not None:
            break

        if retry_count >= max_retries:
            print(f"Max retries reached for {team} in {season}. Moving to the next team.")
            return None

        # The message names the table that is actually searched for and the
        # HTTP status, which helps tell a missing table from an error response.
        print(f"No 'per_game_stats' table found for {team} in {season} (HTTP {response.status_code}). Retrying...")
        retry_count += 1

    # Read the table into a DataFrame
    try:
        # read_html expects a file-like object; passing literal HTML text is
        # deprecated in recent pandas releases.
        df = pd.read_html(StringIO(str(table)))[0]
    except ValueError:
        print(f"Error: Could not parse the table for {team} in {season}.")
        return None

    # Keep only the columns of interest (those absent from the table are ignored).
    df = df.drop(columns=[column for column in df.columns if column not in wanted_columns])

    print(f"Found table for {team} in {season}.")

    return df

In [4]:
# Preview a single scraped roster; keys are season + team code (here '2001' + 'MIA').
# NOTE(review): position_height_dfs is only assigned in a later cell, so on a
# fresh kernel this cell fails unless that cell has already run — confirm cell order.
position_height_dfs['2001MIA']

Unnamed: 0,Player,Pos
0,Anthony Mason,PF
1,Eddie Jones,SG
2,Tim Hardaway,PG
3,Brian Grant,C
4,Bruce Bowen,SF
5,Dan Majerle,SG
6,Alonzo Mourning,C
7,Anthony Carter,PG
8,A.C. Green,PF
9,Cedric Ceballos,SF


In [None]:
# Seasons '2001' .. '2024' as strings, because they are concatenated into
# both the page URL and the dictionary key below.
seasons = [str(year) for year in range(2001, 2025)]
position_height_dfs = {}
for season in seasons:
    for team in nba_teams.values():
        url = f'https://www.basketball-reference.com/teams/{team}/{season}.html'
        if page_not_exists(url):
            print(f"Page not found for {team} in {season}. Skipping...")
            # Pause before the next request even though nothing was scraped.
            time.sleep(3.68)
            continue
        # Fetch the data if the page exists
        df = get_bballref_records_df(url, team, season)
        if df is None:
            # The fetch gave up or the table could not be parsed. Storing None
            # would break the later cells that add a column to every entry.
            continue
        position_height_dfs[season+team] = df

Found table for MIA in 2001.
Found table for CHI in 2001.
Found table for PHI in 2001.
Found table for NJN in 2001.
Found table for GSW in 2001.
Found table for BOS in 2001.
Found table for IND in 2001.
Found table for ATL in 2001.
Found table for NYK in 2001.
Found table for TOR in 2001.
Found table for CLE in 2001.
Found table for ORL in 2001.
Found table for PHO in 2001.
Found table for DEN in 2001.
Found table for HOU in 2001.
Found table for MIN in 2001.
Found table for SAS in 2001.
Found table for POR in 2001.
Found table for SAC in 2001.
Found table for CHH in 2001.
Found table for DET in 2001.
Found table for DAL in 2001.
Found table for SEA in 2001.
Found table for VAN in 2001.
Found table for LAL in 2001.
Found table for LAC in 2001.
Found table for UTA in 2001.
Found table for WAS in 2001.
Found table for MIL in 2001.
Page not found for MEM in 2001. Skipping...
Page not found for NOH in 2001. Skipping...
Page not found for NOK in 2001. Skipping...
Page not found for OKC in 2

Page not found for CHO in 2007. Skipping...
Found table for CHA in 2007.
Found table for MIA in 2008.
Found table for CHI in 2008.
Found table for PHI in 2008.
Found table for NJN in 2008.
Found table for GSW in 2008.
Found table for BOS in 2008.
Found table for IND in 2008.
Found table for ATL in 2008.
Found table for NYK in 2008.
Found table for TOR in 2008.
Found table for CLE in 2008.
Found table for ORL in 2008.
Found table for PHO in 2008.
Found table for DEN in 2008.
Found table for HOU in 2008.
Found table for MIN in 2008.
Found table for SAS in 2008.
Found table for POR in 2008.
Found table for SAC in 2008.
Page not found for CHH in 2008. Skipping...
Found table for DET in 2008.
Found table for DAL in 2008.
Found table for SEA in 2008.
Page not found for VAN in 2008. Skipping...
Found table for LAL in 2008.
Found table for LAC in 2008.
Found table for UTA in 2008.
Found table for WAS in 2008.
Found table for MIL in 2008.
Found table for MEM in 2008.
Found table for NOH in 2008

Found table for NOP in 2014.
Page not found for CHO in 2014. Skipping...
Found table for CHA in 2014.
Found table for MIA in 2015.
Found table for CHI in 2015.
Found table for PHI in 2015.
Page not found for NJN in 2015. Skipping...
Found table for GSW in 2015.
Found table for BOS in 2015.
Found table for IND in 2015.
Found table for ATL in 2015.
Found table for NYK in 2015.
Found table for TOR in 2015.
Found table for CLE in 2015.
Found table for ORL in 2015.
Found table for PHO in 2015.
Found table for DEN in 2015.
Found table for HOU in 2015.
Found table for MIN in 2015.
Found table for SAS in 2015.
Found table for POR in 2015.
Found table for SAC in 2015.
Page not found for CHH in 2015. Skipping...
Found table for DET in 2015.
Found table for DAL in 2015.
Page not found for SEA in 2015. Skipping...
Page not found for VAN in 2015. Skipping...
Found table for LAL in 2015.
Found table for LAC in 2015.
Found table for UTA in 2015.
Found table for WAS in 2015.
Found table for MIL in 201

Found table for BRK in 2021.
Found table for NOP in 2021.
Found table for CHO in 2021.
Page not found for CHA in 2021. Skipping...
Found table for MIA in 2022.
Found table for CHI in 2022.
Found table for PHI in 2022.
Page not found for NJN in 2022. Skipping...
Found table for GSW in 2022.
Found table for BOS in 2022.
Found table for IND in 2022.
Found table for ATL in 2022.
Found table for NYK in 2022.
Found table for TOR in 2022.
Found table for CLE in 2022.
Found table for ORL in 2022.
Found table for PHO in 2022.
Found table for DEN in 2022.
Found table for HOU in 2022.
Found table for MIN in 2022.
Found table for SAS in 2022.
Found table for POR in 2022.
Found table for SAC in 2022.
Page not found for CHH in 2022. Skipping...
Found table for DET in 2022.
Found table for DAL in 2022.
Page not found for SEA in 2022. Skipping...
Page not found for VAN in 2022. Skipping...
Found table for LAL in 2022.
Found table for LAC in 2022.
Found table for UTA in 2022.
Found table for WAS in 202

In [90]:
# Label every row with the season+team key it was scraped under, so the origin
# of each row is still known after the frames are concatenated.
for roster, roster_df in position_height_dfs.items():
    if roster_df is None:
        # A failed scrape may have left None under this key; there is no
        # frame to tag, and item assignment on None would raise TypeError.
        continue
    roster_df['Roster'] = roster

In [93]:
# Stack every per-roster frame into one long table with a fresh 0..n-1 index.
height_pos_df = pd.concat(
    objs=list(position_height_dfs.values()),
    ignore_index=True,
)

In [94]:
# Write the combined table to disk. The destination is an absolute path on
# the author's machine and must be adjusted before running elsewhere.
csv_destination = r"C:\Users\vaugh\Desktop\basketball-pf-research\Basketball-reference data\heights_and_positions_df(2001-2024).csv"
height_pos_df.to_csv(path_or_buf=csv_destination)