In [1]:
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
import html5lib
from fake_useragent import UserAgent

# Franchise display name -> team abbreviation used in basketball-reference
# team URLs (https://www.basketball-reference.com/teams/{abbr}/{season}.html).
# Historical abbreviations are listed next to current ones; the scraping loop
# tries every abbreviation for every season and skips combinations that 404.
# Only the values are consumed by the scraper; the keys are labels for readers.
nba_teams = {
    'Miami Heat': 'MIA',
    'Chicago Bulls': 'CHI',
    'Philadelphia 76ers': 'PHI',
    'New Jersey Nets': 'NJN',
    'Golden State Warriors': 'GSW',
    'Boston Celtics': 'BOS',
    'Indiana Pacers': 'IND',
    'Atlanta Hawks': 'ATL',
    'New York Knicks': 'NYK',
    'Toronto Raptors': 'TOR',
    'Cleveland Cavaliers': 'CLE',
    'Orlando Magic': 'ORL',
    'Phoenix Suns': 'PHO',
    'Denver Nuggets': 'DEN',
    'Houston Rockets': 'HOU',
    'Minnesota Timberwolves': 'MIN',
    'San Antonio Spurs': 'SAS',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'Charlotte Hornets': 'CHH',
    'Detroit Pistons': 'DET',
    'Dallas Mavericks': 'DAL',
    'Seattle SuperSonics': 'SEA',
    'Vancouver Grizzlies': 'VAN',
    'Los Angeles Lakers': 'LAL',
    'Los Angeles Clippers': 'LAC',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
    'Milwaukee Bucks': 'MIL',
    'Memphis Grizzlies': 'MEM',
    'New Orleans Hornets': 'NOH',
    'New Orleans/Oklahoma City Hornets': 'NOK',
    'Oklahoma City Thunder': 'OKC',
    'Brooklyn Nets': 'BRK',
    'New Orleans Pelicans': 'NOP',
    # Basketball-Reference files the Bobcats era under CHA and the re-branded
    # Hornets (distinct from the original 'CHH' franchise above) under CHO.
    'Charlotte Bobcats': 'CHA',
    'New Charlotte Hornets': 'CHO'
}

# Function to check if the webpage exists
def page_not_exists(url, timeout=30):
    """Return True when a GET request for ``url`` is answered with HTTP 404.

    Any other status code (including rate-limit or server errors) yields
    False, so the caller will still attempt to fetch and parse the page.

    Args:
        url: Address of the page to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    return response.status_code == 404

def get_bballref_records_df(url, team, season, retry_count = 0, max_retries = 3,
                            table_id = 'roster', timeout = 30):
    """Download a team-season page and return its Player / Pos / Ht columns.

    The page is fetched after a fixed pause, parsed with BeautifulSoup, and
    the ``<table>`` whose id equals ``table_id`` is converted to a DataFrame.
    When the table is missing (or the request fails), the fetch is repeated
    until ``max_retries`` is reached.

    Args:
        url: Full URL of the team-season page.
        team: Team abbreviation; used only in progress messages.
        season: Season label; used only in progress messages.
        retry_count: Number of retries already consumed. Callers normally
            leave this at 0; it is kept for backward compatibility.
        max_retries: Highest retry count allowed before giving up.
        table_id: HTML id of the table to extract. Defaults to 'roster',
            which is the table the messages and the kept 'Ht' column refer to.
        timeout: Seconds to wait for the server on each request.

    Returns:
        A DataFrame restricted to whichever of the columns 'Player', 'Pos'
        and 'Ht' the table contains, or None when the table could not be
        found within the retry budget or could not be parsed.
    """
    # Local import: the notebook's import cell does not provide StringIO.
    from io import StringIO

    # Pause before every request (presumably to respect the site's rate limit).
    delay_seconds = 3.68
    wanted_columns = ['Player', 'Pos', 'Ht']

    attempt = retry_count
    while True:
        time.sleep(delay_seconds)

        table = None
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # A network hiccup should cost one attempt, not abort the whole scrape.
            print(f"Request failed for {team} in {season}: {exc}")
        else:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': table_id})

        if table is not None:
            break

        if attempt >= max_retries:
            print(f"Max retries reached for {team} in {season}. Moving to the next team.")
            return None

        print(f"No '{table_id}' table found for {team} in {season}. Retrying...")
        attempt += 1

    # Wrap the markup in StringIO: passing literal HTML text to read_html is
    # deprecated in recent pandas releases.
    try:
        df = pd.read_html(StringIO(str(table)))[0]
    except ValueError:
        print(f"Error: Could not parse the table for {team} in {season}.")
        return None

    # Keep only the columns of interest; columns absent from the table are
    # simply not present in the result.
    df = df.drop(columns=[column for column in df.columns if column not in wanted_columns])

    print(f"Found table for {team} in {season}.")

    return df

In [91]:
# Preview one scraped roster. Keys are season + team abbreviation (here the
# 2001 MIA entry). NOTE(review): position_height_dfs is built, and the
# 'Roster' column shown below is added, in cells that appear later in this
# notebook, so this cell fails on a fresh top-to-bottom run.
position_height_dfs['2001MIA']

Unnamed: 0,Player,Pos,Ht,Roster
0,Bruce Bowen,SF,6-7,2001MIA
1,Anthony Carter,PG,6-1,2001MIA
2,Duane Causwell,C,7-0,2001MIA
3,Cedric Ceballos,SF,6-7,2001MIA
4,Ricky Davis,SG,6-6,2001MIA
5,Todd Fuller,C,6-11,2001MIA
6,Brian Grant,C,6-9,2001MIA
7,A.C. Green,PF,6-9,2001MIA
8,Tim Hardaway,PG,6-0,2001MIA
9,Eddie House,PG,6-1,2001MIA


In [None]:
# Season labels '2001' through '2024', as used in the team-page URLs.
seasons = [str(year) for year in range(2001, 2025)]

# Maps season + team abbreviation (e.g. '2001MIA') to that roster's DataFrame.
position_height_dfs = {}
for season in seasons:
    for team in nba_teams.values():
        url = f'https://www.basketball-reference.com/teams/{team}/{season}.html'
        if page_not_exists(url):
            print(f"Page not found for {team} in {season}. Skipping...")
            # Pause before the next probe so requests stay evenly spaced.
            time.sleep(3.68)
            continue
        # Fetch the data if the page exists
        df = get_bballref_records_df(url, team, season)
        if df is None:
            # The helper already reported the failure. Storing None here would
            # break the later cell that assigns a 'Roster' column to every entry.
            continue
        position_height_dfs[season+team] = df

Found table for MIA in 2001.
Found table for CHI in 2001.
Found table for PHI in 2001.
Found table for NJN in 2001.
Found table for GSW in 2001.
Found table for BOS in 2001.
Found table for IND in 2001.
Found table for ATL in 2001.
Found table for NYK in 2001.
Found table for TOR in 2001.
Found table for CLE in 2001.
Found table for ORL in 2001.
Found table for PHO in 2001.
Found table for DEN in 2001.


In [90]:
# Stamp every roster frame with its own dictionary key (season + team) so the
# origin of each row survives concatenation.
for roster, roster_df in position_height_dfs.items():
    if roster_df is None:
        # A failed fetch is stored as None; column assignment on it would
        # raise TypeError, so skip it.
        continue
    roster_df['Roster'] = roster

In [93]:
# Stack all per-roster frames into one long table; ignore_index discards the
# per-frame row labels and renumbers the rows consecutively.
height_pos_df = pd.concat(
    list(position_height_dfs.values()),
    ignore_index=True,
)

In [94]:
# Write the combined table to disk as CSV.
# NOTE(review): this is an absolute path on one specific machine; consider a
# configurable data directory so the notebook runs elsewhere.
output_csv_path = r"C:\Users\vaugh\Desktop\basketball-pf-research\Basketball-reference data\heights_and_positions_df(2001-2024).csv"
height_pos_df.to_csv(output_csv_path)