In [20]:
import glob
import os
import random
import time
from datetime import datetime
from io import StringIO
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment

from utils import mp_to_minutes

In [2]:
# Pull the first table from the 2003 draft page and inspect its column header.
draft_2003_url = "https://www.basketball-reference.com/draft/NBA_2003.html"
df = pd.read_html(draft_2003_url)[0]
print(df.columns)

MultiIndex([('Unnamed: 0_level_0',      'Rk'),
            ('Unnamed: 1_level_0',      'Pk'),
            ('Unnamed: 2_level_0',      'Tm'),
            (           'Round 1',  'Player'),
            (           'Round 1', 'College'),
            ('Unnamed: 5_level_0',     'Yrs'),
            (            'Totals',       'G'),
            (            'Totals',      'MP'),
            (            'Totals',     'PTS'),
            (            'Totals',     'TRB'),
            (            'Totals',     'AST'),
            (          'Shooting',     'FG%'),
            (          'Shooting',     '3P%'),
            (          'Shooting',     'FT%'),
            (          'Per Game',      'MP'),
            (          'Per Game',     'PTS'),
            (          'Per Game',     'TRB'),
            (          'Per Game',     'AST'),
            (          'Advanced',      'WS'),
            (          'Advanced',   'WS/48'),
            (          'Advanced',     'BPM'),
            (

In [3]:
# Flatten the two-level header into single "<group>_<name>" labels.
# The MultiIndex guard makes this cell safe to re-run: once the columns are
# flat strings, unpacking each label into (a, b) would raise instead.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = [
        f"{a}_{b}".strip("_")
        for a, b in df.columns
    ]
print(df.columns)

Index(['Unnamed: 0_level_0_Rk', 'Unnamed: 1_level_0_Pk',
       'Unnamed: 2_level_0_Tm', 'Round 1_Player', 'Round 1_College',
       'Unnamed: 5_level_0_Yrs', 'Totals_G', 'Totals_MP', 'Totals_PTS',
       'Totals_TRB', 'Totals_AST', 'Shooting_FG%', 'Shooting_3P%',
       'Shooting_FT%', 'Per Game_MP', 'Per Game_PTS', 'Per Game_TRB',
       'Per Game_AST', 'Advanced_WS', 'Advanced_WS/48', 'Advanced_BPM',
       'Advanced_VORP'],
      dtype='object')


In [4]:
# Peek at the first ten values of the flattened player-name column.
df.loc[:, "Round 1_Player"].head(10)

0       LeBron James
1      Darko Miličić
2    Carmelo Anthony
3         Chris Bosh
4        Dwyane Wade
5        Chris Kaman
6       Kirk Hinrich
7          T.J. Ford
8      Mike Sweetney
9       Jarvis Hayes
Name: Round 1_Player, dtype: object

In [5]:
# Keep only the rows that have a value in "Round 1_Player".
df = df.dropna(subset=["Round 1_Player"]).copy()

In [3]:
def _player_id_from_cell(cell):
    """Return the player id from a ``(text, href)`` cell, or None if it has no link."""
    if isinstance(cell, tuple) and cell[1]:
        # The id is the last path segment of the href, minus ".html".
        return cell[1].split("/")[-1].replace(".html", "")
    return None


# extract draft class from the url
def load_draft_class(draft_year):
    """Scrape one draft class from its Basketball-Reference draft page.

    The page is downloaded a single time and then parsed twice from memory:
    once for the cell text and once with ``extract_links="body"`` to recover
    each player's link. Passing the URL to ``pd.read_html`` twice would fetch
    the same page twice per call.

    Parameters
    ----------
    draft_year : int
        Draft year used to build the page URL; ``draft_year + 1`` is stored
        as the rookie season.

    Returns
    -------
    pandas.DataFrame
        Columns ``draft_year``, ``player``, ``player_id`` and
        ``rookie_season``, one row per table row whose player cell
        carries a link.
    """
    url = f"https://www.basketball-reference.com/draft/NBA_{draft_year}.html"

    # One HTTP request; both parses below read from this in-memory copy.
    with urlopen(url) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        html = response.read().decode(charset, errors="replace")

    df = pd.read_html(StringIO(html))[0]
    df_links = pd.read_html(StringIO(html), extract_links="body")[0]

    # Flatten the two-level header and reuse the same labels on the links frame.
    df.columns = [f"{a}_{b}".strip("_") for a, b in df.columns]
    df_links.columns = df.columns

    df = df[df["Round 1_Player"].notna()].copy()

    df["player"] = df["Round 1_Player"]
    # Assignment aligns on the row index, so ids line up with the filtered rows.
    df["player_id"] = df_links["Round 1_Player"].apply(_player_id_from_cell)

    # Drop header rows like "Round 2": their player cell has no link.
    df = df[df["player_id"].notna()].copy()

    df["draft_year"] = draft_year
    df["rookie_season"] = draft_year + 1

    return df[["draft_year", "player", "player_id", "rookie_season"]]



In [4]:
# load or scrape draft class with caching
# def load_or_scrape_draft(year):
#     path = f"assets/drafts/draft_{year}.csv"
#     if os.path.exists(path):
#         return pd.read_csv(path)
    
#     df = load_draft_class(year)
#     df.to_csv(path, index=False)
#     return df

# function to scrape a single draft class
def scrape_and_save_year(year):
    """Scrape the draft class for ``year`` and save it as a CSV file.

    Writes ``assets/drafts/draft_{year}.csv`` without the index, creating
    the ``assets/drafts`` directory first if it does not exist, then prints
    a confirmation line.

    Parameters
    ----------
    year : int
        Draft year passed through to ``load_draft_class``.
    """
    df = load_draft_class(year)
    # to_csv does not create missing parent directories.
    os.makedirs("assets/drafts", exist_ok=True)
    df.to_csv(f"assets/drafts/draft_{year}.csv", index=False)
    print(f"✓ saved {year}")


In [5]:
# iterate through years and save each draft class

# years = [2022, 2023, 2024, 2025]

# for y in years:
#     scrape_and_save_year(y)
#     time.sleep(10)

In [6]:
# freeze the draft years collected
# glob.glob returns paths in filesystem-dependent order, so sort them to keep
# the concatenated row order reproducible across machines and re-runs.
draft_classes = pd.concat(
    [pd.read_csv(f) for f in sorted(glob.glob("assets/drafts/draft_20*.csv"))],
    ignore_index=True
)
draft_classes.head(100)


Unnamed: 0,draft_year,player,player_id,rookie_season
0,2000,Kenyon Martin,martike01,2001
1,2000,Stromile Swift,swiftst01,2001
2,2000,Darius Miles,milesda01,2001
3,2000,Marcus Fizer,fizerma01,2001
4,2000,Mike Miller,millemi01,2001
...,...,...,...,...
95,2001,Michael Wright,wrighmi01,2002
96,2001,Earl Watson,watsoea01,2002
97,2001,Jamison Brewer,breweja01,2002
98,2001,Bobby Simmons,simmobo01,2002


In [7]:
# override rookie seasons for specific players
# Values are season END years, matching rookie_season = draft_year + 1 and the
# /gamelog/<season> URLs (e.g. the 2003 draft class uses gamelog/2004).
rookie_overrides = {
    "griffbl01": 2011,  # drafted 2009, rookie season 2010–11
    "embiijo01": 2017,  # drafted 2014, first games played in 2016–17
}

In [14]:
# Read the table with id "player_game_log_reg" from the jamesle01 / 2004
# game-log page and list its column names.
gamelog_url = "https://www.basketball-reference.com/players/j/jamesle01/gamelog/2004"
df = pd.read_html(gamelog_url, attrs={"id": "player_game_log_reg"})[0]

print(df.columns.tolist())

['Rk', 'Gcar', 'Gtm', 'Date', 'Team', 'Unnamed: 5', 'Opp', 'Result', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-']


In [22]:
def load_rookie_gamelog(player_id, season):
    """Load one player's game log for one season from Basketball-Reference.

    Reads the table with id ``player_game_log_reg`` from the player's
    game-log page, keeps rows that have an opponent, converts ``MP`` with
    ``mp_to_minutes`` and drops rows whose converted ``MP`` is missing.

    Parameters
    ----------
    player_id : str
        Player id; its first character is also the directory in the URL.
    season : int or str
        Season component of the game-log URL.

    Returns
    -------
    pandas.DataFrame
        The filtered game log with ``MP`` replaced by the converted values.
    """
    url = f"https://www.basketball-reference.com/players/{player_id[0]}/{player_id}/gamelog/{season}"
    tables = pd.read_html(url, attrs={"id": "player_game_log_reg"})

    # Step 1: keep only real games (rows that have an opponent).
    games = tables[0].dropna(subset=["Opp"]).copy()

    # Step 2: convert MP to numeric; per the original note, mp_to_minutes is
    # expected to return NaN for games not played (helper defined in utils).
    games["MP"] = games["MP"].apply(mp_to_minutes)

    # Step 3: drop games not played.
    return games.dropna(subset=["MP"]).copy()


In [23]:
# Load the jamesle01 / 2004 game log and show its size plus a few key columns.
df = load_rookie_gamelog("jamesle01", 2004)

print(f"Rows: {len(df)}")
print(df.loc[:, ["Date", "Opp", "MP", "PTS"]].head(10))


Rows: 79
          Date  Opp         MP PTS
0   2003-10-29  SAC  42.833333  25
1   2003-10-30  PHO  40.350000  21
3   2003-11-01  POR  39.166667   8
4   2003-11-05  DEN  41.100000   7
5   2003-11-07  IND  43.733333  23
6   2003-11-08  WAS  44.500000  17
7   2003-11-10  NYK  33.650000  17
8   2003-11-12  MIA  42.666667  18
9   2003-11-14  BOS  35.600000  10
10  2003-11-15  PHI  46.950000  22
