In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import random
import os
import requests
from bs4 import BeautifulSoup, Comment
from datetime import datetime
import html5lib
import lxml

from utils import mp_to_minutes

In [2]:
# extract draft class from the url
def load_draft_class(draft_year):
    url = f"https://www.basketball-reference.com/draft/NBA_{draft_year}.html"

    df = pd.read_html(url)[0]
    df_links = pd.read_html(url, extract_links="body")[0]

    df.columns = [f"{a}_{b}".strip("_") for a, b in df.columns]
    df_links.columns = df.columns

    df = df[df["Round 1_Player"].notna()].copy()

    df["player"] = df["Round 1_Player"]
    df["player_id"] = df_links["Round 1_Player"].apply(
        lambda x: x[1].split("/")[-1].replace(".html", "")
        if isinstance(x, tuple) and x[1]
        else None
    )

    # üîë drop header rows like "Round 2"
    df = df[df["player_id"].notna()].copy()

    df["draft_year"] = draft_year
    df["rookie_season"] = draft_year + 1

    return df[["draft_year", "player", "player_id", "rookie_season"]]



In [3]:
# load or scrape draft class with caching
# def load_or_scrape_draft(year):
#     path = f"assets/drafts/draft_{year}.csv"
#     if os.path.exists(path):
#         return pd.read_csv(path)
    
#     df = load_draft_class(year)
#     df.to_csv(path, index=False)
#     return df

# function to scrape a single draft class
def scrape_and_save_year(year):
    df = load_draft_class(year)
    df.to_csv(f"assets/drafts/draft_{year}.csv", index=False)
    print(f"‚úì saved {year}")


In [4]:
# iterate through years and save each draft class

# years = [2022, 2023, 2024, 2025]

# for y in years:
#     scrape_and_save_year(y)
#     time.sleep(10)

In [28]:
# freeze the draft years collected
import glob
draft_classes = pd.concat(
    [pd.read_csv(f) for f in glob.glob("assets/drafts/draft_20*.csv")],
    ignore_index=True
)
draft_classes.head(100)
draft_classes.to_csv("assets/draft_classes.csv", index=False)


In [6]:
# override rookie seasons for specific players
rookie_overrides = {
    "griffbl01": 2011,  # drafted 2009, rookie season 2010‚Äì11
    "embiijo01": 2016,  # drafted 2014, rookie season 2015-16
}

In [7]:
df = pd.read_html(
    "https://www.basketball-reference.com/players/j/jamesle01/gamelog/2004",
    attrs={"id": "player_game_log_reg"}
)[0]

print(list(df.columns))

['Rk', 'Gcar', 'Gtm', 'Date', 'Team', 'Unnamed: 5', 'Opp', 'Result', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-']


In [7]:
def load_rookie_gamelog(player_id, season):
    first_letter = player_id[0]
    url = (
        f"https://www.basketball-reference.com/players/"
        f"{first_letter}/{player_id}/gamelog/{season}"
    )

    df = pd.read_html(
        url,
        attrs={"id": "player_game_log_reg"}
    )[0]

    # 1Ô∏è‚É£ KEEP ONLY REAL GAMES (must have an opponent)
    df = df[df["Opp"].notna()].copy()

    # 2Ô∏è‚É£ Convert MP to numeric (DNPs become NaN)
    df["MP"] = df["MP"].apply(mp_to_minutes)

    # 3Ô∏è‚É£ Drop games not played
    df = df[df["MP"].notna()].copy()

    return df


In [9]:
df = load_rookie_gamelog("jamesle01", 2004)

print("Rows:", len(df))
print(df[["Date", "Opp", "MP", "PTS"]].head(10))


Rows: 79
          Date  Opp         MP PTS
0   2003-10-29  SAC  42.833333  25
1   2003-10-30  PHO  40.350000  21
3   2003-11-01  POR  39.166667   8
4   2003-11-05  DEN  41.100000   7
5   2003-11-07  IND  43.733333  23
6   2003-11-08  WAS  44.500000  17
7   2003-11-10  NYK  33.650000  17
8   2003-11-12  MIA  42.666667  18
9   2003-11-14  BOS  35.600000  10
10  2003-11-15  PHI  46.950000  22


In [8]:
def aggregate_capped_minutes(
    gamelog,
    cap_minutes=900,
    stat_cols=("PTS", "TRB", "AST", "STL", "BLK", "TOV"),
):
    """
    Aggregate rookie production up to a capped number of minutes.
    """

    # Ensure counting stats are numeric
    gamelog = gamelog.copy()
    for stat in stat_cols:
        gamelog[stat] = pd.to_numeric(gamelog[stat], errors="coerce")

    total_minutes = 0.0
    totals = {stat: 0.0 for stat in stat_cols}

    for _, row in gamelog.iterrows():
        mp = row["MP"]

        if total_minutes >= cap_minutes:
            break

        remaining = cap_minutes - total_minutes

        if mp <= remaining:
            weight = 1.0
            used_mp = mp
        else:
            weight = remaining / mp
            used_mp = remaining

        total_minutes += used_mp

        for stat in stat_cols:
            if pd.notna(row[stat]):
                totals[stat] += row[stat] * weight

    return {
        "minutes_used": total_minutes,
        **totals,
    }

In [9]:
lebron_2004 = load_rookie_gamelog("jamesle01", 2004)

agg_300 = aggregate_capped_minutes(lebron_2004, cap_minutes=300)
agg_900 = aggregate_capped_minutes(lebron_2004, cap_minutes=900)

agg_300, agg_900

({'minutes_used': 300.0,
  'PTS': 124.18750000000001,
  'TRB': 52.03125,
  'AST': 48.40625,
  'STL': 11.6875,
  'BLK': 5.34375,
  'TOV': 25.375},
 {'minutes_used': 900.0,
  'PTS': 386.99963302752303,
  'TRB': 146.3783486238532,
  'AST': 140.83779816513763,
  'STL': 28.0,
  'BLK': 14.0,
  'TOV': 80.91889908256881})

In [10]:
# calculate rate stats
def add_rate_stats(agg, minutes_base=36):
    factor = minutes_base / agg["minutes_used"]
    return {
        f"{k}_per_{minutes_base}": v * factor
        for k, v in agg.items()
        if k not in ("minutes_used",)
    }

In [13]:
rookie_caps_900 = pd.read_csv("assets/rookie_caps_900.csv")
errors_df = pd.read_csv("assets/rookie_errors.csv")

In [14]:
len(rookie_caps_900) - len(errors_df)

-1516

In [None]:
retry = errors_df[errors_df["error"].str.contains("429")]
retry.head()

Unnamed: 0,player_id,rookie_season,error
1,alexaco02,2001,HTTP Error 429: Too Many Requests
2,cleavma01,2001,HTTP Error 429: Too Many Requests
3,collija02,2001,HTTP Error 429: Too Many Requests
4,turkohe01,2001,HTTP Error 429: Too Many Requests
5,masonde01,2001,HTTP Error 429: Too Many Requests


In [None]:
new_results = []
new_errors = []
cap = 900

for _, row in retry.iterrows():
    try:
        gamelog = load_rookie_gamelog(row.player_id, row.rookie_season)
        agg = aggregate_capped_minutes(gamelog, cap_minutes=cap)

        new_results.append({
            "player_id": row.player_id,
            "rookie_season": row.rookie_season,
            **agg
        })

        time.sleep(6)  # polite delay

    except Exception as e:
        new_errors.append({
            "player_id": row.player_id,
            "rookie_season": row.rookie_season,
            "error": str(e),
        })


In [17]:
rookie_caps_900 = pd.concat([rookie_caps_900, pd.DataFrame(new_results)], ignore_index=True)
errors_df = pd.concat([errors_df, pd.DataFrame(new_errors)], ignore_index=True)

In [20]:
# reconstruct missing player names and draft years
rookie_caps_900 = rookie_caps_900.merge(
    draft_classes,
    on=["player_id", "rookie_season"],
    how="left",
    suffixes=("", "_fix"),
)

rookie_caps_900["draft_year"] = rookie_caps_900["draft_year"].fillna(rookie_caps_900["draft_year_fix"])
rookie_caps_900["player"] = rookie_caps_900["player"].fillna(rookie_caps_900["player_fix"])

rookie_caps_900 = rookie_caps_900.drop(columns=[c for c in rookie_caps_900.columns if c.endswith("_fix")])
rookie_caps_900.head(200)


Unnamed: 0,draft_year,player,player_id,rookie_season,minutes_used,PTS,TRB,AST,hit_cap,STL,BLK,TOV
0,2000.0,Kenyon Martin,martike01,2001,900.000000,287.515991,197.842217,46.000000,True,,,
1,2000.0,Stromile Swift,swiftst01,2001,900.000000,276.023829,186.755957,17.000000,True,,,
2,2000.0,Darius Miles,milesda01,2001,900.000000,316.055385,227.523077,46.513846,True,,,
3,2000.0,Marcus Fizer,fizerma01,2001,900.000000,354.029369,188.726872,47.000000,True,,,
4,2000.0,Mike Miller,millemi01,2001,900.000000,328.808327,154.819095,56.000000,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
122,2002.0,Lonny Baxter,baxtelo01,2003,683.600000,262.000000,165.000000,16.000000,,9.0,22.000000,46.000000
123,2002.0,Jamal Sampson,sampsja01,2003,8.816667,0.000000,2.000000,1.000000,,1.0,0.000000,0.000000
124,2002.0,Chris Owens,owensch01,2003,6.316667,4.000000,1.000000,0.000000,,0.0,0.000000,1.000000
125,2002.0,Rasual Butler,butlera01,2003,900.000000,323.804291,107.000000,52.982208,,11.0,24.982208,42.982208


In [21]:
# deduplicate to keep only one entry per player per rookie season
rookie_caps_900 = (
    rookie_caps_900
    .sort_values("minutes_used", ascending=False)
    .drop_duplicates(subset=["player_id", "rookie_season"], keep="first")
    .reset_index(drop=True)
)

In [22]:
rookie_caps_900.head(200)
rookie_caps_900.to_csv("assets/rookie_caps_900.csv", index=False)
errors_df.to_csv("assets/rookie_errors.csv", index=False)

In [24]:
rookie_caps_900.shape
rookie_caps_900["rookie_season"].min(), rookie_caps_900["rookie_season"].max()


(np.int64(2001), np.int64(2006))

In [25]:
rookie_caps_900.duplicated(subset=["player_id", "rookie_season"]).sum()

np.int64(0)

In [29]:
# load existing data
rookie_caps_900 = pd.read_csv("assets/rookie_caps_900.csv")
errors_df = pd.read_csv("assets/rookie_errors.csv")
draft_classes = pd.read_csv("assets/draft_classes.csv")

# zip to sets for easy comparison
already_done = set(zip(rookie_caps_900.player_id, rookie_caps_900.rookie_season))
already_failed = set(zip(errors_df.player_id, errors_df.rookie_season))

In [30]:
# build todo dataframe
mask_done = draft_classes.apply(
    lambda r: (r.player_id, r.rookie_season) in already_done,
    axis=1
)

mask_failed = draft_classes.apply(
    lambda r: (r.player_id, r.rookie_season) in already_failed,
    axis=1
)

todo = draft_classes[~mask_done & ~mask_failed].copy()

In [37]:
print("Total draft classes:", len(draft_classes))
print("Already done:", len(already_done))
print("Already failed:", len(already_failed))
print("Remaining todo:", len(todo))

todo.head()

retry = draft_classes.merge(
    errors_df[["player_id", "rookie_season"]],
    on=["player_id", "rookie_season"],
    how="inner"
)

len(retry), retry.head()

Total draft classes: 1542
Already done: 127
Already failed: 1529
Remaining todo: 0


(2943,
    draft_year              player  player_id  rookie_season
 0        2000         Etan Thomas  thomaet01           2001
 1        2000  Courtney Alexander  alexaco02           2001
 2        2000      Mateen Cleaves  cleavma01           2001
 3        2000       Jason Collier  collija02           2001
 4        2000       Hedo T√ºrkoƒülu  turkohe01           2001)

In [38]:
new_results = []
new_errors = []

for i, (_, row) in enumerate(retry.iterrows(), 1):
    try:
        gamelog = load_rookie_gamelog(row.player_id, row.rookie_season)
        agg = aggregate_capped_minutes(gamelog, cap_minutes=900)

        new_results.append({
            **row.to_dict(),
            **agg
        })

        if i % 10 == 0:
            print(f"Processed {i}/{len(retry)}")

        time.sleep(6)

    except Exception as e:
        new_errors.append({
            "player_id": row.player_id,
            "rookie_season": row.rookie_season,
            "error": str(e),
        })

Processed 20/2943
Processed 30/2943


In [40]:
len(new_results)

30

In [39]:
if new_results:
    rookie_caps_900 = pd.concat([rookie_caps_900, pd.DataFrame(new_results)], ignore_index=True)
    rookie_caps_900 = rookie_caps_900.drop_duplicates(subset=["player_id", "rookie_season"])

if new_errors:
    errors_df = pd.concat([errors_df, pd.DataFrame(new_errors)], ignore_index=True)
    errors_df = errors_df.drop_duplicates(subset=["player_id", "rookie_season"])

rookie_caps_900.head(200)

Unnamed: 0,draft_year,player,player_id,rookie_season,minutes_used,PTS,TRB,AST,hit_cap,STL,BLK,TOV
0,2000.0,Kenyon Martin,martike01,2001,900.000000,287.515991,197.842217,46.000000,True,,,
1,2000.0,Stromile Swift,swiftst01,2001,900.000000,276.023829,186.755957,17.000000,True,,,
2,2000.0,Darius Miles,milesda01,2001,900.000000,316.055385,227.523077,46.513846,True,,,
3,2000.0,Marcus Fizer,fizerma01,2001,900.000000,354.029369,188.726872,47.000000,True,,,
4,2000.0,Mike Miller,millemi01,2001,900.000000,328.808327,154.819095,56.000000,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
122,2002.0,Tito Maddox,maddoti01,2003,34.016667,11.000000,7.000000,5.000000,,3.0,1.0,3.0
123,2000.0,Jason Hart,hartja01,2001,9.916667,2.000000,0.000000,1.000000,,0.0,0.0,2.0
124,2000.0,Mamadou N'Diaye,ndiayma02,2001,9.433333,4.000000,2.000000,0.000000,,0.0,0.0,0.0
125,2002.0,Jamal Sampson,sampsja01,2003,8.816667,0.000000,2.000000,1.000000,,1.0,0.0,0.0


In [36]:
todo.head(20)

Unnamed: 0,draft_year,player,player_id,rookie_season


In [None]:
# batch process rookie classes
# cap = 900
# results = []
# errors = []

# for _, row in draft_classes.iterrows():
#     try:
#         gamelog = load_rookie_gamelog(row.player_id, row.rookie_season)
#         agg = aggregate_capped_minutes(gamelog, cap_minutes=cap)

#         results.append({
#             **row.to_dict(),
#             **agg
#         })

#     except Exception as e:
#         errors.append({
#             "player_id": row.player_id,
#             "rookie_season": row.rookie_season,
#             "error": str(e),
#         })

# rookie_caps_900 = pd.DataFrame(results)
# errors_df = pd.DataFrame(errors)

In [None]:
rookie_caps_900.head(200)

Unnamed: 0,draft_year,player,player_id,rookie_season,minutes_used,PTS,TRB,AST
0,2000,Kenyon Martin,martike01,2001,900.0,287.515991,197.842217,46.0
1,2000,Stromile Swift,swiftst01,2001,900.0,276.023829,186.755957,17.0
2,2000,Darius Miles,milesda01,2001,900.0,316.055385,227.523077,46.513846
3,2000,Marcus Fizer,fizerma01,2001,900.0,354.029369,188.726872,47.0
4,2000,Mike Miller,millemi01,2001,900.0,328.808327,154.819095,56.0
5,2000,DerMarr Johnson,johnsde03,2001,900.0,291.886534,113.591022,48.0
6,2000,Chris Mihm,mihmch01,2001,900.0,339.696023,215.278409,10.379735
7,2000,Jamal Crawford,crawfja01,2001,900.0,240.824859,70.329944,124.164972
8,2000,Joel Przybilla,przybjo01,2001,267.066667,27.0,71.0,2.0
9,2000,Keyon Dooling,doolike01,2001,900.0,321.989399,61.123675,128.123675


In [22]:
errors_df.head(200)

Unnamed: 0,player_id,rookie_season,error
0,thomaet01,2001,No tables found
1,alexaco02,2001,HTTP Error 429: Too Many Requests
2,cleavma01,2001,HTTP Error 429: Too Many Requests
3,collija02,2001,HTTP Error 429: Too Many Requests
4,turkohe01,2001,HTTP Error 429: Too Many Requests
...,...,...,...
195,szewcsz01,2004,HTTP Error 429: Too Many Requests
196,austima01,2004,HTTP Error 429: Too Many Requests
197,hansetr01,2004,HTTP Error 429: Too Many Requests
198,blakest01,2004,HTTP Error 429: Too Many Requests


In [25]:
retry_ids = errors_df.query("error.str.contains('429')", engine="python")
# rerun only these later, slowly
retry_ids

Unnamed: 0,player_id,rookie_season,error
1,alexaco02,2001,HTTP Error 429: Too Many Requests
2,cleavma01,2001,HTTP Error 429: Too Many Requests
3,collija02,2001,HTTP Error 429: Too Many Requests
4,turkohe01,2001,HTTP Error 429: Too Many Requests
5,masonde01,2001,HTTP Error 429: Too Many Requests
...,...,...,...
1524,olbrila01,2026,HTTP Error 429: Too Many Requests
1525,richawi02,2026,HTTP Error 429: Too Many Requests
1526,shulgma01,2026,HTTP Error 429: Too Many Requests
1527,niangsa01,2026,HTTP Error 429: Too Many Requests


In [26]:
rookie_caps_900["hit_cap"] = rookie_caps_900["minutes_used"] >= cap

In [27]:
rookie_caps_900.to_csv("assets/rookie_caps_900.csv", index=False)
errors_df.to_csv("assets/rookie_errors.csv", index=False)