In [None]:
import io
import os
import random
import re
import time
from datetime import datetime

import html5lib
import lxml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment

from utils import mp_to_minutes

In [None]:
import duckdb

# On-disk DuckDB database shared by the cells below through `con`.
DB_PATH = "assets/rookies.duckdb"

# duckdb.connect() creates the database file when it is missing, but not its
# parent directory, so make sure the directory exists on a fresh checkout.
os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
con = duckdb.connect(DB_PATH)

In [None]:
# Create tables for database if they do not exist
#
# All four statements are sent to DuckDB in a single execute() call:
#   - draft_classes: one row per drafted player (draft year, name, id, rookie season)
#   - rookie_games:  one row per game of a player's rookie season with box-score columns
#   - player_bios:   height (inches), weight (lb) and position per player
#   - scrape_log:    status / error / timestamp of a scrape attempt
# IF NOT EXISTS makes the cell safe to re-run against an existing database.
# NOTE(review): no table declares a primary key or uniqueness constraint, so
# nothing at the schema level prevents duplicate rows on repeated inserts.
con.execute("""
CREATE TABLE IF NOT EXISTS draft_classes (
    draft_year INTEGER,
    player TEXT,
    player_id TEXT,
    rookie_season INTEGER
);

CREATE TABLE IF NOT EXISTS rookie_games (
    player_id TEXT,
    rookie_season INTEGER,
    game_no INTEGER,
    game_date DATE,
    MP DOUBLE,
    PTS INTEGER,
    TRB INTEGER,
    AST INTEGER,
    STL INTEGER,
    BLK INTEGER,
    TOV INTEGER,
    FG INTEGER,
    FGA INTEGER,
    TP INTEGER,
    TPA INTEGER,
    FT INTEGER,
    FTA INTEGER
);

CREATE TABLE IF NOT EXISTS player_bios (
    player_id TEXT,
    height_in INTEGER,
    weight_lb INTEGER,
    position TEXT
);

CREATE TABLE IF NOT EXISTS scrape_log (
    player_id TEXT,
    rookie_season INTEGER,
    status TEXT,
    error TEXT,
    last_attempt TIMESTAMP
);
""")


In [None]:
# one-time import of CSV data into database
def _reload_table_from_csv(table, csv_path, columns):
    """Replace the contents of `table` with the rows read from `csv_path`.

    The DELETE and INSERT run inside one transaction, so a missing or
    malformed CSV rolls the DELETE back instead of leaving the table empty.
    Columns are selected by name, so the load does not depend on the column
    order (or on extra columns) in the CSV file.

    Raises whatever DuckDB raises for the failing statement, after rollback.
    """
    # Quote identifiers so column names such as "position" can never be
    # mistaken for SQL keywords.
    column_list = ", ".join(f'"{c}"' for c in columns)
    con.execute("BEGIN TRANSACTION")
    try:
        con.execute(f"DELETE FROM {table}")
        con.execute(
            f"INSERT INTO {table} ({column_list}) "
            f"SELECT {column_list} FROM read_csv_auto('{csv_path}')"
        )
        con.execute("COMMIT")
    except Exception:
        con.execute("ROLLBACK")
        raise


_reload_table_from_csv(
    "draft_classes",
    "assets/draft_classes.csv",
    ["draft_year", "player", "player_id", "rookie_season"],
)

# Bios are optional: the CSV only exists once the bio scraper has run.
if os.path.exists("assets/player_bios.csv"):
    _reload_table_from_csv(
        "player_bios",
        "assets/player_bios.csv",
        ["player_id", "height_in", "weight_lb", "position"],
    )


In [2]:
# extract draft class from the url
def load_draft_class(draft_year):
    """Scrape one NBA draft class from basketball-reference.

    The draft page is downloaded once and parsed twice: once for the cell
    text and once with ``extract_links="body"`` to recover each player's
    link, from which the player id is taken.

    Parameters
    ----------
    draft_year : int
        Draft year used in the page URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``draft_year``, ``player``, ``player_id`` and
        ``rookie_season`` (always ``draft_year + 1``). Rows without a player
        link, such as repeated header rows, are dropped.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. 429).
    """
    url = f"https://www.basketball-reference.com/draft/NBA_{draft_year}.html"

    # Same browser-like User-Agent that load_rookie_gamelog sends.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    # One request instead of two: the site rate-limits aggressively.
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()

    # requests assumes ISO-8859-1 when the server declares no charset; fall
    # back to the detected encoding so accented player names survive.
    if "charset" not in resp.headers.get("Content-Type", "").lower():
        resp.encoding = resp.apparent_encoding
    html = resp.text

    df = pd.read_html(io.StringIO(html))[0]
    df_links = pd.read_html(io.StringIO(html), extract_links="body")[0]

    # Flatten the two-level header ("Round 1", "Player") -> "Round 1_Player".
    df.columns = [f"{a}_{b}".strip("_") for a, b in df.columns]
    df_links.columns = df.columns

    df = df[df["Round 1_Player"].notna()].copy()

    df["player"] = df["Round 1_Player"]
    # With extract_links each body cell is a (text, href) tuple; the id is the
    # last path segment of the href without ".html". Assignment aligns on index.
    df["player_id"] = df_links["Round 1_Player"].apply(
        lambda x: x[1].split("/")[-1].replace(".html", "")
        if isinstance(x, tuple) and x[1]
        else None
    )

    # drop header rows like "Round 2" (they carry no player link)
    df = df[df["player_id"].notna()].copy()

    df["draft_year"] = draft_year
    df["rookie_season"] = draft_year + 1

    return df[["draft_year", "player", "player_id", "rookie_season"]]



In [3]:
# load or scrape draft class with caching
# def load_or_scrape_draft(year):
#     path = f"assets/drafts/draft_{year}.csv"
#     if os.path.exists(path):
#         return pd.read_csv(path)
    
#     df = load_draft_class(year)
#     df.to_csv(path, index=False)
#     return df

# function to scrape a single draft class
def scrape_and_save_year(year):
    """Scrape the draft class for `year` and write it to assets/drafts/draft_<year>.csv.

    The output directory is created if it does not exist yet. Returns None;
    a confirmation line is printed once the file has been written.
    """
    out_dir = "assets/drafts"
    os.makedirs(out_dir, exist_ok=True)

    df = load_draft_class(year)
    df.to_csv(f"{out_dir}/draft_{year}.csv", index=False)
    # \u2713 is the check mark; written as an escape so it cannot be mis-encoded.
    print(f"\u2713 saved {year}")


In [4]:
# freeze the draft years collected
import glob

# glob returns files in filesystem order; sort so the combined CSV is stable
# from run to run.
draft_files = sorted(glob.glob("assets/drafts/draft_20*.csv"))
if not draft_files:
    raise FileNotFoundError(
        "No draft CSVs match assets/drafts/draft_20*.csv - run scrape_and_save_year first"
    )

draft_classes = pd.concat(
    [pd.read_csv(f) for f in draft_files],
    ignore_index=True
)
draft_classes.to_csv("assets/draft_classes.csv", index=False)

# Preview as the cell's last expression so it is actually displayed.
draft_classes.head()


In [5]:
# Rookie-season overrides, keyed by player id, for players whose first season
# was not the one right after their draft year.
rookie_overrides = dict(
    griffbl01=2011,  # drafted 2009, rookie season 2010-11
    embiijo01=2016,  # drafted 2014, rookie season 2015-16
)

In [None]:
def load_rookie_gamelog(player_id, season, timeout=30):
    """Download a player's regular-season game log for one season.

    Parameters
    ----------
    player_id : str
        basketball-reference player id; its first character selects the
        URL directory.
    season : int
        Season used in the game-log URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the scraping loop.

    Returns
    -------
    pandas.DataFrame
        Rows of the ``player_game_log_reg`` table that have an opponent and
        for which ``mp_to_minutes`` produced a non-missing ``MP`` value.

    Raises
    ------
    requests.HTTPError
        On an error status code (e.g. 429 Too Many Requests).
    ValueError
        If the page contains no table with id ``player_game_log_reg``.
    """
    first_letter = player_id[0]
    url = f"https://www.basketball-reference.com/players/{first_letter}/{player_id}/gamelog/{season}"

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    # Passing literal HTML to read_html is deprecated; wrap it in a file-like
    # object instead.
    dfs = pd.read_html(io.StringIO(resp.text), attrs={"id": "player_game_log_reg"})
    if not dfs:
        raise ValueError("No tables found")

    df = dfs[0]

    # Keep only rows with an opponent, then convert MP with the project
    # helper and drop rows for which it returned a missing value.
    played = df[df["Opp"].notna()].copy()
    played["MP"] = played["MP"].apply(mp_to_minutes)
    played = played[played["MP"].notna()].copy()

    return played


In [None]:
def insert_gamelog_into_db(player_id, rookie_season):
    """Scrape one rookie game log and store it in the `rookie_games` table.

    Any rows already stored for this (player_id, rookie_season) are replaced,
    so calling the function again does not duplicate games. The delete and
    insert run in one transaction and are rolled back together on failure.

    Raises
    ------
    KeyError
        If the scraped table lacks a column required by `rookie_games`.
    """
    gamelog = load_rookie_gamelog(player_id, rookie_season)

    # Normalize column names to the table schema
    gamelog = gamelog.rename(columns={
        "Date": "game_date",
        "3P": "TP",
        "3PA": "TPA"
    })

    gamelog["player_id"] = player_id
    gamelog["rookie_season"] = rookie_season
    # Games are numbered in the order they appear in the scraped table.
    gamelog["game_no"] = np.arange(1, len(gamelog) + 1)

    cols = ["player_id","rookie_season","game_no","game_date","MP","PTS","TRB","AST","STL","BLK","TOV","FG","FGA","TP","TPA","FT","FTA"]
    missing = [c for c in cols if c not in gamelog.columns]
    if missing:
        raise KeyError(
            f"Game log for {player_id} ({rookie_season}) is missing columns: {missing}"
        )
    gamelog = gamelog[cols]

    # Name the columns explicitly so the insert does not rely on positional
    # agreement between the DataFrame and the table definition.
    column_list = ", ".join(f'"{c}"' for c in cols)

    # Expose the frame to DuckDB under an explicit view name.
    con.register("_gamelog_staging", gamelog)
    try:
        con.execute("BEGIN TRANSACTION")
        try:
            con.execute(
                "DELETE FROM rookie_games WHERE player_id = ? AND rookie_season = ?",
                [player_id, int(rookie_season)],
            )
            con.execute(
                f"INSERT INTO rookie_games ({column_list}) "
                f"SELECT {column_list} FROM _gamelog_staging"
            )
            con.execute("COMMIT")
        except Exception:
            con.execute("ROLLBACK")
            raise
    finally:
        con.unregister("_gamelog_staging")


In [8]:
# calculate rate stats
def add_rate_stats(agg, minutes_base=36):
    """Scale aggregated totals to a per-`minutes_base`-minutes rate.

    Parameters
    ----------
    agg : dict
        Aggregated totals; must contain ``"minutes_used"``. Every other value
        is multiplied by ``minutes_base / minutes_used``.
    minutes_base : number, optional
        Minutes the rates are normalized to (default 36).

    Returns
    -------
    dict
        One ``"<key>_per_<minutes_base>"`` entry for every key of `agg`
        except ``"minutes_used"``. When ``minutes_used`` is zero or missing
        the rates are NaN instead of raising ZeroDivisionError.
    """
    minutes_used = agg["minutes_used"]
    # No minutes played means no meaningful rate; use NaN rather than crash.
    factor = minutes_base / minutes_used if minutes_used else float("nan")
    return {
        f"{k}_per_{minutes_base}": v * factor
        for k, v in agg.items()
        if k != "minutes_used"
    }

In [9]:
# Reload the CSV snapshots the remaining cells work from: the capped rookie
# aggregates, the log of failed scrapes, and the combined draft classes.
rookie_caps_900 = pd.read_csv(filepath_or_buffer="assets/rookie_caps_900.csv")
errors_df = pd.read_csv(filepath_or_buffer="assets/rookie_errors.csv")
draft_classes = pd.read_csv(filepath_or_buffer="assets/draft_classes.csv")

In [None]:
# function to scrape player bio data
# store height (inches), weight (lbs), position
def _text_after_label(soup, label):
    """Return the stripped text following ``<strong>label</strong>``, or None."""
    tag = soup.find("strong", string=label)
    if tag is None or tag.next_sibling is None:
        return None
    sibling = tag.next_sibling
    # The sibling is normally a text node; fall back to get_text() for a tag.
    text = sibling if isinstance(sibling, str) else sibling.get_text()
    return text.strip()


def scrape_player_bio(player_id):
    """Scrape position, height and weight from a player's page.

    Returns
    -------
    dict
        Keys ``player_id``, ``height_in`` (inches), ``weight_lb`` (pounds)
        and ``position``. A field that cannot be found or parsed is None.
        If the page cannot be fetched at all (network error or HTTP error
        status) every field except ``player_id`` is None and the reason is
        printed; no exception is raised for a failed fetch.
    """
    first_letter = player_id[0]
    url = f"https://www.basketball-reference.com/players/{first_letter}/{player_id}.html"

    bio = {
        "player_id": player_id,
        "height_in": None,
        "weight_lb": None,
        "position": None,
    }

    try:
        resp = requests.get(url, timeout=10)
        # Without this an error page (e.g. 429) would be parsed as an
        # ordinary page with no bio fields.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")
    except Exception as e:
        print(f"Bio scrape failed for {player_id}: {e}")
        return bio

    # Each field is parsed on its own so one malformed value does not
    # discard the others.

    # Position: text up to the first U+25AA bullet separator.
    pos_text = _text_after_label(soup, "Position")
    if pos_text:
        bio["position"] = pos_text.split("\u25aa")[0].strip() or None

    # Height: "<feet>-<inches>", possibly followed by other text.
    height_text = _text_after_label(soup, "Height")
    if height_text:
        m = re.search(r"(\d+)-(\d+)", height_text)
        if m:
            bio["height_in"] = int(m.group(1)) * 12 + int(m.group(2))

    # Weight: "<pounds>lb", possibly followed by other text.
    weight_text = _text_after_label(soup, "Weight")
    if weight_text:
        m = re.search(r"(\d+)\s*lb", weight_text)
        if m:
            bio["weight_lb"] = int(m.group(1))

    return bio


In [None]:
# Distinct, non-null player ids across every draft class, in first-seen order.
_player_ids = draft_classes["player_id"]
unique_players = _player_ids[_player_ids.notna()].unique()
len(unique_players)

In [None]:
# load existing bios
# avoids redundant scraping
bios_path = "assets/player_bios.csv"
bio_fields = ["height_in", "weight_lb", "position"]

if os.path.exists(bios_path):
    player_bios = pd.read_csv(bios_path)
    # scrape_player_bio returns a row with every bio field empty when the
    # request fails. Drop those rows so such players are scraped again
    # instead of being treated as done forever.
    player_bios = (
        player_bios
        .dropna(subset=bio_fields, how="all")
        .reset_index(drop=True)
    )
    done_ids = set(player_bios.player_id)
else:
    player_bios = pd.DataFrame(columns=["player_id"] + bio_fields)
    done_ids = set()


In [None]:
# Players from the draft classes that have no stored bio yet.
todo_bios = list(filter(lambda pid: pid not in done_ids, unique_players))
len(todo_bios)

In [None]:
# scrape missing bios with checkpointing
# saves progress every 25 players to avoid data loss, just like the gamelog scraping
BIO_CHECKPOINT_EVERY = 25   # write the CSV after this many players
BIO_COOLDOWN_S = 10         # longer pause after each checkpoint
BIO_REQUEST_DELAY_S = 3     # pause between requests to stay under the site's rate limit


def _save_bios(existing, batch):
    """Append `batch` (list of bio dicts) to `existing`, write the CSV, return the result."""
    combined = pd.concat([existing, pd.DataFrame(batch)], ignore_index=True)
    # keep="last": a freshly scraped row replaces any older row for the player.
    combined = combined.drop_duplicates(subset=["player_id"], keep="last")
    combined.to_csv(bios_path, index=False)
    return combined


new_bios = []

for i, pid in enumerate(todo_bios, 1):
    new_bios.append(scrape_player_bio(pid))

    if i % BIO_CHECKPOINT_EVERY == 0:
        print(f"Scraped {i}/{len(todo_bios)} bios \u2014 saving checkpoint")
        player_bios = _save_bios(player_bios, new_bios)
        new_bios = []
        time.sleep(BIO_COOLDOWN_S)  # cooldown
    else:
        time.sleep(BIO_REQUEST_DELAY_S)

# final save of whatever is left after the last full checkpoint
if new_bios:
    player_bios = _save_bios(player_bios, new_bios)
    new_bios = []


In [None]:
# merge bios into rookie data
# One row per player so the left merge can never multiply rookie rows.
bios = (
    pd.read_csv("assets/player_bios.csv")
    .drop_duplicates(subset=["player_id"], keep="last")
)
bio_cols = [c for c in bios.columns if c != "player_id"]

# rookie_caps_900 may already carry the bio columns (the cell was run before,
# or the CSV was saved after an earlier merge). Drop them first so the merge
# refreshes them instead of producing *_x / *_y duplicates.
rookie_caps_900 = (
    rookie_caps_900
    .drop(columns=bio_cols, errors="ignore")
    .merge(bios, on="player_id", how="left")
)


In [10]:
# (player_id, rookie_season) pairs that already have a row in rookie_caps_900.
done_keys = {
    (pid, season)
    for pid, season in zip(rookie_caps_900.player_id, rookie_caps_900.rookie_season)
}

In [11]:
# (player_id, rookie_season) pairs whose logged error does not mention "429";
# rows with a missing error message are included.
_not_rate_limited = ~errors_df["error"].str.contains("429", na=False)
_permanent_failures = errors_df.loc[_not_rate_limited, ["player_id", "rookie_season"]]
failed_keys = set(
    zip(_permanent_failures["player_id"], _permanent_failures["rookie_season"])
)

In [12]:
# Tag every draft row with its (player_id, rookie_season) key and whether that
# key is already done or failed with a non-429 error.
_key_pairs = list(zip(draft_classes.player_id, draft_classes.rookie_season))
draft_classes["_key"] = _key_pairs

draft_classes["_done"] = draft_classes["_key"].isin(done_keys)
draft_classes["_failed"] = draft_classes["_key"].isin(failed_keys)

# Still to scrape: neither done nor failed.
_pending = ~(draft_classes["_done"] | draft_classes["_failed"])
todo = draft_classes[_pending].copy()

len(todo), todo.head()

(1231,
     draft_year           player  player_id  rookie_season               _key  \
 19        2000   Speedy Claxton  claxtsp01           2001  (claxtsp01, 2001)   
 26        2000    Primož Brezec  brezepr01           2001  (brezepr01, 2001)   
 29        2000      Marko Jarić  jaricma01           2001  (jaricma01, 2001)   
 40        2000  Chris Carrawell  carrach01           2001  (carrach01, 2001)   
 45        2000  DeeAndre Hulett  huletde01           2001  (huletde01, 2001)   
 
     _done  _failed  
 19  False    False  
 26  False    False  
 29  False    False  
 40  False    False  
 45  False    False  )

In [None]:
# Rows whose logged error mentions "429", joined (left) with their draft-class
# record so they can be attempted again.
_rate_limited = errors_df["error"].str.contains("429", na=False)
retry = pd.merge(
    errors_df[_rate_limited],
    draft_classes,
    on=["player_id", "rookie_season"],
    how="left",
)

In [None]:
new_results = []
new_errors = []
CHECKPOINT_EVERY = 25
RATE_LIMIT_COOLDOWN_S = 600  # pause after an HTTP 429 before the next request

# Columns `retry` carries over from errors_df and from the helper flags on
# draft_classes. They describe an earlier failed attempt, not the player, so
# they are kept out of the result rows.
BOOKKEEPING_COLS = {"error", "_key", "_done", "_failed"}

# NOTE(review): aggregate_capped_minutes is not defined in the cells shown
# here; it must already be defined or imported before this cell runs.

for i, (_, row) in enumerate(retry.iterrows(), 1):
    rate_limited = False

    try:
        gamelog = load_rookie_gamelog(row.player_id, row.rookie_season)
        agg = aggregate_capped_minutes(gamelog, cap_minutes=900)
    except Exception as e:
        # Record every failure so the flush cell can persist it. One bad
        # player must not abort the run, and a 429 stays retryable because
        # its message contains "429".
        status = getattr(getattr(e, "response", None), "status_code", None)
        rate_limited = isinstance(e, requests.HTTPError) and (
            status == 429 or "429" in str(e)
        )
        new_errors.append({
            "player_id": row.player_id,
            "rookie_season": row.rookie_season,
            "error": str(e),
        })
    else:
        player_info = {
            k: v for k, v in row.to_dict().items() if k not in BOOKKEEPING_COLS
        }
        new_results.append({**player_info, **agg})

    # Checkpoint on the counter alone so a failure on the Nth player does not
    # skip the save of the successes buffered so far.
    if i % CHECKPOINT_EVERY == 0 and new_results:
        print(f"Processed {i}/{len(retry)} \u2014 checkpointing")

        rookie_caps_900 = (
            pd.concat([rookie_caps_900, pd.DataFrame(new_results)], ignore_index=True)
            .sort_values("minutes_used", ascending=False)
            .drop_duplicates(subset=["player_id", "rookie_season"], keep="first")
            .reset_index(drop=True)
        )

        rookie_caps_900.to_csv("assets/rookie_caps_900.csv", index=False)

        new_results = []  # clear buffer

    if rate_limited:
        print("Hit 429 \u2014 sleeping 10 minutes")
        time.sleep(RATE_LIMIT_COOLDOWN_S)
    else:
        time.sleep(random.uniform(25, 40))


In [14]:
# Sizes of the not-yet-flushed result and error buffers.
tuple(map(len, (new_results, new_errors)))

(0, 1231)

In [15]:
# Most frequent error messages among the newly collected errors.
new_errors_df = pd.DataFrame(new_errors)
# An empty error list yields a frame without columns; add the column so the
# summary shows an empty count instead of raising KeyError.
if "error" not in new_errors_df.columns:
    new_errors_df["error"] = pd.Series(dtype="object")
new_errors_df["error"].value_counts().head(10)

error
HTTP Error 429: Too Many Requests    1216
No tables found                        15
Name: count, dtype: int64

In [13]:
# Flush in-memory progress safely

if new_results:
    rookie_caps_900 = (
        pd.concat([rookie_caps_900, pd.DataFrame(new_results)], ignore_index=True)
        .sort_values("minutes_used", ascending=False)
        .drop_duplicates(subset=["player_id", "rookie_season"], keep="first")
        .reset_index(drop=True)
    )

    rookie_caps_900.to_csv("assets/rookie_caps_900.csv", index=False)
    print(f"Saved {len(new_results)} new rows to rookie_caps_900")

    new_results = []

errors_changed = False

if new_errors:
    errors_df = (
        pd.concat([errors_df, pd.DataFrame(new_errors)], ignore_index=True)
        # keep="last": the most recent attempt replaces a stale entry, so a
        # player that first hit a 429 and later failed for another reason is
        # not retried forever.
        .drop_duplicates(subset=["player_id", "rookie_season"], keep="last")
        .reset_index(drop=True)
    )
    errors_changed = True

# A player with a row in rookie_caps_900 has been scraped successfully; drop
# any leftover error entry so it is not picked up for another retry.
resolved_keys = set(zip(rookie_caps_900.player_id, rookie_caps_900.rookie_season))
error_keys = pd.Series(
    list(zip(errors_df.player_id, errors_df.rookie_season)),
    index=errors_df.index,
    dtype="object",
)
is_resolved = error_keys.isin(resolved_keys)
if is_resolved.any():
    errors_df = errors_df[~is_resolved].reset_index(drop=True)
    errors_changed = True

if errors_changed:
    errors_df.to_csv("assets/rookie_errors.csv", index=False)

if new_errors:
    print(f"Saved {len(new_errors)} new errors")

    new_errors = []


Saved 1231 new errors


In [None]:
# players missing minutes_total
# backfill from scratch
# If the column does not exist yet, every row needs backfilling; otherwise
# only the rows where it is missing. (The previous mask combined a per-row
# Series with a per-column array, which cannot align.)
if "minutes_total" in rookie_caps_900.columns:
    needs_backfill = rookie_caps_900[rookie_caps_900["minutes_total"].isna()]
else:
    needs_backfill = rookie_caps_900

backfill_results = []

# NOTE(review): aggregate_capped_minutes is not defined in the cells shown
# here; it must already be defined or imported before this cell runs.
for _idx, row in needs_backfill.iterrows():
    try:
        gamelog = load_rookie_gamelog(row.player_id, row.rookie_season)
        agg = aggregate_capped_minutes(gamelog, cap_minutes=900)

        backfill_results.append({
            "player_id": row.player_id,
            "rookie_season": row.rookie_season,
            "minutes_total": agg["minutes_total"],
            "hit_cap": agg["hit_cap"],
        })

    except Exception as e:
        print("Backfill failed:", row.player_id, row.rookie_season, e)

    # Pause after every attempt, failed or not, so a run of errors does not
    # turn into back-to-back requests.
    time.sleep(random.uniform(12,18))


In [None]:
# merge backfilled data
# This only updates the in-memory table; the CSV on disk is not rewritten here,
# so the result can be checked before it is saved.
BACKFILL_KEYS = ["player_id", "rookie_season"]
BACKFILL_COLS = ["minutes_total", "hit_cap"]

backfill_df = pd.DataFrame(backfill_results)

# With nothing backfilled the frame has no columns and there is nothing to
# merge; leave rookie_caps_900 untouched.
if not backfill_df.empty:
    updates = (
        backfill_df[BACKFILL_KEYS + BACKFILL_COLS]
        # one update per player-season so the merge cannot multiply rows
        .drop_duplicates(subset=BACKFILL_KEYS, keep="last")
        # rename up front: merge suffixes only apply to colliding names, so
        # they would not create *_new when the target column is absent
        .rename(columns={c: f"{c}_new" for c in BACKFILL_COLS})
    )

    merged = rookie_caps_900.merge(updates, on=BACKFILL_KEYS, how="left")

    for col in BACKFILL_COLS:
        new_col = f"{col}_new"
        if col in merged.columns:
            # existing values win; only gaps are filled
            merged[col] = merged[col].fillna(merged[new_col])
        else:
            merged[col] = merged[new_col]

    rookie_caps_900 = merged.drop(columns=[f"{c}_new" for c in BACKFILL_COLS])
