In [3]:
import datetime
import io
import os
import string
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# URL template for the per-letter player index page; {letter} is filled in by
# get_players_for_letter().
BASE_INDEX = "https://www.basketball-reference.com/players/{letter}/"
# URL template for a single player's game log for one season; {season_end} is
# the year value produced by last_n_season_ends().
BASE_GAMELOG = "https://www.basketball-reference.com/players/{letter}/{slug}/gamelog/{season_end}"
# Root directory for CSV output; save() creates one sub-folder per letter under it.
OUT_DIR = "./bbref_players_games_simple/"
# Headers attached to every request issued through the shared session below.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; simple-scraper/1.0)"}
# Single shared session used by all fetch functions in this notebook.
S = requests.Session()
S.headers.update(HEADERS)

In [5]:
def last_n_season_ends(n=3, today=None):
    """Return the `n` most recent season-end years, newest first.

    A date in January-June maps to a season ending in that calendar year; a
    date in July-December maps to a season ending in the following year
    (presumably mirroring the NBA season rollover - confirm if reused for
    another league).

    Args:
        n: Number of season-end years to return. Values <= 0 yield [].
        today: Optional `datetime.date` used as the reference date. Defaults
            to the current date; pass an explicit value for reproducible runs.

    Returns:
        List of ints in descending order, e.g. [2026, 2025, 2024].
    """
    if today is None:
        today = datetime.date.today()
    last_end = today.year if today.month < 7 else today.year + 1
    return [last_end - i for i in range(n)]

In [6]:
def get_players_for_letter(letter):
    """Scrape the player index page for one letter.

    Args:
        letter: Value substituted into BASE_INDEX (e.g. "u").

    Returns:
        A list of dicts with keys "name", "slug" and "year_max" (all strings).
        "year_max" is "" when the row has no such cell. Returns [] when the
        page has no table with id "players".

    Raises:
        requests.HTTPError: if the index page responds with an error status.
    """
    url = BASE_INDEX.format(letter=letter)
    r = S.get(url, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    table = soup.find("table", id="players")
    if table is None:
        return []

    players = []
    for tr in table.select("tbody > tr"):
        # Rows carrying the "thead" class are skipped rather than parsed.
        if "thead" in (tr.get("class") or []):
            continue
        th = tr.find("th")
        name_tag = th.find("a") if th is not None else tr.find("a")
        if not name_tag:
            continue
        href = name_tag.get("href", "")
        slug = href.rsplit("/", 1)[-1].replace(".html", "")
        if not slug:
            # Without a slug no game-log URL can be built for this row.
            continue
        name = name_tag.get_text(strip=True)
        # The cell may be absent; calling .get_text() on a fallback "" would
        # raise AttributeError, so test for the cell explicitly.
        year_cell = tr.find("td", {"data-stat": "year_max"})
        year_max = year_cell.get_text(strip=True) if year_cell is not None else ""
        players.append({"name": name, "slug": slug, "year_max": year_max})
    return players

In [7]:
def is_active(record, recent_end):
    """Decide whether a player record counts as active.

    Args:
        record: Mapping that may hold a string under "year_max".
        recent_end: Year to compare the record's final year against.

    Returns:
        True when "year_max" is missing/blank, equals "active" (any case), or
        parses to an integer >= recent_end. False when it cannot be parsed or
        compared, or is smaller than recent_end.
    """
    year_text = record.get("year_max", "").strip()
    # Blank or the literal word "active" both count as still playing.
    if year_text.lower() in ("", "active"):
        return True
    try:
        final_year = int(year_text)
        return final_year >= recent_end
    except Exception:
        return False

In [8]:
def fetch_player_gamelogs(letter, slug, season_ends, delay=3.5):
    """Download and combine one player's game logs for several seasons.

    For each season the game-log page is fetched, the table with the most
    rows is taken as the game log, rows with a missing first column and
    repeated in-body header rows are dropped, and a "SEASON_END" column is
    added. Seasons that fail to download or parse are reported and skipped.

    Args:
        letter: Letter segment of the player URL.
        slug: Player slug segment of the player URL.
        season_ends: Iterable of season-end years to fetch.
        delay: Seconds to pause after every request, successful or not.

    Returns:
        A single DataFrame with all seasons concatenated, or None when no
        season produced a table.
    """
    frames = []
    for s in season_ends:
        url = BASE_GAMELOG.format(letter=letter, slug=slug, season_end=s)
        try:
            r = S.get(url, timeout=15)
            r.raise_for_status()
            # Wrap in StringIO: passing literal HTML to read_html is deprecated.
            tables = pd.read_html(io.StringIO(r.text))
            if not tables:
                continue
            tbl = max(tables, key=lambda t: t.shape[0])
            if tbl.empty:
                continue
            first_col = tbl.iloc[:, 0]
            # Repeated header rows hold the column label as text, so an
            # isna() test alone does not remove them.
            is_header_repeat = first_col.astype(str) == str(tbl.columns[0])
            tbl = tbl.loc[first_col.notna() & ~is_header_repeat].copy()
            tbl["SEASON_END"] = s
            frames.append(tbl)
        except Exception as exc:
            # Best effort: a missing page or parse failure skips this season,
            # but leave a trace so failures are not invisible.
            print(f"skipping {slug} season {s}: {exc}")
            continue
        finally:
            # Pause after every request, including failures, so error
            # responses do not lead to back-to-back requests.
            time.sleep(delay)
    if frames:
        return pd.concat(frames, ignore_index=True, sort=False)
    return None

In [9]:
def save(df, slug, name, letter):
    """Write a player's DataFrame to a CSV file under OUT_DIR/<letter>/.

    The file is named "<slug>_<sanitised name>_last3.csv", where the
    sanitised name has spaces turned into underscores and every character
    other than alphanumerics, "_" and "-" removed. The target folder is
    created if needed and the final path is printed.
    """
    folder = os.path.join(OUT_DIR, f"{letter}/")
    os.makedirs(folder, exist_ok=True)

    kept_chars = []
    for ch in name.replace(" ", "_"):
        if ch.isalnum() or ch in ("_", "-"):
            kept_chars.append(ch)
    name_safe = "".join(kept_chars)

    target = os.path.join(folder, f"{slug}_{name_safe}_last3.csv")
    df.to_csv(target, index=False)
    print("saved:", target)

In [10]:
def run(letters, seasons_n=3):
    """Scrape and save game logs for every active player under each letter.

    Args:
        letters: Iterable of index letters to process.
        seasons_n: How many season-end years to request per player.

    For each letter the player index is fetched, players are filtered with
    is_active() against the newest season-end year, and each remaining
    player's game logs are fetched and written with save(). A 3.5 second
    pause precedes each index request and follows each player.
    """
    seasons = last_n_season_ends(seasons_n)
    newest_end = seasons[0]
    print("Seasons (end):", seasons)

    for letter in letters:
        time.sleep(3.5)
        roster = get_players_for_letter(letter)

        # Filter the whole roster up front, before any game log is requested.
        current_players = []
        for candidate in roster:
            if is_active(candidate, newest_end):
                current_players.append(candidate)

        for player in current_players:
            print("fetching", player["name"])
            games = fetch_player_gamelogs(letter, player["slug"], seasons)
            if games is None or games.empty:
                print("no data for", player["name"])
            else:
                games["PLAYER_NAME"] = player["name"]
                save(games, player["slug"], player["name"], letter)
            time.sleep(3.5)  # polite pause between players

In [18]:
if __name__ == "__main__":
    # Smoke test on a single letter first. run() iterates over `letters`, so a
    # wider scrape can pass more letters (e.g. string.ascii_lowercase[4:]);
    # adjust seasons_n to change how many seasons are fetched per player.
    run(letters=("u",), seasons_n=3)

Seasons (end): [2026, 2025, 2024]
