In [1]:
import os
import time
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

# Seasons to scrape; each value is the number used in the
# ".../leagues/NBA_<season>_games.html" URLs built further down.
SEASONS = [*range(2024, 2026)]

# On-disk layout: everything lives under DATA_DIR, with one sub-folder for
# the monthly schedule pages and one for the individual box-score pages.
DATA_DIR = "data"
STANDINGS_DIR, SCORES_DIR = (
    os.path.join(DATA_DIR, leaf) for leaf in ("standings", "scores")
)

# Create both output folders up front; exist_ok makes re-running harmless.
for _output_dir in (STANDINGS_DIR, SCORES_DIR):
    os.makedirs(_output_dir, exist_ok=True)

def get_html(url, selector, sleep=5, retries=3):
    """Return the inner HTML of ``selector`` on ``url``, or ``None`` on failure.

    The page is loaded in a Firefox instance launched through Playwright
    (default launch options). Up to ``retries`` attempts are made; before
    attempt ``i`` (1-based) the function sleeps ``sleep * i`` seconds, so the
    delay grows linearly and also applies to the very first attempt.

    Args:
        url: Address of the page to load.
        selector: CSS selector of the element whose inner HTML is wanted.
        sleep: Base delay, in seconds, used to space out attempts.
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML as a string, or ``None`` if every attempt
        raised an exception (or if ``retries`` is smaller than 1).
    """

    def fetch():
        """Load the page once and return the selected element's inner HTML."""
        with sync_playwright() as p:
            browser = p.firefox.launch()
            try:
                page = browser.new_page()
                page.goto(url)
                print(page.title())
                return page.inner_html(selector)
            finally:
                # Close the browser even when navigation or the selector
                # lookup raises, so failed attempts do not leave it open.
                browser.close()

    html = None
    for attempt in range(1, retries + 1):
        time.sleep(sleep * attempt)
        try:
            # NOTE(review): the fetch runs in a one-off worker thread --
            # presumably so Playwright's sync API stays off the calling
            # thread's event loop (e.g. in a notebook); confirm before
            # simplifying this to a direct call.
            with ThreadPoolExecutor(max_workers=1) as executor:
                html = executor.submit(fetch).result()
        except Exception as e:
            # Best-effort scraping: report the failure and try again.
            print(f"Error fetching {url}: {e}")
            continue
        break
    return html

def scrape_season(season):
    """Save every monthly schedule page of ``season`` into STANDINGS_DIR.

    The season overview page is fetched first and the links inside its
    "#content .filter" element are collected. Each linked page is then
    downloaded (its "#all_schedule" element only) and written to a file named
    after the last path segment of its URL. Pages whose file already exists
    are skipped, and every outcome is reported on stdout.

    Args:
        season: Season identifier interpolated into the overview URL.

    Returns:
        None. Returns early if the overview page could not be retrieved.
    """
    overview_markup = get_html(
        f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html",
        "#content .filter",
    )
    if not overview_markup:
        print(f"Failed to retrieve data for season {season}")
        return

    # Gather the absolute URL of every anchor that actually carries an href.
    month_urls = []
    for anchor in BeautifulSoup(overview_markup, "html.parser").find_all("a"):
        if 'href' in anchor.attrs:
            month_urls.append(f"https://www.basketball-reference.com{anchor['href']}")

    for month_url in month_urls:
        target = os.path.join(STANDINGS_DIR, month_url.rsplit("/", 1)[-1])
        if os.path.exists(target):
            print(f"File already exists: {target}, skipping...")
            continue

        month_markup = get_html(month_url, "#all_schedule")
        if not month_markup:
            print(f"Failed to retrieve: {month_url}")
            continue

        with open(target, "w+", encoding="utf-8") as out:
            out.write(month_markup)
        print(f"Saved: {target}")

# Download the monthly schedule pages for every configured season.
for season in SEASONS:
    scrape_season(season)

# Snapshot of the file names now present in STANDINGS_DIR; the box-score
# pass further down filters this list by season.
standings_files = os.listdir(STANDINGS_DIR)

def scrape_game(standings_file):
    """Download every box-score page linked from one saved schedule file.

    The schedule HTML is read from ``standings_file`` and scanned for anchors
    whose href contains both "boxscore" and ".html". Each such page is
    fetched (its "#content" element only) and stored in SCORES_DIR under the
    last path segment of its URL. Existing files and failed downloads are
    skipped without any message from this function.

    Args:
        standings_file: Path of a schedule page saved on disk.
    """
    with open(standings_file, 'r', encoding="utf-8") as src:
        schedule_soup = BeautifulSoup(src.read(), "html.parser")

    box_scores = []
    for anchor in schedule_soup.find_all("a"):
        href = anchor.get('href')
        if not href or "boxscore" not in href or '.html' not in href:
            continue
        box_scores.append(f"https://www.basketball-reference.com{href}")

    for url in box_scores:
        save_path = os.path.join(SCORES_DIR, url.rsplit("/", 1)[-1])
        if os.path.exists(save_path):
            continue

        page_markup = get_html(url, "#content")
        if page_markup:
            with open(save_path, "w+", encoding="utf-8") as out:
                out.write(page_markup)

# For each season, walk its saved schedule pages and fetch the box scores
# they link to. A file is attributed to a season when the season number
# appears anywhere in its file name.
for season in SEASONS:
    files = [s for s in standings_files if str(season) in s]
    for f in files:
        filepath = os.path.join(STANDINGS_DIR, f)
        scrape_game(filepath)

# Fetching game statistics
import os
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Output folder for the aggregated per-game statistics.
SCORE_DIR = "data/scores"
os.makedirs(SCORE_DIR, exist_ok=True)

BASE_URL = "https://www.basketball-reference.com"
SEASON = "2025"
SCHEDULE_URL = f"{BASE_URL}/leagues/NBA_{SEASON}_games.html"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}

# Upper bound, in seconds, on how long a single request may take. Without a
# timeout requests.get() can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30


def _download(url):
    """GET ``url``; return the response, or ``None`` after reporting a failure.

    A failure is either a transport-level error (connection problem, timeout)
    or any HTTP status other than 200, so callers never parse an error page
    as if it were the requested content.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
        return None
    if resp.status_code != 200:
        print(f"Failed to download {url} (HTTP {resp.status_code})")
        return None
    return resp


# Get all monthly schedule URLs
response = _download(SCHEDULE_URL)
soup = BeautifulSoup(response.text if response is not None else "", "html.parser")
# "a[href]" skips anchors without an href, which would otherwise raise KeyError.
month_links = [a["href"] for a in soup.select("div.filter a[href]")]

# Collect all game URLs
game_urls = []
for month_link in month_links:
    month_url = BASE_URL + month_link
    response = _download(month_url)
    if response is not None:
        soup = BeautifulSoup(response.text, "html.parser")
        game_urls += [
            BASE_URL + a["href"]
            for a in soup.select("td[data-stat='box_score_text'] a[href]")
        ]
    time.sleep(2)  # Prevent getting blocked

print(f"Found {len(game_urls)} game box scores")

# Local import: pd.read_html() expects a file-like object, and passing
# literal HTML text is deprecated in recent pandas releases.
from io import StringIO


def _read_table(html_text, table_id, **kwargs):
    """Parse the HTML table whose id is ``table_id`` into a DataFrame.

    Extra keyword arguments are forwarded to ``pd.read_html``. Raises whatever
    ``pd.read_html`` raises when no matching table is found.
    """
    frame = pd.read_html(StringIO(html_text), attrs={"id": table_id}, **kwargs)[0]
    # A table with stacked header rows comes back with MultiIndex columns, on
    # which the .str accessor is unavailable; keep only the innermost level
    # so that column labels are plain strings.
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = frame.columns.get_level_values(-1)
    return frame


def _team_summary(html_text, team):
    """Return one team's totals plus per-column player maxima as a Series.

    Totals come from the last row of the team's basic and advanced box-score
    tables; maxima are taken over all preceding rows and labelled with a
    "_max" suffix. All labels are lower-cased.
    """
    basic = _read_table(html_text, f"box-{team}-game-basic", index_col=0)
    advanced = _read_table(html_text, f"box-{team}-game-advanced", index_col=0)
    # Turn any non-numeric cell into NaN so that max() compares numbers
    # rather than strings.
    basic = basic.apply(pd.to_numeric, errors="coerce")
    advanced = advanced.apply(pd.to_numeric, errors="coerce")

    totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
    totals.index = totals.index.str.lower()
    maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])
    maxes.index = maxes.index.str.lower() + "_max"
    return pd.concat([totals, maxes])


def _assemble_game(summaries, line_score, url):
    """Build the two-row frame for one game: own stats next to opponent stats.

    Each row describes one team; the other team's columns are appended with an
    "_opp" suffix. The game date is parsed from the first eight characters of
    the last URL segment (YYYYMMDD).
    """
    summary = pd.concat(summaries, axis=1).T
    game = pd.concat([summary, line_score], axis=1)
    # NOTE(review): assumes the line score lists the visiting team first and
    # the home team second -- confirm against a live page.
    game["home"] = [0, 1]
    # reset_index() without drop=True also emits an "index" column (hence
    # "index_opp" in the output); kept so the CSV schema stays unchanged.
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"
    full_game = pd.concat([game, game_opp], axis=1)
    full_game["season"] = SEASON
    full_game["date"] = pd.to_datetime(url.split("/")[-1][:8], format="%Y%m%d")
    full_game["won"] = full_game["total"] > full_game["total_opp"]
    return full_game


games = []
# Column selection fixed by the first successfully parsed team so that every
# later summary has the same layout.
base_cols = None

for url in game_urls:
    time.sleep(2)  # Prevent getting blocked
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to download {url}: {e}")
        continue
    if response.status_code != 200:
        print(f"Failed to download {url}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    # Serialise once; every table lookup below parses this same text.
    html_text = str(soup)

    # NOTE(review): if the line-score table is delivered inside an HTML
    # comment, read_html will not see it in a plain HTTP response -- confirm
    # against a live page if every game is reported as skipped.
    try:
        line_score = _read_table(html_text, "line_score")
        cols = list(line_score.columns)
        cols[0] = "team"
        cols[-1] = "total"
        line_score.columns = cols
        line_score = line_score[["team", "total"]]
    except Exception as e:
        print(f"Skipping {url} due to missing line score: {e}")
        continue

    teams = list(line_score["team"])
    if len(teams) != 2:
        # The home/away flags and the opponent mirroring need exactly two rows.
        print(f"Skipping {url}: expected 2 teams in line score, found {len(teams)}")
        continue

    summaries = []
    for team in teams:
        try:
            summary = _team_summary(html_text, team)
            if base_cols is None:
                base_cols = list(summary.index.drop_duplicates(keep="first"))
                base_cols = [b for b in base_cols if "bpm" not in b]
            summary = summary[base_cols]
        except Exception as e:
            print(f"Skipping {url} due to missing stats for {team}: {e}")
            break
        summaries.append(summary)

    # A game is only usable when both teams parsed; a partial game would be
    # padded with NaN rows and still receive home/won flags.
    if len(summaries) != len(teams):
        continue

    try:
        full_game = _assemble_game(summaries, line_score, url)
    except Exception as e:
        # Keep the games collected so far instead of aborting the whole run.
        print(f"Skipping {url} due to malformed game data: {e}")
        continue

    games.append(full_game)
    # Report progress only when the count actually advanced, so the message
    # is not repeated while the count is stuck at 0 or at a multiple of 100.
    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(game_urls)} games processed")

if not games:
    print("No valid games were processed!")
else:
    games_df = pd.concat(games, ignore_index=True)
    csv_path = os.path.join(SCORE_DIR, "games_data.csv")
    games_df.to_csv(csv_path, index=False)
    print(f"DataFrame successfully saved to {csv_path}!")


2023-24 NBA Schedule | Basketball-Reference.com
2023-24 NBA Schedule | Basketball-Reference.com
Saved: data/standings/NBA_2024_games-october.html
2023-24 NBA Schedule | Basketball-Reference.com
Saved: data/standings/NBA_2024_games-november.html
2023-24 NBA Schedule | Basketball-Reference.com
Saved: data/standings/NBA_2024_games-december.html
2023-24 NBA Schedule | Basketball-Reference.com
Saved: data/standings/NBA_2024_games-january.html
2023-24 NBA Schedule | Basketball-Reference.com
Saved: data/standings/NBA_2024_games-february.html
2023-24 NBA Schedule | Basketball-Reference.com
Saved: data/standings/NBA_2024_games-march.html
2023-24 NBA Schedule | Basketball-Reference.com
Saved: data/standings/NBA_2024_games-april.html
2023-24 NBA Schedule | Basketball-Reference.com
Saved: data/standings/NBA_2024_games-may.html
2023-24 NBA Schedule | Basketball-Reference.com
Saved: data/standings/NBA_2024_games-june.html
2024-25 NBA Schedule | Basketball-Reference.com
2024-25 NBA Schedule | Basketb

KeyboardInterrupt: 