In [1]:
import os
import random
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## **MVP Record (1991-2025)**

In [2]:
def scrape_mvp_awards(start_year=1990, end_year=2027, sleep=True):
    """Download and parse the MVP voting table for a range of seasons.

    Each season's awards page is fetched from basketball-reference.com and
    saved to ``mvp/{year}.html``; the saved pages are then parsed and the
    element with id ``mvp`` is converted to a DataFrame.

    Args:
        start_year: First year to scrape (inclusive).
        end_year: Year to stop at (exclusive, as in ``range``).
        sleep: When true, wait a random 2.5-5.0 seconds before each request.

    Returns:
        A single DataFrame with the per-year tables stacked, plus a ``Year``
        column holding the year each row came from.

    Raises:
        ValueError: If no year produced a parsable MVP table.
    """
    years = list(range(start_year, end_year))
    url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
    }

    # The cache directory must exist before the first page is written.
    os.makedirs("mvp", exist_ok=True)

    # STEP 1 — DOWNLOAD HTML FILES
    for year in years:
        if sleep:
            time.sleep(random.uniform(2.5, 5.0))  # avoid rate limits

        url = url_start.format(year)
        print(f"Downloading MVP page for {year}...")

        # A timeout keeps one stalled connection from hanging the whole run.
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print(f" {year} download failed ({exc}). Skipping.")
            continue

        # Do not overwrite a previously saved page with an error page.
        if not response.ok:
            print(f" {year} download failed (HTTP {response.status_code}). Skipping.")
            continue

        with open(f"mvp/{year}.html", "w", encoding="utf-8") as f:
            f.write(response.text)

    print("MVP HTML download complete.")

    # STEP 2 — PARSE MVP HTML FILES
    dfs = []
    for year in years:
        path = f"mvp/{year}.html"
        if not os.path.exists(path):
            print(f" No saved MVP page for {year}. Skipping.")
            continue

        with open(path, encoding="utf-8") as f:
            page = f.read()

        # A page saved by an earlier run may still be a rate-limit notice.
        if "Rate Limited Request" in page:
            print(f" {year} blocked (429). Skipping.")
            continue

        soup = BeautifulSoup(page, "html.parser")

        # Drop the first over-header row in the document so the table parses
        # with a single header level.
        over_header = soup.find("tr", class_="over_header")
        if over_header:
            over_header.decompose()

        table = soup.find(id="mvp")
        if table is None:
            print(f" No MVP table found for {year}")
            continue

        df = pd.read_html(StringIO(str(table)))[0]
        df["Year"] = year
        dfs.append(df)

    print("MVP parsing complete.")

    # pd.concat([]) raises an unhelpful "No objects to concatenate".
    if not dfs:
        raise ValueError("No MVP tables were parsed; nothing to concatenate.")

    return pd.concat(dfs, ignore_index=True)


Downloading 1990...
Downloading 1991...
Downloading 1992...
Downloading 1993...
Downloading 1994...
Downloading 1995...
Downloading 1996...
Downloading 1997...
Downloading 1998...
Downloading 1999...
Downloading 2000...
Downloading 2001...
Downloading 2002...
Downloading 2003...
Downloading 2004...
Downloading 2005...
Downloading 2006...
Downloading 2007...
Downloading 2008...
Downloading 2009...
Downloading 2010...
Downloading 2011...
Downloading 2012...
Downloading 2013...
Downloading 2014...
Downloading 2015...
Downloading 2016...
Downloading 2017...
Downloading 2018...
Downloading 2019...
Downloading 2020...
Downloading 2021...
Downloading 2022...
Downloading 2023...
Downloading 2024...
Downloading 2025...
Downloading 2026...
Download complete.
⚠️ No MVP table found for year 2026
Parsing complete.


In [None]:
# Scrape MVP voting (not team standings) for the seasons listed in `years`.
years = list(range(1990, 2027))
# scrape_mvp_awards takes an exclusive end year, hence the + 1.
mvp_df = scrape_mvp_awards(start_year=years[0], end_year=years[-1] + 1)

print(mvp_df.head())
print(mvp_df.shape)

In [8]:
# Persist the MVP table to disk with pandas' default CSV settings.
mvp_df.to_csv(path_or_buf="mvps.csv")

-------------------------------------------------------------------

## **Players Record (1991-2025)**

In [None]:
def scrape_player_stats(driver, start_year=1990, end_year=2027, sleep=True):
    """Download and parse the per-game player stats table for each season.

    Each season's page is loaded through ``driver`` and its rendered source
    is saved to ``player/{year}.html``; the saved pages are then parsed and
    the element with id ``per_game_stats`` is converted to a DataFrame.

    Args:
        driver: Browser driver exposing ``get(url)``, ``execute_script(js)``
            and ``page_source`` (presumably a Selenium WebDriver).
        start_year: First year to scrape (inclusive).
        end_year: Year to stop at (exclusive, as in ``range``).
        sleep: When true, wait a random 2.5-4.0 seconds after loading each
            page before reading its source.

    Returns:
        A single DataFrame with the per-year tables stacked, plus a ``Year``
        column holding the year each row came from.

    Raises:
        ValueError: If no year produced a parsable player stats table.
    """
    years = list(range(start_year, end_year))
    url_template = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

    # The cache directory must exist before the first page is written.
    os.makedirs("player", exist_ok=True)

    # STEP 1 — Download HTML using Selenium
    for year in years:
        url = url_template.format(year)
        print(f"Downloading player stats for {year}: {url}")

        driver.get(url)
        # Scroll down before reading the page source.
        driver.execute_script("window.scrollTo(1, 10000)")

        if sleep:
            time.sleep(random.uniform(2.5, 4.0))

        html = driver.page_source
        with open(f"player/{year}.html", "w", encoding="utf-8") as f:
            f.write(html)

    print("Player HTML download complete.")

    # STEP 2 — Parse HTML
    dfs = []
    for year in years:
        path = f"player/{year}.html"
        if not os.path.exists(path):
            print(f" No saved player page for {year}. Skipping.")
            continue

        with open(path, encoding="utf-8") as f:
            page = f.read()

        soup = BeautifulSoup(page, "html.parser")

        # Remove the first repeated header row in the document.
        # NOTE(review): only the first `tr.thead` is removed; if the table
        # repeats its header further down, those rows stay in the data —
        # confirm whether downstream cleaning relies on that.
        thead = soup.find("tr", class_="thead")
        if thead:
            thead.decompose()

        table = soup.find(id="per_game_stats")
        if table is None:
            print(f" No player stats table for {year}")
            continue

        df = pd.read_html(StringIO(str(table)))[0]
        df["Year"] = year
        dfs.append(df)

    print("Player stats parsing complete.")

    # pd.concat([]) raises an unhelpful "No objects to concatenate".
    if not dfs:
        raise ValueError("No player stats tables were parsed; nothing to concatenate.")

    return pd.concat(dfs, ignore_index=True)


In [None]:
# Scrape per-game player stats (not team standings) for the seasons in `years`.
# NOTE(review): `driver` must be a live browser driver created in an earlier
# cell; none is defined in the visible notebook — confirm before running.
years = list(range(1990, 2027))
# scrape_player_stats takes an exclusive end year, hence the + 1.
player_df = scrape_player_stats(driver, start_year=years[0], end_year=years[-1] + 1)

print(player_df.head())
print(player_df.shape)

In [None]:
# Save the frame built by the player-stats cell; `full_player_df` was never defined.
player_df.to_csv("players.csv")

In [13]:
#!pip install selenium

In [14]:
# xattr -d com.apple.quarantine chromedriver

In [15]:
#!pip install webdriver-manager

--------

## **Teams Record (1991-2025)**

In [9]:
def scrape_team_standings(years, sleep=True):
    """Download and parse the divisional standings tables for each season.

    Each season's standings page is fetched from basketball-reference.com and
    saved to ``team/{year}.html``; the saved pages are then parsed and the
    elements with ids ``divs_standings_E`` and ``divs_standings_W`` are
    converted to DataFrames.

    Args:
        years: Iterable of season years to scrape.
        sleep: When true, wait a random 2.5-5.0 seconds before each request.

    Returns:
        A single DataFrame with every conference table stacked. The
        conference-named column is moved to a trailing ``Team`` column and a
        ``Year`` column holds the year each row came from.

    Raises:
        ValueError: If no standings table could be parsed for any year.
    """
    team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

    # Same browser-like User-Agent the MVP scraper sends.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
    }

    # (column holding the team names, table id, short label used in messages)
    conferences = (
        ("Eastern Conference", "divs_standings_E", "East"),
        ("Western Conference", "divs_standings_W", "West"),
    )

    # `years` is walked twice (download, then parse); materialise it so
    # one-shot iterables such as generators also work.
    years = list(years)

    # The cache directory must exist before the first page is written.
    os.makedirs("team", exist_ok=True)

    # ------------------------
    # STEP 1 — Download HTMLs
    # ------------------------
    for year in years:
        url = team_stats_url.format(year)

        if sleep:
            time.sleep(random.uniform(2.5, 5.0))

        print(f"Downloading {year}...")

        # A timeout keeps one stalled connection from hanging the whole run.
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print(f" {year} download failed ({exc}). Skipping.")
            continue

        # Do not overwrite a previously saved page with an error page.
        if not response.ok:
            print(f" {year} download failed (HTTP {response.status_code}). Skipping.")
            continue

        with open(f"team/{year}.html", "w", encoding="utf-8") as f:
            f.write(response.text)

    print("Team HTML download complete.")

    # ------------------------
    # STEP 2 — Parse Each File
    # ------------------------
    dfs = []

    for year in years:
        path = f"team/{year}.html"
        if not os.path.exists(path):
            print(f" No saved team page for {year}. Skipping.")
            continue

        with open(path, encoding="utf-8") as f:
            page = f.read()

        for column, table_id, label in conferences:
            # Each conference is read from a freshly parsed document with the
            # first `tr.thead` row in the document removed.
            # NOTE(review): later `tr.thead` rows are left in place and show
            # up as data rows — confirm downstream cleaning handles them.
            soup = BeautifulSoup(page, "html.parser")
            thead = soup.find("tr", class_="thead")
            if thead:
                thead.decompose()

            table = soup.find(id=table_id)
            if table is None:
                print(f" No {label} table found for {year}")
                continue

            df = pd.read_html(StringIO(str(table)))[0]
            df["Year"] = year
            # Move the conference-named column to a trailing "Team" column.
            df["Team"] = df.pop(column)
            dfs.append(df)

    print("Team data parsing complete.")

    # pd.concat([]) raises an unhelpful "No objects to concatenate".
    if not dfs:
        raise ValueError("No team standings tables were parsed; nothing to concatenate.")

    return pd.concat(dfs, ignore_index=True)

In [10]:
# Scrape divisional standings for every season in `years`, then preview the
# first rows and the overall (rows, columns) shape.
years = [*range(1990, 2027)]
team_df = scrape_team_standings(years=years)

print(team_df.head(), team_df.shape, sep="\n")


Downloading 1990...
Downloading 1991...
Downloading 1992...
Downloading 1993...
Downloading 1994...
Downloading 1995...
Downloading 1996...
Downloading 1997...
Downloading 1998...
Downloading 1999...
Downloading 2000...
Downloading 2001...
Downloading 2002...
Downloading 2003...
Downloading 2004...
Downloading 2005...
Downloading 2006...
Downloading 2007...
Downloading 2008...
Downloading 2009...
Downloading 2010...
Downloading 2011...
Downloading 2012...
Downloading 2013...
Downloading 2014...
Downloading 2015...
Downloading 2016...
Downloading 2017...
Downloading 2018...
Downloading 2019...
Downloading 2020...
Downloading 2021...
Downloading 2022...
Downloading 2023...
Downloading 2024...
Downloading 2025...
Downloading 2026...
Team HTML download complete.
Team data parsing complete.
    W   L  W/L%    GB   PS/G   PA/G    SRS  Year                 Team
0  53  29  .646     —  110.2  105.2   4.23  1990  Philadelphia 76ers*
1  52  30  .634   1.0  110.0  106.0   3.23  1990      Boston Ce

In [12]:
# Persist the team standings table to disk with pandas' default CSV settings.
team_df.to_csv(path_or_buf="teams.csv")