## Scraping MVP voting information and downloading data

In [2]:
import cloudscraper
import os
import time
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup


In [None]:
# Every year from 1991 through 2025 (inclusive); the cells below fetch and
# parse one saved page per year in this list.
years = [*range(1991, 2026)]

[1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]


In [4]:
# URL template for the yearly awards page; `{}` is replaced with the year.
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

# A single scraper instance is reused for every request in the loop.
scraper = cloudscraper.create_scraper()

# Make sure the output directory exists before any file is written into it.
os.makedirs("mvp", exist_ok=True)

for year in years:
    url = url_start.format(year)
    response = scraper.get(url)
    # Fail loudly on an HTTP error status (e.g. 403/429/5xx) instead of saving
    # the error page to disk, where it would only break the parsing cell later.
    response.raise_for_status()

    with open(f"mvp/{year}.html", "w", encoding="utf-8") as f:
        f.write(response.text)

    # Pause between requests so the site is not hit in a tight loop.
    time.sleep(5)

SSLError: HTTPSConnectionPool(host='www.basketball-reference.com', port=443): Max retries exceeded with url: /awards/awards_1991.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:992)')))

In [29]:
# Parse the saved awards page of every year and collect one table per year.
dfs = []

for year in years:
    # The pages are saved as UTF-8, so read them back as UTF-8 rather than
    # relying on the platform default encoding (which can raise or garble
    # non-ASCII player names on systems whose default is not UTF-8).
    with open(f"mvp/{year}.html", encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    # Drop the first <tr class="over_header"> row so it is not part of the
    # markup handed to pandas (presumably a grouping row above the real
    # column names -- confirm against the saved HTML).
    soup.find('tr', class_="over_header").decompose()
    mvp_table = soup.find_all(id="mvp")[0]
    # read_html returns a list of frames; the single table passed in is first.
    mvp_df = pd.read_html(StringIO(str(mvp_table)))[0]
    # Tag every row with the year of the page it came from.
    mvp_df["Year"] = year

    dfs.append(mvp_df)

In [30]:
# Stack the per-year tables into one frame; each table keeps its own row
# index, so index values repeat from year to year.
mvps = pd.concat(objs=dfs)

# Persist the combined table (index included) relative to the working directory.
mvps.to_csv(path_or_buf="mvps.csv")

# Show the combined frame as the cell output.
mvps

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77,891,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10,497,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.320,0.906,15.4,0.251,1991
2,3,David Robinson,25,SAS,6,476,960,0.496,82,37.7,...,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991
3,4,Charles Barkley,27,PHI,2,222,960,0.231,67,37.3,...,10.1,4.2,1.6,0.5,0.570,0.284,0.722,13.4,0.258,1991
4,5,Karl Malone,27,UTA,0,142,960,0.148,82,40.3,...,11.8,3.3,1.1,1.0,0.527,0.286,0.770,15.5,0.225,1991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,7T,Anthony Edwards,23,MIN,0,12,1000,0.012,79,36.3,...,5.7,4.5,1.2,0.6,0.447,0.395,0.837,8.4,0.140,2025
8,9,Stephen Curry,36,GSW,0,2,1000,0.002,70,32.2,...,4.4,6.0,1.1,0.4,0.448,0.397,0.933,7.9,0.168,2025
9,10T,Jalen Brunson,28,NYK,0,1,1000,0.001,65,35.4,...,2.9,7.3,0.9,0.1,0.488,0.383,0.821,8.3,0.172,2025
10,10T,James Harden,35,LAC,0,1,1000,0.001,79,35.3,...,5.8,8.7,1.5,0.7,0.410,0.352,0.874,8.3,0.143,2025


## Scraping player stats and downloading data

In [7]:
# URL template for the yearly per-game stats page; `{}` is replaced with the year.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

# A single scraper instance is reused for every request in the loop.
scraper = cloudscraper.create_scraper()

# Create the directory this cell actually writes to. The original created
# "mvp" here, so the first open() below failed with FileNotFoundError
# whenever "player/" did not already exist.
os.makedirs("player", exist_ok=True)

for year in years:
    url = player_stats_url.format(year)
    response = scraper.get(url)
    # Fail loudly on an HTTP error status (e.g. 403/429/5xx) instead of saving
    # the error page to disk, where it would only break the parsing cell later.
    response.raise_for_status()

    with open(f"player/{year}.html", "w", encoding="utf-8") as f:
        f.write(response.text)

    # Pause between requests so the site is not hit in a tight loop.
    time.sleep(5)

SSLError: HTTPSConnectionPool(host='www.basketball-reference.com', port=443): Max retries exceeded with url: /leagues/NBA_1991_per_game.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:992)')))

In [31]:
# Parse the saved per-game stats page of every year and collect one table per year.
dfs = []

for year in years:
    # The pages are saved as UTF-8, so read them back as UTF-8 rather than
    # relying on the platform default encoding (which can raise or garble
    # non-ASCII player names on systems whose default is not UTF-8).
    with open(f"player/{year}.html", encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    player_table = soup.find_all(id="per_game_stats")[0]
    # read_html returns a list of frames; the single table passed in is first.
    player_df = pd.read_html(StringIO(str(player_table)))[0]
    # Tag every row with the year of the page it came from.
    player_df["Year"] = year

    dfs.append(player_df)

In [None]:
# Stack the per-year tables into one frame; each table keeps its own row
# index, so index values repeat from year to year.
players = pd.concat(objs=dfs)

# Persist the combined table (index included) relative to the working directory.
players.to_csv(path_or_buf="players.csv")


Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,Year
0,1.0,Michael Jordan,27.0,CHI,SG,82.0,82.0,37.0,12.1,22.4,...,4.6,6.0,5.5,2.7,1.0,2.5,2.8,31.5,"MVP-1,DPOY-7,AS,NBA1,DEF1",1991
1,2.0,Karl Malone,27.0,UTA,PF,82.0,82.0,40.3,10.3,19.6,...,8.9,11.8,3.3,1.1,1.0,3.0,3.3,29.0,"MVP-5,AS,NBA1",1991
2,3.0,Bernard King,34.0,WSB,SF,64.0,64.0,37.5,11.1,23.6,...,3.2,5.0,4.6,0.9,0.3,4.0,2.9,28.4,"MVP-16,AS,NBA3",1991
3,4.0,Charles Barkley,27.0,PHI,SF,67.0,67.0,37.3,9.9,17.4,...,6.3,10.1,4.2,1.6,0.5,3.1,2.6,27.6,"MVP-4,AS,NBA1",1991
4,5.0,Patrick Ewing,28.0,NYK,C,81.0,81.0,38.3,10.4,20.3,...,8.8,11.2,3.0,1.0,3.2,3.6,3.5,26.6,"MVP-11,DPOY-7,AS,NBA2",1991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,566.0,Jahlil Okafor,29.0,IND,C,1.0,0.0,3.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,2025
732,567.0,Zyon Pullin,23.0,MEM,SG,3.0,0.0,1.0,0.0,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2025
733,568.0,Isaiah Stevens,24.0,MIA,PG,3.0,0.0,2.0,0.0,0.7,...,0.7,0.7,0.0,0.3,0.0,0.0,0.0,0.0,,2025
734,569.0,Terry Taylor,25.0,SAC,PF,3.0,0.0,2.0,0.0,0.3,...,0.0,0.3,0.7,0.0,0.0,0.0,0.0,0.0,,2025


## Scraping team stats and downloading data

In [10]:
# URL template for the yearly standings page; `{}` is replaced with the year.
team_standings_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

# A single scraper instance is reused for every request in the loop.
scraper = cloudscraper.create_scraper()

# Create the directory this cell actually writes to. The original created
# "mvp" here, so the first open() below failed with FileNotFoundError
# whenever "team/" did not already exist.
os.makedirs("team", exist_ok=True)

for year in years:
    url = team_standings_url.format(year)
    response = scraper.get(url)
    # Fail loudly on an HTTP error status (e.g. 403/429/5xx) instead of saving
    # the error page to disk, where it would only break the parsing cell later.
    response.raise_for_status()

    with open(f"team/{year}.html", "w", encoding="utf-8") as f:
        f.write(response.text)

    # Pause between requests so the site is not hit in a tight loop.
    time.sleep(5)

SSLError: HTTPSConnectionPool(host='www.basketball-reference.com', port=443): Max retries exceeded with url: /leagues/NBA_1991_standings.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:992)')))

In [34]:
# Parse the saved standings page of every year and collect two tables per
# year: the Eastern table first, then the Western table.
dfs = []

# (HTML id of the standings table, name of its team-name column) per conference.
conference_tables = (
    ("divs_standings_E", "Eastern Conference"),
    ("divs_standings_W", "Western Conference"),
)

for year in years:
    # The pages are saved as UTF-8, so read them back as UTF-8 rather than
    # relying on the platform default encoding.
    with open(f"team/{year}.html", encoding="utf-8") as f:
        page = f.read()

    for table_id, conference_column in conference_tables:
        # Re-parse for each conference so both start from the unmodified page.
        soup = BeautifulSoup(page, 'html.parser')
        # `find` returns only the first <tr class="thead"> in the document,
        # so only that one row is removed; any later "thead" rows stay in the
        # parsed tables. NOTE(review): kept as-is on purpose -- confirm that
        # downstream cleaning expects those rows before changing this.
        soup.find('tr', class_="thead").decompose()
        standings_table = soup.find_all(id=table_id)[0]
        # read_html returns a list of frames; the single table passed in is first.
        standings_df = pd.read_html(StringIO(str(standings_table)))[0]
        standings_df["Year"] = year
        # Give both conferences the same "Team" column name so the frames
        # line up when concatenated.
        standings_df["Team"] = standings_df[conference_column]
        del standings_df[conference_column]

        dfs.append(standings_df)

In [35]:
# Stack every per-year, per-conference standings table into one frame; each
# table keeps its own row index.
teams = pd.concat(objs=dfs)

# Preview the last rows as the cell output.
teams.tail()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
13,52,30,0.634,—,114.3,109.8,4.97,2025,Houston Rockets*
14,48,34,0.585,4.0,121.7,116.9,4.79,2025,Memphis Grizzlies*
15,39,43,0.476,13.0,114.2,115.4,-0.74,2025,Dallas Mavericks
16,34,48,0.415,18.0,113.9,116.7,-2.45,2025,San Antonio Spurs
17,21,61,0.256,31.0,109.8,119.3,-8.59,2025,New Orleans Pelicans


In [36]:
# Persist the combined standings (index included) relative to the working directory.
teams.to_csv(path_or_buf="teams.csv")