## Scraping MVP voting information and downloading data

In [2]:
import cloudscraper
import os
import time
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup


In [4]:
# Seasons to scrape: 1991 through 2024 inclusive (the range end is exclusive).
years = [*range(1991, 2025)]

In [None]:
# Download the awards page of every season in `years` and save the raw HTML
# as mvp/<year>.html.
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

scraper = cloudscraper.create_scraper()

os.makedirs("mvp", exist_ok=True)

for year in years:
    url = url_start.format(year)
    data = scraper.get(url)
    # Fail loudly on an HTTP error status (4xx/5xx) instead of saving an
    # error page to disk, where it would only surface later as an obscure
    # parsing failure.
    data.raise_for_status()

    with open(f"mvp/{year}.html", "w", encoding="utf-8") as f:
        f.write(data.text)

    # Wait 5 seconds before issuing the next request.
    time.sleep(5)

In [None]:
# Parse every saved awards page and collect one MVP voting DataFrame per
# season in `dfs`.
dfs = []

for year in years:
    # The pages were written as UTF-8, so read them back with the same
    # encoding instead of the platform default (which differs on Windows).
    with open(f"mvp/{year}.html", encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # Remove the first "over_header" row of the page before the table is
    # handed to pandas.
    soup.find("tr", class_="over_header").decompose()
    mvp_table = soup.find_all(id="mvp")[0]
    mvp_df = pd.read_html(StringIO(str(mvp_table)))[0]
    # Tag every row with its season so the tables can be concatenated.
    mvp_df["Year"] = year

    dfs.append(mvp_df)

In [30]:
# Stack the per-season MVP tables row-wise and write the result to mvps.csv
# (the DataFrame index is written too, as pandas does by default).
mvps = pd.concat(dfs, axis=0)

mvps.to_csv(path_or_buf="mvps.csv")

## Scraping player stats and downloading data

In [6]:
# Download the per-game player stats page of every season in `years` and save
# the raw HTML as player/<year>.html.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

scraper = cloudscraper.create_scraper()

# Create the directory that is actually written to below ("player", not
# "mvp"); otherwise the first open() raises FileNotFoundError on a fresh run.
os.makedirs("player", exist_ok=True)

for year in years:
    url = player_stats_url.format(year)
    data = scraper.get(url)
    # Fail loudly on an HTTP error status (4xx/5xx) instead of saving an
    # error page to disk.
    data.raise_for_status()

    with open(f"player/{year}.html", "w", encoding="utf-8") as f:
        f.write(data.text)

    # Wait 5 seconds before issuing the next request.
    time.sleep(5)

KeyboardInterrupt: 

In [37]:
# Parse every saved per-game stats page and collect one player DataFrame per
# season in `dfs`.
dfs = []

for year in years:
    # The pages were written as UTF-8, so read them back with the same
    # encoding instead of the platform default (which differs on Windows).
    with open(f"player/{year}.html", encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # NOTE(review): unlike the team standings cell, no "thead" row is removed
    # here (that call was disabled) -- confirm that later cleaning copes with
    # any header rows left inside the table.
    player_table = soup.find_all(id="per_game_stats")[0]
    player_df = pd.read_html(StringIO(str(player_table)))[0]
    # Tag every row with its season so the tables can be concatenated.
    player_df["Year"] = year

    dfs.append(player_df)

In [41]:
# Stack the per-season player tables row-wise and write the result to
# players.csv (the DataFrame index is written too, as pandas does by default).
players = pd.concat(dfs, axis=0)

players.to_csv(path_or_buf="players.csv")

## Scraping team stats and downloading data

In [7]:
# Download the standings page of every season in `years` and save the raw
# HTML as team/<year>.html.
team_standings_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

scraper = cloudscraper.create_scraper()

# Create the directory that is actually written to below ("team", not
# "mvp"); otherwise the first open() raises FileNotFoundError on a fresh run.
os.makedirs("team", exist_ok=True)

for year in years:
    url = team_standings_url.format(year)
    data = scraper.get(url)
    # Fail loudly on an HTTP error status (4xx/5xx) instead of saving an
    # error page to disk.
    data.raise_for_status()

    with open(f"team/{year}.html", "w", encoding="utf-8") as f:
        f.write(data.text)

    # Wait 5 seconds before issuing the next request.
    time.sleep(5)

In [8]:
# Parse every saved standings page and collect, per season, one DataFrame for
# the Eastern and one for the Western conference (in that order) in `dfs`.
dfs = []

# (HTML id of the standings table, name of its team-name column).
conferences = [
    ("divs_standings_E", "Eastern Conference"),
    ("divs_standings_W", "Western Conference"),
]

for year in years:
    # The pages were written as UTF-8, so read them back with the same
    # encoding instead of the platform default (which differs on Windows).
    with open(f"team/{year}.html", encoding="utf-8") as f:
        page = f.read()

    for table_id, conference_col in conferences:
        # Parse from scratch for each conference so both start from the
        # unmodified page.
        soup = BeautifulSoup(page, "html.parser")
        # Removes the first "thead" row found anywhere in the document.
        # NOTE(review): that row is not necessarily inside the table selected
        # below -- confirm later cleaning drops any remaining header rows.
        soup.find("tr", class_="thead").decompose()
        table = soup.find_all(id=table_id)[0]
        conf_df = pd.read_html(StringIO(str(table)))[0]
        conf_df["Year"] = year
        # Normalise the conference-specific column name to a shared "Team"
        # column so both conferences can be concatenated.
        conf_df["Team"] = conf_df[conference_col]
        del conf_df[conference_col]

        dfs.append(conf_df)

In [9]:
# Stack every conference/season standings table row-wise, then preview the
# first rows as the cell output.
teams = pd.concat(dfs, axis=0)

teams.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,0.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,0.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,0.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,0.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,0.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets


In [10]:
# Write the combined standings to teams.csv (the DataFrame index is written
# too, as pandas does by default).
teams.to_csv(path_or_buf="teams.csv")