In [1]:
import requests
import os
import shutil



In [2]:
# years we want to scrape
years = list(range(1991,2022))

In [3]:
# loop that saves html files to mvp folder
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

for year in years:
    url = url_start.format(year)
    
    data = requests.get(url)
    
    with open("mvp/{}.html".format(year), "w+", encoding= "utf-8") as f:
        f.write(data.text)

### Parsing mvp table with bsp 

In [4]:
from bs4 import BeautifulSoup

In [5]:
with open("mvp/1991.html", encoding='utf-8') as f:
    page = f.read()
    
soup = BeautifulSoup(page, 'html.parser')
soup.find('tr', class_="over_header").decompose()

AttributeError: 'NoneType' object has no attribute 'decompose'

In [None]:
mvp_table = soup.find_all(id="mvp")[0]
#mvp_table

In [None]:
import pandas as pd

#### Read HTML into pandas 

In [None]:
mvp_1991 = pd.read_html(str(mvp_table))[0]
mvp_1991.head(2)

In [None]:
mvp_1991["Year"] = 1991

In [None]:
mvp_1991.head(3)

In [None]:
# loop to create multiple dfs for all years
dfs = []
for year in years:
    with open("mvp/{}.html".format(year), encoding='utf-8') as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="over_header").decompose()
    mvp_table = soup.find_all(id="mvp")[0]
    mvp_df = pd.read_html(str(mvp_table))[0]
    mvp_df["Year"] = year        # will help us know which year mvp is from
    dfs.append(mvp_df)

In [None]:
# combine all these dfs into 1
mvps = pd.concat(dfs)

mvps.tail()

In [None]:
mvps.to_csv('mvps.csv')

# Now getting player stats

In [None]:
# rewriting above code to work for players url
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

for year in years:
    url = player_stats_url.format(year)
    
    data = requests.get(url)
    
    with open("player/{}.html".format(year), "w+", encoding='utf-8') as f:
        f.write(data.text)

But if you check in the player folder, the stored htmls have only 17 players, while the original site has over 600 player stats. This is because the site is in java and is causing rendering problems. It assumes my web renders the page in a browser but thats not the case.

So trying to find a way to use my web to render java. 

# Selenium

In [None]:
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

In [None]:
driver = webdriver.Chrome(
    executable_path="C:/Users/Rono/Downloads/chrome-win64"
    )


In [None]:
dfs = []
for year in years:
    with open("player/{}.html".format(year), encoding='utf-8') as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="thead").decompose()
    player_table = soup.find_all(id="per_game_stats")[0]
    player_df = pd.read_html(str(player_table))[0]
    player_df["Year"] = year
    dfs.append(player_df)

In [None]:
players = pd.concat(dfs)

In [None]:
players

In [None]:
players.to_csv('players.csv')

# Division standings

In [None]:
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [None]:
# demo of single year
year = 1991
url = team_stats_url.format(year)

data = requests.get(url)

with open("team/{}.html".format(year), "w+", encoding='utf-8') as f:
    f.write(data.text)

In [None]:
# looping for all years
for year in years:
    url = team_stats_url.format(year)
    
    data = requests.get(url)
    
    with open("team/{}.html".format(year), "w+", encoding='utf-8') as f:
        f.write(data.text)

In [None]:
dfs = []
for year in years:
    with open("team/{}.html".format(year), encoding='utf-8') as f:
        page = f.read()
    
    soup = BeautifulSoup(page, 'html.parser')
    #soup.find('tr', class_="thead").decompose()
    e_table = soup.find_all(id="divs_standings_E")
    e_df = pd.read_html(str(e_table))[0]
    e_df["Year"] = year
    e_df["Team"] = e_df["Eastern Conference"]
    del e_df["Eastern Conference"]
    dfs.append(e_df)
    
    w_table = soup.find_all(id="divs_standings_W")
    w_df = pd.read_html(str(w_table))[0]
    w_df["Year"] = year
    w_df["Team"] = w_df["Western Conference"]
    del w_df["Western Conference"]
    dfs.append(w_df)

In [None]:
teams = pd.concat(dfs)

In [None]:
teams.to_csv("teams.csv")