# Web scraping - Basketball Reference

In [1]:
# Install packages
!pip install requests
!pip install beautifulsoup4



In [2]:
# Import libraries/ packages in the notebook
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Seasons to scrape: every award year from 1991 through 2021 inclusive
# (the upper bound of range() is exclusive).
years = [*range(1991, 2022)]

In [4]:
# URL template for a season's awards page; the "{}" placeholder is filled
# with the year via str.format() inside the download loop.
url_start = (
    "https://www.basketball-reference.com"
    "/awards/awards_{}.html"
)

In [5]:
# Download the awards page of every season and save it as
# html_files/<year>.html so the parsing step can work from local files.

# Create the output directory up front; on a fresh checkout the first
# open() would otherwise fail with FileNotFoundError.
os.makedirs("html_files", exist_ok=True)

# Seconds to wait between two requests.
# NOTE(review): Sports Reference publishes a request-rate limit for automated
# clients (about 20 requests per minute at the time of writing) - confirm the
# value against the site's current bot policy.
pause_seconds = 4

for year in years:
    # The url will change with each loop instance
    url = url_start.format(year)
    # Get the data from the url; the timeout keeps a stalled connection from
    # hanging the notebook indefinitely.
    data = requests.get(url, timeout=30)
    # Stop on 4xx/5xx responses (e.g. a rate-limit page) instead of saving an
    # error page that would only fail later, during parsing.
    data.raise_for_status()
    # Save each html as a separate file. The encoding is explicit because the
    # platform default (e.g. cp1252 on Windows) cannot encode every non-ASCII
    # character that may appear in player names.
    with open("html_files/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)
    # Be polite to the server and stay below its rate limit.
    time.sleep(pause_seconds)

In [6]:
# Build one DataFrame of MVP voting results per season from the saved pages.
# Create empty list, to be filled with data from each html
dfs = []

# Loop through each html file
for year in years:
    # Read html file. The encoding is explicit so the text decodes the same
    # way on every OS (assumes the saved pages are UTF-8 - TODO confirm
    # against the step that writes them).
    with open("html_files/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    # Parse the html file with BeautifulSoup
    soup = BeautifulSoup(page, "html.parser")
    # Find the table with the id "mvp"
    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        # Without this guard str(None) == "None" would be handed to
        # pd.read_html and fail with an unrelated, confusing error.
        raise ValueError(
            "No element with id 'mvp' found in html_files/{}.html".format(year)
        )
    # Remove every table row with the class "over_header" from the MVP table,
    # so that pandas sees a single header row. Searching inside the table
    # (rather than the whole page) also avoids an AttributeError when the
    # page has no such row.
    for header_row in mvp_table.find_all("tr", class_="over_header"):
        header_row.decompose()
    # Read the table with Pandas; literal HTML must be wrapped in StringIO
    # because passing it as a plain string is deprecated since pandas 2.1.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    # Add "Year" column so we have information inside the table
    mvp["Year"] = year
    # Append the DataFrame to the list
    dfs.append(mvp)

In [7]:
# Stack the per-season DataFrames row-wise into a single table.
# Row labels of the individual frames are kept as they are.
mvps = pd.concat(objs=dfs, axis=0)

In [8]:
# Preview the combined table: show its first five rows
mvps.head(n=5)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251,1991
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,...,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,...,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258,1991
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,...,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225,1991


In [9]:
# Write the combined table to mvps.csv in the working directory.
# The DataFrame index is written too, as the first (unnamed) column.
mvps.to_csv(path_or_buf="mvps.csv")