# Web Scraping Scripts for NBA data

The goal of this script is to scrape data from the www.basketball-reference.com database: the Most Valuable Player (MVP) award voting, the players' individual total, per-game and advanced statistics, and the teams' statistics for the seasons from 1991 to 2021.


This script is part of a general investigation and analysis on NBA data in the past decades.


The script is based on analysis and scripts developed by Vik Paruchuri from Dataquest. GitHub: https://github.com/dataquestio/project-walkthroughs/blob/master/mvp/web_scraping.ipynb

This version of the code scrapes additional data, i.e. player total and advanced statistics, and adds a method for extracting the descriptions of the column headers.



In [1]:
import os
import re
import shutil
import time
from io import StringIO

import pandas as pd
import requests

#### NOTE: This section is only needed if you are running the script in Google Colab. It is not needed if you are running in Jupyter.
It mounts a Google Drive to the server so that it can be used during execution.

In [2]:
# Colab-only setup: mount Google Drive and change into the project folder.
from google.colab import drive
drive.mount('/content/drive') # mounting the drive to the server
%cd /content/drive/My\ Drive/Colab_Notebooks/NBA_Analysis
# The later cells index this as path[0] to get the working directory when building file paths.
path = !pwd # save the directory to variable for reading/writing files

Mounted at /content/drive
/content/drive/My Drive/Colab_Notebooks/NBA_Analysis


## Downloading HTML Pages
To ensure easy access and to avoid overloading the basketball-reference servers, this section downloads the HTML pages using requests and saves them locally.

The pages downloaded are:
https://www.basketball-reference.com/awards/awards_{1991-2021}.html
https://www.basketball-reference.com/leagues/NBA_{1991-2021}_totals.html
https://www.basketball-reference.com/leagues/NBA_{1991-2021}_per_game.html
https://www.basketball-reference.com/leagues/NBA_{1991-2021}_advanced.html
https://www.basketball-reference.com/leagues/NBA_{1991-2021}_standings.html

#### NOTE: The ranges inside the curly brackets represent the range of years we are interested in. Populate with only a single year at a time.

In [3]:
# Seasons to process, both ends inclusive (range() excludes its stop value, hence the +1).
FIRST_SEASON, LAST_SEASON = 1991, 2021
years = list(range(FIRST_SEASON, LAST_SEASON + 1))

In [4]:
# MVP votes: download one award-voting page per season and save it under DATA_mvp/
url_award_voting = "https://www.basketball-reference.com/awards/awards_{}.html"

for year in years:
    url = url_award_voting.format(year)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    data = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors (e.g. 429 rate limiting) instead of saving an error page as data.
    data.raise_for_status()
    with open(path[0]+"/DATA_mvp/award_voting_{}.html".format(year), "w") as f:
        f.write(data.text)
    # Pause between requests so the server is not hammered.
    # NOTE(review): 3.5 s assumes a limit of roughly 20 requests/minute - confirm against the site's current bot policy.
    time.sleep(3.5)

In [5]:
# Players' Total Statistics: download one totals page per season and save it under DATA_player_stats_total/
url_player_stats_total = "https://www.basketball-reference.com/leagues/NBA_{}_totals.html"

for year in years:
    url = url_player_stats_total.format(year)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    data = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors (e.g. 429 rate limiting) instead of saving an error page as data.
    data.raise_for_status()
    with open(path[0]+"/DATA_player_stats_total/player_stats_{}.html".format(year), "w") as f:
        f.write(data.text)
    # Pause between requests so the server is not hammered.
    # NOTE(review): 3.5 s assumes a limit of roughly 20 requests/minute - confirm against the site's current bot policy.
    time.sleep(3.5)

In [6]:
# Players' Per Game Statistics: download one per-game page per season and save it under DATA_player_stats_pgame/
url_player_stats_pgame = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

for year in years:
    url = url_player_stats_pgame.format(year)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    data = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors (e.g. 429 rate limiting) instead of saving an error page as data.
    data.raise_for_status()
    with open(path[0]+"/DATA_player_stats_pgame/player_stats_{}.html".format(year), "w") as f:
        f.write(data.text)
    # Pause between requests so the server is not hammered.
    # NOTE(review): 3.5 s assumes a limit of roughly 20 requests/minute - confirm against the site's current bot policy.
    time.sleep(3.5)

In [7]:
# Players' Advanced Statistics: download one advanced-stats page per season and save it under DATA_player_stats_adv/
url_player_stats_adv = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html"

for year in years:
    url = url_player_stats_adv.format(year)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    data = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors (e.g. 429 rate limiting) instead of saving an error page as data.
    data.raise_for_status()
    with open(path[0]+"/DATA_player_stats_adv/player_stats_{}.html".format(year), "w") as f:
        f.write(data.text)
    # Pause between requests so the server is not hammered.
    # NOTE(review): 3.5 s assumes a limit of roughly 20 requests/minute - confirm against the site's current bot policy.
    time.sleep(3.5)

In [8]:
# Teams' Statistics: download one standings page per season and save it under DATA_team_stats/
url_team_stats = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

for year in years:
    url = url_team_stats.format(year)
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    data = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors (e.g. 429 rate limiting) instead of saving an error page as data.
    data.raise_for_status()
    with open(path[0]+"/DATA_team_stats/team_stats_{}.html".format(year), "w") as f:
        f.write(data.text)
    # Pause between requests so the server is not hammered.
    # NOTE(review): 3.5 s assumes a limit of roughly 20 requests/minute - confirm against the site's current bot policy.
    time.sleep(3.5)

## Scraping the Webpages


In [9]:
from bs4 import BeautifulSoup

In [10]:
# Patterns are compiled once at cell level instead of on every call.
# The line-break pattern also accepts "<br/>", "<br />" and upper-case "<BR>".
_BR_TAG = re.compile(r'<br\s*/?>', re.IGNORECASE)
_ANY_TAG = re.compile(r'<.*?>')


def clean_desc(html_text):
    """Return a plain-text description for one table column header.

    The description is read from the header's "data-tip" attribute. When that
    attribute is missing, the "aria-label" attribute is returned unchanged
    instead.

    Args:
        html_text: a header element exposing ``.get(attribute_name)``; the
            callers in this notebook pass BeautifulSoup ``<th>`` tags.

    Returns:
        The cleaned "data-tip" text, with each line-break tag replaced by
        ": " and every other tag removed; otherwise the "aria-label" value,
        which is ``None`` when that attribute is missing as well.
    """
    tip = html_text.get('data-tip')
    if tip is None:  # no tooltip: fall back to the label
        return html_text.get('aria-label')
    # Replace line breaks first so the text before and after stays separated,
    # then strip the remaining style tags (e.g. <b>...</b>).
    return _ANY_TAG.sub('', _BR_TAG.sub(': ', tip))
# print(pd.Series(map(clean_desc,cur_DESC)))

In [None]:
# Create a dataframe of MVP Voting Stats for all years, plus a Series describing its columns
dfs_mvp = []
for year in years:
    with open(path[0]+"/DATA_mvp/award_voting_{}.html".format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    soup.find('tr', class_="over_header").decompose()  # delete the over-header row above the column names
    cur_table = soup.find_all(id="mvp")[0]  # the element with id "mvp" holds the voting table

    # Build the column descriptions once, from the first year in `years`.
    # Comparing with years[0] instead of a literal keeps this working if the year range changes.
    if year == years[0]:
        for match in soup.find_all(attrs={'data-stat': 'DUMMY'}):  # clear any dummy lines and columns
            match.decompose()
        cur_DESC_mvp = pd.Series([clean_desc(th) for th in cur_table.find_all('th', scope='col')])

    # read_html is given a file-like object: passing literal HTML is deprecated in recent pandas.
    mvp_df = pd.read_html(StringIO(str(cur_table)))[0]
    mvp_df["Year"] = year  # save the current year
    dfs_mvp.append(mvp_df)

df_mvp = pd.concat(dfs_mvp)  # concatenate the per-year dataframes into a single dataframe
cur_DESC_mvp[len(cur_DESC_mvp)] = 'Year'  # add a description of the Year column
# Label the descriptions with the dataframe column names. set_axis() returns a new
# Series, so its result was previously discarded; assigning the index makes it stick.
cur_DESC_mvp.index = df_mvp.columns
df_mvp.head()

In [None]:
# Create a dataframe of Player Total Stats for all years, plus a Series describing its columns
dfs_plr_stats_tot = []
for year in years:
    with open(path[0]+"/DATA_player_stats_total/player_stats_{}.html".format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    for match in soup.find_all('tr', class_="thead"):  # delete every header row of class "thead"
        match.decompose()
    cur_table = soup.find_all(id="div_totals_stats")[0]  # the element with this id holds the table

    # Build the column descriptions once, from the first year in `years`.
    # Comparing with years[0] instead of a literal keeps this working if the year range changes.
    if year == years[0]:
        for match in soup.find_all(attrs={'data-stat': 'DUMMY'}):  # clear any dummy lines and columns
            match.decompose()
        cur_DESC_total = pd.Series([clean_desc(th) for th in cur_table.find_all('th', scope='col')])

    # read_html is given a file-like object: passing literal HTML is deprecated in recent pandas.
    cur_df = pd.read_html(StringIO(str(cur_table)))[0]
    cur_df["Year"] = year  # save the current year
    dfs_plr_stats_tot.append(cur_df)

dfs_plr_stats_tot = pd.concat(dfs_plr_stats_tot)  # concatenate the per-year dataframes into a single dataframe
cur_DESC_total[len(cur_DESC_total)] = 'Year'  # add a description of the Year column
# Label the descriptions with the dataframe column names. set_axis() returns a new
# Series, so its result was previously discarded; assigning the index makes it stick.
cur_DESC_total.index = dfs_plr_stats_tot.columns
dfs_plr_stats_tot.head()

In [None]:
# Create a dataframe of Player Per Game Stats for all years, plus a Series describing its columns
dfs_plr_stats_pgame = []
for year in years:
    with open(path[0]+"/DATA_player_stats_pgame/player_stats_{}.html".format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    for match in soup.find_all('tr', class_="thead"):  # delete every header row of class "thead"
        match.decompose()
    cur_table = soup.find_all(id='div_per_game_stats')[0]  # the element with this id holds the table

    # Build the column descriptions once, from the first year in `years`.
    # Comparing with years[0] instead of a literal keeps this working if the year range changes.
    if year == years[0]:
        for match in soup.find_all(attrs={'data-stat': 'DUMMY'}):  # clear any dummy lines and columns
            match.decompose()
        cur_DESC_pgame = pd.Series([clean_desc(th) for th in cur_table.find_all('th', scope='col')])

    # read_html is given a file-like object: passing literal HTML is deprecated in recent pandas.
    cur_df = pd.read_html(StringIO(str(cur_table)))[0]
    cur_df['Year'] = year  # save the current year
    dfs_plr_stats_pgame.append(cur_df)

dfs_plr_stats_pgame = pd.concat(dfs_plr_stats_pgame)  # concatenate the per-year dataframes into a single dataframe
cur_DESC_pgame[len(cur_DESC_pgame)] = 'Year'  # add a description of the Year column
# Label the descriptions with the dataframe column names. set_axis() returns a new
# Series, so its result was previously discarded; assigning the index makes it stick.
cur_DESC_pgame.index = dfs_plr_stats_pgame.columns
dfs_plr_stats_pgame.head()

In [None]:
# Create a dataframe of Player Advanced Stats for all years, plus a Series describing its columns
dfs_plr_stats_adv = []
for year in years:
    with open(path[0]+"/DATA_player_stats_adv/player_stats_{}.html".format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    for match in soup.find_all('tr', class_="thead"):  # delete every header row of class "thead"
        match.decompose()
    cur_table = soup.find_all(id='div_advanced_stats')[0]  # the element with this id holds the table

    # Build the column descriptions once, from the first year in `years`.
    # Comparing with years[0] instead of a literal keeps this working if the year range changes.
    if year == years[0]:
        for match in soup.find_all(attrs={'data-stat': 'DUMMY'}):  # clear any dummy lines and columns
            match.decompose()
        cur_DESC_adv = pd.Series([clean_desc(th) for th in cur_table.find_all('th', scope='col')])

    # read_html is given a file-like object: passing literal HTML is deprecated in recent pandas.
    cur_df = pd.read_html(StringIO(str(cur_table)))[0]
    cur_df['Year'] = year  # save the current year
    dfs_plr_stats_adv.append(cur_df)

dfs_plr_stats_adv = pd.concat(dfs_plr_stats_adv)  # concatenate the per-year dataframes into a single dataframe

# Drop columns that are empty in every row.
# NOTE(review): presumably the dummy columns of the years where they were not removed above - verify.
dfs_plr_stats_adv = dfs_plr_stats_adv.dropna(how='all', axis='columns')
cur_DESC_adv[len(cur_DESC_adv)] = 'Year'  # add a description of the Year column
# Label the descriptions with the dataframe column names. set_axis() returns a new
# Series, so its result was previously discarded; assigning the index makes it stick.
cur_DESC_adv.index = dfs_plr_stats_adv.columns
cur_DESC_adv  # display the labelled descriptions, as the set_axis() expression used to

In [38]:
# Create a dataframe of Teams' Stats for all years, plus a Series describing its columns
def _read_conference_table(soup, table_id, conf_col, year):
    """Return (table element, dataframe) for one conference standings table.

    The dataframe gets a "Year" column, and the team names are moved from the
    conference-named column `conf_col` into a "Team" column appended at the end.
    """
    table = soup.find_all(id=table_id)[0]  # the element with this id holds the table
    # read_html is given a file-like object: passing literal HTML is deprecated in recent pandas.
    conf_df = pd.read_html(StringIO(str(table)))[0]
    conf_df['Year'] = year
    conf_df['Team'] = conf_df[conf_col]
    conf_df.drop(conf_col, axis=1, inplace=True)
    return table, conf_df


dfs_team_stats = []
for year in years:
    with open(path[0]+'/DATA_team_stats/team_stats_{}.html'.format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    for match in soup.find_all('tr', class_="thead"):  # delete every header row of class "thead"
        match.decompose()

    cur_table_E, cur_df_E = _read_conference_table(soup, 'div_divs_standings_E', 'Eastern Conference', year)
    cur_table_W, cur_df_W = _read_conference_table(soup, 'div_divs_standings_W', 'Western Conference', year)

    # Build the column descriptions once, from the first year in `years`.
    # Comparing with years[0] instead of a literal keeps this working if the year range changes.
    if year == years[0]:
        for match in soup.find_all(attrs={'data-stat': 'DUMMY'}):  # clear any dummy lines and columns
            match.decompose()
        cur_DESC_team = pd.Series([clean_desc(th) for th in cur_table_W.find_all('th', scope='col')])
        cur_DESC_team[0] = 'Team'  # the first header names the conference; describe it as the team column

    cur_df = pd.concat([cur_df_E, cur_df_W])  # East and West combined for the current year
    # Delete the trailing "*" mark for playoff teams; unlike indexing x[-1], this tolerates empty or missing names.
    cur_df['Team'] = cur_df['Team'].str.replace(r'\*$', '', regex=True)
    dfs_team_stats.append(cur_df)

dfs_team_stats = pd.concat(dfs_team_stats)  # concatenate the per-year dataframes into a single dataframe
cur_DESC_team[len(cur_DESC_team)] = 'Year'  # add a description of the Year column
# The descriptions are in table order (team first, Year last), but the dataframe has
# "Team" as its LAST column. Labelling with dfs_team_stats.columns would therefore
# attach each description to the wrong column, so the labels are built in description
# order. They are assigned to .index because set_axis() only returns a new Series.
_stat_cols = [col for col in dfs_team_stats.columns if col not in ('Year', 'Team')]
cur_DESC_team.index = ['Team', *_stat_cols, 'Year']
dfs_team_stats.head()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,0.683,—,111.5,105.7,5.22,1991,Boston Celtics
1,44,38,0.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers
2,39,43,0.476,17.0,103.1,103.3,-0.43,1991,New York Knicks
3,30,52,0.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,0.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets


In [26]:
class NBA_data:
    """Bundle a table of NBA data with the descriptions of its columns.

    Constructor arguments:
      data - pandas.DataFrame, stored as ``self.data``
      DESC - pandas.Series, stored as ``self.cols_DESC``
    """

    def __init__(self, data, DESC):
        # Both arguments are stored as given; nothing is copied or validated.
        self.data, self.cols_DESC = data, DESC

In [39]:
# Pair every scraped dataframe with the Series describing its columns
mvp_data = NBA_data(data=df_mvp, DESC=cur_DESC_mvp)
plr_stats_tot_data = NBA_data(data=dfs_plr_stats_tot, DESC=cur_DESC_total)
plr_stats_pgame_data = NBA_data(data=dfs_plr_stats_pgame, DESC=cur_DESC_pgame)
plr_stats_adv_data = NBA_data(data=dfs_plr_stats_adv, DESC=cur_DESC_adv)
team_data = NBA_data(data=dfs_team_stats, DESC=cur_DESC_team)

In [41]:
import pickle

# Save each NBA_data object to its own pickle file in the current working directory.
# The files were previously opened without ever being closed, so buffered bytes could
# stay unwritten. The `with` block closes (and flushes) each file right after its dump.
_pickle_targets = {
    'mvp_data.obj': mvp_data,
    'plr_stats_tot_data.obj': plr_stats_tot_data,
    'plr_stats_pgame_data.obj': plr_stats_pgame_data,
    'plr_stats_adv_data.obj': plr_stats_adv_data,
    'team_data.obj': team_data,
}
for _file_name, _nba_obj in _pickle_targets.items():
    with open(_file_name, 'wb') as filehandler:
        pickle.dump(_nba_obj, filehandler)

In [None]:
# Disabled CSV export: the statements sit inside a string literal, so none of them execute.
# NOTE(review): if this is re-enabled, `dfs_mvp` is the per-year list of dataframes (a list
# has no to_csv); the concatenated MVP dataframe is `df_mvp`.
'''
dfs_mvp.to_csv(path[0]+'/dfs_mvp.csv')
dfs_plr_stats_tot.to_csv(path[0]+'/dfs_plr_stats_tot.csv')
dfs_plr_stats_pgame.to_csv(path[0]+'/dfs_plr_stats_pgame.csv')
dfs_team_stats.to_csv(path[0]+'/dfs_team_stats.csv')
'''