# Data Scraper

Notebook that scrapes box score data from basketball-reference.com

In [9]:
# load packages
from bs4 import BeautifulSoup
import urllib3
import sys
import requests
import pandas as pd

In [24]:
# abbreviate team names to shorter, 3 letter version
def abbrev_team(team_name):
    """Map a full team name to its three-letter abbreviation.

    Args:
        team_name: Full franchise name, e.g. "Boston Celtics".

    Returns:
        The three-letter code for the team, or the sentinel string
        "ERROR - abbrev_team" when the name is not in the table.
    """
    codes_by_team = {
        "Atlanta Hawks": "ATL",
        "Brooklyn Nets": "BRK",
        "Boston Celtics": "BOS",
        "Charlotte Bobcats": "CHA",
        "Charlotte Hornets": "CHO",
        "Chicago Bulls": "CHI",
        "Cleveland Cavaliers": "CLE",
        "Dallas Mavericks": "DAL",
        "Denver Nuggets": "DEN",
        "Detroit Pistons": "DET",
        "Golden State Warriors": "GSW",
        "Houston Rockets": "HOU",
        "Indiana Pacers": "IND",
        "Los Angeles Clippers": "LAC",
        "Los Angeles Lakers": "LAL",
        "Memphis Grizzlies": "MEM",
        "Miami Heat": "MIA",
        "Milwaukee Bucks": "MIL",
        "Minnesota Timberwolves": "MIN",
        "New Orleans Pelicans": "NOP",
        "New Orleans Hornets": "NOH",
        "New Jersey Nets": "NJN",
        "New York Knicks": "NYK",
        "Oklahoma City Thunder": "OKC",
        "Orlando Magic": "ORL",
        "Philadelphia 76ers": "PHI",
        "Phoenix Suns": "PHO",
        "Portland Trail Blazers": "POR",
        "Sacramento Kings": "SAC",
        "San Antonio Spurs": "SAS",
        "Toronto Raptors": "TOR",
        "Utah Jazz": "UTA",
        "Washington Wizards": "WAS",
    }
    try:
        return codes_by_team[team_name]
    except KeyError:
        # Unknown names yield a sentinel string rather than an exception.
        return "ERROR - abbrev_team"

In [25]:
# Root URL of the site being scraped; page paths are appended to it.
base_url = "http://www.basketball-reference.com"

# Month names used as schedule-page suffixes for a regular season,
# split into the first and second calendar year of the season.
year_one_months = ("october", "november", "december")
year_two_months = ("january", "february", "march", "april")

# Month names for the lockout season (see lockout_year).
year_one_months_lockout = ("december",)
year_two_months_lockout = ("january", "february", "march", "april")

# Month names for the seasons listed in covid_years.
# NOTE(review): confirm the site uses plain month names for this season's
# schedule pages (it may use suffixed names such as "october-2019").
year_one_months_covid = ("october", "november", "december")
year_two_months_covid = ("january", "february", "march", "july", "august")

# Seasons grouped by which month list applies to them.
regular_years = (2010, 2011, 2013, 2014, 2015, 2016, 2017, 2018, 2019)
lockout_year = 2012
# The trailing comma is required: "(2020)" is just the int 2020, and a
# membership test such as "year in 2020" raises TypeError.
covid_years = (2020,)

In [26]:
# function to build url based on year and month
def get_schedule_url(year, month):
    """Build the schedule-page URL for one month of a season.

    Args:
        year: Season year; it is converted with str() and placed after
            "NBA_" in the page name.
        month: Month name string used as the page suffix, e.g. "october".

    Returns:
        base_url + "/leagues/NBA_<year>_games-<month>.html".

    Neither argument is validated. The URL has the same shape for every
    kind of season, so no per-season branching is needed here.
    """
    return base_url + "/leagues/NBA_" + str(year) + "_games-" + month + ".html"

In [27]:
# function to get the entire seasons worth of urls
def _season_months(year):
    """Return the month names whose schedule pages make up the given season."""
    # covid_years may be a bare int instead of a tuple (a one-element tuple
    # written without its trailing comma); normalize it so the membership
    # test below cannot raise TypeError.
    if isinstance(covid_years, (tuple, list, set, frozenset)):
        covid = covid_years
    else:
        covid = (covid_years,)

    if year in regular_years:
        return year_one_months + year_two_months
    if year in covid:
        return year_one_months_covid + year_two_months_covid
    # Years in neither list use the lockout-season months.
    return year_one_months_lockout + year_two_months_lockout


def get_season_urls(year):
    """Collect the box-score URLs for every game listed in a season's schedule.

    One schedule page is downloaded per month of the season. From the first
    <tbody> of each page, every element whose data-stat attribute is
    "box_score_text" contributes the href of its first <a>, prefixed with
    base_url.

    Args:
        year: Season year; selects which month list is used.

    Returns:
        A list of absolute box-score URLs, in page order.

    Raises:
        IndexError: If a schedule page contains no <tbody> element.
    """
    urls = []
    for month in _season_months(year):
        response = requests.get(get_schedule_url(year, month))
        doc = BeautifulSoup(response.content, "html.parser")
        schedule_table = doc.find_all("tbody")[0]

        for cell in schedule_table.find_all(attrs={"data-stat": "box_score_text"}):
            link = cell.find("a")
            # find() returns None when the cell holds no link; skip such
            # cells instead of failing on None.get().
            if link is None:
                continue
            urls.append(base_url + link.get("href"))

    return urls

In [29]:
# function which takes in url and file, and saves game data to file
def scrape_game(url, f):
    """Download one box-score page and write its stats to *f* as one line.

    The line holds year, month, day, home-team abbreviation and away-team
    abbreviation, followed by the text of every non-empty <td> found inside
    the page's <tfoot> elements, all separated by ", " and terminated by a
    newline.

    Args:
        url: Box-score page URL. The date and home team are sliced out of it
            by position (assumes a ".../boxscores/YYYYMMDD0TTT.html" shape;
            verify against the caller).
        f: Object with a write() method that receives the line.

    Returns:
        False, with nothing written, when the page contains an element whose
        data-label is "All Games in Series"; True after a line is written.
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, "html.parser")

    # Skip pages that carry a series link (presumably playoff games).
    if page.find_all(attrs={"data-label": "All Games in Series"}):
        return False

    # Away team: the text before " at " in the breadcrumb headline.
    crumbs = page.find_all("div", class_="breadcrumbs")[0]
    headline = crumbs.find_all("strong")[0].get_text()
    away_team_name = headline[:headline.find(" at ")]

    # Game date: the four characters after "res/" are the year; the other
    # parts are sliced relative to the first occurrence of that year string.
    id_start = url.find('res/') + 4
    year = url[id_start:id_start + 4]
    year_pos = url.find(year)
    month = url[year_pos + 4:year_pos + 6]
    day = url[year_pos + 6:year_pos + 8]

    # Home team: three characters starting nine past the year.
    home_team_abbrev = url[year_pos + 9:year_pos + 12]

    # Collect the fields, then join them once.
    fields = [
        year,
        month,
        day,
        home_team_abbrev,
        abbrev_team(str(away_team_name)),
    ]
    for footer in page.find_all("tfoot"):
        for cell in footer.find_all("td"):
            text = cell.get_text()
            if text != "":
                fields.append(text)

    f.write(", ".join(fields) + '\n')
    return True

In [32]:
# function to run it all
def run_season_scrape(year, f):
    """Scrape every game of a season, writing one line per game to *f*.

    Args:
        year: Season year passed to get_season_urls.
        f: Object with a write() method, handed to scrape_game for each game.
    """
    for game_url in get_season_urls(year):
        scrape_game(game_url, f)

Here's an example of how to run the scraper for the 2010 season:

In [1]:
#df = pd.DataFrame(list())
#df.to_csv("2010.csv")
#f_one = open("2010.csv", "a")
#run_season_scrape(2010,f_one)