# Data Scraping

This notebook scrapes data for the seasons from 1976 up to (but not including) 2020. We use BeautifulSoup to scrape player statistics, team standings, and MVP award data from basketball-reference.

The following resources were used for this part of the project:
- https://medium.com/analytics-vidhya/intro-to-scraping-basketball-reference-data-8adcaa79664a
- https://towardsdatascience.com/web-scraping-nba-stats-4b4f8c525994

In [9]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

from tqdm import tqdm
import pandas as pd
import os

`YEAR` is the list of season years we will collect data for (1976 through 2019).

In [10]:
# Season years to scrape: 1976 through 2019 (the upper bound of range() is exclusive).
YEAR = list(range(1976, 2020))

The `BasketballReferenceScraper` class contains methods for collecting the different types of data. `scrapeStats` requires the list of years and a `page_string` (the statistics page to scrape, e.g. `advanced` or `per_game`) as input, while `scrapeStandings` and `scrapeMvp` require only the list of years.

Each year's scraped data is saved as a CSV file in the corresponding subfolder of `../../input`.

In [11]:
class BasketballReferenceScraper():
    """Scrape yearly tables from basketball-reference.com and save them as CSV files.

    Each public method takes ``YEAR`` (an iterable of season years), downloads
    one page per year and writes one CSV file per year into a sub-folder of
    ``INPUT_ROOT``.  The sub-folder must already exist.  It is resolved against
    the working directory at call time, and the working directory itself is
    never changed, so the methods can be called in any order and re-run safely.

    The CSV files are written with pandas' default settings, so they include
    the DataFrame index as the first column.
    """

    # Folder (relative to the working directory) holding one sub-folder per data type.
    INPUT_ROOT = os.path.join('..', '..', 'input')

    # Division header labels that are mixed in with the team names on the standings page.
    DIVISIONS = ("Atlantic Division", "Central Division", "Southeast Division", "Northwest Division",
                 "Pacific Division", "Southwest Division", "Midwest Division")

    @classmethod
    def _resolve_output_dir(cls, folder):
        """Return the path of the output sub-folder, failing early if it does not exist."""
        directory = os.path.join(cls.INPUT_ROOT, '{}'.format(folder))
        if not os.path.isdir(directory):
            raise FileNotFoundError(
                "output folder '{}' does not exist (resolved from '{}')".format(directory, os.getcwd()))
        return directory

    @staticmethod
    def _fetch_soup(url):
        """Download ``url`` and parse it with the html.parser backend, closing the connection afterwards."""
        with urlopen(url) as response:
            return BeautifulSoup(response, features="html.parser")

    @staticmethod
    def _split_standings_titles(titles):
        """Split the header-cell texts of the standings page into (column headers, team names).

        ``titles`` is the flat list of header-cell texts.  Everything up to and
        including "SRS" is taken as the column headers; what follows is treated
        as team names once the repeated column labels, the "Western Conference"
        label, a stray "W" and the division labels have been removed.

        Raises ValueError when "SRS", one of the column labels or
        "Western Conference" is missing from the expected place.
        """
        cut = titles.index("SRS") + 1
        headers = titles[:cut]
        remainder = titles[cut:]

        # NOTE(review): "Easter Conference" looks like a typo for "Eastern Conference".
        # As written the lookup does not match, so the whole remainder is kept.
        # Correcting the literal would change which names are kept, so verify
        # against the live page before changing it.
        try:
            team_names = remainder[0:remainder.index("Easter Conference")]
        except ValueError:
            team_names = remainder

        # the column labels are repeated once more after the first header row
        for label in headers:
            team_names.remove(label)

        team_names.remove("Western Conference")

        # these labels are only present for some layouts, so a miss is not an error
        for label in ("W",) + tuple(BasketballReferenceScraper.DIVISIONS):
            try:
                team_names.remove(label)
            except ValueError:
                pass

        return headers, team_names

    @staticmethod
    def _build_standings_frame(headers, team_names, team_standings):
        """Combine team names and their data cells into the standings DataFrame.

        Empty rows are dropped, and rows are paired with team names in order;
        surplus rows beyond the number of team names are ignored.  The "GB"
        column is removed, and two indicator columns are added: "Playoffs"
        ("Y" when the team name carries a "*") and "Losing_season" ("Y" when
        "W/L%" is below .5).
        """
        populated = [cells for cells in team_standings if cells != []]
        records = [[name] + cells for name, cells in zip(team_names, populated)]

        frame = pd.DataFrame(records, columns=["Team"] + list(headers))

        # delete 'games behind' column
        del frame['GB']

        # the playoff flag must be computed before the '*' marker is stripped from the names
        frame["Playoffs"] = ["Y" if "*" in team else "N" for team in frame["Team"]]
        frame["Team"] = [team.replace('*', '') for team in frame["Team"]]
        frame["Losing_season"] = ["Y" if float(pct) < .5 else "N" for pct in frame["W/L%"]]
        return frame

    @staticmethod
    def _rows_before_first_blank(rows):
        """Return the rows that precede the first empty row (all rows if none is empty)."""
        for position, cells in enumerate(rows):
            if cells == []:
                return rows[:position]
        return rows

    def scrapeStats(self, YEAR, page_string):
        """Save the player statistics table of every year in ``YEAR``.

        ``page_string`` selects the statistics page (it is inserted into the
        URL) and is also the name of the output sub-folder.  One file named
        ``<year>-<page_string>.csv`` is written per year.

        Raises FileNotFoundError when the output sub-folder does not exist.
        """
        export_dir = self._resolve_output_dir(page_string)

        for year in tqdm(YEAR):

            # url of page to scrape data from
            url = "https://www.basketball-reference.com/leagues/NBA_{}_{}.html".format(year, page_string)
            soup = self._fetch_soup(url)

            table_rows = soup.find_all('tr')

            # header labels, excluding the first (ranking) column
            headers = [th.get_text() for th in table_rows[0].find_all('th')[1:]]

            # cell values of every row after the header row
            player_stats = [[td.get_text() for td in row.find_all('td')] for row in table_rows[1:]]

            stats = pd.DataFrame(player_stats, columns=headers)

            # saves to csv file labeled by the statistics type and year
            stats.to_csv(os.path.join(export_dir, '{}-{}.csv'.format(year, page_string)))

    def scrapeStandings(self, YEAR):
        """Save the team standings of every year in ``YEAR``.

        One file named ``<year>-standings.csv`` is written per year into the
        ``standings`` sub-folder.

        Raises FileNotFoundError when the output sub-folder does not exist.
        """
        export_dir = self._resolve_output_dir('standings')

        for year in tqdm(YEAR):

            # url of page to scrape data from
            url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html".format(year)
            soup = self._fetch_soup(url)

            table_rows = soup.find_all('tr')

            # header-cell texts of the first row (first cell skipped), holding
            # the column headers followed by the team names
            titles = [th.get_text() for th in table_rows[0].find_all('th')[1:]]
            headers, team_names = self._split_standings_titles(titles)

            # data cells of every row except the first
            team_standings = [[td.get_text() for td in row.find_all('td')] for row in table_rows[1:]]

            year_standings = self._build_standings_frame(headers, team_names, team_standings)

            # saves to csv file labeled by the statistics type and year
            year_standings.to_csv(os.path.join(export_dir, '{}-standings.csv'.format(year)))

    def scrapeMvp(self, YEAR):
        """Save the MVP voting table of every year in ``YEAR``.

        One file named ``<year>-awards.csv`` is written per year into the
        ``mvp_data`` sub-folder.

        Raises FileNotFoundError when the output sub-folder does not exist.
        """
        export_dir = self._resolve_output_dir('mvp_data')

        for year in tqdm(YEAR):

            # url of page to scrape data from
            url = "https://www.basketball-reference.com/awards/awards_{}.html".format(year)
            soup = self._fetch_soup(url)

            table_rows = soup.find_all('tr')

            # header labels come from the second row, first cell skipped
            headers = [th.get_text() for th in table_rows[1].find_all('th')[1:]]

            # cell values of every row after the two header rows
            player_awards = [[td.get_text() for td in row.find_all('td')] for row in table_rows[2:]]

            # the first empty row marks the end of the MVP table; drop everything after it
            player_awards = self._rows_before_first_blank(player_awards)

            awards = pd.DataFrame(player_awards, columns=headers)

            # saves to csv file labeled by the statistics type and year
            awards.to_csv(os.path.join(export_dir, '{}-awards.csv'.format(year)))


In [12]:
# Create the scraper instance used by the scraping calls in the following cells.
scrape = BasketballReferenceScraper()

In [13]:
# Scrape the awards page of every year in YEAR and save the MVP rows, one CSV per year.
scrape.scrapeMvp(YEAR)

100%|██████████████████████████████████████████████████████████████████████████████████| 44/44 [00:11<00:00,  4.00it/s]


In [14]:
# Scrape the standings page of every year in YEAR, one CSV per year.
scrape.scrapeStandings(YEAR)

100%|██████████████████████████████████████████████████████████████████████████████████| 44/44 [00:10<00:00,  4.20it/s]


In [15]:
# Scrape the 'advanced' statistics page of every year in YEAR, one CSV per year.
scrape.scrapeStats(YEAR, 'advanced')

100%|██████████████████████████████████████████████████████████████████████████████████| 44/44 [00:59<00:00,  1.35s/it]


In [None]:
# Scrape the 'per_game' statistics page of every year in YEAR, one CSV per year.
scrape.scrapeStats(YEAR, 'per_game')

 84%|████████████████████████████████████████████████████████████████████▉             | 37/44 [00:45<00:11,  1.61s/it]