In [1]:
# For scraping data
import requests
from bs4 import BeautifulSoup, Comment
from datetime import date
import time

# For handling data
import pandas as pd
import numpy as np

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns

## Grabbing the Data
Before we can do any data analysis we have to grab some data! If you are interested in how we got our data read through this section. We use `BeautifulSoup` and the Google developer tools to scrape the data from <a href = "https://www.basketball-reference.com/">Basketball Reference</a>. If you aren't interested in how we got the data go ahead and skip to the next section, I won't mind!

First we will be making a list containing all of the HTML file addresses that contain the data we want. This can be found with a little poking around. We want all of the team per game stats for each season since 1974. If we go to the season pages for the NBA we find that they all have the same format: "https://www.basketball-reference.com/leagues/NBA_ (year the season started)".

In [2]:
# What Years will we be pulling?
Current_Year = date.today().year
Years = range(1974,Current_Year + 1)

# Occurs on all the webpages we want
base = "https://www.basketball-reference.com/leagues/NBA_"
HTMLs = []

# Need to add the year to each web address
for year in Years:
    HTMLs.append(base + str(year) + ".html")
    
# For our csv file we want to have a record of the
# season the data is pulled from.
def MakeSeason(year):
    return str(year) + "-" + str(year+1)[2:4]

Seasons = map(MakeSeason,Years)

Now that we have the web addresses we can pull the data from each page. We'll take the data we want and save it to a csv file (this file can be found in the repository without having to run the following code). 

In [25]:
# This helper function makes our writing our data to file
# a little easier to read
def WriteStat(td,Season):
    Stats = ['team_name','g','mp','fg','fga','fg_pct','fg3',
            'fg3a','fg3_pct','fg2','fg2a','fg2_pct','ft','fta',
            'ft_pct','orb','drb','trb','ast','stl','blk','tov',
            'pf','pts']
    if td['data-stat'] in Stats:
        if td['data-stat'] == "team_name":
            f.write("\n" + Season + ",")
            f.write(str(td.a.string) + ",")
        elif td['data-stat'] == "pts":
            f.write(str(td.string))
        else:
            if td.string == None:
                # Coding missing value as -99
                f.write(str(-99) + ",")
            else:
                f.write(str(td.string) + ",")


# Write our data to a csv file
f = open("NBA_Team_PerGame_Stats.csv","w+")

# Write our variable names to file
f.write("Season,Team,Games,MP,FG,FGA,FG_Perc,3P,3PA,3P_Perc,2P" +
        ",2PA,2P_Perc,FT,FTA,FT_Perc,ORB,DRB,TRB,AST,STL," + 
       "BLK,TOV,PF,PTS")

This is where `BeautifulSoup` is needed. We will not give an extensive explanation of how this python package works. These sorts of posts can easily be found with <a href = "https://www.google.com/search?q=beautifulsoup+python+tutorial&oq=beautifulsoup+python+tu&aqs=chrome.0.0j69i60j69i57j0l3.6664j0j7&sourceid=chrome&ie=UTF-8">Google</a>. That being said what follows is a somewhat high level overview of what we're doing. Don't worry if this part doesn't make sense right now. The important take away from this section is that `BeautifulSoup` is a powerful python library that is worth a time investment.


We want to search through the largish HTML files for each page and pluck out the information we want. Our data is stored in a table found in an HTML div labeled 'all_team-stats-per_game'. The data is written in as an HTML comment so we'll need to convert the comment into something searchable with `BeautifulSoup`. We look through the searchable comment for the table's (HTML tbody) body and then grab each table entry (HTML td) and grab the statistic.

Here's the code:

In [None]:
# For each of our links
for i in range(len(HTMLs)):
    # What Season are we in?
    Season = Seasons[i]
    
    # Grab the website
    page = requests.get(HTMLs[i])
    # Turn the html into a beautiful soup object
    soup = BeautifulSoup(page.content,'html.parser')

    ## Get the Table ##
    # Get the div that holds the table
    Table_Wrapper = soup.find('div',{'id':'all_team-stats-per_game'})
    
    # The table is stored as a comment
    # Extract that comment and turn it into a Beautiful Soup object
    comments = Table_Wrapper.findAll(text = lambda text:isinstance(text,Comment))
    Table = [comment.extract() for comment in comments][0]
    Table = BeautifulSoup(Table,'html.parser')
    
    # Grab the table entries
    TDs = Table.div.table.tbody.find_all('td')
    
    # To help us keep track of where the code is while it runs
    print("Grabbing the stats from the " + Season + " season.")
    
    # For each table entry, record the stat
    for td in TDs:
        # Keeps us in the loop while the code runs
        if td['data-stat'] == 'team_name':
            print("Writing stats for the " + td.a.string)
        WriteStat(td,Season)
    
    print("Done with " + Season + "! :-) \n\n")
    
# Close our file    
f.close()

Grabbing the stats from the 1974-75 season.
Writing stats for the Buffalo Braves
Writing stats for the Golden State Warriors
Writing stats for the Los Angeles Lakers
Writing stats for the Boston Celtics
Writing stats for the Atlanta Hawks
Writing stats for the Phoenix Suns
Writing stats for the Houston Rockets
Writing stats for the Milwaukee Bucks
Writing stats for the Seattle SuperSonics
Writing stats for the Portland Trail Blazers
Writing stats for the Detroit Pistons
Writing stats for the Chicago Bulls
Writing stats for the Kansas City-Omaha Kings
Writing stats for the Capital Bullets
Writing stats for the New York Knicks
Writing stats for the Philadelphia 76ers
Writing stats for the Cleveland Cavaliers
Done with 1974-75! :-) 


Grabbing the stats from the 1975-76 season.
Writing stats for the Golden State Warriors
Writing stats for the Buffalo Braves
Writing stats for the Boston Celtics
Writing stats for the Atlanta Hawks
Writing stats for the Washington Bullets
Writing stats for t

Grabbing the stats from the 1984-85 season.
Writing stats for the Denver Nuggets
Writing stats for the San Antonio Spurs
Writing stats for the Detroit Pistons
Writing stats for the Los Angeles Lakers
Writing stats for the Utah Jazz
Writing stats for the Portland Trail Blazers
Writing stats for the Boston Celtics
Writing stats for the Phoenix Suns
Writing stats for the San Diego Clippers
Writing stats for the Houston Rockets
Writing stats for the Dallas Mavericks
Writing stats for the Kansas City Kings
Writing stats for the New Jersey Nets
Writing stats for the Golden State Warriors
Writing stats for the Seattle SuperSonics
Writing stats for the Philadelphia 76ers
Writing stats for the New York Knicks
Writing stats for the Milwaukee Bucks
Writing stats for the Indiana Pacers
Writing stats for the Chicago Bulls
Writing stats for the Washington Bullets
Writing stats for the Cleveland Cavaliers
Writing stats for the Atlanta Hawks
Done with 1984-85! :-) 


Grabbing the stats from the 1985-8

Grabbing the stats from the 1993-94 season.
Writing stats for the Phoenix Suns
Writing stats for the Charlotte Hornets
Writing stats for the Golden State Warriors
Writing stats for the Portland Trail Blazers
Writing stats for the Seattle SuperSonics
Writing stats for the Sacramento Kings
Writing stats for the Indiana Pacers
Writing stats for the Cleveland Cavaliers
Writing stats for the Atlanta Hawks
Writing stats for the Los Angeles Clippers
Writing stats for the Utah Jazz
Writing stats for the San Antonio Spurs
Writing stats for the Orlando Magic
Writing stats for the Denver Nuggets
Writing stats for the Chicago Bulls
Writing stats for the Philadelphia 76ers
Writing stats for the Los Angeles Lakers
Writing stats for the Houston Rockets
Writing stats for the Boston Celtics
Writing stats for the Miami Heat
Writing stats for the New Jersey Nets
Writing stats for the Milwaukee Bucks
Writing stats for the Washington Bullets
Writing stats for the New York Knicks
Writing stats for the Detro

Grabbing the stats from the 2001-02 season.
Writing stats for the Sacramento Kings
Writing stats for the Milwaukee Bucks
Writing stats for the Los Angeles Lakers
Writing stats for the Dallas Mavericks
Writing stats for the Toronto Raptors
Writing stats for the Orlando Magic
Writing stats for the Minnesota Timberwolves
Writing stats for the Seattle SuperSonics
Writing stats for the Houston Rockets
Writing stats for the Utah Jazz
Writing stats for the Denver Nuggets
Writing stats for the San Antonio Spurs
Writing stats for the Detroit Pistons
Writing stats for the Portland Trail Blazers
Writing stats for the Philadelphia 76ers
Writing stats for the Boston Celtics
Writing stats for the Phoenix Suns
Writing stats for the Washington Wizards
Writing stats for the Indiana Pacers
Writing stats for the Golden State Warriors
Writing stats for the Los Angeles Clippers
Writing stats for the Cleveland Cavaliers
Writing stats for the New Jersey Nets
Writing stats for the Charlotte Hornets
Writing st

That's it! We've written the code to a file and the file is now stored in the directory of the jupyter notebook. If you were to take this code and run it on your local machine you two would have a freshly written file on your hard drive. Now we're surfing the web!

## Loading and Cleaning the Data

Now that we've hung ten surfing the world wide web with python, we should probably check that we got the data we actually wanted! Let's load up the csv using pandas and then check it out.

In [22]:
a = pd.read_csv("NBA_Team_PerGame_Stats.csv")
len(a.columns)
a.head()

Unnamed: 0,Season,Team,Games,MP,FG,FGA,FG_Perc,3P,3PA,3P_Perc,...,FT_Perc,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1974-75,Buffalo Braves,82,241.8,45.5,94.7,0.48,,,,...,0.765,14.0,34.5,48.5,26.4,9.6,7.3,22.3,22.9,111.6
1,1974-75,Golden State Warriors,82,240.3,45.4,97.8,0.464,,,,...,0.778,16.8,37.0,53.8,24.3,8.1,5.5,20.3,23.1,109.9
2,1974-75,Los Angeles Lakers,82,240.9,43.1,95.2,0.453,,,,...,0.769,16.6,36.2,52.9,26.6,9.7,8.0,23.3,24.8,109.2
3,1974-75,Boston Celtics,82,241.2,44.3,97.2,0.456,,,,...,0.8,16.8,37.5,54.3,26.7,6.8,3.7,21.9,22.8,109.0
4,1974-75,Atlanta Hawks,82,240.9,43.9,94.4,0.465,,,,...,0.752,15.1,33.1,48.2,24.3,9.2,4.0,22.2,25.3,108.6


In [6]:
a.tail()

Unnamed: 0,Season,Team,Games,MP,FG,FGA,FG_Perc,3P,3PA,3P_Perc,...,FT_Perc,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
1218,2019-20,Chicago Bulls,77,242.9,39.8,87.7,0.454,9.2,26.2,0.353,...,0.787,8.8,34.1,42.8,22.0,7.4,4.3,14.4,20.4,105.1
1219,2019-20,Cleveland Cavaliers,77,241.0,38.8,87.5,0.443,10.4,29.2,0.356,...,0.794,10.6,32.0,42.5,20.8,6.5,2.4,13.5,20.3,104.5
1220,2019-20,Miami Heat,76,240.3,39.5,87.7,0.451,11.3,32.2,0.352,...,0.693,11.3,34.9,46.2,24.2,7.7,5.5,14.7,21.0,105.6
1221,2019-20,New York Knicks,76,241.3,38.3,88.8,0.432,9.9,29.3,0.338,...,0.761,10.7,33.9,44.7,19.9,7.0,5.0,14.1,21.2,104.8
1222,2019-20,Memphis Grizzlies,77,242.3,37.8,84.2,0.449,9.5,28.3,0.337,...,0.768,8.8,32.8,41.5,23.9,8.4,5.5,14.1,22.1,102.9


In [23]:
# Note that we won't see a description of 3 point stats yet
# This is because the 3 pointer was not implemented until
# 1979. We'll have to change the currently recorded 'None'
# value to an na in order to get a description.
a.describe()

Unnamed: 0,Games,MP,FG,FGA,FG_Perc,2P,2PA,2P_Perc,FT,FTA,FT_Perc,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0,1223.0
mean,80.717907,241.661897,39.165576,84.61619,0.462434,34.708913,71.904334,0.483273,19.580131,25.959853,0.754504,12.652412,30.448324,43.099019,23.198038,8.166476,5.049223,15.989779,22.568684,102.372281
std,5.443632,0.839679,3.43396,5.141355,0.021135,5.805299,12.020909,0.02241,2.53091,3.24135,0.028591,2.035316,2.16604,2.489687,2.621564,1.152435,0.977403,2.261353,2.380986,6.936812
min,50.0,240.0,30.8,71.2,0.401,23.1,41.9,0.414,12.2,16.6,0.66,7.6,24.9,35.6,15.6,5.5,2.4,11.2,16.6,81.9
25%,82.0,241.2,36.5,80.8,0.447,30.1,62.5,0.468,17.7,23.7,0.737,11.1,29.0,41.4,21.2,7.3,4.3,14.4,20.8,97.2
50%,82.0,241.5,38.7,84.1,0.461,32.2,67.5,0.482,19.4,25.8,0.756,12.5,30.2,42.9,23.0,8.0,5.0,15.5,22.5,102.2
75%,82.0,242.1,42.0,88.1,0.476,40.5,83.3,0.498,21.3,28.2,0.773,14.1,31.75,44.6,25.1,8.9,5.6,17.2,24.2,107.3
max,82.0,244.9,48.5,108.1,0.545,48.2,98.2,0.565,29.1,37.2,0.832,18.5,40.4,54.3,31.4,12.9,8.7,24.5,30.1,126.5


In [14]:
a.loc[a['3P'] == 'None',['3P','3PA','3P_Perc']] = np.nan
a['3P'].to_int()

AttributeError: 'Series' object has no attribute 'to_int'