# NBA Data
*Webscraping to obtain desired NBA Data for University of Utah MATH 4100/COMP 5360 Final Project.* <br>
<br>
Palani Thangaraj<br>
Thomas Pembroke<br>

In [7]:
# imports and setup 
from bs4 import BeautifulSoup
#  either of these libraries can be used to get html from a website
import time
import os

import pandas as pd
import scipy as sc
import numpy as np

import statsmodels.formula.api as sm

import matplotlib.pyplot as plt 
plt.style.use('ggplot')
%matplotlib inline  
plt.rcParams['figure.figsize'] = (10, 6) 

# where the data is stored
TEAM_DATA_PATH = "team_data"
CONTRACT_DATA_PATH = "contract_data"
PLAYER_DATA_PATH = "player_data"

## Part 1: NBA Player Contracts
Create a dataframe that holds each players average annual salaray for the past 10 completed seasons (2008-2018)

In [2]:
# array to hold BeautifulSoup objects for player contracts
contract_soups = []

for folder in os.listdir(CONTRACT_DATA_PATH):
    if folder.endswith((".html")):
        fpath = os.path.join(CONTRACT_DATA_PATH, folder)
        with open(fpath, encoding="utf-8", errors='ignore') as new_file:
            html = new_file.read()
            contract_soups.append(BeautifulSoup(html, "html.parser"))
print(len(contract_soups))

194


In [3]:
# arrays to store column data
YR = []     # year
RK = []     # rank
NAME = []   # name
POS = []    # position
TEAM = []   # team
SALARY = [] # salary

for soup in contract_soups:  
    year = soup.find_all(class_="mod-container mod-no-header-footer mod-page-header")[0].get_text().strip().split()[4]
    for player_odd in soup.find_all(True, {'class':['oddrow', 'evenrow']}): #(class_="oddrow" or class_"evenrow):
        YR.append(year)
        RK.append(player_odd.find_all("td")[0].get_text().strip())
        NAME.append(player_odd.find_all("a")[0].get_text().strip())
        POS.append(player_odd.find_all("td")[1].get_text().strip().split()[2])
        TEAM.append(player_odd.find_all("td")[2].get_text().strip())
        SALARY.append(player_odd.find_all("td")[3].get_text().strip())
        
PlayerContracts = pd.DataFrame()
PlayerContracts['YR'] = pd.Series(YR)
PlayerContracts['RK'] = pd.Series(RK)
PlayerContracts['NAME'] = pd.Series(NAME)
PlayerContracts['POS'] = pd.Series(POS)
PlayerContracts['TEAM'] = pd.Series(TEAM)
PlayerContracts['SALARY'] = pd.Series(SALARY)

PlayerContracts.head()

Unnamed: 0,YR,RK,NAME,POS,TEAM,SALARY
0,2000-2001Season:,1,Kevin Garnett,PF,Minnesota Timberwolves,"$19,600,000"
1,2000-2001Season:,2,Shaquille O'Neal,C,Los Angeles Lakers,"$19,285,000"
2,2000-2001Season:,3,Alonzo Mourning,C,Miami Heat,"$16,879,000"
3,2000-2001Season:,4,Juwan Howard,PF,Washington Wizards,"$16,875,000"
4,2000-2001Season:,5,Hakeem Olajuwon,C,Houston Rockets,"$16,685,000"


## Part 2: NBA Team Data
Create a dataframe that holds each teams season stats (i.e. records, ppg, etc.) for the past 17 completed seasons (2000-2018)

In [4]:
# array to hold BeautifulSoup objects for each season
team_soups = []

for folder in os.listdir(TEAM_DATA_PATH):
    if folder.endswith((".html")):
        fpath = os.path.join(TEAM_DATA_PATH, folder)
        with open(fpath, encoding="utf-8") as new_file:
            html = new_file.read()
        team_soups.append(BeautifulSoup(html, "html.parser"))
print(len(team_soups))

18


In [5]:
# arrays to store column data
YR = []     # year
RANK = []   # season rank
TEAM = []   # team
GP = []     # games played
W = []      # wins
L = []      # losses
W_per = []  # win percent
PTS = []    # average pts/game
FGM = []    # field goals made
FGA = []    # field goals attempted
FG_per = [] # field goal percent
TPM = []    # 3 points made
TPA = []    # 3 points attempted
TP_per = [] # 3 point percent
FTM = []    # free throw made
FTA = []    # free throw attempted
FT_per = [] # free throw percent
OREB = []   # offensive rebounds
DREB = []   # defensive rebounds
REB = []    # total rebounds
AST = []    # assists
TOV = []    # turnovers
STL = []    # steals
BLK = []    # blocks
BLKA = []   # blocked field goal attempts
PF = []     # personal fouls
PFD = []    # personal fouls drawn
PTS = []    # points
PM = []     # plus minus

for soup in team_soups:  
    for team in soup.find_all(class_="nba-stat-table__overflow"):
        for stats in team.find_all("tr", {"data-ng-repeat" : "(i, row) in page track by row.$hash"}):
            YR.append(soup.find("span", {"class" : "stats-filter-pill__text"}).get_text().strip())
            RANK.append(stats.find_all("td")[0].get_text().strip())
            TEAM.append(stats.find_all("td")[1].get_text().strip())
            GP.append(stats.find_all("td")[2].get_text().strip())
            W.append(stats.find_all("td")[3].get_text().strip())
            L.append(stats.find_all("td")[4].get_text().strip())
            W_per.append(stats.find_all("td")[5].get_text().strip())
            PTS.append(stats.find_all("td")[7].get_text().strip())
            FGM.append(stats.find_all("td")[8].get_text().strip())
            FGA.append(stats.find_all("td")[9].get_text().strip())
            FG_per.append(stats.find_all("td")[10].get_text().strip())
            TPM.append(stats.find_all("td")[11].get_text().strip())
            TPA.append(stats.find_all("td")[12].get_text().strip())
            TP_per.append(stats.find_all("td")[13].get_text().strip())
            FTM.append(stats.find_all("td")[14].get_text().strip())
            FTA.append(stats.find_all("td")[15].get_text().strip())
            FT_per.append(stats.find_all("td")[16].get_text().strip())
            OREB.append(stats.find_all("td")[17].get_text().strip())
            DREB.append(stats.find_all("td")[18].get_text().strip())
            REB.append(stats.find_all("td")[19].get_text().strip())
            AST.append(stats.find_all("td")[20].get_text().strip())
            TOV.append(stats.find_all("td")[21].get_text().strip())
            STL.append(stats.find_all("td")[22].get_text().strip())
            BLK.append(stats.find_all("td")[23].get_text().strip())
            BLKA.append(stats.find_all("td")[24].get_text().strip())
            PF.append(stats.find_all("td")[25].get_text().strip())
            PFD.append(stats.find_all("td")[26].get_text().strip())
            PM.append(stats.find_all("td")[27].get_text().strip())

TeamStats = pd.DataFrame()
TeamStats['YR'] = pd.Series(YR)
TeamStats['RANK'] = pd.Series(RANK)
TeamStats['TEAM'] = pd.Series(TEAM)
TeamStats['GP'] = pd.Series(GP)
TeamStats['W'] = pd.Series(W)
TeamStats['L'] = pd.Series(L)
TeamStats['W_per'] = pd.Series(W_per)
TeamStats['PTS'] = pd.Series(PTS)
TeamStats['FGM'] = pd.Series(FGM)
TeamStats['FGA'] = pd.Series(FGA)
TeamStats['FG_per'] = pd.Series(FG_per)
TeamStats['TPM'] = pd.Series(TPM)
TeamStats['TPA'] = pd.Series(TPA)
TeamStats['TP_per'] = pd.Series(TP_per)
TeamStats['FTM'] = pd.Series(FTM)
TeamStats['FTA'] = pd.Series(FTA)
TeamStats['FT_per'] = pd.Series(FT_per)
TeamStats['OREB'] = pd.Series(OREB)
TeamStats['DREB'] = pd.Series(DREB)
TeamStats['REB'] = pd.Series(REB)
TeamStats['AST'] = pd.Series(AST)
TeamStats['TOV'] = pd.Series(TOV)
TeamStats['STL'] = pd.Series(STL)
TeamStats['BLK'] = pd.Series(BLK)
TeamStats['BLKA'] = pd.Series(BLKA)
TeamStats['PF'] = pd.Series(PF)
TeamStats['PFD'] = pd.Series(PFD)
TeamStats['PM'] = pd.Series(PM)

TeamStats.head()

Unnamed: 0,YR,RANK,TEAM,GP,W,L,W_per,PTS,FGM,FGA,...,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PM
0,2000-01,1,San Antonio Spurs,82,58,24,0.707,96.2,35.2,76.4,...,33.1,44.1,21.7,14.0,6.9,7.0,5.2,18.9,0.0,7.8
1,2000-01,2,Los Angeles Lakers,82,56,26,0.683,100.6,37.9,81.5,...,31.5,44.7,23.0,14.4,6.9,6.0,4.0,22.8,0.1,3.4
2,2000-01,2,Philadelphia 76ers,82,56,26,0.683,94.7,35.4,79.1,...,31.7,44.8,20.6,15.8,8.4,5.0,5.6,20.4,0.0,4.3
3,2000-01,4,Sacramento Kings,82,55,27,0.671,101.7,38.2,85.0,...,33.0,45.0,22.6,14.9,9.7,5.3,5.8,19.5,0.0,5.8
4,2000-01,5,Dallas Mavericks,82,53,29,0.646,100.5,37.6,81.9,...,31.4,41.5,21.2,13.9,7.5,6.0,4.6,23.3,0.1,4.3


## Part 3: NBA Player Data
Create a dataframe that holds each players personal stats (FT %, ppg, etc.) for the past 10 completed seasons (2008-2018)

In [8]:
# array to hold BeautifulSoup objects for each season
player_soups = []

for folder in os.listdir(PLAYER_DATA_PATH):
    if folder.endswith((".html")):
        fpath = os.path.join(PLAYER_DATA_PATH, folder)
        with open(fpath, encoding="utf-8") as new_file:
            html = new_file.read()
        player_soups.append(BeautifulSoup(html, "html.parser"))
print(len(player_soups))

175


In [9]:
# arrays to store column data
YR = []     # year
PLAYER = [] # player
TEAM = []   # team
AGE = []    # age
GP = []     # games played
W = []      # wins
L = []      # losses
MIN = []    # minutes
PTS = []    # average pts/game
FGM = []    # field goals made
FGA = []    # field goals attempted
FG_per = [] # field goal percent
TPM = []    # 3 points made
TPA = []    # 3 points attempted
TP_per = [] # 3 point percent
FTM = []    # free throw made
FTA = []    # free throw attempted
FT_per = [] # free throw percent
OREB = []   # offensive rebounds
DREB = []   # defensive rebounds
REB = []    # total rebounds
AST = []    # assists
TOV = []    # turnovers
STL = []    # steals
BLK = []    # blocks
PF = []     # personal fouls
DD2 = []    # double doubles
TD3 = []    # triple doubles
PM = []     # plus minus

for soup in player_soups:  
    for player in soup.find_all(class_="nba-stat-table__overflow"):
        for stats in player.find_all("tr", {"data-ng-repeat" : "(i, row) in page track by ::row.$hash"}):
            YR.append(soup.find("span", {"class" : "stats-filter-pill__text"}).get_text().strip())
            PLAYER.append(stats.find_all("td")[1].get_text().strip())
            TEAM.append(stats.find_all("td")[2].get_text().strip())
            AGE.append(stats.find_all("td")[3].get_text().strip())
            GP.append(stats.find_all("td")[4].get_text().strip())
            W.append(stats.find_all("td")[5].get_text().strip())
            L.append(stats.find_all("td")[6].get_text().strip())
            MIN.append(stats.find_all("td")[7].get_text().strip())
            PTS.append(stats.find_all("td")[8].get_text().strip())
            FGM.append(stats.find_all("td")[9].get_text().strip())
            FGA.append(stats.find_all("td")[10].get_text().strip())
            FG_per.append(stats.find_all("td")[11].get_text().strip())
            TPM.append(stats.find_all("td")[12].get_text().strip())
            TPA.append(stats.find_all("td")[13].get_text().strip())
            TP_per.append(stats.find_all("td")[14].get_text().strip())
            FTM.append(stats.find_all("td")[16].get_text().strip())
            FTA.append(stats.find_all("td")[17].get_text().strip())
            FT_per.append(stats.find_all("td")[18].get_text().strip())
            OREB.append(stats.find_all("td")[19].get_text().strip())
            DREB.append(stats.find_all("td")[20].get_text().strip())
            REB.append(stats.find_all("td")[21].get_text().strip())
            AST.append(stats.find_all("td")[22].get_text().strip())
            TOV.append(stats.find_all("td")[23].get_text().strip())
            STL.append(stats.find_all("td")[24].get_text().strip())
            BLK.append(stats.find_all("td")[25].get_text().strip())
            PF.append(stats.find_all("td")[26].get_text().strip())
            DD2.append(stats.find_all("td")[27].get_text().strip())
            TD3.append(stats.find_all("td")[28].get_text().strip())
            PM.append(stats.find_all("td")[29].get_text().strip())
print("done")
PlayerStats = pd.DataFrame()
PlayerStats['YR'] = pd.Series(YR)
PlayerStats['PLAYER'] = pd.Series(PLAYER)
PlayerStats['TEAM'] = pd.Series(TEAM)
PlayerStats['AGE'] = pd.Series(AGE)
PlayerStats['GP'] = pd.Series(GP)
PlayerStats['W'] = pd.Series(W)
PlayerStats['L'] = pd.Series(L)
PlayerStats['MIN'] = pd.Series(MIN)
PlayerStats['PTS'] = pd.Series(PTS)
PlayerStats['FGM'] = pd.Series(FGM)
PlayerStats['FGA'] = pd.Series(FGA)
PlayerStats['FG_per'] = pd.Series(FG_per)
PlayerStats['TPM'] = pd.Series(TPM)
PlayerStats['TPA'] = pd.Series(TPA)
PlayerStats['TP_per'] = pd.Series(TP_per)
PlayerStats['FTM'] = pd.Series(FTM)
PlayerStats['FTA'] = pd.Series(FTA)
PlayerStats['FT_per'] = pd.Series(FT_per)
PlayerStats['OREB'] = pd.Series(OREB)
PlayerStats['DREB'] = pd.Series(DREB)
PlayerStats['REB'] = pd.Series(REB)
PlayerStats['AST'] = pd.Series(AST)
PlayerStats['TOV'] = pd.Series(TOV)
PlayerStats['STL'] = pd.Series(STL)
PlayerStats['BLK'] = pd.Series(BLK)
PlayerStats['PF'] = pd.Series(PF)
PlayerStats['DD2'] = pd.Series(DD2)
PlayerStats['TD3'] = pd.Series(TD3)
PlayerStats['PM'] = pd.Series(PM)


PlayerStats.head()

done


Unnamed: 0,YR,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,...,DREB,REB,AST,TOV,STL,BLK,PF,DD2,TD3,PM
0,2000-01,Allen Iverson,PHI,26,71,50,21,41.9,31.1,10.7,...,3.8,4.6,3.3,2.5,0.3,2.1,47.6,4,0,5.2
1,2000-01,Jerry Stackhouse,DET,26,80,31,49,40.1,29.8,9.7,...,3.9,5.1,4.1,1.2,0.7,2.0,43.8,5,1,-0.8
2,2000-01,Shaquille O'Neal,LAL,29,74,51,23,39.5,28.7,11.0,...,12.7,3.7,2.9,0.6,2.8,3.5,56.8,60,0,6.3
3,2000-01,Kobe Bryant,LAL,22,68,45,23,41.0,28.5,10.3,...,5.9,5.0,3.2,1.7,0.6,3.3,46.7,10,2,5.4
4,2000-01,Vince Carter,TOR,24,75,45,30,39.7,27.6,10.2,...,5.5,3.9,2.2,1.5,1.1,2.7,45.7,10,0,4.3


In [10]:
print(PlayerStats)

           YR                PLAYER TEAM AGE  GP   W   L   MIN   PTS   FGM  \
0     2000-01         Allen Iverson  PHI  26  71  50  21  41.9  31.1  10.7   
1     2000-01      Jerry Stackhouse  DET  26  80  31  49  40.1  29.8   9.7   
2     2000-01      Shaquille O'Neal  LAL  29  74  51  23  39.5  28.7  11.0   
3     2000-01           Kobe Bryant  LAL  22  68  45  23  41.0  28.5  10.3   
4     2000-01          Vince Carter  TOR  24  75  45  30  39.7  27.6  10.2   
5     2000-01          Chris Webber  SAC  28  70  48  22  40.5  27.1  11.2   
6     2000-01         Tracy McGrady  ORL  22  77  39  38  40.2  26.8  10.2   
7     2000-01           Paul Pierce  BOS  23  82  36  46  38.1  25.3   8.4   
8     2000-01        Antawn Jamison  GSW  25  82  17  65  41.4  24.9   9.8   
9     2000-01       Stephon Marbury  NJN  24  67  22  45  38.2  23.9   8.4   
10    2000-01        Antoine Walker  BOS  24  81  35  46  41.9  23.4   8.8   
11    2000-01           Karl Malone  UTA  37  81  52  29  35.7  

## Part 4: Compile Dataframes
Complile NBA player contracts and stats to the same dataframes and ensure consistency across all dataframes

## Part 5: Save Dataframes to CSV files

In [11]:
PlayerStats.to_csv('PlayerStats_raw.csv')
TeamStats.to_csv('TeamStats_Raw.csv')
PlayerContracts.to_csv('PlayerContracts_Raw.csv')

print("done")

done
