#### Set up the environment

In [2]:
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import re
import numpy as np
import pandas as pd

In [3]:
import patsy
import scipy.stats as stats
from scipy.stats import boxcox 
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

Set up the web scraper for www.basketball-reference.com to collect basketball stats and data, using Google Chrome.

In [2]:
# Path to the ChromeDriver binary (hard-coded local path; adjust per machine).
chromedriver = '/Applications/chromedriver'
# Expose the driver path through an environment variable (presumably for
# Selenium; no webdriver is actually created in this cell).
os.environ['webdriver.chrome.driver'] = chromedriver

# Single player page, fetched as a quick connectivity/markup test.
urltest = "https://www.basketball-reference.com/players/c/curryst01.html"
restest = requests.get(urltest)
# Player contracts listing; `res` is what the cells below parse.
urlmain = "https://www.basketball-reference.com/contracts/players.html"
res = requests.get(urlmain)
# Leftover Selenium-based alternative (unused):
# players = bs(driver.page_source, 'html.parser')

In [260]:
robot = "https://www.basketball-reference.com/robots.txt"

# Fetch robots.txt once and reuse the response for both the status check here
# and the text printed in the next cell (a second GET is unnecessary traffic).
response = requests.get(robot)
response.status_code

200

In [None]:
# Show the robots.txt body held in `response` so the crawl rules can be read.
print(response.text)

In [261]:
# Parse the contracts page HTML held in `res` with the html5lib parser.
soup = bs(res.text, "html5lib")

In [262]:
# Keep the last <tbody> on the page (presumably the player contracts table).
soup_p = soup.find_all("tbody")[-1]

The web scraper below collects NBA player names and their profile URLs as a list of dictionaries, which becomes the basis for collecting all the remaining information.

In [None]:
# Re-parse the contracts page and select its last <tbody>, so this cell can be
# run without the two cells above it.
soup = bs(res.text, "html5lib")
soup_p = soup.find_all("tbody")[-1]

def linkcollect(s):
    """Build one player record from a row of the contracts table.

    Parameters
    ----------
    s : tag-like object
        A table row exposing ``find`` / ``find_all`` (a BeautifulSoup ``tr``).

    Returns
    -------
    dict
        Keys ``link`` (href of the first anchor in the first ``td.left``),
        ``pname`` (text of the last anchor in that same cell), ``team``
        (text of the first anchor in the second ``td.left``) and ``salary``
        (text of the first ``td.right``).
    """
    name_cell = s.find("td", class_='left')
    profile_href = name_cell.find('a').get('href')
    display_name = name_cell.find_all('a')[-1].text
    team_text = s.find_all("td", class_='left')[1].find('a').text
    salary_text = s.find('td', class_='right').text
    return {
        'link': profile_href,
        'pname': display_name,
        'team': team_text,
        'salary': salary_text,
    }

player_dict = []

# The table body comes in groups of 22 rows: 20 player rows followed by 2
# non-player rows (presumably repeated column headers) at offsets 20 and 21.
ROWS_PER_GROUP = 22
NON_PLAYER_OFFSETS = (20, 21)
# Upper bound on the number of rows read from the table.
MAX_ROWS = 590

# Look the rows up once instead of re-scanning the table on every iteration;
# slicing also avoids an IndexError when the page has fewer than MAX_ROWS rows.
contract_rows = soup_p.find_all("tr")[:MAX_ROWS]

for i, s in enumerate(contract_rows):
    if i % ROWS_PER_GROUP in NON_PLAYER_OFFSETS:
        continue
    player_dict.append(linkcollect(s))

# One summary line instead of one printed index per row.
print(len(player_dict))

In [312]:
# One row per player, indexed by name, so the CSV's first column is `pname`.
playerurls = pd.DataFrame(player_dict).set_index('pname')
playerurls.to_csv("playerurls.csv")
# Last expression of the cell, so the preview is actually rendered.
playerurls.head()

In [28]:
# Reload the saved player list; read back this way, `pname` is a regular
# column and the frame has a default integer index.
playerurls = pd.read_csv('playerurls.csv')

Using the collected NBA player names and their corresponding URLs, the helper functions below scrape the per-season stats of each player.

In [860]:
# (output key, data-stat attribute) pairs for the per-game cells copied into
# each season record, listed in output column order.
_PER_GAME_STATS = [
    ('age', 'age'),
    ('team', 'team_id'),
    ('pos', 'pos'),
    ('gameplayed', 'g'),
    ('gamestarted', 'gs'),
    ('minpergame', 'mp_per_g'),
    ('fgmade', 'fg_per_g'),
    ('fgattempt', 'fga_per_g'),
    ('fgpct', 'fg_pct'),
    ('threemade', 'fg3_per_g'),
    ('threeattempt', 'fg3a_per_g'),
    ('threepct', 'fg3_pct'),
    ('twomade', 'fg2_per_g'),
    ('twoattempt', 'fg2a_per_g'),
    ('twopct', 'fg2_pct'),
    ('efgpct', 'efg_pct'),
    ('ftmade', 'ft_per_g'),
    ('ftattempt', 'fta_per_g'),
    ('ftpct', 'ft_pct'),
    ('offreb', 'orb_per_g'),
    ('defreb', 'drb_per_g'),
    ('allreb', 'trb_per_g'),
    ('assist', 'ast_per_g'),
    ('steals', 'stl_per_g'),
    ('blocks', 'blk_per_g'),
    ('turnov', 'tov_per_g'),
    ('pfouls', 'pf_per_g'),
    ('points', 'pts_per_g'),
]


def _cell_text(row, stat):
    """Return the text of the row's ``td`` with ``data-stat == stat``, or '' if absent."""
    cell = row.select_one('td[data-stat="%s"]' % stat)
    return cell.text if cell is not None else ''


def playerseason(x):
    """Scrape one player's per-game table into a list of season records.

    Parameters
    ----------
    x : str
        Site-relative player URL (e.g. ``/players/c/curryst01.html``); it is
        appended to ``https://www.basketball-reference.com``.

    Returns
    -------
    list of dict
        One dict per row of the ``per_game`` table, excluding its first and
        last ``tr`` (presumably the header and the career summary). Keys are
        ``player_name``, ``height``, ``weight``, ``season`` followed by the
        keys of ``_PER_GAME_STATS``. All values are strings; a cell missing
        from a row yields ''. Returns ``[]`` when the page has no
        ``per_game`` table.

    Raises
    ------
    AttributeError
        If the page lacks the player bio block (name, height or weight).
    """
    website = 'https://www.basketball-reference.com'
    res = requests.get(website + x)
    soup = bs(res.text, 'html5lib')
    bio = soup.select_one('div[itemtype="https://schema.org/Person"]')

    # Bio fields are identical for every season record of this player.
    player_name = bio.find('h1').find('span').text
    height = bio.select_one('span[itemprop="height"]').text
    weight = bio.select_one('span[itemprop="weight"]').text

    per_game = soup.select_one('table[id="per_game"]')
    if per_game is None:
        return []

    dict_list = []
    # Fetch the rows once; the first and last rows are not season rows.
    rows = per_game.find_all('tr')
    for row in rows[1:-1]:
        # Season is the last four characters of the row id; rows without an
        # id get an empty season.
        row_id = row.get('id')
        record = {
            'player_name': player_name,
            'height': height,
            'weight': weight,
            'season': row_id[-4:] if row_id else '',
        }
        for header, stat in _PER_GAME_STATS:
            record[header] = _cell_text(row, stat)
        dict_list.append(record)

    return dict_list

In [26]:
def playersalary(x):
    """Scrape one player's salary history into a list of records.

    The salary table is read from the HTML comment inside
    ``div#all_all_salaries`` and parsed separately.

    Parameters
    ----------
    x : str
        Site-relative player URL; it is appended to
        ``https://www.basketball-reference.com``.

    Returns
    -------
    list of dict
        One dict per salary row, excluding the table's first and last ``tr``,
        with keys ``player_name``, ``season`` (int: the first four characters
        of the row text plus one), ``team`` (text of the row's first anchor,
        '' if none) and ``salary`` (text of the row's last ``td``). Returns
        ``[]`` when the page has no ``per_game`` table or no salary block, so
        callers always get a list.

    Raises
    ------
    AttributeError
        If the page lacks the player bio block.
    ValueError
        If a salary row's text does not start with a four-digit year.
    """
    website = 'https://www.basketball-reference.com'
    res = requests.get(website + x)
    soup = bs(res.text, 'html5lib')
    bio = soup.select_one('div[itemtype="https://schema.org/Person"]')
    player_name = bio.find('h1').find('span').text

    dict_list = []
    if soup.select_one('table[id="per_game"]') is None:
        return dict_list

    # The salary table lives inside an HTML comment; bail out cleanly when
    # either the wrapper div or the comment is missing.
    cbody = soup.select_one('div[id="all_all_salaries"]')
    if cbody is None:
        return dict_list
    comment = cbody.find(text=lambda text: isinstance(text, Comment))
    if comment is None:
        return dict_list

    saltable = bs(comment, 'html5lib').find_all('tr')
    for row in saltable[1:-1]:
        row_text = row.text.strip()
        season = int(row_text[:4]) + 1

        team_link = row.find('a')
        team = team_link.text if team_link is not None else ''

        # Read the salary from the row's own cells. Searching a soup rebuilt
        # from the row's plain text can never find a <td>, which forced the
        # fixed-width text fallback on every row.
        cells = row.find_all('td')
        salary = cells[-1].text if cells else row_text[-11:]

        dict_list.append({
            'player_name': player_name,
            'season': season,
            'team': team,
            'salary': salary,
        })

    return dict_list

In [None]:
# Scrape every player page; each call returns that player's season records.
playerstats = [playerseason(link) for link in playerurls.link]

# Flatten the per-player lists into one list of records, skipping any None.
pstats = []
for season_rows in playerstats:
    if season_rows is not None:
        pstats.extend(season_rows)

dfp = pd.DataFrame(pstats)

In [33]:
# Scrape every player page; each call returns that player's salary records
# (or None when the scraper found nothing to return).
psal2 = [playersalary(link) for link in playerurls.link]

# Flatten the per-player lists into one list of records, skipping any None.
psalary = []
for salary_rows in psal2:
    if salary_rows is not None:
        psalary.extend(salary_rows)

dfs = pd.DataFrame(psalary)

Checkpoint: Save and load data

In [35]:
# Persist the scraped frames so the slow scraping cells need not be re-run.
# The default index is written as the first CSV column.
dfp.to_csv("stats.csv")
dfs.to_csv("salaries.csv")

In [405]:
# Reload the checkpointed frames from disk.
dfp = pd.read_csv("stats.csv")
dfs = pd.read_csv("salaries.csv")

In [406]:
# Remove the saved index that read_csv brings back as 'Unnamed: 0'. Dropping
# by name with errors='ignore' makes the cell safe to re-run; dropping the
# first column by position would remove a real data column the second time.
dfp = dfp.drop(columns='Unnamed: 0', errors='ignore')

In [407]:
# Remove the saved index that read_csv brings back as 'Unnamed: 0'. Dropping
# by name with errors='ignore' makes the cell safe to re-run; dropping the
# first column by position would remove a real data column the second time.
dfs = dfs.drop(columns='Unnamed: 0', errors='ignore')