In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time
import os
import re

In [348]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to point at a different binary; the original hard-coded path is
# kept as the fallback so existing setups keep working.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/home/michael/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

# Browser options passed to the drivers created in the scraping functions:
# run Chrome without a visible window and disable DNS prefetching.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--dns-prefetch-disable')

# 1.0 Scraping Data

## 1.1 Get List of URL Extensions for every player
First, pull the URL extension of every draft-year page from the draft index.

Then `get_player_url_list` takes that list, loads each draft page, and collects the player entries listed on it.

In [311]:
# Open the draft index page and collect the URL extension of each draft-year page.
driver = webdriver.Chrome(chromedriver)
try:
    driver.get('https://www.basketball-reference.com/draft/')
    time.sleep(3)  # fixed pause to let the page finish loading

    year_cells = driver.find_elements_by_xpath('//th[@data-stat="year_id"]')
    # The first matching cell is skipped (presumably the column heading -- verify)
    # and the next 40 are kept. Each cell's innerHTML is split on double quotes;
    # the second piece is the quoted link target.
    draftyears = [x.get_attribute('innerHTML').split('"')[1] for x in year_cells[1:41]]
finally:
    # Always shut the browser down, even if the page failed to load or parse.
    driver.quit()

In [328]:
def get_player_url_list(draftyears, wait_seconds=5):
    """
    Scrape the player cells from each Basketball Reference page in ``draftyears``.

    Parameters
    ----------
    draftyears : iterable of str
        URL extensions appended to ``https://www.basketball-reference.com/``.
    wait_seconds : int or float, optional
        Seconds to sleep after requesting each page before reading it.
        Defaults to 5, the pause that was previously hard-coded.

    Returns
    -------
    playerlist : list of str
        ``innerHTML`` of every ``<td data-stat="player">`` cell found, in page order.
    pagesloaded : list of str
        The extensions whose cells were read without error, for troubleshooting.

    Raises
    ------
    Exception
        Whatever the driver raises if a page cannot be requested; the browser
        opened for that page is shut down before the error propagates.
    """
    playerlist = []
    pagesloaded = []
    for extension in draftyears:
        # A fresh browser is started for every page, as before.
        driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
        try:
            driver.get('https://www.basketball-reference.com/' + extension)
            time.sleep(wait_seconds)
            try:
                cells = driver.find_elements_by_xpath('//td[@data-stat="player"]')
                cell_html = [cell.get_attribute('innerHTML') for cell in cells]
            except Exception:
                # Best effort: a page whose cells cannot be read contributes
                # nothing and is left out of pagesloaded.
                cell_html = []
            else:
                pagesloaded.append(extension)
            playerlist.extend(cell_html)
        finally:
            # quit() ends the whole browser session; this runs even when the
            # page request above raised, so no browser process is left behind.
            driver.quit()
    return playerlist, pagesloaded
        

In [329]:
# Scrape the player cells from every draft-year page. `links` holds the raw cell
# HTML and `completeyears` the pages that were read without error.
# The [:41] slice caps the input at 41 pages.
links,completeyears = get_player_url_list(draftyears[:41])

len(links)

In [331]:
completeyears

['/draft/NBA_2017.html',
 '/draft/NBA_2016.html',
 '/draft/NBA_2015.html',
 '/draft/NBA_2014.html',
 '/draft/NBA_2013.html',
 '/draft/NBA_2012.html',
 '/draft/NBA_2011.html',
 '/draft/NBA_2010.html',
 '/draft/NBA_2009.html',
 '/draft/NBA_2008.html',
 '/draft/NBA_2007.html',
 '/draft/NBA_2006.html',
 '/draft/NBA_2005.html',
 '/draft/NBA_2004.html',
 '/draft/NBA_2003.html',
 '/draft/NBA_2002.html',
 '/draft/NBA_2001.html',
 '/draft/NBA_2000.html',
 '/draft/NBA_1999.html',
 '/draft/NBA_1998.html',
 '/draft/NBA_1997.html',
 '/draft/NBA_1996.html',
 '/draft/NBA_1995.html',
 '/draft/NBA_1994.html',
 '/draft/NBA_1993.html',
 '/draft/NBA_1992.html',
 '/draft/NBA_1991.html',
 '/draft/NBA_1990.html',
 '/draft/NBA_1989.html',
 '/draft/NBA_1988.html',
 '/draft/NBA_1987.html',
 '/draft/NBA_1986.html',
 '/draft/NBA_1985.html',
 '/draft/NBA_1984.html',
 '/draft/NBA_1983.html',
 '/draft/NBA_1982.html',
 '/draft/NBA_1981.html',
 '/draft/NBA_1980.html',
 '/draft/NBA_1979.html',
 '/draft/NBA_1978.html']

In [334]:
links

['<a href="/players/f/fultzma01.html">Markelle Fultz</a>',
 '<a href="/players/b/balllo01.html">Lonzo Ball</a>',
 '<a href="/players/t/tatumja01.html">Jayson Tatum</a>',
 '<a href="/players/j/jacksjo02.html">Josh Jackson</a>',
 '<a href="/players/f/foxde01.html">De\'Aaron Fox</a>',
 '<a href="/players/i/isaacjo01.html">Jonathan Isaac</a>',
 '<a href="/players/m/markkla01.html">Lauri Markkanen</a>',
 '<a href="/players/n/ntilila01.html">Frank Ntilikina</a>',
 '<a href="/players/s/smithde03.html">Dennis Smith</a>',
 '<a href="/players/c/colliza01.html">Zach Collins</a>',
 '<a href="/players/m/monkma01.html">Malik Monk</a>',
 '<a href="/players/k/kennalu01.html">Luke Kennard</a>',
 '<a href="/players/m/mitchdo01.html">Donovan Mitchell</a>',
 '<a href="/players/a/adebaba01.html">Bam Adebayo</a>',
 '<a href="/players/j/jacksju01.html">Justin Jackson</a>',
 '<a href="/players/p/pattoju01.html">Justin Patton</a>',
 '<a href="/players/w/wilsodj01.html">D.J. Wilson</a>',
 '<a href="/players/l/l

The scraped entries include non-link cells and raw HTML markup, which need to be removed so that only the URL extensions remain.

In [354]:
# Keep only the entries that are anchor tags, and from each one take the text
# between the first pair of double quotes (the link target).
linksclean = [entry.split('"')[1] for entry in links if entry.startswith('<a')]
len(linksclean)

2196

## 1.2 Scrape the data into Data Frames
First, define a function that takes a list of player URL extensions and scrapes each of those player pages.

In [384]:
def draft_list_load(sitelist,playerdf,advstatsdf,colstatsdf):
    """
    Scrape every player page in ``sitelist`` and add the results to three DataFrames.

    Parameters
    ----------
    sitelist : iterable of str
        Player URL extensions appended to ``https://www.basketball-reference.com/``.
    playerdf : pandas.DataFrame
        Previously scraped player info rows (one row per player).
    advstatsdf : pandas.DataFrame
        Previously scraped advanced-stats rows.
    colstatsdf : pandas.DataFrame
        Previously scraped college-stats rows.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(playerdf, advstatsdf, colstatsdf)`` with the newly scraped rows placed
        after the existing ones. The input frames are not modified in place, and
        a frame that gains no rows is returned as passed in.

    Raises
    ------
    Exception
        Whatever the driver raises when a page cannot be requested or has no
        name heading. The browser opened for that page is shut down first.
    """
    # Imported locally so this cell does not depend on a change to the import cell.
    from io import StringIO

    player_columns = ['player', 'height', 'weight', 'draftnbr', 'draftyr', 'recruitnbr']

    def page_scrape_player(driver, name):
        """
        Build a one-row DataFrame of player info from the page loaded in ``driver``.
        Any value that cannot be found or parsed is recorded as 0.
        """
        # Height is read as "<feet>-<inches>" and stored as decimal feet.
        try:
            height = driver.find_element_by_xpath('//span[@itemprop="height"]').text
            height = height.split('-')
            height = int(height[0]) + (int(height[1]) / 12)
        except Exception:
            height = 0

        # Weight text has its last two characters dropped before conversion.
        try:
            weight = driver.find_element_by_xpath('//span[@itemprop="weight"]').text
            weight = int(weight[:-2])
        except Exception:
            weight = 0

        # Draft number, draft year and recruit rank come from the larger info
        # section, which is broken into lines here. `info` starts as an empty
        # list so the recruit lookup below still works when this section fails.
        info = []
        try:
            info = driver.find_element_by_xpath('//div[@id="info"]').text.split('\n')
            draftstr = [line for line in info if 'Draft:' in line]
            draftstr = draftstr[0].split(',')

            # Third comma-separated piece: leading token minus its two-letter suffix.
            draftnbr = draftstr[2].strip().split(' ')[0]
            draftnbr = int(draftnbr[:-2])

            # Fourth comma-separated piece: leading token is the year.
            draftyr = int(draftstr[3].strip().split(' ')[0])
        except Exception:
            draftnbr = 0
            draftyr = 0

        try:  # Not all players have a recruit number
            recruitstr = [line for line in info if 'Recruiting' in line]
            # Last space-separated token with its first and last characters removed.
            recruitnbr = int(recruitstr[0].split(' ')[-1][1:-1])
        except Exception:
            recruitnbr = 0

        record = {
            'player': name,
            'height': height,
            'weight': weight,
            'draftnbr': draftnbr,
            'draftyr': draftyr,
            'recruitnbr': recruitnbr,
        }
        return pd.DataFrame(record, index=[0], columns=player_columns)

    def page_scrape_advstats(driver, name):
        """
        Read the advanced stats table from the page loaded in ``driver``.
        Returns an empty DataFrame when the table is missing or cannot be parsed.
        """
        try:
            table_html = driver.find_element_by_xpath('//table[@id="advanced"]').get_attribute('outerHTML')
            # read_html is given a buffer: literal HTML strings are deprecated.
            dfadvstats = pd.read_html(StringIO(table_html), header=0)[0]
            # Drop the last row, then any column that is entirely empty.
            dfadvstats = dfadvstats[:-1].dropna(axis=1, how='all')
            dfadvstats['player'] = name  # lets the tables be aligned later
        except Exception:
            dfadvstats = pd.DataFrame()
        return dfadvstats

    def page_scrape_collegestats(driver, name):
        """
        Read the college stats table from the page loaded in ``driver``.
        Returns an empty DataFrame when the table is missing or cannot be parsed.
        """
        try:
            table_html = driver.find_element_by_xpath('//table[@id="all_college_stats"]').get_attribute('outerHTML')
            collegedf = pd.read_html(StringIO(table_html), header=0)[0]
            # Keep rows with at least 4 non-null values, then promote the first
            # remaining row to column labels and drop the row labelled 0.
            collegedf = collegedf.dropna(axis=0, thresh=4)
            collegedf.columns = collegedf.iloc[0]
            collegedf = collegedf.reindex(collegedf.index.drop(0))
            collegedf['name'] = name  # lets the tables be aligned later
        except Exception:
            collegedf = pd.DataFrame()
        return collegedf

    def add_rows(base, new_frames):
        """Return ``base`` followed by ``new_frames``, concatenated in one pass."""
        # Frames without columns (failed table scrapes, or an untouched empty
        # accumulator) carry no data, so they are left out of the concat.
        parts = [frame for frame in [base] + new_frames if len(frame.columns) > 0]
        if not parts:
            return base
        if len(parts) == 1:
            return parts[0]
        return pd.concat(parts)

    new_players = []
    new_advstats = []
    new_colstats = []

    # This is the section that does the iteration.
    # Drop chrome_options from the call below to see the browser in action.
    for extension in sitelist:
        driver = webdriver.Chrome(chromedriver,chrome_options=chrome_options)
        try:
            driver.get('https://www.basketball-reference.com/'+extension)
            time.sleep(3)
            try:
                driver.find_element_by_xpath('//button[@id="meta_more_button"]').click()
            except Exception:
                pass  # no expand button on this page
            time.sleep(1)

            # The name heading is read once and shared by all three scrapers.
            name = driver.find_element_by_xpath('//h1[@itemprop="name"]').text
            new_players.append(page_scrape_player(driver, name))
            new_advstats.append(page_scrape_advstats(driver, name))
            new_colstats.append(page_scrape_collegestats(driver, name))
            time.sleep(1)
        finally:
            # quit() ends the whole browser session; this runs even if the page
            # failed, so no browser process is left behind.
            driver.quit()

    # Frames are collected in lists and concatenated once, instead of growing a
    # DataFrame row by row with DataFrame.append (removed in pandas 2.0).
    playerdf = add_rows(playerdf, new_players)
    advstatsdf = add_rows(advstatsdf, new_advstats)
    colstatsdf = add_rows(colstatsdf, new_colstats)

    return playerdf,advstatsdf,colstatsdf

In [369]:
# Three separate empty frames that the scrape results are accumulated into.
playerdf, advstatsdf, colstatsdf = (pd.DataFrame() for _ in range(3))

### Partition data
I had to partition the data to make it easier to process, so I broke the links into 11 chunks (200 links each, with the remainder in the last chunk).

In [417]:
# Ten consecutive batches of 200 links each, taken from the front of the list...
(load1, load2, load3, load4, load5,
 load6, load7, load8, load9, load10) = (
    linksclean[start:start + 200] for start in range(0, 2000, 200)
)
# ...and a final batch holding everything from position 2000 onwards.
load11 = linksclean[2000:]

In [433]:
# Scrape every batch in order, so a single run of this cell loads all players
# (previously only load11 was named here and the cell had to be edited and
# re-run by hand for each batch). The three frames are reassigned after each
# batch, so if a later batch fails the earlier batches' rows are kept.
for batch in (load1, load2, load3, load4, load5, load6,
              load7, load8, load9, load10, load11):
    playerdf, advstatsdf, colstatsdf = draft_list_load(batch, playerdf, advstatsdf, colstatsdf)

In [434]:
playerdf.shape

(2196, 6)

In [5]:
# Attach each player's URL extension as a new column.
# `.values` hands pandas a plain array, so the assignment is by position rather
# than by index label. This assumes playerdf has one row per entry of linksclean,
# in the same order -- TODO confirm after any partial or repeated scrape.
toadd = pd.Series(linksclean)
playerdf['urlx'] = toadd.values
playerdf.head()

NameError: name 'linksclean' is not defined

In [436]:
advstatsdf.shape

(15748, 28)

In [437]:
colstatsdf.shape

(8142, 27)

In [4]:
# Save the raw scrape results. The output directory is created first so the
# writes do not fail on a fresh checkout where data/ does not exist yet.
os.makedirs('data', exist_ok=True)
playerdf.to_pickle('data/playerdataraw.pkl')
advstatsdf.to_pickle('data/advstatsdataraw.pkl')
colstatsdf.to_pickle('data/colstatsdataraw.pkl')