In [1]:
import os
import pandas as pd
import numpy as np

##### Data Directories

In [2]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))

finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))
errorLog = os.path.abspath(os.path.join(dataDir,'error_log'))

##### Read Player Bios

In [14]:
file = f"{rawDataDir}/consolidatedBioData.csv"
bio = pd.read_csv(file)

print(bio.shape)

(4803, 12)


In [15]:
bio[bio['ID']=='abdelal01'][['ID','URL','From','To']]#.values[0]

Unnamed: 0,ID,URL,From,To
0,abdelal01,/players/a/abdelal01.html,1991,1995


##### Scrape Game Logs

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup


#### BASIC STATS
# https://www.basketball-reference.com/players/a/abdelal01.html
# https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/

#### ADVANCED STATS
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-advanced/1992/
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs-advanced/

In [16]:
'''
1. Identify player to be scraped
2. Identify active years of players career 
3. scrape regular season logs
4. scrape playoff logs
5. handle errors gracefully

Don't forget to handle ABA players and to scrape regular 
AND advanced game logs
'''

"\n1. Identify player to be scraped\n2. Identify active years of players career \n3. scrape regular season logs\n4. scrape playoff logs\n5. handle errors gracefully\n\nDon't forget to handle ABA players and to scrape regular \nAND advanced game logs\n"

In [17]:
bio[bio['ID']=='abdelal01'][['ID','URL','From','To']]

Unnamed: 0,ID,URL,From,To
0,abdelal01,/players/a/abdelal01.html,1991,1995


In [51]:
year = 1991

baseURL = 'https://www.basketball-reference.com'
playerURL = '/players/a/abdelal01.html'
playerURL = playerURL.strip('.html')
basicURL = f'/gamelog/{year}'
advancedURL = f'/gamelog-advanced/{year}'

In [39]:
baseURL + playerURL + basicURL

'https://www.basketball-reference.com/players/a/abdelal01/gamelog/1991'

In [42]:
URL = baseURL + playerURL + advancedURL

In [66]:
URL = baseURL + playerURL + basicURL
df = scrapeLogs(URL,year)

print(df.shape)

(43, 31)


In [69]:
df.head(2)

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,PLAYOFF,LEAGUE
0,1,1,1990-11-02,22-131,POR,,HOU,W (+1),0,5:00,...,2,1,0,0,0,0,0,1.0,N,NBA
1,2,2,1990-11-06,22-135,POR,@,LAL,W (+2),0,4:00,...,0,0,0,0,1,3,0,-2.2,N,NBA


In [65]:
baseURL + playerURL

'https://www.basketball-reference.com/players/a/abdelal01'

In [None]:
# playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
# playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs-advanced/"

In [72]:
bio[bio["ID"]=="ervinju01"]

Unnamed: 0,ID,Player,From,To,Pos,Ht,Ht (cm),Wt,Birth Date,Colleges,HOF,URL
1244,ervinju01,Julius Erving*,1972,1987,F-G,6-7,200.66,210.0,"February 22, 1950",UMass,Y,/players/e/ervinju01.html


In [86]:
idx = "ervinju01"
playerURL = "/players/e/ervinju01.htm"
yrs = (1972,1987)

In [87]:
bio[bio['ID']=='abdelal01']

Unnamed: 0,ID,Player,From,To,Pos,Ht,Ht (cm),Wt,Birth Date,Colleges,HOF,URL
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,208.28,240.0,"June 24, 1968",Duke,N,/players/a/abdelal01.html


In [95]:
idx = "abdelal01"
playerURL = "/players/a/abdelal01.html"
yrs = (1991,1995)

In [96]:
df = scrapePlayerCareer(idx,playerURL,yrs)

print(df.shape)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




(556, 54)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [99]:
df.head().T

Unnamed: 0,0,1,2,3,4
0,,,,,
3P,0,0,0,0,0
3P%,,,,,
3PA,0,0,0,0,0
AST,1,0,1,0,0
AST%,,,,,
Age,22-131,22-135,22-140,22-142,22-144
BLK,0,0,0,0,0
BLK%,,,,,
BPM,,,,,


In [75]:
def scrapePlayerCareer(playerIdx: str, playerURL: str, years: tuple):
    ''' scrapes the career NBA + ABA data of a given player
        results included basic and advanced states and all
        regular season and playoff games
    '''
    
    prev = pd.Series([])
    
    baseURL = 'https://www.basketball-reference.com'
    playerURL = playerURL.strip('.html')
    URL = baseURL + playerURL 
    
    playoffsURL = "/gamelog-playoffs/"
    playoffsAdvURL = "/gamelog-playoffs-advanced/"
    
    for year in range(years[0], years[1] + 1):
        # get regular season data (reg. season data is displayed by year)
        basicURL = f'/gamelog/{year}'
        advancedURL = f'/gamelog-advanced/{year}' 
        
        # regular season data
        basicRegularSeason = scrapeLogs(URL + basicURL, year)
        advancedRegularSeason = scrapeLogs(URL + advancedURL, year)
        
        df = pd.concat([prev,basicRegularSeason, advancedRegularSeason]
                       ,ignore_index=True)
        prev = df
        
    # playoff data (NOTE: playoffs are NOT displayed by year)
    basicPlayoffs = scrapePlayoffs(URL + playoffsURL)
    advancedPlayoffs = scrapePlayoffs(URL + playoffsAdvURL)
                       
    df = pd.concat([df,basicPlayoffs,advancedPlayoffs]
                   ,ignore_index=True)
    
    return df

In [76]:
def scrapeLogs(URL: str, year: int, isNBA=True):
    ''' scapes the regular season game logs of a given NBA or ABA player
        for a given year
    '''
    
    gamelogs = None
            
    webpage = urlopen(URL)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if tables:
        gamelogTable = str(tables[-1])
        gamelogs = pd.read_html(gamelogTable)[0]
        gamelogs["PLAYOFF"] = 'N'
        
        if isNBA:
            gamelogs["LEAGUE"] = 'NBA'
        else:
            gamelogs["LEAGUE"] = 'ABA'
            
        # drop redundant header rows    
        gamelogs = gamelogs[gamelogs["Date"] != "Date"]
        
    # scrape ABA data if tables is empty and year is 1976 or earlier
    if not tables and isNBA and year <= 1976:
        return scrapeLogs(URL + "/aba/", year, isNBA=False)
        
    return gamelogs

In [94]:
def scrapePlayoffs(URL: str):
    ''' scapes playoff gamelogs for a given player
        playoffs game logs are not grouped by year and both NBA & ABA
        games are listed
    '''
    playoffGamelogs = None
    
    webpage = urlopen(URL)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if tables:
        playoffTable = str(tables[-1])
        playoffGamelogs = pd.read_html(playoffTable)[0]
        playoffGamelogs["PLAYOFF"] = 'Y'
        playoffGamelogs["AGE"] = np.nan

        # rename columns
        playoffGamelogs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 'Series':'SERIES',
                                          'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT'
                                          ,playoffGamelogs.columns[2]:'DATE',
                                          'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}
                               , inplace=True)
        
        # drop redundant headers
        playoffGamelogs = playoffGamelogs[playoffGamelogs["OPPONENT"] != "Opp"]
    return playoffGamelogs

In [78]:
def errorLogger(directory: str):
    None