In [1]:
import os
import pandas as pd
import numpy as np

##### Data Directories

In [2]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))

finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))
errorLog = os.path.abspath(os.path.join(dataDir,'error_log'))

##### Scraping Functions

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from datetime import datetime

#### BASIC STATS
# https://www.basketball-reference.com/players/a/abdelal01.html
# https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/

#### ADVANCED STATS
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-advanced/1992/
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs-advanced/

##### PLAYOFF URLs
# f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
# f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs-advanced/"

In [4]:
'''
1. Identify player to be scraped
2. Identify active years of players career 
3. scrape regular season logs
4. scrape playoff logs
5. handle errors gracefully

Don't forget to handle ABA players and to scrape regular 
AND advanced game logs
'''

"\n1. Identify player to be scraped\n2. Identify active years of players career \n3. scrape regular season logs\n4. scrape playoff logs\n5. handle errors gracefully\n\nDon't forget to handle ABA players and to scrape regular \nAND advanced game logs\n"

In [5]:
def scrapeURL(URL: str):
    '''scapes a generic basketball reference URL. Returns a pandas DataFrame
    '''
    webpage = urlopen(URL)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if not tables:
        # return empty dataframe
        return pd.Series([])
    
    scrapedData = str(tables[-1])
    gamelogs = pd.read_html(scrapedData)[0]
    
    return gamelogs

In [6]:
def scrapeRegularSeason(URL: str, year: int, isNBA=True):
    ''' scapes the regular season game logs of a given NBA or ABA player for a given year
    '''
    gamelogs = scrapeURL(URL)
    
    if not gamelogs.empty:
        gamelogs.dropna(axis=0, how="all", inplace=True)
        
        gamelogs["PLAYOFF"] = 'N'
        
        if isNBA:
            gamelogs["LEAGUE"] = 'NBA'
        else:
            gamelogs["LEAGUE"] = 'ABA'
            
        # drop redundant header rows    
        gamelogs = gamelogs[gamelogs["Date"] != "Date"]
        gamelogs.reset_index(drop=True,inplace=True)
        
    # scrape ABA data if tables is empty and year is 1976 or earlier
    if gamelogs.empty and isNBA and year <= 1976:
        return scrapeRegularSeason(URL + "/aba/", year, isNBA=False)
    
    if gamelogs.empty:
        errorLogger(URL + ", failed to scrape regular season data")
        
    return gamelogs

In [7]:
def scrapePlayoffs(URL: str):
    ''' scapes playoff gamelogs for a given player
        playoffs game logs are not grouped by year and both NBA & ABA
        games are listed
    '''
    playoffGamelogs = scrapeURL(URL)
    
    if not playoffGamelogs.empty:
        playoffGamelogs.dropna(axis=0, how="all", inplace=True)
        
        playoffGamelogs["PLAYOFF"] = 'Y'
        #playoffGamelogs["AGE"] = np.nan

        # rename columns
        ### TO DO: improve how playoff columns are renamed
        playoffGamelogs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 'Series':'SERIES',
                                          'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT'
                                          ,playoffGamelogs.columns[2]:'DATE',
                                          'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}
                               , inplace=True)
        
        # drop redundant headers
        playoffGamelogs = playoffGamelogs[playoffGamelogs["OPPONENT"] != "Opp"]
        playoffGamelogs.reset_index(drop=True,inplace=True)
        
    else:
        errorLogger(URL + ", failed to scrape playoff data")
        
    return playoffGamelogs

In [8]:
def errorLogger(message: str, directory=errorLog):
    logger = f"{directory}/errorlog.txt"
    time = datetime.now().strftime("%m/%d/%Y %H:%M:%S")

    with open(logger,"a") as logs: 
        line = time + "," + message + ", \n"
        logs.write(line)
    

In [9]:
def merge(left: pd.DataFrame, right: pd.DataFrame, joinCols: list):
    '''helper function that merges two dataframes and drops duplicated columns'''
    
    df = pd.merge(left, right, how="outer", on=joinCols, suffixes=("","_dup"))
    keepCols = [col for col in df.columns if "_dup" not in col]
    df = df[keepCols]
    
    return df
    

In [46]:
def scrapePlayerGamelogs(playerIdx: str, playerURL: str, years: tuple):
    ''' scrapes the complete career (reg. season + playoffs) NBA + ABA gamelogs of a given player
        results include basic and advanced states for all regular season + playoff games
    '''
    basicPrev, advPrev = pd.Series([]), pd.Series([])
    start, end = years
    
    baseURL = 'https://www.basketball-reference.com'
    playerURL = playerURL.strip('.html')
    URL = baseURL + playerURL 
    
    playoffsURL = "/gamelog-playoffs/"
    playoffsAdvURL = "/gamelog-playoffs-advanced/"
    
    for year in range(start, end + 1):
        # get regular season data (reg. season data is displayed by year)
        basicURL = f'/gamelog/{year}'
        advancedURL = f'/gamelog-advanced/{year}' 
        
        # regular season data
        basicRegularSeason = scrapeRegularSeason(URL + basicURL, year)
        advancedRegularSeason = scrapeRegularSeason(URL + advancedURL, year)
        
        # df = pd.concat([prev,basicRegularSeason, advancedRegularSeason],ignore_index=True,sort=False)
        basicReg = pd.concat([basicPrev,basicRegularSeason], ignore_index=True, sort=False)
        advReg = pd.concat([advPrev,advancedRegularSeason], ignore_index=True, sort=False)
        
        basicPrev, advPrev = basicReg, advReg
    
    # merge basic + advanced regular season logs. Drop duplicated columns
    regSeason = merge(basicReg, advReg, ["Date","Tm","Opp"])
    
    # playoff data (NOTE: playoffs are NOT shown by year on Bball ref)
    basicPlayoffs = scrapePlayoffs(URL + playoffsURL)
    advancedPlayoffs = scrapePlayoffs(URL + playoffsAdvURL)
    
    # merge basic + advanced playoff season logs. Drop duplicated columns
    playoffs = merge(basicPlayoffs, advancedPlayoffs, ["DATE","TEAM","OPPONENT"])
                       
    df = pd.concat([regSeason,playoffs], ignore_index=True, sort=False)
    df["ID"] = playerIdx
    
    return df

##### Testing

In [11]:
##### read player bios

file = f"{rawDataDir}/consolidatedBioData.csv"
bio = pd.read_csv(file)

print(bio.shape)

(4803, 12)


In [29]:
bio[bio['ID']=='abdelal01']
#bio[bio["ID"]=="ervinju01"]

Unnamed: 0,ID,Player,From,To,Pos,Ht,Ht (cm),Wt,Birth Date,Colleges,HOF,URL
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,208.28,240.0,"June 24, 1968",Duke,N,/players/a/abdelal01.html


In [30]:
bio.head()

Unnamed: 0,ID,Player,From,To,Pos,Ht,Ht (cm),Wt,Birth Date,Colleges,HOF,URL
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,208.28,240.0,"June 24, 1968",Duke,N,/players/a/abdelal01.html
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,205.74,235.0,"April 7, 1946",Iowa State,N,/players/a/abdulza01.html
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,218.44,225.0,"April 16, 1947",UCLA,Y,/players/a/abdulka01.html
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,185.42,162.0,"March 9, 1969",LSU,N,/players/a/abdulma02.html
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,198.12,223.0,"November 3, 1974","Michigan, San Jose State",N,/players/a/abdulta01.html


In [47]:
idx = "abdelal01"
playerURL = "/players/a/abdelal01.html"
yrs = (1991,1995)

'''
idx = "ervinju01"
playerURL = "/players/e/ervinju01.html"
yrs = (1972,1987)
'''
idx = "abdulma02"
playerURL = "/players/a/abdulma02.html"
yrs = (1991,2001)


In [48]:
df = scrapePlayerGamelogs(idx,playerURL,yrs)

print("no. of games:",df.shape)
'''
location = rawDataDir + "/testPlayer.csv"
df.to_csv(location,index=False)
'''

no. of games: (663, 55)


'\nlocation = rawDataDir + "/testPlayer.csv"\ndf.to_csv(location,index=False)\n'

In [50]:
df["PLAYOFF"].value_counts()

N    648
Y     15
Name: PLAYOFF, dtype: int64

In [52]:
df['ID'].value_counts()

abdulma02    663
Name: ID, dtype: int64

In [43]:
df[df["PLAYOFF"]=="N"].shape

(648, 54)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 663 entries, 0 to 662
Data columns (total 54 columns):
0             0 non-null float64
3P            663 non-null object
3P%           520 non-null object
3PA           663 non-null object
AST           663 non-null object
Age           648 non-null object
BLK           663 non-null object
DRB           663 non-null object
Date          648 non-null object
FG            663 non-null object
FG%           661 non-null object
FGA           663 non-null object
FT            663 non-null object
FT%           445 non-null object
FTA           663 non-null object
G             601 non-null object
GS            663 non-null object
GmSc          648 non-null object
LEAGUE        648 non-null object
MP            663 non-null object
ORB           663 non-null object
Opp           648 non-null object
PF            663 non-null object
PLAYOFF       663 non-null object
PTS           663 non-null object
Rk            663 non-null object
STL         

In [15]:
|

SyntaxError: invalid syntax (<ipython-input-15-4b37ef281455>, line 1)

In [None]:
baseURL = 'https://www.basketball-reference.com'
playerURL = playerURL.strip('.html')
URL = baseURL + playerURL 
year = yrs[0]
    
playoffsURL = "/gamelog-playoffs/"
playoffsAdvURL = "/gamelog-playoffs-advanced/"
    
# get regular season data (reg. season data is displayed by year)
basicURL = f'/gamelog/{year}'
advancedURL = f'/gamelog-advanced/{year}' 

In [None]:
URL + playoffsAdvURL

In [None]:
# playoff data (NOTE: playoffs are NOT displayed by year)
basic = scrapePlayoffs(URL + playoffsURL)
adv = scrapePlayoffs(URL + playoffsAdvURL)

'''
# regular season data
basic = scrapeRegularSeason(URL + basicURL, year)
adv = scrapeRegularSeason(URL + advancedURL, year)
'''

print(basic.shape)
print(adv.shape)

In [None]:
basic.head()

In [None]:
adv.head()

In [None]:
#joinCols = ["Date","Tm","Opp"]
joinCols = ["DATE","TEAM","OPPONENT"]
test = pd.merge(basic,adv,how="outer",on=joinCols,suffixes=("","_dup"))
print(test.shape)

keepCols = [col for col in test.columns if "_dup" not in col]
test = test[keepCols]
print(test.shape)

In [None]:
test.head()

In [None]:
|

In [None]:
year = 1991
baseURL = 'https://www.basketball-reference.com'

playerURL = '/players/a/abdelal01.html'
playerURL = playerURL.strip('.html')

basicURL = f'/gamelog/{year}'
advancedURL = f'/gamelog-advanced/{year}'
playoffURL =

In [None]:
reg = scrapeRegularSeason(URL,year)
playoffs = scrapePlayoffs()
df = scrapePlayerCareer(idx,playerURL,yrs)

print(reg.shape)
print(playoffs.shape)
print(df.shape)