In [1]:
import os
import pandas as pd
import numpy as np

#### data directories

In [2]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))
finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))
errorLog = os.path.abspath(os.path.join(dataDir,'error_log'))

#### read player bio data

In [43]:
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
df = pd.read_csv(file)

print("No of distinct players:",df['index'].nunique())

No of distinct players: 4800


In [44]:
df.head()

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State"


#### scraping player game logs

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

#### BASIC STATS
# https://www.basketball-reference.com/players/a/abdelal01.html
# https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/

#### ADVANCED STATS
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-advanced/1992/
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs-advanced/

In [6]:


def get_reg_season_game_logs(player_idx,letter,from_year,to_year):
    reg_season_game_logs = None
    yearly_dfs = []
    
    # SCRAPING REGULAR SEASON GAMES
    
    for year in range(from_year, to_year+1):
        url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
        webpage = urlopen(url)
        html = BeautifulSoup(webpage)
        tables = html.findAll('table')
        '''
        if len(tables) > 0:
            table = str(tables[-1])
            yearly_game_log = pd.read_html(table)[0]
            yearly_game_log["PLAYOFF"] = 'N'
            yearly_dfs.append(yearly_game_log)
        else: pass
        '''
      
        if len(tables)==0:
            if year > 1976: continue
            else:
                aba_url = url + "/aba/"
                webpage = urlopen(aba_url)
                html = BeautifulSoup(webpage)
                aba_tables = html.findAll('table')

                if len(aba_tables)==0: continue
                else: 
                    table = str(aba_tables[-1])
                    yearly_game_log = pd.read_html(table)[0]
                    yearly_game_log["PLAYOFF"] = 'N'
                    yearly_game_log["LEAGUE"] = 'ABA'

        else:
            table = str(tables[-1])
            yearly_game_log = pd.read_html(table)[0]
            yearly_game_log["PLAYOFF"] = 'N'
            yearly_game_log["LEAGUE"] = 'NBA'

        yearly_dfs.append(yearly_game_log)        

    reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True,sort=False)
    reg_season_game_logs["PLAYOFF"] = 'N'
    reg_season_game_logs['SERIES'] = np.nan

    reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                      'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                      'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)  
    
    return reg_season_game_logs

In [6]:


def get_reg_season_game_logs(player_idx,letter,from_year,to_year):
    reg_season_game_logs = None
    yearly_dfs = []
    
    # SCRAPING REGULAR SEASON GAMES
    
    for year in range(from_year, to_year+1):
        url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
        webpage = urlopen(url)
        html = BeautifulSoup(webpage)
        tables = html.findAll('table')
        '''
        if len(tables) > 0:
            table = str(tables[-1])
            yearly_game_log = pd.read_html(table)[0]
            yearly_game_log["PLAYOFF"] = 'N'
            yearly_dfs.append(yearly_game_log)
        else: pass
        '''
      
        if len(tables)==0:
            if year > 1976: continue
            else:
                aba_url = url + "/aba/"
                webpage = urlopen(aba_url)
                html = BeautifulSoup(webpage)
                aba_tables = html.findAll('table')

                if len(aba_tables)==0: continue
                else: 
                    table = str(aba_tables[-1])
                    yearly_game_log = pd.read_html(table)[0]
                    yearly_game_log["PLAYOFF"] = 'N'
                    yearly_game_log["LEAGUE"] = 'ABA'

        else:
            table = str(tables[-1])
            yearly_game_log = pd.read_html(table)[0]
            yearly_game_log["PLAYOFF"] = 'N'
            yearly_game_log["LEAGUE"] = 'NBA'

        yearly_dfs.append(yearly_game_log)        

    reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True,sort=False)
    reg_season_game_logs["PLAYOFF"] = 'N'
    reg_season_game_logs['SERIES'] = np.nan

    reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                      'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                      'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)  
    
    return reg_season_game_logs

In [7]:


def get_playoff_game_logs(player_idx,letter):
    
    playoff_game_logs = None
    
    playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
    webpage = urlopen(playoff_url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if len(tables) > 0:
        table = str(html.findAll('table')[7])
        playoff_game_logs = pd.read_html(table)[0]
        playoff_game_logs["PLAYOFF"] = 'Y'
        playoff_game_logs["AGE"] = np.nan

        playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 'Series':'SERIES',
                                          'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                          'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
    
    return playoff_game_logs

In [8]:


def get_career_game_logs(player_data,verbose=True):
    
    logger = f"{errorLog}/log.txt"
    
    # PLAYER DATA
    player_idx = player_data['index']
    letter = player_idx[0]
    player_name = player_data['Player']
    from_yr, to_yr = row[["From","To"]].values

    if verbose:
        print(player_idx,player_name)
    
    # GET REGULAR SEASON GAME LOGS
    try:
        reg_season_game_logs = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)
    except:
        reg_season_game_logs = None
        with open(logger,"a") as logs: 
            logs.write(f"{player_idx} , {player_name}, error in regular season game logs \n")

    # GET PLAYOFF GAME LOGS
    try:
        playoff_game_logs = get_playoff_game_logs(player_idx,letter)
    except:
        playoff_game_logs = None
        with open(logger,"a") as logs: 
            logs.write(f"{player_idx} , {player_name}, error in playoff game logs \n")
            
        
    try:  
        # CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
        career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True,sort=False)
        career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')

        # FORMAT CAREER GAME LOGS, SORT BY DATE
        career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
        career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]

        career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
        career_game_logs['INDEX'] = player_idx
        career_game_logs['NAME']  = player_name

        career_game_logs.sort_values('DATE',inplace=True)
        career_game_logs.reset_index(drop=True,inplace=True)

    except:
        career_game_logs = None
        with open(logger,"a") as logs: 
            logs.write(f"{player_idx} , {player_name}, no playoff OR regular season data \n")
        


    return career_game_logs
    

In [17]:
k, w = 4000, 5000
player_dfs = []
for num,(index, row) in enumerate(df.iloc[k:w].iterrows()):
    print(k + num)
    
    if (num + 1) % 10 == 0:
        game_logs = pd.concat([player for player in player_dfs],ignore_index=True,sort=False)
        game_logs.to_csv(f"{rawDataDir}/player_game_logs.csv",index=False)
        print("df shape",game_logs.shape)
    
    career_game_logs = get_career_game_logs(row)
    player_dfs.append(career_game_logs)
    
    try:
        print(career_game_logs.shape)
    except:
        print("both dfs are None")
    
game_logs = pd.concat([player for player in player_dfs],ignore_index=True,sort=False)
print("\n")
print(game_logs.shape)

game_logs.to_csv(f"{rawDataDir}/player_game_logs.csv",index=False)

4000
snellto01 Tony Snell
(602, 37)
4001
snower01 Eric Snow
(1050, 36)
4002
snydedi01 Dick Snyder
(995, 32)
4003
snydeki01 Kirk Snyder
(327, 36)
4004
sobekch01 Chips Sobek
(63, 26)
4005
soberri01 Ricky Sobers
(850, 35)
4006
sobiero01 Ron Sobie
(192, 25)
4007
sojoumi01 Mike Sojourner
(191, 29)
4008
sojouwi01 Willie Sojourner
(340, 34)
4009
df shape (4610, 37)
solomwi01 Will Solomon
(160, 35)
4010
somerwi01 Willie Somerset
(145, 32)
4011
songada01 Darius Songaila
(676, 36)
4012
sorenda01 Dave Sorenson
(212, 25)
4013
southja01 James Southerland
(25, 35)
4014
sovragi01 Gino Sovran
(6, 25)
4015
sowpa01 Pape Sow
(193, 35)
4016
spainke01 Ken Spain
(11, 31)
4017
spaldra01 Ray Spalding
(74, 35)
4018
spanaji01 Jim Spanarkel
(259, 35)
4019
df shape (6371, 37)
spanova01 Vassilis Spanoulis
(83, 36)
4020
sparkda01 Daniel Sparks
(67, 32)
4021
sparrgu01 Guy Sparrow
(149, 25)
4022
sparrro01 Rory Sparrow
(866, 35)
4023
spearod01 Odie Spears
(510, 26)
4024
spectar01 Art Spector
(169, 26)
4025
speigma01 M

In [21]:
print(game_logs.shape)

(251227, 37)


##### scraping errata

In [64]:
#to_scrape = pd.read_csv(f"{interimDataDir}/error_indices.csv")
#scrape_idx = list(to_scrape['index'].values)
scrape_idx = ['anderer01',
 'bowmair01',
 'brindau01',
 'carnebo01',
 'colonjo01',
 'corleke01',
 'dickehe01',
 'flemial01',
 'henrysk01',
 'jacksaa01',
 'jentch01',
 'johnstr01',
 'killuea01',
 'kingda01',
 'kitchcu01',
 'klotzhe01',
 'lowerch01',
 'mahonmo01',
 'napolpa01',
 'raikesh01',
 'rockge01',
 'stockda01',
 'widbyro01',
 'winklma01',
 'yonakri01']

scrape_df = df[df['index'].isin(scrape_idx)]

In [65]:
scrape_df.head()

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
89,anderer01,Eric Anderson,1993,1994,F,6-9,220.0,"May 26, 1970",Indiana
429,bowmair01,Ira Bowman,2000,2002,G,6-5,195.0,"June 11, 1973","Providence, University of Pennsylvania"
488,brindau01,Aud Brindley,1947,1947,F,6-4,175.0,"December 31, 1923",Dartmouth College
662,carnebo01,Bob Carney,1955,1955,G,6-3,170.0,"August 3, 1932",Bradley
804,colonjo01,Joe Colone,1949,1949,F,6-5,210.0,"January 23, 1924",Bloomsburg University of Pennsylvania


In [54]:
player_dfs = []
for num,(index, row) in enumerate(scrape_df.iterrows()):
    print(num)
    
    career_game_logs = get_career_game_logs(row)
    player_dfs.append(career_game_logs)
    
    try:
        print(career_game_logs.shape)
    except:
        print("both dfs are None")
    
game_logs = pd.concat([player for player in player_dfs],ignore_index=True,sort=False)
print("\n")
print(game_logs.shape)

#game_logs.to_csv(f"{interimDataDir}/player_game_logs - errata.csv",index=False)

0
alarima01 Mark Alarie
(329, 35)
1
alcorga01 Gary Alcorn
(79, 25)
2
aldemfu01 Furkan Aldemir
(59, 35)
3
aldrico01 Cole Aldrich
(657, 37)
4
aldrila01 LaMarcus Aldridge
(1183, 36)
5
aleksch01 Chuck Aleksinas
(74, 34)
6
alexacl01 Cliff Alexander
(82, 35)
7
anderer01 Eric Anderson
(0, 35)
8
anigbik01 Ike Anigbogu
(136, 35)
9
babbch01 Chris Babb
(23, 35)
10
bairsca01 Cameron Bairstow
(164, 35)
11
bambamo01 Mohamed Bamba
(147, 35)
12
bembrde01 DeAndre' Bembry
(313, 35)
13
bowmair01 Ira Bowman
(0, 36)
14
brindau01 Aud Brindley
(0, 26)
15
buyckdw01 Dwight Buycks
(170, 35)
16
cabocbr01 Bruno Caboclo
(428, 35)
17
carnebo01 Bob Carney
(0, 26)
18
casspom01 Omri Casspi
(765, 35)
19
cavanty01 Tyler Cavanaugh
(155, 35)
20
chiozch01 Chris Chiozza
(69, 35)
21
chrisra01 Rakeem Christmas
(164, 35)
22
colonjo01 Joe Colone
(0, 26)
23
colsobo01 Bonzie Colson
(40, 35)
24
copato01 Tom Copa
(33, 34)
25
copelch01 Chris Copeland
(333, 37)
26
copelho01 Hollis Copeland
(92, 34)
27
copella01 Lanard Copeland
(33, 3

In [60]:
game_logs[game_logs['INDEX']=='corbity01'].shape

(1199, 37)

In [63]:
game_logs[game_logs['INDEX']=='carnebo01'].shape

(0, 37)

In [56]:
game_logs.shape

(22090, 37)

In [13]:
|

SyntaxError: invalid syntax (<ipython-input-13-4b37ef281455>, line 1)

In [None]:

k = 45
player_data = df.iloc[k]

player_idx = player_data['index']
letter = player_idx[0]
player_name = player_data['Player']
from_yr, to_yr = player_data[["From","To"]].values

print(player_name)
'''
reg = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)
plf = get_playoff_game_logs(player_idx,letter)

print(reg.shape)
print(plf.shape)
'''

In [None]:
yearly_dfs = []
for year in range(from_yr, to_yr+1):
    url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
    webpage = urlopen(url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')

    if len(tables)==0:
        if year > 1976: continue
        else:
            aba_url = url + "/aba/"
            webpage = urlopen(aba_url)
            html = BeautifulSoup(webpage)
            aba_tables = html.findAll('table')
      
            if len(aba_tables)==0: continue
            else: 
                table = str(aba_tables[-1])
                yearly_game_log = pd.read_html(table)[0]
                yearly_game_log["PLAYOFF"] = 'N'
                yearly_game_log["LEAGUE"] = 'ABA'
                
    else:
        table = str(tables[-1])
        yearly_game_log = pd.read_html(table)[0]
        yearly_game_log["PLAYOFF"] = 'N'
        yearly_game_log["LEAGUE"] = 'NBA'

    yearly_dfs.append(yearly_game_log)
    print(len(yearly_game_log))


reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True)
reg_season_game_logs["PLAYOFF"] = 'N'
reg_season_game_logs['Series'] = np.nan

reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                  'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)  

In [None]:
reg_season_game_logs

In [None]:
url,playoff_url

In [None]:
# SCRAPING PLAYOFF GAMES
playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
webpage = urlopen(playoff_url)
html = BeautifulSoup(webpage)
#print(playoff_url)

table = str(html.findAll('table')[7])
playoff_game_logs = pd.read_html(table)[0]
playoff_game_logs["PLAYOFF"] = 'Y'
playoff_game_logs["AGE"] = np.nan

playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                  'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]

In [None]:
"""
row = df.iloc[1]

player_idx = row['index']
letter = player_idx[0]
player_name = row['Player']
print(letter,player_idx,player_name)

from_yr, to_yr = row[["From","To"]].values
yearly_dfs = []

# SCRAPING REGULAR SEASON GAMES
for year in range(from_yr, to_yr+1):
    url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
    webpage = urlopen(url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if len(tables)==0: continue
    else:
        table = str(tables[-1])
        yearly_game_log = pd.read_html(table)[0]
        yearly_game_log["PLAYOFF"] = 'N'

        yearly_dfs.append(yearly_game_log)
        #print(url)


reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True)
reg_season_game_logs["PLAYOFF"] = 'N'
reg_season_game_logs['Series'] = np.nan

reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                  'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)    

# SCRAPING PLAYOFF GAMES
playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
webpage = urlopen(playoff_url)
html = BeautifulSoup(webpage)
#print(playoff_url)

table = str(html.findAll('table')[7])
playoff_game_logs = pd.read_html(table)[0]
playoff_game_logs["PLAYOFF"] = 'Y'
playoff_game_logs["AGE"] = np.nan

playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                  'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]

# CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True)
career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')

# FORMAT CAREER GAME LOGS, SORT BY DATE
career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]

career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
career_game_logs['INDEX'] = player_idx
career_game_logs['NAME']  = player_name

career_game_logs.sort_values('DATE',inplace=True)
career_game_logs.reset_index(drop=True,inplace=True)
"""