In [1]:
import os
import pandas as pd
import numpy as np

#### data directories

In [2]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))
finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))
errorLog = os.path.abspath(os.path.join(dataDir,'error_log'))

#### read player bio data

In [3]:
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
df = pd.read_csv(file)

print(df.shape)

(4800, 9)


In [4]:
df.head()

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State"


#### scraping player game logs

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

#### BASIC STATS
# https://www.basketball-reference.com/players/a/abdelal01.html
# https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/

#### ADVANCED STATS
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-advanced/1992/
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs-advanced/

In [6]:


def get_reg_season_game_logs(player_idx,letter,from_year,to_year):
    reg_season_game_logs = None
    yearly_dfs = []
    
    # SCRAPING REGULAR SEASON GAMES
    
    for year in range(from_year, to_year+1):
        url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
        webpage = urlopen(url)
        html = BeautifulSoup(webpage)
        tables = html.findAll('table')
        '''
        if len(tables) > 0:
            table = str(tables[-1])
            yearly_game_log = pd.read_html(table)[0]
            yearly_game_log["PLAYOFF"] = 'N'
            yearly_dfs.append(yearly_game_log)
        else: pass
        '''
      
        if len(tables)==0:
            if year > 1976: continue
            else:
                aba_url = url + "/aba/"
                webpage = urlopen(aba_url)
                html = BeautifulSoup(webpage)
                aba_tables = html.findAll('table')

                if len(aba_tables)==0: continue
                else: 
                    table = str(aba_tables[-1])
                    yearly_game_log = pd.read_html(table)[0]
                    yearly_game_log["PLAYOFF"] = 'N'
                    yearly_game_log["LEAGUE"] = 'ABA'

        else:
            table = str(tables[-1])
            yearly_game_log = pd.read_html(table)[0]
            yearly_game_log["PLAYOFF"] = 'N'
            yearly_game_log["LEAGUE"] = 'NBA'

        yearly_dfs.append(yearly_game_log)        

    reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True,sort=False)
    reg_season_game_logs["PLAYOFF"] = 'N'
    reg_season_game_logs['SERIES'] = np.nan

    reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                      'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                      'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)  
    
    return reg_season_game_logs

In [7]:


def get_playoff_game_logs(player_idx,letter):
    
    playoff_game_logs = None
    
    playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
    webpage = urlopen(playoff_url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if len(tables) > 0:
        table = str(html.findAll('table')[7])
        playoff_game_logs = pd.read_html(table)[0]
        playoff_game_logs["PLAYOFF"] = 'Y'
        playoff_game_logs["AGE"] = np.nan

        playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 'Series':'SERIES',
                                          'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                          'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
    
    return playoff_game_logs

In [8]:


def get_career_game_logs(player_data,verbose=True):
    
    logger = f"{errorLog}/log.txt"
    
    # PLAYER DATA
    player_idx = player_data['index']
    letter = player_idx[0]
    player_name = player_data['Player']
    from_yr, to_yr = row[["From","To"]].values

    if verbose:
        print(player_idx,player_name)
    
    # GET REGULAR SEASON GAME LOGS
    try:
        reg_season_game_logs = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)
    except:
        reg_season_game_logs = None
        with open(logger,"a") as logs: 
            logs.write(f"{player_idx} , {player_name}, error in regular season game logs \n")

    # GET PLAYOFF GAME LOGS
    try:
        playoff_game_logs = get_playoff_game_logs(player_idx,letter)
    except:
        playoff_game_logs = None
        with open(logger,"a") as logs: 
            logs.write(f"{player_idx} , {player_name}, error in playoff game logs \n")
            
        
    try:  
        # CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
        career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True,sort=False)
        career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')

        # FORMAT CAREER GAME LOGS, SORT BY DATE
        career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
        career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]

        career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
        career_game_logs['INDEX'] = player_idx
        career_game_logs['NAME']  = player_name

        career_game_logs.sort_values('DATE',inplace=True)
        career_game_logs.reset_index(drop=True,inplace=True)

    except:
        career_game_logs = None
        with open(logger,"a") as logs: 
            logs.write(f"{player_idx} , {player_name}, no playoff OR regular season data \n")
        


    return career_game_logs
    

In [9]:
k, w = 3330, 4000
player_dfs = []
for num,(index, row) in enumerate(df.iloc[k:w].iterrows()):
    print(k + num)
    
    if (num + 1) % 10 == 0:
        game_logs = pd.concat([player for player in player_dfs],ignore_index=True,sort=False)
        game_logs.to_csv(f"{finalDataDir}/player_game_logs.csv",index=False)
        print("df shape",game_logs.shape)
    
    career_game_logs = get_career_game_logs(row)
    player_dfs.append(career_game_logs)
    
    try:
        print(career_game_logs.shape)
    except:
        print("both dfs are None")
    
game_logs = pd.concat([player for player in player_dfs],ignore_index=True,sort=False)
print("\n")
print(game_logs.shape)

game_logs.to_csv(f"{finalDataDir}/player_game_logs.csv",index=False)

3330
pecheol01 Oleksiy Pecherov
(249, 36)
3331
peckwi01 Wiley Peck
(54, 35)
3332
peekri01 Rich Peek
(51, 32)
3333
peelean01 Anthony Peeler
(954, 36)
3334
peeplge01 George Peeples
(329, 34)
3335
pekovni01 Nikola Peković
(476, 35)
3336
pelkija01 Jake Pelkington
(57, 26)
3337
pelleno01 Norvel Pelle
(65, 35)
3338
pellosa01 Sam Pellom
(201, 35)
3339
df shape (2436, 36)
penbemi01 Mike Penberthy
(61, 35)
3340
pendeje01 Jerry Pender
(11, 33)
3341
penigde01 Desmond Penigar
(10, 35)
3342
penneki01 Kirk Penney
(7, 35)
3343
peplomi01 Mike Peplowski
(68, 34)
3344
perduwi01 Will Perdue
(985, 36)
3345
perkike01 Kendrick Perkins
(1190, 37)
3346
perkisa01 Sam Perkins
(1464, 36)
3347
perkiwa01 Warren Perkins
(126, 26)
3348
perovko01 Kosta Perović
(82, 35)
3349
df shape (6440, 37)
perralo01 London Perrantes
(81, 35)
3350
perryau01 Aulcie Perry
(21, 33)
3351
perrycu01 Curtis Perry
(533, 32)
3352
perryel01 Elliot Perry
(631, 36)
3353
perryro01 Ron Perry
(187, 32)
3354
perryti01 Tim Perry
(504, 35)
3355
per

In [10]:
game_logs['NAME'].unique()

array(['Oleksiy Pecherov', 'Wiley Peck', 'Rich Peek', 'Anthony Peeler',
       'George Peeples', 'Nikola Peković', 'Jake Pelkington',
       'Norvel Pelle', 'Sam Pellom', 'Mike Penberthy', 'Jerry Pender',
       'Desmond Penigar', 'Kirk Penney', 'Mike Peplowski', 'Will Perdue',
       'Kendrick Perkins', 'Sam Perkins', 'Warren Perkins',
       'Kosta Perović', 'London Perrantes', 'Aulcie Perry',
       'Curtis Perry', 'Elliot Perry', 'Ron Perry', 'Tim Perry',
       'Chuck Person', 'Wesley Person', 'Alec Peters', 'Jim Petersen',
       'Loy Petersen', 'Bob Peterson', 'Ed Peterson', 'Mel Peterson',
       'Morris Peterson', 'Geoff Petrie', 'Johan Petro',
       'Dražen Petrović*', 'Richard Petruška', 'Bob Pettit*',
       'Jerry Pettway', 'Roger Phegley', 'Jack Phelan', 'James Phelan',
       'Derrick Phelps', 'Michael Phelps', 'Andy Phillip*',
       'Eddie Phillips', 'Gary Phillips', 'Gene Phillips', 'Bobby Phills',
       'Eric Piatkowski', 'Walter Piatkowski', 'Paul Pierce',
       

In [11]:
game_logs.tail()

Unnamed: 0,Rk,G,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,RESULT,GS,MP,...,PTS,GAME_SCORE,+/-,PLAYOFF,LEAGUE,SERIES,G#,INDEX,NAME,Unnamed: 31
213944,33,32,1954-03-08,24-290,BLB,AWAY,MLH,L (-10),,,...,6,,,N,NBA,,,smythjo01,Joe Smyth,
213945,34,33,1954-03-08,24-290,BLB,AWAY,MLH,L (-11),,,...,5,,,N,NBA,,,smythjo01,Joe Smyth,
213946,35,34,1954-03-10,24-292,BLB,HOME,SYR,L (-8),,16:00,...,4,,,N,NBA,,,smythjo01,Joe Smyth,
213947,36,35,1954-03-13,24-295,BLB,AWAY,BOS,L (-12),,,...,3,,,N,NBA,,,smythjo01,Joe Smyth,
213948,37,36,1954-03-14,24-296,BLB,AWAY,FTW,L (-5),,,...,7,,,N,NBA,,,smythjo01,Joe Smyth,


In [12]:
game_logs[game_logs["PLAYOFF"]=='Y']

Unnamed: 0,Rk,G,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,RESULT,GS,MP,...,PTS,GAME_SCORE,+/-,PLAYOFF,LEAGUE,SERIES,G#,INDEX,NAME,Unnamed: 31
82,1,1,2008-04-21,,WAS,AWAY,CLE,L (-30),0,3:13,...,0,0,-6,Y,,EC1,2,pecheol01,Oleksiy Pecherov,
83,2,2,2008-04-24,,WAS,HOME,CLE,W (+36),0,2:58,...,2,2.3,0,Y,,EC1,3,pecheol01,Oleksiy Pecherov,
84,3,3,2008-05-02,,WAS,HOME,CLE,L (-17),0,1:59,...,0,1,7,Y,,EC1,6,pecheol01,Oleksiy Pecherov,
301,1,1,1980-04-02,,SAS,AWAY,HOU,L (-10),,,...,0,,,Y,,EC1,1,peckwi01,Wiley Peck,
302,2,2,1980-04-06,,SAS,AWAY,HOU,L (-21),,,...,0,,,Y,,EC1,3,peckwi01,Wiley Peck,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213834,17,4,1988-05-19,,LAL,AWAY,UTA,L (-28),0,9:00,...,0,-0.8,,Y,,WCS,6,smrekmi01,Mike Smrek,
213835,18,5,1988-05-25,,LAL,HOME,DAL,W (+22),0,4:00,...,2,1.2,,Y,,WCF,2,smrekmi01,Mike Smrek,
213836,19,6,1988-05-31,,LAL,HOME,DAL,W (+17),0,3:00,...,0,0.3,,Y,,WCF,5,smrekmi01,Mike Smrek,
213837,20,7,1988-06-14,,LAL,AWAY,DET,L (-25),0,5:00,...,0,0.6,,Y,,FIN,4,smrekmi01,Mike Smrek,


In [None]:
len(df)

In [13]:
|

SyntaxError: invalid syntax (<ipython-input-13-4b37ef281455>, line 1)

In [None]:

k = 45
player_data = df.iloc[k]

player_idx = player_data['index']
letter = player_idx[0]
player_name = player_data['Player']
from_yr, to_yr = player_data[["From","To"]].values

print(player_name)
'''
reg = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)
plf = get_playoff_game_logs(player_idx,letter)

print(reg.shape)
print(plf.shape)
'''

In [None]:
yearly_dfs = []
for year in range(from_yr, to_yr+1):
    url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
    webpage = urlopen(url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')

    if len(tables)==0:
        if year > 1976: continue
        else:
            aba_url = url + "/aba/"
            webpage = urlopen(aba_url)
            html = BeautifulSoup(webpage)
            aba_tables = html.findAll('table')
      
            if len(aba_tables)==0: continue
            else: 
                table = str(aba_tables[-1])
                yearly_game_log = pd.read_html(table)[0]
                yearly_game_log["PLAYOFF"] = 'N'
                yearly_game_log["LEAGUE"] = 'ABA'
                
    else:
        table = str(tables[-1])
        yearly_game_log = pd.read_html(table)[0]
        yearly_game_log["PLAYOFF"] = 'N'
        yearly_game_log["LEAGUE"] = 'NBA'

    yearly_dfs.append(yearly_game_log)
    print(len(yearly_game_log))


reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True)
reg_season_game_logs["PLAYOFF"] = 'N'
reg_season_game_logs['Series'] = np.nan

reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                  'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)  

In [None]:
reg_season_game_logs

In [None]:
url,playoff_url

In [None]:
# SCRAPING PLAYOFF GAMES
playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
webpage = urlopen(playoff_url)
html = BeautifulSoup(webpage)
#print(playoff_url)

table = str(html.findAll('table')[7])
playoff_game_logs = pd.read_html(table)[0]
playoff_game_logs["PLAYOFF"] = 'Y'
playoff_game_logs["AGE"] = np.nan

playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                  'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]

In [None]:
"""
row = df.iloc[1]

player_idx = row['index']
letter = player_idx[0]
player_name = row['Player']
print(letter,player_idx,player_name)

from_yr, to_yr = row[["From","To"]].values
yearly_dfs = []

# SCRAPING REGULAR SEASON GAMES
for year in range(from_yr, to_yr+1):
    url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
    webpage = urlopen(url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if len(tables)==0: continue
    else:
        table = str(tables[-1])
        yearly_game_log = pd.read_html(table)[0]
        yearly_game_log["PLAYOFF"] = 'N'

        yearly_dfs.append(yearly_game_log)
        #print(url)


reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True)
reg_season_game_logs["PLAYOFF"] = 'N'
reg_season_game_logs['Series'] = np.nan

reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                  'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)    

# SCRAPING PLAYOFF GAMES
playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
webpage = urlopen(playoff_url)
html = BeautifulSoup(webpage)
#print(playoff_url)

table = str(html.findAll('table')[7])
playoff_game_logs = pd.read_html(table)[0]
playoff_game_logs["PLAYOFF"] = 'Y'
playoff_game_logs["AGE"] = np.nan

playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                  'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]

# CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True)
career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')

# FORMAT CAREER GAME LOGS, SORT BY DATE
career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]

career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
career_game_logs['INDEX'] = player_idx
career_game_logs['NAME']  = player_name

career_game_logs.sort_values('DATE',inplace=True)
career_game_logs.reset_index(drop=True,inplace=True)
"""