In [1]:
import os
from io import StringIO

import numpy as np
import pandas as pd

#### data directories

In [2]:
# Resolve the project folders relative to the notebook's working directory:
# the project root is the parent of the cwd, and the data folders live under
# <root>/data/{raw,interim,final}.
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir, os.pardir))

dataDir = os.path.abspath(os.path.join(rootDir, 'data'))
rawDataDir, interimDataDir, finalDataDir = (
    os.path.abspath(os.path.join(dataDir, stage))
    for stage in ('raw', 'interim', 'final')
)

#### read player bio data

In [3]:
# Load the player bio table from the raw data directory.
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
df = pd.read_csv(filepath_or_buffer=file)

# Quick sanity check of the load: (rows, columns).
print(df.shape)

(4800, 9)


In [4]:
# Preview the first five rows of the player bio table.
df.head(n=5)

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State"


#### scraping player game logs

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [6]:
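# Example basketball-reference URLs for player id abdelal01:
#   player page       -> https://www.basketball-reference.com/players/a/abdelal01.html
#   season game log   -> https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
#   playoff game log  -> https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/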

In [98]:
# Scrape the game logs (every regular season from `From` to `To`, plus the
# playoff log) of each selected player from basketball-reference.com and merge
# them into one date-sorted frame, `career_game_logs`.
#
# Only the first player is processed (`df.iloc[:1]`). `career_game_logs` is
# overwritten on every iteration, so widening the slice requires collecting
# the per-player frames as well.

# Position of the game-log table among the <table> elements of a game-log page
# (the same index is used for regular-season and playoff pages).
# NOTE(review): BeautifulSoup is called without an explicit parser, so it uses
# whichever one is installed; confirm the table index is stable across parsers
# before pinning one.
GAME_LOG_TABLE_IDX = 7

for index, row in df.iloc[:1].iterrows():

    player_idx = row['index']
    letter = player_idx[0]  # first character of the player id, used as a URL path segment
    print(player_idx, letter)

    from_yr, to_yr = row[["From", "To"]].values
    yearly_dfs = []

    # SCRAPING REGULAR SEASON GAMES
    for year in range(from_yr, to_yr + 1):
        url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
        # The context manager closes the HTTP response even if parsing raises.
        with urlopen(url) as webpage:
            html = BeautifulSoup(webpage)

        # NOTE(review): raises IndexError when the page has fewer than
        # GAME_LOG_TABLE_IDX + 1 tables (presumably a season without games) -
        # decide how such seasons should be handled before scraping every player.
        table = str(html.find_all('table')[GAME_LOG_TABLE_IDX])
        # Passing literal HTML to read_html is deprecated since pandas 2.1;
        # a StringIO wrapper works on old and new versions alike.
        yearly_game_log = pd.read_html(StringIO(table))[0]
        yearly_game_log["PLAYOFF"] = 'N'

        yearly_dfs.append(yearly_game_log)
        print(url)

    # Every yearly frame already carries PLAYOFF == 'N', so it is not set again here.
    reg_season_game_logs = pd.concat(yearly_dfs, ignore_index=True)
    # Added so that regular-season rows have the same columns as playoff rows.
    reg_season_game_logs['Series'] = np.nan

    reg_season_game_logs = reg_season_game_logs.rename(
        columns={'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM',
                 'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                 'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'})

    # SCRAPING PLAYOFF GAMES
    playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
    with urlopen(playoff_url) as webpage:
        html = BeautifulSoup(webpage)
    print(playoff_url)

    # NOTE(review): same IndexError risk as above, presumably for players
    # without any playoff games - confirm.
    table = str(html.find_all('table')[GAME_LOG_TABLE_IDX])
    playoff_game_logs = pd.read_html(StringIO(table))[0]
    playoff_game_logs["PLAYOFF"] = 'Y'
    # Playoff rows get an empty AGE column (presumably the playoff table has none - confirm).
    playoff_game_logs["AGE"] = np.nan

    # The third column is renamed to DATE by position rather than by name
    # (presumably its header text is not a fixed 'Date' on the playoff page - confirm).
    playoff_game_logs = playoff_game_logs.rename(
        columns={'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM',
                 'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT', playoff_game_logs.columns[2]: 'DATE',
                 'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'})
    # Align playoff columns (set and order) with the regular-season frame.
    playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]

    # CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
    career_game_logs = pd.concat([reg_season_game_logs, playoff_game_logs], ignore_index=True)
    # '@' marks an away game; anything else (including missing values) becomes 'HOME'.
    career_game_logs['HOME/AWAY'] = np.where(career_game_logs['HOME/AWAY'] == '@', 'AWAY', 'HOME')

    # FORMAT CAREER GAME LOGS, SORT BY DATE
    career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
    # Keep only rows whose Rk is a plain number; this drops e.g. NaN (which
    # astype(str) turns into 'nan'). The explicit copy keeps the column
    # assignments below from raising SettingWithCopyWarning on a filtered slice.
    career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values].copy()

    career_game_logs = career_game_logs.rename(columns={'Series': 'SERIES'})
    career_game_logs['INDEX'] = player_idx
    career_game_logs['NAME'] = row['Player']

    # NOTE(review): DATE is sorted as scraped; this is chronological only if the
    # values are ISO-formatted (YYYY-MM-DD) strings - confirm or parse to datetime.
    career_game_logs = career_game_logs.sort_values('DATE').reset_index(drop=True)


abdelal01 a
https://www.basketball-reference.com/players/a/abdelal01/gamelog/1991
https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
https://www.basketball-reference.com/players/a/abdelal01/gamelog/1993
https://www.basketball-reference.com/players/a/abdelal01/gamelog/1994
https://www.basketball-reference.com/players/a/abdelal01/gamelog/1995
https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/


In [99]:
# Preview the first five rows of the merged game log.
career_game_logs.head(n=5)

Unnamed: 0,Rk,G,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,RESULT,GS,MP,...,STL,BLK,TOV,PF,PTS,GAME_SCORE,PLAYOFF,SERIES,INDEX,NAME
0,1,1,1990-11-02,22-131,POR,HOME,HOU,W (+1),0,5:00,...,0,0,0,0,0,1.0,N,,abdelal01,Alaa Abdelnaby
1,2,2,1990-11-06,22-135,POR,AWAY,LAL,W (+2),0,4:00,...,0,0,1,3,0,-2.2,N,,abdelal01,Alaa Abdelnaby
2,3,3,1990-11-11,22-140,POR,HOME,LAC,W (+31),0,8:00,...,0,0,0,1,4,3.9,N,,abdelal01,Alaa Abdelnaby
3,4,4,1990-11-13,22-142,POR,HOME,DEN,W (+26),0,3:00,...,0,0,1,0,0,-2.1,N,,abdelal01,Alaa Abdelnaby
4,5,5,1990-11-15,22-144,POR,HOME,NYK,W (+16),0,6:00,...,0,0,0,0,1,-0.1,N,,abdelal01,Alaa Abdelnaby


In [100]:
# Preview the last five rows of the merged game log.
career_game_logs.tail(n=5)

Unnamed: 0,Rk,G,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,RESULT,GS,MP,...,STL,BLK,TOV,PF,PTS,GAME_SCORE,PLAYOFF,SERIES,INDEX,NAME
268,50,50,1995-03-16,26-265,SAC,AWAY,DEN,L (-22),0,11:00,...,0,0,1,2,2,0.9,N,,abdelal01,Alaa Abdelnaby
269,51,51,1995-03-17,26-266,SAC,HOME,GSW,W (+18),0,2:00,...,0,0,0,1,0,-0.4,N,,abdelal01,Alaa Abdelnaby
270,52,52,1995-03-24,26-273,PHI,HOME,BOS,L (-32),0,14:00,...,0,0,3,2,0,-5.4,N,,abdelal01,Alaa Abdelnaby
271,53,53,1995-03-31,26-280,PHI,HOME,POR,W (+1),0,8:00,...,0,0,0,0,2,0.3,N,,abdelal01,Alaa Abdelnaby
272,54,54,1995-04-05,26-285,PHI,AWAY,CHH,L (-18),0,8:00,...,0,0,2,0,0,-2.4,N,,abdelal01,Alaa Abdelnaby
