In [1]:
import os
import pandas as pd
import numpy as np

#### data directories

In [2]:
# Project directory layout, resolved from the current working directory:
# the parent of the working directory is the project root, and all data
# folders (raw, interim, final, error_log) sit under <root>/data.
def _abs_join(base, *parts):
    """Join ``parts`` onto ``base`` and return the normalized absolute path."""
    return os.path.abspath(os.path.join(base, *parts))


currDir = os.getcwd()
rootDir = _abs_join(currDir, '..')
dataDir = _abs_join(rootDir, 'data')

rawDataDir, interimDataDir, finalDataDir, errorLog = (
    _abs_join(dataDir, leaf) for leaf in ('raw', 'interim', 'final', 'error_log')
)

#### read player bio data

In [3]:
# Load the player bio CSV from the raw data directory.
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
df = pd.read_csv(file)

# Print (rows, columns) as a quick check that the file was read.
print(df.shape)

(4800, 9)


In [4]:
# Preview the first rows to check which columns were read in.
df.head()

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State"


#### scraping player game logs

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

#### BASIC STATS
# https://www.basketball-reference.com/players/a/abdelal01.html
# https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/

#### ADVANCED STATS
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-advanced/1992/
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs-advanced/

In [13]:


def _fetch_game_log_tables(url):
    """Download ``url`` and return every ``<table>`` element found in the page."""
    # The context manager closes the HTTP response once the page is parsed,
    # so a long scrape does not accumulate open connections.
    with urlopen(url) as webpage:
        html = BeautifulSoup(webpage)
    return html.findAll('table')


def _regular_season_frame(tables, league):
    """Parse the last table in ``tables`` and tag it with PLAYOFF='N' and ``league``."""
    from io import StringIO

    # pandas >= 2.1 deprecates passing literal HTML to read_html; a file-like
    # wrapper is accepted by old and new versions alike.
    yearly_game_log = pd.read_html(StringIO(str(tables[-1])))[0]
    yearly_game_log["PLAYOFF"] = 'N'
    yearly_game_log["LEAGUE"] = league
    return yearly_game_log


def get_reg_season_game_logs(player_idx,letter,from_year,to_year):
    """Scrape a player's regular-season game logs for every season in a range.

    For each year the basketball-reference game log page is fetched. When that
    page contains no table and the year is 1976 or earlier, the same URL with
    an ``/aba/`` suffix is tried instead. Years for which neither page has a
    table are skipped.

    Parameters
    ----------
    player_idx : str
        Player identifier used in the URL (e.g. ``abdelal01``).
    letter : str
        Directory letter used in the URL (the caller passes ``player_idx[0]``).
    from_year, to_year : int
        First and last season to scrape, both inclusive.

    Returns
    -------
    pandas.DataFrame
        All yearly logs concatenated, with ``PLAYOFF`` ('N'), ``LEAGUE``
        ('NBA' or 'ABA') and an all-NaN ``SERIES`` column added, and the
        date/age/team/venue/opponent/result/game-score columns renamed to
        upper-case names.

    Raises
    ------
    ValueError
        If no season in the range produced a table (previously surfaced as
        the less descriptive ValueError raised by ``pd.concat`` on an empty list).
    """
    # No ABA fallback is attempted for seasons after this year.
    last_aba_fallback_year = 1976
    yearly_dfs = []

    # SCRAPING REGULAR SEASON GAMES
    for year in range(from_year, to_year+1):
        url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
        tables = _fetch_game_log_tables(url)
        league = 'NBA'

        if len(tables)==0:
            if year > last_aba_fallback_year:
                continue
            tables = _fetch_game_log_tables(url + "/aba/")
            league = 'ABA'
            if len(tables)==0:
                continue

        yearly_dfs.append(_regular_season_frame(tables, league))

    if not yearly_dfs:
        raise ValueError(
            f"no regular season game log tables found for {player_idx} "
            f"between {from_year} and {to_year}"
        )

    reg_season_game_logs = pd.concat(yearly_dfs,ignore_index=True,sort=False)
    # Regular-season rows have no playoff series; the column exists so the
    # frame lines up with the playoff logs when both are concatenated.
    reg_season_game_logs['SERIES'] = np.nan

    reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM',
                                      'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                      'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)

    return reg_season_game_logs

In [14]:


def get_playoff_game_logs(player_idx,letter):
    """Scrape a player's career playoff game logs from basketball-reference.

    Parameters
    ----------
    player_idx : str
        Player identifier used in the URL (e.g. ``abdelal01``).
    letter : str
        Directory letter used in the URL (the caller passes ``player_idx[0]``).

    Returns
    -------
    pandas.DataFrame or None
        The parsed playoff table with ``PLAYOFF`` set to 'Y', an all-NaN
        ``AGE`` column and upper-case column names, or ``None`` when the page
        contains no table at all.

    Raises
    ------
    IndexError
        If the page has at least one table but fewer than eight (the game log
        is read from a fixed position, see ``playoff_table_idx``).
    """
    from io import StringIO

    # Fixed position of the table that is parsed as the game log.
    # NOTE(review): presumably chosen by inspecting the page layout; the
    # regular-season scraper takes the last table instead - confirm that
    # index 7 is still the right table.
    playoff_table_idx = 7

    playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(playoff_url) as webpage:
        html = BeautifulSoup(webpage)
    tables = html.findAll('table')

    if len(tables) == 0:
        return None

    table = str(tables[playoff_table_idx])
    # pandas >= 2.1 deprecates literal HTML input to read_html; wrap it in a
    # file-like object, which every pandas version accepts.
    playoff_game_logs = pd.read_html(StringIO(table))[0]
    playoff_game_logs["PLAYOFF"] = 'Y'
    playoff_game_logs["AGE"] = np.nan

    # The third column of the parsed table is renamed to DATE by position, in
    # addition to the by-name renames.
    playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 'Series':'SERIES',
                                      'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                      'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)

    return playoff_game_logs

In [15]:


def _append_to_error_log(logger, message):
    """Append ``message`` to the log file at ``logger``, creating its folder if needed."""
    os.makedirs(os.path.dirname(logger), exist_ok=True)
    with open(logger,"a") as logs:
        logs.write(message)


def get_career_game_logs(player_data,verbose=True):
    """Build one player's combined regular-season and playoff game log.

    Parameters
    ----------
    player_data : mapping / pandas.Series
        A player row providing the keys ``index``, ``Player``, ``From`` and ``To``.
    verbose : bool, default True
        When true, print the player's id and name before scraping.

    Returns
    -------
    pandas.DataFrame or None
        Regular-season and playoff rows concatenated, restricted to rows whose
        ``Rk`` value is numeric, with ``HOME/AWAY`` mapped to 'HOME'/'AWAY',
        ``INDEX`` and ``NAME`` columns added, sorted by ``DATE``. ``None`` when
        neither source produced data or the combined frame could not be
        formatted; in both cases a line is appended to ``<errorLog>/log.txt``.
    """
    logger = f"{errorLog}/log.txt"

    # PLAYER DATA
    player_idx = player_data['index']
    letter = player_idx[0]
    player_name = player_data['Player']
    # Read the career span from the argument. The previous code read a global
    # named `row`, which only worked when the caller's loop variable happened
    # to have that name and silently used the wrong player otherwise.
    from_yr, to_yr = player_data[["From","To"]].values

    if verbose:
        print(player_idx,player_name)

    # GET REGULAR SEASON GAME LOGS
    # `except Exception` (not a bare except) so Ctrl-C can still stop a long scrape.
    try:
        reg_season_game_logs = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)
    except Exception:
        reg_season_game_logs = None
        _append_to_error_log(logger, f"{player_idx} , {player_name}, error in regular season game logs \n")

    # GET PLAYOFF GAME LOGS
    try:
        playoff_game_logs = get_playoff_game_logs(player_idx,letter)
    except Exception:
        playoff_game_logs = None
        _append_to_error_log(logger, f"{player_idx} , {player_name}, error in playoff game logs \n")

    # Nothing to combine: log it explicitly instead of relying on pd.concat
    # raising when every input is None.
    if reg_season_game_logs is None and playoff_game_logs is None:
        _append_to_error_log(logger, f"{player_idx} , {player_name}, no playoff OR regular season data \n")
        return None

    try:
        # CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
        career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True,sort=False)
        career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')

        # FORMAT CAREER GAME LOGS, SORT BY DATE
        # Keep only rows whose rank is numeric. `.copy()` makes the filtered
        # frame independent so the column assignments below do not act on a
        # view of the unfiltered frame.
        career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
        career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values].copy()

        career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
        career_game_logs['INDEX'] = player_idx
        career_game_logs['NAME']  = player_name

        career_game_logs.sort_values('DATE',inplace=True)
        career_game_logs.reset_index(drop=True,inplace=True)

    except Exception as exc:
        career_game_logs = None
        # Data was scraped but could not be formatted; say so rather than
        # reporting it as missing data.
        _append_to_error_log(logger, f"{player_idx} , {player_name}, error formatting career game logs ({exc!r}) \n")

    return career_game_logs
    

In [17]:
# Scrape the players at positions [k, w) of df and write their combined game
# logs to the final data directory. Every 10th iteration writes a checkpoint
# of what has been collected so far, before fetching the next player.
k, w = 850, 1500
player_dfs = []
checkpoint_every = 10
output_file = f"{finalDataDir}/player_game_logs.csv"

for num,(index, row) in enumerate(df.iloc[k:w].iterrows()):
    print(k + num)

    # Skip the checkpoint while nothing has been collected: pd.concat raises
    # on an empty list.
    if (num + 1) % checkpoint_every == 0 and player_dfs:
        game_logs = pd.concat(player_dfs,ignore_index=True,sort=False)
        game_logs.to_csv(output_file,index=False)
        print("df shape",game_logs.shape)

    career_game_logs = get_career_game_logs(row)

    # Only keep real frames. Players for whom scraping produced nothing come
    # back as None and previously ended up in the list handed to pd.concat.
    if career_game_logs is None:
        print("both dfs are None")
    else:
        player_dfs.append(career_game_logs)
        print(career_game_logs.shape)

if player_dfs:
    game_logs = pd.concat(player_dfs,ignore_index=True,sort=False)
else:
    # Nothing was scraped; keep `game_logs` defined for the cells below.
    game_logs = pd.DataFrame()
print("\n")
print(game_logs.shape)

# Do not overwrite an existing output file with an empty frame.
if player_dfs:
    game_logs.to_csv(output_file,index=False)

850
corleke01 Ken Corley
both dfs are None
851
corlera01 Ray Corley
(100, 26)
852
corzida01 Dave Corzine
(959, 35)
853
costela01 Larry Costello
(757, 26)
854
costema01 Matt Costello
(82, 35)
855
cottobr01 Bryce Cotton
(58, 35)
856
cottoja01 Jack Cotton
(54, 25)
857
cottoja02 James Cotton
(39, 34)
858
coughjo01 John Coughran
(24, 34)
859
df shape (2073, 36)
countme01 Mel Counts
(874, 30)
860
courtst01 Steve Courtin
(24, 25)
861
courtjo01 Joe Courtney
(98, 34)
862
cousima01 Marcus Cousin
(7, 35)
863
couside01 DeMarcus Cousins
(753, 37)
864
cousybo01 Bob Cousy*
(1033, 26)
865
covinro01 Robert Covington
(564, 37)
866
cowenda01 Dave Cowens*
(855, 35)
867
coxch01 Chubby Cox
(7, 34)
868
coxjo01 Johnny Cox
(73, 25)
869
df shape (6361, 37)
coxwe01 Wesley Cox
(74, 31)
870
crabbal01 Allen Crabbe
(587, 37)
871
craigto01 Torrey Craig
(229, 36)
872
crawfch01 Chris Crawford
(414, 36)
873
crawffr01 Freddie Crawford
(332, 26)
874
crawfja01 Jamal Crawford
(1545, 36)
875
crawfjo01 Joe Crawford
(11, 35)
8

In [None]:
# List the distinct player names present in the combined game logs.
game_logs['NAME'].unique()

In [18]:
# Inspect the last rows of the combined game logs.
game_logs.tail()

Unnamed: 0,Rk,G,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,RESULT,GS,MP,...,DRB,STL,BLK,TOV,GAME_SCORE,3P,3PA,3P%,+/-,Unnamed: 31
212514,32,14,1977-05-22,,POR,AWAY,PHI,L (-6),0,27:00,...,2,0,0,,,,,,,
212515,33,15,1977-05-26,,POR,AWAY,PHI,L (-18),0,10:00,...,0,0,0,,,,,,,
212516,34,16,1977-05-29,,POR,HOME,PHI,W (+22),0,8:00,...,0,0,0,,,,,,,
212517,35,17,1977-05-31,,POR,HOME,PHI,W (+32),0,18:00,...,0,2,0,,,,,,,
212518,36,18,1977-06-03,,POR,AWAY,PHI,W (+6),0,5:00,...,1,0,0,,,,,,,


In [None]:
# Show only the rows flagged as playoff games (PLAYOFF == 'Y').
game_logs[game_logs["PLAYOFF"]=='Y']

In [None]:
|

In [None]:

# Scratch cell: pick a single player by position in df so the scraping steps
# can be run by hand in the cells below.
# NOTE: rebinding k here overwrites the batch start index used by the
# scraping loop cell earlier in the notebook.
k = 45
player_data = df.iloc[k]

# The same four fields that get_career_game_logs reads from a player row.
player_idx = player_data['index']
letter = player_idx[0]
player_name = player_data['Player']
from_yr, to_yr = player_data[["From","To"]].values

print(player_name)
# The calls below are disabled by wrapping them in a string literal. As the
# last expression of the cell, that string is echoed back as the cell output.
'''
reg = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)
plf = get_playoff_game_logs(player_idx,letter)

print(reg.shape)
print(plf.shape)
'''

In [None]:
# Scratch, step-by-step version of the regular-season scrape for the single
# player selected in the previous cell (uses letter, player_idx, from_yr and
# to_yr from there). Unlike get_reg_season_game_logs it prints the row count
# of every season and names the empty series column 'Series'.
from io import StringIO

yearly_dfs = []
for year in range(from_yr, to_yr+1):
    url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
    # Close each HTTP response once the page is parsed.
    with urlopen(url) as webpage:
        html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    league = 'NBA'

    if len(tables)==0:
        # No ABA fallback is attempted for seasons after 1976.
        if year > 1976: continue
        aba_url = url + "/aba/"
        with urlopen(aba_url) as webpage:
            html = BeautifulSoup(webpage)
        tables = html.findAll('table')
        league = 'ABA'
        if len(tables)==0: continue

    # The game log is read from the last table on the page. read_html gets a
    # file-like object because pandas >= 2.1 deprecates literal HTML input.
    table = str(tables[-1])
    yearly_game_log = pd.read_html(StringIO(table))[0]
    yearly_game_log["PLAYOFF"] = 'N'
    yearly_game_log["LEAGUE"] = league

    yearly_dfs.append(yearly_game_log)
    print(len(yearly_game_log))


reg_season_game_logs = pd.concat(yearly_dfs,ignore_index=True)
# Regular-season rows have no playoff series; the column is added empty.
reg_season_game_logs['Series'] = np.nan

reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM',
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                  'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)

In [None]:
# Display the regular-season frame assembled in the previous cell.
reg_season_game_logs

In [None]:
# Show the last URLs that were requested.
# NOTE(review): at notebook level `playoff_url` is only assigned in the
# playoff scraping cell that follows, so on a fresh kernel this cell appears
# to need that cell to have run first - confirm or move it below.
url,playoff_url

In [None]:
# SCRAPING PLAYOFF GAMES
# Scratch, step-by-step version of the playoff scrape for the single player
# selected above (uses letter and player_idx from that cell).
from io import StringIO

playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
# Close the HTTP response once the page is parsed.
with urlopen(playoff_url) as webpage:
    html = BeautifulSoup(webpage)
#print(playoff_url)

# The game log is read from a fixed table position (index 7).
# NOTE(review): presumably found by inspecting the page - confirm it still holds.
table = str(html.findAll('table')[7])
# pandas >= 2.1 deprecates literal HTML input to read_html; pass a file-like object.
playoff_game_logs = pd.read_html(StringIO(table))[0]
playoff_game_logs["PLAYOFF"] = 'Y'
playoff_game_logs["AGE"] = np.nan

playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM',
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                  'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
# Align to the regular-season column layout. reindex fills columns that the
# playoff frame lacks (LEAGUE is never set on it) with NaN; plain [] selection
# raised KeyError on them.
playoff_game_logs = playoff_game_logs.reindex(columns=reg_season_game_logs.columns)

In [None]:
"""
row = df.iloc[1]

player_idx = row['index']
letter = player_idx[0]
player_name = row['Player']
print(letter,player_idx,player_name)

from_yr, to_yr = row[["From","To"]].values
yearly_dfs = []

# SCRAPING REGULAR SEASON GAMES
for year in range(from_yr, to_yr+1):
    url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
    webpage = urlopen(url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if len(tables)==0: continue
    else:
        table = str(tables[-1])
        yearly_game_log = pd.read_html(table)[0]
        yearly_game_log["PLAYOFF"] = 'N'

        yearly_dfs.append(yearly_game_log)
        #print(url)


reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True)
reg_season_game_logs["PLAYOFF"] = 'N'
reg_season_game_logs['Series'] = np.nan

reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                  'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)    

# SCRAPING PLAYOFF GAMES
playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
webpage = urlopen(playoff_url)
html = BeautifulSoup(webpage)
#print(playoff_url)

table = str(html.findAll('table')[7])
playoff_game_logs = pd.read_html(table)[0]
playoff_game_logs["PLAYOFF"] = 'Y'
playoff_game_logs["AGE"] = np.nan

playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                  'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]

# CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True)
career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')

# FORMAT CAREER GAME LOGS, SORT BY DATE
career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]

career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
career_game_logs['INDEX'] = player_idx
career_game_logs['NAME']  = player_name

career_game_logs.sort_values('DATE',inplace=True)
career_game_logs.reset_index(drop=True,inplace=True)
"""