In [1]:
import os
import pandas as pd
import numpy as np

#### data directories

In [2]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))
finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))

#### read player bio data

In [3]:
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
df = pd.read_csv(file)

print(df.shape)

(4800, 9)


In [4]:
df.head()

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State"


#### scraping player game logs

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

#### BASIC STATS
# https://www.basketball-reference.com/players/a/abdelal01.html
# https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/

#### ADVANCED STATS
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-advanced/1992/
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs-advanced/

In [6]:


def get_reg_season_game_logs(player_idx,letter,from_year,to_year):
    reg_season_game_logs = None
    yearly_dfs = []
    
    # SCRAPING REGULAR SEASON GAMES
    
    for year in range(from_year, to_year+1):
        url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
        webpage = urlopen(url)
        html = BeautifulSoup(webpage)
        tables = html.findAll('table')
        '''
        if len(tables) > 0:
            table = str(tables[-1])
            yearly_game_log = pd.read_html(table)[0]
            yearly_game_log["PLAYOFF"] = 'N'
            yearly_dfs.append(yearly_game_log)
        else: pass
        '''
      
        if len(tables)==0:
            if year > 1976: continue
            else:
                aba_url = url + "/aba/"
                webpage = urlopen(aba_url)
                html = BeautifulSoup(webpage)
                aba_tables = html.findAll('table')

                if len(aba_tables)==0: continue
                else: 
                    table = str(aba_tables[-1])
                    yearly_game_log = pd.read_html(table)[0]
                    yearly_game_log["PLAYOFF"] = 'N'
                    yearly_game_log["LEAGUE"] = 'ABA'

        else:
            table = str(tables[-1])
            yearly_game_log = pd.read_html(table)[0]
            yearly_game_log["PLAYOFF"] = 'N'
            yearly_game_log["LEAGUE"] = 'NBA'

        yearly_dfs.append(yearly_game_log)        

    reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True,sort=False)
    reg_season_game_logs["PLAYOFF"] = 'N'
    reg_season_game_logs['SERIES'] = np.nan

    reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                      'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                      'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)  
    
    return reg_season_game_logs

In [7]:


def get_playoff_game_logs(player_idx,letter):
    
    playoff_game_logs = None
    
    playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
    webpage = urlopen(playoff_url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if len(tables) > 0:
        table = str(html.findAll('table')[7])
        playoff_game_logs = pd.read_html(table)[0]
        playoff_game_logs["PLAYOFF"] = 'Y'
        playoff_game_logs["AGE"] = np.nan

        playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 'Series':'SERIES',
                                          'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                          'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
    
    return playoff_game_logs

In [8]:


def get_career_game_logs(player_data,verbose=True):
    
    # PLAYER DATA
    player_idx = player_data['index']
    letter = player_idx[0]
    player_name = player_data['Player']
    from_yr, to_yr = row[["From","To"]].values

    if verbose:
        print(player_idx,player_name)
    
    # GET REGULAR SEASON GAME LOGS
    reg_season_game_logs = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)

    # GET PLAYOFF GAME LOGS
    playoff_game_logs = get_playoff_game_logs(player_idx,letter)
    # playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]
    
    # CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
    career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True,sort=False)
    career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')
    
    # FORMAT CAREER GAME LOGS, SORT BY DATE
    career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
    career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]
    
    career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
    career_game_logs['INDEX'] = player_idx
    career_game_logs['NAME']  = player_name
    
    career_game_logs.sort_values('DATE',inplace=True)
    career_game_logs.reset_index(drop=True,inplace=True)
        
    return career_game_logs
    

In [9]:
k = 200
player_dfs = []
for num,(index, row) in enumerate(df.iloc[:k].iterrows()):
    print(num)
    career_game_logs = get_career_game_logs(row)
    player_dfs.append(career_game_logs)
    
    print(career_game_logs.shape)
    
game_logs = pd.concat([player for player in player_dfs],ignore_index=True,sort=False)
print("\n")
print(game_logs.shape)

0
abdelal01 Alaa Abdelnaby
(273, 35)
1
abdulza01 Zaid Abdul-Aziz
(521, 32)
2
abdulka01 Kareem Abdul-Jabbar*
(1797, 35)
3
abdulma02 Mahmoud Abdul-Rauf
(663, 36)
4
abdulta01 Tariq Abdul-Wahad
(305, 36)
5
abdursh01 Shareef Abdur-Rahim
(939, 36)
6
abernto01 Tom Abernethy
(331, 35)
7
ablefo01 Forest Able
(1, 25)
8
abramjo01 John Abramovic
(56, 25)
9
abrinal01 Álex Abrines
(229, 36)
10
ackeral01 Alex Acker
(161, 35)
11
ackerdo01 Don Ackerman
(31, 26)
12
acresma01 Mark Acres
(394, 35)
13
actonbu01 Bud Acton
(23, 25)
14
acyqu01 Quincy Acy
(477, 35)
15
adamsal01 Alvan Adams
(1066, 35)
16
adamsdo01 Don Adams
(519, 34)
17
adamsge01 George Adams
(223, 34)
18
adamsha01 Hassan Adams
(118, 36)
19
adamsja01 Jaylen Adams
(82, 35)
20
adamsjo01 Jordan Adams
(179, 37)
21
adamsmi01 Michael Adams
(673, 35)
22
adamsst01 Steven Adams
(609, 37)
23
addisra01 Rafael Addison
(400, 35)
24
adebaba01 Bam Adebayo
(234, 36)
25
adelde01 Deng Adel
(38, 35)
26
adelmri01 Rick Adelman
(482, 30)
27
adrieje01 Jeff Adrien
(23

HTTPError: HTTP Error 500: Internal Server Error

In [None]:
game_logs['NAME'].unique()

In [None]:
game_logs.tail()

In [None]:
game_logs[game_logs["PLAYOFF"]=='Y']

In [None]:
|

In [None]:

k = 17
player_data = df.iloc[k]

player_idx = player_data['index']
letter = player_idx[0]
player_name = player_data['Player']
from_yr, to_yr = player_data[["From","To"]].values

print(player_name)
'''
reg = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)
plf = get_playoff_game_logs(player_idx,letter)

print(reg.shape)
print(plf.shape)
'''

In [None]:
yearly_dfs = []
for year in range(from_yr, to_yr+1):
    url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
    webpage = urlopen(url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')

    if len(tables)==0:
        if year > 1976: continue
        else:
            aba_url = url + "/aba/"
            webpage = urlopen(aba_url)
            html = BeautifulSoup(webpage)
            aba_tables = html.findAll('table')
      
            if len(aba_tables)==0: continue
            else: 
                table = str(aba_tables[-1])
                yearly_game_log = pd.read_html(table)[0]
                yearly_game_log["PLAYOFF"] = 'N'
                yearly_game_log["LEAGUE"] = 'ABA'
                
    else:
        table = str(tables[-1])
        yearly_game_log = pd.read_html(table)[0]
        yearly_game_log["PLAYOFF"] = 'N'
        yearly_game_log["LEAGUE"] = 'NBA'

    yearly_dfs.append(yearly_game_log)
    print(len(yearly_game_log))


reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True)
reg_season_game_logs["PLAYOFF"] = 'N'
reg_season_game_logs['Series'] = np.nan

reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                  'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)  

In [None]:
reg_season_game_logs

In [None]:
"""
row = df.iloc[1]

player_idx = row['index']
letter = player_idx[0]
player_name = row['Player']
print(letter,player_idx,player_name)

from_yr, to_yr = row[["From","To"]].values
yearly_dfs = []

# SCRAPING REGULAR SEASON GAMES
for year in range(from_yr, to_yr+1):
    url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
    webpage = urlopen(url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if len(tables)==0: continue
    else:
        table = str(tables[-1])
        yearly_game_log = pd.read_html(table)[0]
        yearly_game_log["PLAYOFF"] = 'N'

        yearly_dfs.append(yearly_game_log)
        #print(url)


reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True)
reg_season_game_logs["PLAYOFF"] = 'N'
reg_season_game_logs['Series'] = np.nan

reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                  'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)    

# SCRAPING PLAYOFF GAMES
playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
webpage = urlopen(playoff_url)
html = BeautifulSoup(webpage)
#print(playoff_url)

table = str(html.findAll('table')[7])
playoff_game_logs = pd.read_html(table)[0]
playoff_game_logs["PLAYOFF"] = 'Y'
playoff_game_logs["AGE"] = np.nan

playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                  'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]

# CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True)
career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')

# FORMAT CAREER GAME LOGS, SORT BY DATE
career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]

career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
career_game_logs['INDEX'] = player_idx
career_game_logs['NAME']  = player_name

career_game_logs.sort_values('DATE',inplace=True)
career_game_logs.reset_index(drop=True,inplace=True)
"""