In [1]:
import os
import pandas as pd
import numpy as np

#### data directories

In [2]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))
finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))

#### read player bio data

In [3]:
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
df = pd.read_csv(file)

print(df.shape)

(4800, 9)


In [4]:
df.head()

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State"


#### scraping player game logs

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

#### BASIC STATS
# https://www.basketball-reference.com/players/a/abdelal01.html
# https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/

#### ADVANCED STATS
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-advanced/1992/
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs-advanced/

In [6]:
def get_reg_season_game_logs(player_idx,letter,from_year,to_year):
    
    yearly_dfs = []
    
    # SCRAPING REGULAR SEASON GAMES
    for year in range(from_year, to_year+1):
        url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
        webpage = urlopen(url)
        html = BeautifulSoup(webpage)
        
        table = str(html.findAll('table')[7])
        yearly_game_log = pd.read_html(table)[0]
        yearly_game_log["PLAYOFF"] = 'N'
        
        yearly_dfs.append(yearly_game_log)
        #print(url)

    reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True)
    reg_season_game_logs["PLAYOFF"] = 'N'
    reg_season_game_logs['Series'] = np.nan

    reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                      'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                      'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)  
    
    return reg_season_game_logs

In [7]:
def get_playoff_game_logs(player_idx,letter):
    
    playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
    webpage = urlopen(playoff_url)
    html = BeautifulSoup(webpage)
    #print(playoff_url)
     
    table = str(html.findAll('table')[7])
    playoff_game_logs = pd.read_html(table)[0]
    playoff_game_logs["PLAYOFF"] = 'Y'
    playoff_game_logs["AGE"] = np.nan
    
    playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                      'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                      'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
    
    return playoff_game_logs

In [8]:
def get_career_game_logs(player_data):
    
    # PLAYER DATA
    player_idx = player_data['index']
    letter = player_idx[0]
    player_name = player_data['Player']
    from_yr, to_yr = row[["From","To"]].values

    print(letter,player_idx,player_name)
    
    # GET REGULAR SEASON GAME LOGS
    reg_season_game_logs = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)

    # GET PLAYOFF GAME LOGS
    playoff_game_logs = get_playoff_game_logs(player_idx,letter)
    playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]
    
    # CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
    career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True)
    career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')
    
    # FORMAT CAREER GAME LOGS, SORT BY DATE
    career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
    career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]
    
    career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
    career_game_logs['INDEX'] = player_idx
    career_game_logs['NAME']  = player_name
    
    career_game_logs.sort_values('DATE',inplace=True)
    career_game_logs.reset_index(drop=True,inplace=True)
    
    return career_game_logs
    

In [9]:
k = 1
player_dfs = []
for index, row in df.iloc[:k].iterrows():
    career_game_logs = get_career_game_logs(row)
    player_dfs.append(career_game_logs)
    
    print(career_game_logs.shape)
    
game_logs = pd.concat([player for player in player_dfs],ignore_index=True)
print("\n")
print(game_logs.shape)

a abdelal01 Alaa Abdelnaby
(273, 33)


(273, 33)


In [10]:
game_logs['NAME'].unique()

array(['Alaa Abdelnaby'], dtype=object)

<pandas.core.indexing._iLocIndexer at 0x113e6f188>

In [12]:
row = df.iloc[1]

player_idx = row['index']
letter = player_idx[0]
player_name = row['Player']
print(letter,player_idx,player_name)

from_yr, to_yr = row[["From","To"]].values
yearly_dfs = []

# SCRAPING REGULAR SEASON GAMES
for year in range(from_yr, to_yr+1):
    url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
    webpage = urlopen(url)
    html = BeautifulSoup(webpage)

    table = str(html.findAll('table')[7])
    yearly_game_log = pd.read_html(table)[0]
    yearly_game_log["PLAYOFF"] = 'N'

    yearly_dfs.append(yearly_game_log)
    #print(url)

reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True)
reg_season_game_logs["PLAYOFF"] = 'N'
reg_season_game_logs['Series'] = np.nan

reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                  'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)    

# SCRAPING PLAYOFF GAMES
playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
webpage = urlopen(playoff_url)
html = BeautifulSoup(webpage)
#print(playoff_url)

table = str(html.findAll('table')[7])
playoff_game_logs = pd.read_html(table)[0]
playoff_game_logs["PLAYOFF"] = 'Y'
playoff_game_logs["AGE"] = np.nan

playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                  'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                  'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
playoff_game_logs = playoff_game_logs[reg_season_game_logs.columns]

# CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True)
career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')

# FORMAT CAREER GAME LOGS, SORT BY DATE
career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]

career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
career_game_logs['INDEX'] = player_idx
career_game_logs['NAME']  = player_name

career_game_logs.sort_values('DATE',inplace=True)
career_game_logs.reset_index(drop=True,inplace=True)


a abdulza01 Zaid Abdul-Aziz


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




KeyError: "['GAME_SCORE', 'TOV'] not in index"

In [13]:
reg_season_game_logs

Unnamed: 0,AST,AGE,BLK,DRB,DATE,FG,FG%,FGA,FT,FT%,...,PLAYOFF,PTS,Rk,STL,TOV,TRB,TEAM,HOME/AWAY,RESULT,Series
0,0,22-192,,,1968-10-16,0,,0,0,,...,N,0,1,,,0,CIN,@,W (+15),
1,,22-195,,,1968-10-19,2,.667,3,1,.500,...,N,5,2,,,1,CIN,,W (+12),
2,,22-199,,,1968-10-23,0,,0,0,,...,N,0,3,,,1,CIN,@,L (-7),
3,1,22-208,,,1968-11-01,2,.667,3,0,,...,N,4,4,,,2,CIN,,W (+18),
4,0,22-213,,,1968-11-06,0,.000,1,0,,...,N,0,5,,,1,CIN,,W (+11),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,1,31-359,0,,1978-04-01,2,0.4,5,2,0.5,...,N,6,12,,,3,HOU,,W (+4),
519,1,31-360,0,4,1978-04-02,1,0.125,8,0,,...,N,2,13,1,0.0,6,HOU,@,L (-9),
520,,31-363,,,1978-04-05,0,,,3,0.75,...,N,3,14,,,,HOU,@,L (-13),
521,,32-000,,,1978-04-07,0,,,0,,...,N,0,15,,,,HOU,,L (-3),
