In [1]:
import os
import pandas as pd
import numpy as np

#### data directories

In [2]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))
finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))

advDataDir = os.path.abspath(os.path.join(dataDir,'advanced_stats'))
errorLog = os.path.abspath(os.path.join(dataDir,'error_log'))


#### read player bio data

In [3]:
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
df = pd.read_csv(file)

print("No of distinct players:",df['index'].nunique())

No of distinct players: 4800


In [4]:
df.head()

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State"


#### scraping player game logs

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

#### BASIC STATS
# https://www.basketball-reference.com/players/a/abdelal01.html
# https://www.basketball-reference.com/players/a/abdelal01/gamelog/1992
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs/

#### ADVANCED STATS
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-advanced/1992/
# https://www.basketball-reference.com/players/a/abdelal01/gamelog-playoffs-advanced/

In [6]:
def get_reg_season_game_logs(player_idx,letter,from_year,to_year):
    reg_season_game_logs = None
    yearly_dfs = []
    
    # SCRAPING REGULAR SEASON GAMES
    
    for year in range(from_year, to_year+1):
        #url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"
        url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-advanced/{year}/"
        webpage = urlopen(url)
        html = BeautifulSoup(webpage)
        tables = html.findAll('table')
      
        if len(tables)==0:
            if year > 1976: continue
            else:
                aba_url = url + "/aba/"
                webpage = urlopen(aba_url)
                html = BeautifulSoup(webpage)
                aba_tables = html.findAll('table')

                if len(aba_tables)==0: continue
                else: 
                    table = str(aba_tables[-1])
                    yearly_game_log = pd.read_html(table)[0]
                    yearly_game_log["PLAYOFF"] = 'N'
                    yearly_game_log["LEAGUE"] = 'ABA'

        else:
            table = str(tables[-1])
            yearly_game_log = pd.read_html(table)[0]
            yearly_game_log["PLAYOFF"] = 'N'
            yearly_game_log["LEAGUE"] = 'NBA'

        yearly_dfs.append(yearly_game_log)        

    reg_season_game_logs = pd.concat([x for x in yearly_dfs],ignore_index=True,sort=False)
    reg_season_game_logs["PLAYOFF"] = 'N'
    reg_season_game_logs['SERIES'] = np.nan

    reg_season_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 
                                      'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',
                                      'Unnamed: 7': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)  
    
    return reg_season_game_logs

In [7]:
def get_playoff_game_logs(player_idx,letter):
    
    playoff_game_logs = None
    
    #playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
    playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs-advanced/"
    webpage = urlopen(playoff_url)
    html = BeautifulSoup(webpage)
    tables = html.findAll('table')
    
    if len(tables) > 0:
        table = str(html.findAll('table')[-1])
        playoff_game_logs = pd.read_html(table)[0]
        playoff_game_logs["PLAYOFF"] = 'Y'
        playoff_game_logs["AGE"] = np.nan

        playoff_game_logs.rename(columns = {'Date': 'DATE', 'Age': 'AGE', 'Tm': 'TEAM', 'Series':'SERIES',
                                          'Unnamed: 5': 'HOME/AWAY', 'Opp': 'OPPONENT',playoff_game_logs.columns[2]:'DATE',
                                          'Unnamed: 8': 'RESULT', 'GmSc': 'GAME_SCORE'}, inplace=True)
    
    return playoff_game_logs

In [8]:
def get_career_game_logs(player_data,verbose=True):
    
    logger = f"{errorLog}/log.txt"
    
    # PLAYER DATA
    player_idx = player_data['index']
    letter = player_idx[0]
    player_name = player_data['Player']
    from_yr, to_yr = row[["From","To"]].values

    if verbose:
        print(player_idx,player_name)
    
    # GET REGULAR SEASON GAME LOGS
    try:
        reg_season_game_logs = get_reg_season_game_logs(player_idx,letter,from_yr,to_yr)
    except:
        reg_season_game_logs = None
        with open(logger,"a") as logs: 
            logs.write(f"{player_idx} , {player_name}, error in regular season game logs \n")

    # GET PLAYOFF GAME LOGS
    try:
        playoff_game_logs = get_playoff_game_logs(player_idx,letter)
    except:
        playoff_game_logs = None
        with open(logger,"a") as logs: 
            logs.write(f"{player_idx} , {player_name}, error in playoff game logs \n")
            
        
    try:  
        # CONCATENATING REGULAR SEASON AND PLAYOFF GAMES
        career_game_logs = pd.concat([reg_season_game_logs,playoff_game_logs],ignore_index=True,sort=False)
        career_game_logs['HOME/AWAY'] = career_game_logs['HOME/AWAY'].apply(lambda x: 'AWAY' if x=='@' else 'HOME')

        # FORMAT CAREER GAME LOGS, SORT BY DATE
        career_game_logs['Rk'] = career_game_logs['Rk'].astype(str)
        career_game_logs = career_game_logs[career_game_logs['Rk'].str.isnumeric().values]

        career_game_logs.rename(columns = {'Series':'SERIES'},inplace=True)
        career_game_logs['INDEX'] = player_idx
        career_game_logs['NAME']  = player_name

        career_game_logs.sort_values('DATE',inplace=True)
        career_game_logs.reset_index(drop=True,inplace=True)

    except:
        career_game_logs = None
        with open(logger,"a") as logs: 
            logs.write(f"{player_idx} , {player_name}, no playoff OR regular season data \n")
        
    return career_game_logs

In [None]:
k, w = 1000, 2000
player_dfs = []
for num,(index, row) in enumerate(df.iloc[k:w].iterrows()):
    print(k + num)
    
    if (num + 1) % 10 == 0:
        game_logs = pd.concat([player for player in player_dfs],ignore_index=True,sort=False)
        game_logs.to_csv(f"{advDataDir}/test_adv_game_logs.csv",index=False)
        print("df shape",game_logs.shape)
    
    career_game_logs = get_career_game_logs(row)
    player_dfs.append(career_game_logs)
    
    try:
        print(career_game_logs.shape)
    except:
        print("both dfs are None")
    
game_logs = pd.concat([player for player in player_dfs],ignore_index=True,sort=False)
print("\n")
print(game_logs.shape)

game_logs.to_csv(f"{advDataDir}/test_adv_game_logs - extract {k}-{w-1}.csv"
                 ,index=False)

1000
dawsoer01 Eric Dawson
(8, 29)
1001
dawsoji01 Jimmy Dawson
(20, 25)
1002
dawsoto01 Tony Dawson
(6, 29)
1003
dayto01 Todd Day
(501, 30)
1004
dayeau01 Austin Daye
(507, 31)
1005
dayeda01 Darren Daye
(362, 30)
1006
decolna01 Nando De Colo
(193, 31)
1007
deangbi01 Billy DeAngelis
(8, 25)
1008
debusda01 Dave DeBusschere*
(971, 25)
1009
df shape (2576, 31)
declean01 Andrew DeClercq
(655, 30)
1010
delonna01 Nate DeLong
(17, 22)
1011
deprejo01 Joe DePre
(138, 27)
1012
derozde01 DeMar DeRozan
(925, 30)
1013
dezonha01 Hank DeZonie
(5, 22)
1014
deanegr01 Greg Deane
(7, 28)
1015
dedmode01 Dewayne Dedmon
(519, 31)
1016
deedo01 Don Dee
(56, 26)
1017
deesar01 Archie Dees
(193, 23)
1018
deherte01 Terry Dehere
(427, 30)
1019
df shape (5518, 31)
dehnere01 Red Dehnert
(9, 22)
1020
dejeabr01 Bryce Dejean-Jones
(40, 29)
1021
dekkesa01 Sam Dekker
(344, 31)
1022
delnevi01 Vinny Del Negro
(882, 30)
1023
delanma01 Malcolm Delaney
(170, 31)
1024
delebi01 Bison Dele
(458, 30)
1025
delfica01 Carlos Delfino
(6

In [22]:
print(game_logs.shape)

(351913, 31)


In [None]:
print(game_logs.shape)

In [None]:
game_logs.tail()

In [16]:
|

SyntaxError: invalid syntax (<ipython-input-16-4b37ef281455>, line 1)

In [None]:
playoff_game_logs = None

row = df.iloc[0]

player_idx = row['index']
letter = player_idx[0]
player_name = row['Player']
print(letter,player_idx,player_name)

from_yr, to_yr = row[["From","To"]].values

#playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs/"
playoff_url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog-playoffs-advanced/"
webpage = urlopen(playoff_url)
html = BeautifulSoup(webpage)
tables = html.findAll('table')

if len(tables) > 0:
    table = str(html.findAll('table')[-1])
    playoff_game_logs = pd.read_html(table)[0]
    playoff_game_logs["PLAYOFF"] = 'Y'

In [None]:
playoff_game_logs