In [1]:
import os
import pandas as pd
import numpy as np

##### data directories

In [2]:
# Resolve the project layout relative to the notebook's working directory:
# the project root is the parent of the current directory, and all data
# folders live under <root>/data.
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir, os.pardir))
dataDir = os.path.abspath(os.path.join(rootDir, 'data'))

# One absolute path per sub-folder of the data directory.
rawDataDir, interimDataDir, finalDataDir, errorLog = (
    os.path.abspath(os.path.join(dataDir, sub))
    for sub in ('raw', 'interim', 'final', 'error_log')
)

##### read player bio data

In [3]:
# Load the player bio table from the raw-data directory.
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
bios = pd.read_csv(filepath_or_buffer=file)

# The 'index' column holds each player's unique identifier; keep its values
# around for the coverage checks further down.
player_ids = bios['index']
index = player_ids.values

print("No of distinct players:", player_ids.nunique())

No of distinct players: 4800


##### aggregating scraped player data

In [12]:
# Combine all files in the raw-data directory into a single frame.
# The directory listing is sorted by name and its first two entries are
# dropped before reading.
# NOTE(review): which two entries the slice skips is not visible from this
# cell -- confirm it still excludes only non-game-log files.
files = sorted(os.listdir(rawDataDir))
files = files[2:]

# Read every remaining file and stack them; columns keep their order of first
# appearance (sort=False) and the row index is renumbered from 0.
df = pd.concat(
    (pd.read_csv(f"{rawDataDir}/{name}") for name in files),
    sort=False,
).reset_index(drop=True)

print(f"No of game logs: {len(df):,}")

No of game logs: 1,830,907


In [13]:
# Preview the first five aggregated game-log rows.
df.head(n=5)

Unnamed: 0,Rk,G,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,RESULT,GS,MP,...,PTS,GAME_SCORE,PLAYOFF,LEAGUE,SERIES,G#,INDEX,NAME,+/-,Unnamed: 31
0,1,1.0,1990-11-02,22-131,POR,HOME,HOU,W (+1),0,5:00,...,0,1.0,N,NBA,,,abdelal01,Alaa Abdelnaby,,
1,2,2.0,1990-11-06,22-135,POR,AWAY,LAL,W (+2),0,4:00,...,0,-2.2,N,NBA,,,abdelal01,Alaa Abdelnaby,,
2,3,3.0,1990-11-11,22-140,POR,HOME,LAC,W (+31),0,8:00,...,4,3.9,N,NBA,,,abdelal01,Alaa Abdelnaby,,
3,4,4.0,1990-11-13,22-142,POR,HOME,DEN,W (+26),0,3:00,...,0,-2.1,N,NBA,,,abdelal01,Alaa Abdelnaby,,
4,5,5.0,1990-11-15,22-144,POR,HOME,NYK,W (+16),0,6:00,...,1,-0.1,N,NBA,,,abdelal01,Alaa Abdelnaby,,


In [14]:
# Preview the last five aggregated game-log rows.
df.tail(n=5)

Unnamed: 0,Rk,G,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,RESULT,GS,MP,...,PTS,GAME_SCORE,PLAYOFF,LEAGUE,SERIES,G#,INDEX,NAME,+/-,Unnamed: 31
1830902,52,52.0,1949-03-09,29-080,WSC,AWAY,NYK,L (-16),,,...,2,,N,NBA,,,zunicma01,Matt Zunic,,
1830903,53,53.0,1949-03-12,29-083,WSC,HOME,FTW,W (+27),,,...,6,,N,NBA,,,zunicma01,Matt Zunic,,
1830904,54,54.0,1949-03-16,29-087,WSC,HOME,MNL,W (+13),,,...,12,,N,NBA,,,zunicma01,Matt Zunic,,
1830905,55,55.0,1949-03-17,29-088,WSC,AWAY,PHW,L (-3),,,...,5,,N,NBA,,,zunicma01,Matt Zunic,,
1830906,56,56.0,1949-03-19,29-090,WSC,HOME,NYK,L (-12),,,...,2,,N,NBA,,,zunicma01,Matt Zunic,,


##### identifying players whose data has not been scraped

In [21]:
# Players listed in the bio file whose identifier never appears in the
# aggregated game logs.
#
# The scraped identifiers are collected into a set once. Testing membership
# directly against df['INDEX'].values compares the whole array again for
# every player, which is needlessly slow on a frame this large. Missing values
# are dropped so that a NaN identifier can never count as "scraped", matching
# the element-wise comparison the array test performed.
scraped_ids = set(df['INDEX'].dropna().unique())
not_scraped = [x for x in index if x not in scraped_ids]

print("No. of players no scraped:",len(not_scraped))

No. of players no scraped: 41


In [29]:
# Players recorded in the scraper's error log.
# Each non-blank line of the log is parsed as "index, name, reason".
logger = f"{errorLog}/log.txt"
cols = ['index','name','reason']

with open(logger,"r") as logs:
    # Split on at most len(cols) - 1 commas so that a comma inside the final
    # free-text field stays part of that field instead of creating extra
    # columns (which would make the DataFrame constructor fail).
    # Blank lines are skipped so they do not become rows with an empty index.
    lines = [[x.strip() for x in line.strip().split(',', len(cols) - 1)]
             for line in logs if line.strip()]

error_df = pd.DataFrame(lines,columns=cols)
error_idx = error_df['index'].values

# Number of distinct players that appear in the log.
print(error_df['index'].nunique())

93


In [6]:
# Preview the first five parsed error-log entries.
error_df.head(n=5)

Unnamed: 0,index,name,reason
0,alexacl01,Cliff Alexander,error in playoff game logs
1,anigbik01,Ike Anigbogu,error in playoff game logs
2,babbch01,Chris Babb,error in playoff game logs
3,bairsca01,Cameron Bairstow,error in playoff game logs
4,bambamo01,Mohamed Bamba,error in playoff game logs


In [26]:
# Error-log entries that belong to players missing from the game logs.
error_df.loc[error_df['index'].isin(not_scraped)]

Unnamed: 0,index,name,reason
13,corleke01,Ken Corley,error in regular season game logs
14,corleke01,Ken Corley,error in regular season game logs
15,corleke01,Ken Corley,no playoff OR regular season data
44,kingda01,Dan King,error in regular season game logs
45,kingda01,Dan King,no playoff OR regular season data
61,napolpa01,Paul Napolitano,error in regular season game logs
62,napolpa01,Paul Napolitano,no playoff OR regular season data
63,nolenpa01,Paul Nolen,error in regular season game logs
64,nolenpa01,Paul Nolen,no playoff OR regular season data
86,walkefo01,Foots Walker,error in regular season game logs


In [31]:
# Error-log entries whose reason is exactly "no playoff OR regular season data".
error_df.loc[error_df['reason'].eq('no playoff OR regular season data')]

Unnamed: 0,index,name,reason
15,corleke01,Ken Corley,no playoff OR regular season data
45,kingda01,Dan King,no playoff OR regular season data
62,napolpa01,Paul Napolitano,no playoff OR regular season data
64,nolenpa01,Paul Nolen,no playoff OR regular season data
88,walkefo01,Foots Walker,no playoff OR regular season data
91,walkebi01,Henry Walker,no playoff OR regular season data
93,walkeho01,Horace Walker,no playoff OR regular season data


In [32]:
# Display the identifiers of players that have no rows in the game logs.
not_scraped

['alarima01',
 'alcorga01',
 'aldemfu01',
 'aldrico01',
 'aldrila01',
 'aleksch01',
 'anderer01',
 'bowmair01',
 'brindau01',
 'carnebo01',
 'colonjo01',
 'copato01',
 'copelch01',
 'copelho01',
 'copella01',
 'corbity01',
 'corchch01',
 'corleke01',
 'dickehe01',
 'flemial01',
 'henrysk01',
 'jacksaa01',
 'jentch01',
 'johnstr01',
 'killuea01',
 'kingda01',
 'kitchcu01',
 'klotzhe01',
 'lowerch01',
 'mahonmo01',
 'napolpa01',
 'nolenpa01',
 'raikesh01',
 'rockge01',
 'stockda01',
 'walkefo01',
 'walkebi01',
 'walkeho01',
 'widbyro01',
 'winklma01',
 'yonakri01']