In [1]:
import os
import pandas as pd
import numpy as np

##### data directories

In [57]:
def _abs_child(parent, name):
    """Return the absolute, normalised path of `name` joined onto `parent`."""
    return os.path.abspath(os.path.join(parent, name))


# Everything is resolved from the notebook's working directory; the project
# root is taken to be its parent folder.
currDir = os.getcwd()
rootDir = _abs_child(currDir, '..')

# <root>/data and the sub-folders used by this notebook.
dataDir = _abs_child(rootDir, 'data')
rawDataDir, interimDataDir, finalDataDir, errorLog = (
    _abs_child(dataDir, sub) for sub in ('raw', 'interim', 'final', 'error_log')
)

#### read player bio data

In [3]:
# Load the player bios table from the raw-data folder.
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
bios = pd.read_csv(file)

# Array of player identifiers taken from the `index` column, one entry per bio row.
player_ids = bios['index']
index = player_ids.values

print("No of distinct players:", player_ids.nunique())

No of distinct players: 4800


##### aggregating scraped player data

In [83]:
# Combine every scraped game-log CSV in the raw-data directory into one frame.
#
# Files are selected by extension, and the bios table (which is read from this
# same directory but is not a game log) is excluded by name.  This replaces the
# earlier positional `files[2:]` slice, whose result depended on which entries
# happened to sort first in the directory listing.
# NOTE(review): the old slice skipped the first two sorted entries; presumably
# those were a hidden/system entry and the bios CSV -- confirm that no other
# non-game-log CSV lives in this directory.
BIOS_FILENAME = "all_NBA_ABA_players.csv"

files = sorted(
    f for f in os.listdir(rawDataDir)
    if f.lower().endswith(".csv") and f != BIOS_FILENAME
)

# Build the combined frame in one chain (no in-place mutation), dropping rows
# that are exact duplicates and renumbering the index from 0.
df = (
    pd.concat(
        [pd.read_csv(os.path.join(rawDataDir, f)) for f in files],
        sort=False,
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"No of game logs: {df.shape[0]:,}")

No of game logs: 1,585,441


In [86]:
# Spot check: shape of the game-log rows stored for player 'henrysk01'.
df.loc[df['INDEX'].eq('henrysk01')].shape

(0, 37)

#### rent calcs

In [82]:
# expensive option, no fee: twelve monthly payments
12 * 1650

19800

In [81]:
# cheap option with fee: one-off fee plus twelve monthly payments
1500 + 12 * 1495

19440

In [87]:
# more expensive option, no fee: twelve monthly payments
12 * 1700

20400

##### identifying players whose data has not been scraped

In [89]:
# Players from the bios table that have no rows in the aggregated game logs.
# Membership is tested against a set of the scraped ids, built once; the
# earlier `x not in df['INDEX'].values` rescanned the whole game-log column
# for every player.  The result and its order (that of `index`) are unchanged.
scraped_ids = set(df['INDEX'].unique())
not_scraped = [x for x in index if x not in scraped_ids]

print("No. of players no scraped:",len(not_scraped))

No. of players no scraped: 25


In [90]:
# Players recorded in the scraper's error log.
# Each non-blank line is read as three comma-separated fields: index, name, reason.
logger = f"{errorLog}/log.txt"

cols = ['index','name','reason']

with open(logger,"r") as logs:
    # Split on the first two commas only, so a comma inside the last field
    # cannot yield extra fields (which would make the DataFrame constructor
    # reject the row width).  Blank lines are skipped so they do not appear as
    # an empty-string player index.
    lines = [[x.strip() for x in line.strip().split(',', len(cols) - 1)]
             for line in logs if line.strip()]

error_df = pd.DataFrame(lines,columns=cols)
error_idx = error_df['index'].values

# Number of distinct player ids that appear in the log.
print(error_df['index'].nunique())

93


In [97]:
# Distinct player ids found in the error log, in order of first appearance.
error_df.loc[:, 'index'].unique()

array(['alexacl01', 'anigbik01', 'babbch01', 'bairsca01', 'bambamo01',
       'bembrde01', 'buyckdw01', 'cabocbr01', 'casspom01', 'cavanty01',
       'chiozch01', 'chrisra01', 'colsobo01', 'corleke01', 'dawsobr01',
       'derrima01', 'divindo01', 'diallha01', 'duvaltr01', 'edwarvi01',
       'feldeka01', 'frazimi01', 'fredeji01', 'gibsojo01', 'greendo01',
       'hamilju01', 'harprma01', 'hillida01', 'inglida01', 'jacksde01',
       'jacksfr01', 'jacksje01', 'jamesda01', 'jeffeam01', 'jeffeco01',
       'jerregr01', 'johnsal02', 'johnsbr02', 'josepkr01', 'karasse01',
       'kaunsa01', 'kellyry01', 'kingda01', 'ledori01', 'leeda03',
       'leonaga01', 'loydjo01', 'lucaska01', 'luwawti01', 'lydonty01',
       'mcculch01', 'mcgarmi01', 'mekelga01', 'melofa01', 'metuch01',
       'millequ01', 'mitrona01', 'mykhasv01', 'napolpa01', 'nolenpa01',
       'ohlbrti01', 'papagge01', 'pattoju01', 'paytoga02', 'portemi01',
       'poythal01', 'reedda01', 'reedwi02', 'ricegl02', 'richama01',
    

In [99]:
# Spot check: shape of the game-log rows stored for player 'metuch01'.
df.loc[df['INDEX'].eq('metuch01')].shape

(145, 37)

In [42]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed and `to_scrape` is not defined by this cell.
# If re-enabled it would merge the error-log ids (`error_idx`) with
# `not_scraped`, de-duplicate and sort them into `to_scrape`, and print the count.
# NOTE(review): any numeric output shown under this cell appears to come from
# an earlier run made while the code was still live -- confirm before relying on it.
'''
to_scrape = list(error_idx) + not_scraped
to_scrape = [x for x in set(to_scrape)]
to_scrape.sort()

print(len(to_scrape))
'''

127


In [48]:
# Disabled cell: the code below is wrapped in a string literal, so nothing is
# written when the cell runs.
# If re-enabled it would save `to_scrape` as a one-column ('index') CSV named
# error_indices.csv in the interim data directory, without the row index.
'''
pd.DataFrame(to_scrape,columns=['index']).to_csv(
    f"{interimDataDir}/error_indices.csv"
    ,index=False)
'''

In [95]:
# Display the player ids from the bios table that have no game-log rows.
not_scraped

['anderer01',
 'bowmair01',
 'brindau01',
 'carnebo01',
 'colonjo01',
 'corleke01',
 'dickehe01',
 'flemial01',
 'henrysk01',
 'jacksaa01',
 'jentch01',
 'johnstr01',
 'killuea01',
 'kingda01',
 'kitchcu01',
 'klotzhe01',
 'lowerch01',
 'mahonmo01',
 'napolpa01',
 'raikesh01',
 'rockge01',
 'stockda01',
 'widbyro01',
 'winklma01',
 'yonakri01']

In [94]:
# Bio records for the players whose ids are in `not_scraped`.
bios.loc[bios['index'].isin(not_scraped)]

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges
89,anderer01,Eric Anderson,1993,1994,F,6-9,220.0,"May 26, 1970",Indiana
429,bowmair01,Ira Bowman,2000,2002,G,6-5,195.0,"June 11, 1973","Providence, University of Pennsylvania"
488,brindau01,Aud Brindley,1947,1947,F,6-4,175.0,"December 31, 1923",Dartmouth College
662,carnebo01,Bob Carney,1955,1955,G,6-3,170.0,"August 3, 1932",Bradley
804,colonjo01,Joe Colone,1949,1949,F,6-5,210.0,"January 23, 1924",Bloomsburg University of Pennsylvania
850,corleke01,Ken Corley,1947,1947,C,6-5,210.0,"May 10, 1920",Central State College
1052,dickehe01,Henry Dickerson,1976,1977,G,6-4,190.0,"November 27, 1951",University of Charleston
1325,flemial01,Al Fleming,1978,1978,F,6-7,215.0,"April 5, 1954",Arizona
1826,henrysk01,Skeeter Henry,1994,1994,G,6-7,190.0,"December 8, 1967","Midland College, Oklahoma"
2021,jacksaa01,Aaron Jackson,2018,2018,G,6-4,185.0,"May 6, 1986",Duquesne
