In [1]:
import os
import pandas as pd
import numpy as np

##### data directories

In [57]:
# Anchor every path on the notebook's working directory; the project root
# is taken to be the parent of that directory.
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir, os.pardir))

# rootDir is already absolute and normalised, so plain joins produce the
# same absolute paths as wrapping each join in abspath would.
dataDir = os.path.join(rootDir, 'data')
rawDataDir = os.path.join(dataDir, 'raw')
interimDataDir = os.path.join(dataDir, 'interim')
finalDataDir = os.path.join(dataDir, 'final')
errorLog = os.path.join(dataDir, 'error_log')

#### helper functions

In [280]:
def get_margin(data):
    """Split result strings such as ``"W (+4)"`` into outcome and margin.

    Parameters
    ----------
    data : iterable of str
        Each item holds at least two whitespace-separated tokens: the
        outcome, then the margin wrapped in one leading and one trailing
        character (e.g. ``"(+4)"``, ``"(-12)"`` or ``"(7)"``). Tokens after
        the second are ignored.

    Returns
    -------
    outcome : tuple of str
        First token of every item, in input order.
    margin : list of int
        Signed margin of every item, in input order.

    Raises
    ------
    ValueError
        If an item has fewer than two tokens, or its margin text is not an
        integer (for example the empty margin ``"()"``).
    """
    outcome, margin = [], []

    for result in data:
        tokens = result.strip().split()
        if len(tokens) < 2:
            raise ValueError(
                f"cannot parse result {result!r}: "
                "expected '<outcome> (<margin>)'")

        outcome.append(tokens[0])
        # int() accepts an explicit '+' or '-' sign, so removing the
        # surrounding brackets is all that is needed.
        margin.append(int(tokens[1][1:-1]))

    # Building the two sequences row by row (instead of transposing with
    # zip and unpacking) also makes empty input return two empty sequences.
    return tuple(outcome), margin

In [374]:
def get_seconds_played(data):
    """Convert ``"MM:SS"`` strings into a number of seconds.

    Items that are not plain ``str`` objects (e.g. NaN for a missing value)
    are returned as ``np.nan``. The result is a list with one entry per
    input item, in input order.
    """
    converted_values = []

    for entry in data:
        if type(entry) is str:
            # exactly two ':'-separated parts are expected: minutes, seconds
            mins, secs = entry.split(":")
            converted_values.append(int(mins) * 60 + int(secs))
        else:
            converted_values.append(np.nan)

    return converted_values

#### read player bio data

In [3]:
# load the player bio table from the raw data directory
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
bios = pd.read_csv(file)

# keep each player's unique identifier for later cross-checks
player_ids = bios['index']
index = player_ids.values

print("No of distinct players:", player_ids.nunique())

No of distinct players: 4800


##### aggregating scraped player data

In [392]:
# set to False to rebuild `df` without rewriting the interim CSV
save = True

# combine all files in the raw data directory
files = sorted(os.listdir(rawDataDir))
# NOTE(review): the first two entries (in sorted order) are skipped purely by
# position -- presumably they are not game-log files (e.g. the player bio
# CSV). Confirm, and consider filtering by file name instead, since this
# silently breaks if the directory contents change.
files = files[2:]

# concat data and drop exact duplicate rows
df = pd.concat([pd.read_csv(f"{rawDataDir}/{f}")
                for f in files], sort=False)
df = df.drop_duplicates()

# handling error in margins for game 7 of 2016 NBA finals
replace_dict = {"RESULT": {"W ()": "W (+4)",
                           "L ()": "L (-4)"}}
df = df.replace(replace_dict)

# dropping DNPs (rows whose GS column holds a did-not-play marker)
DNP = ['Did Not Play', 'Inactive', 'Did Not Dress',
       'Not With Team', 'Player Suspended']
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not write into a slice (SettingWithCopyWarning)
df = df[~df['GS'].isin(DNP)].copy()

# adding columns: outcome/margin parsed from RESULT, seconds played from MP
df['OUTCOME'], df['MARGIN'] = get_margin(df['RESULT'])
df['SP'] = get_seconds_played(df['MP'])

# drop helper columns, order by player then date, and renumber the rows
dropcols = ['Rk', 'Unnamed: 31', 'RESULT']
df = (df.drop(columns=dropcols)
        .sort_values(['INDEX', 'DATE'])
        .reset_index(drop=True))

if save:
    df.to_csv(f"{interimDataDir}/consolidated_game_logs.csv", index=False)
    print("df saved")

print(f"No of game logs: {df.shape[0]:,}")

df saved
No of game logs: 1,385,152


In [393]:
# No of game logs: 1,391,661

In [394]:
# list the column labels of the consolidated frame
df.columns

Index(['G', 'DATE', 'AGE', 'TEAM', 'HOME/AWAY', 'OPPONENT', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GAME_SCORE', 'PLAYOFF',
       'LEAGUE', 'SERIES', 'G#', 'INDEX', 'NAME', '+/-', 'OUTCOME', 'MARGIN',
       'SP'],
      dtype='object')

In [395]:
# preview the first rows of the consolidated frame
df.head()

Unnamed: 0,G,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,GS,MP,FG,FGA,...,PLAYOFF,LEAGUE,SERIES,G#,INDEX,NAME,+/-,OUTCOME,MARGIN,SP
0,1.0,1990-11-02,22-131,POR,HOME,HOU,0,5:00,0,1,...,N,NBA,,,abdelal01,Alaa Abdelnaby,,W,1,300.0
1,2.0,1990-11-06,22-135,POR,AWAY,LAL,0,4:00,0,0,...,N,NBA,,,abdelal01,Alaa Abdelnaby,,W,2,240.0
2,3.0,1990-11-11,22-140,POR,HOME,LAC,0,8:00,2,4,...,N,NBA,,,abdelal01,Alaa Abdelnaby,,W,31,480.0
3,4.0,1990-11-13,22-142,POR,HOME,DEN,0,3:00,0,2,...,N,NBA,,,abdelal01,Alaa Abdelnaby,,W,26,180.0
4,5.0,1990-11-15,22-144,POR,HOME,NYK,0,6:00,0,2,...,N,NBA,,,abdelal01,Alaa Abdelnaby,,W,16,360.0


In [397]:
# preview the last rows of the consolidated frame
df.tail()

Unnamed: 0,G,DATE,AGE,TEAM,HOME/AWAY,OPPONENT,GS,MP,FG,FGA,...,PLAYOFF,LEAGUE,SERIES,G#,INDEX,NAME,+/-,OUTCOME,MARGIN,SP
1385147,52.0,1949-03-09,29-080,WSC,AWAY,NYK,,,1,,...,N,NBA,,,zunicma01,Matt Zunic,,L,-16,
1385148,53.0,1949-03-12,29-083,WSC,HOME,FTW,,,3,,...,N,NBA,,,zunicma01,Matt Zunic,,W,27,
1385149,54.0,1949-03-16,29-087,WSC,HOME,MNL,,,3,,...,N,NBA,,,zunicma01,Matt Zunic,,W,13,
1385150,55.0,1949-03-17,29-088,WSC,AWAY,PHW,,,2,,...,N,NBA,,,zunicma01,Matt Zunic,,L,-3,
1385151,56.0,1949-03-19,29-090,WSC,HOME,NYK,,,1,,...,N,NBA,,,zunicma01,Matt Zunic,,L,-12,


##### identifying players whose data has not been scraped

In [89]:
# Build the set of scraped player ids once: testing membership against the
# raw column array would rescan every game-log row for each player id.
scraped_ids = set(df['INDEX'].unique())

# bio identifiers that have no rows at all in the consolidated game logs
not_scraped = [x for x in index if x not in scraped_ids]

print("No. of players no scraped:",len(not_scraped))

No. of players no scraped: 25


In [90]:
# players in the error log
logger = f"{errorLog}/log.txt"

# every log line is a comma-separated record; trim whitespace around the
# line and around each of its fields
with open(logger, "r") as logs:
    lines = [list(map(str.strip, line.strip().split(',')))
             for line in logs]

cols = ['index', 'name', 'reason']
error_df = pd.DataFrame(lines, columns=cols)

logged_ids = error_df['index']
error_idx = logged_ids.values

print(logged_ids.nunique())

93


In [103]:
#error_df['index'].unique()

In [104]:
#df[df['INDEX']=='tavarwa01'].shape

In [42]:
'''
to_scrape = list(error_idx) + not_scraped
to_scrape = [x for x in set(to_scrape)]
to_scrape.sort()

print(len(to_scrape))
'''

127


In [48]:
'''
pd.DataFrame(to_scrape,columns=['index']).to_csv(
    f"{interimDataDir}/error_indices.csv"
    ,index=False)
'''

In [101]:
#bios[bios['index'].isin(not_scraped)]