In [424]:
import os
import pandas as pd
import numpy as np

##### Data Directories

In [425]:
# Project layout, resolved relative to the notebook's working directory:
#   <root>/data/{raw, interim, final, error_log}
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir, os.pardir))

# rootDir is already absolute and normalised, so plain joins are sufficient below.
dataDir = os.path.join(rootDir, 'data')
rawDataDir = os.path.join(dataDir, 'raw')
interimDataDir = os.path.join(dataDir, 'interim')

finalDataDir = os.path.join(dataDir, 'final')
errorLog = os.path.join(dataDir, 'error_log')

##### Read Bio Data

In [426]:
# Load the player index CSV from the raw-data folder.
file = f"{rawDataDir}/all_NBA_ABA_players.csv"
bio = pd.read_csv(file)

# Hall of Fame flag: 'Y' when the player's name contains "*", otherwise 'N'.
bio['HOF'] = bio['Player'].map(lambda name: 'Y' if "*" in name else 'N')

print("No of distinct players:",bio['index'].nunique())

No of distinct players: 4800


In [427]:
# Preview the first five rows of the loaded bio table.
bio.head(n=5)

Unnamed: 0,index,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,HOF
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke,N
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State,N
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA,Y
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU,N
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State",N


##### Scrape Bio Data

In [428]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import string

# url = f"https://www.basketball-reference.com/players/{letter}/{player_idx}/gamelog/{year}"


In [429]:
def feetToCm(data: pd.Series):
    '''Convert heights written as "feet-inches" (e.g. "6-10") to centimetres.

    Parameters
    ----------
    data : pd.Series
        Heights as "<feet>-<inches>" strings. Missing values (NaN/None) and
        entries that do not match the pattern are tolerated.

    Returns
    -------
    numpy.ndarray or None
        Heights in centimetres rounded to 2 decimals, in input order, with
        NaN for every entry that could not be parsed. None is returned when
        `data` is empty (kept for backward compatibility).
    '''
    if data.empty:
        return None

    cm = []
    for val in data:
        try:
            # str() lets NaN/None fall through to the ValueError branch
            # instead of failing on a missing .split attribute.
            ft, inch = str(val).split("-")
            cm.append((int(ft) * 12 + int(inch)) * 2.54)
        except ValueError:
            # Wrong number of parts or non-integer parts: height unknown.
            cm.append(np.nan)

    return np.round(cm, 2)

def getPlayerMetadata(letter: str,letterHTML=None):
    '''Extract the unique index and relative URL of each player on a letter page.

    Parameters
    ----------
    letter : str
        Letter the page belongs to; used to build each player's URL.
    letterHTML : parsed HTML document, optional
        Object exposing `find_all("tr")` whose rows expose `.contents`
        (a BeautifulSoup document in this notebook).

    Returns
    -------
    list[list[str]] or None
        One `[playerIdx, playerURL]` pair per row whose first child carries a
        `data-append-csv` attribute, in document order. None when
        `letterHTML` is missing/falsy.
    '''
    if not letterHTML:
        return None

    playerMetadata = []

    for row in letterHTML.find_all("tr"):
        # Only the row's first child is inspected. Rows with no children, or
        # whose first child is plain text (no .attrs), are skipped explicitly
        # rather than through a catch-all except.
        children = row.contents
        firstCell = children[0] if children else None
        attrs = getattr(firstCell, "attrs", None) or {}

        playerIdx = attrs.get("data-append-csv")
        if playerIdx is None:
            continue

        playerURL = f"/players/{letter}/{playerIdx}.html"
        playerMetadata.append([playerIdx, playerURL])

    return playerMetadata

In [430]:
def getAllBioData():
    '''Extract the bios of all players, one letter page at a time.

    Calls `getBioDataByLetter` for every lowercase ASCII letter, stacks the
    per-letter frames into one frame with a fresh 0..n-1 index, and passes
    the result through `formatBioData`.

    Returns
    -------
    pd.DataFrame
        The formatted, consolidated bio table.
    '''
    # Collect first and concatenate once: avoids re-copying the accumulated
    # frame on every iteration and needs no placeholder object.
    letterFrames = [getBioDataByLetter(letter)
                    for letter in string.ascii_lowercase]
    df = pd.concat(letterFrames, ignore_index=True)

    return formatBioData(df)

def formatBioData(df: pd.DataFrame):
    '''Add the Hall of Fame flag and metric height, then re-order the columns.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'ID', 'Player', 'From', 'To', 'Pos', 'Ht',
        'Wt', 'Birth Date', 'Colleges' and 'URL'. It is not modified.

    Returns
    -------
    pd.DataFrame
        New frame with two derived columns:
        'HOF' is 'Y' when the player name contains "*", otherwise 'N'
        (a missing name yields 'N'); 'Ht (cm)' is `feetToCm` applied to 'Ht'.
        Columns are returned in the fixed order listed in `cols`.

    Raises
    ------
    KeyError
        If any required column is missing.
    '''
    # astype(str) keeps the test well-defined for missing names.
    isHallOfFamer = df['Player'].astype(str).str.contains("*", regex=False)

    # assign() builds a new frame, so the caller's df is left untouched.
    formatted = df.assign(**{
        'HOF': np.where(isHallOfFamer, 'Y', 'N'),
        'Ht (cm)': feetToCm(df['Ht']),
    })

    cols = ['ID','Player', 'From', 'To', 'Pos', 'Ht', "Ht (cm)"
            ,'Wt','Birth Date', 'Colleges','HOF', 'URL']

    return formatted[cols]
    
def getBioDataByLetter(letter: str):
    '''Extract all bios for players whose last name begins with `letter`.

    Downloads the letter's index page from basketball-reference.com, reads
    the first <table> on the page into a DataFrame, and appends the player
    'ID' and 'URL' columns produced by `getPlayerMetadata`.

    Parameters
    ----------
    letter : str
        Letter inserted into the page URL.

    Returns
    -------
    pd.DataFrame
        The page's bio table with 'ID' and 'URL' columns added on the right.

    Raises
    ------
    IndexError
        If the page contains no <table>.
    AssertionError
        If the table row count differs from the number of players found by
        `getPlayerMetadata` (the two could not be aligned row by row).
    '''
    # Local import keeps this cell self-contained.
    from io import StringIO

    letterURL = f"https://www.basketball-reference.com/players/{letter}/"

    # Parse inside the context manager so the HTTP response is always closed.
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    with urlopen(letterURL) as webpage:
        html = BeautifulSoup(webpage, "html.parser")

    tables = html.find_all('table')

    # read_html expects a path/URL or file-like object; wrap the markup.
    bioData = pd.read_html(StringIO(str(tables[0])))[0]
    metaData = getPlayerMetadata(letter,html)

    assert len(bioData) == len(metaData), (
        f"letter '{letter}': {len(bioData)} table rows but "
        f"{len(metaData)} player metadata entries"
    )

    letterDF = pd.concat([bioData,
                          pd.DataFrame(metaData,columns=["ID","URL"])]
                         ,axis=1)

    return letterDF

In [431]:
# Scrape every letter page and build the consolidated, formatted bio table.
# This performs one HTTP request per letter, so the cell needs network access.
df = getAllBioData()
# Sanity check: (rows, columns) of the consolidated table.
print(df.shape)

(4803, 12)


In [432]:
# Disabled export: the statements below are wrapped in a string literal, so
# they are not executed. If run, they would write df to
# consolidatedBioData.csv under rawDataDir without the index column.
'''
location = rawDataDir + "/consolidatedBioData.csv"
df.to_csv(location,index=False)
'''

In [434]:
# Preview the first ten rows of the consolidated bio table.
df.head(n=10)

Unnamed: 0,ID,Player,From,To,Pos,Ht,Ht (cm),Wt,Birth Date,Colleges,HOF,URL
0,abdelal01,Alaa Abdelnaby,1991,1995,F-C,6-10,208.28,240.0,"June 24, 1968",Duke,N,/players/a/abdelal01.html
1,abdulza01,Zaid Abdul-Aziz,1969,1978,C-F,6-9,205.74,235.0,"April 7, 1946",Iowa State,N,/players/a/abdulza01.html
2,abdulka01,Kareem Abdul-Jabbar*,1970,1989,C,7-2,218.44,225.0,"April 16, 1947",UCLA,Y,/players/a/abdulka01.html
3,abdulma02,Mahmoud Abdul-Rauf,1991,2001,G,6-1,185.42,162.0,"March 9, 1969",LSU,N,/players/a/abdulma02.html
4,abdulta01,Tariq Abdul-Wahad,1998,2003,F,6-6,198.12,223.0,"November 3, 1974","Michigan, San Jose State",N,/players/a/abdulta01.html
5,abdursh01,Shareef Abdur-Rahim,1997,2008,F,6-9,205.74,225.0,"December 11, 1976",California,N,/players/a/abdursh01.html
6,abernto01,Tom Abernethy,1977,1981,F,6-7,200.66,220.0,"May 6, 1954",Indiana,N,/players/a/abernto01.html
7,ablefo01,Forest Able,1957,1957,G,6-3,190.5,180.0,"July 27, 1932",Western Kentucky,N,/players/a/ablefo01.html
8,abramjo01,John Abramovic,1947,1948,F,6-3,190.5,195.0,"February 9, 1919",Salem International University,N,/players/a/abramjo01.html
9,abrinal01,Álex Abrines,2017,2019,G-F,6-6,198.12,200.0,"August 1, 1993",,N,/players/a/abrinal01.html


In [440]:
# Spot-check a single player: ID, profile URL and first/last season.
df.loc[df['ID'] == 'abdelal01', ['ID', 'URL', 'From', 'To']]

Unnamed: 0,ID,URL,From,To
0,abdelal01,/players/a/abdelal01.html,1991,1995
