In [15]:
import pickle
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [16]:
# let wide frames show up to 100 columns before pandas truncates the display
pd.options.display.max_columns = 100

In [17]:
def getAndParseUrl(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with Python's built-in ``'html.parser'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        For connection failures and timeouts.
    """
    results = requests.get(url, timeout=timeout)
    # pause one second after every request (presumably to avoid hammering the server)
    time.sleep(1)
    # fail loudly on an error response instead of parsing an error page as data
    results.raise_for_status()
    return BeautifulSoup(results.text, 'html.parser')

In [125]:
def get_draft_years(max_offset=7800, page_size=100, checkpoint_path='draft_history_df.pickle'):
    """Scrape the basketball-reference draft finder into a single DataFrame.

    Requests the draft-finder search URL (years 1950-2019) at offsets
    ``0, page_size, 2*page_size, ...`` below ``max_offset``. Each page is
    downloaded once; the stats table and the player ids are both read from
    that same document so they cannot get out of sync.

    Parameters
    ----------
    max_offset : int, optional
        Exclusive upper bound for the ``offset`` query parameter (default 7800).
    page_size : int, optional
        Step between consecutive offsets (default 100).
    checkpoint_path : str, optional
        Pickle file rewritten after every page so already scraped rows
        survive a failure part-way through (default 'draft_history_df.pickle').

    Returns
    -------
    pandas.DataFrame
        All scraped rows with lower-cased column names plus a ``pid`` column
        holding each player's id, or the string ``'None'`` when the page has
        no id for that player. The index is not reset, so row labels repeat
        from page to page.

    Raises
    ------
    ValueError
        If the number of player cells on a page differs from the number of
        table rows left after cleaning, or if a page contains no table.
    """
    base_url = (
        'https://www.basketball-reference.com/play-index/draft_finder.cgi'
        '?request=1&year_min=1950&year_max=2019&round_min=&round_max='
        '&pick_overall_min=&pick_overall_max=&franch_id=&college_id=0'
        '&is_active=&is_hof='
        '&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y'
        '&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val='
        '&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val='
        '&order_by=year_id&order_by_asc=&offset='
    )

    pages = []  # one cleaned DataFrame per scraped page

    for offset in tqdm(range(0, max_offset, page_size)):
        # single download per page; the table and the ids are both taken from it
        soup = getAndParseUrl(f'{base_url}{offset}')

        # header=1: the table's row 1 (0-based) supplies the column names
        page = pd.read_html(StringIO(str(soup)), header=1)[0]

        # one entry per player cell; 'None' keeps the list aligned with the
        # table rows when a cell carries no player id
        pids = [
            td.attrs.get('data-append-csv', 'None')
            for td in soup.select('td.left')
            if td.attrs.get('data-stat') == 'player'
        ]

        # drop repeated header rows and rows without a player name;
        # .copy() so adding a column below does not write into a slice
        keep = (page.Player != 'Player') & page.Player.notna()
        page = page[keep].copy()

        if len(pids) != len(page):
            raise ValueError(
                f'offset {offset}: found {len(pids)} player cells '
                f'but {len(page)} table rows'
            )
        page['pid'] = pids
        pages.append(page)

        # save as pickle each loop so data will not be lost if error occurs
        pd.concat(pages).to_pickle(checkpoint_path)

    if not pages:
        # nothing requested (empty offset range)
        return pd.DataFrame()

    df = pd.concat(pages)

    # make all column names lowercase (personal preference)
    df.columns = [col.lower() for col in df.columns]

    return df

In [126]:
# run the scraper (the recorded run below took about 3m47s for 78 pages)
draft_df = get_draft_years()

100%|██████████| 78/78 [03:47<00:00,  2.73s/it]


In [128]:
# re-save the finished frame: the checkpoints written inside get_draft_years()
# happen before its column names are lower-cased, so this overwrites them
draft_df.to_pickle('draft_history_df.pickle')

In [129]:
# preview the first rows instead of rendering the whole frame
draft_df.head(10)

Unnamed: 0,rk,year,lg,rd,pk,tm,player,age,pos,born,college,from,to,g,mp,pts,trb,ast,stl,blk,fg%,2p%,3p%,ft%,ws,ws/48,pid
0,1,2019,NBA,1,1,NOP,Zion Williamson,,F,us,Duke,,,,,,,,,,,,,,,,willizi01
1,2,2019,NBA,1,2,MEM,Ja Morant,,G,us,Murray State,,,,,,,,,,,,,,,,moranja01
2,3,2019,NBA,1,3,NYK,RJ Barrett,,G-F,ca,Duke,,,,,,,,,,,,,,,,barrerj01
3,4,2019,NBA,1,4,LAL,De'Andre Hunter,,F,us,Virginia,,,,,,,,,,,,,,,,huntede01
4,5,2019,NBA,1,5,CLE,Darius Garland,,G,us,Vanderbilt,,,,,,,,,,,,,,,,garlada01
5,6,2019,NBA,1,6,PHO,Jarrett Culver,,G,us,Texas Tech,,,,,,,,,,,,,,,,culveja01
6,7,2019,NBA,1,7,CHI,Coby White,,G,us,UNC,,,,,,,,,,,,,,,,whiteco01
7,8,2019,NBA,1,8,ATL,Jaxson Hayes,,F,us,Texas,,,,,,,,,,,,,,,,hayesja02
8,9,2019,NBA,1,9,WAS,Rui Hachimura,,F,jp,Gonzaga,,,,,,,,,,,,,,,,hachiru01
9,10,2019,NBA,1,10,ATL,Cam Reddish,,F,us,Duke,,,,,,,,,,,,,,,,reddica01


In [145]:
# convert every column that parses cleanly as numeric; leave the others as they are
for col in draft_df.columns:
    try:
        draft_df[col] = pd.to_numeric(draft_df[col])
    except (ValueError, TypeError):
        # the column holds values that are not numbers: keep it unchanged.
        # Only conversion failures are caught, so real errors (and Ctrl-C)
        # are no longer silently swallowed.
        continue

In [146]:
# inspect the dtype of each column after the numeric conversion
draft_df.dtypes

year        int64
tm         object
player     object
pos        object
from      float64
to        float64
dtype: object