In [63]:
import pandas as pd
from pandas import Series,DataFrame,read_html
import numpy as np

from bs4 import BeautifulSoup
import html5lib

In [64]:
# URL template for one draft class; `{year}` is filled in by the scraping loop.
# HTTPS is used so the request is not sent in cleartext and then redirected.
url_template = "https://www.basketball-reference.com/draft/NBA_{year}.html"

In [65]:
# Start from an empty frame; every draft year's table is stacked onto it later.
draft_dframe = DataFrame()

In [66]:
# Scrape one draft table per year, then stack them all in a single concat.
# Appending to a DataFrame inside the loop copies the accumulated data on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
yearly_frames = []
for year in range(1989, 2018):  # draft classes 1989 through 2017 inclusive
    url = url_template.format(year=year)  # page for this draft class

    # read_html returns a list with one DataFrame per <table>; take the first.
    dframe_year = pd.read_html(url)[0]

    # Drop the first column (the NaN-filled one) without mutating in place.
    dframe_year = dframe_year.drop(dframe_year.columns[[0]], axis=1)

    # Tag every row with the draft year as the new leading column.
    dframe_year.insert(0, 'Draft_Yr', year)

    yearly_frames.append(dframe_year)

# ignore_index=True renumbers the rows 0..n-1 across all years.
draft_dframe = pd.concat(yearly_frames, ignore_index=True)

In [67]:
# Never elide columns when a frame is rendered.
pd.set_option('display.max_columns', None)

# Flat, readable headers assigned positionally to the scraped columns.
# NOTE(review): 'Assits per Game' is misspelled; kept as-is because cells
# outside this view may reference the column by this exact name.
column_names = [
    'Draft_Yr', 'Pk', 'Team', 'Player', 'College',
    'Yrs', 'Games', 'Minutes Played', 'PTS', 'TRB', 'AST',
    'FG_Percentage', '3P_Percentage', 'FT_Percentage',
    'Minutes per Game', 'Points per Game', 'TRB per game', 'Assits per Game',
    'Win Share', 'WS_per_game', 'BPM', 'VORP',
]
draft_dframe.columns = column_names

In [68]:
# Preview only the first rows rather than rendering the entire frame.
draft_dframe.head(10)

Unnamed: 0,Draft_Yr,Pk,Team,Player,College,Yrs,Games,Minutes Played,PTS,TRB,AST,FG_Percentage,3P_Percentage,FT_Percentage,Minutes per Game,Points per Game,TRB per game,Assits per Game,Win Share,WS_per_game,BPM,VORP
0,1989,1,SAC,Pervis Ellison,University of Louisville,11,474,11593,4494,3170,691,.510,.050,.689,24.5,9.5,6.7,1.5,21.8,.090,0.5,7.4
1,1989,2,LAC,Danny Ferry,Duke University,13,917,18133,6439,2550,1185,.446,.393,.840,19.8,7.0,2.8,1.3,34.8,.092,-0.9,5.0
2,1989,3,SAS,Sean Elliott,University of Arizona,12,742,24502,10544,3204,1897,.465,.375,.799,33.0,14.2,4.3,2.6,55.7,.109,0.6,16.1
3,1989,4,MIA,Glen Rice,University of Michigan,15,1000,34985,18336,4387,2097,.456,.400,.846,35.0,18.3,4.4,2.1,88.7,.122,0.5,22.1
4,1989,5,CHH,J.R. Reid,University of North Carolina,11,672,15370,5680,3381,639,.472,.135,.716,22.9,8.5,5.0,1.0,22.5,.070,-2.0,-0.2
5,1989,6,CHI,Stacey King,University of Oklahoma,8,438,7406,2819,1460,387,.478,.235,.707,16.9,6.4,3.3,0.9,10.9,.071,-3.0,-1.9
6,1989,7,IND,George McCloud,Florida State University,12,766,17429,6925,2342,1769,.402,.358,.810,22.8,9.0,3.1,2.3,24.6,.068,-0.7,5.6
7,1989,8,DAL,Randy White,Louisiana Tech University,5,281,5382,2083,1366,175,.401,.193,.707,19.2,7.4,4.9,0.6,1.9,.017,-4.4,-3.3
8,1989,9,WSB,Tom Hammonds,Georgia Institute of Technology,12,687,10419,3617,2243,378,.480,.000,.691,15.2,5.3,3.3,0.6,14.7,.068,-4.3,-6.0
9,1989,10,MIN,Pooh Richardson,"University of California, Los Angeles",10,639,19399,7083,1807,4180,.444,.329,.652,30.4,11.1,2.8,6.5,26.7,.066,-0.9,5.5


In [71]:
# Remove the rows that only served as breaks between draft rounds:
# those with a missing pick number and those whose pick cell is the
# literal header text 'Pk'.
pick = draft_dframe['Pk']
is_player_row = pick.notnull() & (pick != 'Pk')
draft_dframe = draft_dframe[is_player_row]

In [72]:
# Preview only the first rows rather than rendering the entire frame.
draft_dframe.head(10)

Unnamed: 0,Draft_Yr,Pk,Team,Player,College,Yrs,Games,Minutes Played,PTS,TRB,AST,FG_Percentage,3P_Percentage,FT_Percentage,Minutes per Game,Points per Game,TRB per game,Assits per Game,Win Share,WS_per_game,BPM,VORP
0,1989,1,SAC,Pervis Ellison,University of Louisville,11,474,11593,4494,3170,691,.510,.050,.689,24.5,9.5,6.7,1.5,21.8,.090,0.5,7.4
1,1989,2,LAC,Danny Ferry,Duke University,13,917,18133,6439,2550,1185,.446,.393,.840,19.8,7.0,2.8,1.3,34.8,.092,-0.9,5.0
2,1989,3,SAS,Sean Elliott,University of Arizona,12,742,24502,10544,3204,1897,.465,.375,.799,33.0,14.2,4.3,2.6,55.7,.109,0.6,16.1
3,1989,4,MIA,Glen Rice,University of Michigan,15,1000,34985,18336,4387,2097,.456,.400,.846,35.0,18.3,4.4,2.1,88.7,.122,0.5,22.1
4,1989,5,CHH,J.R. Reid,University of North Carolina,11,672,15370,5680,3381,639,.472,.135,.716,22.9,8.5,5.0,1.0,22.5,.070,-2.0,-0.2
5,1989,6,CHI,Stacey King,University of Oklahoma,8,438,7406,2819,1460,387,.478,.235,.707,16.9,6.4,3.3,0.9,10.9,.071,-3.0,-1.9
6,1989,7,IND,George McCloud,Florida State University,12,766,17429,6925,2342,1769,.402,.358,.810,22.8,9.0,3.1,2.3,24.6,.068,-0.7,5.6
7,1989,8,DAL,Randy White,Louisiana Tech University,5,281,5382,2083,1366,175,.401,.193,.707,19.2,7.4,4.9,0.6,1.9,.017,-4.4,-3.3
8,1989,9,WSB,Tom Hammonds,Georgia Institute of Technology,12,687,10419,3617,2243,378,.480,.000,.691,15.2,5.3,3.3,0.6,14.7,.068,-4.3,-6.0
9,1989,10,MIN,Pooh Richardson,"University of California, Los Angeles",10,639,19399,7083,1807,4180,.444,.329,.652,30.4,11.1,2.8,6.5,26.7,.066,-0.9,5.5


In [74]:
# Replace the remaining NaN's with zeroes. This applies to every column,
# text columns such as College included.
draft_dframe = draft_dframe.fillna(0)
# Renumber the rows 0..n-1 after the filtering above. reset_index adapts to
# however many rows were scraped, where a hard-coded range(1682) raises a
# ValueError as soon as the row count differs.
draft_dframe = draft_dframe.reset_index(drop=True)

In [75]:
# Preview only the first rows rather than rendering the entire frame.
draft_dframe.head(10)

Unnamed: 0,Draft_Yr,Pk,Team,Player,College,Yrs,Games,Minutes Played,PTS,TRB,AST,FG_Percentage,3P_Percentage,FT_Percentage,Minutes per Game,Points per Game,TRB per game,Assits per Game,Win Share,WS_per_game,BPM,VORP
0,1989,1,SAC,Pervis Ellison,University of Louisville,11,474,11593,4494,3170,691,.510,.050,.689,24.5,9.5,6.7,1.5,21.8,.090,0.5,7.4
1,1989,2,LAC,Danny Ferry,Duke University,13,917,18133,6439,2550,1185,.446,.393,.840,19.8,7.0,2.8,1.3,34.8,.092,-0.9,5.0
2,1989,3,SAS,Sean Elliott,University of Arizona,12,742,24502,10544,3204,1897,.465,.375,.799,33.0,14.2,4.3,2.6,55.7,.109,0.6,16.1
3,1989,4,MIA,Glen Rice,University of Michigan,15,1000,34985,18336,4387,2097,.456,.400,.846,35.0,18.3,4.4,2.1,88.7,.122,0.5,22.1
4,1989,5,CHH,J.R. Reid,University of North Carolina,11,672,15370,5680,3381,639,.472,.135,.716,22.9,8.5,5.0,1.0,22.5,.070,-2.0,-0.2
5,1989,6,CHI,Stacey King,University of Oklahoma,8,438,7406,2819,1460,387,.478,.235,.707,16.9,6.4,3.3,0.9,10.9,.071,-3.0,-1.9
6,1989,7,IND,George McCloud,Florida State University,12,766,17429,6925,2342,1769,.402,.358,.810,22.8,9.0,3.1,2.3,24.6,.068,-0.7,5.6
7,1989,8,DAL,Randy White,Louisiana Tech University,5,281,5382,2083,1366,175,.401,.193,.707,19.2,7.4,4.9,0.6,1.9,.017,-4.4,-3.3
8,1989,9,WSB,Tom Hammonds,Georgia Institute of Technology,12,687,10419,3617,2243,378,.480,.000,.691,15.2,5.3,3.3,0.6,14.7,.068,-4.3,-6.0
9,1989,10,MIN,Pooh Richardson,"University of California, Los Angeles",10,639,19399,7083,1807,4180,.444,.329,.652,30.4,11.1,2.8,6.5,26.7,.066,-0.9,5.5


In [76]:
# Total number of missing cells left in the frame: count nulls per column,
# then add the per-column counts together.
nulls_per_column = draft_dframe.isnull().sum()
nulls_per_column.sum()

0L