In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import pickle

In [2]:
def players(year):
    '''Use BeautifulSoup to scrape and return the names of all NBA basketball players who played 
    in the season year according to basketball-reference.com'''
    
    url="http://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=\
    totals&per_minute_base=36&per_poss_base=100&lg_id=NBA&is_playoffs=N&year_min={0}&year_max=\
    {1}&franch_id=&season_start=1&season_end=-1&age_min=0&age_max=99&shoot_hand=&height_min=\
    0&height_max=99&birth_country_is=Y&birth_country=&birth_state=&college_id=&draft_year=\
    &is_active=&debut_yr_aba_start=&debut_yr_aba_end=&debut_yr_nba_start=&debut_yr_nba_end=\
    &is_hof=&is_as=&as_comp=gt&as_val=&award=&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=\
    Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&qual=&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val=\
    &c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&c5stat=&c5comp=&c6mult=1.0&c6stat=&order_by=\
    ws&order_by_asc=&offset=000".format(year,year)

    names=[]
    while True:
        response=requests.get(url)
        page=response.text
        page = re.sub('<!--', '', page)
        page = re.sub('-->', '', page)
        
        soup=BeautifulSoup(page, 'lxml') 
        
        if soup.find(id="all_stats")==None:
            break
        
        for row in soup.find(class_="over_header").parent.parent.find_all('tr'):
            if row.find(class_='left active')!=None:
                for entry in row.find(class_='left active'):
                    names.append(entry['href'])

        url=url[:len(url)-3]+str(int(url[len(url)-3:])+100)
    return names

I'm interested in players who played in both 2016 and 2017, so that I can use their 2016 performance to predict their 2017 salary.

In [3]:
links=list(set(players('2016')) & set(players('2017')))

In [4]:
df=[]

# for each player (equiv., link) go to the player's page and scrape features about that player. Put in to df.
for link in links:
    url="http://www.basketball-reference.com/{0}".format(link)
    response=requests.get(url)
    page=response.text
    page = re.sub('<!--', '', page)
    page = re.sub('-->', '', page)
    
    soup=BeautifulSoup(page, 'lxml')
    
    for elt in soup(text=re.compile('Contract Table')):
        for row in elt.parent.parent.find_all('td')[1]:
            salary=row.get_text().replace('$','').replace(',','')

    for elt in soup(text=re.compile('Experience')):
        experience=(elt.parent.parent).get_text().split()[1]

    for elt in soup(text=re.compile('Shoots')):
        shoots=elt.parent.parent.get_text().split()[-1]

    name=soup.find('h1').get_text()
    
    height=soup.find(itemprop="height").get_text()
    height=list(map(int,height.split("-")))
    height=str(height[1]+12*height[0])
    
    weight=soup.find(itemprop="weight").get_text()
    weight=weight.replace("lb","")
    
    per_game=[row.get_text().strip() for row in soup.find_all(id="per_game.2016")[0].find_all('td')]
    
    data=[name, salary, experience, height, weight, shoots]+per_game
    
    df.append(data)

In [5]:
df_new=pd.DataFrame(df)
print(df_new.head())

                 0         1  2   3    4      5   6    7    8   9   ...   \
0  D'Angelo Russell   5332800  1  77  195   Left  19  LAL  NBA  PG  ...    
1     DeMar DeRozan  26540100  7  79  221  Right  26  TOR  NBA  SG  ...    
2  Brandon Jennings   5000000  7  73  170   Left  26  TOT  NBA  PG  ...    
3        Jeff Green  15000000  8  81  235  Right  29  TOT  NBA  SF  ...    
4   Justin Anderson   1514160  1  78  228   Left  22  DAL  NBA  SF  ...    

     25   26   27   28   29   30   31   32   33    34  
0  .737  0.6  2.8  3.4  3.3  1.2  0.2  2.5  1.8  13.2  
1  .850  0.8  3.7  4.5  4.0  1.0  0.3  2.2  2.1  23.5  
2  .731  0.3  1.7  2.0  3.5  0.6  0.1  1.2  1.2   6.9  
3  .745  0.9  3.2  4.2  1.7  0.7  0.5  1.2  2.1  11.7  
4  .800  0.3  2.1  2.4  0.5  0.3  0.5  0.4  0.8   3.8  

[5 rows x 35 columns]


In [6]:
with open('2016_2017.pkl', 'wb') as picklefile:
    pickle.dump(df_new, picklefile)