In [3]:
import pandas as pd
import numpy as np
import requests
import urllib
from bs4 import BeautifulSoup

In [4]:
base = ('http://www.basketball-reference.com/leagues/NBA_2017_totals.html')

#15492 players
#base = ('http://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36&per_poss_base=100&lg_id=NBA&is_playoffs=N&year_min=1980&year_max=&franch_id=&season_start=1&season_end=-1&age_min=0&age_max=99&shoot_hand=&height_min=0&height_max=99&birth_country_is=Y&birth_country=&birth_state=&college_id=&draft_year=&is_active=&debut_yr_aba_start=&debut_yr_aba_end=&debut_yr_nba_start=&debut_yr_nba_end=&is_hof=&is_as=&as_comp=gt&as_val=&award=&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&qual=&c1stat=mp_per_g&c1comp=gt&c1val=&c2stat=fg_per_mp&c2comp=gt&c2val=&c3stat=fg_per_poss&c3comp=gt&c3val=&c4stat=fg_pct&c4comp=gt&c4val=&c5stat=&c5comp=&c6mult=0&c6stat=&order_by=g&order_by_asc=&offset=')

In [5]:
def get_soup(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def next_page(base, player_count):
    url = base+str(player_count)
    return url

def get_rows(soup):
    body = soup.findAll('tbody')
    rows = body[0].findAll('tr')
    return rows

def strip_stats(rows):
    stats = []
    for x in range(0, len(rows)):
        player_stats = rows[x].findAll('td')
        stats.append([z.text for z in player_stats])
    stats = [x for x in stats if x!=[]]
    return stats

## this will be used to get the stat names of the stats being collected for each player
## We will set these as our column names in our created dataframe
def get_stat_names(soup):
    body = soup.findAll('thead')
    row = body[0].findAll('tr')    
    stat_names = []
    player_stats = row[0].findAll('th')
    stat_names.append([z.text for z in player_stats])
    return stat_names

In [6]:
def tot_df_creation(base_url):
    ## we will create our initial soup to use to collect the names of our statistics which will turn
    ## into the columns of our dataframe
    soup = get_soup(base_url)
    stat_names = get_stat_names(soup)
    
    ## this allows us to instantiate our total dataframe with the correct column names which we will
    ## append the individual season dataframes to
    tot_df = pd.DataFrame(columns = [z for z in stat_names[0][1:]])
    
    ## this column is not a part of the stats on the webpage but we are collecting it, so we need 
    ## to instantiate the column as part of total_df
    tot_df['Season'] = pd.Series()
    return tot_df

In [7]:
def get_team_link(rows):
    link_list = []
    ## this will take me through every row of player data and attach a team link as a feature for that player
    for x in range(0, len(rows)):
        try:
            ## this gives me their row of stats on the website so I can see what team they were on
            player_stats = rows[x].findAll('td')
            ## the team name contains a hyperlink to that team's stats for the year. it is in index
            ## position 3, so we will access that location and collect the embedded link
            link = 'http://www.basketball-reference.com'+player_stats[3].find('a').get('href')
            ## we will append all of our links to a list to then add as a series to our dataframe
            link_list.append(link)
        
        ## If a player was on multiple teams in one season, he has a row of stats for each team he was on,
        ## as well as a "total" row with all the team rows added together. This total row does NOT have
        ## a hyperlink to a team page and raises an attribute error. We will pass the link variable as
        ## None in this situation
        except AttributeError:
            link = None
            link_list.append(link)
        
        ## every 20 or so rows there is a divider row with the stat names which raises an index error,
        ## this allows us to pass over those rows
        except IndexError:
            pass
        
    return link_list    
    

In [8]:
def get_player_stats(base_url):
    ## from start of 3-point era to present day
    years = [1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
             1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
             2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
             2013, 2014, 2015, 2016, 2017]
    
    #instantiate an empty dataframe with the correct columns
    tot_df = tot_df_creation(base_url)

    ## cycle through the seasons, creating an individual dataframe for each season, then append
    ## that dataframe to our total_df
    for x in years[:]:
        #cycle through url's 
        base_url = base_url[:48]+str(x)+base_url[52:]
        soup = get_soup(base_url)
        rows = get_rows(soup)
        stats = strip_stats(rows)
        stat_names = get_stat_names(soup)
        link_series = pd.Series(get_team_link(rows))
        
        ## Make a dataframe from our collected stats
        df = pd.DataFrame(stats, columns = [i for i in stat_names[0][1:]])
        ## Add a season column for the season we are currently scraping
        df['season'] = x
        ## Add a column of the link to that players team page that we created
        df['team_link'] = link_series.values
        
        ## Append the individual dataframe to our total_df
        tot_df = tot_df.append(df, ignore_index = True)
        
        ## Something to keep track of where we are in the process
        print "\n==========JUST FINISHED %s SEASON==========\n" % x
    
    ## appending dataframes messes with column order - this resorts the columns to original order
    tot_df = tot_df[df.columns]
    
    ## we can filter out rows where there is no team link since those are aggregate rows of player
    ## data when they are on multiple teams. We can treat them as individuals and then cumulate their WAR
    ## on a per season basis
    tot_df = tot_df[tot_df['team_link'].isnull() == False]
    
    ## this brings the season column to the front of the df - just for viewing purposes
    tot_df = tot_df.set_index('season').reset_index()
    
    return tot_df

In [9]:
## assign the output of the function to a variable so that it only has to be called once
df = get_player_stats(base)















































































In [10]:
## check the number of rows in our dataframe
print len(df)
## examine the head, make sure everything looks right
df.head()

17291


Unnamed: 0,season,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,team_link
0,1980.0,Kareem Abdul-Jabbar*,C,32,LAL,82,,3143,835,1383,...,190,696,886,371,81,280,297,216,2034,http://www.basketball-reference.com/teams/LAL/...
1,1980.0,Tom Abernethy,PF,25,GSW,67,,1222,153,318,...,62,129,191,87,35,12,39,118,362,http://www.basketball-reference.com/teams/GSW/...
2,1980.0,Alvan Adams,C,25,PHO,75,,2168,465,875,...,158,451,609,322,108,55,218,237,1118,http://www.basketball-reference.com/teams/PHO/...
3,1980.0,Tiny Archibald*,PG,31,BOS,80,80.0,2864,383,794,...,59,138,197,671,106,10,242,218,1131,http://www.basketball-reference.com/teams/BOS/...
4,1980.0,Dennis Awtrey,C,31,CHI,26,,560,27,60,...,29,86,115,40,12,15,27,66,86,http://www.basketball-reference.com/teams/CHI/...


In [11]:
## check the number of unique team links in the dataframe. This list of unique values will be what we scrape for our
## team data
len(df.team_link.unique())

1044

In [12]:
test_list = [x.encode() for x in df.team_link.unique() if '1992' in x]

In [13]:
def team_opp_stats(rows):
    

IndentationError: expected an indented block (<ipython-input-13-a07b63759d2f>, line 2)

In [50]:
def get_team_stats(df):
    team_links = [x.encode() for x in df.team_link.unique()]
    
    for link in team_links[:1]:
        print "link is:       \n"+link+"\n"
        soup = get_soup(link)
        print type(soup)
#         table = soup.findAll('div', id='div_team_and_opponent')
#         print table
#         table=soup.findAll("table")
#         print table
        table = soup.findAll('table',{'id': 'team_and_opponent'})
        print table
        rows = get_rows(table[1])
        
#         stats = team_opp_stats(rows)
        
        print rows


In [51]:
get_team_stats(df)

link is:       
http://www.basketball-reference.com/teams/LAL/1980.html

<class 'bs4.BeautifulSoup'>
[]


IndexError: list index out of range

In [34]:
##     INITIAL ATTEMPT AT COLLECTING TEAM STATISTICS AND ATTACHING TO A PLAYERS ROW
##     THIS STRATEGY IS BEING REVISED

def attach_team_stats(df):
    for index, row in df.iterrows():
        base_url = row['team_link']
        soup = get_soup(base_url)
        rows = get_rows(soup)
        stats = strip_stats(rows)
        stat_names = get_stat_names(soup)
        
        for stat, value in zip(stats, stat_names):
            print "stat is %r and value is %r" % (stat, value)
            row[stat] = value
    return df
        

In [35]:
attach_team_stats(df)

stat is [u'Kareem Abdul-Jabbar', u'C', u'7-2', u'225', u'April 16, 1947', u'us', u'10', u'University of California, Los Angeles'] and value is [u'No.', u'Player', u'Pos', u'Ht', u'Wt', u'Birth Date', u'\xa0', u'Exp', u'College']


ValueError: [u'Kareem Abdul-Jabbar' u'C' u'7-2' u'225' u'April 16, 1947' u'us' u'10'
 u'University of California, Los Angeles'] not contained in the index