In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib2
import sys
import datetime as dt
# Wall-clock start of the run; the last cell subtracts this from the current
# time to report how long the whole notebook took.
now = dt.datetime.now()
print(now)

Get the name, position, height, weight, college, and years of service of every player, along with a link to their player page. HOF players are separated from regular players by the * marker that Basketball Reference appends to their names.

In [None]:
# Scrape the alphabetical player index: one page per letter 'a'..'z'.
# For every letter we keep the index table (one row per player) and attach the
# relative link to each player's own page, then derive "Years of Service".
# The result is a list of per-letter DataFrames.
all_players = []
for letter in range(97,123):  # 97..122 are the character codes of 'a'..'z'
    url = "https://www.basketball-reference.com/players/"+chr(letter)
    try:
        # Download the page once and reuse the HTML for both parsers below,
        # instead of requesting the same URL twice.
        response = urllib2.urlopen(url)
        try:
            page_html = response.read()
        finally:
            response.close()
        players = pd.read_html(page_html)[0]
    except Exception:
        # Best-effort: skip a letter whose page cannot be fetched or contains
        # no table. `Exception` (not a bare except) so that interrupting the
        # kernel still stops the loop.
        continue

    player_links = BeautifulSoup(page_html,'html.parser')
    links = []
    for link in player_links.find_all(attrs={"data-stat":'player'}):
        anchor = link.a
        # Cells without a hyperlink (no <a> tag or no href) carry no player
        # page to follow, so they are skipped.
        if anchor is None or not anchor.has_attr('href'):
            continue
        links.append([link.text, anchor['href']])
    player_links = pd.DataFrame(links, columns=['Player','URL'])
    # Left join keeps every row of the index table even if no link was found
    # for it (its URL is then NaN).
    players = players.merge(player_links,how='left',on="Player")

    players["Years of Service"] = players["To"] - players["From"]
    all_players.append(players)
    # "\r" rewrites the same output line so progress shows as a single letter.
    sys.stdout.write("\r" + chr(letter))
    sys.stdout.flush()

Concatenate the per-letter lists of players into a single dataframe

In [None]:
# Stack the per-letter tables collected above into one DataFrame.
# Note: this rebinds `all_players` from a list of DataFrames to a DataFrame.
all_players = pd.concat(objs=all_players)

Using each individual player's player page, load up a dataframe containing their career stats

In [None]:
# Build `player_stats`: the "Career" row of the first table on each player's
# page, plus the number of distinct teams found in the page's "uni_holder"
# block. Players whose page cannot be loaded or parsed are skipped (with a
# note on stderr) rather than silently inheriting the previous player's data.
CAREER_STAT_COLUMNS = ['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
                       '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
                       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
MAX_FETCH_ATTEMPTS = 3

stat_frames = []  # one small frame per player; concatenated once after the loop
for _,player in all_players.iterrows():
    sys.stdout.write("\r" + player['Player'])
    sys.stdout.flush()

    # The earlier left merge leaves URL as NaN when no link was found.
    if pd.isnull(player['URL']):
        continue
    # Strip the link's leading slash so the joined URL has no "//" in its path.
    url = "https://www.basketball-reference.com/" + player['URL'].lstrip('/')

    # Fetch and parse, retrying a few times. Both results are reset on every
    # failure so a failed player can never reuse data from an earlier one.
    stats = None
    other_stats = None
    for attempt in range(MAX_FETCH_ATTEMPTS):
        try:
            response = urllib2.urlopen(url)
            try:
                page_html = response.read()
            finally:
                response.close()
            # One download feeds both the table parser and BeautifulSoup.
            stats = pd.read_html(page_html)[0]
            other_stats = BeautifulSoup(page_html,'html.parser')
            break
        except Exception:
            stats = None
            other_stats = None
    if stats is None or other_stats is None:
        sys.stderr.write("\nSkipping %s: page could not be loaded\n" % url)
        continue
    if 'Season' not in stats.columns:
        sys.stderr.write("\nSkipping %s: no 'Season' column in first table\n" % url)
        continue

    # Keep only the career summary row(s) of the table.
    career = stats[stats['Season'] == 'Career']
    single_stats = {'Player': player['Player']}
    for stat in CAREER_STAT_COLUMNS:
        # Not every table has every column; copy the ones that exist.
        if stat in career.columns:
            single_stats[stat] = career[stat]

    #Distinct amount of teams
    teams = set()
    uniforms = other_stats.find("div", {"class": "uni_holder"})
    if uniforms is not None:
        for team in uniforms.find_all('a'):
            tip = team.get('data-tip')
            if tip:
                # Text before the first comma of the tooltip identifies the team
                # (presumably the team name -- confirm against the site markup).
                teams.add(tip.split(',')[0])
    single_stats['Teams'] = len(teams)

    # Passing the career index keeps the row labels of the Series values and
    # still works if none of the stat columns were present.
    stat_frames.append(pd.DataFrame(single_stats, index=career.index))

# Concatenate once instead of growing a DataFrame inside the loop.
if stat_frames:
    player_stats = pd.concat(stat_frames)
else:
    player_stats = pd.DataFrame()

Reorder the columns, since converting a dictionary to a dataframe leads to an arbitrary column order (dictionaries are not ordered)

In [None]:
# Fixed presentation order: player name, the career stat columns, then the
# distinct-team count.
ordered_columns = [
    'Player',
    'G', 'GS', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%',
    'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'Teams',
]
player_stats = player_stats[ordered_columns]

Merge the dataframe containing a player's vitals to their career stats with a left join on their vitals

In [None]:
# Left-join the career stats onto the vitals so every player keeps a row;
# players without scraped stats end up with 0 in the stat columns.
# NOTE(review): the join key is the name only, so players sharing a name are
# matched to each other's stats -- confirm whether that matters downstream.
all_players = all_players.merge(player_stats, on="Player", how='left').fillna(0)

Separate the dataframe into regular players and HOF players. This makes things easier later on, as the MVP and AllStar dataframes don't make this distinction

In [None]:
# A '*' in the player name marks a Hall of Fame player. Match it as a literal
# character (no regex, so no escaping needed) and compute the mask only once.
is_hof = all_players['Player'].str.contains('*', regex=False)
# .copy() makes both results independent frames, so later column assignments
# on them do not act on a slice of `all_players`.
reg_players = all_players[~is_hof].copy()
hof_players = all_players[is_hof].copy()

Delete the * from HOF players' names

In [None]:
# Remove the '*' marker from the names. `assign` returns a new frame rather
# than writing a column into `hof_players` in place, which avoids assigning
# onto a filtered slice of `all_players`.
hof_players = hof_players.assign(Player=hof_players['Player'].str.replace("*",""))

Get the list of players and the number of AllStar and MVP selections they've achieved, and merge it with the existing dataframes

In [None]:
def _add_award_counts(players, award_tables):
    """Left-join each award-count table onto ``players`` by player name.

    Players absent from an award table get 0 for that award, because every
    missing value in the joined frame is filled with 0.
    """
    for table in award_tables:
        players = players.merge(table, how='left', on="Player")
    return players.fillna(0)


# All-Star selections per player: first table on the page, columns Player/Tot.
allstars = pd.read_html('https://www.basketball-reference.com/awards/all_star_by_player.html')[0]
allstars = allstars[['Player','Tot']]
allstars.columns = ['Player','AllStars']

# MVP awards per player: third table on the page, columns Player/Count.
# NOTE(review): the table position [2] depends on the page layout -- confirm.
mvps = pd.read_html('https://www.basketball-reference.com/awards/mvp.html')[2]
mvps = mvps[['Player','Count']]
mvps.columns = ['Player','MVPs']

reg_players = _add_award_counts(reg_players, [allstars, mvps])
hof_players = _add_award_counts(hof_players, [allstars, mvps])

Drop all duplicates

In [None]:
# A player is identified by name plus birth date; keep the first row of each
# such pair in all three frames (modified in place).
for frame in (all_players, reg_players, hof_players):
    frame.drop_duplicates(subset=["Player","Birth Date"], inplace=True)

Save all current data since a single run takes about 2.5 hours

In [None]:
# Persist every frame twice: as a pickle (for reloading into pandas) and as a
# CSV. All pickles are written first, then all CSVs.
pickle_targets = [
    (all_players, "all_players.pkl"),
    (reg_players, "reg_players.pkl"),
    (hof_players, "hof_players.pkl"),
]
for frame, path in pickle_targets:
    frame.to_pickle(path)

csv_targets = [
    (all_players, "all_players.csv"),
    (reg_players, "reg_players.csv"),
    (hof_players, "hof_players.csv"),
]
for frame, path in csv_targets:
    frame.to_csv(path)

In [None]:
# Total run time: current time minus the timestamp taken in the first cell.
elapsed = dt.datetime.now() - now
print(elapsed)