In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
import pickle
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', 100)

# Scrape data

In [3]:
def getAndParseUrl(url):
    results = requests.get(url)
    time.sleep(1)
    soup = BeautifulSoup(results.text, 'html.parser')
    return soup

In [4]:
url1 = 'https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99&year_min=1980&year_max=2019&birth_country_is=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&c1stat=per&c1comp=gt&c1val=-100&c2stat=gs&c2comp=gt&c2val=-1&order_by=season&order_by=&offset='

In [5]:
def get_players(url): # function to grab every player and his stats from the page as a dictionary, and store as list
    
    # player dictionaries container
    players = []
    
    # loop through each page(for this website, each page is accessed by 'offset={some multiple of 100}')
    # last page is offset=16000
    for i in tqdm(range(0,16100,100)):
        url = url+str(i)
        soup = getAndParseUrl(url)
        
        # select each td tag under tr tag: each td tag in player_soup is a player's stat, all stored as a nested list
        # each element in this list is a list of the players
        player_soup = [t.select('td') for t in soup.select('#stats tbody tr') if 'class' not in t.attrs.keys()]
        
        # loop through each stat list in the player_soup
        for stat_list in tqdm(player_soup):
            
            # each player container (player with stats)
            each_player = {}
            
            #loop through p to create dictionary of each player
            for stat in stat_list:
                # set each stat equal to the tag text (i.e. pts_per_g = 15.4)
                each_player[stat.attrs['data-stat']] = stat.text
            
            # add each player dictionary to the players container
            players.append(each_player)
            
        # check to see if players list is actually getting filled
        print(len(players))    
        
#         # check current page to see if there is a next page
#         # if not, current page is last page, and break out of looping through offset pages
#         next_pg_checker = soup.select('p a[href]')
#         if 'Next page' not in [c.text for c in next_pg_checker]:
#             break
#         else: continue
        
    return players

In [6]:
players=get_players(url1)

  0%|          | 0/161 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/161 [00:02<06:27,  2.42s/it]t/s][A

100



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 2/161 [00:04<06:08,  2.32s/it]t/s][A

200



  2%|▏         | 3/161 [00:06<05:33,  2.11s/it]

200



  2%|▏         | 4/161 [00:07<05:11,  1.99s/it]

200



  3%|▎         | 5/161 [00:09<04:51,  1.87s/it]

200





KeyboardInterrupt: 

In [None]:
players

In [None]:
# check to see if all player dicts were created
len(players)

# THEY WERE

In [None]:
# save the players as a pickle
with open('player_season_totals.pickle', 'wb') as outfile:
    pickle.dump(players, outfile)

outfile.close()

# Setting up the dataframe

## NBA dataframe

In [None]:
# make a dataframe of all players and stats
df = pd.read_pickle('player_season_totals.pickle')

##### change dtypes for columns with numerical data from str to int or float


In [None]:
cols = list(players[0].keys())

In [None]:
df.age = df.age.astype(int)

In [None]:
df.per = df.per.astype(float)

In [None]:
for i in range(6,len(cols)):
    df[cols[i]] = pd.to_numeric(df[cols[i]])

In [None]:
df = df.sort_values(by=['player','season']).reset_index().drop('index',axis=1)

In [None]:
df.head()

In [None]:
# saving nba df as json file
df.to_json('nba_df.json')