In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
import pickle
import seaborn as sns

In [3]:
pd.set_option('display.max_columns', 100)

# Scrape data

In [4]:
def getAndParseUrl(url):
    results = requests.get(url)
    time.sleep(1)
    soup = BeautifulSoup(results.text, 'html.parser')
    return soup

In [37]:
# this url is for all players between 2014 and 2019 with a PER >= 0
url1 = 'https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=per_poss&per_minute_base=36&per_poss_base=100&lg_id=NBA&is_playoffs=N&year_min=2015&year_max=2019&franch_id=&season_start=1&season_end=-1&age_min=0&age_max=99&shoot_hand=&height_min=0&height_max=99&birth_country_is=Y&birth_country=&birth_state=&college_id=&draft_year=&is_active=&debut_yr_nba_start=&debut_yr_nba_end=&is_hof=&is_as=&as_comp=gt&as_val=0&award=&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&qual=&c1stat=per&c1comp=gt&c1val=&order_by=season&order_by_asc=&offset='

In [6]:
# this url is for all players between 2014 and 2019 with a PER <=0
url2 = 'https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&per_minute_base=36&type=per_poss&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99&year_min=2015&year_max=2019&birth_country_is=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&c1stat=per&c1comp=lt&order_by=season&order_by_asc=&offset='

In [45]:
def get_players(website): # function to grab every player and his stats from the page as a dictionary, and store as list
    
    # player dictionaries container
    players = []
    
    # loop through each page(for this website, each page is accessed by 'offset={some multiple of 100}')
    # last page is offset=2500
    for i in tqdm(range(0,2500,100)):
        url = website+str(i)
        soup = getAndParseUrl(url)
        
        # select each td tag under tr tag: each td tag in player_soup is a player's stat, all stored as a list
        player_soup = soup.select('#stats tr td')
        
        # create slicing points to group the 33 elements(html tags/columns) into single list(player)
        j=0
        k=32
        
        # there are a 100 players per page, 31 stats per player == 3100 elements in player_soup (except last page)
        # len(player_soup) gives the number of stats on each page
        # divide that by 31 to get number of players per page
        for i in range(int(len(player_soup)/32)):
            
            # each player container (player with stats)
            each_player = {}
            
            # p is a single player with stats sliced from player_soup using slicing points
            p = player_soup[j:k]
            
            #loop through p to create dictionary of each player
            for stat in p:
                # set each stat equal to the tag text (i.e. pts_per_g = 15.4)
                each_player[stat.attrs['data-stat']] = stat.text
            
            # add each player dictionary to the players container
            players.append(each_player)
            
            # move slicer to next player
            j+=32
            k+=32
        
        #checking to see if scraping is working at each step
        print(len(players))
        
        # check to see if current page is the last page containing stats
        # if so, break out of loop
        # else go on to next page
        next_pg_checker = soup.select('p a[href]')
        if 'Next page' not in [c.text for c in next_pg_checker]:
            break
        else: continue
        
    return players

In [46]:
players=get_players(url1)



  0%|          | 0/25 [00:00<?, ?it/s][A[A

  4%|▍         | 1/25 [00:02<00:56,  2.35s/it][A[A

100




  8%|▊         | 2/25 [00:04<00:53,  2.31s/it][A[A

200




 12%|█▏        | 3/25 [00:06<00:49,  2.26s/it][A[A

300




 16%|█▌        | 4/25 [00:09<00:47,  2.28s/it][A[A

400




 20%|██        | 5/25 [00:11<00:44,  2.24s/it][A[A

500




 24%|██▍       | 6/25 [00:13<00:42,  2.24s/it][A[A

600




 28%|██▊       | 7/25 [00:16<00:43,  2.42s/it][A[A

700




 32%|███▏      | 8/25 [00:18<00:40,  2.37s/it][A[A

800




 36%|███▌      | 9/25 [00:20<00:37,  2.31s/it][A[A

900




 40%|████      | 10/25 [00:23<00:34,  2.32s/it][A[A

1000




 44%|████▍     | 11/25 [00:25<00:32,  2.29s/it][A[A

1100




 48%|████▊     | 12/25 [00:27<00:30,  2.31s/it][A[A

1200




 52%|█████▏    | 13/25 [00:29<00:27,  2.29s/it][A[A

1300




 56%|█████▌    | 14/25 [00:32<00:25,  2.29s/it][A[A

1400




 60%|██████    | 15/25 [00:34<00:22,  2.28s/it][A[A

1500




 64%|██████▍   | 16/25 [00:36<00:20,  2.24s/it][A[A

1600




 68%|██████▊   | 17/25 [00:38<00:18,  2.26s/it][A[A

1700




 72%|███████▏  | 18/25 [00:41<00:15,  2.23s/it][A[A

1800




 76%|███████▌  | 19/25 [00:43<00:13,  2.24s/it][A[A

1900




 80%|████████  | 20/25 [00:45<00:11,  2.23s/it][A[A

2000




 84%|████████▍ | 21/25 [00:48<00:09,  2.35s/it][A[A

2100




 88%|████████▊ | 22/25 [00:50<00:06,  2.33s/it][A[A

2200




 92%|█████████▏| 23/25 [00:53<00:04,  2.44s/it][A[A

2300




 96%|█████████▌| 24/25 [00:55<00:02,  2.45s/it][A[A

2400
2473


In [48]:
players.extend(get_players(url2))




  0%|          | 0/25 [00:00<?, ?it/s][A[A[A

57


In [49]:
# save the players as a pickle
with open('player_stats_per_100p.pickle', 'wb') as outfile:
    pickle.dump(players, outfile)

outfile.close()

# Setting up the dataframe

## NBA dataframe

In [51]:
# make a dataframe of all players and stats
df = pd.DataFrame(players, columns = players[0].keys())

##### change dtypes for columns with numerical data from str to int or float


In [52]:
cols = list(players[0].keys())

In [53]:
df.age = df.age.astype(int)

In [54]:
df.per = df.per.astype(float)

In [55]:
for i in range(6,len(cols)):
    df[cols[i]] = pd.to_numeric(df[cols[i]])

In [56]:
df = df.sort_values(by=['player','season']).reset_index().drop('index',axis=1)

In [57]:
df.head()

Unnamed: 0,player,age,team_id,lg_id,per,season,g,gs,mp,fg_per_poss,fga_per_poss,fg2_per_poss,fg2a_per_poss,fg3_per_poss,fg3a_per_poss,ft_per_poss,fta_per_poss,orb_per_poss,drb_per_poss,trb_per_poss,ast_per_poss,stl_per_poss,blk_per_poss,tov_per_poss,pf_per_poss,pts_per_poss,fg_pct,fg2_pct,fg3_pct,efg_pct,ft_pct,ts_pct
0,A.J. Hammons,24,DAL,NBA,8.4,2016-17,22,0,163,5.4,13.4,3.8,10.2,1.6,3.2,2.9,6.4,2.6,8.9,11.5,1.3,0.3,4.2,3.2,6.7,15.3,0.405,0.375,0.5,0.464,0.45,0.472
1,A.J. Price,28,TOT,NBA,12.0,2014-15,26,0,324,8.1,21.7,5.7,12.7,2.4,9.0,2.5,3.8,1.0,4.1,5.1,7.3,1.1,0.0,2.2,2.4,21.1,0.372,0.45,0.263,0.427,0.667,0.451
2,Aaron Brooks,30,CHI,NBA,14.4,2014-15,82,21,1885,9.4,22.4,6.1,13.8,3.3,8.6,4.0,4.8,0.9,3.7,4.6,7.2,1.5,0.4,4.3,5.2,26.2,0.421,0.442,0.387,0.495,0.833,0.534
3,Aaron Brooks,31,CHI,NBA,11.8,2015-16,69,0,1108,8.5,21.2,5.5,12.9,3.0,8.4,2.2,2.9,1.0,3.6,4.6,8.1,1.4,0.5,3.7,6.0,22.2,0.401,0.43,0.357,0.471,0.766,0.494
4,Aaron Brooks,32,IND,NBA,9.5,2016-17,65,0,894,6.8,16.8,4.1,9.6,2.7,7.2,1.8,2.2,1.0,2.9,3.9,7.0,1.4,0.5,3.7,5.2,18.0,0.403,0.424,0.375,0.483,0.8,0.507


In [58]:
# saving nba df as json file
df.to_json('nba_100p_df.json')

## Rookies dataframe

In [59]:
rookies_soup = getAndParseUrl('https://www.nba.com/article/2019/06/21/2019-nba-draft-results-picks-1-60')

rookie_names = [r.text for r in rookies_soup.select('div p a[href]') if r.text not in ['officially traded', 'first being dealt', 'then to']]

rookie_names = [r for r in rookie_names if 'traded' not in r]

rookie_names = [r for r in rookie_names if 'officially' not in r][2:-3]

In [60]:
rookie_names

['Zion Williamson',
 'Ja Morant',
 'RJ Barrett',
 "De'Andre Hunter",
 'Darius Garland',
 'Jarrett Culver',
 'Coby White',
 'Jaxson Hayes',
 'Rui Hachimura',
 'Cam Reddish',
 'Cameron Johnson',
 'PJ Washington',
 'Tyler Herro',
 'Romeo Langford',
 'Sekou Doumbouya',
 'Chuma Okeke',
 'Nickeil Alexander-Walker',
 'Goga Bitadze',
 'Luka Samanic',
 'Matisse Thybulle',
 'Brandon Clarke',
 'Grant Williams',
 'Darius Bazley',
 'Ty Jerome',
 'Nassir Little',
 'Dylan Windler',
 'Mfiondu Kabengele',
 'Jordan Poole',
 'Keldon Johnson',
 'Kevin Porter Jr.',
 'Nicolas Claxton',
 'KZ Okpala',
 'Carsen Edwards',
 'Bruno Fernando',
 'Marcos Louzada Silva',
 'Cody Martin',
 'Deividas Sirvydis',
 'Daniel Gafford',
 'Alen Smailagic',
 'Justin James',
 'Eric Paschall',
 'Admiral Schofield',
 'Jaylen Nowell',
 'Bol Bol',
 'Isaiah Roby',
 'Talen Horton-Tucker',
 'Ignas Brazdeikis',
 'Terance Mann',
 'Quinndary Weatherspoon',
 'Jarrell Brantley',
 'Tremont Waters',
 'Jalen McDaniels',
 'Justin Wright-Foreman'

rookie schools

In [61]:
rook_school_soup = getAndParseUrl('https://en.wikipedia.org/wiki/2019_NBA_draft')

In [62]:
# clean the school names 
rookie_schools = list(set([r.text.strip('\n').replace(' (Fr.)','').replace(' (So.)','').replace(' (Jr.)','').replace(' (Sr.)','').lower().replace(' ','-') for r in rook_school_soup.select('tr td:last-child')][11:71]))


In [63]:
rookie_schools

['maryland',
 'ucla',
 'mississippi-state',
 'kentucky',
 'stanford',
 'texas',
 'texas-tech',
 'iowa-state',
 'santa-cruz-warriors-(g-league)',
 'michigan',
 'washington',
 'rytas-vilnius-(lithuania)',
 'purdue',
 'mega-bemax-(serbia)',
 'wyoming',
 'nevada',
 'north-carolina',
 'nebraska',
 'usc',
 'partizan-belgrade-(serbia)',
 'indiana',
 'hofstra',
 'villanova',
 'oregon',
 'georgia',
 'yale',
 'virginia-tech',
 'murray-state',
 'sesi/franca-(brazil)',
 'belmont',
 'miami',
 'charleston',
 'florida-state',
 'virginia',
 'gonzaga',
 'tennessee',
 'duke',
 'olimpija-ljubljana-(slovenia)',
 'lsu',
 'limoges-csp-(france)',
 'san-diego-state',
 'princeton-hs-(sharonville,-ohio;-hs-sr.)',
 'arkansas',
 'vanderbilt',
 'auburn']

scrape for college stats on upcoming rookies

In [70]:
def get_college_players(): # function to grab every player and his stats from the page as a dictionary, and store as list
    
    # player dictionaries container
    players = []
    
    # loop through each school page to grab each player at each school
    for school in tqdm(rookie_schools):
        try:
            url = f'https://www.sports-reference.com/cbb/schools/{school}/2019.html'
            soup = getAndParseUrl(url)

            # select each td tag under tr tag in the per_game table: each td tag in player_soup is a player's stat, all stored as a list
            player_soup = soup.select('#per_poss tr td')

            # create slicing points to group the 26 elements(html tags) into single list(player)
            j=0
            k=26

            # len(player_soup) gives the number of stats on each page
            # divide that by 27 to get number of players per page
            for i in range(int(len(player_soup)/26)):

                # each player container (player with stats)
                each_player = {}

                # p is a single player with stats sliced from player_soup
                p = player_soup[j:k]

                #loop through p to create dictionary of each player
                for stat in p:
                    # set each stat equal to the tag text (i.e. pts_per_g = 15.4)
                    each_player[stat.attrs['data-stat']] = stat.text

                # add each player dictionary to the players container
                players.append(each_player)

                # move slicer to next player
                j+=26
                k+=26
        except:
            continue

    return players

In [71]:
rookies = get_college_players()




  0%|          | 0/45 [00:00<?, ?it/s][A[A[A


  2%|▏         | 1/45 [00:01<01:20,  1.84s/it][A[A[A


  4%|▍         | 2/45 [00:03<01:19,  1.85s/it][A[A[A


  7%|▋         | 3/45 [00:05<01:15,  1.80s/it][A[A[A


  9%|▉         | 4/45 [00:06<01:10,  1.72s/it][A[A[A


 11%|█         | 5/45 [00:08<01:10,  1.76s/it][A[A[A


 13%|█▎        | 6/45 [00:10<01:07,  1.72s/it][A[A[A


 16%|█▌        | 7/45 [00:11<01:03,  1.67s/it][A[A[A


 18%|█▊        | 8/45 [00:13<01:00,  1.64s/it][A[A[A


 20%|██        | 9/45 [00:14<00:55,  1.54s/it][A[A[A


 22%|██▏       | 10/45 [00:16<00:58,  1.67s/it][A[A[A


 24%|██▍       | 11/45 [00:18<00:55,  1.64s/it][A[A[A


 27%|██▋       | 12/45 [00:19<00:51,  1.55s/it][A[A[A


 29%|██▉       | 13/45 [00:21<00:49,  1.54s/it][A[A[A


 31%|███       | 14/45 [00:22<00:45,  1.46s/it][A[A[A


 33%|███▎      | 15/45 [00:24<00:44,  1.49s/it][A[A[A


 36%|███▌      | 16/45 [00:25<00:43,  1.50s/it][A[A[A


 38%|███▊   

In [76]:
rookies

[]

In [72]:
# set up the rookies dataframe to have the same order of columns as the nba players dataframe
# college stats are missing some columns that nba dataframe has
rookie_df = pd.DataFrame(rookies, columns=[x for x in list(df.columns) if x not in ['efg_pct','ts_pct','age','team_id','lg_id','per','season']])

In [73]:
# change numerical data from strings to ints/floats
for col in rookie_df.columns:
    try:
        rookie_df[col] = pd.to_numeric(rookie_df[col])
    except:
        pass # this is to skip over strings that are not numerical

In [74]:
# filter out all college players that weren't drafted into nba in 2019
#!!!!! ISSUE: MISSING ROOKIE DATA FOR THE ROOKIES NOT DRAFTED FROM COLLEGE (i.e. overseas, G-league, etc.) !!!!!
rookie_df = rookie_df.loc[rookie_df['player'].isin(rookie_names)].sort_values(by='player').reset_index().drop('index',axis=1)

In [75]:
rookie_df.head(1)

Unnamed: 0,player,g,gs,mp,fg_per_poss,fga_per_poss,fg2_per_poss,fg2a_per_poss,fg3_per_poss,fg3a_per_poss,ft_per_poss,fta_per_poss,orb_per_poss,drb_per_poss,trb_per_poss,ast_per_poss,stl_per_poss,blk_per_poss,tov_per_poss,pf_per_poss,pts_per_poss,fg_pct,fg2_pct,fg3_pct,ft_pct


In [430]:
# saving rookie df as json file
rookie_df.to_json('rookie_df.json')

##### Adding missing columns to rookie_df to match the nba df
some columns are calculable

missing: ['efg_pct','ts_pct','age','team_id','lg_id','per','season']

In [432]:
# calculating effective field goal percentage for college players (efg_pct)
rookie_df['efg_pct'] = (rookie_df.fg2_per_g + (1.5*rookie_df.fg3_per_g))/rookie_df.fga_per_g

In [439]:
# calculating true shooting percentage (ts_pct)
rookie_df['ts_pct'] = rookie_df.pts_per_g/(2*(rookie_df.fga_per_g + (.44*rookie_df.fta_per_g)))

In [440]:
# # calculating player efficiency rating (per) in accordance with basketball-reference.com's formula
# # this is the url for calculating per: 'https://www.basketball-reference.com/about/per.html'
# factor = (2/3) - (.5*())