# Data Collection

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'https://www.basketball-reference.com/leagues/NBA_2017_per_game.html'
res = requests.get(url)

In [3]:
res.status_code

200

In [299]:
soup = BeautifulSoup(res.content, 'lxml')

In [91]:
header = soup.find('thead')

In [184]:
col_heads = []
for head in header.find_all('th')[1::]:
    col_heads.append(head.attrs['data-stat'])


In [272]:
players = []
table = soup.find('tbody')
for player in table.find_all('tr'):
    player_dict = {}
    for i, stat in enumerate(col_heads):
        try:
            player_dict[stat] = player.find_all('td')[i].text
        except:
            pass
    players.append(player_dict)
    
per_game = pd.DataFrame(players)
per_game.dropna(inplace = True)

per_game.head()

Unnamed: 0,player,pos,age,team_id,g,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,...,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g
0,Álex Abrines,SG,23,OKC,68,6,15.5,2.0,5.0,0.393,...,0.898,0.3,1.0,1.3,0.6,0.5,0.1,0.5,1.7,6.0
1,Quincy Acy,PF,26,TOT,38,1,14.7,1.8,4.5,0.412,...,0.75,0.5,2.5,3.0,0.5,0.4,0.4,0.6,1.8,5.8
2,Quincy Acy,PF,26,DAL,6,0,8.0,0.8,2.8,0.294,...,0.667,0.3,1.0,1.3,0.0,0.0,0.0,0.3,1.5,2.2
3,Quincy Acy,PF,26,BRK,32,1,15.9,2.0,4.8,0.425,...,0.754,0.6,2.8,3.3,0.6,0.4,0.5,0.6,1.8,6.5
4,Steven Adams,C,23,OKC,80,80,29.9,4.7,8.2,0.571,...,0.611,3.5,4.2,7.7,1.1,1.1,1.0,1.8,2.4,11.3


In [273]:
per_game.shape

(595, 29)

In [276]:
len(per_game.dropna()['player'].unique())

486

In [300]:
advanced_url = f'https://www.basketball-reference.com/leagues/NBA_2017_advanced.html'
res_a = requests.get(advanced_url)
soup_a = BeautifulSoup(res_a.content, 'lxml')

In [302]:
header_a = soup_a.find('thead')

col_heads_a = []
for head in header_a.find_all('th')[1::]:
    col_heads_a.append(head.attrs['data-stat'])

In [312]:
players_a = []
table_a = soup_a.find('tbody')
for player in table_a.find_all('tr'):
    player_dict_a = {}
    for i, stat in enumerate(col_heads_a):
        try:
            player_dict_a[stat] = player.find_all('td')[i].text
        except:
            pass
    players_a.append(player_dict_a)
    
advanced = pd.DataFrame(players)
advanced.dropna(inplace = True)
advanced.drop(columns = ['pos', 'age', 'g', 'gs'], inplace = True)

advanced.head()

Unnamed: 0,player,team_id,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg3_per_g,fg3a_per_g,fg3_pct,fg2_per_g,...,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g
0,Álex Abrines,OKC,15.5,2.0,5.0,0.393,1.4,3.6,0.381,0.6,...,0.898,0.3,1.0,1.3,0.6,0.5,0.1,0.5,1.7,6.0
1,Quincy Acy,TOT,14.7,1.8,4.5,0.412,1.0,2.4,0.411,0.9,...,0.75,0.5,2.5,3.0,0.5,0.4,0.4,0.6,1.8,5.8
2,Quincy Acy,DAL,8.0,0.8,2.8,0.294,0.2,1.2,0.143,0.7,...,0.667,0.3,1.0,1.3,0.0,0.0,0.0,0.3,1.5,2.2
3,Quincy Acy,BRK,15.9,2.0,4.8,0.425,1.1,2.6,0.434,0.9,...,0.754,0.6,2.8,3.3,0.6,0.4,0.5,0.6,1.8,6.5
4,Steven Adams,OKC,29.9,4.7,8.2,0.571,0.0,0.0,0.0,4.7,...,0.611,3.5,4.2,7.7,1.1,1.1,1.0,1.8,2.4,11.3


In [313]:
full_stats = pd.merge(per_game, advanced, on = ['player', 'team_id'])

In [314]:
full_stats

Unnamed: 0,player,pos,age,team_id,g,gs,mp_per_g_x,fg_per_g_x,fga_per_g_x,fg_pct_x,...,ft_pct_y,orb_per_g_y,drb_per_g_y,trb_per_g_y,ast_per_g_y,stl_per_g_y,blk_per_g_y,tov_per_g_y,pf_per_g_y,pts_per_g_y
0,Álex Abrines,SG,23,OKC,68,6,15.5,2.0,5.0,.393,...,.898,0.3,1.0,1.3,0.6,0.5,0.1,0.5,1.7,6.0
1,Quincy Acy,PF,26,TOT,38,1,14.7,1.8,4.5,.412,...,.750,0.5,2.5,3.0,0.5,0.4,0.4,0.6,1.8,5.8
2,Quincy Acy,PF,26,DAL,6,0,8.0,0.8,2.8,.294,...,.667,0.3,1.0,1.3,0.0,0.0,0.0,0.3,1.5,2.2
3,Quincy Acy,PF,26,BRK,32,1,15.9,2.0,4.8,.425,...,.754,0.6,2.8,3.3,0.6,0.4,0.5,0.6,1.8,6.5
4,Steven Adams,C,23,OKC,80,80,29.9,4.7,8.2,.571,...,.611,3.5,4.2,7.7,1.1,1.1,1.0,1.8,2.4,11.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590,Cody Zeller,C,24,CHO,62,58,27.8,4.1,7.1,.571,...,.679,2.2,4.4,6.5,1.6,1.0,0.9,1.0,3.0,10.3
591,Tyler Zeller,C,27,BOS,51,5,10.3,1.5,3.1,.494,...,.564,0.8,1.6,2.4,0.8,0.1,0.4,0.4,1.2,3.5
592,Stephen Zimmerman,C,20,ORL,19,0,5.7,0.5,1.6,.323,...,.600,0.6,1.3,1.8,0.2,0.1,0.3,0.2,0.9,1.2
593,Paul Zipser,SF,22,CHI,44,18,19.2,2.0,5.0,.398,...,.775,0.3,2.5,2.8,0.8,0.3,0.4,0.9,1.8,5.5


In [338]:
def get_stats(seasons = []):
    
    season_dfs = [] 
    
    for season in seasons:
    
        per_game_url = f'https://www.basketball-reference.com/leagues/NBA_{season}_per_game.html'
        res = requests.get(per_game_url)
        soup = BeautifulSoup(res.content, 'lxml')
        
        col_heads = []
        for head in header.find_all('th')[1::]:
            col_heads.append(head.attrs['data-stat'])
            
        players = []
        table = soup.find('tbody')
        for player in table.find_all('tr'):
            player_dict = {}
            for i, stat in enumerate(col_heads):
                try:
                    player_dict[stat] = player.find_all('td')[i].text
                except:
                    pass
            player_dict['season'] = season
            
            players.append(player_dict)
    
        per_game_df = pd.DataFrame(players)
        per_game_df.dropna(inplace = True)
        
        
        advanced_url = f'https://www.basketball-reference.com/leagues/NBA_{season}_advanced.html'
        res_advanced = requests.get(advanced_url)
        soup = BeautifulSoup(res_advanced.content, 'lxml')
        
        col_heads = []
        for head in header.find_all('th')[1::]:
            col_heads.append(head.attrs['data-stat'])
            
        players = []
        table = soup.find('tbody')
        for player in table.find_all('tr'):
            player_dict = {}
            for i, stat in enumerate(col_heads):
                try:
                    player_dict[stat] = player.find_all('td')[i].text
                except:
                    pass
            player_dict['season'] = season
            
            players.append(player_dict)
    
        advanced_df = pd.DataFrame(players)
        advanced_df.drop(columns = ['pos', 'age', 'g', 'gs'], inplace = True)
        advanced_df.dropna(inplace = True)
    
        season_df = pd.merge(per_game_df, advanced_df, on = ['player', 'season', 'team_id'])
        season_dfs.append(season_df)
        
    full_df = pd.concat(objs = season_dfs)
    
    full_df.reset_index(drop = True, inplace = True)
        
    return full_df
        

In [339]:
stats = get_stats(seasons = [2017, 2018])

In [340]:
stats.shape

(1259, 52)

In [342]:
stats['player'][0]

'Álex Abrines'