# Data Collection

In [60]:
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
import unicodedata

In [61]:
# function from user BartoszKP on Stack Overflow solution

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

In [62]:
int_stats = ['age', 'g', 'gs', 'mp', 'season']

float_stats = ['mp_per_g', 'fg_per_g',
       'fga_per_g', 'fg_pct', 'fg3_per_g', 'fg3a_per_g', 'fg3_pct',
       'fg2_per_g', 'fg2a_per_g', 'fg2_pct', 'efg_pct', 'ft_per_g',
       'fta_per_g', 'ft_pct', 'orb_per_g', 'drb_per_g', 'trb_per_g',
       'ast_per_g', 'stl_per_g', 'blk_per_g', 'tov_per_g', 'pf_per_g',
       'pts_per_g','per', 'ts_pct', 'fg3a_per_fga_pct',
       'fta_per_fga_pct', 'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct',
       'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ws-dum', 'ows', 'dws',
       'ws', 'ws_per_48', 'bpm-dum', 'obpm', 'dbpm', 'bpm', 'vorp']

In [63]:
team_floats = ['team_PTS', 'team_FGM', 'team_FGA', 'team_FG%',
       'team_3PM', 'team_3PA', 'team_3P%', 'team_FTM', 'team_FTA', 'team_FT%',
       'team_OR', 'team_DR', 'team_REB', 'team_AST', 'team_STL', 'team_BLK',
       'team_TO', 'team_PF']

In [79]:
abbr_dict  = {'Atlanta Hawks': 'ATL', 'Boston Celtics': 'BOS', 'Brooklyn Nets': 'BRK',
              'Chicago Bulls': 'CHI', 'Charlotte Hornets': 'CHO','Cleveland Cavaliers': 'CLE',
              'Dallas Mavericks': 'DAL', 'Denver Nuggets': 'DEN', 'Detroit Pistons': 'DET', 'Golden State Warriors': 'GSW',
              'Houston Rockets': 'HOU', 'Indiana Pacers': 'IND', 'Los Angeles Clippers': 'LAC',
              'Los Angeles Lakers': 'LAL', 'Memphis Grizzlies': 'MEM', 'Miami Heat': 'MIA',
              'Milwaukee Bucks': 'MIL', 'Minnesota Timberwolves': 'MIN', 'New York Knicks': 'NYK',
              'New Orleans Pelicans': 'NOP', 'Oklahoma City Thunder': 'OKC', 'Orlando Magic': 'ORL',
              'Pheonix Suns': 'PHO', 'Philadelphia 76ers': 'PHI', 'Portland Trailblazers': 'POR', 
              'Sacramento Kings': 'SAC','San Antonio Spurs': 'SAS', 'Toronto Raptors': 'TOR',
              'Utah Jazz': 'UTA', 'Washington Wizards': 'WAS'}

In [68]:
def empty_is_zero(s):
    if s == '':
        return '0.0'
    else:
        return s

In [69]:
def get_stats(seasons = []):
    
    season_dfs = [] 
    
    for season in seasons:
    
        per_game_url = f'https://www.basketball-reference.com/leagues/NBA_{season}_per_game.html'
        res = requests.get(per_game_url)
        soup = BeautifulSoup(res.content, 'lxml')
        
        header = soup.find('thead')
        
        col_heads = []
        for head in header.find_all('th')[1::]:
            col_heads.append(head.attrs['data-stat'])
            
        players = []
        table = soup.find('tbody')
        for player in table.find_all('tr'):
            player_dict = {}
            for i, stat in enumerate(col_heads):
                try:
                    player_dict[stat] = player.find_all('td')[i].text
                except:
                    pass
            player_dict['season'] = season
            
            players.append(player_dict)
    
        per_game_df = pd.DataFrame(players)
        per_game_df.dropna(inplace = True)
        
        
        advanced_url = f'https://www.basketball-reference.com/leagues/NBA_{season}_advanced.html'
        res_advanced = requests.get(advanced_url)
        soup = BeautifulSoup(res_advanced.content, 'lxml')
        
        
            
        header_a = soup.find('thead')

        col_heads_a = []
        for head in header_a.find_all('th')[1::]:
            col_heads_a.append(head.attrs['data-stat'])
            
        players = []
        table = soup.find('tbody')
        for player in table.find_all('tr'):
            player_dict = {}
            for i, stat in enumerate(col_heads_a):
                try:
                    player_dict[stat] = player.find_all('td')[i].text
                except:
                    pass
            player_dict['season'] = season
            
            players.append(player_dict)
    
        advanced_df = pd.DataFrame(players)
        
        
        advanced_df.drop(columns = ['pos', 'age', 'g'], inplace = True)
        advanced_df.dropna(inplace = True)
    
        season_df = pd.merge(per_game_df, advanced_df, on = ['player', 'season', 'team_id'])
        season_dfs.append(season_df)
        
    full_df = pd.concat(objs = season_dfs)
    
    full_df.reset_index(drop = True, inplace = True)
    
    full_df['player'] = full_df['player'].map(strip_accents)
    
    
    for col in full_df[float_stats]:
        full_df[col] = full_df[col].map(empty_is_zero)
    
    
    full_df[float_stats] = full_df[float_stats].astype(float)
    full_df[int_stats] = full_df[int_stats].astype(int)
        
    return full_df
        

In [70]:
def get_salaries(seasons = []):
    season_dfs = []
    
    for season in seasons:
        
        pay_url = f'https://hoopshype.com/salaries/players/{season - 1}-{season}/'
        pay_res = requests.get(pay_url)
        pay_soup = BeautifulSoup(pay_res.content, 'lxml')
        
        
        salary_list = []
        pay_table = pay_soup.find('tbody')
        for player in pay_table.find_all('tr'):
            salary_dict = {}
            
            salary_dict['player'] = player.find('td', class_ = 'name').text.replace('\t', '').replace('\n', '')
            
            salary_dict['salary'] = player.find_all('td')[-2].text.replace('\t', '').replace('\n', '').replace('$', '').replace(',', '')
    
            salary_list.append(salary_dict)
    
        salaries = pd.DataFrame(salary_list)
        salaries['salary'] = salaries['salary'].astype(int)
        salaries['season'] = season
        
        season_dfs.append(salaries)
        
    salaries_full = pd.concat(objs = season_dfs)
    
    return salaries_full

In [94]:
def get_team_stats(seasons = []):
    
    df_list = []
    for season in seasons:


        url = f'https://www.espn.com/nba/stats/team/_/season/{season}/seasontype/2'
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'lxml')

        team_table = soup.find('table').find_all('td')[1::2]

        stat_table = soup.find('table', class_= 'Table Table--align-right')

        team_list = []
        for i, team in enumerate(team_table):
            team_dict = {}
            team_dict['team_id'] = team.text
    
            for n, stat in enumerate(stat_table.find_all('th')):
                team_dict[f'team_{stat.text}'] = stat_table.find_all('tr')[i + 1].find_all('td')[n].text
        
    
            team_list.append(team_dict)
    
        team_df = pd.DataFrame(team_list)

        team_df['team_id'] = team_df['team_id'].map(abbr_dict)

        team_df[team_floats] = team_df[team_floats].astype(float)
        
        team_df['season'] = season
        
        df_list.append(team_df)
        
    teams = pd.concat(objs = df_list)
    
    return teams


In [99]:
def get_nba_data(seasons = []):
    
    stats = get_stats(seasons)
    
    salaries = get_salaries(seasons)
    
    team_stats = get_team_stats(seasons)
    
    player_data = pd.merge(stats, salaries, on = ['player', 'season'])
    
    nba_data = pd.merge(player_data, team_stats, on = ['team_id', 'season'])
    
    return nba_data

In [100]:
nba_data = get_nba_data(seasons = [2017, 2018, 2019])

In [102]:
nba_data.shape

(1497, 73)

In [101]:
nba_data.head()

Unnamed: 0,player,pos,age,team_id,g,gs,mp_per_g,fg_per_g,fga_per_g,fg_pct,...,team_FTA,team_FT%,team_OR,team_DR,team_REB,team_AST,team_STL,team_BLK,team_TO,team_PF
0,Alex Abrines,SG,23,OKC,68,6,15.5,2.0,5.0,0.393,...,25.8,74.5,12.2,34.4,46.6,21.0,7.9,5.0,14.6,20.9
1,Steven Adams,C,23,OKC,80,80,29.9,4.7,8.2,0.571,...,25.8,74.5,12.2,34.4,46.6,21.0,7.9,5.0,14.6,20.9
2,Semaj Christon,PG,24,OKC,64,1,15.2,1.2,3.5,0.345,...,25.8,74.5,12.2,34.4,46.6,21.0,7.9,5.0,14.6,20.9
3,Norris Cole,PG,28,OKC,13,0,9.6,1.2,4.0,0.308,...,25.8,74.5,12.2,34.4,46.6,21.0,7.9,5.0,14.6,20.9
4,Nick Collison,PF,36,OKC,20,0,6.4,0.7,1.2,0.609,...,25.8,74.5,12.2,34.4,46.6,21.0,7.9,5.0,14.6,20.9


In [106]:
nba_data.columns

Index(['player', 'pos', 'age', 'team_id', 'g', 'gs', 'mp_per_g', 'fg_per_g',
       'fga_per_g', 'fg_pct', 'fg3_per_g', 'fg3a_per_g', 'fg3_pct',
       'fg2_per_g', 'fg2a_per_g', 'fg2_pct', 'efg_pct', 'ft_per_g',
       'fta_per_g', 'ft_pct', 'orb_per_g', 'drb_per_g', 'trb_per_g',
       'ast_per_g', 'stl_per_g', 'blk_per_g', 'tov_per_g', 'pf_per_g',
       'pts_per_g', 'season', 'mp', 'per', 'ts_pct', 'fg3a_per_fga_pct',
       'fta_per_fga_pct', 'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct',
       'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ws-dum', 'ows', 'dws',
       'ws', 'ws_per_48', 'bpm-dum', 'obpm', 'dbpm', 'bpm', 'vorp', 'salary',
       'team_GP', 'team_PTS', 'team_FGM', 'team_FGA', 'team_FG%', 'team_3PM',
       'team_3PA', 'team_3P%', 'team_FTM', 'team_FTA', 'team_FT%', 'team_OR',
       'team_DR', 'team_REB', 'team_AST', 'team_STL', 'team_BLK', 'team_TO',
       'team_PF'],
      dtype='object')

In [107]:
drop_cols = ['team_GP', 'team_FG%','team_FT%', 'team_OR','team_DR','team_STL', 'team_BLK', 'team_TO', 'team_PF',
             'team_3P%',]

In [108]:
nba_data.drop(columns = drop_cols, inplace = True)

In [109]:
nba_data.shape

(1497, 63)

In [110]:
nba_data.to_csv('../Data/nba_data.csv')