# Data Collection

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
import unicodedata

In [None]:
# Accent-stripping helper, based on a Stack Overflow answer by user BartoszKP

def strip_accents(s):
    """Return `s` with its combining accent marks removed.

    The string is decomposed with Unicode NFD normalization and every
    character whose Unicode category is 'Mn' is then discarded.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [None]:
# Column names grouped by the data type each statistic is cast to

# Player columns cast to int
int_stats = [
    'age', 'g', 'gs', 'mp', 'season',
]

# Player columns cast to float
float_stats = [
    'mp_per_g',
    'fg_per_g', 'fga_per_g', 'fg_pct',
    'fg3_per_g', 'fg3a_per_g', 'fg3_pct',
    'fg2_per_g', 'fg2a_per_g', 'fg2_pct',
    'efg_pct',
    'ft_per_g', 'fta_per_g', 'ft_pct',
    'orb_per_g', 'drb_per_g', 'trb_per_g',
    'ast_per_g', 'stl_per_g', 'blk_per_g', 'tov_per_g', 'pf_per_g',
    'pts_per_g',
    'per', 'ts_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct',
    'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct',
    'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct',
    # NOTE(review): 'ws-dum' and 'bpm-dum' look like spacer columns in the
    # scraped table - confirm against the source page
    'ws-dum', 'ows', 'dws', 'ws', 'ws_per_48',
    'bpm-dum', 'obpm', 'dbpm', 'bpm', 'vorp',
]

# Team columns cast to float
team_floats = [
    'team_PTS',
    'team_FGM', 'team_FGA', 'team_FG%',
    'team_3PM', 'team_3PA', 'team_3P%',
    'team_FTM', 'team_FTA', 'team_FT%',
    'team_OR', 'team_DR', 'team_REB',
    'team_AST', 'team_STL', 'team_BLK', 'team_TO', 'team_PF',
]

In [None]:
# Maps each full team name to its official abbreviation
# The abbreviation is the key used when merging datasets

abbr_dict = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BRK',
    'Chicago Bulls': 'CHI',
    'Charlotte Hornets': 'CHO',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'LA Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New York Knicks': 'NYK',
    'New Orleans Pelicans': 'NOP',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Phoenix Suns': 'PHO',
    'Philadelphia 76ers': 'PHI',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}

In [None]:
# Empty strings stand for 0 in the statistics data that will be pulled,
# so this helper swaps them for the string '0.0'

def empty_is_zero(s):
    """Return '0.0' when `s` equals the empty string, otherwise `s` unchanged."""
    return '0.0' if s == '' else s

In [None]:
# Defines the functions that pull the basic and advanced stats from Basketball Reference

def _scrape_stat_table(url, season):
    """Download `url` and turn its first statistics table into a dataframe.

    Args:
        url: Address of the page to scrape.
        season: Value written to the 'season' column of every row.

    Returns:
        A dataframe with one row per <tr> of the first <tbody>. Columns are
        named after the 'data-stat' attribute of the header cells, plus
        'season'. Rows with fewer <td> cells than headers keep missing
        values for the headers they do not cover.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no <thead> or <tbody>.
    """
    res = requests.get(url)
    res.raise_for_status()              # Fail here rather than on a missing table further down
    soup = BeautifulSoup(res.content, 'lxml')

    header = soup.find('thead')
    table = soup.find('tbody')
    if header is None or table is None:
        raise ValueError(f'No statistics table found at {url}')

    # The first header cell is skipped so that the remaining headers line up
    # with the <td> cells of each row
    col_heads = [head.attrs['data-stat'] for head in header.find_all('th')[1:]]

    players = []
    for row in table.find_all('tr'):
        # Cells are looked up once per row. zip stops at the shorter sequence,
        # so break rows that carry no <td> cells yield a dictionary holding only
        # the season, which the caller removes with dropna
        cells = row.find_all('td')
        player_dict = {stat: cell.text for stat, cell in zip(col_heads, cells)}
        player_dict['season'] = season
        players.append(player_dict)

    return pd.DataFrame(players)


def get_stats(seasons=None):
    """Scrape per-game and advanced player statistics for each season.

    Args:
        seasons: Iterable of season identifiers as used in the Basketball
            Reference URLs (for example 2019). None is treated as empty.

    Returns:
        A dataframe with one row per player/team/season combination found on
        both pages. Accents are stripped from player names, the columns in
        `float_stats` are cast to float (empty strings count as 0) and the
        columns in `int_stats` are cast to int.

    Raises:
        ValueError: If no season is given or a page has no statistics table.
        requests.HTTPError: If a page cannot be downloaded.
    """
    seasons = [] if seasons is None else list(seasons)
    if not seasons:
        raise ValueError('get_stats requires at least one season')

    # Stores the complete dataframe of each season
    season_dfs = []

    for season in seasons:

        # Basic statistics: rows with any missing value are dropped
        per_game_url = f'https://www.basketball-reference.com/leagues/NBA_{season}_per_game.html'
        per_game_df = _scrape_stat_table(per_game_url, season).dropna()

        # Advanced statistics: columns also present in the basic statistics
        # are removed before the merge
        advanced_url = f'https://www.basketball-reference.com/leagues/NBA_{season}_advanced.html'
        advanced_df = _scrape_stat_table(advanced_url, season)
        advanced_df = advanced_df.drop(columns=['pos', 'age', 'g']).dropna()

        # Merging on player, season and team_id keeps one row per player stint
        season_df = pd.merge(per_game_df, advanced_df, on=['player', 'season', 'team_id'])
        season_dfs.append(season_df)

    # Combine the seasons and renumber the rows
    full_df = pd.concat(objs=season_dfs)
    full_df.reset_index(drop=True, inplace=True)

    # Remove any accents from player names so that salary data can be merged later
    full_df['player'] = full_df['player'].map(strip_accents)

    # Empty strings represent 0 in the float columns
    for col in float_stats:
        full_df[col] = full_df[col].map(empty_is_zero)

    # Change all float and int categories to their appropriate data type
    full_df[float_stats] = full_df[float_stats].astype(float)
    full_df[int_stats] = full_df[int_stats].astype(int)

    return full_df
        

In [None]:
def get_salaries(seasons=None):
    """Scrape player salaries from Hoops Hype for each season.

    Args:
        seasons: Iterable of integer seasons. Each season is requested as the
            page for '{season - 1}-{season}'. None is treated as empty.

    Returns:
        A dataframe with the columns 'player', 'salary' (int) and 'season',
        built by concatenating the per-season tables. The index is not reset,
        so it restarts at 0 for every season.

    Raises:
        ValueError: If no season is given or a page has no salary table.
        requests.HTTPError: If a page cannot be downloaded.
    """
    seasons = [] if seasons is None else list(seasons)
    if not seasons:
        raise ValueError('get_salaries requires at least one season')

    # Characters removed from the scraped cell text, deleted in a single pass
    name_junk = str.maketrans('', '', '\t\n')
    salary_junk = str.maketrans('', '', '\t\n$,')

    # Stores the data frame of each individual season
    season_dfs = []

    for season in seasons:

        # Scrape the Hoops Hype salary page of the season
        pay_url = f'https://hoopshype.com/salaries/players/{season - 1}-{season}/'
        pay_res = requests.get(pay_url)
        pay_res.raise_for_status()          # Fail here rather than on a missing table further down
        pay_soup = BeautifulSoup(pay_res.content, 'lxml')

        pay_table = pay_soup.find('tbody')
        if pay_table is None:
            raise ValueError(f'No salary table found at {pay_url}')

        # One player/salary dictionary per table row
        salary_list = []
        for player in pay_table.find_all('tr'):
            name_text = player.find('td', class_='name').text
            # The salary is read from the second-to-last cell of the row
            salary_text = player.find_all('td')[-2].text
            salary_list.append({
                'player': name_text.translate(name_junk),
                'salary': salary_text.translate(salary_junk),
            })

        salaries = pd.DataFrame(salary_list)
        salaries['salary'] = salaries['salary'].astype(int)   # Change salary to correct int data type
        salaries['season'] = season                           # Add the current season as a column

        season_dfs.append(salaries)

    # Create a new dataframe by combining the list of individual season dataframes
    salaries_full = pd.concat(objs=season_dfs)

    return salaries_full

In [None]:
def get_team_stats(seasons=None):
    """Scrape regular-season team statistics from ESPN for each season.

    Args:
        seasons: Iterable of season identifiers as used in the ESPN URL.
            None is treated as empty.

    Returns:
        A dataframe with one row per team and season. 'team_id' holds the
        abbreviation looked up in `abbr_dict`, every statistic column is
        named 'team_<header text>', the columns in `team_floats` are cast to
        float, and 'season' holds the season. The index is not reset, so it
        restarts at 0 for every season.

    Raises:
        ValueError: If no season is given or a page lacks the expected tables.
        requests.HTTPError: If a page cannot be downloaded.
    """
    seasons = [] if seasons is None else list(seasons)
    if not seasons:
        raise ValueError('get_team_stats requires at least one season')

    # Stores the data frame of each individual season
    df_list = []

    for season in seasons:

        # Scrape the ESPN team stats website for the specified season
        url = f'https://www.espn.com/nba/stats/team/_/season/{season}/seasontype/2'
        res = requests.get(url)
        res.raise_for_status()              # Fail here rather than on a missing table further down
        soup = BeautifulSoup(res.content, 'lxml')

        name_table = soup.find('table')
        stat_table = soup.find('table', class_='Table Table--align-right')
        if name_table is None or stat_table is None:
            raise ValueError(f'No team statistics tables found at {url}')

        # Every second cell of the first table, starting with the second one,
        # is used as a team name
        team_table = name_table.find_all('td')[1::2]

        # Header texts and rows are collected once instead of once per cell
        stat_names = [stat.text for stat in stat_table.find_all('th')]
        stat_rows = stat_table.find_all('tr')

        team_list = []
        for i, team in enumerate(team_table):
            # Row 0 of the stat table is skipped, so team i is read from row i + 1
            cells = stat_rows[i + 1].find_all('td')

            team_dict = {'team_id': team.text}
            for n, stat_name in enumerate(stat_names):
                team_dict[f'team_{stat_name}'] = cells[n].text

            team_list.append(team_dict)

        team_df = pd.DataFrame(team_list)

        # Changes each full team name to the official abbreviation for merging later on
        # NOTE: names missing from abbr_dict become NaN here
        team_df['team_id'] = team_df['team_id'].map(abbr_dict)

        # Change all stats that are supposed to be floats to the appropriate data types
        team_df[team_floats] = team_df[team_floats].astype(float)

        # Create a season column that is the current season
        team_df['season'] = season

        df_list.append(team_df)

    # Create a new dataframe by combining the dataframes for each individual season
    teams = pd.concat(objs=df_list)

    return teams


In [None]:
def get_nba_data(seasons=None):
    """Build one dataframe of player stats, salaries and team stats.

    Args:
        seasons: Iterable of seasons handed to get_stats, get_salaries and
            get_team_stats. None is treated as empty.

    Returns:
        The player statistics merged with the salaries on 'player' and
        'season', then merged with the team statistics on 'team_id' and
        'season'. Both merges use the pandas default inner join, so rows
        without a match on the other side are dropped.
    """
    # Materialise once so a one-shot iterable (e.g. a generator) is not
    # exhausted by the first scraper
    seasons = [] if seasons is None else list(seasons)

    # Combined player statistics, salaries and team statistics for the seasons
    stats = get_stats(seasons)
    salaries = get_salaries(seasons)
    team_stats = get_team_stats(seasons)

    # Each player's individual season is matched with the salary of that season
    player_data = pd.merge(stats, salaries, on=['player', 'season'])

    # Attach the statistics of the player's team for the same season
    nba_data = pd.merge(player_data, team_stats, on=['team_id', 'season'])

    return nba_data

In [None]:
# Scrape and merge player stats, salaries and team stats for the specified
# seasons and keep the result as a dataframe
# Each season is passed as an integer; get_salaries requests it as the
# '{season - 1}-{season}' page
nba_data = get_nba_data(seasons = [2017, 2018, 2019])

In [None]:
# Drop unnecessary team columns

drop_cols = [
    'team_GP',
    'team_FG%',
    'team_FT%',
    'team_OR',
    'team_DR',
    'team_STL',
    'team_BLK',
    'team_TO',
    'team_PF',
    'team_3P%',
]

# Removed in place, so nba_data itself loses these columns
nba_data.drop(columns=drop_cols, inplace=True)

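**Important**  
Before running the next cell, create a folder titled 'Data' one level above the directory this notebook runs in; the data is saved to `../Data/nba_data.csv`.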

In [None]:
# Save NBA data to your Data folder
# The path is relative to the current working directory; with the to_csv
# defaults the dataframe index is written as the first column of the file
nba_data.to_csv('../Data/nba_data.csv')