## Parsing NBA Data for Analysis

#### Declare Libraries

In [None]:
# import libraries
import pandas as pd
from bs4 import BeautifulSoup
import os

Important: the CSV outputs below are written to a directory called 'csv_files', so make sure it exists in the working directory before exporting.

In [None]:
# name of the folder that receives every exported csv file
directory = 'csv_files'

# create the folder up front (no-op when it already exists) so the later
# to_csv calls cannot fail on a missing output directory
os.makedirs(directory, exist_ok=True)

#### Define Data

In [None]:
# list the saved html file names found in each raw-data directory
award_data, player_stats_data, team_standing_data = (
    os.listdir(folder)
    for folder in ("nba_awards", "player_stats", "team_standings")
)

# lookup table: team abbreviation used in the player tables -> full team name
# used in the standings tables (several abbreviations belong to former
# names or locations of a franchise)
nba_team_dict = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'NJN': 'New Jersey Nets',
    'CHO': 'Charlotte Hornets',
    'CHA': 'Charlotte Bobcats',
    'CHH': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'VAN': 'Vancouver Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NOH': 'New Orleans Hornets',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'SEA': 'Seattle SuperSonics',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards',
    'WSB': 'Washington Bullets',
    'NOK': 'New Orleans/Oklahoma City Hornets',
}

#### Parsing HTML

In [None]:
# function for parsing through html files
def parse_html(temp_file):
    """
    Load a saved html page and remove the table rows that are not data.

    :param temp_file: path of a saved html file
    :return: a soup instance from the BeautifulSoup library in which every
             ``tr.over_header`` and ``tr.thead`` row has been removed
    """

    # hand bs4 the raw bytes so it works out the page encoding itself instead
    # of decoding with the platform default, and name the parser explicitly so
    # the result does not depend on which optional parsers are installed
    with open(temp_file, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")

    # will remove any headers and reserve lines from the html
    for row in soup.select("tr.over_header"):
        row.decompose()
    for row in soup.select("tr.thead"):
        row.decompose()

    return soup

#### Get Standing Data using Pandas

In [None]:
def get_standing(temp_soup, season_year):
    """
    Build one cleaned standings table for a season from its parsed standings page.

    :param temp_soup: a temporary soup instance
    :param season_year: the current season year
    :return: a completed and cleaned standing df from the season year with the
             columns team, w, l, w/l%, pd/g, conference_indicator and
             conference_seed, ordered by win percentage (best first)
    """
    # local import: literal html has to be wrapped in a buffer because newer
    # pandas versions deprecate passing the markup string to read_html directly
    from io import StringIO

    # parse html to get list of dataframes
    season_standings = pd.read_html(StringIO(str(temp_soup)))

    # seasons 1997-2015 use every table on the page, any other season only the first two
    if not 1997 <= season_year < 2016:
        season_standings = season_standings[:2]

    # loop through and clean
    cleaned_frames = []
    for conference_df in season_standings:

        # add a conference indicator: 0 for the table that has an
        # 'Eastern Conference' column, 1 for any other table
        if 'Eastern Conference' in conference_df.columns:
            conference_df["conference_indicator"] = 0
        else:
            conference_df["conference_indicator"] = 1

        # rename conference column (the first column) to team
        curr_conference = conference_df.columns[0]
        conference_df = conference_df.rename(columns={curr_conference: 'team'})

        # remove asterisks; regex=False keeps "*" a literal on every pandas version
        conference_df["team"] = conference_df["team"].str.replace("*", "", regex=False)

        # create a point differential column
        conference_df["PD/G"] = conference_df["PS/G"] - conference_df["PA/G"]

        # sort correctly to get seeds: most wins first, point differential breaks ties
        conference_df = conference_df.sort_values(by=['W', 'PD/G'], ascending=[False, False])
        conference_df = conference_df.reset_index(drop=True)
        conference_df["conference_seed"] = conference_df.index + 1

        # keep only the columns needed downstream
        cleaned_frames.append(
            conference_df[["team", "W", "L", "W/L%", "PD/G",
                           "conference_indicator", "conference_seed"]])

    # DataFrame.append no longer exists in pandas 2.x, so stack the pieces with concat
    cleaned_final_df = pd.concat(cleaned_frames, ignore_index=True)

    # sort by win percentage and reset index
    cleaned_final_df = cleaned_final_df.sort_values('W/L%', ascending=False)
    cleaned_final_df = cleaned_final_df.reset_index(drop=True)

    # turn all columns to lowercase
    cleaned_final_df.columns = cleaned_final_df.columns.str.lower()

    return cleaned_final_df

#### Get Player Stats Data using Pandas

In [None]:
def replace_tot(row, stats_df):
    """
    Resolve the placeholder team 'TOT' to one of the player's actual teams.

    :param row: a row for a player
    :param stats_df: a comprehensive player stats df
    :return: the row's own team abbreviation, or, when that is 'TOT', the
             abbreviation of the non-'TOT' row of the same player with the
             most games played
    """

    current_team = row['tm']

    # nothing to resolve for players listed under a real team
    if current_team != 'TOT':
        return current_team

    # all rows of this player except the 'TOT' row itself
    same_player = stats_df['player'] == row['player']
    real_team = stats_df['tm'] != 'TOT'
    stints = stats_df[same_player & real_team]

    # the stint with the most games played decides the team
    busiest_first = stints.sort_values(by='g', ascending=False)
    return busiest_first.iloc[0]['tm']

In [None]:
def get_stats(soup_dict, season_year):
    """
    Merge the per-page player tables of one season into a single stats table.

    :param soup_dict: a dictionary with the df names and their soup instances
    :param season_year: the current season year
    :return: a completed and cleaned stats df from the season year with
             lowercase column names, one row per player name, sorted by the
             'pts' column (highest first)
    """
    # local import: literal html has to be wrapped in a buffer because newer
    # pandas versions deprecate passing the markup string to read_html directly
    from io import StringIO

    # read all the soup instances as dataframes (first table of each page)
    df_dict = {
        temp_filename: pd.read_html(StringIO(str(soup_instance)))[0]
        for temp_filename, soup_instance in soup_dict.items()
    }

    # get rid of the per-36, totals, and shooting stats; a page that was never
    # saved is simply skipped instead of raising KeyError
    for unused_page in ("per_minute", "totals", "shooting"):
        df_dict.pop(f"NBA_Season_{season_year}_{unused_page}.html", None)

    # remove unnamed columns
    for temp_df in df_dict.values():
        unnamed_cols = [col for col in temp_df.columns if 'Unnamed' in col]
        temp_df.drop(columns=unnamed_cols, inplace=True)

    # columns to discard per page type; the first marker found in the file
    # name wins, and pages matching no marker use the fallback list
    unwanted_by_marker = (
        ('adj_shooting', ['Rk', 'Player', 'Pos', 'Age',
                          'Team', 'G', 'MP', 'FG', 'FT',
                          '2P', '3P', 'FG Add', 'TS Add']),
        ('play-by-play', ['Rk', 'Player', 'Pos', 'Age',
                          'Tm', 'G', 'MP', 'PG%', 'SG%', 'SF%',
                          'PF%', 'C%', 'OnCourt', 'BadPass',
                          'LostBall', 'Shoot', 'Off.', 'Shoot.1',
                          'Off..1', 'PGA', 'And1', 'Blkd']),
        ('per_poss', ['Rk', 'Player', 'Pos', 'Age', 'Tm',
                      'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
                      '3P', '3PA', '3P%', '2P', '2PA', '2P%',
                      'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
                      'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']),
        ('per_game', ['Rk', 'eFG%']),
    )
    fallback_unwanted = ['Rk', 'Player', 'Pos', 'Age', 'Tm',
                         'G', 'MP', 'TS%', '3PAr', 'FTr']

    # remove unwanted columns depending on dataframe
    for temp_df_name, temp_df in df_dict.items():
        unwanted_cols = next(
            (cols for marker, cols in unwanted_by_marker if marker in temp_df_name),
            fallback_unwanted)
        temp_df.drop(columns=unwanted_cols, inplace=True)

    # concatenate all the dataframes side by side; rows are matched on the row
    # index, so every page is assumed to list the players in the same order
    # (NOTE(review): confirm against the saved pages)
    combined_df = pd.concat(list(df_dict.values()), axis=1)

    # reorganize all the columns
    new_column_order = ['Player', 'Pos', 'Age', 'Tm',
                        'G', 'GS', 'MP', 'PTS', 'ORB',
                        'DRB', 'TRB', 'AST', 'STL', 'BLK',
                        'TOV', 'PF', 'FG', 'FGA', 'FG%', '3P',
                        '3PA', '3P%', '2P', '2PA', '2P%', 'FT',
                        'FTA', 'FT%', 'eFG', 'TS', 'FTr', '3PAr',
                        'FG+', '2P+', '3P+', 'eFG+', 'FT+', 'TS+',
                        'FTr+', '3PAr+', 'On-Off', 'ORtg', 'DRtg',
                        'PER', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
                        'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS',
                        'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    combined_df = combined_df[new_column_order]

    # turn all columns to lowercase
    combined_df.columns = combined_df.columns.str.lower()

    # replace NaNs with 0s
    combined_df = combined_df.fillna(0)

    # players listed under 'TOT' get the team they played the most games for
    combined_df['tm'] = combined_df.apply(lambda x: replace_tot(x, combined_df), axis=1)

    # remove duplicates: keep, per player name, the row with the most games
    combined_df = combined_df.sort_values(by='g', ascending=False)
    combined_df = combined_df.drop_duplicates(subset=['player'], keep='first')

    # remove players who have a name of 0 (rows whose name was NaN before the fill)
    combined_df = combined_df[combined_df['player'] != 0]

    # sort by top PPG and remove asterisks from names; regex=False keeps "*"
    # a literal on every pandas version
    combined_df = combined_df.sort_values(by='pts', ascending=False)
    combined_df['player'] = combined_df['player'].str.replace("*", "", regex=False)

    return combined_df

#### Get NBA Award using Pandas

In [None]:
def get_awards(soup_dict):
    """
    Turn the parsed award pages of one season into cleaned award tables.

    :param soup_dict: a dictionary with the df names and their soup instances
    :return: a dictionary mapping each award name (taken from the file name)
             to its cleaned award df; every column is prefixed with the award
             name and lowercased
    """
    # local import: literal html has to be wrapped in a buffer because newer
    # pandas versions deprecate passing the markup string to read_html directly
    from io import StringIO

    # columns kept for each kind of award table
    reg_award_wanted_columns = ['Rank', 'Player', 'Age', 'Tm', 'First',
                                'Pts Won', 'Share']
    all_nba_wanted_columns = ['# Tm', 'Player', 'Age', 'Tm',
                              'Pts Won', 'Share', '1st Tm',
                              '2nd Tm', '3rd Tm']
    all_defense_wanted_columns = ['# Tm', 'Player', 'Age', 'Tm',
                                  'Pts Won', 'Share']

    final_dict = {}
    for temp_filename, soup_instance in soup_dict.items():

        # first table of the page
        temp_df = pd.read_html(StringIO(str(soup_instance)))[0]

        # the award name is rebuilt from fixed positions of the
        # underscore-separated file name
        name_parts = temp_filename.split("_")
        is_all_nba = False

        if 'all_defense' in temp_filename:
            wanted_columns = all_defense_wanted_columns
            column_prefix = 'all_defense_'
            award_name = name_parts[5] + "_" + name_parts[6]
        elif 'all_nba' in temp_filename:
            wanted_columns = all_nba_wanted_columns
            column_prefix = 'all_nba_'
            award_name = name_parts[5] + "_" + name_parts[6]
            is_all_nba = True
        else:
            wanted_columns = reg_award_wanted_columns
            award_name = name_parts[3] + "_" + name_parts[4]
            column_prefix = award_name + "_"

        # take wanted columns and prefix them; rename returns a new frame, so
        # nothing is modified in place on a selection of the parsed table
        temp_df = temp_df[wanted_columns].rename(
            columns={col: column_prefix + col for col in wanted_columns})

        # get rid of NaNs (only done for the all-nba table)
        if is_all_nba:
            temp_df = temp_df.fillna(0)

        # turn all columns to lowercase
        temp_df.columns = temp_df.columns.str.lower()

        # add to the finalized dictionary
        final_dict[award_name] = temp_df

    return final_dict

#### Loop Through each Season and Create a Dataframe for Each

In [None]:
def find_htmls(year, html_files, directory_string):
    """
    Parse every html file of one year found in a directory listing.

    :param year: a specific year
    :param html_files: a list of html files spanning multiple years
    :param directory_string: a string representing the directory
    :return: a dictionary mapping each file name that contains the year to
             the soup instance parsed from that file
    """

    year_token = str(year)

    # a file belongs to the year when the year's digits occur anywhere in its name
    return {
        file_name: parse_html(f"{directory_string}/{file_name}")
        for file_name in html_files
        if year_token in file_name
    }

In [None]:
def get_seed(team_name, standing_df):
    """
    Look up the conference seed of a player's team.

    :param team_name: a specific abbreviated team name of a player
    :param standing_df: the dataframe containing important standing information
    :return: the 'conference_seed' value of the first standings row whose
             team name contains the full name of the given team
    """

    # translate the abbreviation through the global team dictionary
    full_team_name = nba_team_dict[team_name]

    # rows whose team text contains the full name; the first one is used
    name_matches = standing_df['team'].str.contains(full_team_name)
    first_match_label = standing_df.index[name_matches][0]

    return standing_df.loc[first_match_label, 'conference_seed']

In [None]:
def get_win_perc(team_name, standing_df):
    """
    Look up the win percentage of a player's team.

    :param team_name: a specific abbreviated team name of a player
    :param standing_df: the dataframe containing important standing information
    :return: the 'w/l%' value of the first standings row whose team name
             contains the full name of the given team
    """

    # translate the abbreviation through the global team dictionary
    full_team_name = nba_team_dict[team_name]

    # rows whose team text contains the full name; the first one is used
    name_matches = standing_df['team'].str.contains(full_team_name)
    first_match_label = standing_df.index[name_matches][0]

    return standing_df.loc[first_match_label, 'w/l%']

In [None]:
def get_award_share(award_name, player_name, award_df):
    """
    Look up a player's vote share for one award.

    :param award_name: the award name of the dataframe
    :param player_name: a specific player name
    :param award_df: a specific award ranking dataframe
    :return: the share of the specific player for the specific award, or 0.0
             when the player does not appear in the ranking
    """

    player_column = award_df[f'{award_name}_player']

    # players missing from the ranking get a share value of 0.0
    if player_name not in player_column.values:
        return 0.0

    # read the share from the first row that lists the player
    first_match_label = award_df.index[player_column == player_name][0]
    return award_df.loc[first_match_label, f'{award_name}_share']

In [None]:
def get_season_data(season):
    """
    Assemble the per-player table of one season: stats, team standing and award shares.

    :param season: a current nba season
    :return: a dataframe with nba data from the current season; besides the
             stats columns it carries team_seed, win_percentage, one
             '<award name>_share' column per award table, and year
    """

    # every lookup below uses the `season` argument (not a module-level loop
    # variable), so the function gives the right season wherever it is called from
    season_token = str(season)

    # get the standing dataframe from the first standings file whose name contains the season
    corr_html = [html_string for html_string in team_standing_data if season_token in html_string][0]
    filename = f"team_standings/{corr_html}"
    temp_standing_soup = parse_html(filename)
    season_standing_df = get_standing(temp_standing_soup, season)

    # get the stats dataframe by making a dictionary and then forging a dataframe
    temp_season_stats_dict = find_htmls(season, player_stats_data, 'player_stats')
    season_stats_df = get_stats(temp_season_stats_dict, season)

    # get the awards dataframes by making a dictionary and then forging a dataframe
    season_awards_df_dict = get_awards(find_htmls(season, award_data, 'nba_awards'))

    # add standing columns for each player
    season_stats_df['team_seed'] = (season_stats_df['tm'].apply
                                    (lambda x: get_seed(x, season_standing_df)))
    season_stats_df['win_percentage'] = (season_stats_df['tm'].apply
                                         (lambda x: get_win_perc(x, season_standing_df)))

    # add one share column per award for each player; the lambda is consumed
    # by apply inside the same iteration, so it sees the current df_name/df
    for df_name, df in season_awards_df_dict.items():
        season_stats_df[f'{df_name}_share'] = (season_stats_df['player'].apply
                                               (lambda x: get_award_share(df_name, x, df)))

    # finally add the season year for each player
    season_stats_df.reset_index(drop=True, inplace=True)
    season_stats_df['year'] = season

    return season_stats_df

In [None]:
# seasons to process: 1997 up to and including 2023
seasons = [*range(1997, 2024)]

# collects one dataframe per season, in season order
season_info = []

# build each season's dataframe in turn and print the year as a progress marker
for temp_season in seasons:

    season_df = get_season_data(temp_season)
    season_info.append(season_df)
    print(temp_season)

##### Create the Final Dataframe for CSV Output

In [None]:
# stack the per-season dataframes on rows and renumber the rows from 0
nba_df = pd.concat(season_info, axis=0).reset_index(drop=True)

# write the combined table into the output directory without the index column
file_path = os.path.join(directory, 'nba_data.csv')
nba_df.to_csv(file_path, index=False)

##### Create a Separate Dataframe just for the 2024 Season

In [None]:
# standings: parse the first standings file whose name contains '2024'
htmls = [html_string for html_string in team_standing_data if '2024' in html_string][0]
standing_filename = "team_standings/" + htmls
standing_2024_soup = parse_html(standing_filename)
season_2024_standings = get_standing(standing_2024_soup, 2024)

# player stats: parse every 2024 stats page and merge them into one dataframe
stats_dict_2024 = find_htmls(2024, player_stats_data, 'player_stats')
season_2024_df = get_stats(stats_dict_2024, 2024)

# attach the seed and win percentage of each player's team; the standings
# table is handed to the lookup functions as their second argument
season_2024_df['team_seed'] = season_2024_df['tm'].apply(
    get_seed, args=(season_2024_standings,))
season_2024_df['win_percentage'] = season_2024_df['tm'].apply(
    get_win_perc, args=(season_2024_standings,))

# tag every row with the season
season_2024_df['year'] = 2024

# renumber the rows from 0
season_2024_df.reset_index(drop=True, inplace=True)

# write the 2024 table into the output directory without the index column
file_path = os.path.join(directory, '2024_nba_data.csv')
season_2024_df.to_csv(file_path, index=False)

##### Parse the Rookies' List HTML into a Dataframe and Export as a CSV File

In [None]:
# load the first table of the saved rookies page
rookies = pd.read_html('NBA_Season_2024_Rookies.html')[0]

# the header has more than one level; discard the top level
rookies.columns = rookies.columns.droplevel(0)

# only the player names are needed
rookies = rookies[['Player']]

# drop rows without a name and rows whose name contains the text 'Player'
missing_name = rookies['Player'].isna()
has_header_text = rookies['Player'].str.contains('Player')
rookies = rookies[~(missing_name | has_header_text)]

# renumber the rows from 0
rookies = rookies.reset_index(drop=True)

# write the rookie list into the output directory without the index column
file_path = os.path.join(directory, '2024_nba_rookies.csv')
rookies.to_csv(file_path, index=False)