In [1]:
import pandas as pd
import pickle
import unicodedata
import os

#--------------------------Define Function---------------------------------

def strip_accents(s):
    """Return `s` with combining accent marks removed.

    The string is put into Unicode NFD form, which separates base characters
    from their combining marks, and every character whose Unicode category is
    'Mn' (nonspacing mark) is then discarded.

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        The decomposed text without nonspacing marks.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

#--------------------------User Inputs---------------------------------

# path to the pickled dictionary used further down to replace values in the 'Tm' column
team_pickple_path = '/Users/nya/NBA project/NBA Project/utilities/teams_abrv_dictionary.p'

#--------------------------Load Files---------------------------------

# scraped player statistics, one CSV covering the 2013-2023 file
player_df = pd.read_csv('/Users/nya/NBA project/NBA Project/00/scraped datasets/player_stats_2013_2023.csv')

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
# The context manager guarantees the file handle is closed once the
# dictionary has been read (the bare open() call previously leaked it).
with open(team_pickple_path, "rb") as team_pickle_file:
    team_name_dict = pickle.load(team_pickle_file)

#-------------------Clean Data - Section 1: Player Names----------------------

# clean up player names, in order:
#   1. strip accents (via strip_accents)
#   2. remove '*' characters
#   3. remove periods
#   4. trim leading/trailing whitespace
# Steps 2 and 3 use literal (regex=False) matching. The earlier pattern '/.'
# was a regex meaning "slash followed by any character", so periods were never
# actually removed, and '\*' was an invalid escape in a non-raw string.
player_df['Player'] = (
    player_df['Player']
    .apply(strip_accents)
    .str.replace('*', '', regex=False)
    .str.replace('.', '', regex=False)
    .str.strip()
)

#----------------- Clean Data - Section 2: Format Team Names------------------

# Swap every 'Tm' value that appears as a key in team_name_dict for the
# dictionary's value (presumably abbreviation -> full team name; verify
# against the pickle contents). Values without a matching key are kept as-is.
player_df['Tm'] = player_df['Tm'].replace(to_replace=team_name_dict)

#----------------- Clean Data - Section 3: Format Dataset------------------

# discard the leading column of the loaded frame
player_df.drop(columns=[player_df.columns[0]], inplace=True)

# statistic columns that are not needed for the analysis
columns_to_drop = [
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
player_df.drop(columns=columns_to_drop, inplace=True)

# relabel the remaining columns positionally; pandas raises ValueError if the
# number of labels does not match the number of remaining columns
player_df.columns = ['Year', 'Season', 'Player', 'Pos', 'Age', 'Team', 'G', 'GS', 'MPPG']

# keep the first row seen for each player name, then pull out the name column
# as a pandas Series of unique player names
player_unique_df = player_df.drop_duplicates(subset='Player')
player_names_series = player_unique_df['Player']

#-----------------------Save Cleaned Data-----------------------------

# directory to save cleaned files
save_directory = '/Users/nya/NBA project/NBA Project/01/clean datasets'
# make sure the directory exists first; to_csv raises OSError when asked to
# write into a directory that has not been created
os.makedirs(save_directory, exist_ok=True)

# define filepaths
cleaned_data_path = os.path.join(save_directory, 'player_stats_cleaned.csv')
unique_player_names_path = os.path.join(save_directory, 'unique_player_names.csv')

# saving files (index=False keeps the pandas row index out of the CSVs)
player_df.to_csv(cleaned_data_path, index=False)
player_names_series.to_csv(unique_player_names_path, index=False)

# report where each file was written
print(f"Cleaned dataset saved to {cleaned_data_path}")
print(f"Unique player names saved to {unique_player_names_path}")

Cleaned dataset saved to /Users/nya/NBA project/NBA Project/01/clean datasets/player_stats_cleaned.csv
Unique player names saved to /Users/nya/NBA project/NBA Project/01/clean datasets/unique_player_names.csv
