# In [1]:
import pandas as pd
import pickle
import numpy as np
import sys
import os

#--------------------------User Inputs---------------------------------

# Input locations; edit these to point at your local copies of the files.
team_pickle_path = '/Users/nya/NBA project/NBA Project/utilities/teams_nickname_dictionary.p'
names_file_path = '/Users/nya/NBA project/NBA Project/01/clean datasets/unique_player_names.csv'
movement_file_path = '/Users/nya/NBA project/NBA Project/00/scraped datasets/nba_movement_data.csv'

#-------------------------Load Files-----------------------------------

# Raw movement data that the cleaning sections below operate on.
movement_df = pd.read_csv(movement_file_path)

# Lookup applied to the 'Team' column further down.
# The context manager closes the file handle as soon as the pickle is read.
# NOTE: unpickling can execute arbitrary code, so team_pickle_path must only
# ever point at a file you created yourself.
with open(team_pickle_path, "rb") as team_pickle_file:
    team_name_dict = pickle.load(team_pickle_file)

# Only the first column of the names file is used; it is handed to
# player_name_standardizer later (presumably the reference spellings -
# confirm against that helper).
player_names_df = pd.read_csv(names_file_path)
player_names_series = player_names_df.iloc[:, 0]

#-------------------Clean Data - Section 1: Format Date----------------------

# Parse 'Date' into datetimes (values that cannot be parsed become NaT), then
# order the rows by date and renumber the index from 0.
parsed_dates = pd.to_datetime(movement_df['Date'], errors='coerce')
movement_df = (
    movement_df
    .assign(Date=parsed_dates)
    .sort_values(by='Date')
    .reset_index(drop=True)
)

#----------------- Clean Data - Section 2: Format Team Names------------------

# Nicknames shared by teams in different cities. Rows dated on or before
# 'cutoff' receive the 'before' name; rows dated after it receive 'after'.
special_cases = {
    'Hornets': {
        'cutoff': pd.to_datetime('2013-06-18'),
        'before': 'New Orleans Hornets',
        'after': 'Charlotte Hornets'
    }
}

for team, info in special_cases.items():
    cutoff_date = info['cutoff']
    has_nickname = movement_df['Team'] == team
    up_to_cutoff = movement_df['Date'] <= cutoff_date
    past_cutoff = movement_df['Date'] > cutoff_date
    # Rows whose date is NaT satisfy neither comparison and keep the bare nickname.
    movement_df.loc[has_nickname & up_to_cutoff, 'Team'] = info['before']
    movement_df.loc[has_nickname & past_cutoff, 'Team'] = info['after']

# Standardize the remaining names through team_name_dict; wherever the mapping
# yields no value, the existing name is kept.
mapped_teams = movement_df['Team'].map(team_name_dict)
movement_df['Team'] = mapped_teams.fillna(movement_df['Team'])

# Discard rows that still have no team name.
movement_df.dropna(subset=['Team'], inplace=True)

#---------------Clean Data - Section 3: Format Player Names-------------------

# define the preprocessing function for player names
def preprocess_names(series: pd.Series) -> pd.Series:
    """Normalize raw player-name strings to at most two whitespace-separated tokens.

    Steps, in order: missing values become '', everything is cast to str,
    parenthesized text is removed, the suffixes 'Jr'/'Jr.', 'III' and 'IV' are
    removed when they stand alone as a token, periods and apostrophes are
    deleted, slashes become spaces, and finally only the first two tokens
    are kept, joined by a single space.

    Args:
        series: Series of raw name values; may contain NaN/None.

    Returns:
        A Series of strings with the same index. Missing inputs come back
        as '' (never NaN).
    """
    cleaned = series.fillna('').astype(str)

    # Drop anything in parentheses.
    cleaned = cleaned.str.replace(r'\(.*?\)', '', regex=True)

    # Suffixes must be removed BEFORE periods are stripped, otherwise the
    # 'Jr.' form can never match. Word boundaries keep the patterns from
    # eating letters inside a name (e.g. 'IVAN', 'Jrue').
    cleaned = cleaned.str.replace(r'\bJr\b\.?', '', regex=True)
    cleaned = cleaned.str.replace(r'\b(?:III|IV)\b', '', regex=True)

    # Plain-character clean-up; literal replacements need no regex.
    cleaned = cleaned.str.replace('.', '', regex=False)
    cleaned = cleaned.str.replace('/', ' ', regex=False)
    cleaned = cleaned.str.replace("'", '', regex=False)

    # str.split() with no argument discards leading/trailing/repeated
    # whitespace, so no separate strip is needed; '' stays ''.
    return cleaned.apply(lambda name: ' '.join(name.split()[:2]))

# Player-name columns to clean. 'Relinquished' is required; 'Acquired' is
# handled only when the dataset actually has it.
name_columns = ['Relinquished']
if 'Acquired' in movement_df.columns:
    name_columns.append('Acquired')

# apply pre-processing function to the name columns
# preprocess_names fills missing values with '' and returns only strings, so
# no NaN remains afterwards; a follow-up fillna('NaN') would never fire.
# NOTE(review): if player_name_standardizer expects a literal 'NaN' marker for
# missing names, replace '' explicitly here - confirm against that helper.
for column in name_columns:
    movement_df[column] = preprocess_names(movement_df[column])

# import and apply player_name_standardizer with pre-processed names
standardizer_dir = '/Users/nya/NBA project/NBA Project/cleaning'
if standardizer_dir not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(standardizer_dir)
from player_name_standardizer import player_name_standardizer

# Fall back to an empty Series when 'Acquired' is absent. The dtype is given
# explicitly because a bare pd.Series([]) emits a warning on pandas 1.x.
acquired_names = movement_df.get('Acquired', pd.Series([], dtype=object))
player_spelling_dict = player_name_standardizer(player_names_series, movement_df['Relinquished'], acquired_names)

# apply standardized names back to dataframe; names without an entry in
# player_spelling_dict are left unchanged
for column in name_columns:
    movement_df[column] = movement_df[column].apply(lambda x: player_spelling_dict.get(x, x))

#-----------------------Save Cleaned Data-----------------------------

# Folder that receives the cleaned output
save_directory = '/Users/nya/NBA project/NBA Project/01/clean datasets'

# Full path of the cleaned CSV inside that folder
cleaned_data_path = os.path.join(save_directory, 'movement_data_cleaned.csv')

# Write the frame as UTF-8 without the index column, then report where it went
movement_df.to_csv(
    cleaned_data_path,
    encoding='utf-8',
    index=False,
)
print(f"Cleaned dataset saved to {cleaned_data_path}")

# Output: Cleaned dataset saved to /Users/nya/NBA project/NBA Project/01/clean datasets/movement_data_cleaned.csv
