# In [1]:
import pandas as pd
import os

#--------------------------User Inputs---------------------------------

# Absolute locations of the two cleaned CSV inputs (edit these to point at
# your own copies of the data).
stats_file_path = '/Users/nya/NBA project/NBA Project/01/clean datasets/player_stats_cleaned.csv'
schedule_file_path = '/Users/nya/NBA project/NBA Project/01/clean datasets/teams_schedule_cleaned.csv'

#-------------------------Load Files-----------------------------------

# Read both CSVs into DataFrames: player stats first, then the team schedule.
player_stats_df, teams_schedule_df = (
    pd.read_csv(csv_path)
    for csv_path in (stats_file_path, schedule_file_path)
)

#-----Feature Engineering - Section 1: Total Minutes and Games Played---------

""" This section calculates the following for each player, for each season (regular/post) and year they were active:
- Total games played in PRIOR seasons (TGP_prior_seasons)
- Total minutes played in PRIOR seasons (TMP_prior_seasons)
- Total minutes played in CURRENT season (TMP_current_season)
"""

# A player can have several rows for the same year and season type, which
# usually happens when the player was traded mid-season. In that case the data
# has one row for the player's time with each team plus one row where
# 'Tm' == 'TOT' holding the totals across all teams for that season. Only the
# totals row should be kept.

# columns that identify one player-season
duplicate_key_columns = ['Year', 'Player', 'Season']

# Decide which row of each duplicate group counts as "first". Rather than
# relying on the 'TOT' row happening to come first in the file, rank 'TOT'
# rows ahead of per-team rows explicitly. mergesort is stable, so rows that
# tie (e.g. groups with no 'TOT' row) keep their original file order.
if 'Tm' in player_stats_df.columns:
    is_not_total_row = player_stats_df['Tm'].ne('TOT')
    totals_first_order = is_not_total_row.sort_values(kind='mergesort').index
    candidate_rows_df = player_stats_df.loc[totals_first_order]
else:
    # no team column available: fall back to plain file order
    candidate_rows_df = player_stats_df

# every row after the first in its ('Year', 'Player', 'Season') group is a duplicate
rows_to_drop_df = candidate_rows_df[candidate_rows_df.duplicated(subset=duplicate_key_columns, keep='first')]
row_indices_to_drop = rows_to_drop_df.index

# drop by index label, in place, so the remaining rows keep their original order
player_stats_df.drop(row_indices_to_drop, inplace=True)

# list of unique player names (kept available in case later code uses it)
unique_player_names = player_stats_df['Player'].unique()

# Accumulate over a chronologically ordered view so that "prior" means earlier
# years even if the CSV rows are not sorted by year. mergesort is stable, so
# rows within the same year keep their file order. Only this temporary view is
# reordered: results are assigned back by index label, so player_stats_df
# keeps its own row order.
chronological_df = player_stats_df.sort_values('Year', kind='mergesort')
player_of_row = chronological_df['Player']

# total games played in prior seasons: the player's running total of games
# minus the games of the current row. A single groupby replaces filtering the
# whole frame once per player. Stored as float (e.g. 82.0) so the saved CSV
# uses the same numeric format as the two minutes columns.
games_running_total = chronological_df['G'].groupby(player_of_row).cumsum()
player_stats_df['TGP_prior_seasons'] = (games_running_total - chronological_df['G']).astype(float)

# minutes played in the current season: per-game value (MPPG) times games played (G)
minutes_current_season = chronological_df['MPPG'] * chronological_df['G']
player_stats_df['TMP_current_season'] = minutes_current_season

# total minutes played in prior seasons: running total minus the current row's minutes
minutes_running_total = minutes_current_season.groupby(player_of_row).cumsum()
player_stats_df['TMP_prior_seasons'] = minutes_running_total - minutes_current_season

#-----------------------Save Data-----------------------------

# directory that receives the processed output file
save_directory = '/Users/nya/NBA project/NBA Project/02/processed datasets'

# Create the directory (and any missing parents) first; otherwise to_csv
# raises when the folder does not exist and the processing above is lost.
os.makedirs(save_directory, exist_ok=True)

# full path of the output CSV
cleaned_data_path = os.path.join(save_directory, 'player_stats_processed.csv')

# write the DataFrame without the index column
player_stats_df.to_csv(cleaned_data_path, index=False)

print(f"Cleaned dataset saved to {cleaned_data_path}")

# Output: Cleaned dataset saved to /Users/nya/NBA project/NBA Project/02/processed datasets/player_stats_processed.csv
