In [5]:
# Lookup table: three-letter team abbreviation -> team ID, stored as a string.
# Used below to translate the opponent abbreviation found in 'MATCHUP'.
# NOTE(review): IDs look like NBA stats API team IDs — confirm against the data source.
team_abbr_to_id = {
    'ATL': '1610612737',
    'BOS': '1610612738',
    'BKN': '1610612751',
    'CHA': '1610612766',
    'CHI': '1610612741',
    'CLE': '1610612739',
    'DAL': '1610612742',
    'DEN': '1610612743',
    'DET': '1610612765',
    'GSW': '1610612744',
    'HOU': '1610612745',
    'IND': '1610612754',
    'LAC': '1610612746',
    'LAL': '1610612747',
    'MEM': '1610612763',
    'MIA': '1610612748',
    'MIL': '1610612749',
    'MIN': '1610612750',
    'NOP': '1610612740',
    'NYK': '1610612752',
    'OKC': '1610612760',
    'ORL': '1610612753',
    'PHI': '1610612755',
    'PHX': '1610612756',
    'POR': '1610612757',
    'SAC': '1610612758',
    'SAS': '1610612759',
    'TOR': '1610612761',
    'UTA': '1610612762',
    'WAS': '1610612764',
}

In [12]:
import os
import pandas as pd
from nba_api.stats.endpoints import playerestimatedmetrics

# Box-score columns whose missing values are replaced by 0 and that get rolling averages.
_ROLLING_STAT_COLUMNS = ('PTS', 'MIN', 'FGA', 'FGM', 'AST', 'REB', 'STL', 'BLK', 'TOV', 'PF')
# Rolling-window lengths, in games.
_ROLLING_WINDOWS = (20, 10, 5, 3)
# Columns accumulated into per-season running totals.
_CUMULATIVE_STAT_COLUMNS = ('PTS', 'MIN', 'FGM', 'FGA', 'AST', 'REB')


def _season_labels(game_dates, season_start_month=10):
    """Return 'YYYY-YY' season labels for a datetime Series of game dates.

    A game played before `season_start_month` belongs to the season that began
    in the previous calendar year (e.g. 2024-01-02 -> '2023-24' with the default).
    """
    start_year = game_dates.dt.year - (game_dates.dt.month < season_start_month).astype(int)
    end_suffix = ((start_year + 1) % 100).astype(str).str.zfill(2)
    return start_year.astype(str) + '-' + end_suffix


def _preprocess_gamelog(gamelog_df, team_id_map, season_start_month):
    """Build the feature frame for one player's game log (no file I/O)."""
    df = gamelog_df.copy()

    # Chronological order is required for the rolling and cumulative features.
    df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'], format='%Y-%m-%d')
    df = df.sort_values('GAME_DATE')

    # Encode the result: 'W' -> 1, 'L' -> 0 (anything else becomes NaN).
    df['WL'] = df['WL'].map({'W': 1, 'L': 0})

    # Missing box-score values are treated as 0.
    stat_columns = list(_ROLLING_STAT_COLUMNS)
    df[stat_columns] = df[stat_columns].fillna(0)

    # The opponent abbreviation is the last whitespace-separated token of
    # 'MATCHUP'. Using the .str accessor means a missing MATCHUP yields a NaN
    # opponent for that row instead of raising and discarding the whole file.
    # Abbreviations absent from the map also become NaN.
    df['OPP_TEAM'] = df['MATCHUP'].str.split().str[-1].map(team_id_map)
    df = df.drop(columns=['MATCHUP'])

    # Rolling means over the last N games. The window INCLUDES the current row.
    # NOTE(review): if a row's own PTS is the training target, shift these
    # features by one game to avoid leakage — confirm how the model consumes them.
    for window in _ROLLING_WINDOWS:
        for column in _ROLLING_STAT_COLUMNS:
            df[f'ROLLING_{column}_{window}_GAMES'] = (
                df[column].rolling(window=window, min_periods=1).mean()
            )

    # Running totals restart at every season boundary.
    df['SEASON_YEAR'] = _season_labels(df['GAME_DATE'], season_start_month)
    season_totals = df.groupby('SEASON_YEAR')[list(_CUMULATIVE_STAT_COLUMNS)].cumsum()
    for column in _CUMULATIVE_STAT_COLUMNS:
        df[f'CUM_{column}_SEASON'] = season_totals[column]

    # Round all numeric statistics to 3 decimal places.
    return df.round(3)


def preprocess_player_gamelogs(players_folder, output_folder, team_id_map=None, season_start_month=10):
    """
    Preprocesses each player's game logs for use in training a machine learning model to predict points in future games.
    Includes original stats, rolling stats (last 20/10/5/3 games) and cumulative season statistics.
    Advanced metrics from the NBA API (PlayerEstimatedMetrics) are NOT merged by this function.

    For every '*.csv' file in `players_folder` a file named
    '<original name without .csv>_preprocessed_stats.csv' is written to `output_folder`.
    A file that is empty or fails to process is reported via print() and skipped,
    so one bad file does not abort the whole batch.

    Parameters:
    - players_folder (str): The directory containing player game logs in CSV files formatted as 'firstname_lastname.csv'.
      Each file needs the columns GAME_DATE ('YYYY-MM-DD'), MATCHUP, WL, PTS, MIN, FGA, FGM, AST, REB, STL, BLK, TOV and PF.
    - output_folder (str): The directory where the preprocessed player statistics will be saved (created if missing).
    - team_id_map (dict | None): Maps opponent abbreviation -> team ID. Defaults to the
      notebook-level `team_abbr_to_id` table when omitted.
    - season_start_month (int): First calendar month counted as part of a new season.
      Games in earlier months are assigned to the season that started the previous year.
      The default of 10 assumes seasons begin in October or later; override it for
      seasons played on a different calendar.

    Returns:
    - None. Results are written to disk.
    """
    if team_id_map is None:
        # Fall back to the lookup table defined in an earlier notebook cell.
        team_id_map = team_abbr_to_id

    os.makedirs(output_folder, exist_ok=True)

    # Loop through each player file in the folder (sorted for a deterministic order).
    for player_file in sorted(os.listdir(players_folder)):
        if not player_file.endswith('.csv'):
            continue

        player_file_path = os.path.join(players_folder, player_file)

        try:
            # Load the player's game log
            player_gamelog_df = pd.read_csv(player_file_path)

            if player_gamelog_df.empty:
                print(f"No game log data found for player file {player_file}. Skipping...")
                continue

            preprocessed_df = _preprocess_gamelog(player_gamelog_df, team_id_map, season_start_month)

            # splitext strips only the final extension, so names containing dots
            # (e.g. 'p.j._tucker.csv') keep their full stem.
            player_filename = os.path.splitext(player_file)[0]
            output_file_name = f'{player_filename}_preprocessed_stats.csv'
            output_file_path = os.path.join(output_folder, output_file_name)
            preprocessed_df.to_csv(output_file_path, index=False)

        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the next file.
            print(f"Error processing {player_file_path}: {e}")

# Run the preprocessing: read every CSV in 'players' and write the results
# to 'preprocessed_players_stats'.
preprocess_player_gamelogs(
    players_folder='players',
    output_folder='preprocessed_players_stats',
)
