In [1]:
import os
import pandas as pd

def get_player_stats(bbref_id, player_type, game_id):
    """
    Get the player's stats row for the specific game_id, falling back to the
    file's last row when that game is not present.

    Args:
        bbref_id: Player identifier used to build the stats file name.
        player_type: 'batting' reads from the 'batters' directory; any other
            value reads from the 'pitchers' directory.
        game_id: Value compared against the file's 'game_id' column.

    Returns:
        A pandas Series holding the first row whose 'game_id' matches, or the
        last row of the file when no row matches (assumed to be the most
        recent entry -- the file is not re-sorted here). Returns None when the
        stats file does not exist or contains no data rows.
    """
    stats_dir = 'batters' if player_type == 'batting' else 'pitchers'
    stats_file = os.path.join(stats_dir, f'{bbref_id}_stats_{player_type}.csv')

    if not os.path.exists(stats_file):
        print(f"Stats file for {bbref_id} not found ({player_type}).")
        return None

    try:
        stats_df = pd.read_csv(stats_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it like "no rows".
        stats_df = pd.DataFrame()

    if stats_df.empty:
        # Without this guard the iloc[-1] fallback below raises IndexError.
        print(f"Stats file for {bbref_id} has no rows ({player_type}).")
        return None

    game_stats = stats_df[stats_df['game_id'] == game_id]
    if not game_stats.empty:
        return game_stats.iloc[0]

    # No row for this game: fall back to the last row in the file.
    return stats_df.iloc[-1]

def _is_missing_id(value):
    """Return True when a gamelog cell holds no usable player ID."""
    # pandas reads empty CSV cells as NaN, and NaN is truthy, so a plain
    # truthiness test would treat an empty slot as a real player ID.
    return bool(pd.isna(value) or not value)


def _attach_player_stats(game_data, slot, player_type, columns, game_id):
    """Copy one player's stat columns into game_data under '<slot>_<col>' keys.

    Returns False when the '<slot>_bbrefID' entry is absent or empty, True
    otherwise (even if no stats file could be loaded for the player).
    """
    bbref_id = game_data.get(f'{slot}_bbrefID')
    if _is_missing_id(bbref_id):
        return False

    stats = get_player_stats(bbref_id, player_type, game_id)
    if stats is not None:
        for col in columns:
            # Stats missing from the player's file are written as ''.
            game_data[f'{slot}_{col}'] = stats.get(col, '')
    return True


def process_game(game_id):
    """Augment one game's gamelog row with per-player stats and save it.

    Reads the first row of gamelogs/game_<game_id>.csv, looks up stats (via
    get_player_stats) for every listed batter (slots 1-9), pitcher (SP and
    P_2..P_10) and bullpen pitcher (bullpen_1..bullpen_14) of both teams, adds
    them as '<team>_<slot>_<stat>' entries, and writes the resulting single-row
    table to gamelogs/gamestats_<game_id>.csv.

    Args:
        game_id: Identifier used in the gamelog file name and passed on to
            get_player_stats.

    Returns:
        None. Prints a message and returns early when the gamelog file is
        missing or has no rows.
    """
    # Read the gamelog file
    game_file = f'gamelogs/game_{game_id}.csv'
    if not os.path.exists(game_file):
        print(f"Gamelog file for game {game_id} not found.")
        return

    game_df = pd.read_csv(game_file)
    if game_df.empty:
        # iloc[0] below would raise IndexError on a header-only gamelog.
        print(f"Gamelog file for game {game_id} is empty.")
        return
    game_data = game_df.iloc[0].to_dict()

    # Define relevant columns for batters and pitchers
    batter_columns = [
        'AVG_20', 'OBP_20', 'SLG_20', 'OPS_20', 'SB_20', 'CS_20', 'XB_20', 'TB_20', 'SO_20',
        'AVG_10', 'OBP_10', 'SLG_10', 'OPS_10', 'SB_10', 'CS_10', 'XB_10', 'TB_10', 'SO_10',
        'AVG_5', 'OBP_5', 'SLG_5', 'OPS_5', 'SB_5', 'CS_5', 'XB_5', 'TB_5', 'SO_5',
        'AVG_3', 'OBP_3', 'SLG_3', 'OPS_3', 'SB_3', 'CS_3', 'XB_3', 'TB_3', 'SO_3',
    ]
    pitcher_columns = [
        'IP_real_20', 'ERA', 'H_20', 'BF_20', 'HR_20', 'R_20', 'ER_20', 'BB_20', 'SO_20',
        'XB_against_20', 'TB_against_20', 'ERA_20', 'WHIP_20',
        'IP_real_10', 'H_10', 'BF_10', 'HR_10', 'R_10', 'ER_10', 'BB_10', 'SO_10',
        'XB_against_10', 'TB_against_10', 'ERA_10', 'WHIP_10',
        'IP_real_5', 'H_5', 'BF_5', 'HR_5', 'R_5', 'ER_5', 'BB_5', 'SO_5',
        'XB_against_5', 'TB_against_5', 'ERA_5', 'WHIP_5',
        'IP_real_3', 'H_3', 'BF_3', 'HR_3', 'R_3', 'ER_3', 'BB_3', 'SO_3',
        'XB_against_3', 'TB_against_3', 'ERA_3', 'WHIP_3',
    ]
    teams = ['Away', 'Home']

    # Fetch stats for each batter
    for i in range(1, 10):
        for team in teams:
            found = _attach_player_stats(
                game_data, f'{team}_Batter{i}', 'batting', batter_columns, game_id)
            if not found:
                print(f'missing bbrefID for game {game_id}')

    # Fetch stats for each pitcher, then for each bullpen pitcher. Each group
    # is processed for both teams before the next group so the added columns
    # keep the same order as before in the output file.
    pitcher_roles = ['SP'] + [f'P_{i}' for i in range(2, 11)]
    # Adjust the range according to your maximum expected number of bullpen pitchers
    bullpen_roles = [f'bullpen_{i}' for i in range(1, 15)]
    for roles in (pitcher_roles, bullpen_roles):
        for team in teams:
            for role in roles:
                # Empty pitcher slots are expected, so they are skipped silently.
                _attach_player_stats(
                    game_data, f'{team}_{role}', 'pitching', pitcher_columns, game_id)

    # Create a DataFrame from the updated game data
    updated_game_df = pd.DataFrame([game_data])

    # Save the updated game data to a new CSV file
    output_file = f'gamelogs/gamestats_{game_id}.csv'
    updated_game_df.to_csv(output_file, index=False)
    print(f"Processed and saved game stats for game {game_id} to {output_file}")

def process_recent_games(num_recent_games):
    """Run process_game for the last `num_recent_games` rows of game_pks.csv.

    The IDs are taken from the 'game_id' column and handled in file order.
    Prints a message and does nothing else when game_pks.csv is not present
    in the working directory. Always returns None.
    """
    pks_path = 'game_pks.csv'

    if os.path.exists(pks_path):
        id_column = pd.read_csv(pks_path)['game_id']
        # tail() keeps the trailing rows in their original order.
        for pk in id_column.tail(num_recent_games).tolist():
            process_game(pk)
    else:
        print(f"{pks_path} not found.")

# Number of most recent games to process, i.e. how many trailing rows of
# game_pks.csv are handed to process_game.
num_recent_games = 100
# Runs immediately when this cell/module is executed.
process_recent_games(num_recent_games)

Processed and saved game stats for game 746025 to gamelogs/gamestats_746025.csv
Processed and saved game stats for game 745872 to gamelogs/gamestats_745872.csv
Processed and saved game stats for game 745956 to gamelogs/gamestats_745956.csv
Processed and saved game stats for game 746517 to gamelogs/gamestats_746517.csv
Processed and saved game stats for game 745303 to gamelogs/gamestats_745303.csv
Processed and saved game stats for game 746114 to gamelogs/gamestats_746114.csv
Processed and saved game stats for game 747166 to gamelogs/gamestats_747166.csv
Processed and saved game stats for game 745216 to gamelogs/gamestats_745216.csv
Processed and saved game stats for game 746603 to gamelogs/gamestats_746603.csv
Processed and saved game stats for game 746682 to gamelogs/gamestats_746682.csv
Processed and saved game stats for game 745058 to gamelogs/gamestats_745058.csv
Processed and saved game stats for game 746921 to gamelogs/gamestats_746921.csv
Processed and saved game stats for game 