In [16]:
import os
import re
import pandas as pd
from pybaseball import statcast_batter, statcast_pitcher, playerid_lookup, pitching_stats_range, batting_stats_range, schedule_and_record, team_game_logs, playerid_reverse_lookup
from datetime import timedelta, datetime
import statsapi
from statsapi import boxscore_data
import pprint
import openpyxl
import mlbstatsapi

In [14]:
# Ad-hoc fetch of one game's boxscore (gamePk 745333).
# NOTE(review): `gamepk` and `boxscore` are both reassigned on every iteration of the
# game loop in the following cell, so that loop does not depend on this cell's values.
gamepk = 745333
boxscore = boxscore_data(gamePk=gamepk)


In [17]:
# Create the gamelogs directory if it doesn't exist.
# exist_ok=True does the check and the creation in one call, so there is no gap
# between "does it exist?" and "create it" for another process to slip into.
os.makedirs('gamelogs', exist_ok=True)

# Function to extract the batting lineup of one team, including bbref IDs
def extract_team_player_details(team_data, team_prefix, player_count, bbref_lookup=None):
    """Build the ``<prefix>_Batter<n>_*`` columns for one team's lineup.

    Batters are numbered in the order they appear in ``team_data['battingOrder']``,
    so ``Batter<player_count>`` is the first entry of the batting order, the next
    number is the second entry, and so on. Players that are not listed in the
    batting order are skipped.

    Args:
        team_data: One team's boxscore dict. Only ``'players'`` (a mapping whose
            values carry ``['person']['id']`` and ``['person']['fullName']``) and
            ``'battingOrder'`` (a list of player IDs) are read; both default to
            empty when absent.
        team_prefix: Column-name prefix, e.g. ``'Away'`` or ``'Home'``.
        player_count: Number assigned to the first batter that is emitted.
        bbref_lookup: Optional callable taking a player ID and returning the
            bbref ID to store. Defaults to ``get_bbref_id``.

    Returns:
        tuple: ``(columns, next_player_count)`` where ``columns`` maps
        ``'<prefix>_Batter<n>_Name' / '_ID' / '_bbrefID'`` to values and
        ``next_player_count`` is ``player_count`` plus the number of batters emitted.

    Raises:
        KeyError: If a player entry lacks ``'person'``/``'id'``, or a listed
            batter lacks ``'fullName'``.
    """
    if bbref_lookup is None:
        bbref_lookup = get_bbref_id

    # Index the roster by person ID: 'battingOrder' holds IDs, and the keys of
    # the 'players' mapping are not relied upon.
    people_by_id = {}
    for player_data in team_data.get('players', {}).values():
        person = player_data['person']
        people_by_id[person['id']] = person

    columns = {}
    # Walk the batting order itself so the Batter<n> suffix reflects lineup
    # position; iterating the roster mapping would number batters in whatever
    # order that mapping happens to list them. dict.fromkeys drops repeated IDs
    # while preserving order.
    for batter_id in dict.fromkeys(team_data.get('battingOrder', [])):
        person = people_by_id.get(batter_id)
        if person is None:
            # Listed in the batting order but no matching roster entry.
            continue

        column_name = f'{team_prefix}_Batter{player_count}_'
        columns[f'{column_name}Name'] = person['fullName']
        columns[f'{column_name}ID'] = batter_id
        columns[f'{column_name}bbrefID'] = bbref_lookup(batter_id)

        player_count += 1

    return columns, player_count

def get_bbref_id(mlb_id, lookup=None):
    """Return the ``key_bbref`` value for an MLB player ID, or ``''`` if unavailable.

    Args:
        mlb_id: The player's MLB ID, passed to the lookup as ``[mlb_id]`` with
            key type ``'mlbam'``.
        lookup: Optional callable with the same call shape as
            ``playerid_reverse_lookup`` (``lookup(ids, key_type)`` returning a
            DataFrame with a ``'key_bbref'`` column). Defaults to
            ``playerid_reverse_lookup``.

    Returns:
        The first row's ``key_bbref`` value. An empty string is returned when
        the lookup yields no rows, when the value is missing (NaN/None), or
        when the lookup raises; in the last case the error is printed.
    """
    if lookup is None:
        lookup = playerid_reverse_lookup
    try:
        lookup_df = lookup([mlb_id], 'mlbam')
        if lookup_df.empty:
            return ''
        # Positional access: do not assume the frame's index starts at label 0.
        bbref_id = lookup_df['key_bbref'].iloc[0]
    except Exception as e:
        # Best-effort lookup: report the problem and fall back to "no ID".
        print(f"Error retrieving bbref ID for MLB ID {mlb_id}: {e}")
        return ''
    # A matched row can still have no bbref key; report that the same way as
    # every other "not found" case instead of leaking NaN to callers.
    return '' if pd.isna(bbref_id) else bbref_id

# Read the game_pks.csv file and keep only the most recent games.
# Every run re-pulls those games and rewrites their CSVs under gamelogs/.
N_RECENT_GAMES = 50
MAX_BATTERS = 9    # lineup slots written per team
MAX_PITCHERS = 10  # pitcher slots written per team (SP, P_2 .. P_10)
# NOTE: pitchers beyond MAX_PITCHERS are collected but never written, because
# only the columns listed in column_order reach the CSV.

game_pks_df = pd.read_csv('game_pks.csv').tail(N_RECENT_GAMES)
processed_games = 0


def _pitcher_role(slot):
    """Return the column role for a 1-based pitcher slot: 'SP' first, then 'P_<slot>'."""
    return 'SP' if slot == 1 else f'P_{slot}'


def _add_pitcher_columns(team_data, team_prefix, pitchers):
    """Add Name/ID/bbrefID entries for each pitcher row to team_data (modified in place).

    Each pitcher row is expected to provide 'name' and 'personId'.
    """
    for slot, pitcher_data in enumerate(pitchers, start=1):
        role = _pitcher_role(slot)
        pitcher_id = pitcher_data['personId']
        team_data[f'{team_prefix}_{role}_Name'] = pitcher_data['name']
        team_data[f'{team_prefix}_{role}_ID'] = pitcher_id
        team_data[f'{team_prefix}_{role}_bbrefID'] = get_bbref_id(pitcher_id)


def _is_blank(value):
    """Return True for missing values and empty strings only.

    Numeric zero is real data (e.g. a team held to 0 runs) and is not blank.
    """
    return pd.isna(value) or (isinstance(value, str) and value == '')


# Fixed column layout: game info, away lineup, home lineup, away pitchers, home pitchers.
column_order = ['game_date', 'gamepk', 'runs_away', 'runs_home', 'runs_total']
for team in ('Away', 'Home'):
    for slot in range(1, MAX_BATTERS + 1):
        column_order += [f'{team}_Batter{slot}_{field}' for field in ('Name', 'ID', 'bbrefID')]
for team in ('Away', 'Home'):
    for slot in range(1, MAX_PITCHERS + 1):
        column_order += [f'{team}_{_pitcher_role(slot)}_{field}' for field in ('Name', 'ID', 'bbrefID')]

for gamepk in game_pks_df['game_id']:

    filename = f'gamelogs/game_{gamepk}.csv'
    #if os.path.exists(filename):
    #    continue  # Skip processing if the file already exists

    boxscore = boxscore_data(gamePk=gamepk)

    # Extract lineup data for both teams; numbering starts at 1 for each side
    away_team_data, away_player_count = extract_team_player_details(boxscore.get('away', {}), 'Away', 1)
    home_team_data, home_player_count = extract_team_player_details(boxscore.get('home', {}), 'Home', 1)

    # Add the pitching data for both teams and include bbref ID
    away_pitchers = boxscore.get('awayPitchers', [])[1:]  # Skip the first element
    home_pitchers = boxscore.get('homePitchers', [])[1:]  # Skip the first element
    _add_pitcher_columns(away_team_data, 'Away', away_pitchers)
    _add_pitcher_columns(home_team_data, 'Home', home_pitchers)

    # Combine the extracted data into a single dictionary
    combined_data = {**away_team_data, **home_team_data}

    # Add the 'gamepk' column
    combined_data['gamepk'] = gamepk

    # Find the gamepk in the game_pks_df and get the corresponding game_date
    game_date = game_pks_df.loc[game_pks_df['game_id'] == gamepk, 'game_date'].values[0]
    combined_data['game_date'] = game_date

    # Adding runs columns
    runs_home = int(boxscore['homeBattingTotals']['r'])
    runs_away = int(boxscore['awayBattingTotals']['r'])
    combined_data['runs_home'] = runs_home
    combined_data['runs_away'] = runs_away
    combined_data['runs_total'] = runs_home + runs_away

    # Give every expected column a value so unused batter/pitcher slots exist as ''
    for column in column_order:
        combined_data.setdefault(column, '')

    # Keep columns in the fixed order, dropping the ones with no value for this
    # game (unused slots, failed bbref lookups). Only NaN and '' count as empty:
    # a truthiness test here would also drop runs_home / runs_away in a shutout.
    kept_columns = [column for column in column_order if not _is_blank(combined_data[column])]
    df = pd.DataFrame([combined_data])[kept_columns]

    # Save the DataFrame to a CSV file
    df.to_csv(filename, index=False)

    processed_games += 1

    # Print progress every 100 games (never reached while N_RECENT_GAMES < 100)
    if processed_games % 100 == 0:
        print(processed_games)

print("All games have been processed and saved.")

All games have been processed and saved.
