## Run this file once per day

# In [1]:
# ======================================== GAME PKs ================================================

import os
import re
import pandas as pd
from pybaseball import statcast_batter, statcast_pitcher, playerid_lookup, pitching_stats_range, batting_stats_range, schedule_and_record, team_game_logs, pybaseball
from datetime import timedelta, datetime
import statsapi
import pprint

# Reference timestamp for this run; the end date and season derive from it.
today = datetime.now()
# End of the reporting window as YYYY-MM-DD. A hard-coded '2024-05-24'
# override used to follow this line and silently replaced the computed
# value; it was a debugging leftover and has been removed.
end_date = today.strftime('%Y-%m-%d')

season = today.year


def get_game_pks():
    """Download the schedule for every configured season and write game_pks.csv.

    Each season is requested from January 1st. Every season except the last
    one in the list runs through December 31st; the last one runs through
    today's date. One row per game is kept with the fields listed in
    ``data_fields`` and the result is written to ``game_pks.csv`` (with the
    default pandas index column, which later readers of the file see as an
    unnamed first column).
    """
    desired_seasons = [2021, 2022, 2023, 2024]  # Add Desired Seasons Here

    data_fields = ['game_date', 'game_id', 'away_name', 'away_id', 'home_name', 'home_id']

    run_date = datetime.now().strftime('%Y-%m-%d')
    last_position = len(desired_seasons) - 1

    ids_data = []
    for position, year in enumerate(desired_seasons):
        # Only the final season in the list is still in progress.
        season_end = run_date if position == last_position else f'{year}-12-31'
        schedule = statsapi.schedule(start_date=f'{year}-01-01', end_date=season_end)
        ids_data.extend({field: game[field] for field in data_fields} for game in schedule)

    # Build the frame once, after all seasons are collected.
    ids = pd.DataFrame(ids_data, columns=data_fields)
    ids.to_csv('game_pks.csv')


get_game_pks()

print('\n\nGame PKs updated.\nNow creating Player IDs.\n')

# =========================================== PLAYER IDs ===================================================

import pandas as pd
from statsapi import player_stats
from pybaseball import playerid_lookup, batting_stats_range, pitching_stats_range, playerid_reverse_lookup
from datetime import datetime
import csv
import pprint

start_date = '2021-01-01'
start_season = 2021

today = datetime.now()
end_date = today.strftime('%Y-%m-%d')

season = today.year

batting = batting_stats_range(start_date, end_date)
pitching = pitching_stats_range(start_date, end_date)


def _attach_bbref_ids(stats, role):
    """Return a Name/Tm/mlbID frame with a 'key_bbref' column filled in.

    Args:
        stats: DataFrame containing at least 'Name', 'Tm' and 'mlbID'.
        role: Label used in the "not found" message ('Batter' or 'Pitcher').

    Returns:
        An independent copy of the three ID columns plus 'key_bbref'
        (None where the lookup found nothing or failed).
    """
    # .copy() so adding a column never writes into a view of `stats`.
    ids = stats[['Name', 'Tm', 'mlbID']].copy()
    ids['key_bbref'] = None

    for idx, row in ids.iterrows():
        mlb_id = row['mlbID']
        try:
            # Cast to int so the value can match the integer MLBAM keys;
            # the pitcher path already did this, the batter path did not.
            lookup = playerid_reverse_lookup([int(mlb_id)], key_type='mlbam')
            if not lookup.empty:
                ids.at[idx, 'key_bbref'] = lookup.iloc[0]['key_bbref']
            else:
                print(f"No BBref ID found for mlbID: {mlb_id}, {row['Name']} ({role})")
        except Exception as e:
            # Best effort: report the player and keep going.
            print(f"Error processing mlbID: {mlb_id}, Error: {e}")
    return ids


batter_ids = _attach_bbref_ids(batting, 'Batter')

# Ensure mlbID is converted to integers
pitching['mlbID'] = pitching['mlbID'].astype(int)
pitcher_ids = _attach_bbref_ids(pitching, 'Pitcher')

batter_ids.to_csv('batter_ids.csv')
pitcher_ids.to_csv('pitcher_ids.csv')

print('\n\nPlayer IDs updated.\nNow creating gamelogs.\n')

# ========================================= GAME LOGS ===============================================

import requests
import pandas as pd
from datetime import datetime
from pybaseball import playerid_reverse_lookup

# Load game_pks.csv to get team names and IDs
game_pks_df = pd.read_csv('game_pks.csv')

# Mapping of 3-digit team IDs (the home_id / away_id values read from
# game_pks.csv) to 5-digit OddsShark IDs (used to build the OddsShark
# game-log URL).
# NOTE(review): keys 121 and 114 both map to 27014. The other 28 values are
# unique, so one of these two is presumably a typo -- verify both against
# OddsShark before trusting odds for either team.
team_to_oddshark_id = {
    120: 27017, 146: 27022, 139: 27003, 144: 27009, 140: 27002, 117: 27023,
    135: 26996, 143: 26995, 110: 27008, 136: 27011, 121: 27014, 109: 27007,
    108: 26998, 133: 27016, 141: 27010, 114: 27014, 138: 27019, 142: 27005,
    116: 26999, 147: 27001, 137: 26997, 118: 27006, 145: 27018, 115: 27004,
    111: 27021, 119: 27015, 112: 27020, 158: 27012, 113: 27000, 134: 27013
}

# Collected (gamepk, message) tuples; the game-log helpers append to this
# list and it is printed once the game-log loop has finished.
errors = []

def get_game_data(gamepk, timeout=30):
    """Fetch the live-feed JSON for one game from statsapi.mlb.com.

    Args:
        gamepk: Game identifier interpolated into the feed URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    url = f"https://statsapi.mlb.com/api/v1.1/game/{gamepk}/feed/live"
    response = requests.get(url, timeout=timeout)
    return response.json()

def get_bbref_id(mlbam_id):
    """Translate one MLBAM player ID into its 'key_bbref' value.

    Returns the string 'unknown' when the reverse lookup has no row whose
    'key_mlbam' equals ``mlbam_id``.
    """
    try:
        matches = playerid_reverse_lookup([mlbam_id], key_type='mlbam')
        same_player = matches['key_mlbam'] == mlbam_id
        # .values[0] raises IndexError when nothing matched.
        return matches.loc[same_player, 'key_bbref'].values[0]
    except IndexError:
        return 'unknown'

def extract_starting_lineup(game_data, team_side):
    """Return the nine starting batters for one side, ordered by lineup slot.

    A player counts as a starter when 'battingOrder' is an exact multiple of
    100; the slot number is ``battingOrder // 100``.

    Args:
        game_data: Decoded live-feed JSON for a game.
        team_side: 'home' or 'away'.

    Returns:
        A list of exactly nine dicts with keys 'name', 'mlbam_id' and
        'bbref_id'. Slots with no starter found hold empty strings.

    Per-player failures are appended to the module-level ``errors`` list and
    do not stop processing.
    """
    # The error handler used to read a module-level `gamepk` loop variable
    # that is not a parameter here. Take the ID from the payload instead.
    # NOTE(review): assumes the feed has a top-level 'gamePk' key; falls
    # back to 'unknown' when it does not.
    game_id = game_data.get('gamePk', 'unknown')
    players = game_data['liveData']['boxscore']['teams'][team_side]['players']

    lineup = {}
    for player_id, player_info in players.items():
        try:
            if 'battingOrder' not in player_info:
                continue
            batting_order = int(player_info['battingOrder'])
            if batting_order % 100 != 0:
                continue  # substitute, not a starter
            person = player_info['person']
            mlbam_id = person['id']
            lineup[batting_order // 100] = {
                'name': person['fullName'],
                'mlbam_id': mlbam_id,
                'bbref_id': get_bbref_id(mlbam_id),
            }
        except Exception as e:
            errors.append((game_id, f"Error processing player {player_id} in {team_side} lineup: {str(e)}"))

    # Ensure lineup is filled and sorted by batting order
    return [lineup.get(slot, {'name': '', 'mlbam_id': '', 'bbref_id': ''}) for slot in range(1, 10)]

def get_pitchers(game_data, team_side):
    """List the pitchers recorded for one side, in the order the feed gives them.

    Args:
        game_data: Decoded live-feed JSON for a game.
        team_side: 'home' or 'away'.

    Returns:
        A list of dicts with keys 'name', 'mlbam_id', 'bbref_id' and 'order'
        (1-based position in the team's 'pitchers' list).

    Per-pitcher failures are appended to the module-level ``errors`` list.
    """
    # Previously the handler relied on a module-level `gamepk` variable.
    # NOTE(review): assumes the feed has a top-level 'gamePk' key.
    game_id = game_data.get('gamePk', 'unknown')
    team = game_data['liveData']['boxscore']['teams'][team_side]

    pitchers = []
    for order, pitcher_id in enumerate(team['pitchers'], start=1):
        try:
            person = team['players'][f'ID{pitcher_id}']['person']
            mlbam_id = person['id']
            pitchers.append({
                'name': person['fullName'],
                'mlbam_id': mlbam_id,
                'bbref_id': get_bbref_id(mlbam_id),
                'order': order,
            })
        except Exception as e:
            errors.append((game_id, f"Error processing pitcher {pitcher_id} in {team_side} team: {str(e)}"))
    return pitchers

def get_bullpen(game_data, team_side):
    """List the bullpen pitchers for one side.

    The result covers the team's 'bullpen' entries followed by every entry of
    'pitchers' after the first. When 'pitchers' is empty that second part is
    empty too, so one code path serves both played and unplayed games.

    Args:
        game_data: Decoded live-feed JSON for a game.
        team_side: 'home' or 'away'.

    Returns:
        A list of dicts with keys 'name', 'mlbam_id' and 'bbref_id'.

    Per-pitcher failures are appended to the module-level ``errors`` list.
    """
    # Previously the handler relied on a module-level `gamepk` variable.
    # NOTE(review): assumes the feed has a top-level 'gamePk' key.
    game_id = game_data.get('gamePk', 'unknown')
    team = game_data['liveData']['boxscore']['teams'][team_side]

    # Skip the first listed pitcher; everyone after counts as bullpen.
    used_after_first = (team['pitchers'] or [])[1:]

    bullpen = []
    for pitcher_id in team['bullpen'] + used_after_first:
        try:
            person = team['players'][f'ID{pitcher_id}']['person']
            mlbam_id = person['id']
            bullpen.append({
                'name': person['fullName'],
                'mlbam_id': mlbam_id,
                'bbref_id': get_bbref_id(mlbam_id),
            })
        except Exception as e:
            errors.append((game_id, f"Error processing bullpen pitcher {pitcher_id} in {team_side} team: {str(e)}"))
    return bullpen

def create_game_dataframe(gamepk):
    """Build a one-row DataFrame describing a single game.

    The row combines schedule fields from game_pks.csv, the final runs from
    the live feed, OddsShark team IDs, both starting lineups, both starting
    pitchers, both bullpens and the remaining pitchers used.

    Args:
        gamepk: Game identifier, as stored in the 'game_id' column.

    Returns:
        A one-row DataFrame, or an empty DataFrame when the feed cannot be
        fetched, the lineups or pitchers cannot be extracted, or the game is
        absent from game_pks.csv. Every failure is recorded in ``errors``.
    """
    # A network or JSON failure for one game must not abort the whole run.
    try:
        game_data = get_game_data(gamepk)
    except Exception as e:
        errors.append((gamepk, f"Error fetching game data: {str(e)}"))
        return pd.DataFrame()

    try:
        home_lineup = extract_starting_lineup(game_data, 'home')
        away_lineup = extract_starting_lineup(game_data, 'away')
        home_bullpen = get_bullpen(game_data, 'home')
        away_bullpen = get_bullpen(game_data, 'away')
    except Exception as e:
        errors.append((gamepk, f"Error extracting lineups: {str(e)}"))
        return pd.DataFrame()

    try:
        home_pitchers = get_pitchers(game_data, 'home')
        away_pitchers = get_pitchers(game_data, 'away')
    except Exception as e:
        errors.append((gamepk, f"Error extracting pitchers: {str(e)}"))
        return pd.DataFrame()

    # Get additional game information; guard against a missing row instead
    # of letting .iloc[0] raise IndexError.
    matching_games = game_pks_df[game_pks_df['game_id'] == gamepk]
    if matching_games.empty:
        errors.append((gamepk, "Game not found in game_pks.csv"))
        return pd.DataFrame()
    game_info = matching_games.iloc[0]

    try:
        game_date = game_info['game_date']
    except KeyError:
        game_date = 'unknown'
        errors.append((gamepk, "Error getting game date"))

    linescore_error = {'home': "Error getting home runs", 'away': "Error getting away runs"}
    runs = {}
    for side in ('home', 'away'):
        try:
            runs[side] = game_data['liveData']['linescore']['teams'][side]['runs']
        except KeyError:
            # No score in the feed: record 0 and log it.
            runs[side] = 0
            errors.append((gamepk, linescore_error[side]))
    runs_home, runs_away = runs['home'], runs['away']
    runs_total = runs_home + runs_away

    # Get team information from game_pks.csv
    home_id = game_info['home_id']
    away_id = game_info['away_id']
    home_name = game_info['home_name']
    away_name = game_info['away_name']

    # Check for invalid team IDs
    if home_id not in team_to_oddshark_id or away_id not in team_to_oddshark_id:
        print(f"Invalid team IDs - {home_name} {home_id} vs {away_name} {away_id}")

    game_record = {
        'game_id': gamepk, 'game_date': game_date, 'runs_home': runs_home, 'runs_away': runs_away, 'runs_total': runs_total,
        'home_id': home_id, 'home_name': home_name, 'away_id': away_id, 'away_name': away_name,
        'home_oddshark_id': team_to_oddshark_id.get(home_id, 'unknown'),
        'away_oddshark_id': team_to_oddshark_id.get(away_id, 'unknown'),
    }

    def add_player(prefix, player):
        """Write the Name/ID/bbrefID triple for one player under `prefix`."""
        game_record[f'{prefix}_Name'] = player['name']
        game_record[f'{prefix}_ID'] = player['mlbam_id']
        game_record[f'{prefix}_bbrefID'] = player['bbref_id']

    # Insertion order below fixes the CSV column order; keep it stable.
    for i, player in enumerate(away_lineup, start=1):
        add_player(f'Away_Batter{i}', player)
    for i, player in enumerate(home_lineup, start=1):
        add_player(f'Home_Batter{i}', player)

    # Add starting pitchers
    if home_pitchers:
        add_player('Home_SP', home_pitchers[0])
    if away_pitchers:
        add_player('Away_SP', away_pitchers[0])

    # Add bullpen pitchers
    for i, pitcher in enumerate(home_bullpen, start=1):
        add_player(f'Home_bullpen_{i}', pitcher)
    for i, pitcher in enumerate(away_bullpen, start=1):
        add_player(f'Away_bullpen_{i}', pitcher)

    # Add remaining pitchers (numbering starts at 2; slot 1 is the starter)
    for i, pitcher in enumerate(home_pitchers[1:], start=2):
        add_player(f'Home_P_{i}', pitcher)
    for i, pitcher in enumerate(away_pitchers[1:], start=2):
        add_player(f'Away_P_{i}', pitcher)

    return pd.DataFrame([game_record])

def save_game_to_csv(gamepk):
    """Write the one-row game record for `gamepk` to gamelogs/game_<gamepk>.csv.

    Nothing is written when create_game_dataframe returns an empty frame; a
    message is printed instead.
    """
    game_df = create_game_dataframe(gamepk)
    if game_df.empty:
        print(f"No data exported for game {gamepk}")
        return

    # Create the output folder on first use so a fresh checkout does not
    # fail on the very first write.
    os.makedirs('gamelogs', exist_ok=True)
    file_name = f'gamelogs/game_{gamepk}.csv'
    game_df.to_csv(file_name, index=False)

all_games = pd.read_csv('game_pks.csv')['game_id']

# Only the 100 most recent schedule rows are (re)exported on each run.
gamepks = all_games.tail(100)
count = 0
# NOTE(review): helper error handlers may read the global name `gamepk`;
# do not rename the loop variable without checking them.
for count, gamepk in enumerate(gamepks, start=1):
    save_game_to_csv(gamepk)
    if count % 10 == 0:
        print(count)  # progress marker every ten games

# Print collected errors
if not errors:
    print("No errors encountered creating gamelogs.")
else:
    print("Errors encountered:")
    for error in errors:
        print(error)

print('\n\nGame logs updated.\nNow collecting odds data.\n')

# ==================================== ODDS DATA =======================================================================

import pandas as pd
from datetime import datetime

def fetch_over_under_runline(oddshark_id, game_date):
    """Look up the 'Total' value for one team on one date from OddsShark.

    Args:
        oddshark_id: Team ID interpolated into the OddsShark game-log URL.
        game_date: Object with a ``.year`` attribute that is also compared
            against the parsed 'Date' column.

    Returns:
        A 4-tuple ``(value, dup_id, dup_year, dup_date)``:
        * ``('unknown', None, None, None)`` when the page cannot be read, the
          table is empty, or no row matches the date;
        * ``('', oddshark_id, year, game_date)`` when more than one row
          matches the date;
        * ``(total, None, None, None)`` otherwise.
    """
    not_available = ('unknown', None, None, None)

    year = game_date.year
    url = f"https://www.oddsshark.com/stats/gamelog/baseball/mlb/{oddshark_id}?season={year}"

    try:
        df = pd.read_html(url)[0]
    except Exception as e:
        print(f"BAD - error for team {oddshark_id} on date {game_date}: {e}")
        return not_available

    if df.empty:
        print(f"BAD - No data in table for team {oddshark_id} on date {game_date}")
        return not_available

    df['Date'] = pd.to_datetime(df['Date'], format='%b %d, %Y')
    same_day = df[df['Date'] == game_date]

    # Two rows on one date cannot be told apart here; report them upward.
    if len(same_day) > 1:
        print(f"DOUBLEHEADER on {game_date}")
        return '', oddshark_id, year, game_date

    if same_day.empty:
        print(f"BAD - No matching date found for team {oddshark_id} on date {game_date}")
        return not_available

    return same_day.iloc[0]['Total'], None, None, None

def update_gamelogs_with_over_under(game_pks_file, gamelogs_folder):
    """Add an 'over_under_runline' column to the most recent game-log CSVs.

    For each of the last 50 game IDs in `game_pks_file`, the matching
    ``game_<id>.csv`` in `gamelogs_folder` is read, the home team's value is
    fetched via fetch_over_under_runline, and the file is rewritten in place.
    Dates with more than one matching row are collected and listed at the end.

    Args:
        game_pks_file: Path to the CSV containing a 'game_id' column.
        gamelogs_folder: Folder holding the per-game CSV files.
    """
    game_pks_df = pd.read_csv(game_pks_file)
    game_pks = game_pks_df['game_id'].tail(50)  # Set the number of recent games to do
    duplicates = []

    for count, game_id in enumerate(game_pks, start=1):
        try:
            gamelog_file = f'{gamelogs_folder}/game_{game_id}.csv'
            gamelog_df = pd.read_csv(gamelog_file)

            home_oddshark_id = gamelog_df.loc[0, 'home_oddshark_id']
            game_date = datetime.strptime(gamelog_df.loc[0, 'game_date'], '%Y-%m-%d')

            over_under_runline, duplicate_id, duplicate_year, duplicate_date = fetch_over_under_runline(home_oddshark_id, game_date)

            if duplicate_id:
                duplicates.append((duplicate_id, duplicate_year, duplicate_date))

            gamelog_df['over_under_runline'] = over_under_runline

            gamelog_df.to_csv(gamelog_file, index=False)
            print(f"Updated {gamelog_file} with over/under runline.")
        except Exception as e:
            # Best effort per game: report and continue with the next one.
            print(f"Error updating gamelog for game_id {game_id}: {e}")

        # Progress marker every 100 games. The original test `count % 100`
        # was inverted and printed on every count except multiples of 100.
        if count % 100 == 0:
            print(count)

    print("\nGames with duplicate dates:")
    for dup in duplicates:
        print(f"Team Oddshark ID: {dup[0]}, Year: {dup[1]}, Date: {dup[2].strftime('%Y-%m-%d')}")


# Update the paths as necessary
game_pks_file = 'game_pks.csv'
gamelogs_folder = 'gamelogs'

# Run the update function
update_gamelogs_with_over_under(game_pks_file, gamelogs_folder)

print('\n\nOdds Data updated.\nNow updating player stats.\n')

# ================================== PLAYER STATS ===================================================

import requests
from bs4 import BeautifulSoup
import pandas as pd
import pprint
import re 
from dateutil import parser
import time
from datetime import date, datetime, timedelta
from pybaseball import batting_stats_range, pitching_stats_range, playerid_reverse_lookup
import statsapi
import os

def _fetch_game_log(player_id, year, table_type, table_id, role):
    """Download and tidy one Baseball-Reference game-log table.

    Shared implementation behind fetch_b_game_log and fetch_p_game_log.

    Args:
        player_id: Value for the ``id`` query parameter.
        year: Season requested; also appended to each 'Date' value.
        table_type: Value for the ``t`` query parameter ('b' or 'p').
        table_id: HTML ``id`` of the table to read.
        role: 'batter' or 'pitcher', used only in printed messages.

    Returns:
        A DataFrame with the repeated header rows removed and two added
        columns, 'dbl' (the number found in parentheses in 'Date', else NaN)
        and 'game_date' (YYYY-MM-DD string, NaN when unparseable); or None
        when the request fails or the table is absent.
    """
    from io import StringIO  # local import keeps this block self-contained

    url = f'https://www.baseball-reference.com/players/gl.fcgi?id={player_id}&t={table_type}&year={year}'
    try:
        # A timeout stops one stalled request from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        print(f" BAD - Failed to fetch data for {role} {player_id} in {year}")
        return None

    # Check if the request was successful
    if response.status_code != 200:
        print(f" BAD - Failed to fetch data for {role} {player_id} in {year}")
        return None

    # Parse the HTML content and find the table containing the game logs
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': table_id})
    if table is None:
        print(f"No data found for {role} {player_id} in {year} - OK")
        return None

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(table)))[0]

    # Remove rows where 'Rk' is not a number (header rows that repeat in the
    # table). .copy() makes the column assignments below act on an
    # independent frame rather than a filtered view.
    df = df[pd.to_numeric(df['Rk'], errors='coerce').notnull()].copy()

    # Append the year to dates that carry no "(n)" marker
    df['Date'] = df['Date'].apply(lambda x: f"{x}, {year}" if '(' not in str(x) else x)

    # Extract the value from parentheses (if present) into a new column 'dbl'
    df['dbl'] = df['Date'].str.extract(r'\((\d+)\)').astype(float)

    # Append the year to the marked dates as well
    df.loc[df['dbl'].notnull(), 'Date'] = df['Date'] + ', ' + str(year)

    # Format 'Date' to 'game_date' in YYYY-MM-DD format
    df['game_date'] = pd.to_datetime(df['Date'], errors='coerce').dt.strftime('%Y-%m-%d')

    return df


def fetch_b_game_log(player_id, year):
    """Return the batting game log for `player_id` in `year`, or None."""
    return _fetch_game_log(player_id, year, 'b', 'batting_gamelogs', 'batter')


def fetch_p_game_log(player_id, year):
    """Return the pitching game log for `player_id` in `year`, or None."""
    return _fetch_game_log(player_id, year, 'p', 'pitching_gamelogs', 'pitcher')

# Function to clean and parse dates
def clean_date(date_str, year):
    """Normalise a scraped date string to 'YYYY-MM-DD' in the given year.

    Args:
        date_str: Raw date text; may contain non-breaking spaces, control
            characters, parenthesised markers such as "(1)" and the word
            "susp".
        year: Year forced onto the parsed date.

    Returns:
        The formatted date string, or None when anything goes wrong (the
        error is printed).
    """
    def is_kept(ch):
        # Letters, digits, whitespace and commas survive the filter.
        return ch.isalnum() or ch.isspace() or ch == ','

    try:
        # Non-breaking spaces become ordinary spaces.
        date_str = date_str.replace('\xa0', ' ')
        # Drop control / non-printable characters.
        date_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', date_str)
        # Drop anything in parentheses.
        date_str = re.sub(r'\(.*?\)', '', date_str)
        date_str = ''.join(filter(is_kept, date_str))
        # Drop the "susp" marker and surrounding whitespace.
        date_str = date_str.replace('susp', '').strip()
        # Parse, then force the year to the requested one.
        forced = parser.parse(date_str).replace(year=year)
        return forced.strftime('%Y-%m-%d')
    except Exception as e:
        # Report the (partially cleaned) input and signal failure with None.
        print(f"Error parsing date '{date_str}': {e}")
        return None

# Run date for the player-stats section
today = date.today()

# Same date as a YYYY-MM-DD string (ISO format of a date object)
end_date = today.isoformat()

def get_active_player_ids(game_data):
    """Collect the distinct batter and pitcher IDs appearing in the given games.

    Args:
        game_data: Iterable of dicts, each with a 'game_id' key.

    Returns:
        ``(batters, pitchers)`` -- two lists of unique values taken from the
        'batters' and 'pitchers' entries of each side's boxscore data.
    """
    batter_pool, pitcher_pool = set(), set()  # sets drop repeats across games

    for game in game_data:
        boxscore = statsapi.boxscore_data(game['game_id'])

        for side in ('away', 'home'):
            if side not in boxscore:
                continue
            side_data = boxscore[side]
            for key, pool in (('batters', batter_pool), ('pitchers', pitcher_pool)):
                if key in side_data:
                    pool.update(side_data[key])

    return list(batter_pool), list(pitcher_pool)

# Schedule window: from two days ago through today
window_start = (today - timedelta(days=2)).strftime('%Y-%m-%d')
recent_games = statsapi.schedule(start_date=window_start, end_date=end_date)

# Batters and pitchers that appear in those games
active_batter_ids, active_pitcher_ids = get_active_player_ids(recent_games)

# Use playerid_reverse_lookup to get bbref_id
def get_bbref_ids(player_ids):
    """Return the 'key_mlbam' / 'key_bbref' columns for the given MLBAM IDs."""
    wanted_columns = ['key_mlbam', 'key_bbref']
    lookup = playerid_reverse_lookup(player_ids, key_type='mlbam')
    return lookup[wanted_columns]

# Resolve bbref IDs for the active batters and pitchers
active_batter_data = get_bbref_ids(active_batter_ids)
active_pitcher_data = get_bbref_ids(active_pitcher_ids)

# Persist both lookups
active_batter_data.to_csv('active_batter_ids.csv', index=False)
active_pitcher_data.to_csv('active_pitcher_ids.csv', index=False)

# Reload just the bbref column of each file; from here on the
# active_*_ids names hold bbref IDs, not MLBAM IDs.
active_batter_ids = pd.read_csv('active_batter_ids.csv').loc[:, 'key_bbref']
active_pitcher_ids = pd.read_csv('active_pitcher_ids.csv').loc[:, 'key_bbref']

# Schedule table used to attach game IDs to player rows
game_pks = pd.read_csv('game_pks.csv')

# Mapping from team abbreviations (as they appear in the 'Tm' / 'Opp' columns
# of the game logs) to numeric team IDs. Some teams appear under two
# spellings (SD/SDP, CWS/CHW) that map to the same ID.
team_id_mapping = {
    'WSN': 120, 'MIA': 146, 'TBR': 139, 'ATL': 144, 'TEX': 140, 'HOU': 117,
    'SD': 135, 'SDP': 135, 'PHI': 143, 'BAL': 110, 'SEA': 136, 'NYM': 121,
    'ARI': 109, 'LAA': 108, 'OAK': 133, 'TOR': 141, 'CLE': 114, 'STL': 138,
    'MIN': 142, 'DET': 116, 'NYY': 147, 'SFG': 137, 'KCR': 118, 'CWS': 145,
    'CHW': 145, 'COL': 115, 'BOS': 111, 'LAD': 119, 'CHC': 112, 'MIL': 158,
    'CIN': 113, 'PIT': 134
}

# Season whose game logs are fetched. Derived from the run date so this
# once-a-day script keeps working after the calendar year changes (it was
# hard-coded to 2024).
current_year = date.today().year

# Function to process and save player data
def process_player_data(player_ids, player_type='batter'):
    """Fetch current-season game logs and merge them into per-player CSVs.

    For every non-empty ID the function downloads the ``current_year`` game
    log, normalises the dates, maps team abbreviations to IDs, attaches the
    matching 'game_id' from the module-level ``game_pks`` table, appends the
    rows to the player's existing CSV (if any) and drops rows whose 'game_id'
    is already present.

    Args:
        player_ids: Iterable of player IDs; falsy or NaN entries are skipped.
        player_type: 'batter' selects the batting fetcher and the
            ``batters/<id>_batting.csv`` path; 'pitcher' uses
            ``pitchers/<id>_pitching.csv``. Any other value uses the pitching
            fetcher.
    """
    if player_type == 'batter':
        fetch_game_log = fetch_b_game_log
        folder, stem = 'batters', 'batting'
    else:
        fetch_game_log = fetch_p_game_log
        if player_type == 'pitcher':
            folder, stem = 'pitchers', 'pitching'
        else:
            folder, stem = f'{player_type}s', f'{player_type}ing'

    # Make sure the output folder exists before the first write.
    os.makedirs(folder, exist_ok=True)

    # Loop-invariant: convert the schedule dates once, not once per player.
    game_pks['game_date'] = pd.to_datetime(game_pks['game_date'])

    for player_id in player_ids:
        if not player_id or pd.isna(player_id):
            continue

        # Load the existing player data if it exists
        player_file_path = f'{folder}/{player_id}_{stem}.csv'
        if os.path.exists(player_file_path):
            player_df = pd.read_csv(player_file_path)
        else:
            player_df = pd.DataFrame()

        # Fetch data for the current year
        new_data_df = fetch_game_log(player_id, current_year)
        time.sleep(0.2)  # brief pause between requests

        # Skip if no data available
        if new_data_df is None or new_data_df.empty:
            continue

        # Clean the raw dates; keep a datetime copy in 'Date' for matching
        new_data_df['game_date'] = new_data_df['Date'].apply(lambda raw: clean_date(raw, current_year))
        new_data_df['Date'] = pd.to_datetime(new_data_df['game_date'])

        # Map the team abbreviations to team IDs
        new_data_df['team_id'] = new_data_df['Tm'].map(team_id_mapping)
        new_data_df['opp_id'] = new_data_df['Opp'].map(team_id_mapping)

        # Initialize a new column in new_data_df for game_id
        new_data_df['game_id'] = None

        # Find the schedule row(s) for each game-log row
        for index, row in new_data_df.iterrows():
            same_date = game_pks['game_date'] == row['Date']
            as_home = (game_pks['home_id'] == row['team_id']) & (game_pks['away_id'] == row['opp_id'])
            as_away = (game_pks['home_id'] == row['opp_id']) & (game_pks['away_id'] == row['team_id'])
            game_day_matches = game_pks[same_date & (as_home | as_away)]

            if game_day_matches.empty:
                print(f"BAD - NO GAME MATCHES FOUND for {player_id} on {row['Date']}")
                continue

            # Only a row marked as game 2 with a second schedule match takes
            # the second game; every other case takes the first match.
            take_second = row['dbl'] == 2 and len(game_day_matches) > 1
            new_data_df.at[index, 'game_id'] = game_day_matches.iloc[1 if take_second else 0]['game_id']

        # Concatenate the new data with the existing data, ensuring no duplicates.
        # NOTE(review): rows left without a game_id all share a null key, so
        # drop_duplicates keeps only the first of them -- confirm that is intended.
        if not player_df.empty:
            combined_df = pd.concat([player_df, new_data_df]).drop_duplicates(subset=['game_id'])
        else:
            combined_df = new_data_df

        # Save the updated player data to a CSV file
        combined_df.to_csv(player_file_path, index=False)

    print(f'All {player_type} IDs processed and saved')


# Process batter and pitcher data
process_player_data(active_batter_ids, player_type='batter')
process_player_data(active_pitcher_ids, player_type='pitcher')

print('\n\nPlayer stats updated.\nNow generating custom stats.\n')

# =================================================== CUSTOM STATS ==============================================

import pandas as pd
import numpy as np
import os

# Batter ID table written earlier in the run; only the bbref key is needed
idlist = pd.read_csv('batter_ids.csv')
batter_ids = idlist['key_bbref']

# Schedule table
game_pks = pd.read_csv('game_pks.csv')

# Define function to create an empty DataFrame with the correct structure
def create_empty_stats_df():
    """Return a zero-row DataFrame carrying the batting game-log column layout."""
    game_log_columns = [
        'Rk', 'Gcar', 'Gtm', 'Date', 'Tm', 'Unnamed: 5', 'Opp', 'Rslt', 'Inngs',
        'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO',
        'HBP', 'SH', 'SF', 'ROE', 'GDP', 'SB', 'CS',
        'BA', 'OBP', 'SLG', 'OPS', 'BOP', 'aLI', 'WPA', 'acLI', 'cWPA', 'RE24',
        'DFS(DK)', 'DFS(FD)', 'Pos',
        'dbl', 'game_date', 'team_id', 'opp_id', 'game_id',
    ]
    return pd.DataFrame(columns=game_log_columns)


# ---- Batter stat helpers (defined once, not once per player) ----

# Clean non-numeric values in numeric columns
def clean_numeric(value):
    """Strip non-breaking spaces, parentheses and commas, then convert to float (NaN on failure)."""
    try:
        value = str(value).replace('\xa0', '').replace('(', '').replace(')', '').replace(',', '')
        return float(value)
    except ValueError:
        return np.nan


def calculate_avg(df):
    """Batting average: hits per at-bat."""
    return df['H'] / df['AB']


def calculate_obp(df):
    """On-base percentage: (H + BB + HBP) / (AB + BB + HBP + SF)."""
    return (df['H'] + df['BB'] + df['HBP']) / (df['AB'] + df['BB'] + df['HBP'] + df['SF'])


def calculate_extra_base_hits(df):
    """Doubles + triples + home runs."""
    return df['2B'] + df['3B'] + df['HR']


def calculate_total_bases(df):
    """Total bases: H already counts every hit once, so add the extra bases only."""
    return df['H'] + df['2B'] + 2*df['3B'] + 3*df['HR']


def calculate_slg(df):
    """Slugging: total bases per at-bat.

    The previous formula (H + 2*2B + 3*3B + 4*HR) / AB counted every
    extra-base hit twice, because H already includes doubles, triples and
    home runs. It now shares calculate_total_bases.
    """
    return calculate_total_bases(df) / df['AB']


def calculate_ops(df):
    """On-base plus slugging."""
    return calculate_obp(df) + calculate_slg(df)


def calculate_rolling_stats(df, window, suffix):
    """Rate and counting stats over a trailing window of rows.

    Sums each numeric column over the last `window` rows (fewer at the
    start), derives the rate stats from those sums, and suffixes every
    output column with ``_<suffix>``. Values are rounded to 3 decimals.
    """
    rolling_df = df.rolling(window=window, min_periods=1).sum()
    rolling_df['AVG'] = calculate_avg(rolling_df)
    rolling_df['OBP'] = calculate_obp(rolling_df)
    rolling_df['SLG'] = calculate_slg(rolling_df)
    rolling_df['OPS'] = calculate_ops(rolling_df)
    rolling_df['XB'] = calculate_extra_base_hits(rolling_df)
    rolling_df['TB'] = calculate_total_bases(rolling_df)
    rolling_df = rolling_df[['AVG', 'OBP', 'SLG', 'OPS', 'SB', 'CS', 'XB', 'TB', 'SO']]
    rolling_df.columns = [f'{col}_{suffix}' for col in rolling_df.columns]

    # Round the stats to 3 decimal points
    return rolling_df.round(3)


# Define columns to convert to numeric
numeric_columns = ['PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'ROE', 'GDP', 'SB', 'CS', 'DFS(DK)', 'DFS(FD)']

for player_id in batter_ids:

    if not player_id or pd.isna(player_id):
        continue

    file_path = f'batters/{player_id}_batting.csv'

    # batter_ids.csv lists every batter in the date range, but a game-log
    # file only exists for players whose logs were downloaded. Skip the rest
    # instead of crashing on the first missing file.
    if not os.path.exists(file_path):
        continue

    df = pd.read_csv(file_path)

    # Remove the irrelevant column 'Gtm'
    df = df.drop(columns=['Gtm'])

    # Ensure the 'game_date' column is in datetime format
    df['game_date'] = pd.to_datetime(df['game_date'])

    # Extract the year from the 'game_date' column
    df['season'] = df['game_date'].dt.year

    # Clean the numeric columns, fill gaps with 0 and force float dtype
    for col in numeric_columns:
        df[col] = df[col].apply(clean_numeric)
    df[numeric_columns] = df[numeric_columns].fillna(0).astype(float)

    # Exclude non-numeric columns from rolling stats calculation
    rolling_df = df[numeric_columns].copy()

    # Trailing 20/10/5/3-game stats, shifted by one row so each row only
    # sees games played before it
    rolling_stats_20 = calculate_rolling_stats(rolling_df, 20, '20').shift(1).fillna(0)
    rolling_stats_10 = calculate_rolling_stats(rolling_df, 10, '10').shift(1).fillna(0)
    rolling_stats_5 = calculate_rolling_stats(rolling_df, 5, '5').shift(1).fillna(0)
    rolling_stats_3 = calculate_rolling_stats(rolling_df, 3, '3').shift(1).fillna(0)

    # Season-to-date stats for every season present in the file (not a
    # hard-coded 2021-2024 range), shifted by one row within each season
    season_columns = ['AVG', 'OBP', 'SLG', 'OPS', 'SB', 'CS', 'XB', 'TB', 'SO']
    season_frames = []
    for _, season_rows in df.groupby('season'):
        season_cumsum = season_rows[numeric_columns].cumsum().shift(1).fillna(0)
        season_cumsum['AVG'] = calculate_avg(season_cumsum)
        season_cumsum['OBP'] = calculate_obp(season_cumsum)
        season_cumsum['SLG'] = calculate_slg(season_cumsum)
        season_cumsum['OPS'] = calculate_ops(season_cumsum)
        season_cumsum['XB'] = calculate_extra_base_hits(season_cumsum)
        season_cumsum['TB'] = calculate_total_bases(season_cumsum)
        season_cumsum = season_cumsum[season_columns]
        season_cumsum.columns = [f'{col}_current' for col in season_cumsum.columns]
        season_frames.append(season_cumsum)

    # Each season frame keeps its original row labels, so reindexing aligns
    # by label. Overwriting the index positionally (as before) misaligned
    # rows whenever the file was not grouped by season and raised when a
    # row fell outside the hard-coded years.
    if season_frames:
        season_stats = pd.concat(season_frames).reindex(df.index)
    else:
        season_stats = pd.DataFrame(index=df.index)

    # Combine all the stats into a single dataframe
    final_df = pd.concat([df, rolling_stats_20, rolling_stats_10, rolling_stats_5, rolling_stats_3, season_stats], axis=1)

    # Round the combined dataframe stats to 3 decimal points
    final_df = final_df.round(3)

    # Display the combined dataframe
    print(final_df.tail())

    # Save the combined stats to a CSV file
    final_df.to_csv(f'batters/{player_id}_stats_batting.csv', index=False)

    print(f"Generated stats for {player_id} and saved to CSV file.")

import pandas as pd
import numpy as np
import os

# Function to clean numeric values
def clean_numeric(value):
    """Convert a raw cell to a float, returning NaN when it cannot be parsed.

    The value is stringified and stripped of non-breaking spaces,
    parentheses and commas before the float conversion is attempted.
    """
    try:
        text = str(value)
        # Remove the formatting characters one at a time.
        for junk in ('\xa0', '(', ')', ','):
            text = text.replace(junk, '')
        return float(text)
    except ValueError:
        return np.nan

# Function to convert IP notation to real numbers
def convert_ip_to_real(ip):
    """Convert box-score innings-pitched notation to a real number of innings.

    In the notation handled here the digit after the decimal point counts
    thirds of an inning: ``5.1`` becomes 5 + 1/3 and ``5.2`` becomes
    5 + 2/3. Any other fractional part is dropped and only the whole
    innings are returned.

    Args:
        ip: Innings pitched as a number or string; may be missing (NaN/None).

    Returns:
        A float number of innings, or NaN when ``ip`` is missing.

    Raises:
        ValueError: If ``ip`` is not numeric text.
    """
    if pd.isna(ip):
        return np.nan

    ip_str = str(ip)
    if '.' not in ip_str:
        return float(ip)

    parts = ip_str.split('.')
    # An empty side of the dot ('5.' or '.2') counts as zero instead of
    # crashing on int('').
    whole = int(parts[0]) if parts[0] else 0
    outs = int(parts[1]) if parts[1] else 0

    thirds = {1: 1 / 3, 2: 2 / 3}
    # Adding a float keeps the return type consistent for whole innings.
    return whole + thirds.get(outs, 0.0)

# Function to calculate ERA
def calculate_era(df):
    """Return earned run average: nine times ``ER`` divided by ``IP_real``."""
    earned_runs = df['ER']
    innings = df['IP_real']
    return (earned_runs * 9) / innings

# Function to calculate WHIP
def calculate_whip(df):
    """Return WHIP: ``H`` plus ``BB``, divided by ``IP_real``."""
    baserunners = df['H'] + df['BB']
    innings = df['IP_real']
    return baserunners / innings

# Function to calculate extra base hits against
def calculate_extra_base_hits_against(df):
    """Return extra-base hits allowed: the sum of ``2B``, ``3B`` and ``HR``."""
    doubles = df['2B']
    triples = df['3B']
    homers = df['HR']
    return doubles + triples + homers

# Function to calculate total bases against
def calculate_total_bases_against(df):
    """Return total bases allowed.

    Every hit counts one base via ``H``; doubles, triples and home runs
    add one, two and three further bases respectively.
    """
    hits_plus_doubles = df['H'] + df['2B']
    triple_bonus = 2 * df['3B']
    homer_bonus = 3 * df['HR']
    return hits_plus_doubles + triple_bonus + homer_bonus

# Function to calculate rolling stats
def calculate_rolling_stats(df, window, suffix):
    """Sum the last ``window`` rows and derive pitching rate stats from them.

    ``min_periods=1`` means early rows are summed over however many rows
    exist so far. ERA, WHIP, extra-base hits against and total bases
    against are computed from the rolling sums, a fixed set of columns is
    kept, every kept column is renamed to ``<name>_<suffix>``, and the
    result is rounded to three decimals.
    """
    totals = df.rolling(window=window, min_periods=1).sum()

    # Derived stats are computed from the windowed sums, not averaged per game.
    totals['ERA'] = calculate_era(totals)
    totals['WHIP'] = calculate_whip(totals)
    totals['XB_against'] = calculate_extra_base_hits_against(totals)
    totals['TB_against'] = calculate_total_bases_against(totals)

    kept_columns = [
        'IP_real', 'H', 'BF', 'HR', 'R', 'ER', 'BB', 'SO',
        'XB_against', 'TB_against', 'ERA', 'WHIP',
    ]
    labelled = totals[kept_columns].add_suffix(f'_{suffix}')
    return labelled.round(3)

# Load pitcher IDs
idlist = pd.read_csv('pitcher_ids.csv')
pitcher_ids = idlist.key_bbref

# Load game PKs (if needed)
game_pks = pd.read_csv('game_pks.csv')

# Raw game-log columns that are cleaned and converted to floats
numeric_columns = ['IP', 'H', 'R', 'ER', 'BB', 'SO', 'HR', 'BF', '2B', '3B', 'IBB']

# Columns kept from the season-to-date totals (suffixed with '_current')
season_output_columns = ['IP_real', 'H', 'BF', 'HR', 'R', 'ER', 'BB', 'SO',
                         'XB_against', 'TB_against', 'ERA', 'WHIP']

# For every pitcher: read the raw game log, add rolling (20/10/5/3 game) and
# season-to-date stats that only use games BEFORE each row, and save the result.
for pitcher_id in pitcher_ids:

    if not pitcher_id or pd.isna(pitcher_id):
        continue

    file_path = f'pitchers/{pitcher_id}_pitching.csv'

    if not (os.path.exists(file_path) and os.path.getsize(file_path) > 0):
        print(f"File for ID {pitcher_id} does not exist or is empty.")
        continue

    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        print(f"File for ID {pitcher_id} is empty.")
        continue
    except pd.errors.ParserError:
        print(f"File for ID {pitcher_id} is improperly formatted.")
        continue

    # Remove the irrelevant column 'Gtm' (tolerate logs that do not have it)
    df = df.drop(columns=['Gtm'], errors='ignore')

    # Ensure the 'game_date' column is in datetime format
    df['game_date'] = pd.to_datetime(df['game_date'])

    # Extract the year from the 'game_date' column
    df['season'] = df['game_date'].dt.year

    # Clean the raw values, treat unparsable/missing ones as 0, and make
    # sure the columns end up as floats
    for col in numeric_columns:
        df[col] = df[col].apply(clean_numeric)
    df[numeric_columns] = df[numeric_columns].fillna(0).astype(float)

    # Create the IP_real column
    df['IP_real'] = df['IP'].apply(convert_ip_to_real)

    # Exclude non-numeric columns from rolling stats calculation
    stat_columns = numeric_columns + ['IP_real']
    rolling_df = df[stat_columns].copy()

    # Rolling stats over the last 20, 10, 5 and 3 games. Each frame is
    # shifted by one row so a game only sees stats from earlier games.
    rolling_stats = [
        calculate_rolling_stats(rolling_df, window, str(window)).shift(1).fillna(0)
        for window in (20, 10, 5, 3)
    ]

    # Season-to-date stats for each year, also shifted by one row
    season_frames = []
    for year in df['season'].unique():
        season_cumsum = df.loc[df['season'] == year, stat_columns].cumsum().shift(1).fillna(0)
        season_cumsum['ERA'] = calculate_era(season_cumsum)
        season_cumsum['WHIP'] = calculate_whip(season_cumsum)
        season_cumsum['XB_against'] = calculate_extra_base_hits_against(season_cumsum)
        season_cumsum['TB_against'] = calculate_total_bases_against(season_cumsum)
        season_cumsum = season_cumsum[season_output_columns]
        season_cumsum.columns = [f'{col}_current' for col in season_cumsum.columns]
        season_frames.append(season_cumsum)

    # Each per-season frame keeps the row labels of the original log, so
    # align by label. Overwriting the index positionally would attach stats
    # to the wrong games whenever rows are not grouped by season, and would
    # raise when a row has no season.
    if season_frames:
        season_stats = pd.concat(season_frames).reindex(df.index)
    else:
        season_stats = pd.DataFrame(index=df.index)

    # Combine all the stats into a single dataframe
    final_df = pd.concat([df, *rolling_stats, season_stats], axis=1)

    # Round the combined dataframe stats to 3 decimal points
    final_df = final_df.round(3)

    # Display the combined dataframe
    print(final_df.tail())

    # Save the combined stats to a CSV file
    final_df.to_csv(f'pitchers/{pitcher_id}_stats_pitching.csv', index=False)

    print(f"Generated stats for {pitcher_id} and saved to CSV file.")

import os
import pandas as pd

def get_player_stats(bbref_id, player_type, game_id):
    """Return one row of a player's pre-computed stats.

    The stats are read from ``batters/<id>_stats_batting.csv`` when
    ``player_type`` is ``'batting'`` and from
    ``pitchers/<id>_stats_<player_type>.csv`` otherwise.

    Args:
        bbref_id: Player ID used in the stats file name.
        player_type: ``'batting'`` selects the batters directory; any other
            value selects the pitchers directory.
        game_id: Value to look up in the file's ``game_id`` column.

    Returns:
        The first row whose ``game_id`` matches, as a ``pandas.Series``.
        If no row matches (or the file has no ``game_id`` column) the last
        row of the file is returned instead. ``None`` is returned when the
        file does not exist or contains no rows.
    """
    stats_dir = 'batters' if player_type == 'batting' else 'pitchers'
    stats_file = os.path.join(stats_dir, f'{bbref_id}_stats_{player_type}.csv')

    if not os.path.exists(stats_file):
        print(f"Stats file for {bbref_id} not found ({player_type}).")
        return None

    try:
        stats_df = pd.read_csv(stats_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file is handled like a file without rows.
        stats_df = pd.DataFrame()

    if stats_df.empty:
        # Without this guard the fallback below would raise IndexError.
        print(f"Stats file for {bbref_id} is empty ({player_type}).")
        return None

    if 'game_id' in stats_df.columns:
        game_stats = stats_df[stats_df['game_id'] == game_id]
        if not game_stats.empty:
            return game_stats.iloc[0]

    # No row for this game: fall back to the last row in the file.
    return stats_df.iloc[-1]

def _copy_player_stats(game_data, slot, bbref_id, player_type, game_id, columns):
    """Copy one player's stat columns into ``game_data`` under ``<slot>_<col>``.

    Returns False when ``bbref_id`` is missing (None, NaN or empty), True
    otherwise. Columns absent from the player's stats row are written as ''.
    """
    # An empty CSV cell is read back as NaN, which is truthy, so a plain
    # truthiness test is not enough to detect a missing ID.
    if pd.isna(bbref_id) or not bbref_id:
        return False

    stats = get_player_stats(bbref_id, player_type, game_id)
    if stats is not None:
        for col in columns:
            game_data[f'{slot}_{col}'] = stats.get(col, '')
    return True


def process_game(game_id):
    """Enrich one gamelog with the rolling stats of every listed player.

    Reads the first row of ``gamelogs/game_<game_id>.csv``, looks up stats
    for the batters (``<Team>_Batter1..9``), the pitchers (``<Team>_SP``,
    ``<Team>_P_2..10``) and the bullpen (``<Team>_bullpen_1..14``) through
    their ``*_bbrefID`` fields, and writes the combined single-row result
    to ``gamelogs/gamestats_<game_id>.csv``.

    Nothing is written when the gamelog file is missing or has no rows.
    """
    # Read the gamelog file
    game_file = f'gamelogs/game_{game_id}.csv'
    if not os.path.exists(game_file):
        print(f"Gamelog file for game {game_id} not found.")
        return

    try:
        game_df = pd.read_csv(game_file)
    except pd.errors.EmptyDataError:
        game_df = pd.DataFrame()

    if game_df.empty:
        # iloc[0] below would raise IndexError on a gamelog without rows.
        print(f"Gamelog file for game {game_id} has no rows.")
        return

    game_data = game_df.iloc[0].to_dict()

    # Define relevant columns for batters and pitchers
    batter_columns = ['AVG_20', 'OBP_20', 'SLG_20', 'OPS_20', 'SB_20', 'CS_20', 'XB_20', 'TB_20', 'SO_20',
                      'AVG_10', 'OBP_10', 'SLG_10', 'OPS_10', 'SB_10', 'CS_10', 'XB_10', 'TB_10', 'SO_10',
                      'AVG_5', 'OBP_5', 'SLG_5', 'OPS_5', 'SB_5', 'CS_5', 'XB_5', 'TB_5', 'SO_5',
                      'AVG_3', 'OBP_3', 'SLG_3', 'OPS_3', 'SB_3', 'CS_3', 'XB_3', 'TB_3', 'SO_3']
    # NOTE(review): the bare 'ERA' entry has no window suffix, unlike every
    # other name here, so it will come through as '' if the stats files
    # only carry suffixed columns. It is kept so the output column set is
    # unchanged; confirm whether it can be removed.
    pitcher_columns = ['IP_real_20', 'ERA', 'H_20', 'BF_20', 'HR_20', 'R_20', 'ER_20', 'BB_20', 'SO_20', 'XB_against_20',
                       'TB_against_20', 'ERA_20', 'WHIP_20', 'IP_real_10', 'H_10', 'BF_10', 'HR_10', 'R_10', 'ER_10', 'BB_10', 'SO_10', 'XB_against_10',
                       'TB_against_10', 'ERA_10', 'WHIP_10', 'IP_real_5', 'H_5', 'BF_5', 'HR_5', 'R_5', 'ER_5', 'BB_5',
                       'SO_5', 'XB_against_5', 'TB_against_5', 'ERA_5', 'WHIP_5', 'IP_real_3', 'H_3', 'BF_3', 'HR_3', 'R_3', 'ER_3', 'BB_3',
                       'SO_3', 'XB_against_3', 'TB_against_3', 'ERA_3', 'WHIP_3']

    teams = ('Away', 'Home')

    # Fetch stats for each batter (lineup spot first, then team, which
    # fixes the order of the new columns)
    for i in range(1, 10):
        for team in teams:
            slot = f'{team}_Batter{i}'
            found = _copy_player_stats(
                game_data, slot, game_data.get(f'{slot}_bbrefID'),
                'batting', game_id, batter_columns)
            if not found:
                print(f'missing bbrefID for game {game_id}')

    # Fetch stats for each pitcher; slot 1 is the starter ('SP')
    for team in teams:
        for i in range(1, 11):
            role = 'SP' if i == 1 else f'P_{i}'
            slot = f'{team}_{role}'
            _copy_player_stats(
                game_data, slot, game_data.get(f'{slot}_bbrefID'),
                'pitching', game_id, pitcher_columns)

    # Fetch stats for each bullpen pitcher
    for team in teams:
        for i in range(1, 15):  # Adjust the range according to your maximum expected number of bullpen pitchers
            slot = f'{team}_bullpen_{i}'
            _copy_player_stats(
                game_data, slot, game_data.get(f'{slot}_bbrefID'),
                'pitching', game_id, pitcher_columns)

    # Create a DataFrame from the updated game data
    updated_game_df = pd.DataFrame([game_data])

    # Save the updated game data to a new CSV file
    output_file = f'gamelogs/gamestats_{game_id}.csv'
    updated_game_df.to_csv(output_file, index=False)
    print(f"Processed and saved game stats for game {game_id} to {output_file}")

def process_recent_games(num_recent_games):
    """Run ``process_game`` for the last ``num_recent_games`` rows of game_pks.csv.

    Prints a message and does nothing else when ``game_pks.csv`` is not
    present in the working directory.
    """
    game_pks_file = 'game_pks.csv'

    if os.path.exists(game_pks_file):
        schedule = pd.read_csv(game_pks_file)
        # The newest games are taken from the tail of the file.
        latest_ids = schedule['game_id'].tail(num_recent_games).tolist()
        for game_id in latest_ids:
            process_game(game_id)
    else:
        print(f"{game_pks_file} not found.")

# How many rows from the end of game_pks.csv get their gamelogs enriched
num_recent_games = 100
process_recent_games(num_recent_games=num_recent_games)

# Final status line for the whole run
print('\n\nCustom stats added to gamelogs.\n\nSUCCESS - THIS FILE IS COMPLETE.')


KeyboardInterrupt: 