In [1]:
import requests
import pandas as pd
from datetime import datetime
from pybaseball import playerid_reverse_lookup

# Load game_pks.csv to get team names and IDs
game_pks_df = pd.read_csv('game_pks.csv')

# Mapping of team 3-digit IDs to oddshark 5-digit IDs 
team_to_oddshark_id = {
    120: 27017, 146: 27022, 139: 27003, 144: 27009, 140: 27002, 117: 27023,
    135: 26996, 143: 26995, 110: 27008, 136: 27011, 121: 27014, 109: 27007,
    108: 26998, 133: 27016, 141: 27010, 114: 27014, 138: 27019, 142: 27005,
    116: 26999, 147: 27001, 137: 26997, 118: 27006, 145: 27018, 115: 27004,
    111: 27021, 119: 27015, 112: 27020, 158: 27012, 113: 27000, 134: 27013
}

errors = []

def get_game_data(gamepk):
    url = f"https://statsapi.mlb.com/api/v1.1/game/{gamepk}/feed/live"
    response = requests.get(url)
    return response.json()

def get_bbref_id(mlbam_id):
    try:
        lookup_df = playerid_reverse_lookup([mlbam_id], key_type='mlbam')
        bbref_id = lookup_df.loc[lookup_df['key_mlbam'] == mlbam_id, 'key_bbref'].values[0]
        return bbref_id
    except IndexError:
        return 'unknown'

def extract_starting_lineup(game_data, team_side):
    lineup = {}
    team = game_data['liveData']['boxscore']['teams'][team_side]['players']
    
    for player_id, player_info in team.items():
        try:
            if 'battingOrder' in player_info and int(player_info['battingOrder']) % 100 == 0:
                order = int(player_info['battingOrder']) // 100
                mlbam_id = player_info['person']['id']
                bbref_id = get_bbref_id(mlbam_id)
                lineup[order] = {
                    'name': player_info['person']['fullName'],
                    'mlbam_id': mlbam_id,
                    'bbref_id': bbref_id
                }
        except Exception as e:
            errors.append((gamepk, f"Error processing player {player_id} in {team_side} lineup: {str(e)}"))
    
    # Ensure lineup is filled and sorted by batting order
    return [lineup.get(i, {'name': '', 'mlbam_id': '', 'bbref_id': ''}) for i in range(1, 10)]

def get_pitchers(game_data, team_side):
    team = game_data['liveData']['boxscore']['teams'][team_side]
    pitchers = []
    for idx, pitcher_id in enumerate(team['pitchers']):
        try:
            pitcher = team['players'][f'ID{pitcher_id}']
            mlbam_id = pitcher['person']['id']
            bbref_id = get_bbref_id(mlbam_id)
            pitchers.append({
                'name': pitcher['person']['fullName'],
                'mlbam_id': mlbam_id,
                'bbref_id': bbref_id,
                'order': idx + 1
            })
        except Exception as e:
            errors.append((gamepk, f"Error processing pitcher {pitcher_id} in {team_side} team: {str(e)}"))
    return pitchers

def get_bullpen(game_data, team_side):
    team = game_data['liveData']['boxscore']['teams'][team_side]
    bullpen = []
    if team['pitchers']:  # Game already played
        for pitcher_id in team['bullpen'] + team['pitchers'][1:]:
            try:
                pitcher = team['players'][f'ID{pitcher_id}']
                mlbam_id = pitcher['person']['id']
                bbref_id = get_bbref_id(mlbam_id)
                bullpen.append({
                    'name': pitcher['person']['fullName'],
                    'mlbam_id': mlbam_id,
                    'bbref_id': bbref_id
                })
            except Exception as e:
                errors.append((gamepk, f"Error processing bullpen pitcher {pitcher_id} in {team_side} team: {str(e)}"))
    else:  # Game not yet played
        for pitcher_id in team['bullpen']:
            try:
                pitcher = team['players'][f'ID{pitcher_id}']
                mlbam_id = pitcher['person']['id']
                bbref_id = get_bbref_id(mlbam_id)
                bullpen.append({
                    'name': pitcher['person']['fullName'],
                    'mlbam_id': mlbam_id,
                    'bbref_id': bbref_id
                })
            except Exception as e:
                errors.append((gamepk, f"Error processing bullpen pitcher {pitcher_id} in {team_side} team: {str(e)}"))
    return bullpen

def create_game_dataframe(gamepk):
    game_data = get_game_data(gamepk)
    
    try:
        home_lineup = extract_starting_lineup(game_data, 'home')
        away_lineup = extract_starting_lineup(game_data, 'away')
        home_bullpen = get_bullpen(game_data, 'home')
        away_bullpen = get_bullpen(game_data, 'away')
    except Exception as e:
        errors.append((gamepk, f"Error extracting lineups: {str(e)}"))
        return pd.DataFrame()
    
    try:
        home_pitchers = get_pitchers(game_data, 'home')
        away_pitchers = get_pitchers(game_data, 'away')
    except Exception as e:
        errors.append((gamepk, f"Error extracting pitchers: {str(e)}"))
        return pd.DataFrame()
    
    # Get additional game information
    game_info = game_pks_df[game_pks_df['game_id'] == gamepk].iloc[0]
    
    try:
        game_date = game_info['game_date']
    except KeyError:
        game_date = 'unknown'
        errors.append((gamepk, "Error getting game date"))
    
    try:
        runs_home = game_data['liveData']['linescore']['teams']['home']['runs']
    except KeyError:
        runs_home = 0
        errors.append((gamepk, "Error getting home runs"))
    
    try:
        runs_away = game_data['liveData']['linescore']['teams']['away']['runs']
    except KeyError:
        runs_away = 0
        errors.append((gamepk, "Error getting away runs"))
    
    runs_total = runs_home + runs_away
    
    # Get team information from game_pks.csv
    home_id = game_info['home_id']
    away_id = game_info['away_id']
    home_name = game_info['home_name']
    away_name = game_info['away_name']
    
    # Check for invalid team IDs
    if home_id not in team_to_oddshark_id or away_id not in team_to_oddshark_id:
        print(f"Invalid team IDs - {home_name} {home_id} vs {away_name} {away_id}")
    
    home_oddshark_id = team_to_oddshark_id.get(home_id, 'unknown')
    away_oddshark_id = team_to_oddshark_id.get(away_id, 'unknown')
    
    game_record = {
        'game_id': gamepk, 'game_date': game_date, 'runs_home': runs_home, 'runs_away': runs_away, 'runs_total': runs_total,
        'home_id': home_id, 'home_name': home_name, 'away_id': away_id, 'away_name': away_name,
        'home_oddshark_id': home_oddshark_id, 'away_oddshark_id': away_oddshark_id
    }

    for i, player in enumerate(away_lineup, start=1):
        game_record[f'Away_Batter{i}_Name'] = player['name']
        game_record[f'Away_Batter{i}_ID'] = player['mlbam_id']
        game_record[f'Away_Batter{i}_bbrefID'] = player['bbref_id']
        
    for i, player in enumerate(home_lineup, start=1):
        game_record[f'Home_Batter{i}_Name'] = player['name']
        game_record[f'Home_Batter{i}_ID'] = player['mlbam_id']
        game_record[f'Home_Batter{i}_bbrefID'] = player['bbref_id']
    
    # Add starting pitchers
    if home_pitchers:
        game_record['Home_SP_Name'] = home_pitchers[0]['name']
        game_record['Home_SP_ID'] = home_pitchers[0]['mlbam_id']
        game_record['Home_SP_bbrefID'] = home_pitchers[0]['bbref_id']
    
    if away_pitchers:
        game_record['Away_SP_Name'] = away_pitchers[0]['name']
        game_record['Away_SP_ID'] = away_pitchers[0]['mlbam_id']
        game_record['Away_SP_bbrefID'] = away_pitchers[0]['bbref_id']
    
    # Add bullpen pitchers
    for i, pitcher in enumerate(home_bullpen, start=1):
        game_record[f'Home_bullpen_{i}_Name'] = pitcher['name']
        game_record[f'Home_bullpen_{i}_ID'] = pitcher['mlbam_id']
        game_record[f'Home_bullpen_{i}_bbrefID'] = pitcher['bbref_id']
    
    for i, pitcher in enumerate(away_bullpen, start=1):
        game_record[f'Away_bullpen_{i}_Name'] = pitcher['name']
        game_record[f'Away_bullpen_{i}_ID'] = pitcher['mlbam_id']
        game_record[f'Away_bullpen_{i}_bbrefID'] = pitcher['bbref_id']


    # Add remaining pitchers
    for i, pitcher in enumerate(home_pitchers[1:], start=2):
        game_record[f'Home_P_{i}_Name'] = pitcher['name']
        game_record[f'Home_P_{i}_ID'] = pitcher['mlbam_id']
        game_record[f'Home_P_{i}_bbrefID'] = pitcher['bbref_id']
    
    for i, pitcher in enumerate(away_pitchers[1:], start=2):
        game_record[f'Away_P_{i}_Name'] = pitcher['name']
        game_record[f'Away_P_{i}_ID'] = pitcher['mlbam_id']
        game_record[f'Away_P_{i}_bbrefID'] = pitcher['bbref_id']
    
    return pd.DataFrame([game_record])

def save_game_to_csv(gamepk):
    game_df = create_game_dataframe(gamepk)
    if not game_df.empty:
        file_name = f'gamelogs/game_{gamepk}.csv'
        game_df.to_csv(file_name, index=False)
        #print(f"Data exported to {file_name}")
    else:
        print(f"No data exported for game {gamepk}")

all_games = pd.read_csv('game_pks.csv').game_id


gamepks = all_games.tail(100) ################################################### CHANGE THIS #################################################################
count = 0
for gamepk in gamepks:
    save_game_to_csv(gamepk)
    count += 1
    if count % 10 == 0:
        print(count)

# Print collected errors
if errors:
    print("Errors encountered:")
    for error in errors:
        print(error)
else:
    print("No errors encountered.")

print('\n=================================================\nGamelogs generated. Now collecting odds data.\n=================================================')

# ====================================================== ODDS DATA ==========================================================

import pandas as pd
from datetime import datetime

def fetch_over_under_runline(oddshark_id, game_date):
    year = game_date.year
    url = f"https://www.oddsshark.com/stats/gamelog/baseball/mlb/{oddshark_id}?season={year}"
    
    try:
        tables = pd.read_html(url)
        df = tables[0]
    except Exception as e:
        print(f"BAD - error for team {oddshark_id} on date {game_date}: {e}")
        return 'unknown', None, None, None
    
    if df.empty:
        print(f"BAD - No data in table for team {oddshark_id} on date {game_date}")
        return 'unknown', None, None, None
    
    df['Date'] = pd.to_datetime(df['Date'], format='%b %d, %Y')
    matching_rows = df[df['Date'] == game_date]

    if len(matching_rows) > 1:
        print(f"DOUBLEHEADER on {game_date}")
        return '', oddshark_id, year, game_date
    
    if matching_rows.empty:
        print(f"BAD - No matching date found for team {oddshark_id} on date {game_date}")
        return 'unknown', None, None, None
    
    over_under = matching_rows.iloc[0]['Total']
    return over_under, None, None, None

def update_gamelogs_with_over_under(game_pks_file, gamelogs_folder):
    game_pks_df = pd.read_csv(game_pks_file)
    game_pks = game_pks_df['game_id'].tail(50)  # Set the number of recent games to do
    duplicates = []
    
    count = 0
    for game_id in game_pks:
        try:
            gamelog_file = f'{gamelogs_folder}/game_{game_id}.csv'
            gamelog_df = pd.read_csv(gamelog_file)
            
            home_oddshark_id = gamelog_df.loc[0, 'home_oddshark_id']
            game_date_str = gamelog_df.loc[0, 'game_date']
            game_date = datetime.strptime(game_date_str, '%Y-%m-%d')
            
            over_under_runline, duplicate_id, duplicate_year, duplicate_date = fetch_over_under_runline(home_oddshark_id, game_date)
            
            if duplicate_id:
                duplicates.append((duplicate_id, duplicate_year, duplicate_date))
                
            gamelog_df['over_under_runline'] = over_under_runline
             
            gamelog_df.to_csv(gamelog_file, index=False)
            print(f"Updated {gamelog_file} with over/under runline.")
        except Exception as e:
            print(f"Error updating gamelog for game_id {game_id}: {e}")

        count+=1
        if count % 100:
            print(count)
    
    print("\nGames with duplicate dates:")
    for dup in duplicates:
        print(f"Team Oddshark ID: {dup[0]}, Year: {dup[1]}, Date: {dup[2].strftime('%Y-%m-%d')}")

# Update the paths as necessary
game_pks_file = 'game_pks.csv'
gamelogs_folder = 'gamelogs'

# Run the update function
update_gamelogs_with_over_under(game_pks_file, gamelogs_folder)

print('\n=================================================\nOdds data added. Now adding custom stats.\n=================================================')

# ============================================ CUSTOM STATS =======================================================

import os
import pandas as pd

def get_player_stats(bbref_id, player_type, game_id):
    """
    Get the player's stats for the specific game_id. If not available, return the most recent stats.
    """
    stats_dir = 'batters' if player_type == 'batting' else 'pitchers'
    stats_file = os.path.join(stats_dir, f'{bbref_id}_stats_{player_type}.csv')
    
    if not os.path.exists(stats_file):
        print(f"Stats file for {bbref_id} not found ({player_type}).")
        return None
    
    stats_df = pd.read_csv(stats_file)
    game_stats = stats_df[stats_df['game_id'] == game_id]
    
    if not game_stats.empty:
        return game_stats.iloc[0]
    else:
        return stats_df.iloc[-1]

def process_game(game_id):
    # Read the gamelog file
    game_file = f'gamelogs/game_{game_id}.csv'
    if not os.path.exists(game_file):
        print(f"Gamelog file for game {game_id} not found.")
        return
    
    game_df = pd.read_csv(game_file)
    game_data = game_df.iloc[0].to_dict()
    
    # Define relevant columns for batters and pitchers
    batter_columns = ['AVG_20', 'OBP_20', 'SLG_20', 'OPS_20', 'SB_20', 'CS_20', 'XB_20', 'TB_20', 'SO_20',
                      'AVG_10', 'OBP_10', 'SLG_10', 'OPS_10', 'SB_10', 'CS_10', 'XB_10', 'TB_10', 'SO_10',
                      'AVG_5', 'OBP_5', 'SLG_5', 'OPS_5', 'SB_5', 'CS_5', 'XB_5', 'TB_5', 'SO_5',
                      'AVG_3', 'OBP_3', 'SLG_3', 'OPS_3', 'SB_3', 'CS_3', 'XB_3', 'TB_3', 'SO_3']
    pitcher_columns = ['IP_real_20', 'ERA', 'H_20', 'BF_20', 'HR_20', 'R_20', 'ER_20', 'BB_20', 'SO_20', 'XB_against_20',
                       'TB_against_20', 'ERA_20', 'WHIP_20', 'IP_real_10', 'H_10', 'BF_10', 'HR_10', 'R_10', 'ER_10', 'BB_10', 'SO_10', 'XB_against_10',
                       'TB_against_10', 'ERA_10', 'WHIP_10', 'IP_real_5', 'H_5', 'BF_5', 'HR_5', 'R_5', 'ER_5', 'BB_5',
                       'SO_5', 'XB_against_5', 'TB_against_5', 'ERA_5', 'WHIP_5', 'IP_real_3', 'H_3', 'BF_3', 'HR_3', 'R_3', 'ER_3', 'BB_3',
                       'SO_3', 'XB_against_3', 'TB_against_3', 'ERA_3', 'WHIP_3']
    
    # Fetch stats for each batter
    for i in range(1, 10):
        for team in ['Away', 'Home']:
            bbref_id = game_data.get(f'{team}_Batter{i}_bbrefID')
            if bbref_id:
                stats = get_player_stats(bbref_id, 'batting', game_id)
                if stats is not None:
                    for col in batter_columns:
                        game_data[f'{team}_Batter{i}_{col}'] = stats.get(col, '')
            else:
                print(f'missing bbrefID for game {game_id}')

    # Fetch stats for each pitcher
    for team in ['Away', 'Home']:
        for i in range(1, 11):
            role = 'SP' if i == 1 else f'P_{i}'
            bbref_id = game_data.get(f'{team}_{role}_bbrefID')
            if bbref_id:
                stats = get_player_stats(bbref_id, 'pitching', game_id)
                if stats is not None:
                    for col in pitcher_columns:
                        game_data[f'{team}_{role}_{col}'] = stats.get(col, '')

    # Fetch stats for each bullpen pitcher
    for team in ['Away', 'Home']:
        for i in range(1, 15):  # Adjust the range according to your maximum expected number of bullpen pitchers
            role = f'bullpen_{i}'
            bbref_id = game_data.get(f'{team}_{role}_bbrefID')
            if bbref_id:
                stats = get_player_stats(bbref_id, 'pitching', game_id)
                if stats is not None:
                    for col in pitcher_columns:
                        game_data[f'{team}_{role}_{col}'] = stats.get(col, '')
    
    # Create a DataFrame from the updated game data
    updated_game_df = pd.DataFrame([game_data])
    
    # Save the updated game data to a new CSV file
    output_file = f'gamelogs/gamestats_{game_id}.csv'
    updated_game_df.to_csv(output_file, index=False)
    print(f"Processed and saved game stats for game {game_id} to {output_file}")

def process_recent_games(num_recent_games):
    game_pks_file = 'game_pks.csv'
    if not os.path.exists(game_pks_file):
        print(f"{game_pks_file} not found.")
        return

    game_pks_df = pd.read_csv(game_pks_file)
    recent_game_pks = game_pks_df.tail(num_recent_games)['game_id'].tolist()
    
    for game_id in recent_game_pks:
        process_game(game_id)

# Input the number of most recent games to process
num_recent_games = 100  ######################################################### CHANGE THIS ##########################################################
process_recent_games(num_recent_games)

print('\n=================================================\nCustom stats added. Now generating dataset.\n=================================================')

# ================================================= GENERATING DATASET ===========================================================

import os
import pandas as pd

game_pks_path = 'game_pks.csv'
gamelogs_dir = 'gamelogs/'
output_path = 'model/unsorted_currentdata.csv'

# Read the game_pks.csv file
game_pks_df = pd.read_csv(game_pks_path).tail(100)
game_pks_list = game_pks_df['game_id'].tolist()

# Initialize an empty list to store DataFrames
dataframes = []

# Initialize a set to store all columns
all_columns = set()

# First pass: Collect all unique columns
for game_pk in game_pks_list:
    file_path = os.path.join(gamelogs_dir, f'gamestats_{game_pk}.csv')
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        all_columns.update(df.columns)
    else:
        print(f"BAD - File {file_path} not found.")

print("First pass: columns collected")

# Second pass: Read files and align columns
for game_pk in game_pks_list:
    file_path = os.path.join(gamelogs_dir, f'gamestats_{game_pk}.csv')
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        # Add missing columns with default value of NaN (handled by reindex)
        df = df.reindex(columns=all_columns)
        dataframes.append(df)

print("Second pass: data added.")

# Concatenate all the DataFrames
master_df = pd.concat(dataframes, ignore_index=True)

# Save the master DataFrame to a CSV file
master_df.to_csv(output_path, index=False)

print(f"Master dataset saved to {output_path}")

import numpy as np

# Load the dataset
df = pd.read_csv('model/unsorted_currentdata.csv', low_memory=False)

pitcher_columns = [
    'IP_real_20', 'ERA', 'H_20', 'BF_20', 'HR_20', 'R_20', 'ER_20', 'BB_20', 'SO_20', 'XB_against_20',
    'TB_against_20', 'ERA_20', 'WHIP_20', 'IP_real_10', 'H_10', 'BF_10', 'HR_10', 'R_10', 'ER_10', 'BB_10', 'SO_10', 'XB_against_10',
    'TB_against_10', 'ERA_10', 'WHIP_10', 'IP_real_5', 'H_5', 'BF_5', 'HR_5', 'R_5', 'ER_5', 'BB_5',
    'SO_5', 'XB_against_5', 'TB_against_5', 'ERA_5', 'WHIP_5', 'IP_real_3', 'H_3', 'BF_3', 'HR_3', 'R_3', 'ER_3', 'BB_3',
    'SO_3', 'XB_against_3', 'TB_against_3', 'ERA_3', 'WHIP_3'
]
    

# Add 'over_under_runline' column right after 'runs_total'
if 'over_under_runline' in df.columns:
    columns = df.columns.tolist()
    runline_index = columns.index('over_under_runline')
    columns.insert(columns.index('runs_total') + 1, columns.pop(runline_index))
    df = df[columns]
else:
    print("Warning: 'over_under_runline' column not found.")

# Define a function to sort columns by player order and then alphabetically
def sort_columns(df):
    # List to store sorted column names
    sorted_columns = []
    
    # Ensure the specified order of the first few general columns
    first_columns = ['gamepk','game_id', 'game_date', 'home_name', 'away_name', 'runs_home', 'runs_away', 'runs_total', 'over_under_runline']
    for col in first_columns:
        if col in df.columns:
            sorted_columns.append(col)
    
    # Lists to categorize columns
    away_batter_columns = [[] for _ in range(9)]
    home_batter_columns = [[] for _ in range(9)]
    away_pitcher_columns = [[] for _ in range(9)]
    home_pitcher_columns = [[] for _ in range(9)]
    away_bullpen_columns = [[] for _ in range(15)]
    home_bullpen_columns = [[] for _ in range(15)]
    
    # Helper function to ensure the list is long enough
    def ensure_length(lst, index):
        while len(lst) <= index:
            lst.append([])
    
    # Helper function to sort player-specific columns
    def sort_player_columns(columns):
        player_columns = []
        other_columns = []
        for col in columns:
            if any(key in col for key in ['Name', 'ID', 'bbrefID']):
                player_columns.append(col)
            else:
                other_columns.append(col)
        return sorted(player_columns) + sorted(other_columns)
    
    # Categorize columns
    for col in df.columns:
        if col.startswith('Away_Batter'):
            try:
                num = int(col.split('_')[1][6]) - 1
                ensure_length(away_batter_columns, num)
                away_batter_columns[num].append(col)
            except (ValueError, IndexError):
                continue
        elif col.startswith('Home_Batter'):
            try:
                num = int(col.split('_')[1][6]) - 1
                ensure_length(home_batter_columns, num)
                home_batter_columns[num].append(col)
            except (ValueError, IndexError):
                continue
        elif col.startswith('Away_P_') or col.startswith('Away_SP'):
            try:
                if 'SP' in col:
                    num = 0
                else:
                    num = int(col.split('_')[2])
                ensure_length(away_pitcher_columns, num)
                away_pitcher_columns[num].append(col)
            except (ValueError, IndexError):
                continue
        elif col.startswith('Home_P_') or col.startswith('Home_SP'):
            try:
                if 'SP' in col:
                    num = 0
                else:
                    num = int(col.split('_')[2])
                ensure_length(home_pitcher_columns, num)
                home_pitcher_columns[num].append(col)
            except (ValueError, IndexError):
                continue
        elif col.startswith('Away_bullpen'):
            try:
                num = int(col.split('_')[2]) - 1
                ensure_length(away_bullpen_columns, num)
                away_bullpen_columns[num].append(col)
            except (ValueError, IndexError):
                continue
        elif col.startswith('Home_bullpen'):
            try:
                num = int(col.split('_')[2]) - 1
                ensure_length(home_bullpen_columns, num)
                home_bullpen_columns[num].append(col)
            except (ValueError, IndexError):
                continue

    # Sort each category
    for batter_columns in away_batter_columns:
        sorted_columns.extend(sort_player_columns(batter_columns))
    for batter_columns in home_batter_columns:
        sorted_columns.extend(sort_player_columns(batter_columns))
    for pitcher_columns in away_pitcher_columns:
        sorted_columns.extend(sort_player_columns(pitcher_columns))
    for pitcher_columns in home_pitcher_columns:
        sorted_columns.extend(sort_player_columns(pitcher_columns))
    for bullpen_columns in away_bullpen_columns:
        sorted_columns.extend(sort_player_columns(bullpen_columns))
    for bullpen_columns in home_bullpen_columns:
        sorted_columns.extend(sort_player_columns(bullpen_columns))
    
    return df[sorted_columns]

# Sort columns in the dataset
sorted_df = sort_columns(df)

# Save the sorted dataset
sorted_df.to_csv('model/currentdata.csv', index=False)

# Load the sorted dataset
sorted_df = pd.read_csv('model/currentdata.csv', low_memory=False)

# Function to determine if a column should be removed
def should_remove_column(col):
    if col.startswith('Home_bullpen_') or col.startswith('Away_bullpen_'):
        try:
            # Extract the number from the column name and check if it is 16 or higher
            number = int(col.split('_')[2])
            return number >= 16
        except ValueError:
            # If the suffix is not numeric, do not remove
            return False
    return False

# Remove columns that start with 'Home_bullpen_' or 'Away_bullpen_' and have a number 16 or higher
columns_to_remove = [col for col in sorted_df.columns if should_remove_column(col)]
sorted_df.drop(columns=columns_to_remove, inplace=True)


# Remove rows where runline is 'unknown'
filtered_df = sorted_df[sorted_df['over_under_runline'] != 'unknown']

# Function to filter games based on date range
def filter_games_by_date(df):
    # Convert game_date to datetime
    df['game_date'] = pd.to_datetime(df['game_date'])
    
    # Filter out games before April 5 and after October 5
    filtered_df = df[(df['game_date'].dt.month >= 4) & (df['game_date'].dt.month <= 10) &
                     ((df['game_date'].dt.month != 4) | (df['game_date'].dt.day >= 5)) &
                     ((df['game_date'].dt.month != 10) | (df['game_date'].dt.day <= 5))]
    
    return filtered_df

# Filter the games by date
filtered_df = filter_games_by_date(filtered_df)

# Convert over_under_runline to numeric
filtered_df['over_under_runline'] = pd.to_numeric(filtered_df['over_under_runline'])

# Create binary target variable
filtered_df['over_under_target'] = (filtered_df['runs_total'] >= filtered_df['over_under_runline']).astype(int)

# Rearrange columns to move 'over_under_target' to the right of 'over_under_runline'
columns = list(filtered_df.columns)
over_under_runline_index = columns.index('over_under_runline')

# Insert 'over_under_target' right after 'over_under_runline'
columns.insert(over_under_runline_index + 1, columns.pop(columns.index('over_under_target')))
filtered_df = filtered_df[columns]


# Save the filtered dataset with the target variable
filtered_df.to_csv('model/currentdata.csv', index=False)

# Load the dataset
file_path = 'model/currentdata.csv'
df = pd.read_csv(file_path)

# List of pitcher stats columns to be averaged
pitcher_columns = [
    'IP_real_20', 'ERA', 'H_20', 'BF_20', 'HR_20', 'R_20', 'ER_20', 'BB_20', 'SO_20', 'XB_against_20',
    'TB_against_20', 'ERA_20', 'WHIP_20', 'IP_real_10', 'H_10', 'BF_10', 'HR_10', 'R_10', 'ER_10', 'BB_10', 'SO_10', 'XB_against_10',
    'TB_against_10', 'ERA_10', 'WHIP_10', 'IP_real_5', 'H_5', 'BF_5', 'HR_5', 'R_5', 'ER_5', 'BB_5',
    'SO_5', 'XB_against_5', 'TB_against_5', 'ERA_5', 'WHIP_5', 'IP_real_3', 'H_3', 'BF_3', 'HR_3', 'R_3', 'ER_3', 'BB_3',
    'SO_3', 'XB_against_3', 'TB_against_3', 'ERA_3', 'WHIP_3'
]

# Function to calculate average stats for bullpen pitchers
def calculate_bullpen_averages(team_prefix):
    for stat in pitcher_columns:
        stat_columns = [f"{team_prefix}_bullpen_{i}_{stat}" for i in range(1, 13)]
        # Convert columns to numeric, coercing errors to NaNs
        df[stat_columns] = df[stat_columns].apply(pd.to_numeric, errors='coerce')
        # Replace infinite values with NaNs
        df[stat_columns] = df[stat_columns].replace([np.inf, -np.inf], np.nan)
        df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)

# Calculate averages for home and away teams
calculate_bullpen_averages('Home')
calculate_bullpen_averages('Away')

# Save the updated dataframe back to the same file
df.to_csv(file_path, index=False)

print("Updated dataset saved successfully.")

# Load the dataset
file_path = 'model/currentdata.csv'
df = pd.read_csv(file_path)

# List of pitcher stats columns to be removed
pitcher_columns = [
    'IP_real_20', 'ERA', 'H_20', 'BF_20', 'HR_20', 'R_20', 'ER_20', 'BB_20', 'SO_20', 'XB_against_20',
    'TB_against_20', 'ERA_20', 'WHIP_20', 'IP_real_10', 'H_10', 'BF_10', 'HR_10', 'R_10', 'ER_10', 'BB_10', 'SO_10', 'XB_against_10',
    'TB_against_10', 'ERA_10', 'WHIP_10', 'IP_real_5', 'H_5', 'BF_5', 'HR_5', 'R_5', 'ER_5', 'BB_5',
    'SO_5', 'XB_against_5', 'TB_against_5', 'ERA_5', 'WHIP_5', 'IP_real_3', 'H_3', 'BF_3', 'HR_3', 'R_3', 'ER_3', 'BB_3',
    'SO_3', 'XB_against_3', 'TB_against_3', 'ERA_3', 'WHIP_3'
]

# Function to remove individual bullpen columns
def remove_bullpen_columns(team_prefix):
    for stat in pitcher_columns:
        stat_columns = [f"{team_prefix}_bullpen_{i}_{stat}" for i in range(1, 13)]
        df.drop(columns=stat_columns, inplace=True)

# Remove bullpen columns for home and away teams
remove_bullpen_columns('Home')
remove_bullpen_columns('Away')

# Save the updated dataframe back to the same file
df.to_csv(file_path, index=False)

print("Updated dataset saved successfully.")

print('\n=================================================\nSUCCESS - THIS FILE IS COMPLETE\n=================================================')

Gathering player lookup table. This may take a moment.
10
20
30
40
50
60
70
80
90
100
Errors encountered:
(746190, 'Error getting home runs')
(746190, 'Error getting away runs')

Gamelogs generated. Now collecting odds data.
Updated gamelogs/game_747163.csv with over/under runline.
1
Updated gamelogs/game_745381.csv with over/under runline.
2
Updated gamelogs/game_746919.csv with over/under runline.
3
Updated gamelogs/game_747000.csv with over/under runline.
4
Updated gamelogs/game_746598.csv with over/under runline.
5
Updated gamelogs/game_746436.csv with over/under runline.
6
Updated gamelogs/game_746677.csv with over/under runline.
7
Updated gamelogs/game_745544.csv with over/under runline.
8
Updated gamelogs/game_745053.csv with over/under runline.
9
Updated gamelogs/game_745790.csv with over/under runline.
10
Updated gamelogs/game_746759.csv with over/under runline.
11
Updated gamelogs/game_745951.csv with over/under runline.
12
Updated gamelogs/game_746188.csv with over/under run

  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat}"] = df[stat_columns].mean(axis=1)
  df[f"{team_prefix}_bullpen_avg_{stat

Updated dataset saved successfully.
Updated dataset saved successfully.

SUCCESS - THIS FILE IS COMPLETE
