In [1]:
import pandas as pd
import numpy as np
import os

# Player lookup table; its 'key_bbref' column supplies the ids iterated below.
idlist = pd.read_csv("batter_ids.csv")
batter_ids = idlist["key_bbref"]

# Second input table, loaded up front (presumably game primary keys, judging
# by the file name -- it is not referenced in the visible part of the script).
game_pks = pd.read_csv("game_pks.csv")

# Define function to create an empty DataFrame with the correct structure
def create_empty_stats_df():
    """Return a zero-row DataFrame whose columns follow the batting-log layout.

    The column order is fixed; the groups below are only a reading aid and are
    concatenated in exactly the order they are listed.
    """
    game_info = ['Rk', 'Gcar', 'Gtm', 'Date', 'Tm', 'Unnamed: 5', 'Opp', 'Rslt', 'Inngs']
    counting = [
        'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB',
        'SO', 'HBP', 'SH', 'SF', 'ROE', 'GDP', 'SB', 'CS',
    ]
    rates = ['BA', 'OBP', 'SLG', 'OPS']
    situational = ['BOP', 'aLI', 'WPA', 'acLI', 'cWPA', 'RE24']
    fantasy_and_position = ['DFS(DK)', 'DFS(FD)', 'Pos']
    identifiers = ['dbl', 'game_date', 'team_id', 'opp_id', 'game_id']

    return pd.DataFrame(
        columns=[
            *game_info,
            *counting,
            *rates,
            *situational,
            *fantasy_and_position,
            *identifiers,
        ]
    )


# Counting-stat columns that are parsed to floats and summed over windows/seasons.
numeric_columns = ['PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'ROE', 'GDP', 'SB', 'CS', 'DFS(DK)', 'DFS(FD)']

# Columns kept in every rolling / season-to-date summary, in output order.
summary_columns = ['AVG', 'OBP', 'SLG', 'OPS', 'SB', 'CS', 'XB', 'TB', 'SO']

# Characters removed from raw cells before numeric parsing.
_STRIP_CHARS = str.maketrans('', '', '\xa0(),')


def clean_numeric(value):
    """Convert a raw cell to float; return NaN when it cannot be parsed.

    Non-breaking spaces, parentheses and commas are removed before parsing.
    """
    try:
        return float(str(value).translate(_STRIP_CHARS))
    except ValueError:
        return np.nan


def calculate_avg(df):
    """Return batting average, H / AB (NaN where AB is 0)."""
    return df['H'] / df['AB']


def calculate_obp(df):
    """Return on-base percentage, (H + BB + HBP) / (AB + BB + HBP + SF)."""
    return (df['H'] + df['BB'] + df['HBP']) / (df['AB'] + df['BB'] + df['HBP'] + df['SF'])


def calculate_extra_base_hits(df):
    """Return the number of extra-base hits, 2B + 3B + HR."""
    return df['2B'] + df['3B'] + df['HR']


def calculate_total_bases(df):
    """Return total bases: one per hit plus the extra bases of 2B, 3B and HR."""
    return df['H'] + df['2B'] + 2*df['3B'] + 3*df['HR']


def calculate_slg(df):
    """Return slugging percentage, total bases / AB (NaN where AB is 0)."""
    # 'H' already counts every hit once, so only the *extra* bases of doubles,
    # triples and home runs may be added on top -- which is exactly what
    # calculate_total_bases does. Weighting them 2/3/4 on top of 'H' would
    # count those hits twice and inflate SLG (and therefore OPS).
    return calculate_total_bases(df) / df['AB']


def calculate_ops(df):
    """Return OPS, the sum of on-base percentage and slugging percentage."""
    return calculate_obp(df) + calculate_slg(df)


def summarize_totals(totals, suffix):
    """Derive the summary columns from summed counting stats.

    Args:
        totals: DataFrame of summed counting stats (one row per game).
        suffix: label appended to every output column as ``<stat>_<suffix>``.

    Returns:
        A new DataFrame with the columns of ``summary_columns``, suffixed.
    """
    totals = totals.copy()
    totals['AVG'] = calculate_avg(totals)
    totals['OBP'] = calculate_obp(totals)
    totals['SLG'] = calculate_slg(totals)
    totals['OPS'] = calculate_ops(totals)
    totals['XB'] = calculate_extra_base_hits(totals)
    totals['TB'] = calculate_total_bases(totals)
    summary = totals[summary_columns].copy()
    summary.columns = [f'{col}_{suffix}' for col in summary_columns]
    return summary


def calculate_rolling_stats(df, window, suffix):
    """Return summary stats over a trailing window of rows, rounded to 3 places.

    The window includes the current row and uses ``min_periods=1``, so early
    rows are summarized over however many games exist so far.
    """
    window_totals = df.rolling(window=window, min_periods=1).sum()
    return summarize_totals(window_totals, suffix).round(3)


def calculate_season_stats(df):
    """Return season-to-date stats *entering* each game, aligned to ``df.index``.

    Rows are grouped by the 'season' column (any year present in the data) and
    accumulated in row order within each season, excluding the current game.
    Rows whose season is missing get NaN. Assumes rows are already in
    chronological order within a season -- TODO confirm against the input files.
    """
    season_frames = []
    for _, season_games in df.groupby('season'):
        prior_totals = season_games[numeric_columns].cumsum().shift(1).fillna(0)
        season_frames.append(summarize_totals(prior_totals, 'current'))

    if not season_frames:
        # No usable season at all (e.g. an empty log): keep the expected columns.
        return pd.DataFrame(
            index=df.index,
            columns=[f'{col}_current' for col in summary_columns],
            dtype=float,
        )

    # Align by index label rather than by position so the result stays correct
    # even when the log is not grouped in ascending season order.
    return pd.concat(season_frames).reindex(df.index)


for batter_id in batter_ids:

    # Skip blank or missing identifiers
    if not batter_id or pd.isna(batter_id):
        continue

    file_path = f'batters/{batter_id}_batting.csv'

    df = pd.read_csv(file_path)

    # Remove the irrelevant column 'Gtm'
    df = df.drop(columns=['Gtm'])

    # Ensure the 'game_date' column is in datetime format and derive the season
    df['game_date'] = pd.to_datetime(df['game_date'])
    df['season'] = df['game_date'].dt.year

    # Clean non-numeric values, then treat anything unparseable as 0
    for col in numeric_columns:
        df[col] = df[col].apply(clean_numeric)
    df[numeric_columns] = df[numeric_columns].fillna(0).astype(float)

    # Exclude non-numeric columns from rolling stats calculation
    rolling_df = df[numeric_columns].copy()

    # Rolling stats over the last 20, 10, 5 and 3 games. Each frame is shifted
    # by one row so a game only sees stats from the games before it; the first
    # row (no history) is filled with 0.
    rolling_stats_20 = calculate_rolling_stats(rolling_df, 20, '20').shift(1).fillna(0)
    rolling_stats_10 = calculate_rolling_stats(rolling_df, 10, '10').shift(1).fillna(0)
    rolling_stats_5 = calculate_rolling_stats(rolling_df, 5, '5').shift(1).fillna(0)
    rolling_stats_3 = calculate_rolling_stats(rolling_df, 3, '3').shift(1).fillna(0)

    # Season-to-date stats entering each game (rate stats stay NaN until the
    # batter has a prior at-bat / plate appearance in that season)
    season_stats = calculate_season_stats(df)

    # Combine all the stats into a single dataframe
    final_df = pd.concat([df, rolling_stats_20, rolling_stats_10, rolling_stats_5, rolling_stats_3, season_stats], axis=1)

    # Round the combined dataframe stats to 3 decimal points
    final_df = final_df.round(3)

    # Display the combined dataframe
    print(final_df.tail())

    # Save the combined stats to a CSV file
    final_df.to_csv(f'batters/{batter_id}_stats_batting.csv', index=False)

    print(f"Generated stats for {batter_id} and saved to CSV file.")

   Rk  Gcar        Date   Tm Unnamed: 5  Opp    Rslt Inngs   PA   AB  ...  \
3   4     4  2021-06-19  CHC        NaN  MIA  L,1-11   4-6  0.0  0.0  ...   
4   5     5  2021-07-08  CHC        NaN  PHI   L,0-8   6-8  0.0  0.0  ...   
5   6     6  2021-07-28  CHC        NaN  CIN   L,2-8   9-9  0.0  0.0  ...   
6   7     7  2021-10-01  CHC          @  STL   L,3-4  GS-6  2.0  2.0  ...   
7   8     8  2021-10-02  CHC          @  STL   W,6-5   6-6  1.0  1.0  ...   

   SO_3  AVG_current  OBP_current  SLG_current  OPS_current  SB_current  \
3   0.0          NaN          NaN          NaN          NaN         0.0   
4   0.0          NaN          NaN          NaN          NaN         0.0   
5   0.0          NaN          NaN          NaN          NaN         0.0   
6   0.0          NaN          NaN          NaN          NaN         0.0   
7   1.0          0.5          0.5          0.5          1.0         0.0   

   CS_current  XB_current  TB_current  SO_current  
3         0.0         0.0         