In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read one pitcher's game log from disk
player_id = 'adamja01'  # Replace with the player ID or file path
file_path = f'pitchers/{player_id}_pitching.csv'
df = pd.read_csv(file_path)

# 'Gtm' is not used anywhere downstream, so drop it up front
df = df.drop('Gtm', axis=1)

# Parse 'game_date' into datetimes, then derive the calendar year of each
# game as 'season' (assign evaluates the keyword arguments in order, so the
# second lambda already sees the parsed dates)
df = df.assign(
    game_date=lambda games: pd.to_datetime(games['game_date']),
    season=lambda games: games['game_date'].dt.year,
)
# Clean non-numeric values in numeric columns
def clean_numeric(value):
    """Convert a raw cell to float, returning NaN when it is not numeric.

    The value is stringified, every non-breaking space, parenthesis and
    comma is removed, and the remainder is parsed with ``float``.
    """
    # One translate pass deletes all four decoration characters at once
    strip_decorations = str.maketrans('', '', '\xa0(),')
    try:
        return float(str(value).translate(strip_decorations))
    except ValueError:
        return np.nan

# Columns that must hold plain floats before any arithmetic is done
numeric_columns = ['IP', 'H', 'R', 'ER', 'BB', 'SO', 'HR', 'BF', '2B', '3B', 'IBB']

# NOTE(review): if 'IP' uses the common box-score notation where .1/.2 mean
# one/two thirds of an inning, summing it as a plain decimal downstream will
# be slightly off -- confirm against the source data.

# Clean every cell of those columns, treat anything unparseable as 0, and
# store the result as float
df[numeric_columns] = (
    df[numeric_columns]
    .apply(lambda column: column.map(clean_numeric))
    .fillna(0)
    .astype(float)
)

# Define functions to calculate required statistics
def calculate_era(df):
    """Return earned runs per nine innings: ``ER * 9 / IP``, element-wise.

    ``df`` must provide 'ER' and 'IP'. Rows where 'IP' is 0 follow pandas
    float division (inf, or NaN when 'ER' is also 0).
    """
    earned_runs_times_nine = df['ER'].mul(9)
    return earned_runs_times_nine.div(df['IP'])

def calculate_whip(df):
    """Return walks plus hits per inning: ``(H + BB) / IP``, element-wise.

    ``df`` must provide 'H', 'BB' and 'IP'.
    """
    baserunners = df['H'].add(df['BB'])
    return baserunners.div(df['IP'])

def calculate_extra_base_hits_against(df):
    """Return doubles + triples + home runs allowed, element-wise.

    ``df`` must provide '2B', '3B' and 'HR'.
    """
    doubles_and_triples = df['2B'].add(df['3B'])
    return doubles_and_triples.add(df['HR'])

def calculate_total_bases_against(df):
    """Return total bases allowed: ``H + 2B + 2*3B + 3*HR``, element-wise.

    Every hit counts one base through 'H'; the other terms add the extra
    bases for doubles, triples and home runs.
    """
    total = df['H'].add(df['2B'])
    total = total.add(df['3B'].mul(2))
    total = total.add(df['HR'].mul(3))
    return total

def calculate_rolling_stats(df, window, suffix):
    """Sum the stats over a trailing window and derive the rate stats.

    Args:
        df: DataFrame of per-row numeric stats; it must provide every column
            read here ('IP', 'H', 'BF', 'HR', 'R', 'ER', 'BB', 'SO', '2B', '3B').
        window: number of trailing rows included in each rolling sum.
        suffix: label appended to every output column as ``<stat>_<suffix>``.

    Returns:
        DataFrame indexed like ``df`` holding the twelve suffixed stat
        columns, rounded to 3 decimals.
    """
    reported_stats = ['IP', 'H', 'BF', 'HR', 'R', 'ER', 'BB', 'SO',
                      'XB_against', 'TB_against', 'ERA', 'WHIP']

    # min_periods=1: the earliest rows use however many rows exist so far
    window_totals = df.rolling(window=window, min_periods=1).sum()

    # Derived stats are computed from the windowed totals, not per row
    window_totals = window_totals.assign(
        ERA=calculate_era(window_totals),
        WHIP=calculate_whip(window_totals),
        XB_against=calculate_extra_base_hits_against(window_totals),
        TB_against=calculate_total_bases_against(window_totals),
    )

    labelled = window_totals[reported_stats].add_suffix(f'_{suffix}')

    # Round the stats to 3 decimal points
    return labelled.round(3)

# Exclude non-numeric columns from rolling stats calculation
rolling_df = df[numeric_columns].copy()

# Stats reported for every horizon (last 20 games, last 5 games, season to date)
stat_names = ['IP', 'H', 'BF', 'HR', 'R', 'ER', 'BB', 'SO', 'XB_against', 'TB_against', 'ERA', 'WHIP']

# Calculate rolling stats for the last 20 games and shift by one row so that
# each game only sees the games played before it
rolling_stats_20 = calculate_rolling_stats(rolling_df, 20, '20').shift(1).fillna(0)

# Calculate rolling stats for the last 5 games and shift by one row
rolling_stats_5 = calculate_rolling_stats(rolling_df, 5, '5').shift(1).fillna(0)

# Calculate season-to-date stats, shifted by one row within each season.
# Grouping on the seasons actually present in the data (rather than a fixed
# 2021-2024 range) keeps this working for pitchers who missed one of those
# years or have games outside them. The grouped operations keep the original
# row labels, so no manual index re-alignment is needed afterwards.
season_key = df['season']
season_cumsum = rolling_df.groupby(season_key).cumsum()
season_cumsum = season_cumsum.groupby(season_key).shift(1).fillna(0)
season_cumsum['ERA'] = calculate_era(season_cumsum)
season_cumsum['WHIP'] = calculate_whip(season_cumsum)
season_cumsum['XB_against'] = calculate_extra_base_hits_against(season_cumsum)
season_cumsum['TB_against'] = calculate_total_bases_against(season_cumsum)
season_stats = season_cumsum[stat_names].add_suffix('_current')

# Combine all the stats into a single dataframe (aligned on the row index)
final_df = pd.concat([df, rolling_stats_20, rolling_stats_5, season_stats], axis=1)

# For each first game of a season there is no season-to-date history yet,
# so set the 'current' stats to the 'last 20' stats for that row
is_first_game = final_df['season'].notna() & ~final_df['season'].duplicated()
current_columns = [f'{stat}_current' for stat in stat_names]
last_20_columns = [f'{stat}_20' for stat in stat_names]
# .to_numpy() drops the column labels so values are copied by position
final_df.loc[is_first_game, current_columns] = final_df.loc[is_first_game, last_20_columns].to_numpy()

# Round the combined dataframe stats to 3 decimal points
final_df = final_df.round(3)

# Display the combined dataframe
print(final_df.tail())

# Save the combined stats to a CSV file
final_df.to_csv(f'pitchers/{player_id}_stats.csv', index=False)

print(f"Generated stats for {player_id} and saved to CSV file.")

     Rk  Gcar        Date   Tm Unnamed: 5  Opp   Rslt  Inngs     Dec  DR  ...  \
154  20   222  2024-05-14  TBR          @  BOS  L,4-5  10-10     NaN   0  ...   
155  21   223  2024-05-16  TBR          @  BOS  W,7-5    9-9    H(9)   1  ...   
156  22   224  2024-05-18  TBR          @  TOR  W,5-4    8-8   H(10)   1  ...   
157  23   225  2024-05-21  TBR        NaN  BOS  L,2-5    8-8  L(2-1)   2  ...   
158  24   226  2024-05-25  TBR        NaN  KCR  L,4-7    8-8     NaN   3  ...   

     BF_current  HR_current  R_current  ER_current  BB_current  SO_current  \
154        69.0         2.0        4.0         3.0         7.0        16.0   
155        71.0         2.0        4.0         3.0         7.0        16.0   
156        75.0         2.0        4.0         3.0         8.0        17.0   
157        80.0         2.0        4.0         3.0         8.0        18.0   
158        86.0         2.0        6.0         5.0        10.0        19.0   

     XB_against_current  TB_against_current 