In [1]:
import pandas as pd

In [34]:
# Load the CSV file for a player
player_id = 'abramcj01'  # Replace with the player ID or file path
file_path = f'batters/{player_id}_batting.csv'
df = pd.read_csv(file_path)

# Remove the irrelevant column 'Gtm'
df = df.drop(columns=['Gtm'])

# Ensure the 'game_date' column is in datetime format
df['game_date'] = pd.to_datetime(df['game_date'])

# Extract the year from the 'game_date' column
df['season'] = df['game_date'].dt.year

# Clean non-numeric values in numeric columns
def clean_numeric(value):
    try:
        value = str(value).replace('\xa0', '').replace('(', '').replace(')', '').replace(',', '')
        return float(value)
    except ValueError:
        return np.nan

# Define columns to convert to numeric
numeric_columns = ['PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'ROE', 'GDP', 'SB', 'CS', 'DFS(DK)', 'DFS(FD)']

# Apply cleaning function to numeric columns
for col in numeric_columns:
    df[col] = df[col].apply(clean_numeric)

# Fill NaN values with 0 for numerical calculations
df[numeric_columns] = df[numeric_columns].fillna(0)

# Ensure columns are of correct numeric type
df[numeric_columns] = df[numeric_columns].astype(float)

# Define functions to calculate required statistics
def calculate_avg(df):
    return df['H'] / df['AB']

def calculate_obp(df):
    return (df['H'] + df['BB'] + df['HBP']) / (df['AB'] + df['BB'] + df['HBP'] + df['SF'])

def calculate_slg(df):
    return (df['H'] + 2*df['2B'] + 3*df['3B'] + 4*df['HR']) / df['AB']

def calculate_ops(df):
    return calculate_obp(df) + calculate_slg(df)

def calculate_extra_base_hits(df):
    return df['2B'] + df['3B'] + df['HR']

def calculate_total_bases(df):
    return df['H'] + df['2B'] + 2*df['3B'] + 3*df['HR']

def calculate_rolling_stats(df, window, suffix):
    rolling_df = df.rolling(window=window, min_periods=1).sum()
    rolling_df['AVG'] = calculate_avg(rolling_df)
    rolling_df['OBP'] = calculate_obp(rolling_df)
    rolling_df['SLG'] = calculate_slg(rolling_df)
    rolling_df['OPS'] = calculate_ops(rolling_df)
    rolling_df['XB'] = calculate_extra_base_hits(rolling_df)
    rolling_df['TB'] = calculate_total_bases(rolling_df)
    rolling_df = rolling_df[['AVG', 'OBP', 'SLG', 'OPS', 'SB', 'CS', 'XB', 'TB', 'SO']]
    rolling_df.columns = [f'{col}_{suffix}' for col in rolling_df.columns]
    
    # Round the stats to 3 decimal points
    rolling_df = rolling_df.round(3)
    
    return rolling_df

# Exclude non-numeric columns from rolling stats calculation
rolling_df = df[numeric_columns].copy()

# Calculate rolling stats for the last 20 games and shift by one row
rolling_stats_20 = calculate_rolling_stats(rolling_df, 20, '20').shift(1).fillna(0)

# Calculate rolling stats for the last 5 games and shift by one row
rolling_stats_5 = calculate_rolling_stats(rolling_df, 5, '5').shift(1).fillna(0)

# Calculate season-long stats for each year and shift by one row
season_stats = pd.DataFrame()
for year in range(2021, 2025):
    season_df = df[df['season'] == year][numeric_columns].copy()
    season_cumsum = season_df.cumsum().shift(1).fillna(0)
    season_cumsum['AVG'] = calculate_avg(season_cumsum)
    season_cumsum['OBP'] = calculate_obp(season_cumsum)
    season_cumsum['SLG'] = calculate_slg(season_cumsum)
    season_cumsum['OPS'] = calculate_ops(season_cumsum)
    season_cumsum['XB'] = calculate_extra_base_hits(season_cumsum)
    season_cumsum['TB'] = calculate_total_bases(season_cumsum)
    season_cumsum = season_cumsum[['AVG', 'OBP', 'SLG', 'OPS', 'SB', 'CS', 'XB', 'TB', 'SO']]
    season_cumsum.columns = [f'{col}_current' for col in season_cumsum.columns]
    season_stats = pd.concat([season_stats, season_cumsum])

# Ensure the season_stats index aligns with the original dataframe
season_stats.index = df.index

# Combine all the stats into a single dataframe
final_df = pd.concat([df, rolling_stats_20, rolling_stats_5, season_stats], axis=1)

# Round the combined dataframe stats to 3 decimal points
final_df = final_df.round(3)

# Display the combined dataframe
print(final_df.tail())

# Save the combined stats to a CSV file
final_df.to_csv(f'batters/{player_id}_stats.csv', index=False)

print(f"Generated stats for {player_id} and saved to CSV file.")

     Rk  Gcar        Date   Tm Unnamed: 5  Opp   Rslt Inngs   PA   AB  ...  \
283  45   286  2024-05-22  WSN        NaN  MIN  L,2-3    CG  4.0  4.0  ...   
284  46   287  2024-05-24  WSN        NaN  SEA  W,6-1    CG  4.0  4.0  ...   
285  47   288  2024-05-25  WSN        NaN  SEA  W,3-1    CG  4.0  4.0  ...   
286  48   289  2024-05-26  WSN        NaN  SEA  L,5-9    CG  4.0  4.0  ...   
287  49   290  2024-05-27  WSN          @  ATL  W,8-4    CG  5.0  4.0  ...   

     SO_5  AVG_current  OBP_current  SLG_current  OPS_current  SB_current  \
283   4.0        0.263        0.318        0.587        0.905         8.0   
284   4.0        0.257        0.312        0.574        0.885         8.0   
285   3.0        0.262        0.315        0.572        0.887         8.0   
286   3.0        0.257        0.309        0.560        0.869         8.0   
287   3.0        0.256        0.308        0.574        0.882         8.0   

     CS_current  XB_current  TB_current  SO_current  
283         3.