# Pitcher Game Cleaned

Clean and organize the pitcher game-stat logs and add engineered features: per-game K% (strikeouts / batters faced), per-game CSW% (CSW count / total pitches), and trailing exponentially weighted averages of both rates.

In [8]:
import pandas as pd

# One CSV of pitcher game logs per season, oldest first
season_log_paths = [
    "../../data/historical/pitcher_game_stats_2022.csv",
    "../../data/historical/pitcher_game_stats_2023.csv",
    "../../data/historical/pitcher_game_stats_2024.csv",
]

# Read every season, parsing game_date as a datetime column
season_logs = [
    pd.read_csv(csv_path, parse_dates=['game_date'])
    for csv_path in season_log_paths
]

# Keep the per-season frames available under their individual names
pitcher_logs_2022, pitcher_logs_2023, pitcher_logs_2024 = season_logs

# Stack the seasons into one frame with a fresh 0..n-1 index
pitcher_logs_full = pd.concat(season_logs, ignore_index=True)

In [9]:
# Columns kept for the trimmed frame, in output order
team_log_columns = [
    'game_pk',
    'game_date',
    'player_name',
    'player_id',
    'team',
    'opp',
    'is_home',
    'total_pitches',
    'strikeouts',
    'csw_count',
    'batters_faced',
]

# Add player_id (a copy of the 'pitcher' column) and is_home (True where the
# pitcher's team equals home_team), then keep only the columns listed above.
pitcher_logs_full_team = pitcher_logs_full.assign(
    player_id=pitcher_logs_full['pitcher'],
    is_home=pitcher_logs_full['team'] == pitcher_logs_full['home_team'],
)[team_log_columns]

In [10]:
# Half-life, in games, of the trailing exponentially weighted averages
EWMA_HALFLIFE_GAMES = 7


def _safe_rate(numerator, denominator):
    """Return numerator / denominator, with NaN wherever the denominator is 0.

    Zero denominators are masked with ``Series.where`` so the result stays
    float64. Replacing zeros with ``pd.NA`` instead would upcast an integer
    column to ``object`` dtype, and the ``.ewm()`` aggregation applied to the
    resulting rates would then fail.
    """
    return numerator / denominator.where(denominator != 0)


def _trailing_ewma(rates, halflife=EWMA_HALFLIFE_GAMES):
    """Exponentially weighted mean of the rows *before* each row.

    ``shift(1)`` excludes the current row from its own average, so the first
    row of each series is NaN (there is no earlier game to average).
    """
    return rates.shift(1).ewm(halflife=halflife, min_periods=1).mean()


# Sort by pitcher and date so shift/ewm walk each pitcher's games in order
pitcher_logs_full_sorted = pitcher_logs_full_team.sort_values(by=["player_id", "game_date"]).copy()

# Compute per-game rates (NaN when the pitcher faced no batters / threw no pitches)
pitcher_logs_full_sorted['k_pct'] = _safe_rate(
    pitcher_logs_full_sorted['strikeouts'],
    pitcher_logs_full_sorted['batters_faced'],
)

pitcher_logs_full_sorted['csw_pct'] = _safe_rate(
    pitcher_logs_full_sorted['csw_count'],
    pitcher_logs_full_sorted['total_pitches'],
)

# Compute per-pitcher trailing EWMA of K% and CSW% from prior games only
pitcher_logs_full_sorted['pit_ewma_k_pct'] = (
    pitcher_logs_full_sorted.groupby('player_id')['k_pct']
    .transform(_trailing_ewma)
)

pitcher_logs_full_sorted['pit_ewma_csw_pct'] = (
    pitcher_logs_full_sorted.groupby('player_id')['csw_pct']
    .transform(_trailing_ewma)
)

# Extract season from game date
pitcher_logs_full_sorted['season'] = pitcher_logs_full_sorted['game_date'].dt.year

In [11]:
# Columns written to the processed file: identifiers, game context, the two
# trailing EWMA features, and the game's strikeouts (presumably the model
# target -- confirm against the training code).
training_columns = [
    'game_pk',
    'game_date',
    'season',
    'player_id',
    'player_name',
    'team',
    'opp',
    'is_home',
    'pit_ewma_k_pct',
    'pit_ewma_csw_pct',
    'strikeouts',
]

# Keep only those columns, drop every row that has a missing value in any of
# them, and order the rows by pitcher then date for readability.
pitcher_training_set = (
    pitcher_logs_full_sorted[training_columns]
    .dropna()
    .sort_values(['player_id', 'game_date'])
)

# Write the result to disk without the row index
pitcher_training_set.to_csv("../../data/processed/pitcher_game_stats.csv", index=False)