# Pitcher K Dataset

Assemble the dataset used to train a model that predicts a pitcher's strikeouts (K) in a single game.

#### Improvements that can be made
- Don't use opponent previous season for current predictions. Especially later in the season, we should have better data.
- Pitcher handedness, left-handed batters in lineup, right-handed batters in lineup

In [45]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [46]:
# Read the four input tables this notebook combines.
# The pitcher tables have their date columns parsed at load time; the batter
# logs and the weather table are read as-is.
pitcher_physical = pd.read_csv(
    "../../data/processed/pitcher_physical.csv",
    parse_dates=["birth_date"],
)
pitcher_logs = pd.read_csv(
    "../../data/processed/pitcher_game_stats.csv",
    parse_dates=["game_date"],
)
batter_game_logs = pd.read_csv(
    "../../data/processed/batter_game_pitcher_logs.csv"
)
weather_data = pd.read_csv(
    "../../data/historical/game_weather.csv"
)

In [58]:
# Get the opponent batters' EWMA K% against LHP and RHP and attach it to every
# pitcher game log as `opp_ewma_k_perc_vs_lhp` / `opp_ewma_k_perc_vs_rhp`.

batter_logs_for_k = batter_game_logs.copy()

# Parse the dates up front so the chronological sort below orders real dates
# rather than strings.
batter_logs_for_k["game_date"] = pd.to_datetime(batter_logs_for_k["game_date"])

# Total strikeouts and plate appearances per team, per pitcher hand, per day
team_vs_pitch_hand = (
    batter_logs_for_k.groupby(["game_date", "team", "pitcher_handedness"])
    .agg(
        strikeouts=("strikeouts", "sum"),
        plate_appearances=("plate_appearances", "sum")
    )
    .reset_index()
)

# Daily team-level K%
team_vs_pitch_hand["k_perc"] = (
    team_vs_pitch_hand["strikeouts"] / team_vs_pitch_hand["plate_appearances"]
)

# EWMA of K% within each (team, pitcher hand) series. The half-life is counted
# in observations (days on which the team faced that hand), not calendar days.
team_vs_pitch_hand = team_vs_pitch_hand.sort_values("game_date")
team_vs_pitch_hand["ewma_k_perc"] = (
    team_vs_pitch_hand
    .groupby(["team", "pitcher_handedness"])["k_perc"]
    .transform(lambda x: x.ewm(halflife=7, adjust=False).mean())
)

# Pivot to wide format: one row per (game_date, team), one column per hand
k_perc_ewma_by_team = team_vs_pitch_hand.pivot_table(
    index=["game_date", "team"],
    columns="pitcher_handedness",
    values="ewma_k_perc"
).reset_index()

# Rename columns for clarity
k_perc_ewma_by_team.columns.name = None
k_perc_ewma_by_team = k_perc_ewma_by_team.rename(columns={
    "L": "opp_ewma_k_perc_vs_lhp",
    "R": "opp_ewma_k_perc_vs_rhp"
})

# A team that did not face one of the two hands on a given day has NaN in that
# column after the pivot. The EWMA for that hand has simply not moved, so carry
# the team's last known value forward.
k_value_cols = [
    col for col in k_perc_ewma_by_team.columns if col not in ("game_date", "team")
]
k_perc_ewma_by_team = k_perc_ewma_by_team.sort_values(["team", "game_date"])
k_perc_ewma_by_team[k_value_cols] = (
    k_perc_ewma_by_team.groupby("team")[k_value_cols].ffill()
)

# Shift the date forward by 1 day so a value only becomes available the day
# after the games it was computed from (no leakage of today's outcome).
k_perc_ewma_by_team["date"] = k_perc_ewma_by_team["game_date"] + pd.Timedelta(days=1)
k_perc_ewma_by_team = k_perc_ewma_by_team.drop(columns=["game_date"])

# Now merge. An exact-date join would only match when the opponent played on
# the previous calendar day and would leave NaN after every off day. A backward
# as-of join instead picks the opponent's most recent value whose availability
# date is on or before the game date.
# NOTE: merge_asof needs both sides sorted by the date key and no missing
# `game_date` values in the pitcher logs.
pitcher_logs_with_k_ewma = pitcher_logs.copy()
pitcher_logs_with_k_ewma["_row_order"] = range(len(pitcher_logs_with_k_ewma))

k_lookup = (
    k_perc_ewma_by_team
    .rename(columns={"team": "team_temp"})
    .sort_values("date")
)

pitcher_logs_with_k_ewma = pd.merge_asof(
    pitcher_logs_with_k_ewma.sort_values("game_date", kind="stable"),
    k_lookup,
    left_on="game_date",
    right_on="date",
    left_by="opp",
    right_by="team_temp",
    direction="backward",
    allow_exact_matches=True,
)

# Restore the original row order of the pitcher logs and remove helper columns
pitcher_logs_with_k_ewma = (
    pitcher_logs_with_k_ewma
    .sort_values("_row_order")
    .drop(columns=["date", "team_temp", "_row_order"], errors="ignore")
    .reset_index(drop=True)
)

In [59]:
# Get the opponent batters' EWMA CSW% (called + swinging strikes per pitch seen)
# against LHP and RHP and attach it to every pitcher game log as
# `opp_ewma_csw_perc_vs_lhp` / `opp_ewma_csw_perc_vs_rhp`.

batter_logs_for_csw = batter_game_logs.copy()

# Parse the dates up front so the chronological sort below orders real dates
# rather than strings.
batter_logs_for_csw["game_date"] = pd.to_datetime(batter_logs_for_csw["game_date"])

# Total strikes and pitches seen per team, per pitcher hand, per day
team_vs_pitch_hand_csw = (
    batter_logs_for_csw.groupby(["game_date", "team", "pitcher_handedness"])
    .agg(
        swinging_strikes=("swinging_strikes", "sum"),
        called_strikes=("called_strikes", "sum"),
        total_pitches_seen=("total_pitches_seen", "sum")
    )
    .reset_index()
)

# Daily team-level CSW%
team_vs_pitch_hand_csw["csw_perc"] = (
    (team_vs_pitch_hand_csw["swinging_strikes"] + team_vs_pitch_hand_csw["called_strikes"]) /
    team_vs_pitch_hand_csw["total_pitches_seen"]
)

# EWMA of CSW% within each (team, pitcher hand) series. The half-life is
# counted in observations, not calendar days.
team_vs_pitch_hand_csw = team_vs_pitch_hand_csw.sort_values("game_date")
team_vs_pitch_hand_csw["ewma_csw_perc"] = (
    team_vs_pitch_hand_csw
    .groupby(["team", "pitcher_handedness"])["csw_perc"]
    .transform(lambda x: x.ewm(halflife=3, adjust=False).mean())
)

# Pivot to wide format: one row per (game_date, team), one column per hand
csw_ewma_by_team = team_vs_pitch_hand_csw.pivot_table(
    index=["game_date", "team"],
    columns="pitcher_handedness",
    values="ewma_csw_perc"
).reset_index()

csw_ewma_by_team.columns.name = None
csw_ewma_by_team = csw_ewma_by_team.rename(columns={
    "L": "opp_ewma_csw_perc_vs_lhp",
    "R": "opp_ewma_csw_perc_vs_rhp"
})

# A team that did not face one of the two hands on a given day has NaN in that
# column after the pivot. Carry the team's last known value forward, since the
# EWMA for that hand has not changed.
csw_value_cols = [
    col for col in csw_ewma_by_team.columns if col not in ("game_date", "team")
]
csw_ewma_by_team = csw_ewma_by_team.sort_values(["team", "game_date"])
csw_ewma_by_team[csw_value_cols] = (
    csw_ewma_by_team.groupby("team")[csw_value_cols].ffill()
)

# Shift the date forward by 1 day so a value only becomes available the day
# after the games it was computed from (no leakage of today's outcome).
csw_ewma_by_team["date"] = csw_ewma_by_team["game_date"] + pd.Timedelta(days=1)
csw_ewma_by_team = csw_ewma_by_team.drop(columns=["game_date"])

# An exact-date join would only match when the opponent played on the previous
# calendar day and would leave NaN after every off day. A backward as-of join
# instead picks the opponent's most recent value whose availability date is on
# or before the game date.
# NOTE: merge_asof needs both sides sorted by the date key and no missing
# `game_date` values in the pitcher logs.
csw_left = pitcher_logs_with_k_ewma.copy()
csw_left["_row_order"] = range(len(csw_left))

csw_lookup = (
    csw_ewma_by_team
    .rename(columns={"team": "team_temp"})
    .sort_values("date")
)

pitcher_logs_with_csw_ewma = pd.merge_asof(
    csw_left.sort_values("game_date", kind="stable"),
    csw_lookup,
    left_on="game_date",
    right_on="date",
    left_by="opp",
    right_by="team_temp",
    direction="backward",
    allow_exact_matches=True,
)

# Restore the original row order and remove helper columns
pitcher_logs_with_csw_ewma = (
    pitcher_logs_with_csw_ewma
    .sort_values("_row_order")
    .drop(columns=["date", "team_temp", "_row_order"], errors="ignore")
    .reset_index(drop=True)
)

In [62]:
# Assemble the final training set: pitcher logs with opponent EWMA features,
# plus weather and pitcher physical attributes, with categorical columns
# label-encoded.

# Merge with weather data
training_set = pitcher_logs_with_csw_ewma.merge(
    weather_data,
    how="left",
    on="game_pk"
)

# Merge with pitcher physical data
training_set = training_set.merge(
    pitcher_physical,
    how="left",
    on="player_id"
)

# Calculate age at game time (in years)
training_set["age"] = (training_set["game_date"] - training_set["birth_date"]).dt.days / 365.25

# Rename season_x back to season, and move the string-valued categorical
# columns aside so the encoded versions can take the original names.
training_set = training_set.rename(columns={
    "season_x": "season",
    "opp": "opp_str",
    "wind_direction": "wind_direction_str",
    "condition": "condition_str",
    "throws": "throws_str",
})

# Columns written to the final dataset, in output order
final_columns = [
    "game_pk",
    "game_date",
    "season",
    "player_id",
    "player_name",
    "strikeouts",
    "height_inches",
    "weight",
    "throws",
    "age",
    "pit_ewma_k_pct",
    "pit_ewma_csw_pct",
    "opp",
    "opp_ewma_k_perc_vs_lhp",
    "opp_ewma_k_perc_vs_rhp",
    "opp_ewma_csw_perc_vs_lhp",
    "opp_ewma_csw_perc_vs_rhp",
    "condition",
    "temp",
    "wind_speed",
    "wind_direction"
]

# Encoded column name -> string column it is derived from
encoded_sources = {
    "opp": "opp_str",
    "wind_direction": "wind_direction_str",
    "condition": "condition_str",
    "throws": "throws_str",
}

# Remove rows with NA values, but only in columns that feed the final dataset.
# A blanket dropna() would also discard rows because of gaps in merged columns
# that are never written out.
required_columns = [encoded_sources.get(col, col) for col in final_columns]
training_set = training_set.dropna(subset=required_columns).copy()

# Create label encoders
opp_encoder = LabelEncoder()
wind_dir_encoder = LabelEncoder()
condition_encoder = LabelEncoder()
throws_encoder = LabelEncoder()

# Fit and transform the categorical columns
training_set['opp'] = opp_encoder.fit_transform(training_set['opp_str'])
training_set['wind_direction'] = wind_dir_encoder.fit_transform(training_set['wind_direction_str'])
training_set['condition'] = condition_encoder.fit_transform(training_set['condition_str'])
training_set['throws'] = throws_encoder.fit_transform(training_set['throws_str'])

# Select final columns; this also leaves out the string columns and birth_date
training_set = training_set[final_columns]

In [65]:
# Write the assembled training set to disk without the DataFrame index.
training_set.to_csv(
    "../../data/processed/pitcher_game_opponent_stats.csv",
    index=False,
)