# Pitcher Game Logs

Similar to the batter game logs, this notebook builds per-game pitcher stat counts (split by batter handedness) by aggregating Statcast data.

In [23]:
import pandas as pd
import numpy as np
from pathlib import Path

In [24]:
# Locate every Statcast CSV under the historical data directory (recursive).
# Sorting makes the load order -- and therefore the row order of `df` --
# independent of the order in which the filesystem happens to list files.
statcast_dir = Path("../../data/historical/statcast")
csv_paths = sorted(statcast_dir.rglob("statcast_*.csv"))

# Fail early with a clear message: pd.concat on an empty list would otherwise
# raise an opaque "No objects to concatenate" ValueError.
if not csv_paths:
    raise FileNotFoundError(
        f"No statcast_*.csv files found under {statcast_dir.resolve()}"
    )

# Read each file and stack them into a single frame with a fresh 0..n-1 index.
df_list = [pd.read_csv(path) for path in csv_paths]
df = pd.concat(df_list, ignore_index=True)

In [25]:
# Derive the pitching team and its opponent for every row in one pass.
# `assign` returns a new frame, so `df` itself is left untouched.
team_df = df.assign(
    # The home side pitches during the top half of an inning,
    # the away side during the bottom half.
    team=lambda pitches: np.where(
        pitches["inning_topbot"].eq("Top"),
        pitches["home_team"],
        pitches["away_team"],
    ),
    # Whichever club is not pitching is the opponent.
    opp=lambda pitches: np.where(
        pitches["team"].eq(pitches["home_team"]),
        pitches["away_team"],
        pitches["home_team"],
    ),
)

In [26]:
# Order every row by game flow inside each (game, team) pairing.
game_flow_keys = [
    "game_pk", "team", "inning", "inning_topbot", "at_bat_number", "pitch_number"
]
team_df_sorted = team_df.sort_values(by=game_flow_keys)

# The pitcher on each team's earliest row in a game is taken as its starter.
# Result columns: game_pk, team, starter_pitcher.
starting_pitchers = (
    team_df_sorted
    .groupby(["game_pk", "team"], as_index=False)
    .agg(starter_pitcher=("pitcher", "first"))
)

In [27]:
# Pitch descriptions counted as a swing: whiffs, fouls and balls put in play.
# NOTE(review): bunt-attempt descriptions are not listed here -- confirm that
# excluding them from swings is intended.
swing_descriptions = {
    "swinging_strike", "swinging_strike_blocked", "foul", "foul_tip",
    "hit_into_play", "hit_into_play_no_out", "hit_into_play_score"
}

# Subset of the swings above on which the bat touched the ball.
contact_descriptions = {
    "foul", "foul_tip", "hit_into_play", "hit_into_play_no_out", "hit_into_play_score"
}

# Swings that missed the ball entirely.
whiff_descriptions = {"swinging_strike", "swinging_strike_blocked"}

# A strikeout that also produces a double play is recorded under its own
# event name; matching only "strikeout" undercounts the pitcher's strikeouts.
strikeout_events = {"strikeout", "strikeout_double_play"}

# Each PA = unique batter-inning-at_bat combo
team_df["pa_key"] = (
    team_df["batter"].astype(str)
    + "_" + team_df["inning"].astype(str)
    + "_" + team_df["at_bat_number"].astype(str)
)

# Build 0/1 indicator columns once, on a narrow copy holding only the columns
# the aggregation needs. Summing them with the built-in "sum" gives the same
# counts as per-group Python lambdas without calling Python for every group.
# Missing events/descriptions compare as False and therefore count as 0.
pitch_flags = team_df[
    [
        "game_pk", "pitcher", "stand", "p_throws", "game_date", "team", "opp",
        "pitch_number", "pa_key", "events", "description",
    ]
].assign(
    is_strikeout=lambda d: d["events"].isin(strikeout_events).astype("int64"),
    # NOTE(review): only the plain "walk" event is counted, as before; confirm
    # whether intentional walks should be included as well.
    is_walk=lambda d: d["events"].eq("walk").astype("int64"),
    is_called_strike=lambda d: d["description"].eq("called_strike").astype("int64"),
    is_whiff=lambda d: d["description"].isin(whiff_descriptions).astype("int64"),
    is_swing=lambda d: d["description"].isin(swing_descriptions).astype("int64"),
    is_contact=lambda d: d["description"].isin(contact_descriptions).astype("int64"),
)

# One row per (game, pitcher, batter handedness).
pitcher_vs_handedness = (
    pitch_flags.groupby(["game_pk", "pitcher", "stand"])  # "stand" = batter handedness
    .agg(
        pitcher_handedness=("p_throws", "first"),
        game_date=("game_date", "first"),
        team=("team", "first"),
        opp=("opp", "first"),
        total_pitches=("pitch_number", "count"),
        batters_faced=("pa_key", "nunique"),
        strikeouts=("is_strikeout", "sum"),
        walks=("is_walk", "sum"),
        called_strikes=("is_called_strike", "sum"),
        swinging_strikes=("is_whiff", "sum"),
        swings=("is_swing", "sum"),
        contact_swings=("is_contact", "sum"),
    )
    .reset_index()
    .rename(columns={"stand": "batter_stance"})
)

# Add starter flag: 1 if this pitcher was the first for their team in the game.
# validate="many_to_one" raises if starting_pitchers ever holds more than one
# row per (game_pk, team), which would otherwise silently duplicate log rows.
pitcher_vs_handedness = pitcher_vs_handedness.merge(
    starting_pitchers,
    on=["game_pk", "team"],
    how="left",
    validate="many_to_one",
)

# Rows with no matching starter compare as False and are flagged 0.
pitcher_vs_handedness["is_starter"] = (
    pitcher_vs_handedness["pitcher"].eq(pitcher_vs_handedness["starter_pitcher"])
).astype(int)

# The helper column is only needed to derive the flag.
pitcher_vs_handedness = pitcher_vs_handedness.drop(columns=["starter_pitcher"])

pitcher_vs_handedness

Unnamed: 0,game_pk,pitcher,batter_stance,pitcher_handedness,game_date,team,opp,total_pitches,batters_faced,strikeouts,walks,called_strikes,swinging_strikes,swings,contact_swings,is_starter
0,661032,543238,L,L,2022-04-26,CLE,LAA,2,2,0,0,0,0,2,2,0
1,661032,543238,R,L,2022-04-26,CLE,LAA,12,2,0,1,4,0,1,1,0
2,661032,571901,L,L,2022-04-26,LAA,CLE,6,1,0,0,0,0,3,3,0
3,661032,571901,R,L,2022-04-26,LAA,CLE,9,2,1,0,2,1,4,3,0
4,661032,623474,L,R,2022-04-26,LAA,CLE,5,1,0,0,0,0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125925,775345,676684,R,R,2024-10-01,DET,HOU,8,2,2,0,4,0,3,3,0
125926,775345,687911,L,L,2024-10-01,HOU,DET,11,2,1,0,4,1,2,1,0
125927,775345,687911,R,L,2024-10-01,HOU,DET,1,1,0,0,0,0,1,1,0
125928,775345,689225,L,R,2024-10-01,DET,HOU,9,2,0,0,1,2,5,3,0


In [28]:
# Write the aggregated pitcher game logs to CSV without the index column.
output_path = Path("../../data/processed/pitcher_game_logs.csv")
# to_csv does not create missing directories, so make sure the target folder
# exists (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
pitcher_vs_handedness.to_csv(output_path, index=False)