# Batter Game Logs

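Transforms the Statcast pitch-level datasets into batter game logs: one row per game, batter, and pitcher matchup, with counts of pitches seen, plate appearances, and outcomes.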

In [24]:
import pandas as pd
from pathlib import Path

In [26]:
# Get all statcast CSV paths (recursive search)
statcast_dir = Path("../../data/historical/statcast")
# rglob yields paths in arbitrary, filesystem-dependent order; sort them so the
# row order of the concatenated frame is the same on every run and machine.
csv_paths = sorted(statcast_dir.rglob("statcast_*.csv"))
if not csv_paths:
    # Fail with the searched location instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        f"No statcast_*.csv files found under {statcast_dir.resolve()}"
    )

# Load and concatenate
df_list = [pd.read_csv(path) for path in csv_paths]
df = pd.concat(df_list, ignore_index=True)

In [27]:

# Assign batter's team based on inning context
def get_batter_team(row):
    """Return the batting team for one pitch row.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys ``"inning_topbot"``, ``"away_team"``
        and ``"home_team"`` (for example a DataFrame row).

    Returns
    -------
    The value of ``row["away_team"]`` when ``row["inning_topbot"]`` equals
    ``"Top"``; otherwise the value of ``row["home_team"]``.
    """
    if row["inning_topbot"] == "Top":
        return row["away_team"]
    return row["home_team"]

# Vectorized equivalent of applying get_batter_team to every row: keep away_team
# where the half-inning is "Top", otherwise take home_team. This avoids one
# Python-level function call per pitch row.
df["batter_team"] = df["away_team"].where(df["inning_topbot"] == "Top", df["home_team"])

# Helper for counting how often a given value occurs in a column
def is_event(event_col, target):
    """Count the entries of ``event_col`` that are equal to ``target``.

    Parameters
    ----------
    event_col : array-like supporting element-wise ``==`` and ``.sum()``
        For example a pandas Series of event names.
    target : object
        The value to compare each entry against.

    Returns
    -------
    The number of entries for which ``entry == target`` is true.
    """
    matches = event_col == target
    return matches.sum()

# Pitch descriptions counted as contact swings by the aggregation below.
contact_descriptions = {
    "foul",
    "foul_tip",
    "hit_into_play",
    "hit_into_play_no_out",
    "hit_into_play_score",
}

# Pitch descriptions counted as swings: every contact description plus the
# two swinging-strike descriptions.
swing_descriptions = contact_descriptions | {
    "swinging_strike",
    "swinging_strike_blocked",
}

# Group and aggregate
#
# Each statistic is first computed as a 0/1 flag per pitch row with vectorized
# comparisons, and the groupby then uses the built-in "sum". This replaces the
# per-group Python lambdas, which are called once per group per statistic.
_events = df["events"]
_descriptions = df["description"]
_pitch_flags = {
    # A strikeout that also produces a double play is recorded as
    # "strikeout_double_play"; it still counts as a strikeout.
    "is_strikeout": _events.isin(["strikeout", "strikeout_double_play"]),
    "is_walk": _events.isin(["walk", "intent_walk"]),
    "is_single": _events == "single",
    "is_double": _events == "double",
    "is_triple": _events == "triple",
    "is_home_run": _events == "home_run",
    "is_swinging_strike": _descriptions.isin(["swinging_strike", "swinging_strike_blocked"]),
    "is_called_strike": _descriptions == "called_strike",
    "is_swing": _descriptions.isin(swing_descriptions),
    "is_contact_swing": _descriptions.isin(contact_descriptions),
}
_group_keys = ["game_pk", "batter", "pitcher"]
_carried_columns = ["batter_team", "game_date", "stand", "p_throws", "description", "at_bat_number"]

grouped = (
    # Select only the columns the aggregation reads so .assign() does not copy
    # the whole raw frame; df itself is left unmodified.
    df[_group_keys + _carried_columns]
    # Cast flags to int64 so the summed columns are integer counts.
    .assign(**{name: flag.astype("int64") for name, flag in _pitch_flags.items()})
    .groupby(_group_keys)
    .agg(
        team=("batter_team", "first"),
        game_date=("game_date", "first"),
        batter_stance=("stand", "first"),
        pitcher_handedness=("p_throws", "first"),
        # "count" counts rows with a non-null description
        total_pitches_seen=("description", "count"),
        # Distinct at_bat_number values within the (game, batter, pitcher) group
        plate_appearances=("at_bat_number", "nunique"),
        strikeouts=("is_strikeout", "sum"),
        walks=("is_walk", "sum"),
        singles=("is_single", "sum"),
        doubles=("is_double", "sum"),
        triples=("is_triple", "sum"),
        home_runs=("is_home_run", "sum"),
        swinging_strikes=("is_swinging_strike", "sum"),
        called_strikes=("is_called_strike", "sum"),
        swings=("is_swing", "sum"),
        contact_swings=("is_contact_swing", "sum"),
    )
    .reset_index()
)

# Preview the result
grouped


Unnamed: 0,game_pk,batter,pitcher,team,game_date,batter_stance,pitcher_handedness,total_pitches_seen,plate_appearances,strikeouts,walks,singles,doubles,triples,home_runs,swinging_strikes,called_strikes,swings,contact_swings
0,661032,435559,660853,LAA,2022-04-26,R,R,3,1,0,0,0,0,0,0,0,1,1,1
1,661032,435559,663474,LAA,2022-04-26,R,R,3,2,0,0,0,0,0,0,0,0,2,2
2,661032,543685,543238,LAA,2022-04-26,R,L,6,1,0,0,0,0,0,0,0,2,1,1
3,661032,543685,663474,LAA,2022-04-26,R,R,16,3,0,0,0,2,0,0,0,1,11,11
4,661032,545361,543238,LAA,2022-04-26,R,L,6,1,0,1,0,0,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373531,775345,690993,623352,DET,2024-10-01,L,L,6,1,0,0,0,0,0,0,0,1,3,3
373532,775345,690993,669854,DET,2024-10-01,L,R,9,1,0,1,0,0,0,0,0,0,5,5
373533,775345,700242,664285,DET,2024-10-01,L,L,7,2,1,0,1,0,0,0,2,1,3,1
373534,775345,700242,669854,DET,2024-10-01,L,R,4,1,1,0,0,0,0,0,1,1,2,1


In [28]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing.
output_path = Path("../../data/processed/batter_game_logs.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
grouped.to_csv(output_path, index=False)