In [1]:
from collections.abc import Callable
import os
from pybaseball import statcast, playerid_lookup, playerid_reverse_lookup
import pandas as pd

In [2]:
# Sanity check: strftime("%Y-%m-%d") on a tz-aware Timestamp yields a plain YYYY-MM-DD string.
pd.Timestamp('2025-02-17', tz='America/New_York').strftime("%Y-%m-%d")

'2025-02-17'

In [3]:
class FetchStatcast:
    """Fetch Statcast pitch data through a local parquet file cache.

    Reads from and writes to the cache during fetches by default, preventing
    needless API calls and speeding up repeated runs.

    Parameters
    ----------
    local_file_cache:
        Directory holding one cache file per fetched day. It is created on the
        first cache write if it does not exist yet.
    """
    def __init__(self, local_file_cache: str):
        self.local_file_cache = local_file_cache

    def statcast(
            self,
            start: str,
            end: str,
            preprocess: Callable[[pd.DataFrame], pd.DataFrame],
            read_from_cache=True,
            write_to_cache=True
        ) -> pd.DataFrame:
        """Fetch pitch level data from statcast, one calendar day at a time.

        Every day in ``[start, end]`` (both inclusive) is fetched exactly once,
        either from ``local_file_cache`` or from the Statcast API.

        Parameters
        ----------
        start:
            YYYY-MM-DD formatted time string; first day to fetch.
        end:
            YYYY-MM-DD formatted time string; last day to fetch. May equal
            ``start`` to fetch a single day.
        preprocess:
            A callable for transforming slices of statcast data.

            The fetch process works on one-day chunks of statcast data. This
            function is called on each chunk before aggregating all the data.
        read_from_cache:
            Whether to read from local_file_cache if available.
        write_to_cache:
            Whether to write fetched data to cache.

        Returns
        -------
        pd.DataFrame
            The preprocessed daily chunks concatenated in date order. Each
            chunk keeps its own index, so index labels may repeat across days.

        Raises
        ------
        ValueError
            If the range contains no days (``start`` is after ``end``).
        """
        dates = pd.date_range(start, end)
        if len(dates) == 0:
            raise ValueError(
                f"empty date range: start={start!r} is after end={end!r}"
            )
        # Request [day, day] rather than [day, next_day]: the end date is
        # inclusive, so overlapping windows would download and return every
        # interior day twice.
        data = [
            preprocess(self._fetch_data(day, day, read_from_cache, write_to_cache))
            for day in dates
        ]
        return pd.concat(data)

    def _fetch_data(self, start: pd.Timestamp, end: pd.Timestamp, use_cache: bool, write_to_cache: bool):
        """Fetch a single slice of data through a cache file.

        Returns the cached parquet file when ``use_cache`` is set and the file
        exists; otherwise calls the Statcast API and, if ``write_to_cache`` is
        set, stores the result for next time.
        """
        filename = self._file_name(start, end)
        # Test the flag first so no filesystem lookup happens when caching is off.
        if use_cache and os.path.exists(filename):
            return pd.read_parquet(filename)

        df = statcast(start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d"))
        if write_to_cache:
            # Create the cache directory on demand; without it the write would
            # fail only after the download had already been paid for.
            os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
            df.to_parquet(filename)
        return df

    def _file_name(self, start: pd.Timestamp, end: pd.Timestamp):
        """Return the cache path for the slice covering ``start`` to ``end``."""
        start_str = start.strftime("%Y-%m-%d")
        end_str = end.strftime("%Y-%m-%d")
        name = f"statcast_{start_str}_{end_str}"
        return os.path.join(self.local_file_cache, name)


In [4]:
# Fetcher whose cache files live in the relative 'statcast_data' directory.
fetch_statcast = FetchStatcast(local_file_cache='statcast_data')

In [None]:
# Fetch all pitches from the 2024 regular season. The early "Seoul Series" is omitted.
# No per-chunk preprocessing is applied; drop_duplicates() removes any repeated rows.
df = (
    fetch_statcast
    .statcast(start="2024-03-28", end="2024-09-29", preprocess=lambda chunk: chunk)
    .drop_duplicates()
)

In [97]:
# Summarize the fetched frame: row count, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 708248 entries, 1728 to 4035
Columns: 113 entries, pitch_type to arm_angle
dtypes: Float64(37), Int64(51), datetime64[ns](1), object(24)
memory usage: 675.4+ MB


In [79]:
def add_at_bat_id(df):
    """Build a per-at-bat identifier from the game id and the at-bat number.

    The ``game_pk`` values are shifted left by 10 bits and ``at_bat_number`` is
    added to the result, so ids from different games cannot collide as long as
    at-bat numbers stay below 1024 (2**10).

    Despite the name, ``df`` is not modified: the ids are returned and the
    caller decides where to store them.
    """
    # Series has no `<<` operator, so shift on the underlying array.
    shifted_game_id = df["game_pk"].to_numpy() << 10
    return shifted_game_id + df["at_bat_number"]
# Compute the at-bat id of every pitch in the fetched frame.
# NOTE(review): `at_bat` is not referenced by the cells visible here — confirm it is still needed.
at_bat = add_at_bat_id(df)


In [98]:

def filter_to_(df: pd.DataFrame) -> pd.DataFrame:
    """Pair each pitch thrown in a 1-1 count with the pitch that immediately follows it.

    Only the identifying columns, pitch type and count columns of ``df`` are
    kept. Pitches thrown with 1 ball and 1 strike are joined, within the same
    at-bat, to pitches thrown in a count that can follow 1-1 (2-1 or 1-2); the
    match is then narrowed to the pitch whose ``pitch_number`` is exactly one
    higher.

    Returns one row per 1-1 pitch that has such a successor. The successor's
    columns carry the ``_following`` suffix (``pitch_type_following``,
    ``pitch_number_following``). ``df`` itself is left untouched.
    """
    kept_columns = ['game_date', 'game_pk', 'batter', 'pitcher', 'at_bat_number', 'pitch_number', 'pitch_type', 'balls', 'strikes']
    pitches = df[kept_columns].copy()
    pitches['at_bat_id'] = add_at_bat_id(pitches)

    one_strike = pitches.strikes == 1
    two_strikes = pitches.strikes == 2
    one_ball = pitches.balls == 1
    two_balls = pitches.balls == 2

    in_1_1_count = one_strike & one_ball
    in_next_count = (one_strike & two_balls) | (two_strikes & one_ball)

    first_pitches = pitches.loc[in_1_1_count, :]
    candidates = pitches.loc[in_next_count, ['pitch_type', 'at_bat_id', 'pitch_number']]

    # A count can repeat within an at-bat, so the merge may attach several
    # candidates to one 1-1 pitch; the query keeps only the very next pitch.
    # Rows without any candidate have a missing pitch_number_following and are
    # dropped by the same comparison.
    paired = first_pitches.merge(candidates, on='at_bat_id', how='left', suffixes=('', '_following'))
    return paired.query('pitch_number == pitch_number_following - 1')

# Pair every 1-1 count pitch in the fetched frame with the pitch that followed it.
results = filter_to_(df)


In [99]:
# Show the identifying columns next to the 1-1 pitch type and the type of the following pitch.
results[['game_pk', 'batter', 'pitcher', 'at_bat_number', 'pitch_type', 'pitch_type_following']]

Unnamed: 0,game_pk,batter,pitcher,at_bat_number,pitch_type,pitch_type_following
1,747223,666134,656464,70,SL,SL
2,747223,572233,641755,69,SL,SL
3,747223,606466,641755,67,SL,SL
4,747223,682998,641755,66,SL,FF
9,747223,660707,573009,61,CH,CH
...,...,...,...,...,...,...
80164,744798,608336,605400,15,KC,FF
80165,744798,686611,605400,14,KC,SI
80166,744798,624641,663623,12,FF,FF
80167,744798,695578,605400,8,CH,SI
