In [None]:
from collections.abc import Callable
import os
import pandas as pd
from statcast import FetchStatcast

In [None]:
# Build the Statcast fetcher provided by the project-local `statcast` module.
# NOTE(review): 'statcast_data' is presumably the directory where downloads are
# stored/cached — confirm against FetchStatcast's constructor.
fetch_statcast = FetchStatcast('statcast_data')

In [None]:
# Fetch all data from the 2024 regular season (2024-03-28 through 2024-09-29).
# The early "Seoul Series" is omitted.
# `preprocess` is passed an identity function — presumably to bypass any default
# preprocessing in FetchStatcast.statcast (TODO confirm). Exact duplicate rows
# are then removed from the returned frame.
df = fetch_statcast.statcast("2024-03-28", "2024-09-29", preprocess=lambda x: x).drop_duplicates()

In [11]:
# Print a summary of the fetched frame: index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 708248 entries, 1728 to 4035
Columns: 113 entries, pitch_type to arm_angle
dtypes: Float64(37), Int64(51), datetime64[ns](1), object(24)
memory usage: 675.4+ MB


In [None]:
def add_at_bat_id(df):
    """Build a per-at-bat identifier from ``game_pk`` and ``at_bat_number``.

    The game id is shifted left by 10 bits and the at-bat number is added to
    it. Ids from different games therefore cannot collide as long as every
    at-bat number lies in ``[0, 1024)``.

    Args:
        df: Frame holding integer ``game_pk`` and ``at_bat_number`` columns.

    Returns:
        The element-wise value ``(game_pk << 10) + at_bat_number``, indexed
        like ``df`` (the frame itself is not modified).
    """
    shifted_game_ids = df["game_pk"].to_numpy() << 10
    return shifted_game_ids + df["at_bat_number"]
# Compute the at-bat ids for the full frame.
# NOTE(review): `at_bat` is not referenced by any other cell visible here.
at_bat = add_at_bat_id(df)


In [None]:

def merge_to_1_1_and_following(df: pd.DataFrame) -> pd.DataFrame:
    """Pair each pitch thrown in a 1-1 count with the pitch thrown right after it.

    Pitches thrown with a 1-ball/1-strike count are joined, within the same
    at-bat, to pitches thrown in the counts that can come next (2-1 or 1-2).
    Only pairs whose pitch numbers are consecutive are kept, so a 1-1 pitch
    with no immediately following 2-1/1-2 pitch is dropped from the result.

    Args:
        df: Frame containing at least the columns ``game_date``, ``game_pk``,
            ``batter``, ``pitcher``, ``at_bat_number``, ``pitch_number``,
            ``pitch_type``, ``balls`` and ``strikes``. It is not modified.

    Returns:
        One row per retained 1-1 pitch with the selected columns, an
        ``at_bat_id`` column, and ``pitch_type_following`` /
        ``pitch_number_following`` describing the next pitch.
    """
    keep_columns = [
        'game_date', 'game_pk', 'batter', 'pitcher', 'at_bat_number',
        'pitch_number', 'pitch_type', 'balls', 'strikes',
    ]
    pitches = df[keep_columns].copy()
    pitches['at_bat_id'] = add_at_bat_id(pitches)

    one_strike = pitches.strikes == 1
    one_ball = pitches.balls == 1
    in_1_1 = one_strike & one_ball
    # Counts reachable directly from 1-1: a ball makes it 2-1, a strike 1-2.
    in_2_1_or_1_2 = (one_strike & (pitches.balls == 2)) | ((pitches.strikes == 2) & one_ball)

    one_one_pitches = pitches.loc[in_1_1, :]
    candidates = pitches.loc[in_2_1_or_1_2, ['pitch_type', 'at_bat_id', 'pitch_number']]
    paired = one_one_pitches.merge(
        candidates,
        on='at_bat_id',
        how='left',
        suffixes=('', '_following'),
    )
    # Several candidates can share a count (e.g. fouls at 1-2); keep only the
    # one thrown immediately after the 1-1 pitch.
    return paired.query('pitch_number == pitch_number_following - 1')

# Pair every 1-1 pitch in the fetched frame with the pitch that followed it.
results = merge_to_1_1_and_following(df)


In [None]:
# Human-readable names for Statcast `pitch_type` codes.
# 'PO' denotes a pitchout (a pickoff throw is not a pitch), and 'CS' (slow
# curve) is included so those pitches are not left unmapped.
pitch_type_map = {
    'CH': 'Changeup',
    'CS': "Slow Curve",
    'CU': 'Curveball',
    'EP': "Eephus",
    'FA': 'Fastball',
    'FC': 'Cutter',
    'FF': "Four-seam Fastball",
    'FO': "Forkball",
    'FS': "Splitter",
    'KC': "Knuckle-curve",
    'KN': "Knuckle-ball",
    'PO': "Pitchout",
    'SC': "Screwball",
    'SI': "Sinker",
    'SL': "Slider",
    'ST': "Sweeper",
    'SV': "Slurve"
}

In [None]:
# Keep the identifying columns and replace pitch codes with readable names.
# A code that is missing from `pitch_type_map` is kept as its raw code rather
# than becoming NaN, because groupby() drops NaN keys and those pitches would
# silently vanish from the summary below.
results_clean = (
    results[['game_pk', 'batter', 'pitcher', 'at_bat_number', 'pitch_type', 'pitch_type_following']]
    .assign(
        pitch_type=lambda frame: frame.pitch_type.map(
            lambda code: pitch_type_map.get(code, code)),
        pitch_type_following=lambda frame: frame.pitch_type_following.map(
            lambda code: pitch_type_map.get(code, code)))
)
# NOTE(review): describe() summarizes the remaining numeric columns, which look
# like identifiers (game_pk, batter, pitcher, at_bat_number) — the per-group
# counts are the meaningful part of this output.
results_clean.groupby('pitch_type').describe()

In [None]:
# Export the cleaned results to the current working directory.
# to_csv is called with its defaults, so the frame's index is written as the
# first (unnamed) column of the file.
file_name = "pitch_selection_in_one_one_counts.csv"
results_clean.to_csv(file_name)

In [None]:
# Second copy of the export, written into the assignment's data directory.
# The destination defaults to the original author's local checkout; set the
# PITCH_SELECTION_EXPORT_DIR environment variable to write somewhere else
# without editing the notebook. The directory must already exist.
directory = os.path.normpath(
    os.environ.get(
        "PITCH_SELECTION_EXPORT_DIR",
        "C:/Users/lenha/Repos/ua_ms_ds/phi_msds/info_526/assignments/data",
    )
)
path = os.path.join(directory, file_name)
results_clean.to_csv(path)