In [1]:

# libraries
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import gc
from matplotlib import style
from pybaseball import statcast
import progressbar 
import warnings
import psycopg2
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


# Date window requested from pybaseball's statcast() for this analysis.
SEASON_START = '2024-01-01'
SEASON_END = '2024-09-30'
df_sc = statcast(start_dt=SEASON_START, end_dt=SEASON_END)



This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████| 200/200 [03:16<00:00,  1.02it/s]


In [4]:
# Distinct values found in the `events` column (missing values included).
event_labels = df_sc['events'].unique()
event_labels

array(['field_out', nan, 'strikeout', 'single', 'home_run', 'double',
       'walk', 'hit_by_pitch', 'sac_fly', 'grounded_into_double_play',
       'force_out', 'triple', 'sac_bunt', 'fielders_choice_out',
       'catcher_interf', 'truncated_pa', 'fielders_choice', 'field_error',
       'double_play', 'strikeout_double_play', 'triple_play',
       'sac_fly_double_play', None], dtype=object)

In [13]:


# Spray angle in degrees, derived from the hit coordinates (hc_x, hc_y)
# relative to the reference point (125.42, 200).
# NOTE(review): the 0.75 factor is applied inside arctan (to the ratio), not to
# the resulting angle — confirm that placement is intended.
horizontal_offset = df_sc['hc_x'] - 125.42
depth_offset = 200 - df_sc['hc_y']
df_sc['spray_deg'] = np.degrees(np.arctan(horizontal_offset / depth_offset * 0.75))

# Disabled idea: flip the sign of the angle for left-handed hitters.
# df_sc['spray_deg_adj'] = np.where(df_sc['stand'] == 'L', df_sc['spray_deg'] * -1, df_sc['spray_deg'])

# Every event listed here (missing events included) is relabelled 'out';
# any other event keeps its original label in `outcome`.
events_labelled_out = [
    'field_out', 'strikeout',
    'walk', 'hit_by_pitch',
    'force_out', 'sac_bunt', 'fielders_choice_out',
    'catcher_interf', 'truncated_pa', 'fielders_choice', 'field_error',
    'double_play', 'strikeout_double_play', 'triple_play',
    'sac_fly_double_play', np.nan, None,
]
is_labelled_out = df_sc['events'].isin(events_labelled_out)
df_sc['outcome'] = np.where(is_labelled_out, 'out', df_sc['events'])

# Define buckets
def get_ev_bucket(v):
    """Return the 5-wide bucket label containing ``v``, e.g. 92.3 -> '90-95'.

    Missing values (anything ``pd.isna`` reports as NA) yield ``None``.
    """
    if pd.isna(v):
        return None
    # Floor to the nearest multiple of 5 to get the bucket's lower edge.
    lower_edge = int(v // 5) * 5
    upper_edge = lower_edge + 5
    return f"{lower_edge}-{upper_edge}"

def get_launch_angle_bucket(v):
    """Return the 10-wide bucket label containing ``v``, e.g. 25 -> '20-30'.

    Negative values floor downward, so -3.5 falls in '-10-0'.
    Missing values (anything ``pd.isna`` reports as NA) yield ``None``.
    """
    if pd.isna(v):
        return None
    # Floor to the nearest multiple of 10 to get the bucket's lower edge.
    lower_edge = int(v // 10) * 10
    upper_edge = lower_edge + 10
    return f"{lower_edge}-{upper_edge}"

def bucket_spray_angle(angle):
    """Map a spray angle (degrees) to one of five labelled ranges.

    Each range includes its upper bound: -20 falls in '-45 to -20',
    5 falls in '-5 to 5'. The outermost ranges are open-ended: anything
    at or below -20 is '-45 to -20' and anything above 20 is '20 to 45'.

    Returns ``np.nan`` for missing input.
    """
    if pd.isna(angle):
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    if angle <= -20:
        return '-45 to -20'
    if angle <= -5:
        return '-20 to -5'
    if angle <= 5:
        return '-5 to 5'
    if angle <= 20:
        return '5 to 20'
    return '20 to 45'



# Add one bucket-label column per measurement: (new column, source column, bucketing function).
bucket_specs = (
    ('ev_bucket', 'launch_speed', get_ev_bucket),
    ('launch_angle_bucket', 'launch_angle', get_launch_angle_bucket),
    ('spray_angle_bucket', 'spray_deg', bucket_spray_angle),
)
for bucket_col, source_col, bucketer in bucket_specs:
    df_sc[bucket_col] = df_sc[source_col].apply(bucketer)



In [14]:
# Distinct labels present in the derived `outcome` column.
outcome_labels = df_sc['outcome'].unique()
outcome_labels


array(['out', 'single', 'home_run', 'double', 'sac_fly',
       'grounded_into_double_play', 'triple'], dtype=object)

In [10]:


# Work on a sorted copy that excludes home-run and force-out rows.
df_sc_test = (
    df_sc[~df_sc['events'].isin(['home_run', 'force_out'])]
    .copy()
    .sort_values(by=['game_pk', 'at_bat_number'], ascending=True)
)

# Base occupancy on the FOLLOWING row. Despite the "_lagged" suffix this is a
# look-ahead (shift(-1)). It is taken within each game so the last row of one
# game is never compared with the first row of a different game.
# NOTE(review): the previewed frame repeats at_bat_number across several rows,
# so the following row may belong to the same plate appearance — confirm that
# comparing against it is intended.
base_cols = ['on_1b', 'on_2b', 'on_3b']
next_row_state = df_sc_test.groupby('game_pk')[base_cols].shift(-1)
for base_col in base_cols:
    df_sc_test[f'{base_col}_lagged'] = next_row_state[base_col]

# 0/1 flags: is the base occupied on the current row?
for base_col in base_cols:
    df_sc_test[f'is_{base_col}'] = np.where(df_sc_test[base_col].isnull(), 0, 1)

# Runs scored on the row, from the before/after score columns of both teams.
df_sc_test['home_run_scored'] = df_sc_test['post_home_score'] - df_sc_test['home_score']
df_sc_test['away_run_scored'] = df_sc_test['post_away_score'] - df_sc_test['away_score']
df_sc_test['total_runs_scored'] = df_sc_test['home_run_scored'] + df_sc_test['away_run_scored']

# Force float64 so missing runners become NaN and comparisons are plain numpy.
cols = ['on_1b', 'on_2b', 'on_3b', 'on_1b_lagged', 'on_2b_lagged', 'on_3b_lagged']
df_sc_test[cols] = df_sc_test[cols].astype('float64')

# Current-row runners, next-row base occupants, and runs as numpy arrays.
runner_1b = df_sc_test['on_1b'].to_numpy()
runner_2b = df_sc_test['on_2b'].to_numpy()
runner_3b = df_sc_test['on_3b'].to_numpy()

on_1b_next = df_sc_test['on_1b_lagged'].to_numpy()
on_2b_next = df_sc_test['on_2b_lagged'].to_numpy()
on_3b_next = df_sc_test['on_3b_lagged'].to_numpy()

total_runs_scored = df_sc_test['total_runs_scored'].to_numpy()


def _moved_to(runner, destination_next):
    """Mask: the runner value is present and equals the next-row occupant of the destination base."""
    return (~np.isnan(runner)) & (~np.isnan(destination_next)) & (runner == destination_next)


def _scored(runner):
    """Mask: the runner is present, is on no base in the next row, and runs were scored on this row."""
    return (
        (~np.isnan(runner)) &
        (runner != on_1b_next) &
        (runner != on_2b_next) &
        (runner != on_3b_next) &
        (total_runs_scored > 0)
    )


# Runner on 1B: 1 = reached 2B, 2 = reached 3B, 3 = scored, NaN = no runner / no match.
cond_1b_none = np.isnan(runner_1b)
cond_1b_to_2b = _moved_to(runner_1b, on_2b_next)
cond_1b_to_3b = _moved_to(runner_1b, on_3b_next)
cond_1b_scored = _scored(runner_1b)

# np.nan (not np.NaN): the capitalised alias was removed in NumPy 2.0.
df_sc_test['runner_1b_move'] = np.select(
    [cond_1b_none, cond_1b_to_2b, cond_1b_to_3b, cond_1b_scored],
    [np.nan, 1, 2, 3],
    default=np.nan
)

# Runner on 2B: 1 = reached 3B, 2 = scored, NaN = no runner / no match.
cond_2b_none = np.isnan(runner_2b)
cond_2b_to_3b = _moved_to(runner_2b, on_3b_next)
cond_2b_scored = _scored(runner_2b)

df_sc_test['runner_2b_move'] = np.select(
    [cond_2b_none, cond_2b_to_3b, cond_2b_scored],
    [np.nan, 1, 2],
    default=np.nan
)

# Runner on 3B: 1 = scored, NaN = no runner / no match.
cond_3b_none = np.isnan(runner_3b)
cond_3b_scored = _scored(runner_3b)

df_sc_test['runner_3b_move'] = np.select(
    [cond_3b_none, cond_3b_scored],
    [np.nan, 1],
    default=np.nan
)

# Slim per-row table used by the aggregation below and by later cells.
runner_movement_df = df_sc_test[
    ['game_pk', 'at_bat_number', 'outcome', 'des', 'total_runs_scored',
     'is_on_1b', 'is_on_2b', 'is_on_3b',
     'runner_1b_move', 'runner_2b_move', 'runner_3b_move',
     'ev_bucket', 'launch_angle_bucket', 'spray_angle_bucket', 'outs_when_up']
].sort_values(by=['game_pk', 'at_bat_number'], ascending=True)

# Mean bases advanced and number of non-missing moves per outcome, per runner slot.
runner_movement_aggregate = (
    runner_movement_df
    .groupby(['outcome'])[
        ['runner_1b_move', 'runner_2b_move', 'runner_3b_move']
    ]
    .agg(['mean', 'count'])
    .reset_index()
)

runner_movement_aggregate



Unnamed: 0_level_0,outcome,runner_1b_move,runner_1b_move,runner_2b_move,runner_2b_move,runner_3b_move,runner_3b_move
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,mean,count,mean,count
0,catcher_interf,1.083333,36,1.0,10,1.0,1
1,double,2.430739,2505,1.982699,1445,1.0,804
2,hit_by_pitch,1.050459,872,1.038462,312,1.0,103
3,out,1.190775,3664,1.050267,3561,1.0,2063
4,sac_bunt_double_play,,0,,0,,0
5,single,1.382007,7814,1.628166,4225,1.0,2543
6,strikeout,,0,,0,,0
7,strikeout_double_play,,0,,0,,0
8,triple,3.0,201,2.0,132,1.0,67
9,truncated_pa,3.0,8,2.0,9,1.0,10


In [11]:

# Display the per-row runner movement table (pandas truncates the rendered rows).
runner_movement_df






Unnamed: 0,game_pk,at_bat_number,outcome,des,total_runs_scored,is_on_1b,is_on_2b,is_on_3b,runner_1b_move,runner_2b_move,runner_3b_move,ev_bucket,launch_angle_bucket,spray_angle_bucket,outs_when_up
3166,632169,1,walk,Garrett Hampson walks.,0,0,0,0,,,,,,,0
3231,632169,1,,Garrett Hampson walks.,0,0,0,0,,,,,,,0
3298,632169,1,,Garrett Hampson walks.,0,0,0,0,,,,,,,0
3431,632169,1,,Garrett Hampson walks.,0,0,0,0,,,,,,,0
3636,632169,1,,Garrett Hampson walks.,0,0,0,0,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1516,748868,65,,A.J. Ewing strikes out swinging.,0,0,0,0,,,,,,,0
1127,748868,66,out,"Omar Narváez grounds out, shortstop Roderick A...",0,0,0,0,,,,90-95,0-10,-20 to -5,1
957,748868,67,strikeout,Jake Zitella called out on strikes.,0,0,0,0,,,,,,,2
997,748868,67,,Jake Zitella called out on strikes.,0,0,0,0,,,,,,,2


In [14]:
# Long format: one row per (input row, runner slot), with the slot's move value.
runner_movement_long = (
    runner_movement_df
    .melt(
        id_vars=['outcome', 'launch_angle_bucket', 'spray_angle_bucket'],
        value_vars=['runner_1b_move', 'runner_2b_move', 'runner_3b_move'],
        var_name='runner',
        value_name='move'
    )
)

# Move value -> output column name. Also fixes the set and order of the move
# columns so the CSV schema does not depend on which moves occur in the data.
move_column_names = {
    1.0: 'plus_1',
    2.0: 'plus_2',
    3.0: 'plus_3'
}

# Count rows per (outcome, launch angle, spray angle, runner slot, move), then
# pivot the move values into columns. Rows with a missing grouping key
# (including a missing move) are dropped by groupby's default NA handling.
runner_transition_matrix = (
    runner_movement_long
    .groupby(['outcome', 'launch_angle_bucket', 'spray_angle_bucket', 'runner', 'move'])
    .size()
    .unstack(fill_value=0)
    # Guarantee all three move columns exist even if a move value never occurs.
    .reindex(columns=list(move_column_names), fill_value=0)
    .reset_index()
    .rename(columns=move_column_names)
)

# n = total observed moves for the row, summed over the explicit move columns.
runner_transition_matrix['n'] = runner_transition_matrix[list(move_column_names.values())].sum(axis=1)

# NOTE(review): absolute, machine-specific output path — consider a configurable data directory.
runner_transition_matrix.to_csv('/Users/loganmottley/Desktop/Projects/go-baseball/data/runner_transition_matrix.csv', index=False)

runner_transition_matrix


move,outcome,launch_angle_bucket,spray_angle_bucket,runner,plus_1,plus_2,plus_3,n
0,double,-10-0,-45 to -20,runner_1b_move,0,29,22,51
1,double,-10-0,-45 to -20,runner_2b_move,0,33,0,33
2,double,-10-0,-45 to -20,runner_3b_move,21,0,0,21
3,double,-10-0,20 to 45,runner_1b_move,0,7,8,15
4,double,-10-0,20 to 45,runner_2b_move,0,15,0,15
...,...,...,...,...,...,...,...,...
626,triple,30-40,5 to 20,runner_2b_move,0,3,0,3
627,triple,30-40,5 to 20,runner_3b_move,4,0,0,4
628,triple,40-50,20 to 45,runner_1b_move,0,0,1,1
629,triple,40-50,5 to 20,runner_1b_move,0,0,1,1
