In [1]:

# libraries
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import gc
from matplotlib import style
from pybaseball import statcast
import progressbar 
import warnings
import psycopg2

# pd.options.mode.chained_assignment = None  # default='warn'
# warnings.simplefilter(action='ignore', category=FutureWarning)
# pd.options.display.float_format = '{:.3f}'.format


# Download Statcast data for 2024-01-01 through 2024-09-30 (a large, slow query).
df_sc = statcast(start_dt='2024-01-01', end_dt='2024-09-30')


# Spray angle in degrees, derived from the hit coordinates relative to the
# point (hc_x=125.42, hc_y=200).
# NOTE(review): the 0.75 factor scales the ratio *inside* arctan rather than the
# resulting angle -- confirm this is the intended calibration.
df_sc['spray_deg'] = np.degrees(
    np.arctan(
        df_sc['hc_x'].sub(125.42).div(df_sc['hc_y'].rsub(200)).mul(0.75)
    )
)

# Sign-flipped copy of the spray angle for batters standing 'L', so both
# handedness groups share one orientation.
df_sc['spray_deg_adj'] = np.where(
    df_sc['stand'].eq('L'),
    df_sc['spray_deg'].mul(-1),
    df_sc['spray_deg'],
)

# Events that are collapsed into the single label 'out'; every other event
# keeps its original value in the 'outcome' column.
out_events = [
    'field_out', 'force_out', 'grounded_into_double_play', 'sac_fly', 'field_error',
    'sac_bunt', 'fielders_choice', 'double_play', 'fielders_choice_out',
    'sac_fly_double_play', 'triple_play',
]
df_sc['outcome'] = np.where(df_sc['events'].isin(out_events), 'out', df_sc['events'])

# df_sc['zone'] = pd.to_numeric(df_sc['zone'], errors='coerce').astype(int)


This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████| 200/200 [02:31<00:00,  1.32it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


In [3]:
import pandas as pd
import numpy as np
from scipy.stats import skewnorm
from tqdm import tqdm

# Work on a copy so the bucket columns and row filtering applied below do not
# modify the downloaded df_sc frame.
df = df_sc.copy()

# Define buckets
def get_ev_bucket(v):
    """Return the 5-wide bucket label ("lower-upper") containing `v`.

    The lower bound is `v` floored to a multiple of 5 and the upper bound is
    lower + 5. Missing values (anything `pd.isna` reports) yield None.
    """
    if not pd.isna(v):
        lower = int(v // 5) * 5
        return f"{lower}-{lower + 5}"
    return None

def get_launch_angle_bucket(v):
    """Return the 10-wide bucket label ("lower-upper") containing `v`.

    The lower bound is `v` floored to a multiple of 10 (so negative values
    round towards minus infinity, e.g. -5 -> "-10-0"). Missing values
    (anything `pd.isna` reports) yield None.
    """
    if not pd.isna(v):
        lower = int(v // 10) * 10
        return f"{lower}-{lower + 10}"
    return None

# Label each row with its exit-velocity and launch-angle bucket.
df['ev_bucket'] = df['launch_speed'].apply(get_ev_bucket)
df['launch_angle_bucket'] = df['launch_angle'].apply(get_launch_angle_bucket)

# Drop rows that lack a spray angle or an outcome.
df = df.dropna(subset=['spray_deg', 'outcome'])

# Grouping levels from most to least specific: each level drops the last
# context column of the previous one, ending with [] (no extra grouping).
context_cols = ['outcome', 'zone', 'ev_bucket', 'launch_angle_bucket']
fallback_levels = [context_cols[:depth] for depth in range(len(context_cols), -1, -1)]

# Fit results keyed by (outer_key, level, key), plus the set of keys already fitted.
angle_fits, completed_keys = {}, set()

# One outer group per batter / season / batter side / pitcher hand.
outer_group_cols = ['batter', 'game_year', 'stand', 'p_throws']
outer_grouped = df.groupby(outer_group_cols)

# Minimum number of finite spray angles a group needs before a fit is attempted.
MIN_FIT_SAMPLES = 10


def _finite_spray_angles(values):
    """Return the finite numeric entries of `values` as a float64 NumPy array.

    Entries that cannot be parsed as numbers are coerced to missing and
    dropped; NaN and +/-inf are then removed with a single vectorised mask
    instead of a per-element Python lambda.
    """
    arr = pd.to_numeric(values, errors='coerce').dropna().to_numpy(dtype=np.float64)
    return arr[np.isfinite(arr)]


# Fit a skew-normal distribution to `spray_deg` for every outer group at every
# level in `fallback_levels`, storing one record per successful fit in
# `angle_fits` under the key (outer_key, level, key_values).
#
# NOTE(review): `skewnorm.fit` returns (shape, loc, scale). They are stored
# under 'skew', 'mean' and 'std' because later cells read those keys, but loc
# and scale are the skew-normal location/scale parameters and are not in
# general the distribution's mean and standard deviation.
for outer_key, outer_df in tqdm(outer_grouped, desc="Top-level groups"):
    outer_key = outer_key if isinstance(outer_key, tuple) else (outer_key,)

    for level_keys in fallback_levels:
        if level_keys:
            # groupby may yield a scalar key for a single column; normalise to a tuple.
            subgroups = (
                (key if isinstance(key, tuple) else (key,), group)
                for key, group in outer_df.groupby(level_keys)
            )
        else:
            # Global fallback: the whole outer group, keyed by the empty tuple.
            subgroups = [((), outer_df)]

        for key, group in subgroups:
            full_key_id = (outer_key, tuple(level_keys), key)

            # full_key_id already encodes the outer group and the level, so this
            # guard only matters if completed_keys holds entries from an earlier run.
            if full_key_id in completed_keys:
                continue

            angles = _finite_spray_angles(group['spray_deg'])
            if len(angles) < MIN_FIT_SAMPLES:
                continue

            try:
                shape, loc, scale = skewnorm.fit(angles)
                angle_fits[full_key_id] = {
                    'batter': outer_key[0],
                    'game_year': outer_key[1],
                    'stand': outer_key[2],
                    'p_throws': outer_key[3],
                    'keys': list(level_keys),
                    'key_values': key,
                    'skew': shape,
                    'mean': loc,
                    'std': scale,
                    'n': len(angles)
                }
                completed_keys.add(full_key_id)
            except Exception as e:
                # Best-effort: report the failed group and keep going.
                if level_keys:
                    print(f"Skipping {key} at level {level_keys} for group {outer_key} due to error: {e}")
                else:
                    print(f"Skipping global fallback for {outer_key} due to error: {e}")


Top-level groups: 100%|██████████| 2132/2132 [03:08<00:00, 11.33it/s]


In [4]:


# Context columns a fit can be keyed on; a level that omits one leaves it missing.
context_keys = ['outcome', 'zone', 'ev_bucket', 'launch_angle_bucket']

# Column order of the exported table.
expected_columns = [
    'game_year', 'batter', 'stand', 'p_throws',
    'outcome', 'zone', 'ev_bucket', 'launch_angle_bucket',
    'skew', 'mean', 'std', 'n', 'level'
]


def _as_zone(val):
    """Return `val` as an int when it is a whole number, otherwise pd.NA."""
    try:
        if pd.notna(val) and float(val).is_integer():
            return int(val)
    except (TypeError, ValueError):
        # Not convertible to a number: treat as missing rather than failing.
        return pd.NA
    return pd.NA


# Flatten each fit record into one row, spreading its key values across the
# context columns.
flattened = []

for fit in angle_fits.values():
    context_map = dict(zip(fit['keys'], fit['key_values']))

    row = {
        'batter': fit['batter'],
        'game_year': fit['game_year'],
        'stand': fit['stand'],
        'p_throws': fit['p_throws'],
        'skew': fit['skew'],
        'mean': fit['mean'],
        'std': fit['std'],
        'n': fit['n'],
        'level': "_".join(fit['keys']) if fit['keys'] else 'global'
    }

    for key in context_keys:
        val = context_map.get(key, pd.NA)
        row[key] = _as_zone(val) if key == 'zone' else val

    flattened.append(row)

# Build the frame with the final column order; passing `columns` also keeps the
# expected columns when there are no fits at all.
spray_dist_df = pd.DataFrame(flattened, columns=expected_columns)

# Clean up: nullable integer zone, empty string for missing text fields.
spray_dist_df['zone'] = pd.to_numeric(spray_dist_df['zone'], errors='coerce').astype('Int64')
object_cols = spray_dist_df.select_dtypes(include='object').columns
spray_dist_df[object_cols] = spray_dist_df[object_cols].fillna('')

# Export CSV
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
spray_dist_df.to_csv(
    "/Users/loganmottley/Desktop/Projects/go-baseball/data/spray_distributions_by_batter_year_handedness.csv",
    index=False,
    na_rep=''
)


In [5]:
# Show the flattened fit table as this cell's output.
spray_dist_df



Unnamed: 0,game_year,batter,stand,p_throws,outcome,zone,ev_bucket,launch_angle_bucket,skew,mean,std,n,level
0,2024,444482,L,L,,,,,3.509866e+00,-25.654216,26.969996,13,global
1,2024,444482,L,R,out,4,,,-2.194808e+07,41.707475,39.603288,20,outcome_zone
2,2024,444482,L,R,out,5,,,-7.745084e+00,33.151858,33.443961,14,outcome_zone
3,2024,444482,L,R,out,6,,,-7.603811e+04,35.084901,41.979219,12,outcome_zone
4,2024,444482,L,R,out,8,,,-2.739662e+07,38.839550,33.034445,16,outcome_zone
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6866,2024,808982,L,L,,,,,8.704031e-01,-10.606221,22.013702,44,global
6867,2024,808982,L,R,out,5,,,-4.519809e+00,30.446764,42.340302,10,outcome_zone
6868,2024,808982,L,R,out,,,,-1.164191e+01,36.958098,39.383782,72,outcome
6869,2024,808982,L,R,single,,,,-2.689372e-01,5.052238,19.213632,24,outcome
