In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from xgboost import XGBRegressor
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_squared_error, r2_score

In [23]:
# Read in formatted dataset
# assume repo root is two levels up from this notebook
ROOT = Path.cwd().parents[1]  # adjust if needed
DATA_DIR = ROOT / "data"

print(DATA_DIR)

print(ROOT)

df = pd.read_parquet(
    DATA_DIR / "processed_pitches_df_2025-03-28_2025-05-20.parquet"
)


/Users/matthewgillies/PitchArsenalModelling/data
/Users/matthewgillies/PitchArsenalModelling


In [24]:
df.head()

Unnamed: 0,pitcher,pitch_type,season,pitches,whiff_pct,csw_pct,p_throws,velo,VAA,HAA,...,delta_ext_vs_fb,delta_spin_vs_fb,delta_rel_x_vs_fb,delta_rel_z_vs_fb,movdist_vs_fb,shape_dist_vs_fb,second_pitch_usage,max_other_usage,min_movdist_to_other,min_shape_dist_to_other
0,434378,CH,2025,66,0.5,0.318182,R,84.816667,-5.581546,1.506074,...,0.048246,-703.591515,-0.286676,-0.077268,10.089053,20.829588,0.235157,0.494761,10.089053,18.453995
1,434378,CU,2025,104,0.131579,0.326923,R,78.479808,-5.195098,1.813861,...,-0.031387,241.848462,-0.246225,-0.025887,34.452059,46.276993,0.235157,0.494761,10.971585,11.119053
2,434378,FF,2025,425,0.162679,0.24,R,93.928235,-4.378507,1.657135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.235157,0.235157,10.089053,20.829588
3,434378,SL,2025,202,0.324786,0.277228,R,87.34802,-5.446769,2.135975,...,0.019308,36.911485,-0.204609,-0.076917,18.106932,22.384323,0.235157,0.494761,12.249601,18.453995
4,434378,ST,2025,62,0.230769,0.33871,R,79.382258,-4.805905,2.443773,...,-0.038118,185.479355,-0.410831,-0.168573,30.356494,42.04591,0.235157,0.494761,10.971585,11.119053


In [25]:
df.shape

(1704, 46)

In [26]:
df.isna().sum().sort_values(ascending=False)

min_shape_dist_to_other    50
min_movdist_to_other       50
max_other_usage            50
second_pitch_usage         50
delta_IVB_vs_fb            13
fb_HAA                     13
fb_ext                     13
fb_spin                    13
fb_rel_x                   13
fb_rel_z                   13
delta_velo_vs_fb           13
delta_HB_vs_fb             13
delta_HAA_vs_fb            13
delta_VAA_vs_fb            13
fb_IVB                     13
delta_ext_vs_fb            13
delta_spin_vs_fb           13
delta_rel_x_vs_fb          13
delta_rel_z_vs_fb          13
movdist_vs_fb              13
shape_dist_vs_fb           13
fb_VAA                     13
fb_HB                      13
fb_usage                   13
fb_pitch_type              13
fb_velo                    13
pitch_type                  0
ext                         0
season                      0
pitches                     0
whiff_pct                   0
csw_pct                     0
p_throws                    0
velo      

In [27]:
# drop nas -> not a large proportion
df = df.dropna()

In [28]:
# filter to FF only for initial analysis
ff_df = df[df["pitch_type"] == "FF"]

# filter to min 60 pitches thrown
ff_filtered = ff_df[ff_df['pitches'] > 60]

In [29]:
ff_df.head()

Unnamed: 0,pitcher,pitch_type,season,pitches,whiff_pct,csw_pct,p_throws,velo,VAA,HAA,...,delta_ext_vs_fb,delta_spin_vs_fb,delta_rel_x_vs_fb,delta_rel_z_vs_fb,movdist_vs_fb,shape_dist_vs_fb,second_pitch_usage,max_other_usage,min_movdist_to_other,min_shape_dist_to_other
2,434378,FF,2025,425,0.162679,0.24,R,93.928235,-4.378507,1.657135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.235157,0.235157,10.089053,20.829588
9,450203,FF,2025,258,0.233645,0.213178,R,93.955426,-2.405597,2.480507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.332474,0.350515,11.035174,11.060945
12,455119,FF,2025,82,0.157895,0.304878,R,94.780488,-3.77972,2.924514,...,0.072222,-1.495565,0.221922,0.029051,12.248626,14.096428,0.314176,0.37931,8.921315,9.015845
16,458677,FF,2025,124,0.343284,0.330645,L,94.378226,-3.990978,-1.469117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.27907,0.27907,10.9726,12.847035
20,471911,FF,2025,112,0.039216,0.178571,R,91.515179,-3.31759,1.245728,...,-0.044025,135.882402,0.05093,0.062715,11.247917,11.406483,0.227626,0.276265,10.100121,11.406483


In [30]:
ff_df.shape

(389, 46)

In [33]:
# develop base xgboost workflow 
cat_cols = ['p_throws', 'fb_pitch_type']

target = "whiff_pct"

# 1) train/val split by pitcher - when we have multiple seasons with same pitches
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(ff_df, groups=ff_df["pitcher"]))

train = ff_df.iloc[train_idx].copy()
val   = ff_df.iloc[val_idx].copy()

# 2) choose features (exclude ids + targets)
drop_cols = ["pitcher", "season", "pitch_type", "whiff_pct", "csw_pct"]
X_train = train.drop(columns=[c for c in drop_cols if c in train.columns])
X_val   = val.drop(columns=[c for c in drop_cols if c in val.columns])

y_train = train[target].values
y_val   = val[target].values

# 3) one-hot categorical cols (simple + reliable)
X_train = pd.get_dummies(X_train, columns=cat_cols, dummy_na=True)
X_val   = pd.get_dummies(X_val, columns=cat_cols, dummy_na=True)

# align columns
X_train, X_val = X_train.align(X_val, join="left", axis=1, fill_value=0)

# 4) sample weights to reflect stability of rate targets
w_train = train["pitches"].values  # or np.sqrt(train["pitches"].values)

model = XGBRegressor(
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.0,
    min_child_weight=5,
    objective="reg:squarederror",
    random_state=42
)

model.fit(X_train, y_train, sample_weight=w_train)

pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, pred))
r2 = r2_score(y_val, pred)

print("RMSE:", rmse, "R2:", r2)

RMSE: 0.07112937705677985 R2: -0.05556071197521928
