In [1]:
#load packages
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm
Python(20603) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [2]:
#load all pitches since 2015
csv_file_path = '~/baseball-and-stuff/baseball/all_pitches.csv'

#low_memory=False makes pandas infer each column's dtype from the whole file
#instead of chunk by chunk; chunked inference can leave mixed-type columns and
#emit a DtypeWarning
#NOTE(review): the leftover warning echo in this cell's output looks like that
#DtypeWarning - confirm, or pass an explicit dtype= mapping instead
df = pd.read_csv(csv_file_path, low_memory=False)

  df = pd.read_csv(csv_file_path)


In [3]:
#find primary fastballs against each handedness
fastballs = ['FF', 'SI', 'FC', 'FA']
fb_keys = ['pitcher', 'game_year', 'stand']
df_fb = df[df['pitch_type'].isin(fastballs)]

#primary fastball = modal fastball type per pitcher / season / batter side
#(on a tie, the first value returned by mode() is used)
most_common_fb = (
    df_fb.groupby(fb_keys)['pitch_type']
    .agg(lambda pitch_types: pitch_types.mode().iloc[0])
    .rename('most_common_fb')
    .reset_index()
)

df_fb = df_fb.merge(most_common_fb, on=fb_keys, how='left')

#find average metrics for the primary fastball
is_primary_fb = df_fb['pitch_type'] == df_fb['most_common_fb']
df_fb_filtered = df_fb[is_primary_fb]

#named aggregation yields the avg_* columns directly, no rename step needed
average_metrics = (
    df_fb_filtered.groupby(fb_keys + ['most_common_fb'])
    .agg(
        avg_release_speed=('release_speed', 'mean'),
        avg_release_pos_x=('release_pos_x', 'mean'),
        avg_release_pos_z=('release_pos_z', 'mean'),
        avg_pfx_x=('pfx_x', 'mean'),
        avg_pfx_z=('pfx_z', 'mean'),
    )
    .reset_index()
)

#attach the primary-fastball label and its averages to every pitch
df = df.merge(average_metrics, on=fb_keys, how='left')

In [4]:
#label 0 if fastball, 1 if breaker, 2 if offspeed
def assign_pitch_group(row):
    """Map one pitch row to a pitch group code.

    Reads row['pitch_type'] and, for cutters only, row['most_common_fb'].

    Returns:
        0 for fastballs, 1 for breaking balls, 2 for offspeed pitches,
        or None when the pitch type is in none of the groups.
    """
    pitch = row['pitch_type']

    #a cutter is a fastball only when it is the primary fastball; otherwise it is a breaker
    if pitch == 'FC':
        return 0 if row['most_common_fb'] == 'FC' else 1

    if pitch in ('FF', 'SI', 'FA'):
        return 0
    if pitch in ('SL', 'CU', 'KC', 'ST', 'SV'):
        return 1
    if pitch in ('CH', 'FS', 'KN', 'EP', 'FO', 'SC'):
        return 2
    return None

#one group label per pitch, computed row by row
pitch_group_labels = df.apply(assign_pitch_group, axis=1)
df['pitch_group'] = pitch_group_labels

In [5]:
#integer encode platoon state (batter side x pitcher hand)
conditions = [
    (df['stand'] == 'L') & (df['p_throws'] == 'L'),
    (df['stand'] == 'L') & (df['p_throws'] == 'R'),
    (df['stand'] == 'R') & (df['p_throws'] == 'L'),
    (df['stand'] == 'R') & (df['p_throws'] == 'R')
]

values = [0, 1, 2, 3]

#np.select falls back to 0 when no condition matches, which would silently
#label a row with a missing or unexpected stand/p_throws as left-on-left (code 0);
#default to NaN so such rows stay distinguishable (the column becomes float)
df['platoon_state'] = np.select(conditions, values, default=np.nan)

#integer encode count
count_mapping = {
    (0, 0): 0,
    (0, 1): 1,
    (0, 2): 2,
    (1, 0): 3,
    (1, 1): 4,
    (1, 2): 5,
    (2, 0): 6,
    (2, 1): 7,
    (2, 2): 8,
    (3, 0): 9,
    (3, 1): 10,
    (3, 2): 11
}

#every code above equals 3*balls + strikes, so the encoding can be computed with
#column arithmetic instead of building a Python tuple for each row
assert all(code == 3 * b + s for (b, s), code in count_mapping.items())

n_balls = df['balls']
n_strikes = df['strikes']

#only (balls, strikes) pairs present in count_mapping get a code; anything else
#(including missing values) becomes NaN, as it did with .map()
#assumes balls/strikes hold whole numbers - TODO confirm
valid_count = n_balls.between(0, 3) & n_strikes.between(0, 2)

df['count'] = (3 * n_balls + n_strikes).where(valid_count)

In [6]:
#find average run value for each event so we can ignore game state leverage
description_avg = df.groupby('description')['delta_run_exp'].mean().reset_index()
description_avg = description_avg.rename(columns={'delta_run_exp': 'avg_run_value_desc'})

#balls in play all share the 'hit_into_play' description, so average those by event
events_avg = df[df['description'] == 'hit_into_play'].groupby('events')['delta_run_exp'].mean().reset_index()
events_avg = events_avg.rename(columns={'delta_run_exp': 'avg_run_value_event'})

df = df.merge(description_avg, on='description', how='left')

#no column names overlap besides the join key, so no suffixes are needed
df = df.merge(events_avg, on='events', how='left')

#pick the event-level average for balls in play and the description-level average
#otherwise; np.where does this in one vectorised pass instead of a Python call per row
df['run_value'] = np.where(
    df['description'] == 'hit_into_play',
    df['avg_run_value_event'],
    df['avg_run_value_desc'],
)

#the two helper columns were only needed to build run_value
df = df.drop(columns=['avg_run_value_desc', 'avg_run_value_event'])

In [7]:
#features (order is kept as-is; it defines the column order of X)
features = [
    'release_speed',
    'release_pos_x',
    'release_pos_z',
    'platoon_state',
    'count',
    'game_year',
    'pitch_group',
    'pfx_x',
    'pfx_z',
    'plate_x',
    'plate_z',
    'release_extension',
    'release_spin_rate',
    'spin_axis',
    #primary-fastball averages
    'avg_release_speed',
    'avg_release_pos_x',
    'avg_release_pos_z',
    'avg_pfx_x',
    'avg_pfx_z',
]

#target variable
target = 'run_value'

In [8]:
#drop NAs (any row missing a feature or the target is removed)
required_columns = [*features, target]
df = df.dropna(subset=required_columns)

#set X and y
X, y = df[features], df[target]

#define objective function
def objective(trial):
    """Score one Optuna trial with 5-fold cross-validated RMSE.

    Samples an XGBoost hyperparameter set from the trial, then evaluates an
    XGBRegressor on the module-level X and y.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean RMSE over the 5 folds (lower is better).
    """
    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        "booster": "gbtree",
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    #define XGBRegressor and use rmse as eval metric
    #use_label_encoder is not passed: it is a classifier-only option and has no effect on a regressor
    #random_state is fixed so row/column subsampling is repeatable and trial scores
    #differ only because of their hyperparameters
    model = xgb.XGBRegressor(**param, eval_metric='rmse', random_state=13)

    #set up KFold CV (fixed seed -> every trial is scored on the same folds)
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    #sklearn reports negated RMSE (higher is better), so flip the sign back
    rmse_scores = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=kf, n_jobs=-1)

    return -np.mean(rmse_scores)

#initialize Optuna study
#seed the sampler so the sequence of suggested hyperparameters is repeatable across runs
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=13),
)
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("Best hyperparameters:", best_params)

#train final model with best params
#random_state fixed so refitting with the same params reproduces the same model
quality_model = xgb.XGBRegressor(**best_params, random_state=13)
quality_model.fit(X, y)

': 2.1004640912112482e-06, 'grow_policy': 'depthwise', 'subsample': 0.9640591755323056, 'colsample_bytree': 0.943358685503359, 'min_child_weight': 9}. Best is trial 26 with value: 0.2070590964584905.
Python(22128) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(22129) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(22130) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(22131) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(22132) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[I 2024-10-21 18:46:13,959] Trial 27 finished with value: 0.2070584275237135 and parameters: {'lambda': 2.541796572199262e-07, 'alpha': 0.0010139846821814933, 'n_estimators': 539, 'max_depth': 9, 'eta': 0.026302770065209098, 'gamma': 1.010931991913642e-06, 'grow_policy': 'depthwise', 'subsample': 0.9550589849

In [9]:
#make predictions
#NOTE(review): quality_model was fit on this same X, so these are in-sample predictions
predicted_run_value = quality_model.predict(X)
df['predicted_run_value'] = predicted_run_value

In [11]:
#group by pitcher, pitch type, and year
group_keys = ['pitcher', 'player_name', 'game_year', 'pitch_type']
mean_columns = ['release_speed', 'release_pos_x', 'release_pos_z', 'pfx_x', 'pfx_z',
                'plate_x', 'plate_z', 'predicted_run_value']

by_pitch = df.groupby(group_keys)

#per-group means of the pitch metrics and the predicted run value
grouped_df = by_pitch[mean_columns].mean()

#number of pitches in each group
count_df = by_pitch.size().reset_index(name='n')

grouped_df = grouped_df.reset_index().merge(count_df, on=group_keys)

In [15]:
#display pitch quality leaderboard
#keep groups with at least 100 pitches, lowest predicted run value first, top 60 rows
qualified = grouped_df[grouped_df['n'] >= 100]
qualified.sort_values(by='predicted_run_value', ascending=True).head(60)

Unnamed: 0,pitcher,player_name,game_year,pitch_type,release_speed,release_pos_x,release_pos_z,pfx_x,pfx_z,plate_x,plate_z,predicted_run_value,n
24044,661403,"Clase, Emmanuel",2022,SL,91.872934,-0.736353,6.066581,0.592393,-0.014501,0.31208,1.77963,-0.038475,351
24041,661403,"Clase, Emmanuel",2021,SL,91.854839,-0.576,6.172548,0.516935,0.067484,0.439645,1.828839,-0.031304,310
24051,661403,"Clase, Emmanuel",2024,SL,91.029703,-0.666931,6.003861,0.588861,0.186535,0.445693,1.968861,-0.029501,202
20706,642207,"Williams, Devin",2020,CH,84.06652,-2.179604,5.56652,-1.507974,-0.168546,-0.382379,1.72696,-0.029404,227
23899,660813,"Graterol, Brusdar",2022,SI,99.748264,-1.580868,6.17625,-1.375035,0.633958,-0.313507,2.551875,-0.02737,288
25029,664126,"Fairbanks, Pete",2023,SL,86.767213,-0.162678,6.951667,0.33418,-0.581776,-0.125765,1.97929,-0.027287,366
25025,664126,"Fairbanks, Pete",2022,FF,98.9625,-0.265046,6.986806,-0.00588,1.720926,-0.04338,2.817731,-0.026394,216
24047,661403,"Clase, Emmanuel",2023,SL,91.106761,-0.635042,5.98462,0.659239,0.117042,0.280423,1.754761,-0.026093,355
14653,605483,"Snell, Blake",2024,CH,85.471061,2.206817,6.492894,1.140514,0.85746,0.580289,1.823215,-0.025899,311
17381,621237,"Alvarado, José",2018,FC,88.928037,0.17785,6.602804,-0.196542,0.089252,-0.512056,1.538037,-0.025818,107
