In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from itertools import product
from scipy.ndimage import gaussian_filter
from itertools import product
import matplotlib.gridspec as gridspec
import matplotlib.colors as mcolors
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os
from fpdf import FPDF
import joblib
from datetime import datetime
import helper_functions as hf
from constants import(
    platoon_state_mapping,
    side_buckets,
    height_buckets,
    count_values,
    num_clusters,
    numerical_features,
    pseudo_sample_size,
    median_features
)
import warnings
warnings.filterwarnings('ignore')

# Load the persisted artefacts and raw data this notebook works from.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load a
# 'rv_model.pkl' that comes from a trusted source.
rv_model = joblib.load('rv_model.pkl')
# NOTE(review): what load_gmm_models returns (and from where) is defined in helper_functions — confirm.
gmm_models = hf.load_gmm_models()

# Both CSV paths are relative to the notebook's working directory.
pitches_df = pd.read_csv('all_pitches.csv')
# global_means is consumed later by hf.calculate_shrunken_means.
global_means = pd.read_csv('global_means.csv')

# Rebinds pitches_df to the prepared frame, so re-running this line alone would prepare
# already-prepared data. NOTE(review): presumably game_only=True restricts to in-game pitches — confirm.
pitches_df = hf.prepare_data(pitches_df, game_only=True)

In [None]:
# Rebinds pitches_df to the helper's result, so this cell is not idempotent when re-run.
# NOTE(review): presumably this is what creates the prev_pitch_* columns listed in `features` — confirm.
pitches_df = hf.add_prev_pitch(pitches_df)

In [None]:
# Model input columns, one per line. The grouping comments below are by column-name
# pattern only; the order of the list is significant and is unchanged.
features = [
    # Per-pitch measurement columns
    'RelSpeed',
    'SpinRate',
    'SpinAxis',
    'RelHeight',
    'RelSide',
    'Extension',
    'PlateLocSideBucket',
    'PlateLocHeightBucket',
    'ax0',
    'ay0',
    'az0',
    # avg_fb_* columns
    'avg_fb_RelSpeed',
    'avg_fb_ax0',
    'avg_fb_az0',
    'avg_fb_RelHeight',
    'avg_fb_RelSide',
    # Context / encoded columns
    'Year',
    'BatterLeagueEncoded',
    'PlatoonStateEncoded',
    'Balls',
    'Strikes',
    'PitchGroupEncoded',
    'BatterStuffValue',
    # prev_pitch_* columns
    'prev_pitch_RelSpeed',
    'prev_pitch_HorzBreak',
    'prev_pitch_InducedVertBreak',
    'prev_pitch_PlateLocSideBucket',
    'prev_pitch_PlateLocHeightBucket',
    'prev_pitch_PitchCall',
    'prev_pitch_SamePitch',
]

In [None]:
# Enrichment steps from helper_functions. Each call rebinds pitches_df to the helper's
# result, so the statements must run in this order and the cell is not idempotent.
# NOTE(review): the columns added by add_probabilities are defined in helper_functions — confirm.
pitches_df = hf.add_probabilities(pitches_df)
# Also returns pivoted_values, which is passed straight to compute_batter_stuff_value below.
pitches_df, pivoted_values = hf.calculate_shrunken_means(pitches_df, global_means)
# NOTE(review): presumably this creates the 'BatterStuffValue' column listed in `features` — confirm.
pitches_df = hf.compute_batter_stuff_value(pitches_df, pivoted_values)

In [27]:
# rv_target is otherwise first imported in a later cell of this notebook; importing it
# here lets this cell run on a fresh kernel (Restart & Run All) instead of raising NameError.
from constants import rv_target

# Keep only rows where every model input and the target are present.
model_df = pitches_df.dropna(subset=features + [rv_target])

# Feature matrix and target, read as globals by `objective` and by the final model fit.
X = model_df[features]
y = model_df[rv_target]

In [29]:
import pandas as pd
import joblib
import helper_functions as hf
import xgboost as xgb
import optuna
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np
from constants import rv_features, rv_target
import warnings
warnings.filterwarnings('ignore')

def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted targets."""
    # Take the square root explicitly rather than passing `squared=False`: that keyword
    # of mean_squared_error was deprecated in scikit-learn 1.4 and removed afterwards.
    # With warnings silenced in this notebook, the failure could otherwise surface only
    # as NaN cross-validation scores. Identical to the old result for a single target.
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# Scorer that yields plain (positive) RMSE: make_scorer's default greater_is_better=True
# leaves the sign untouched, so smaller values are better and the Optuna study that
# consumes it must minimise.
rmse_scorer = make_scorer(_rmse)

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated score of an XGBoost regressor.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters for this evaluation.

    Returns
    -------
    The mean of the per-fold values produced by ``rmse_scorer`` on the
    notebook-level ``X`` and ``y`` (both read as globals).
    """
    # Sampled hyperparameters. The suggest_* calls are kept in their original order
    # because the sampler draws values sequentially.
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
    }

    # Fixed settings are passed as keywords; the sampled ones are unpacked alongside.
    regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        random_state=100,
        **sampled,
    )

    # Shuffled folds with a fixed seed so every trial is scored on the same splits.
    folds = KFold(n_splits=5, shuffle=True, random_state=100)
    fold_scores = cross_val_score(regressor, X, y, scoring=rmse_scorer, cv=folds)

    return fold_scores.mean()

# Seed the sampler so the hyperparameter search is reproducible from run to run, matching
# the seed of 100 already used for the regressor and the CV splits. TPESampler is Optuna's
# default sampler, so only the seeding changes, not the search algorithm.
prev_pitch_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=100),
)
prev_pitch_study.optimize(objective, n_trials=5)

prev_pitch_best_params = prev_pitch_study.best_params
# best_params holds only the values drawn through trial.suggest_*, so the fixed objective
# used while tuning is restated here to keep the final model consistent with the trials.
prev_pitch_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    **prev_pitch_best_params,
    random_state=100,
)
# Refit on every row of X / y (no hold-out) using the best sampled hyperparameters.
prev_pitch_model.fit(X, y)

print(f"Best Parameters: {prev_pitch_best_params}")

# Persist the fitted model next to the notebook (path relative to the working directory).
joblib.dump(prev_pitch_model, 'prev_pitch_model.pkl')

print("Previous Pitch Model saved!")

[I 2025-02-06 17:49:36,963] A new study created in memory with name: no-name-f53a6aee-eba3-4e05-be15-fe39ab411118
[I 2025-02-06 17:49:57,883] Trial 0 finished with value: 0.17567305408479886 and parameters: {'learning_rate': 0.06528988507548067, 'max_depth': 9, 'min_child_weight': 1, 'subsample': 0.6902466994590045, 'colsample_bytree': 0.5175318930651351, 'lambda': 0.7219060278096314, 'alpha': 0.09040594034290177, 'gamma': 4.323512470874525, 'n_estimators': 464}. Best is trial 0 with value: 0.17567305408479886.
[I 2025-02-06 17:50:13,900] Trial 1 finished with value: 0.17585854822333352 and parameters: {'learning_rate': 0.24892702161183702, 'max_depth': 5, 'min_child_weight': 2, 'subsample': 0.6061623860029633, 'colsample_bytree': 0.5594909504232574, 'lambda': 3.100593503517779, 'alpha': 0.005531857025143086, 'gamma': 2.696261006863569, 'n_estimators': 399}. Best is trial 0 with value: 0.17567305408479886.
[I 2025-02-06 17:50:25,573] Trial 2 finished with value: 0.17570225980412973 and