In [1]:
# XWhiff Model Training and Prediction
# This notebook develops and optimizes an XGBoost model to predict swing-and-miss probability
# 
# Import necessary libraries for model development and optimization
import pandas as pd                      # Data manipulation and analysis
import numpy as np                       # Numerical operations
import xgboost as xgb                    # Gradient boosting framework
from xgboost import XGBClassifier        # XGBoost classifier for binary classification
from sklearn.linear_model import LogisticRegression  # Alternative linear model
import matplotlib.pyplot as plt          # Basic plotting functionality
import seaborn as sns                    # Statistical data visualization
import sklearn                           # Machine learning library
from sklearn.model_selection import KFold  # Cross-validation functionality
from sklearn.metrics import log_loss     # Evaluation metric for binary classification
import optuna                            # Hyperparameter optimization framework

In [2]:
# Load cleaned pitch tracking data from multiple seasons
# This provides a comprehensive dataset spanning multiple years for robust model training
clean_2024 = pd.read_csv('./whiff_csvs/2024_whiff_clean.csv', low_memory=False)  # Test data
clean_2023 = pd.read_csv('./whiff_csvs/2023_whiff_clean.csv', low_memory=False)  # Training data
clean_2022 = pd.read_csv('./whiff_csvs/2022_whiff_clean.csv', low_memory=False)  # Training data

In [3]:
# Note: Remove any commented code blocks that are no longer needed
# This keeps the notebook clean and focused on the current methodology

In [4]:
clean_2022.head()

Unnamed: 0,PitcherThrows,BatterSide,Outs,Balls,Strikes,TaggedPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,...,Extension,InducedVertBreak,HorzBreak,PlateLocHeight,PlateLocSide,VertApprAngle,HorzApprAngle,PitcherId,whiff,count
0,Right,Right,0,0,0,Fastball,BallCalled,91.80411,-3.816351,-3.522621,...,5.74749,18.08218,12.78119,1.16387,-0.26392,-6.801089,-1.239948,1000074000.0,0,0-0
1,Right,Right,0,1,0,Fastball,BallCalled,92.38028,-3.537349,-3.658209,...,5.55677,16.22639,14.28202,1.28412,-0.42104,-6.825532,-1.116474,1000074000.0,0,1-0
2,Right,Right,0,2,0,Fastball,StrikeCalled,92.75261,-3.209314,-2.573778,...,5.49731,17.84517,12.29304,1.84823,0.54753,-6.174757,-0.385627,1000074000.0,0,2-0
3,Right,Right,0,2,1,Fastball,FoulBall,92.35726,-2.446575,-2.958796,...,5.58153,19.67907,10.39461,2.68124,0.10833,-5.096501,-1.10735,1000074000.0,0,2-1
4,Right,Right,0,2,2,Fastball,InPlay,94.60763,-3.377392,-2.749154,...,5.7024,17.33721,14.15202,1.7842,0.58484,-6.06264,-0.220896,1000074000.0,0,2-2


In [5]:
clean_2023.head()

Unnamed: 0,PitcherThrows,BatterSide,Outs,Balls,Strikes,TaggedPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,...,RelSide,Extension,InducedVertBreak,HorzBreak,PlateLocHeight,PlateLocSide,VertApprAngle,HorzApprAngle,whiff,count
0,Right,Left,0,0,0,Fastball,InPlay,88.43126,-3.720299,-0.58548,...,0.59849,5.88136,23.19308,10.90638,1.9163,0.96361,-6.304004,1.371603,0,0-0
1,Right,Left,0,0,0,ChangeUp,StrikeCalled,80.24299,-0.94526,-1.525434,...,0.7117,6.53191,11.33949,11.80068,2.80934,0.29542,-6.900707,0.61973,0,0-0
2,Right,Left,0,0,1,Fastball,BallCalled,87.42133,-3.303045,-0.520038,...,0.6994,5.79229,21.80682,10.33725,2.09101,1.07706,-6.285148,1.332128,0,0-1
3,Right,Left,0,1,1,Curveball,BallCalled,72.85011,-1.028297,0.347481,...,0.3063,5.85791,-10.56816,-17.28883,0.22062,-0.81158,-12.684304,-2.772901,0,1-1
4,Right,Left,0,2,1,ChangeUp,InPlay,82.40725,-2.265863,-1.333121,...,0.72582,6.82618,12.77918,13.40245,1.81788,0.62639,-7.448031,1.117945,0,2-1


In [6]:
clean_2024.head()

Unnamed: 0,PitcherThrows,BatterSide,Outs,Balls,Strikes,TaggedPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,...,Extension,InducedVertBreak,HorzBreak,PlateLocHeight,PlateLocSide,VertApprAngle,HorzApprAngle,PitcherId,whiff,count
0,Right,Right,0,0,0,Slider,BallCalled,86.34831,-5.087035,-0.556059,...,5.20061,-1.2615,-6.71201,-1.24052,-0.79734,-12.231122,-1.751516,1000234000.0,0,0-0
1,Right,Right,0,1,0,Fastball,StrikeCalled,94.49974,-3.133086,-0.49252,...,5.83655,20.20828,3.49654,3.00046,0.29788,-5.365324,0.134391,1000234000.0,0,1-0
2,Right,Right,0,1,1,Fastball,FoulBallNotFieldable,94.81021,-3.910073,-1.135525,...,5.67326,22.06875,-0.4374,2.48669,-0.87744,-5.815582,-1.213668,1000234000.0,0,1-1
3,Right,Right,0,1,2,Slider,FoulBallNotFieldable,86.30865,-1.385858,-0.791508,...,5.43599,2.3461,-6.38485,2.69703,-1.15108,-7.862841,-1.929632,1000234000.0,0,1-2
4,Right,Right,0,1,2,Slider,BallCalled,87.4587,-4.605749,-1.32325,...,5.28786,0.27646,-4.37162,-0.43496,-1.44974,-11.264591,-2.102031,1000234000.0,0,1-2


In [7]:
# Create train/test split using temporal separation
# 2024 data as test set provides realistic out-of-time validation
# 2022-2023 combined data provides sufficient training samples
test = clean_2024.copy()
train = pd.concat([clean_2023.copy(), clean_2022.copy()])

# Model Development Strategy

## Feature Selection Objectives
The primary goals of this analysis are:
1. **Feature Selection**: Identify which pitch characteristics are most predictive of swing-and-miss outcomes
2. **Model Architecture**: Determine if pitch-type-specific models outperform a unified model
3. **Temporal Validation**: Use 2022-2023 data for training and 2024 for testing to ensure robust performance

## Pitch Type Segmentation Considerations
While grouping pitches by type (fastball, breaking ball, offspeed) seems logical due to similar movement profiles, several challenges exist:

### Data Quality Challenges
- **Subjective Tagging**: Pitch classification relies on human judgment, leading to inconsistencies
- **Operator Variability**: Different taggers may classify similar pitches differently
- **Edge Cases**: Borderline pitches (e.g., hard slider vs. cutter) are particularly prone to misclassification

### Cleaning Limitations
Automated pitch type correction is difficult because:
- Pitch types exist on a spectrum rather than discrete categories
- Movement parameters can overlap significantly between pitch types
- Contextual factors (game situation, pitcher intent) influence classification

## Methodology
- **Training Data**: Combined 2022-2023 seasons for maximum sample size
- **Test Data**: 2024 season for out-of-time validation
- **Evaluation**: Compare unified model vs. pitch-type-specific models using cross-validation

In [8]:
# Examine data types in the combined training dataset
# This helps identify any type inconsistencies between seasons that need addressing
train.dtypes

PitcherThrows        object
BatterSide           object
Outs                  int64
Balls                 int64
Strikes               int64
TaggedPitchType      object
PitchCall            object
RelSpeed            float64
VertRelAngle        float64
HorzRelAngle        float64
SpinRate            float64
SpinAxis            float64
RelHeight           float64
RelSide             float64
Extension           float64
InducedVertBreak    float64
HorzBreak           float64
PlateLocHeight      float64
PlateLocSide        float64
VertApprAngle       float64
HorzApprAngle       float64
whiff                 int64
count                object
PitcherId           float64
dtype: object

In [9]:
# Convert categorical variables to proper data types for XGBoost
# XGBoost with enable_categorical=True can handle categorical features more efficiently
# This ensures handedness variables are treated as discrete categories, not ordinal
train['PitcherThrows'] = train['PitcherThrows'].astype('category')
train['BatterSide'] = train['BatterSide'].astype('category')
test['PitcherThrows'] = test['PitcherThrows'].astype('category')
test['BatterSide'] = test['BatterSide'].astype('category')

In [10]:
# Function to extract features and target variable from dataset
# This standardizes the feature selection process across different data splits
def split(df):
    """
    Extract features and target variable for model training/testing
    
    Features include:
    - Game situation: PitcherThrows, BatterSide, Outs, Balls, Strikes
    - Pitch characteristics: RelSpeed, VertRelAngle, HorzRelAngle, SpinRate, SpinAxis
    - Release point: RelHeight, RelSide, Extension
    - Movement: InducedVertBreak, HorzBreak
    - Location: PlateLocHeight, PlateLocSide, VertApprAngle, HorzApprAngle
    """
    X = df[['PitcherThrows', 'BatterSide', 'Outs', 'Balls', 'Strikes',
            'RelSpeed', 'VertRelAngle', 'HorzRelAngle', 'SpinRate', 'SpinAxis',
            'RelHeight', 'RelSide', 'Extension', 'InducedVertBreak', 'HorzBreak',
            'PlateLocHeight', 'PlateLocSide', 'VertApprAngle', 'HorzApprAngle']]
    y = df['whiff']
    return X, y

# Apply feature extraction to both training and test sets
X_train, y_train = split(train)
X_test, y_test = split(test)

In [11]:
# Cross-validation function to evaluate model performance
# Uses 5-fold CV with log loss metric for binary classification evaluation
def CV(X_train, Y_train):
    """
    Perform 5-fold cross-validation to estimate model performance
    
    Args:
        X_train: Feature matrix
        Y_train: Target variable (binary: whiff/no-whiff)
    
    Returns:
        Mean log loss across all folds (lower is better)
    """
    model = XGBClassifier(enable_categorical=True)  # Enable categorical feature handling
    kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold CV with shuffling
    validation_errors = np.empty(0)

    for train_idx, valid_idx in kf.split(X_train):
        # Split data for this fold
        split_X_train, split_X_valid = X_train.iloc[train_idx, :], X_train.iloc[valid_idx, :]
        split_Y_train, split_Y_valid = Y_train.iloc[train_idx], Y_train.iloc[valid_idx] 
        
        # Train model on training fold
        model.fit(split_X_train, split_Y_train)
        
        # Predict probabilities on validation fold
        Y_pred = model.predict_proba(split_X_valid)[:, 1]  # Probability of whiff (class 1)
        
        # Calculate log loss for this fold
        validation_errors = np.append(validation_errors, log_loss(split_Y_valid, Y_pred))
    
    return np.mean(validation_errors)

In [12]:
# Calculate baseline cross-validation error using all features
# This establishes our benchmark performance before feature selection
CV_errors = pd.DataFrame({'cv_error': np.array(CV(X_train, y_train))}, index=[0])
CV_errors

Unnamed: 0,cv_error
0,0.289485


In [13]:
# Initialize dataframe to store individual feature performance
# This will help us understand the predictive power of each feature independently
feature_errors = pd.DataFrame({"feature_name": X_train.columns})
feature_errors

Unnamed: 0,feature_name
0,PitcherThrows
1,BatterSide
2,Outs
3,Balls
4,Strikes
5,RelSpeed
6,VertRelAngle
7,HorzRelAngle
8,SpinRate
9,SpinAxis


In [14]:
# Evaluate individual feature performance using single-feature models
# This identifies which features have the strongest independent predictive power
def feature_error(X_train, Y_train):
    """
    Test each feature individually to assess its predictive power
    
    Args:
        X_train: Full feature matrix
        Y_train: Target variable
    
    Returns:
        DataFrame with feature names and their individual CV errors
    """
    feature_errors = pd.DataFrame({"feature_name": X_train.columns})
    model = XGBClassifier(enable_categorical=True)
    feature_error = np.empty(0)

    # Test each feature individually
    for x_var in X_train.columns:
        print(f"Testing feature: {x_var}")
        
        # Create single-feature dataset
        X = X_train[[x_var]]
       
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        validation_errors = np.empty(0)

        # Cross-validate single-feature model
        for train_idx, valid_idx in kf.split(X_train):
            split_X_train, split_X_valid = X.iloc[train_idx, :], X.iloc[valid_idx, :]
            split_Y_train, split_Y_valid = Y_train.iloc[train_idx], Y_train.iloc[valid_idx] 
            
            model.fit(split_X_train, split_Y_train)
            Y_pred = model.predict_proba(split_X_valid)[:, 1]
            validation_errors = np.append(validation_errors, log_loss(split_Y_valid, Y_pred))
        
        # Store mean error for this feature
        feature_error = np.append(feature_error, np.mean(validation_errors))
    
    feature_errors['error'] = feature_error
    return feature_errors

# Execute feature importance analysis
errors = feature_error(X_train, y_train)
errors



Unnamed: 0,feature_name,error
0,PitcherThrows,0.334509
1,BatterSide,0.33446
2,Outs,0.334349
3,Balls,0.333855
4,Strikes,0.330373
5,RelSpeed,0.331889
6,VertRelAngle,0.328804
7,HorzRelAngle,0.331164
8,SpinRate,0.332835
9,SpinAxis,0.333033


In [15]:
feature_errors = errors.copy()

In [16]:
# Rank features by their individual predictive power (lowest error = best feature)
# This creates an ordered list from most to least important features
feature_importance = feature_errors.sort_values(by="error")["feature_name"]
feature_importance

17       VertApprAngle
6         VertRelAngle
15      PlateLocHeight
16        PlateLocSide
4              Strikes
7         HorzRelAngle
5             RelSpeed
13    InducedVertBreak
8             SpinRate
9             SpinAxis
18       HorzApprAngle
14           HorzBreak
3                Balls
2                 Outs
1           BatterSide
0        PitcherThrows
12           Extension
10           RelHeight
11             RelSide
Name: feature_name, dtype: object

In [17]:
# Forward feature selection: incrementally add features in order of importance
# This finds the optimal number of features by testing cumulative feature sets
def best_features(X_train, Y_train, feature_importance):
    """
    Perform forward feature selection using ranked feature importance
    
    Starting with the best individual feature, iteratively add the next best feature
    and evaluate performance until we find the optimal feature subset
    
    Args:
        X_train: Training features
        Y_train: Training target
        feature_importance: Features ranked by individual performance
    
    Returns:
        predictors: List of feature subsets tested
        feature_error: CV error for each feature subset
    """
    model = XGBClassifier(enable_categorical=True)
    feature_error = np.empty(0)
    predictors = []

    # Test each cumulative feature subset
    for i in np.arange(0, len(feature_importance)):
        print(f"Testing {i+1} features...")
        
        # Select top i+1 features
        X = X_train[np.array(feature_importance[0:i+1])]
        predictors.append(feature_importance[0:i+1].values)
    
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        validation_errors = np.empty(0)

        # Cross-validate this feature subset
        for train_idx, valid_idx in kf.split(X_train):
            split_X_train, split_X_valid = X.iloc[train_idx, :], X.iloc[valid_idx, :]
            split_Y_train, split_Y_valid = Y_train.iloc[train_idx], Y_train.iloc[valid_idx] 
            
            model.fit(split_X_train, split_Y_train)
            Y_pred = model.predict_proba(split_X_valid)[:, 1]
            validation_errors = np.append(validation_errors, log_loss(split_Y_valid, Y_pred))
        
        feature_error = np.append(feature_error, np.mean(validation_errors))
    
    return predictors, feature_error

In [18]:
# Execute forward feature selection to find optimal feature combination
# This process will test all cumulative feature subsets and find the best performing one
predictors, predict_error = best_features(X_train, y_train, feature_importance)
x_vars_errors = pd.DataFrame({"subset": predictors, "validation error": predict_error})

In [19]:
# Identify the optimal feature subset with lowest validation error
# This represents the best balance between model complexity and performance
feature = pd.DataFrame({"subset": predictors, "validation error": predict_error})
new_X_train_columns = feature.sort_values(by='validation error').iloc[0, 0]

# Note: Features like Extension, RelHeight, RelSide may be dropped if they don't improve performance
# This suggests these release point characteristics are less predictive than other features
new_X_train_columns

array(['VertApprAngle', 'VertRelAngle', 'PlateLocHeight', 'PlateLocSide',
       'Strikes', 'HorzRelAngle', 'RelSpeed', 'InducedVertBreak',
       'SpinRate', 'SpinAxis', 'HorzApprAngle', 'HorzBreak', 'Balls',
       'Outs', 'BatterSide', 'PitcherThrows', 'Extension', 'RelHeight'],
      dtype=object)

# Pitch Type-Specific Model Analysis

Now we'll test whether separating pitches into distinct categories improves model performance. The hypothesis is that different pitch types (fastballs vs. breaking balls vs. offspeed) have fundamentally different characteristics that make swing-and-miss prediction, and specialized models might capture these nuances better than a unified approach.

## Approach
We'll repeat the same feature selection process for three pitch categories:
1. **Fastballs**: Fastball, Sinker
2. **Breaking Balls**: Curveball, Cutter, Slider  
3. **Offspeed**: ChangeUp

This will allow us to compare:
- **Unified Model**: Single model trained on all pitch types
- **Specialized Models**: Separate models for each pitch category
- **Feature Importance**: Whether different features matter for different pitch types

In [20]:
# Split datasets by pitch type categories for specialized model training
# Each category has different characteristics that may benefit from specialized modeling

# Fastball category: High velocity, minimal movement
FB_train = train[(train['TaggedPitchType'] == 'Fastball') | (train['TaggedPitchType'] == 'Sinker')]
FB_test = test[(test['TaggedPitchType'] == 'Fastball') | (test['TaggedPitchType'] == 'Sinker')]

# Breaking ball category: Significant movement, spin-dependent
BB_train = train[(train['TaggedPitchType'] == 'Curveball') | (train['TaggedPitchType'] == 'Cutter') | (train['TaggedPitchType'] == 'Slider')]
BB_test = test[(test['TaggedPitchType'] == 'Curveball') | (test['TaggedPitchType'] == 'Slider') | (test['TaggedPitchType'] == 'Cutter')]

# Offspeed category: Deception-based, velocity differential
OFF_train = train[(train['TaggedPitchType'] == 'ChangeUp')]
OFF_test = test[(test['TaggedPitchType'] == 'ChangeUp')]


# Cutter Classification Decision

## Initial Approach vs. Final Decision
**Initial Classification**: Cutter grouped with fastballs due to velocity similarity (essentially opposite of sinker movement)

**Final Classification**: Cutter moved to breaking ball category, though in the MLB cutters are often thought of and are thrown like Fastballs. In college baseball cutters are more commonly thrown as a breaking pitch similar to a Slider

## Reasoning for Change
1. **Movement Profile**: Despite velocity similarity to fastballs, cutters have movement characteristics closer to sliders
2. **Tagging Consistency**: Cutters are frequently mistagged as sliders during games due to visual similarity
3. **Performance Impact**: Moving cutters to the breaking ball category reduced log loss for both:
   - Fastball models (more homogeneous group)
   - Breaking ball models (better handles cutter-slider similarity)

## Key Insight
This demonstrates how data quality issues (subjective pitch tagging) can inform modeling decisions. Sometimes grouping by observed characteristics (how pitches are actually classified) performs better than grouping by theoretical characteristics (how pitches should be classified).

In [21]:
FB_X_train, FB_y_train = split(FB_train)
FB_X_test, FB_y_test = split(FB_test)
BB_X_train, BB_y_train = split(BB_train)
BB_X_test, BB_X_test = split(BB_test)
OFF_X_train, OFF_y_train = split(OFF_train)
OFF_X_test, OFF_y_test = split(OFF_test)

In [22]:
#cv_errors feature_errors x_vars_errors
FB_CV = CV(FB_X_train,FB_y_train)
CV_errors['FB_CV_error'] = [FB_CV]
BB_CV = CV(BB_X_train,BB_y_train)
CV_errors['BB_CV_error'] = [BB_CV]
OFF_CV = CV(OFF_X_train,OFF_y_train)
CV_errors['OFF_CV_error'] = [OFF_CV]

In [23]:
CV_errors

Unnamed: 0,cv_error,FB_CV_error,BB_CV_error,OFF_CV_error
0,0.289485,0.246876,0.334365,0.379937


In [24]:
FB_feature_error = feature_error(FB_X_train,FB_y_train)
BB_feature_error = feature_error(BB_X_train,BB_y_train)
OFF_feature_error = feature_error(OFF_X_train,OFF_y_train)

In [25]:
FB_feature_error, BB_feature_error, OFF_feature_error

(        feature_name     error
 0      PitcherThrows  0.281804
 1         BatterSide  0.281798
 2               Outs  0.281685
 3              Balls  0.281267
 4            Strikes  0.278391
 5           RelSpeed  0.280480
 6       VertRelAngle  0.279179
 7       HorzRelAngle  0.279398
 8           SpinRate  0.281102
 9           SpinAxis  0.280957
 10         RelHeight  0.281942
 11           RelSide  0.281915
 12         Extension  0.281899
 13  InducedVertBreak  0.279886
 14         HorzBreak  0.281556
 15    PlateLocHeight  0.273658
 16      PlateLocSide  0.276041
 17     VertApprAngle  0.274555
 18     HorzApprAngle  0.280907,
         feature_name     error
 0      PitcherThrows  0.385165
 1         BatterSide  0.385086
 2               Outs  0.385128
 3              Balls  0.384107
 4            Strikes  0.380265
 5           RelSpeed  0.380502
 6       VertRelAngle  0.369141
 7       HorzRelAngle  0.379313
 8           SpinRate  0.384430
 9           SpinAxis  0.384583
 10    

In [26]:
FB_feature_importance = FB_feature_error.sort_values(by="error")["feature_name"]
BB_feature_importance = BB_feature_error.sort_values(by="error")["feature_name"]
OFF_feature_importance = OFF_feature_error.sort_values(by="error")["feature_name"]

In [27]:
FB_x_vars, FB_best_features = best_features(FB_X_train, FB_y_train,FB_feature_importance)
BB_x_vars, BB_best_features = best_features(BB_X_train, BB_y_train,BB_feature_importance)
OFF_x_vars, OFF_best_features = best_features(OFF_X_train, OFF_y_train,OFF_feature_importance)

In [28]:
feature['FB_x_vars'] = FB_x_vars
feature['FB_error'] = FB_best_features
feature['BB_x_vars'] = BB_x_vars
feature['BB_error'] = BB_best_features
feature['OFF_x_vars'] = OFF_x_vars
feature['OFF_error'] = OFF_best_features

In [29]:
feature

Unnamed: 0,subset,validation error,FB_x_vars,FB_error,BB_x_vars,BB_error,OFF_x_vars,OFF_error
0,[VertApprAngle],0.327775,[PlateLocHeight],0.273658,[PlateLocHeight],0.364271,[PlateLocHeight],0.405734
1,"[VertApprAngle, VertRelAngle]",0.317636,"[PlateLocHeight, VertApprAngle]",0.269297,"[PlateLocHeight, VertRelAngle]",0.361345,"[PlateLocHeight, VertRelAngle]",0.40544
2,"[VertApprAngle, VertRelAngle, PlateLocHeight]",0.311429,"[PlateLocHeight, VertApprAngle, PlateLocSide]",0.261828,"[PlateLocHeight, VertRelAngle, PlateLocSide]",0.352908,"[PlateLocHeight, VertRelAngle, PlateLocSide]",0.396119
3,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.303948,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.257593,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.348214,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.393106
4,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.299205,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.255037,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.341742,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.387297
5,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.295263,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.252055,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.340509,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.38459
6,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.294464,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.251601,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.337806,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.383459
7,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.294276,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.25157,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.334967,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.38032
8,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.293819,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.250052,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.334906,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.38076
9,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.291975,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.250001,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.334568,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.380336


In [30]:
BB_OFF_train = train[(train['TaggedPitchType'] != 'Fastball') & (train['TaggedPitchType'] != 'Sinker')]
BB_OFF_test = test[(test['TaggedPitchType'] != 'Fastball') & (test['TaggedPitchType'] != 'Sinker')]
comb_X_train,comb_y_train = split(BB_OFF_train)
comb_X_test, comb_y_test = split(BB_OFF_test)

In [31]:
comb_CV = CV(comb_X_train,comb_y_train)
CV_errors['BB_OFF_CV_error'] = [comb_CV]

In [32]:
CV_errors

Unnamed: 0,cv_error,FB_CV_error,BB_CV_error,OFF_CV_error,BB_OFF_CV_error
0,0.289485,0.246876,0.334365,0.379937,0.345476


Here I basically determined that despite the fastball error being lower it wasn't worth how much higher the breaking ball and offspeed error were to use these seperate models to train the data. Like I said at the top, I think one of the main reasons that the error of these pitches seperated is higher is because of how subjective pitch tagging is in college baseball.

In [33]:
comb_feature_error = feature_error(comb_X_train,comb_y_train)
comb_feature_importance = comb_feature_error.sort_values(by="error")["feature_name"]
comb_x_vars, comb_best_features = best_features(comb_X_train, comb_y_train,comb_feature_importance)
feature['comb_x_vars'] = comb_x_vars
feature['comb_error'] = comb_best_features
feature

Unnamed: 0,subset,validation error,FB_x_vars,FB_error,BB_x_vars,BB_error,OFF_x_vars,OFF_error,comb_x_vars,comb_error
0,[VertApprAngle],0.327775,[PlateLocHeight],0.273658,[PlateLocHeight],0.364271,[PlateLocHeight],0.405734,[PlateLocHeight],0.376766
1,"[VertApprAngle, VertRelAngle]",0.317636,"[PlateLocHeight, VertApprAngle]",0.269297,"[PlateLocHeight, VertRelAngle]",0.361345,"[PlateLocHeight, VertRelAngle]",0.40544,"[PlateLocHeight, VertRelAngle]",0.37314
2,"[VertApprAngle, VertRelAngle, PlateLocHeight]",0.311429,"[PlateLocHeight, VertApprAngle, PlateLocSide]",0.261828,"[PlateLocHeight, VertRelAngle, PlateLocSide]",0.352908,"[PlateLocHeight, VertRelAngle, PlateLocSide]",0.396119,"[PlateLocHeight, VertRelAngle, PlateLocSide]",0.365716
3,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.303948,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.257593,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.348214,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.393106,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.360122
4,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.299205,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.255037,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.341742,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.387297,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.354479
5,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.295263,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.252055,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.340509,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.38459,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.352058
6,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.294464,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.251601,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.337806,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.383459,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.350404
7,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.294276,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.25157,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.334967,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.38032,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.350187
8,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.293819,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.250052,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.334906,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.38076,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.346317
9,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.291975,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.250001,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.334568,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.380336,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.346093


In [34]:
feature

Unnamed: 0,subset,validation error,FB_x_vars,FB_error,BB_x_vars,BB_error,OFF_x_vars,OFF_error,comb_x_vars,comb_error
0,[VertApprAngle],0.327775,[PlateLocHeight],0.273658,[PlateLocHeight],0.364271,[PlateLocHeight],0.405734,[PlateLocHeight],0.376766
1,"[VertApprAngle, VertRelAngle]",0.317636,"[PlateLocHeight, VertApprAngle]",0.269297,"[PlateLocHeight, VertRelAngle]",0.361345,"[PlateLocHeight, VertRelAngle]",0.40544,"[PlateLocHeight, VertRelAngle]",0.37314
2,"[VertApprAngle, VertRelAngle, PlateLocHeight]",0.311429,"[PlateLocHeight, VertApprAngle, PlateLocSide]",0.261828,"[PlateLocHeight, VertRelAngle, PlateLocSide]",0.352908,"[PlateLocHeight, VertRelAngle, PlateLocSide]",0.396119,"[PlateLocHeight, VertRelAngle, PlateLocSide]",0.365716
3,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.303948,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.257593,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.348214,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.393106,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.360122
4,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.299205,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.255037,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.341742,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.387297,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.354479
5,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.295263,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.252055,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.340509,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.38459,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.352058
6,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.294464,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.251601,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.337806,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.383459,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.350404
7,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.294276,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.25157,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.334967,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.38032,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.350187
8,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.293819,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.250052,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.334906,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.38076,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.346317
9,"[VertApprAngle, VertRelAngle, PlateLocHeight, ...",0.291975,"[PlateLocHeight, VertApprAngle, PlateLocSide, ...",0.250001,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.334568,"[PlateLocHeight, VertRelAngle, PlateLocSide, V...",0.380336,"[PlateLocHeight, VertRelAngle, PlateLocSide, H...",0.346093


In [35]:
pitches_features = feature.sort_values(by='validation error').iloc[0,:]['subset']
pitches_features

array(['VertApprAngle', 'VertRelAngle', 'PlateLocHeight', 'PlateLocSide',
       'Strikes', 'HorzRelAngle', 'RelSpeed', 'InducedVertBreak',
       'SpinRate', 'SpinAxis', 'HorzApprAngle', 'HorzBreak', 'Balls',
       'Outs', 'BatterSide', 'PitcherThrows', 'Extension', 'RelHeight'],
      dtype=object)

In [36]:
len(pitches_features)

18

In [37]:
# Hyperparameter optimization using Optuna framework
# This systematically searches for the best XGBoost parameters to minimize log loss
def optimze_params(X_train, y_train):
    """
    Optimize XGBoost hyperparameters using Optuna's Tree-structured Parzen Estimator
    
    Parameters optimized:
    - max_depth: Tree depth (complexity control)
    - learning_rate: Step size shrinkage (overfitting control)  
    - n_estimators: Number of boosting rounds
    - colsample_bytree: Feature sampling ratio
    - min_child_weight: Minimum sum of instance weight in child
    
    Args:
        X_train: Training features
        y_train: Training target
        
    Returns:
        Dictionary of optimal hyperparameters
    """
    def objective(trial):
        # Define hyperparameter search space
        param = {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'max_depth': trial.suggest_int('max_depth', 2, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e1, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'seed': 69  # Fixed seed for reproducibility
        }
        
        # Train model with current hyperparameters
        model = XGBClassifier(**param, enable_categorical=True)
        model.fit(X_train, y_train)

        # Evaluate on training set (Note: could use validation set for better generalization)
        y_pred = model.predict_proba(X_train)[:, 1]
        return log_loss(y_train, y_pred)
    
    # Create optimization study and run trials
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50)
    return study.best_params

In [38]:
new_X_train = X_train[new_X_train_columns]
new_X_test = X_test[new_X_train_columns]

In [39]:
best_params = optimze_params(new_X_train,y_train)

[I 2024-12-04 12:16:46,443] A new study created in memory with name: no-name-226432d6-0ae1-4c7f-9a9e-7d32655d0757


[I 2024-12-04 12:17:02,542] Trial 0 finished with value: 0.31265785526445 and parameters: {'max_depth': 2, 'learning_rate': 0.02786157091350208, 'n_estimators': 195, 'colsample_bytree': 0.8975445337494109, 'min_child_weight': 6}. Best is trial 0 with value: 0.31265785526445.
[I 2024-12-04 12:17:21,553] Trial 1 finished with value: 1.6581614331877133 and parameters: {'max_depth': 4, 'learning_rate': 4.14531562224322, 'n_estimators': 463, 'colsample_bytree': 0.5104961905103984, 'min_child_weight': 9}. Best is trial 0 with value: 0.31265785526445.
[I 2024-12-04 12:17:42,975] Trial 2 finished with value: 0.29903362965020064 and parameters: {'max_depth': 8, 'learning_rate': 0.016351996758502647, 'n_estimators': 161, 'colsample_bytree': 0.647207750319435, 'min_child_weight': 9}. Best is trial 2 with value: 0.29903362965020064.
[I 2024-12-04 12:17:51,301] Trial 3 finished with value: 0.3100036350591019 and parameters: {'max_depth': 4, 'learning_rate': 0.026188599942194943, 'n_estimators': 107

In [40]:
# Final model training and prediction function
# Uses optimized hyperparameters to train the final model and generate predictions
def create_model(X_train, y_train, X_test, param):
    """
    Train final XGBoost model with optimized parameters and generate predictions
    
    Args:
        X_train: Training features
        y_train: Training target
        X_test: Test features
        param: Optimized hyperparameters from Optuna
        
    Returns:
        y_pred: Predicted probabilities for test set (xWhiff values)
    """
    # Initialize model with optimal parameters
    model = XGBClassifier(**param, enable_categorical=True)
    
    # Train on full training set
    model.fit(X_train, y_train)

    # Generate probability predictions (xWhiff scores)
    y_pred = model.predict_proba(X_test)[:, 1]  # Probability of whiff (class 1)
    return y_pred

In [41]:
# Generate xWhiff predictions for 2024 test data
# Apply the optimized model to create expected whiff probability for each pitch
xwhiff = create_model(new_X_train, y_train, new_X_test, best_params)
test['xwhiff'] = xwhiff

print(f"xWhiff predictions generated for {len(test)} pitches")
print(f"Mean xWhiff: {test['xwhiff'].mean():.3f}")
print(f"xWhiff range: {test['xwhiff'].min():.3f} to {test['xwhiff'].max():.3f}")

In [42]:
# Display the final dataset with xWhiff predictions
# This shows the original pitch data enhanced with expected whiff probabilities
test

Unnamed: 0,PitcherThrows,BatterSide,Outs,Balls,Strikes,TaggedPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,...,InducedVertBreak,HorzBreak,PlateLocHeight,PlateLocSide,VertApprAngle,HorzApprAngle,PitcherId,whiff,count,xwhiff
0,Right,Right,0,0,0,Slider,BallCalled,86.34831,-5.087035,-0.556059,...,-1.26150,-6.71201,-1.24052,-0.79734,-12.231122,-1.751516,1.000234e+09,0,0-0,6.303733e-05
1,Right,Right,0,1,0,Fastball,StrikeCalled,94.49974,-3.133086,-0.492520,...,20.20828,3.49654,3.00046,0.29788,-5.365324,0.134391,1.000234e+09,0,1-0,1.164470e-01
2,Right,Right,0,1,1,Fastball,FoulBallNotFieldable,94.81021,-3.910073,-1.135525,...,22.06875,-0.43740,2.48669,-0.87744,-5.815582,-1.213668,1.000234e+09,0,1-1,2.141389e-03
3,Right,Right,0,1,2,Slider,FoulBallNotFieldable,86.30865,-1.385858,-0.791508,...,2.34610,-6.38485,2.69703,-1.15108,-7.862841,-1.929632,1.000234e+09,0,1-2,8.039500e-05
4,Right,Right,0,1,2,Slider,BallCalled,87.45870,-4.605749,-1.323250,...,0.27646,-4.37162,-0.43496,-1.44974,-11.264591,-2.102031,1.000234e+09,0,1-2,3.355367e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1299541,Right,Left,1,0,2,Fastball,StrikeSwinging,92.18631,0.167760,-2.363777,...,12.64422,18.25938,3.18957,0.77831,-3.753137,0.867156,6.945510e+05,1,0-2,1.518780e-02
1299542,Right,Right,2,0,0,Fastball,StrikeCalled,92.64280,-0.263162,-3.510955,...,15.40280,13.05642,3.09837,-0.84427,-3.653929,-1.209960,6.945510e+05,0,0-0,5.372665e-02
1299543,Right,Right,2,0,1,Fastball,StrikeSwinging,93.60782,-0.195835,-2.925634,...,17.26068,10.25316,3.60768,-0.76626,-3.090463,-1.109277,6.945510e+05,1,0-1,6.737081e-03
1299544,Right,Right,2,0,2,Fastball,BallCalled,92.70635,1.555863,-3.913135,...,16.00694,10.33784,4.86704,-1.41462,-1.669483,-2.081033,6.945510e+05,0,0-2,8.121342e-08


In [43]:
# Export the enhanced dataset for use in inference and analysis
# This file contains all original features plus the new xWhiff metric for further study
test.to_csv('./whiff_csvs/2024_xwhiff_college.csv', index=False)

print("Dataset exported successfully!")
print("Ready for inference analysis and pitch tunneling studies.")