In [70]:
# Pitch Prediction Neural Network Model
# This notebook trains a deep learning model to predict pitch type based on game situation
# Focus: Binary classification (Fastball vs Other) using PyTorch neural networks
#
# Import necessary libraries for deep learning and data analysis
import numpy as np                              # Numerical operations and array handling
import torch                                   # PyTorch deep learning framework
from torch import nn                           # Neural network modules and loss functions
import matplotlib.pyplot as plt                # Visualization for training curves
from sklearn.model_selection import train_test_split  # Data splitting utilities
import pandas as pd                            # Data manipulation and analysis
import torch.optim as optim                   # Optimization algorithms
import random                                  # Random number generation for reproducibility
import seaborn as sns                         # Statistical data visualization

# Set random seeds for reproducibility across multiple runs
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [71]:
# Load the cleaned and preprocessed data from the data cleaning pipeline
# This dataset contains game situations, pitch sequences, and binary fastball targets
print("Loading cleaned dataset...")
data = pd.read_csv('clean_data.csv', low_memory=False)

print(f"Loaded dataset with {data.shape[0]} samples and {data.shape[1]} features")
print("Data includes game situation features + binary fastball target")

In [72]:
# Create working copy of the dataset
# This preserves the original data while allowing modifications
df = data.copy()
print("Created working copy of dataset")

In [73]:
# Remove index column that was saved during CSV export
# This artifact from pandas.to_csv() is not needed for modeling
df = df.drop(columns=['Unnamed: 0'])

print(f"Cleaned dataset shape: {df.shape}")
print("Removed unnecessary index column")
df

Unnamed: 0,Fastball,Balls_0.0,Balls_1.0,Balls_2.0,Balls_3.0,Strikes_0.0,Strikes_1.0,Strikes_2.0,BatterSide_Left,BatterSide_Right,PreviousGroup_BB,PreviousGroup_FB,PreviousGroup_OS,PreviousResult_BallCalled,PreviousResult_FoulBall,PreviousResult_StrikeCalled,PreviousResult_StrikeSwinging
0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
382,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
383,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
384,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [74]:
# Separate features and target variable for machine learning
# X contains all input features (game situation, sequence context)
# y contains binary target (1 = Fastball, 0 = Other pitch types)

X = df.drop(columns=['Fastball'])  # All features except target
y = df['Fastball']                 # Binary target variable

print(f"Feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")
print(f"Target distribution - Fastballs: {y.sum()}, Non-fastballs: {(y == 0).sum()}")
print(f"Fastball percentage: {y.mean():.1%}")

In [75]:
# Split data into training, validation, and test sets
# Standard ML practice: train (70%), validation (15%), test (15%)
# This allows proper model evaluation and hyperparameter tuning

print("Splitting data into train/validation/test sets...")

# First split: 70% train, 30% temp (for validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Second split: Split the 30% into 15% validation, 15% test
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X):.1%})")
print(f"Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(X):.1%})")
print(f"Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X):.1%})")

# Verify target distribution is preserved across splits
print(f"Train fastball %: {y_train.mean():.1%}")
print(f"Validation fastball %: {y_val.mean():.1%}")
print(f"Test fastball %: {y_test.mean():.1%}")

In [76]:
X_train

Unnamed: 0,Balls_0.0,Balls_1.0,Balls_2.0,Balls_3.0,Strikes_0.0,Strikes_1.0,Strikes_2.0,BatterSide_Left,BatterSide_Right,PreviousGroup_BB,PreviousGroup_FB,PreviousGroup_OS,PreviousResult_BallCalled,PreviousResult_FoulBall,PreviousResult_StrikeCalled,PreviousResult_StrikeSwinging
328,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
25,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
278,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
373,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
383,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
340,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
82,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
141,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [77]:
# Neural Network Architecture for Pitch Prediction
# Three-layer feedforward network optimized for binary classification

class NeuralNet(nn.Module):
    """
    Feedforward neural network for predicting pitch type (Fastball vs Other)
    
    Architecture:
    - Input layer: Variable size based on number of features
    - Hidden layer 1: 16 neurons with ReLU activation
    - Hidden layer 2: 8 neurons with ReLU activation  
    - Output layer: 1 neuron with Sigmoid activation (binary probability)
    
    This architecture balances model complexity with interpretability
    for the relatively small feature space of game situations
    """
    
    def __init__(self, input_size):
        super().__init__()
        
        # Define network layers
        self.fc_1 = nn.Linear(input_size, 16)  # First hidden layer
        self.fc_2 = nn.Linear(16, 8)           # Second hidden layer
        self.fc_3 = nn.Linear(8, 1)            # Output layer
        
        # Define activation functions
        self.relu = nn.ReLU()                  # Hidden layer activation
        self.sigmoid = nn.Sigmoid()            # Output activation (probability)

    def forward(self, x):
        """
        Forward pass through the network
        
        Args:
            x: Input tensor of shape (batch_size, input_size)
            
        Returns:
            Probability of fastball (0-1 range)
        """
        # Layer 1: Linear → ReLU
        x = self.fc_1(x)
        x = self.relu(x)
        
        # Layer 2: Linear → ReLU
        x = self.fc_2(x)
        x = self.relu(x)
        
        # Output layer: Linear → Sigmoid (probability)
        x = self.fc_3(x)
        x = self.sigmoid(x)
        
        return x

In [78]:
# Training step function for neural network
# Handles forward pass, loss calculation, backpropagation, and optimization

def train_step(model, X_train, y_train, loss_fn, optimizer, device):
    """
    Perform one training step (forward pass + backpropagation)
    
    Args:
        model: Neural network model
        X_train: Training features
        y_train: Training targets
        loss_fn: Loss function (Binary Cross Entropy)
        optimizer: Optimizer (Adam)
        device: Device for computation (CPU/GPU)
        
    Returns:
        Tuple of (loss_value, accuracy) for this training step
    """
    # Set model to training mode (enables dropout, batch norm, etc.)
    model.train()

    # Convert pandas dataframes to PyTorch tensors and move to device
    X_train = torch.tensor(X_train.values, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train.values, dtype=torch.float32).to(device)

    # Forward pass: get model predictions
    y_pred = model(X_train).squeeze()  # Remove extra dimension

    # Calculate loss between predictions and true labels
    train_loss = loss_fn(y_pred, y_train)

    # Calculate accuracy (percentage of correct predictions)
    # Convert probabilities to binary predictions using 0.5 threshold
    correct = ((y_pred > 0.5) == y_train).sum().item()
    accuracy = correct / y_train.shape[0]

    # Backpropagation: compute gradients and update weights
    optimizer.zero_grad()    # Clear previous gradients
    train_loss.backward()    # Compute gradients
    optimizer.step()         # Update model parameters

    return train_loss.item(), accuracy

In [79]:
# Evaluation step function for model validation/testing
# Performs forward pass without gradient computation for efficiency

def evaluation_step(model, X_test, y_test, loss_fn, device):
    """
    Evaluate model performance on validation or test data
    
    Args:
        model: Neural network model
        X_test: Test/validation features
        y_test: Test/validation targets
        loss_fn: Loss function (Binary Cross Entropy)
        device: Device for computation (CPU/GPU)
        
    Returns:
        Tuple of (loss_value, accuracy) for evaluation
    """
    # Set model to evaluation mode (disables dropout, batch norm training)
    model.eval()

    # Disable gradient computation for efficiency during evaluation
    with torch.no_grad():
        
        # Convert data to tensors and move to device
        X_test = torch.tensor(X_test.values, dtype=torch.float32).to(device)
        y_test = torch.tensor(y_test.values, dtype=torch.float32).to(device)

        # Forward pass: get model predictions
        y_pred = model(X_test).squeeze()
        
        # Compute loss between predictions and true labels
        loss = loss_fn(y_pred, y_test)

        # Compute accuracy using 0.5 probability threshold
        correct = ((y_pred > 0.5) == y_test).sum().item()
        accuracy = correct / y_test.shape[0]

    return loss.item(), accuracy

In [80]:
# Complete model training function
# Handles device setup, model initialization, and training loop

def train_model(X_train, y_train, X_test, y_test, X_val, y_val, random_seed):
    """
    Train neural network model with comprehensive evaluation
    
    Args:
        X_train, y_train: Training data
        X_test, y_test: Test data  
        X_val, y_val: Validation data
        random_seed: Seed for reproducible results
        
    Returns:
        Tuple containing trained model and performance metrics
    """
    
    # Device selection for optimal performance
    # Priority: CUDA GPU > Apple MPS > CPU
    device = "cpu"  # Default fallback
    if torch.cuda.is_available(): 
        device = "cuda"
        print("Using CUDA GPU for training")
    elif torch.backends.mps.is_available():
        device = "mps"
        print("Using Apple Metal Performance Shaders")
    else:
        print("Using CPU for training")
    
    # Set random seed for reproducible results
    torch.manual_seed(random_seed)
    
    # Training configuration
    num_epochs = 100
    learning_rate = 0.05
    
    # Initialize performance tracking lists
    train_loss = []
    train_accuracy = []
    test_loss = []
    test_accuracy = []
    val_loss = []
    val_accuracy = []

    # Initialize model with input size matching feature count
    input_size = X_train.shape[1]
    model = NeuralNet(input_size).to(device)
    print(f"Model initialized with {input_size} input features")

    # Define loss function and optimizer
    loss_fn = nn.BCELoss()  # Binary Cross Entropy for binary classification
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    print(f"Starting training for {num_epochs} epochs...")
    for epoch in range(num_epochs):
        # Perform training step
        training_loss, train_acc = train_step(model, X_train, y_train, loss_fn, optimizer, device)
        
        # Evaluate on test and validation sets
        testing_loss, test_acc = evaluation_step(model, X_test, y_test, loss_fn, device)
        validation_loss, validation_acc = evaluation_step(model, X_val, y_val, loss_fn, device)

        # Store metrics for analysis
        train_loss.append(training_loss)
        train_accuracy.append(train_acc)
        test_loss.append(testing_loss)
        test_accuracy.append(test_acc)
        val_loss.append(validation_loss)
        val_accuracy.append(validation_acc)
        
        # Print progress every 20 epochs
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} - "
                  f"Train Acc: {train_acc:.3f}, Val Acc: {validation_acc:.3f}, Test Acc: {test_acc:.3f}")
    
    print("Training completed!")
    print(f'Final Results - Train: {train_accuracy[-1]:.3f}, Val: {val_accuracy[-1]:.3f}, Test: {test_accuracy[-1]:.3f}')

    return model, train_loss, train_accuracy, test_loss, test_accuracy, val_loss, val_accuracy

In [81]:
# Model robustness testing with multiple random seeds
# This ensures results are consistent and not dependent on random initialization

print("Running multiple training sessions for robust performance estimation...")

# Generate 10 different random seeds for independent training runs
random_seeds = [random.randint(0, 10000) for _ in range(10)]
print(f"Using {len(random_seeds)} different random seeds: {random_seeds}")

# Storage for aggregated results across all runs
total_train_acc = []
total_test_acc = []
total_train_loss = []
total_test_loss = []
total_val_acc = []

# Train model multiple times with different initializations
for i, seed in enumerate(random_seeds):
    print(f"\n=== Training Run {i+1}/{len(random_seeds)} (seed: {seed}) ===")
    
    model, train_loss, train_accuracy, test_loss, test_accuracy, val_loss, val_accuracy = train_model(
        X_train, y_train, X_test, y_test, X_val, y_val, seed
    )
    
    # Store final epoch results for each run
    total_train_acc.append(train_accuracy[-1])
    total_test_acc.append(test_accuracy[-1])
    total_train_loss.append(train_loss[-1])
    total_test_loss.append(test_loss[-1])
    total_val_acc.append(val_accuracy[-1])

# Calculate summary statistics across all runs
print(f"\n{'='*50}")
print("FINAL RESULTS ACROSS ALL RUNS:")
print(f"{'='*50}")
print(f'Average Training Accuracy: {np.mean(total_train_acc):.3f} ± {np.std(total_train_acc):.3f}')
print(f'Average Validation Accuracy: {np.mean(total_val_acc):.3f} ± {np.std(total_val_acc):.3f}')
print(f'Average Test Accuracy: {np.mean(total_test_acc):.3f} ± {np.std(total_test_acc):.3f}')
print(f'Average Training Loss: {np.mean(total_train_loss):.3f} ± {np.std(total_train_loss):.3f}')
print(f'Average Test Loss: {np.mean(total_test_loss):.3f} ± {np.std(total_test_loss):.3f}')

# Store the final trained model for prediction
final_model = model

Training Loss: 0.45, Training Accuracy: 0.75, Testing Loss: 0.65, Testing Accuracy: 0.72


In [82]:
# Comprehensive scenario generation for pitch prediction
# Creates all possible game situations to test model predictions

import itertools

def generate_all_scenarios():
    """
    Generate all possible combinations of game situations for prediction
    
    This creates a comprehensive test set covering every possible:
    - Count situation (balls-strikes)
    - Batter handedness  
    - Previous pitch type and outcome
    
    Returns:
        DataFrame with all possible game scenarios (excluding 0-0 counts)
    """
    
    # Define all possible values for each feature
    balls = [0, 1, 2, 3]                                                    # Ball counts
    strikes = [0, 1, 2]                                                     # Strike counts  
    batter_sides = ['Left', 'Right']                                        # Batter handedness
    previous_groups = ['FB', 'BB', 'OS']                                    # Previous pitch categories
    previous_results = ['BallCalled', 'StrikeCalled', 'StrikeSwinging', 'FoulBall']  # Previous outcomes

    print("Generating all possible game scenarios...")
    print(f"Combinations: {len(balls)} ball counts × {len(strikes)} strike counts × "
          f"{len(batter_sides)} batter sides × {len(previous_groups)} prev groups × "
          f"{len(previous_results)} prev results")

    # Create Cartesian product of all feature combinations
    all_combos = list(itertools.product(balls, strikes, batter_sides, previous_groups, previous_results))
    print(f"Total combinations before filtering: {len(all_combos)}")

    # Convert to DataFrame for easier manipulation
    df = pd.DataFrame(all_combos, columns=[
        'Balls', 'Strikes', 'BatterSide', 'PreviousGroup', 'PreviousResult'
    ])

    # Sort scenarios for logical organization
    df_sorted = df.sort_values(by=['BatterSide', 'Balls', 'Strikes', 'PreviousGroup']).reset_index(drop=True)

    # Remove 0-0 count scenarios (first pitch situations)
    # These don't have meaningful "previous pitch" context
    df_filtered = df_sorted[~((df_sorted['Balls'] == 0) & (df_sorted['Strikes'] == 0))]
    
    print(f"Final scenarios after removing 0-0 counts: {len(df_filtered)}")
    
    return df_filtered

In [87]:
# Prediction function for all possible game scenarios
# Uses trained model to predict fastball probability in every situation

def predict_probabilities():
    """
    Generate fastball predictions for all possible game scenarios
    
    This creates a comprehensive lookup table showing the model's
    fastball probability predictions for every combination of:
    - Game count (balls-strikes)
    - Batter handedness
    - Previous pitch type and outcome
    
    Returns:
        DataFrame with scenarios and corresponding fastball predictions
    """
    
    # Device setup (same as training)
    device = "cpu"
    if torch.cuda.is_available(): 
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    
    print(f"Running predictions on {device}")

    # Generate all possible game scenarios
    scenario_df = generate_all_scenarios()
    
    # Convert categorical variables to dummy variables (same as training data)
    print("Converting scenarios to model input format...")
    dummy_df = pd.get_dummies(scenario_df, columns=['Balls', 'Strikes', 'BatterSide', 'PreviousGroup', 'PreviousResult'])

    # Convert boolean columns to float32 for PyTorch compatibility
    dummy_df = dummy_df.astype({col: 'float32' for col in dummy_df.select_dtypes(include='bool').columns})

    # Ensure dummy_df has same columns as training data (fill missing with 0)
    # This handles cases where some categorical values might not appear in scenarios
    dummy_df = dummy_df.reindex(columns=X_train.columns, fill_value=0)
    
    print(f"Prepared {len(dummy_df)} scenarios with {dummy_df.shape[1]} features each")

    # Convert to PyTorch tensor for model input
    X_tensor = torch.tensor(dummy_df.values, dtype=torch.float32).to(device)

    # Generate predictions using trained model
    final_model.eval()  # Ensure model is in evaluation mode
    print("Generating predictions...")

    with torch.no_grad():
        # Get probability predictions
        probs = final_model(X_tensor).squeeze().cpu().numpy()
        
        # Convert probabilities to binary predictions using 0.5 threshold
        preds = (probs >= 0.5).astype(int) 
    
    # Add predictions to scenario dataframe
    scenario_df['FB_Probability'] = probs
    scenario_df['Binary_Prediction'] = preds
    scenario_df['Prediction'] = np.where(scenario_df['Binary_Prediction'] == 1, 'Fastball', 'Other')

    # Reset index for clean output
    scenario_df = scenario_df.reset_index(drop=True)
    
    print(f"Predictions complete! {len(scenario_df)} scenarios analyzed")
    print(f"Average fastball probability: {probs.mean():.3f}")
    print(f"Predicted fastballs: {preds.sum()} / {len(preds)} ({preds.mean():.1%})")

    return scenario_df

In [88]:
# Generate comprehensive predictions for all game scenarios
# This creates the final prediction lookup table
print("Creating comprehensive prediction table...")
prediction_results = predict_probabilities()

In [89]:
# Display the comprehensive prediction results
# This table shows fastball probability for every possible game situation
print("Prediction Results Summary:")
print(f"Total scenarios: {len(prediction_results)}")
print(f"Columns: {list(prediction_results.columns)}")
print("\nSample predictions:")
prediction_results.head(10)

Unnamed: 0,Balls,Strikes,BatterSide,PreviousGroup,PreviousResult,FB Probability,Prediction
36,1,0,Left,BB,BallCalled,0.567934,Fastball
37,1,0,Left,BB,StrikeCalled,0.999961,Fastball
38,1,0,Left,BB,StrikeSwinging,0.455118,Other
39,1,0,Left,BB,FoulBall,0.999979,Fastball
40,1,0,Left,FB,BallCalled,0.123673,Other
...,...,...,...,...,...,...,...
259,3,0,Right,FB,FoulBall,0.443388,Other
260,3,0,Right,OS,BallCalled,0.999991,Fastball
261,3,0,Right,OS,StrikeCalled,1.000000,Fastball
262,3,0,Right,OS,StrikeSwinging,0.999967,Fastball


In [86]:
# Export prediction results for practical application
# This CSV can be used by coaches/analysts for real-time game decision making

output_filename = './Predictor_csvs/Maryland_mccoy.csv'
prediction_results.to_csv(output_filename, index=False)

print(f"✅ Prediction model complete!")
print(f"📊 Results saved to: {output_filename}")
print(f"📈 {len(prediction_results)} game scenarios with fastball predictions")
print(f"🎯 Model ready for real-time pitch prediction!")

# Display some interesting insights
high_fb_prob = prediction_results[prediction_results['FB_Probability'] > 0.8]
low_fb_prob = prediction_results[prediction_results['FB_Probability'] < 0.2]

print(f"\n📋 Model Insights:")
print(f"   • High fastball probability situations (>80%): {len(high_fb_prob)}")
print(f"   • Low fastball probability situations (<20%): {len(low_fb_prob)}")
print(f"   • Most predictable situations favor: {'Fastballs' if len(high_fb_prob) > len(low_fb_prob) else 'Off-speed pitches'}")