# MLB GRPO Models - Training and Evaluation

This notebook implements Group Relative Policy Optimization (GRPO) models for MLB game prediction.

## Models Implemented:
1. **GRPO Classifier** - Neural network with GRPO-style training
2. **GRPO Ensemble** - Ensemble of models with different feature subsets
3. **GRPO Ranking Model** - Pairwise ranking approach for matchups
4. **GRPO Betting Optimizer** - Model optimized for betting ROI

## Enhanced Features:
- Pitcher vs batter matchup features
- Head-to-head team results
- Situational features (RISP, clutch, etc.)
- Pitcher 3-start rolling averages
- MLB Advanced stats (45+ features)
- Betting odds integration with ROI analysis

In [None]:
# Standard imports
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Sklearn imports
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

print("Imports successful!")

In [None]:
# Import GRPO modules
import sys
sys.path.insert(0, '.')

from grpo_models.feature_engineering import MLBFeatureEngineer, FeatureConfig, get_feature_columns
from grpo_models.grpo_models import (
    GRPOConfig, GRPOClassifier, GRPOEnsemble, 
    GRPORankingModel, GRPOBettingOptimizer
)
from grpo_models.betting_integration import (
    BettingConfig, BettingAnalyzer, ROIAnalyzer,
    add_synthetic_odds, generate_betting_report
)
from grpo_models.training_pipeline import (
    TrainingConfig, DataPreparer, ModelTrainer,
    BettingEvaluator, ResultsReporter
)

print("GRPO modules imported successfully!")

## 1. Load and Prepare Data

In [None]:
# Load the data
# Note: Update these paths to match your data location
batting_file = 'cleaned_batting_df.csv'  # Update path if needed
# NOTE(review): 'piching' looks like a typo for 'pitching' -- confirm against the file name on disk
pitching_file = 'cleaned_piching_df.csv'  # Update path if needed
game_file = 'cleaned_game_df.csv'  # Update path if needed

try:
    batting_df = pd.read_csv(batting_file)
    pitching_df = pd.read_csv(pitching_file)
    game_df = pd.read_csv(game_file)
except FileNotFoundError as e:
    print(f"File not found: {e}")
    print("Please update the file paths above to point to your data files.")
    # Re-raise so "Run All" stops here with the real cause; otherwise the
    # next cell fails with a NameError on the frames that were never created.
    raise

# Clean up index columns (a leftover 'Unnamed: 0' column is dropped when present)
for df in [batting_df, pitching_df, game_df]:
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

print(f"Batting data: {batting_df.shape}")
print(f"Pitching data: {pitching_df.shape}")
print(f"Game data: {game_df.shape}")

In [None]:
# Quick look at which columns the game table provides
print("Game DataFrame columns:", game_df.columns.tolist(), sep="\n")

## 2. Feature Engineering

Apply enhanced feature engineering including:
- Head-to-head features
- Situational features (RISP, clutch hitting)
- Pitcher 3-start rolling averages
- Advanced team stats (Pythagorean expectation, run differential, streaks)

In [None]:
# Build the feature-engineering settings, then the engineer that consumes them
feature_settings = {
    'rolling_windows': [3, 7, 14],
    'pitcher_rolling_starts': 3,
    'include_advanced_stats': True,
    'include_betting_odds': True,
}
feature_config = FeatureConfig(**feature_settings)
feature_engineer = MLBFeatureEngineer(feature_config)

# Echo the settings that were actually stored on the config object
print("Feature engineer initialized with config:")
print(f"  Rolling windows: {feature_config.rolling_windows}")
print(f"  Pitcher rolling starts: {feature_config.pitcher_rolling_starts}")

In [None]:
# Standardize team names: 'FLA' (batting/pitching tables) and 'FLO' (game table)
# are both rewritten to 'MIA', in place, across every column of each frame.
for frame, old_code in ((batting_df, 'FLA'), (pitching_df, 'FLA'), (game_df, 'FLO')):
    frame.replace(to_replace=old_code, value='MIA', inplace=True)

# Additional standardization for game_df: three-letter codes on the left are
# replaced by the abbreviations on the right.
team_mapping = {
    "NYA": "NYY",
    "SDN": "SD",
    "CHN": "CHC",
    "SLN": "STL",
    "SFN": "SF",
    "LAN": "LAD",
    "TBA": "TB",
    "KCA": "KC",
    "CHA": "CWS",
    "ANA": "LAA",
    "NYN": "NYM",
}
game_df.replace(to_replace=team_mapping, inplace=True)

print("Team names standardized.")

In [None]:
# Label: True when the home side scored more runs (only built if it is missing)
if 'Home_team_won?' not in game_df:
    home_runs = game_df['HomeRunsScore']
    visitor_runs = game_df['VisitorRunsScored']
    game_df['Home_team_won?'] = home_runs > visitor_runs

# Parse the YYYYMMDD 'Date' column into a real datetime column when needed
if 'Date' in game_df and 'New_Date' not in game_df:
    date_strings = game_df['Date'].astype(str)
    game_df['New_Date'] = pd.to_datetime(date_strings, format='%Y%m%d')

# Season helpers: the game's calendar year and the year before it
for year_column, years_back in (('current_year', 0), ('prior_year', 1)):
    if year_column not in game_df:
        game_df[year_column] = game_df['New_Date'].dt.year - years_back

print("Target variable distribution:")
print(game_df['Home_team_won?'].value_counts())

In [None]:
# Run the full feature-engineering pass with every optional feature group enabled
print("Applying feature engineering...")

feature_groups = {
    'include_h2h': True,
    'include_situational': True,
    'include_pitcher_rolling': True,
    'include_advanced': True,
}
df_featured = feature_engineer.engineer_all_features(
    game_df, batting_df, pitching_df, **feature_groups
)

# Report the width (all columns, not just model inputs) and length of the result
print(f"\nTotal features after engineering: {len(df_featured.columns)}")
print(f"Total games: {len(df_featured)}")

In [None]:
# Get feature columns (excluding non-feature columns)
feature_columns = get_feature_columns(df_featured)
print(f"\nNumber of model features: {len(feature_columns)}")
print("\nFeature categories:")

# Bucket features by substrings of their names. The first two buckets match
# case-insensitively; the last two match the name exactly as written.
h2h_features = [col for col in feature_columns if 'h2h' in col.lower()]
pitcher_features = [col for col in feature_columns if 'pitcher' in col.lower()]

# NOTE(review): rolling_windows above also contains 14, but no '14d_' marker is
# matched here -- confirm whether 14-game features should be counted as rolling.
rolling_markers = ('3d_', '7d_', 'rolling')
rolling_features = [
    col for col in feature_columns
    if any(marker in col for marker in rolling_markers)
]

advanced_markers = ('pythag', 'streak', 'clutch', 'momentum')
advanced_features = [
    col for col in feature_columns
    if any(marker in col for marker in advanced_markers)
]

category_counts = (
    ("Head-to-head features", h2h_features),
    ("Pitcher features", pitcher_features),
    ("Rolling average features", rolling_features),
    ("Advanced stats features", advanced_features),
)
for category_label, members in category_counts:
    print(f"  - {category_label}: {len(members)}")

## 3. Train/Test Split by Season

In [None]:
# Define custom train/test split function
def train_test_split_by_year(df, feature_cols, target_col, train_years, test_years,
                             year_col='current_year', fill_value=-1):
    """
    Split data by season for proper temporal validation.

    A row goes to the training split when its ``year_col`` value is in
    ``train_years`` and to the test split when it is in ``test_years``. Rows
    matching neither list are left out. The two lists are not checked for
    overlap, so a year present in both appears in both splits.

    Parameters
    ----------
    df : pd.DataFrame
        Source table containing ``year_col``, ``target_col`` and every column
        named in ``feature_cols``.
    feature_cols : list of str
        Columns to return as model inputs.
    target_col : str
        Column holding the label; it is cast to ``int``.
    train_years, test_years : iterable
        Year values selecting the rows of each split.
    year_col : str, default 'current_year'
        Column the year lists are compared against.
    fill_value : scalar, default -1
        Replacement for missing feature values, including cells that could not
        be converted to a number.

    Returns
    -------
    X_train, X_test : pd.DataFrame
        Numeric feature frames with no missing values.
    y_train, y_test : np.ndarray
        Integer label arrays, row-aligned with the matching feature frame.
    """
    def _split_for(years):
        # A single mask is reused for features and labels so they stay aligned.
        mask = df[year_col].isin(years)
        # Non-numeric cells are coerced to NaN first, then filled with the
        # sentinel together with values that were already missing.
        features = (
            df.loc[mask, feature_cols]
            .apply(pd.to_numeric, errors='coerce')
            .fillna(fill_value)
        )
        labels = df.loc[mask, target_col].values.astype(int)
        return features, labels

    X_train, y_train = _split_for(train_years)
    X_test, y_test = _split_for(test_years)

    return X_train, X_test, y_train, y_test

In [None]:
# Split data: seasons 2010-2017 for training, 2018-2019 held out for testing
train_years = [*range(2010, 2018)]
test_years = [2018, 2019]

X_train, X_test, y_train, y_test = train_test_split_by_year(
    df_featured,
    feature_columns,
    'Home_team_won?',
    train_years,
    test_years,
)

# Report split sizes with the first and last season of each split
first_train, last_train = train_years[0], train_years[-1]
first_test, last_test = test_years[0], test_years[-1]
print(f"Training set: {len(X_train)} games ({first_train}-{last_train})")
print(f"Test set: {len(X_test)} games ({first_test}-{last_test})")

# The label mean is the share of games won by the home team
print(f"\nTraining home win rate: {y_train.mean():.3f}")
print(f"Test home win rate: {y_test.mean():.3f}")

## 4. Train GRPO Models

Train all four GRPO model variants and compare their performance.

In [None]:
# Shared hyperparameters passed to every GRPO model constructed below
grpo_hyperparameters = {
    'hidden_layers': [128, 64, 32],
    'dropout_rate': 0.3,
    'learning_rate': 0.001,
    'group_size': 8,
    'epochs': 100,
    'early_stopping_patience': 10,
}
grpo_config = GRPOConfig(**grpo_hyperparameters)

# Registries filled in by the training cells: fitted models and their metrics
models, results = {}, {}

In [None]:
# Fit the GRPO classifier and score it on the held-out seasons
print("Training GRPO Classifier...")
grpo_clf = GRPOClassifier(grpo_config)
grpo_clf.fit(X_train, y_train)

# Hard predictions feed accuracy; column 1 of predict_proba feeds ROC-AUC
y_pred = grpo_clf.predict(X_test)
y_proba = grpo_clf.predict_proba(X_test)[:, 1]

models['GRPO_Classifier'] = grpo_clf
classifier_scores = {
    'accuracy': accuracy_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, y_proba),
}
results['GRPO_Classifier'] = classifier_scores

print(f"GRPO Classifier Accuracy: {classifier_scores['accuracy']:.4f}")
print(f"GRPO Classifier ROC-AUC: {classifier_scores['roc_auc']:.4f}")

In [None]:
# Fit the GRPO ensemble and score it on the held-out seasons
print("\nTraining GRPO Ensemble...")
grpo_ensemble = GRPOEnsemble(grpo_config)
grpo_ensemble.fit(X_train, y_train)

# Hard predictions feed accuracy; column 1 of predict_proba feeds ROC-AUC
y_pred = grpo_ensemble.predict(X_test)
y_proba = grpo_ensemble.predict_proba(X_test)[:, 1]

models['GRPO_Ensemble'] = grpo_ensemble
ensemble_scores = {
    'accuracy': accuracy_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, y_proba),
}
results['GRPO_Ensemble'] = ensemble_scores

print(f"\nGRPO Ensemble Accuracy: {ensemble_scores['accuracy']:.4f}")
print(f"GRPO Ensemble ROC-AUC: {ensemble_scores['roc_auc']:.4f}")

In [None]:
# Fit the GRPO ranking model and score it on the held-out seasons
print("\nTraining GRPO Ranking Model...")
grpo_ranking = GRPORankingModel(grpo_config)
grpo_ranking.fit(X_train, y_train)

# Hard predictions feed accuracy; column 1 of predict_proba feeds ROC-AUC
y_pred = grpo_ranking.predict(X_test)
y_proba = grpo_ranking.predict_proba(X_test)[:, 1]

models['GRPO_Ranking'] = grpo_ranking
ranking_scores = {
    'accuracy': accuracy_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, y_proba),
}
results['GRPO_Ranking'] = ranking_scores

print(f"GRPO Ranking Accuracy: {ranking_scores['accuracy']:.4f}")
print(f"GRPO Ranking ROC-AUC: {ranking_scores['roc_auc']:.4f}")

In [None]:
# Fit the GRPO betting optimizer and score it on the held-out seasons
print("\nTraining GRPO Betting Optimizer...")
grpo_betting = GRPOBettingOptimizer(grpo_config)
grpo_betting.fit(X_train, y_train)

# Hard predictions feed accuracy; column 1 of predict_proba feeds ROC-AUC
y_pred = grpo_betting.predict(X_test)
y_proba = grpo_betting.predict_proba(X_test)[:, 1]

models['GRPO_Betting'] = grpo_betting
betting_model_scores = {
    'accuracy': accuracy_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, y_proba),
}
results['GRPO_Betting'] = betting_model_scores

print(f"GRPO Betting Accuracy: {betting_model_scores['accuracy']:.4f}")
print(f"GRPO Betting ROC-AUC: {betting_model_scores['roc_auc']:.4f}")

## 5. Baseline Comparison

Compare GRPO models against traditional ML models.

In [None]:
# Train baseline models for comparison
print("Training baseline models...\n")


def _fit_and_score(estimator):
    """Fit on the training arrays; return test predictions, P(class 1) and metrics."""
    estimator.fit(X_train.values, y_train)
    predicted = estimator.predict(X_test.values)
    positive_proba = estimator.predict_proba(X_test.values)[:, 1]
    scores = {
        'accuracy': accuracy_score(y_test, predicted),
        'roc_auc': roc_auc_score(y_test, positive_proba),
    }
    return predicted, positive_proba, scores


# Random Forest
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_pred, rf_proba, results['RandomForest'] = _fit_and_score(rf)
print(f"Random Forest Accuracy: {results['RandomForest']['accuracy']:.4f}")

# Gradient Boosting
gbc = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
)
gbc_pred, gbc_proba, results['GradientBoosting'] = _fit_and_score(gbc)
print(f"Gradient Boosting Accuracy: {results['GradientBoosting']['accuracy']:.4f}")

# AdaBoost
abc = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
abc_pred, abc_proba, results['AdaBoost'] = _fit_and_score(abc)
print(f"AdaBoost Accuracy: {results['AdaBoost']['accuracy']:.4f}")

In [None]:
# One row per model, one column per metric, best accuracy first
comparison_df = pd.DataFrame(results).T.sort_values('accuracy', ascending=False)

separator = "="*60
print("\n" + separator)
print("MODEL COMPARISON")
print(separator)
print(comparison_df.to_string())

In [None]:
# Visualize model comparison: one horizontal bar panel per metric
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# GRPO models are drawn in green, every other model in blue
colors = ['#2ecc71' if 'GRPO' in name else '#3498db' for name in comparison_df.index]

for ax, metric, metric_label in zip(axes, ('accuracy', 'roc_auc'), ('Accuracy', 'ROC-AUC')):
    comparison_df[metric].plot(kind='barh', ax=ax, color=colors)
    ax.set_xlabel(metric_label)
    ax.set_title(f'Model {metric_label} Comparison')
    baseline = ax.axvline(x=0.5, color='red', linestyle='--', label='Random Baseline')
    # The label only becomes visible once a legend is drawn; restrict it to
    # the reference line so the bars do not add an entry of their own.
    ax.legend(handles=[baseline], loc='lower right')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('graphs', exist_ok=True)
plt.savefig('graphs/grpo_model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Betting Performance Analysis

Evaluate models for betting ROI and profitability.

In [None]:
# Settings for the simulated betting season
betting_settings = {
    'initial_bankroll': 10000.0,
    'bet_sizing': 'half_kelly',
    'min_edge_threshold': 0.03,
    'max_bet_pct': 0.10,
}
betting_config = BettingConfig(**betting_settings)

# Per-model simulation outputs are collected here by the next cell
betting_results = {}
betting_analyzer = BettingAnalyzer(betting_config)

In [None]:
# Evaluate betting performance for each GRPO model
grpo_model_names = ('GRPO_Classifier', 'GRPO_Ensemble', 'GRPO_Ranking', 'GRPO_Betting')

for model_name in grpo_model_names:
    model = models[model_name]
    y_proba = model.predict_proba(X_test)[:, 1]

    # Pair each predicted home-win probability with the actual outcome, then
    # attach synthetic odds derived from the 'home_win_prob' column.
    # NOTE(review): odds built from the model's own probabilities may make the
    # simulated ROI unrepresentative -- confirm against real market odds.
    predictions = add_synthetic_odds(
        pd.DataFrame({'home_win_prob': y_proba, 'Home_team_won?': y_test}),
        'home_win_prob',
    )

    # Run simulation and keep the full result for the plots and summary below
    result = betting_analyzer.simulate_betting_season(predictions)
    betting_results[model_name] = result

    summary_lines = [
        f"\n{model_name}:",
        f"  ROI: {result['roi_pct']:.2f}%",
        f"  Total Bets: {result['total_bets']}",
        f"  Win Rate: {result['win_rate_pct']:.1f}%",
        f"  Final Bankroll: ${result['final_bankroll']:,.2f}",
    ]
    for line in summary_lines:
        print(line)

In [None]:
# Plot bankroll progression for each model
fig, ax = plt.subplots(figsize=(14, 6))

# One line per simulated model; the legend entry carries that model's ROI
for model_name, result in betting_results.items():
    bankroll_history = result['bankroll_history']
    ax.plot(bankroll_history, label=f"{model_name} (ROI: {result['roi_pct']:.1f}%)")

# Reference line at the starting bankroll.
# NOTE(review): 10000 duplicates the initial_bankroll passed to BettingConfig -- keep in sync.
ax.axhline(y=10000, color='black', linestyle='--', label='Initial Bankroll')
ax.set_xlabel('Number of Bets')
ax.set_ylabel('Bankroll ($)')
ax.set_title('Bankroll Progression by Model')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('graphs', exist_ok=True)
plt.savefig('graphs/grpo_betting_performance.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Pick the model with the highest simulated ROI and print its detailed report.
# best_betting_model stays a (name, result) pair because later cells index into it.
best_roi_name = max(betting_results, key=lambda name: betting_results[name]['roi_pct'])
best_betting_model = (best_roi_name, betting_results[best_roi_name])

print(f"\nBest Betting Model: {best_betting_model[0]}")
print(generate_betting_report(best_betting_model[1]))

## 7. ROI Analysis by Confidence Level

In [None]:
# Analyze ROI by confidence level, using the model with the best simulated ROI
best_model_name = best_betting_model[0]
best_model = models[best_model_name]
y_proba = best_model.predict_proba(X_test)[:, 1]

# Analysis input: predicted home-win probability next to the actual outcome
analysis_columns = {
    'home_win_prob': y_proba,
    'Home_team_won?': y_test,
}
predictions = pd.DataFrame(analysis_columns)

roi_analyzer = ROIAnalyzer()
confidence_analysis = roi_analyzer.analyze_roi_by_confidence(predictions)

print(f"\nROI Analysis by Confidence Level ({best_model_name}):")
print(confidence_analysis.to_string(index=False))

In [None]:
# Visualize accuracy and sample size per confidence bucket
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

bucket_positions = range(len(confidence_analysis))
bucket_labels = confidence_analysis['confidence_range']

# (column to plot, y-axis label, panel title) for the left and right panels
panels = (
    ('accuracy', 'Accuracy', 'Accuracy by Confidence Level'),
    ('n_games', 'Number of Games', 'Games by Confidence Level'),
)
for ax, (column, y_label, title) in zip(axes, panels):
    ax.bar(bucket_positions, confidence_analysis[column])
    ax.set_xticks(bucket_positions)
    ax.set_xticklabels(bucket_labels, rotation=45)
    ax.set_ylabel(y_label)
    ax.set_title(title)

# 0.5 reference line on the accuracy panel only
axes[0].axhline(y=0.5, color='red', linestyle='--')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('graphs', exist_ok=True)
plt.savefig('graphs/grpo_confidence_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Feature Importance Analysis

In [None]:
# Get feature importance from the fitted Gradient Boosting baseline.
# feature_columns is the column order the model was trained on, so it lines up
# with feature_importances_ position by position.
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': gbc.feature_importances_
}).sort_values('importance', ascending=False)

# Plot top 20 features, most important at the top
top_features = feature_importance.head(20)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 10))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Most Important Features')
ax.invert_yaxis()  # barh draws the first row at the bottom by default

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs('graphs', exist_ok=True)
plt.savefig('graphs/grpo_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))

## 9. Summary and Conclusions

In [None]:
# Final summary: prediction metrics, betting metrics, then headline findings
banner = "="*80
section_rule = "-"*40

print(banner)
print("MLB GRPO MODELS - FINAL SUMMARY")
print(banner)

print("\nPREDICTION PERFORMANCE:")
print(section_rule)
ranked_by_accuracy = sorted(results.items(), key=lambda kv: kv[1]['accuracy'], reverse=True)
for name, result in ranked_by_accuracy:
    print(f"  {name}: {result['accuracy']:.4f} accuracy, {result['roc_auc']:.4f} ROC-AUC")

print("\nBETTING PERFORMANCE:")
print(section_rule)
ranked_by_roi = sorted(betting_results.items(), key=lambda kv: kv[1]['roi_pct'], reverse=True)
for name, result in ranked_by_roi:
    print(f"  {name}: {result['roi_pct']:.2f}% ROI, ${result['total_profit']:,.2f} profit")

print("\nKEY FINDINGS:")
print(section_rule)
# Each "best" entry is a (model name, metrics dict) pair
best_accuracy_model = max(results.items(), key=lambda kv: kv[1]['accuracy'])
best_roi_model = max(betting_results.items(), key=lambda kv: kv[1]['roi_pct'])
key_findings = [
    f"  - Best accuracy model: {best_accuracy_model[0]} ({best_accuracy_model[1]['accuracy']:.4f})",
    f"  - Best ROI model: {best_roi_model[0]} ({best_roi_model[1]['roi_pct']:.2f}%)",
    f"  - Total features engineered: {len(feature_columns)}",
    f"  - Training games: {len(X_train)}",
    f"  - Test games: {len(X_test)}",
]
for finding in key_findings:
    print(finding)

In [None]:
# Persist every fitted GRPO model as a pickle under models/
import pickle
import os

os.makedirs('models', exist_ok=True)

for name, model in models.items():
    # File name is the registry key lower-cased, e.g. 'GRPO_Ensemble' -> grpo_ensemble.pkl
    model_path = f'models/{name.lower()}.pkl'
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

print("\nModels saved to 'models/' directory.")

## 10. Cross-Validation Results

In [None]:
# 5-fold cross-validation on the training seasons for the three sklearn baselines
from sklearn.model_selection import cross_val_score

print("Cross-validation results (5-fold):")
print("-"*40)

# For sklearn-compatible models
baseline_estimators = {
    'GradientBoosting': gbc,
    'RandomForest': rf,
    'AdaBoost': abc,
}
for name, model in baseline_estimators.items():
    cv_scores = cross_val_score(model, X_train.values, y_train, cv=5, scoring='accuracy')
    # Report the mean fold accuracy and twice the standard deviation across folds
    mean_accuracy = cv_scores.mean()
    spread = cv_scores.std()*2
    print(f"{name}: {mean_accuracy:.4f} (+/- {spread:.4f})")