# Feature Engineering and Model Building

This notebook builds on the data exploration to:
- Engineer features for predicting bowl game covers
- Build and compare linear regression models
- Build and compare hierarchical Bayesian regression models
- Evaluate model performance

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bowl_mania.data import CFBDClient
from bowl_mania.models import LinearRegression, BayesianRegression
from bowl_mania.utils import calculate_metrics, evaluate_predictions

from sklearn.model_selection import train_test_split

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Cached Data

In [None]:
# Initialize client and load cached data
client = CFBDClient()

# Load from cache (assuming you ran the exploration notebook first).
# If a cached file is missing we print how to fix it and then re-raise:
# swallowing the error would leave `games` / `team_stats` undefined (or stale
# from an earlier run), and the following cells would fail with a confusing
# NameError or silently use old data.
try:
    games = client.load_from_cache('games.csv')
    team_stats = client.load_from_cache('team_stats.csv')
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please run the data exploration notebook first to fetch and cache data")
    raise
else:
    print("Data loaded from cache successfully")

## 2. Feature Engineering

Create features that might be predictive of game outcomes and spreads.

In [None]:
# Example feature engineering (will depend on actual data structure)
# This is a template that will need to be customized based on available data

def create_features(games_df, team_stats_df):
    """
    Create features for modeling.

    This is a template - customize based on actual data structure.

    Parameters
    ----------
    games_df : pandas.DataFrame
        Game-level data. When both ``home_points`` and ``away_points`` are
        present they are used to derive the ``point_diff`` target column.
    team_stats_df : pandas.DataFrame
        Team-level statistics. Currently unused; kept in the signature for
        the features that still need to be added.

    Returns
    -------
    pandas.DataFrame
        A copy of ``games_df`` with ``point_diff`` (home minus away points)
        added when both score columns exist. The input frame is not modified.
    """
    # Work on a copy so the caller's frame is not mutated; this keeps the
    # notebook cell idempotent and leaves the raw data intact.
    features_df = games_df.copy()

    # Calculate point differential as target
    if {'home_points', 'away_points'}.issubset(features_df.columns):
        features_df['point_diff'] = features_df['home_points'] - features_df['away_points']

    # Add more features based on available data
    # Examples:
    # - Team offensive/defensive rankings
    # - Historical performance
    # - Conference strength
    # - Home field advantage

    return features_df

# Run the feature-engineering step on the cached frames and list what it produced
games_engineered = create_features(games_df=games, team_stats_df=team_stats)
print("Feature engineering completed")
print(f"Available columns: {[*games_engineered.columns]}")

## 3. Prepare Training and Test Sets

In [None]:
# Select features and target
# This is a template - customize based on engineered features

# The target, plus the raw score columns that must never be used as
# predictors because they would leak the outcome into the features.
target_col = 'point_diff'
excluded_cols = {target_col, 'home_points', 'away_points'}

# Example: take every numeric column as a feature (you'll need to customize this).
# select_dtypes(include='number') also picks up int32/float32 and nullable
# numeric columns, which a comparison against ['int64', 'float64'] would skip.
# NOTE(review): any numeric identifier-like columns are selected too - confirm
# the printed list before trusting the model.
numeric_cols = games_engineered.select_dtypes(include='number').columns
feature_cols = [col for col in numeric_cols if col not in excluded_cols]

print(f"Feature columns: {feature_cols}")

# Prepare X and y (this is a template)
if target_col in games_engineered.columns and feature_cols:
    # Remove rows with missing values
    model_data = games_engineered[feature_cols + [target_col]].dropna()

    X = model_data[feature_cols]
    y = model_data[target_col]

    # Split into train and test sets (fixed seed keeps the split reproducible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"\nTraining set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
else:
    print("Not enough features available for modeling")
    print("Please customize the feature engineering step based on your data")

## 4. Simple Linear Regression Model

In [None]:
# Fit the linear regression model, then score it on both splits.
# The guard skips this cell when the train/test split was never created.
if 'X_train' in locals():
    lr_model = LinearRegression(normalize=True)
    lr_model.fit(X_train, y_train)
    print("Linear Regression Model trained successfully")

    # Predictions for the training split first, then the held-out split
    y_pred_train, y_pred_test = lr_model.predict(X_train), lr_model.predict(X_test)

    # In-sample performance
    train_metrics = lr_model.evaluate(X_train, y_train)
    print("\nTraining Set Performance:")
    for metric, value in train_metrics.items():
        print(f"  {metric}: {value:.4f}")

    # Out-of-sample performance
    test_metrics = lr_model.evaluate(X_test, y_test)
    print("\nTest Set Performance:")
    for metric, value in test_metrics.items():
        print(f"  {metric}: {value:.4f}")

In [None]:
# Show and chart the first ten rows of the model's feature-importance table
# (presumably already ordered by importance - verify against the model class).
if 'lr_model' in locals():
    importance_df = lr_model.get_feature_importance()
    top_features = importance_df.head(10)

    print("\nFeature Importance (Top 10):")
    print(top_features)

    # Horizontal bars, first row at the top
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(top_features['feature'], top_features['abs_coefficient'])
    ax.set_xlabel('Absolute Coefficient Value')
    ax.set_title('Top 10 Most Important Features (Linear Regression)')
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

In [None]:
# Scatter of predicted against actual point differential on the test split
if 'y_pred_test' in locals():
    actual_lo, actual_hi = y_test.min(), y_test.max()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, y_pred_test, alpha=0.5)
    # Dashed red diagonal marks where predicted equals actual
    ax.plot([actual_lo, actual_hi], [actual_lo, actual_hi], 'r--', lw=2)
    ax.set_xlabel('Actual Point Differential')
    ax.set_ylabel('Predicted Point Differential')
    ax.set_title('Linear Regression: Predicted vs Actual')
    fig.tight_layout()
    plt.show()

## 5. Hierarchical Bayesian Regression Model

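Now let's build a Bayesian regression model. We start with a simple non-hierarchical fit as a baseline; the same model class can later be extended to account for hierarchical structure (e.g., by conference).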

In [None]:
# Fit the plain (non-hierarchical) Bayesian regression as a first step
if 'X_train' in locals():
    print("Training Bayesian Regression Model...")
    print("This may take several minutes...\n")

    bayes_model = BayesianRegression(hierarchical=False)

    # Cap the training data at 500 rows for demonstration;
    # in practice, you'd use the full dataset
    sample_size = min(500, len(X_train))
    head_rows = slice(sample_size)
    X_train_sample = X_train.iloc[head_rows]
    y_train_sample = y_train.iloc[head_rows]

    # Sampler settings forwarded to the model's fit method
    sampler_settings = dict(draws=1000, tune=500, chains=2)
    bayes_model.fit(X_train_sample, y_train_sample, **sampler_settings)

    print("\nBayesian model training completed!")

In [None]:
# Print the fitted model's posterior summary (skipped if no model was fitted)
if 'bayes_model' in locals() and bayes_model.is_fitted:
    summary = bayes_model.get_summary()
    print("Posterior Summary:", summary, sep="\n")

In [None]:
# Trace plots for the 'alpha' and 'sigma' parameters of the fitted model
if 'bayes_model' in locals() and bayes_model.is_fitted:
    trace_vars = ['alpha', 'sigma']
    bayes_model.plot_trace(trace_vars)
    plt.tight_layout()
    plt.show()

In [None]:
# Predict and score the Bayesian model on the held-out test split
if 'bayes_model' in locals() and bayes_model.is_fitted:
    y_pred_bayes = bayes_model.predict(X_test)

    bayes_metrics = bayes_model.evaluate(X_test, y_test)

    print("\nBayesian Model Test Set Performance:")
    for metric, value in bayes_metrics.items():
        print(f"  {metric}: {value:.4f}")

## 6. Model Comparison

In [None]:
# Side-by-side test-set metrics for both models, as a table and a bar chart
if 'test_metrics' in locals() and 'bayes_metrics' in locals():
    metrics_by_model = {
        'Linear Regression': test_metrics,
        'Bayesian Regression': bayes_metrics,
    }
    comparison_df = pd.DataFrame(metrics_by_model)

    print("\nModel Comparison:")
    print(comparison_df)

    # Transpose so each model is a group on the x-axis and each metric a bar
    comparison_df.T.plot.bar(figsize=(12, 6))
    plt.title('Model Performance Comparison')
    plt.ylabel('Metric Value')
    plt.xticks(rotation=45, ha='right')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()

## 7. Cover Probability Predictions

Use the models to predict cover probabilities for specific spreads.

In [None]:
# Example: cover probabilities for every test game at a fixed spread of -7
if 'lr_model' in locals() and len(X_test) > 0:
    example_spread = -7.0

    # The residual std from the test-set metrics is passed as the spread std
    lr_cover_prob = lr_model.predict_cover_probability(
        X_test,
        actual_spread=example_spread,
        spread_std=test_metrics['residual_std'],
    )

    print(f"Example spread: {example_spread}")
    print("\nLinear Regression Cover Probabilities (first 5):")
    print(lr_cover_prob[:5])

    # Histogram of the probabilities with the 50% line marked for reference
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(lr_cover_prob, bins=30, alpha=0.7, edgecolor='black')
    ax.set_xlabel('Cover Probability')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of Cover Probabilities (Spread = {example_spread})')
    ax.axvline(x=0.5, color='red', linestyle='--', label='50% threshold')
    ax.legend()
    fig.tight_layout()
    plt.show()

## 8. Next Steps and Recommendations

Based on the model results:

1. **Model Selection**: Choose the model with better out-of-sample performance
2. **Feature Refinement**: Experiment with additional features or feature transformations
3. **Hierarchical Models**: Try hierarchical Bayesian models grouped by conference or other factors
4. **Betting Strategy**: Develop a betting strategy based on cover probability thresholds
5. **Validation**: Test the models on upcoming bowl games to validate predictions

In [None]:
# Final recap of the test-set metrics for whichever models were trained above
print("\n" + "="*60)
print("MODELING SUMMARY")
print("="*60)

if 'test_metrics' in locals():
    print("\nLinear Regression Performance:")
    print(f"  RMSE: {test_metrics['rmse']:.2f} points")
    print(f"  MAE: {test_metrics['mae']:.2f} points")
    # \u00b2 is the superscript two; written as an escape so the label cannot
    # be corrupted by a file-encoding mix-up again (it previously read "RÂ²").
    print(f"  R\u00b2: {test_metrics['r2']:.4f}")

if 'bayes_metrics' in locals():
    print("\nBayesian Regression Performance:")
    print(f"  RMSE: {bayes_metrics['rmse']:.2f} points")
    print(f"  MAE: {bayes_metrics['mae']:.2f} points")

print("\nThe models can now be used to predict cover probabilities for upcoming bowl games!")