# Machine Learning Modeling

This notebook demonstrates model training and evaluation for volatility prediction using both regression and classification approaches.

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, r2_score, f1_score, classification_report, confusion_matrix
import joblib

from src.utils.config import load_config

## Load Processed Dataset

In [None]:
# Resolve the processed dataset location from the project configuration.
cfg = load_config()
dataset_path = os.path.join(cfg.processed_dir, 'dataset.csv')

# Stop early with an actionable message when the dataset file is absent.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset not found at {dataset_path}. Run build_dataset.py first.")

# The 'date' column is parsed into datetimes while reading.
df = pd.read_csv(dataset_path, parse_dates=['date'])

# One-shot sanity summary: size, covered date span and the tickers present.
print(
    f"Loaded dataset with shape: {df.shape}",
    f"Date range: {df['date'].min()} to {df['date'].max()}",
    f"Tickers: {df['ticker'].unique()}",
    sep="\n",
)

# Trailing expression so the notebook renders the first rows as a table.
df.head()

## 1. Data Preparation for Modeling

In [None]:
# Prepare features and targets.
# Identifier and target columns must never be fed to the models as inputs.
non_feature_cols = {'date', 'ticker', 'rv_future', 'vol_bucket'}

# Keep numeric (including boolean) columns only. A plain "not object dtype"
# test would also let datetime/category/string-dtype columns through, and the
# estimators used later cannot consume those.
feature_cols = [
    c for c in df.columns
    if c not in non_feature_cols and pd.api.types.is_numeric_dtype(df[c])
]

# Impute gaps without leaking information:
#   * forward-fill only -- a backward fill would copy values from later rows
#     into earlier ones (look-ahead), including from hold-out rows into
#     training rows when the split keeps row order;
#   * fill within each ticker so one ticker's values never spill into another;
#   * whatever is still missing (e.g. leading gaps of a ticker) becomes 0.0.
# NOTE(review): forward-fill follows row order; this assumes rows are in
# chronological order within each ticker -- confirm against build_dataset.py.
X = (
    df.groupby('ticker', sort=False)[feature_cols]
    .ffill()
    .reindex(columns=feature_cols)  # guarantee exactly the selected columns, in order
    .fillna(0.0)
)
y_reg = df['rv_future']
y_clf = df['vol_bucket']

print(f"Feature matrix shape: {X.shape}")
print(f"Number of features: {len(feature_cols)}")
print(f"Regression target (rv_future) - missing: {y_reg.isnull().sum()}")
print(f"Classification target (vol_bucket) - missing: {y_clf.isnull().sum()}")

# Preview of the first few feature names
print(f"\nSample features: {feature_cols[:10]}")

In [None]:
# Look at how both targets are distributed before any modelling.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: histogram of the non-missing regression target.
y_reg_clean = y_reg.dropna()
axes[0].hist(y_reg_clean, bins=50, alpha=0.7, edgecolor='black')
axes[0].set(
    title='Distribution of Realized Volatility (Target)',
    xlabel='Volatility',
    ylabel='Frequency',
)

# Right panel: bar chart of class counts for the non-missing classification target.
y_clf_clean = y_clf.dropna()
y_clf_clean.value_counts().plot.bar(ax=axes[1])
axes[1].set(
    title='Distribution of Volatility Buckets',
    xlabel='Risk Level',
    ylabel='Count',
)
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Numeric companion to the plots above.
print(
    f"Regression target stats:",
    f"  Mean: {y_reg_clean.mean():.4f}",
    f"  Std: {y_reg_clean.std():.4f}",
    f"  Min: {y_reg_clean.min():.4f}",
    f"  Max: {y_reg_clean.max():.4f}",
    f"\nClassification target distribution:",
    sep="\n",
)
print(y_clf_clean.value_counts())

## 2. Regression Modeling

In [None]:
# Restrict features and target to the rows where the regression target is known.
mask_reg = y_reg.notna()
X_reg, y_reg_clean = X.loc[mask_reg], y_reg.loc[mask_reg]

print(f"Regression dataset: {X_reg.shape[0]} samples, {X_reg.shape[1]} features")

# Hold-out split that keeps row order (shuffle=False): the final 20% of the
# rows become the test set.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg_clean,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

print(
    f"Training set: {X_train_reg.shape[0]} samples",
    f"Test set: {X_test_reg.shape[0]} samples",
    sep="\n",
)

In [None]:
# Train regression models.
# Every estimator gets an explicit random_state so a fresh "Restart & Run All"
# reproduces the same fitted models and metrics.
reg_models = {
    'Random Forest': RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42
    ),
}

# name -> fitted model, hold-out predictions and hold-out metrics
reg_results = {}

for name, model in reg_models.items():
    print(f"\nTraining {name}...")

    # Fit on the training portion only
    model.fit(X_train_reg, y_train_reg)

    # Predict on the held-out portion
    y_pred = model.predict(X_test_reg)

    # Hold-out metrics: RMSE is the square root of the mean squared error
    mse = mean_squared_error(y_test_reg, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_reg, y_pred)

    reg_results[name] = {
        'model': model,
        'predictions': y_pred,
        'rmse': rmse,
        'r2': r2
    }

    print(f"RMSE: {rmse:.4f}")
    print(f"R²: {r2:.4f}")

# Select the model with the lowest hold-out RMSE and persist it.
best_reg_name = min(reg_results, key=lambda x: reg_results[x]['rmse'])
best_reg_model = reg_results[best_reg_name]['model']

# joblib.dump does not create parent directories, so make sure the target
# directory exists before writing.
models_dir = os.path.join(cfg.processed_dir, 'models')
os.makedirs(models_dir, exist_ok=True)
joblib.dump(best_reg_model, os.path.join(models_dir, 'best_regression_model.pkl'))
print(f"\nBest regression model: {best_reg_name} (RMSE: {reg_results[best_reg_name]['rmse']:.4f})")

In [None]:
# Visualize regression results: one actual-vs-predicted panel per trained model.
# The grid is sized from reg_results so adding or removing a model does not
# break the indexing; squeeze=False keeps `axes` two-dimensional even for a
# single model.
n_reg_models = len(reg_results)
fig, axes = plt.subplots(1, n_reg_models, figsize=(7.5 * n_reg_models, 6), squeeze=False)

# End points of the "perfect prediction" diagonal, shared by every panel.
diag_limits = [y_test_reg.min(), y_test_reg.max()]

for ax, (name, results) in zip(axes[0], reg_results.items()):
    # Scatter plot: actual vs predicted
    ax.scatter(y_test_reg, results['predictions'], alpha=0.6)
    # Reference line y = x; points on it are predicted exactly
    ax.plot(diag_limits, diag_limits, 'r--', lw=2)
    ax.set_xlabel('Actual Volatility')
    ax.set_ylabel('Predicted Volatility')
    ax.set_title(f'{name}\nRMSE: {results["rmse"]:.4f}, R²: {results["r2"]:.4f}')

plt.tight_layout()
plt.show()

## 3. Classification Modeling

In [None]:
# Prepare classification data: keep only rows whose class label is known.
mask_clf = y_clf.notna()
X_clf = X.loc[mask_clf]
y_clf_clean = y_clf.loc[mask_clf]

print(f"Classification dataset: {X_clf.shape[0]} samples, {X_clf.shape[1]} features")
print(f"Class distribution:")
print(y_clf_clean.value_counts())

# Train-test split.
# The split keeps row order (shuffle=False), consistent with the regression
# split, so the final 20% of rows form the test set. `stratify` cannot be
# combined with shuffle=False -- train_test_split raises ValueError for that
# combination -- so no stratification is requested; compare the class
# distribution printed above against the per-class support in the
# classification report to spot imbalance between the two partitions.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf_clean, test_size=0.2, random_state=42, shuffle=False
)

print(f"Training set: {X_train_clf.shape[0]} samples")
print(f"Test set: {X_test_clf.shape[0]} samples")

In [None]:
# Train classification models.
# Every estimator gets an explicit random_state so a fresh "Restart & Run All"
# reproduces the same fitted models and metrics.
clf_models = {
    'Random Forest': RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42
    ),
}

# name -> fitted model, hold-out predictions, class probabilities and F1
clf_results = {}

for name, model in clf_models.items():
    print(f"\nTraining {name}...")

    # Fit on the training portion only
    model.fit(X_train_clf, y_train_clf)

    # Hard predictions and per-class probabilities on the held-out portion
    y_pred = model.predict(X_test_clf)
    y_pred_proba = model.predict_proba(X_test_clf)

    # Weighted F1: per-class F1 averaged by class support
    f1 = f1_score(y_test_clf, y_pred, average='weighted')

    clf_results[name] = {
        'model': model,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'f1': f1
    }

    print(f"F1 Score: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_test_clf, y_pred))

# Select the model with the highest hold-out weighted F1 and persist it.
best_clf_name = max(clf_results, key=lambda x: clf_results[x]['f1'])
best_clf_model = clf_results[best_clf_name]['model']

# joblib.dump does not create parent directories, so make sure the target
# directory exists before writing.
models_dir = os.path.join(cfg.processed_dir, 'models')
os.makedirs(models_dir, exist_ok=True)
joblib.dump(best_clf_model, os.path.join(models_dir, 'best_classification_model.pkl'))
print(f"\nBest classification model: {best_clf_name} (F1: {clf_results[best_clf_name]['f1']:.4f})")

In [None]:
# Visualize classification results: one row per model with its confusion
# matrix (left) and top-10 feature importances (right).
# The grid is sized from clf_results so adding or removing a model does not
# break the indexing; squeeze=False keeps `axes` two-dimensional even for a
# single model.
n_clf_models = len(clf_results)
fig, axes = plt.subplots(n_clf_models, 2, figsize=(15, 6 * n_clf_models), squeeze=False)

for i, (name, results) in enumerate(clf_results.items()):
    cm_ax, importance_ax = axes[i]

    # Confusion matrix. Labels are the sorted union of true and predicted
    # classes (the same set confusion_matrix uses by default); passing them
    # explicitly lets the heatmap show class names instead of positions.
    class_labels = sorted(set(y_test_clf) | set(results['predictions']))
    cm = confusion_matrix(y_test_clf, results['predictions'], labels=class_labels)
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax,
        xticklabels=class_labels, yticklabels=class_labels,
    )
    cm_ax.set_title(f'{name} - Confusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')

    # Feature importance, for any model that exposes `feature_importances_`
    if hasattr(results['model'], 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'feature': X_clf.columns,
            'importance': results['model'].feature_importances_
        }).sort_values('importance', ascending=False).head(10)

        positions = range(len(feature_importance))
        importance_ax.barh(positions, feature_importance['importance'])
        importance_ax.set_yticks(positions)
        importance_ax.set_yticklabels(feature_importance['feature'])
        # barh draws position 0 at the bottom; flip so the top feature is on top
        importance_ax.invert_yaxis()
        importance_ax.set_title(f'{name} - Top 10 Feature Importance')
        importance_ax.set_xlabel('Importance')
    else:
        # Nothing to draw for this model: hide the otherwise empty panel
        importance_ax.axis('off')

plt.tight_layout()
plt.show()

## 4. Model Performance Summary

In [None]:
# Performance summary: assemble the whole report as lines, then emit it once.
summary_lines = ["=== MODEL PERFORMANCE SUMMARY ===", "\nRegression Models:"]

# One line per regression model with its hold-out metrics.
summary_lines.extend(
    f"  {name}: RMSE={results['rmse']:.4f}, R²={results['r2']:.4f}"
    for name, results in reg_results.items()
)

# One line per classification model with its hold-out F1.
summary_lines.append("\nClassification Models:")
summary_lines.extend(
    f"  {name}: F1={results['f1']:.4f}"
    for name, results in clf_results.items()
)

# Which models were selected, and the directory they were written to.
summary_lines.extend([
    f"\nBest Models Saved:",
    f"  Regression: {best_reg_name} (RMSE: {reg_results[best_reg_name]['rmse']:.4f})",
    f"  Classification: {best_clf_name} (F1: {clf_results[best_clf_name]['f1']:.4f})",
    f"\nModels saved to: {os.path.join(cfg.processed_dir, 'models')}",
])

print("\n".join(summary_lines))