# 06 - Modeling and Backtesting

**Author:** Lucas Little  
**Course:** CSCA 5522: Data Mining Project  
**University:** University of Colorado - Boulder

This notebook trains machine learning models to predict cryptocurrency volatility on each sampled dataset, incorporating sentiment analysis features alongside technical indicators, and backtests the resulting predictions.

## Objectives
1. Load enhanced feature sets for each sample
2. Implement multiple ML models for volatility prediction on each sample
3. Compare the sentiment-enhanced models with a technical-only baseline across all samples
4. Perform time series cross-validation and evaluate performance
5. Implement a backtesting strategy and analyze the aggregated results

In [None]:
# Core imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
from pathlib import Path
import time
import json
import joblib
import os

# ML imports
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

# Select matplotlib's 'default' style sheet for plotting.
plt.style.use('default')

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

print("Environment setup complete!")

## 1. Process Sampled Data

In [None]:
# Directory layout, relative to the working directory:
#   ../data/processed/sampled
data_dir = Path('..', 'data')
processed_data_dir = data_dir.joinpath('processed')
sampled_dir = processed_data_dir.joinpath('sampled')

# Collects one result record per (sample, model) pair.
all_results = []

def _fold_auc_scores(model, X, y, splitter):
    """Return the ROC AUC of ``model`` on each usable fold of ``splitter``.

    A fresh ``StandardScaler`` is fit on the training rows of every fold and
    then applied to that fold's test rows, so statistics of the held-out
    rows never influence training.

    Args:
        model: Classifier exposing ``fit`` and ``predict_proba``; it is
            refit from scratch on every fold.
        X: DataFrame of feature columns, row-aligned with ``y``.
        y: Series of class labels. The score uses the predicted probability
            of the second class (column 1 of ``predict_proba``).
        splitter: Object whose ``split(X)`` yields positional
            ``(train_index, test_index)`` arrays.

    Returns:
        List of per-fold ROC AUC values. Folds whose training or test labels
        contain fewer than two classes are skipped, so the list may be empty.
    """
    scores = []
    for train_index, test_index in splitter.split(X):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # ROC AUC needs both classes in the test labels, and the classifier
        # needs both classes in the training labels to be fit and to return
        # a second probability column.
        if len(np.unique(y_test)) < 2 or len(np.unique(y_train)) < 2:
            print("Skipping split with only one class.")
            continue

        # Fit the scaler on the training fold only to avoid look-ahead leakage.
        fold_scaler = StandardScaler()
        X_train = fold_scaler.fit_transform(X.iloc[train_index])
        X_test = fold_scaler.transform(X.iloc[test_index])

        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_test, y_pred_proba))
    return scores


# Feature sets and target column are the same for every sample.
tech_features = ['returns', 'volatility', 'rsi', 'macd', 'volume_ratio']
sentiment_features = tech_features + ['sentiment_mean', 'sentiment_var', 'sentiment_count', 'sentiment_momentum', 'sentiment_mean_anomaly']
target_col = 'high_volatility_target'

for i in range(1, 6):
    print(f"\n--- Processing Sample {i} ---")
    enhanced_features_path = sampled_dir / f'enhanced_features_sample_{i}.csv'

    if not enhanced_features_path.exists():
        print(f"⚠️ Enhanced features for sample {i} not found. Skipping.")
        continue

    df = pd.read_csv(enhanced_features_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # TimeSeriesSplit treats row order as time order, so make the rows
    # chronological; a stable sort keeps rows with equal timestamps in
    # file order.
    df = df.set_index('timestamp').sort_index(kind='mergesort')

    # Prepare data (unscaled; scaling happens inside each CV fold)
    X_tech = df[tech_features]
    X_sentiment = df[sentiment_features]
    y = df[target_col]

    # Time series split
    tscv = TimeSeriesSplit(n_splits=5)

    # Models to evaluate (fresh estimators for every sample)
    models = {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42)
    }

    # Evaluate models
    for name, model in models.items():
        print(f"\nEvaluating {name} on sample {i}...")

        # Technical features only, then technical + sentiment features
        cv_scores_tech = _fold_auc_scores(model, X_tech, y, tscv)
        cv_scores_sentiment = _fold_auc_scores(model, X_sentiment, y, tscv)

        # NaN marks a model for which every fold had to be skipped.
        tech_auc = np.mean(cv_scores_tech) if cv_scores_tech else np.nan
        sentiment_auc = np.mean(cv_scores_sentiment) if cv_scores_sentiment else np.nan

        all_results.append({
            'sample': i,
            'model': name,
            'tech_only_auc': tech_auc,
            'sentiment_auc': sentiment_auc
        })
        print(f"  Tech-only AUC: {tech_auc:.4f}")
        print(f"  Sentiment AUC: {sentiment_auc:.4f}")

# Display aggregated results.
# Naming the columns explicitly keeps the frame well-formed even when no
# sample file was found and all_results is empty.
results_df = pd.DataFrame(
    all_results,
    columns=['sample', 'model', 'tech_only_auc', 'sentiment_auc'],
)
print("\n=== AGGREGATED MODEL COMPARISON ===")

if results_df.empty:
    # Nothing was evaluated; do not overwrite an earlier results file with
    # an empty one.
    print("No results to aggregate: no sample was evaluated.")
else:
    # Average only the score columns; the mean of the 'sample' id column
    # carries no meaning.
    print(results_df.groupby('model')[['tech_only_auc', 'sentiment_auc']].mean())

    # Save the per-sample, per-model results
    output_path = processed_data_dir / 'aggregated_model_results.csv'
    results_df.to_csv(output_path, index=False)
    print(f"Aggregated model results saved to: {output_path}")