# Treasury Risk Dashboard - Comprehensive Backtesting

This notebook provides comprehensive backtesting capabilities for the Treasury Risk Dashboard, validating forecast accuracy and risk management strategies using real historical data.

## Contents
1. **Setup and Data Validation**
2. **Forecast Model Backtesting**
3. **Performance Analysis and Visualization**
4. **Risk Metrics Validation**
5. **Real-Time Forecast Accuracy**
6. **Comprehensive Summary and Recommendations**

In [None]:
# Import required libraries
import sys
import os
import asyncio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Add backend to path
sys.path.append('../backend')

# Import our services
from app.services.backtesting import forecast_backtester, risk_backtester, performance_analyzer
from app.services.ingestion import ingestion_service
from app.services.forecasting import forecasting_service
from app.services.risk_metrics import risk_calculator
from app.db.connection import db_manager
from app.utils.config import settings

# Set plotting style: the matplotlib style sheet is applied first, then the
# seaborn colour palette is set on top of it (keep this order).
# NOTE(review): the 'seaborn-v0_8' style name is only known to newer
# matplotlib releases (3.6+, as far as I know) — confirm the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirm the imports above succeeded and report how many symbols the backend
# settings list for each asset class.
print("✅ Successfully imported all dependencies")
print(f"📊 Supported crypto assets: {len(settings.supported_crypto_symbols)}")
print(f"📈 Supported stock assets: {len(settings.supported_stock_symbols)}")

## 1. Setup and Data Validation

First, let's validate our data availability and set up the backtesting environment.

In [None]:
# Configuration for backtesting
# NOTE(review): 'BTC' and 'ETH' look like ticker-style aliases of the two
# crypto ids rather than traditional assets — confirm how the backend
# resolves them before relying on this being a cross-asset mix.
BACKTEST_SYMBOLS = ['bitcoin', 'ethereum', 'BTC', 'ETH']

# Read the clock once so both window edges derive from the same instant and
# the backtest window is exactly 150 days long (two separate now() calls
# would differ by however long the interpreter took between them).
_BACKTEST_REFERENCE_TIME = datetime.now()
BACKTEST_START = _BACKTEST_REFERENCE_TIME - timedelta(days=180)  # 6 months back
BACKTEST_END = _BACKTEST_REFERENCE_TIME - timedelta(days=30)     # Stop 30 days ago to validate recent predictions
FORECAST_HORIZON = 7  # 7-day forecasts

print(f"🗓️ Backtest Period: {BACKTEST_START.date()} to {BACKTEST_END.date()}")
print(f"🔮 Forecast Horizon: {FORECAST_HORIZON} days")
print(f"📊 Testing Symbols: {BACKTEST_SYMBOLS}")

# Check data availability
async def check_data_availability():
    """Report stored price history per backtest symbol and top it up if thin.

    For every entry in ``BACKTEST_SYMBOLS`` the price history is requested
    from ``db_manager`` (with an argument of 200) and its length is printed.
    When fewer than 90 points come back, ``ingestion_service`` is asked to
    ingest 200 days of historical data for that symbol.

    Any exception raised for a symbol is printed and the loop moves on, so
    one failing symbol does not stop the remaining checks.
    """
    print("\n🔍 Checking data availability...")

    for asset in BACKTEST_SYMBOLS:
        try:
            points = await db_manager.get_price_history(asset, 200)
            print(f"  {asset}: {len(points)} data points available")

            # Enough history already stored: nothing more to do for this asset
            if len(points) >= 90:
                continue

            print(f"  ⚠️ Insufficient data for {asset}, attempting to fetch more...")
            await ingestion_service.ingest_historical_data(asset, days=200)

        except Exception as err:
            print(f"  ❌ Error checking {asset}: {err}")

# Run data availability check
# (top-level ``await``: this cell must run in a notebook/IPython kernel that
# supports awaiting at cell level; it is a syntax error in a plain script)
await check_data_availability()
print("\n✅ Data availability check completed")

## 2. Forecast Model Backtesting

Test the accuracy of our forecasting models using walk-forward validation.

In [None]:
# Run comprehensive forecast backtesting
async def run_forecast_backtests():
    """Backtest the ARIMA and Prophet forecasts for every backtest symbol.

    Each symbol in ``BACKTEST_SYMBOLS`` is passed to
    ``forecast_backtester.run_forecast_backtest`` over the
    ``BACKTEST_START``..``BACKTEST_END`` window with ``FORECAST_HORIZON``-day
    forecasts and a refit frequency of 30, and a metrics summary is printed
    for each returned model.

    Returns:
        dict: symbol -> per-model results as returned by the backtester. A
        symbol maps to an empty dict when its backtest, or the printing of
        its summary, raised an exception.
    """
    print("🚀 Starting comprehensive forecast backtesting...\n")

    results_by_symbol = {}

    for asset in BACKTEST_SYMBOLS:
        print(f"📈 Backtesting forecasts for {asset}")

        try:
            per_model = await forecast_backtester.run_forecast_backtest(
                symbol=asset,
                start_date=BACKTEST_START,
                end_date=BACKTEST_END,
                forecast_horizon=FORECAST_HORIZON,
                refit_frequency=30,
                models=['arima', 'prophet'],
            )
            results_by_symbol[asset] = per_model

            # One indented summary section per model; missing metrics print as 0
            for label, outcome in per_model.items():
                stats = outcome.metrics
                print(f"  {label.upper()}:")
                print(f"    📊 Predictions: {stats.get('total_predictions', 0)}")
                print(f"    📉 MAE: {stats.get('mae', 0):.2f}")
                print(f"    📈 MAPE: {stats.get('mape', 0):.1f}%")
                print(f"    🎯 Directional Accuracy: {stats.get('directional_accuracy', 0):.1%}")
                print(f"    🔗 Correlation: {stats.get('correlation', 0):.3f}")

        except Exception as err:
            # Best effort: report the failure and record an empty result so
            # downstream cells can still iterate over every symbol
            print(f"  ❌ Backtest failed for {asset}: {err}")
            results_by_symbol[asset] = {}

        print()

    return results_by_symbol

# Run the backtests
# (top-level ``await`` — needs a notebook/IPython kernel). The result maps
# each symbol to its per-model results and is reused by the analysis cells.
forecast_results = await run_forecast_backtests()
print("✅ Forecast backtesting completed!")

## 3. Performance Analysis and Visualization

Analyze the backtest results and create comprehensive visualizations.

In [None]:
# Analyze forecast performance
def create_performance_summary(results):
    """Flatten nested backtest results into one table row per symbol/model.

    Args:
        results: Mapping of symbol -> mapping of model name -> result object.
            A result is included only if it has a truthy ``metrics``
            attribute supporting ``.get(key, default)``.

    Returns:
        pandas.DataFrame: one row per included result with the columns
        ``Symbol``, ``Model`` (upper-cased model name), ``Predictions``,
        ``MAE``, ``RMSE``, ``MAPE (%)``, ``Directional Accuracy`` and
        ``Correlation``. Metrics that are absent default to 0. The frame is
        empty (no columns) when no result qualifies.
    """
    # (output column, metrics key) pairs, listed in output column order
    metric_columns = (
        ('Predictions', 'total_predictions'),
        ('MAE', 'mae'),
        ('RMSE', 'rmse'),
        ('MAPE (%)', 'mape'),
        ('Directional Accuracy', 'directional_accuracy'),
        ('Correlation', 'correlation'),
    )

    rows = []
    for symbol, models in results.items():
        for model_name, result in models.items():
            stats = getattr(result, 'metrics', None)
            if not stats:
                # No metrics attribute, or an empty one: nothing to report
                continue

            row = {'Symbol': symbol, 'Model': model_name.upper()}
            for column, key in metric_columns:
                row[column] = stats.get(key, 0)
            rows.append(row)

    return pd.DataFrame(rows)

# Build the summary table from the backtest results gathered earlier
summary_df = create_performance_summary(forecast_results)

if summary_df.empty:
    print("❌ No forecast results available for analysis")
else:
    print("📊 FORECAST PERFORMANCE SUMMARY")
    print("=" * 50)

    # Show floats with three decimals (this display option stays set for the
    # rest of the session), then dump the whole table without the index
    pd.set_option('display.float_format', '{:.3f}'.format)
    print(summary_df.to_string(index=False))

    # Aggregate figures across every symbol/model row
    print("\n📈 OVERALL STATISTICS")
    print("=" * 30)
    print(f"Average MAPE: {summary_df['MAPE (%)'].mean():.1f}%")
    print(f"Average Directional Accuracy: {summary_df['Directional Accuracy'].mean():.1%}")
    print(f"Average Correlation: {summary_df['Correlation'].mean():.3f}")
    print(f"Total Predictions: {summary_df['Predictions'].sum()}")

    # Rows with the lowest MAPE and the highest correlation
    best_mape = summary_df.loc[summary_df['MAPE (%)'].idxmin()]
    best_correlation = summary_df.loc[summary_df['Correlation'].idxmax()]

    print(f"\n🏆 Best MAPE: {best_mape['Model']} on {best_mape['Symbol']} ({best_mape['MAPE (%)']:.1f}%)")
    print(f"🏆 Best Correlation: {best_correlation['Model']} on {best_correlation['Symbol']} ({best_correlation['Correlation']:.3f})")

In [None]:
# Create visualizations: a 2x2 figure summarising forecast quality, followed
# by a per-symbol list of model grades. Skipped when there is no summary data.
if not summary_df.empty:
    # Set up the plotting area
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. MAPE by Model and Symbol (heatmap, one cell per symbol/model pair)
    summary_pivot = summary_df.pivot(index='Symbol', columns='Model', values='MAPE (%)')
    sns.heatmap(summary_pivot, annot=True, fmt='.1f', cmap='RdYlGn_r', ax=axes[0,0])
    axes[0,0].set_title('MAPE (%) by Model and Symbol\n(Lower is Better)')

    # 2. Directional Accuracy (mean per model)
    summary_df.groupby('Model')['Directional Accuracy'].mean().plot(kind='bar', ax=axes[0,1], color=['skyblue', 'lightcoral'])
    axes[0,1].set_title('Average Directional Accuracy by Model')
    axes[0,1].set_ylabel('Directional Accuracy')
    axes[0,1].set_ylim(0, 1)
    axes[0,1].tick_params(axis='x', rotation=0)

    # 3. Correlation Analysis (distribution per model)
    summary_df.boxplot(column='Correlation', by='Model', ax=axes[1,0])
    axes[1,0].set_title('Correlation Distribution by Model')
    axes[1,0].set_xlabel('Model')

    # pandas' grouped boxplot (``by=...``) sets its own "Boxplot grouped by
    # Model" suptitle on the figure, so the dashboard title has to be applied
    # after that call or it would be overwritten.
    fig.suptitle('Forecast Model Performance Analysis', fontsize=16, fontweight='bold')

    # 4. Model Performance Scatter (one colour/legend entry per model)
    for model in summary_df['Model'].unique():
        model_data = summary_df[summary_df['Model'] == model]
        axes[1,1].scatter(model_data['MAPE (%)'], model_data['Correlation'],
                         label=model, alpha=0.7, s=100)

    axes[1,1].set_xlabel('MAPE (%)')
    axes[1,1].set_ylabel('Correlation')
    axes[1,1].set_title('MAPE vs Correlation by Model')
    axes[1,1].legend()
    axes[1,1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Performance grades analysis
    print("\n🎓 MODEL PERFORMANCE GRADES")
    print("=" * 35)

    for symbol, models in forecast_results.items():
        if models:
            print(f"\n{symbol.upper()}:")
            for model_name, result in models.items():
                if hasattr(result, 'metrics'):
                    # NOTE(review): this calls an underscore-prefixed helper
                    # of performance_analyzer — confirm it is meant to be
                    # used from outside the service.
                    grade = performance_analyzer._grade_performance(result.metrics)
                    print(f"  {model_name.upper()}: Grade {grade}")
else:
    print("⚠️ No data available for visualization")

## 4. Risk Metrics Validation

Validate our risk calculation accuracy using historical portfolio data.

In [None]:
# Test portfolio risk calculation accuracy
async def validate_risk_metrics():
    """Compute and print risk metrics for three fixed sample portfolios.

    The Conservative, Balanced and Aggressive allocations defined below are
    each passed to ``risk_calculator.calculate_portfolio_metrics`` with a
    90-day lookback and 95%/99% confidence levels. The ``'metrics'`` entry of
    each response is printed as a short summary and collected.

    Returns:
        dict: portfolio name -> metrics mapping, or an empty dict for a
        portfolio whose calculation (or summary printing) raised.
    """
    print("🔍 Validating portfolio risk metrics...\n")

    # Sample allocations: asset id -> portfolio weight
    sample_portfolios = {
        'Conservative': {'bitcoin': 0.3, 'ethereum': 0.2, 'USDC': 0.5},
        'Balanced': {'bitcoin': 0.4, 'ethereum': 0.3, 'USDC': 0.3},
        'Aggressive': {'bitcoin': 0.6, 'ethereum': 0.4},
    }

    collected = {}

    for label, allocation in sample_portfolios.items():
        print(f"📊 Testing {label} Portfolio")
        print(f"   Allocation: {allocation}")

        try:
            response = await risk_calculator.calculate_portfolio_metrics(
                portfolio_weights=allocation,
                lookback_days=90,
                confidence_levels=[0.95, 0.99],
            )
            stats = response['metrics']

            # Values are multiplied by 100 for the percentage display; VaR
            # and drawdown are shown as magnitudes via abs()
            print(f"   📈 Annual Volatility: {stats.get('annualized_volatility', 0)*100:.1f}%")
            print(f"   📉 VaR (95%): {abs(stats.get('var_95', {}).get('historical', 0))*100:.1f}%")
            print(f"   ⚡ Sharpe Ratio: {stats.get('sharpe_ratio', 0):.2f}")
            print(f"   📊 Max Drawdown: {abs(stats.get('max_drawdown', 0))*100:.1f}%")
            print(f"   🎯 Concentration Ratio: {stats.get('concentration_ratio', 0)*100:.1f}%")

            collected[label] = stats

        except Exception as err:
            # Best effort: report the failure and record an empty result
            print(f"   ❌ Risk calculation failed: {err}")
            collected[label] = {}

        print()

    return collected

# Run risk validation
# (top-level ``await`` — needs a notebook/IPython kernel). The result maps
# each portfolio name to its metrics dict and is reused by later cells.
risk_results = await validate_risk_metrics()
print("✅ Risk metrics validation completed!")

In [None]:
# Visualize risk metrics comparison: a 2x2 figure plus a printed assessment
# per portfolio. Portfolios whose metrics dict is empty are left out.
if not risk_results:
    print("❌ No risk validation results available")
else:
    # One row per portfolio with metrics; the "(%)" columns are the raw
    # values multiplied by 100, with VaR and drawdown taken as magnitudes
    risk_comparison = [
        {
            'Portfolio': portfolio_name,
            'Volatility (%)': metrics.get('annualized_volatility', 0) * 100,
            'VaR 95% (%)': abs(metrics.get('var_95', {}).get('historical', 0)) * 100,
            'Sharpe Ratio': metrics.get('sharpe_ratio', 0),
            'Max Drawdown (%)': abs(metrics.get('max_drawdown', 0)) * 100,
            'Concentration (%)': metrics.get('concentration_ratio', 0) * 100,
        }
        for portfolio_name, metrics in risk_results.items()
        if metrics
    ]

    if not risk_comparison:
        print("⚠️ No risk data available for comparison")
    else:
        risk_df = pd.DataFrame(risk_comparison)

        # Create risk comparison visualization
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Portfolio Risk Metrics Comparison', fontsize=16, fontweight='bold')

        # 1. Volatility comparison (bar per portfolio)
        risk_df.set_index('Portfolio')['Volatility (%)'].plot(kind='bar', ax=axes[0,0], color='skyblue')
        axes[0,0].set_title('Annual Volatility by Portfolio')
        axes[0,0].set_ylabel('Volatility (%)')
        axes[0,0].tick_params(axis='x', rotation=45)

        # 2. VaR comparison (bar per portfolio)
        risk_df.set_index('Portfolio')['VaR 95% (%)'].plot(kind='bar', ax=axes[0,1], color='lightcoral')
        axes[0,1].set_title('Value at Risk (95%) by Portfolio')
        axes[0,1].set_ylabel('VaR (%)')
        axes[0,1].tick_params(axis='x', rotation=45)

        # 3. Risk-Return scatter (one labelled point per portfolio)
        for _, row in risk_df.iterrows():
            axes[1,0].scatter(row['Volatility (%)'], row['Sharpe Ratio'],
                            label=row['Portfolio'], s=150, alpha=0.7)

        axes[1,0].set_xlabel('Volatility (%)')
        axes[1,0].set_ylabel('Sharpe Ratio')
        axes[1,0].set_title('Risk-Return Profile')
        axes[1,0].legend()
        axes[1,0].grid(True, alpha=0.3)

        # 4. Risk metrics profile: one line per portfolio across the four
        #    percentage metrics (a line plot on ordinary axes)
        metrics_to_plot = ['Volatility (%)', 'VaR 95% (%)', 'Max Drawdown (%)', 'Concentration (%)']

        for portfolio in risk_df['Portfolio']:
            portfolio_data = risk_df[risk_df['Portfolio'] == portfolio]
            values = []
            for metric in metrics_to_plot:
                values.append(portfolio_data[metric].iloc[0])
            axes[1,1].plot(metrics_to_plot, values, marker='o', label=portfolio, linewidth=2)

        axes[1,1].set_title('Risk Metrics Profile')
        axes[1,1].legend()
        axes[1,1].tick_params(axis='x', rotation=45)
        axes[1,1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Print risk assessment
        print("\n🎯 RISK ASSESSMENT SUMMARY")
        print("=" * 40)

        for _, row in risk_df.iterrows():
            portfolio = row['Portfolio']
            volatility = row['Volatility (%)']
            var = row['VaR 95% (%)']
            sharpe = row['Sharpe Ratio']

            # Volatility buckets: below 20 -> Low, below 40 -> Medium, else High
            if volatility < 20:
                risk_level = 'Low'
            elif volatility < 40:
                risk_level = 'Medium'
            else:
                risk_level = 'High'

            # Sharpe buckets: above 1 -> Excellent, above 0.5 -> Good, else Poor
            if sharpe > 1:
                return_quality = 'Excellent'
            elif sharpe > 0.5:
                return_quality = 'Good'
            else:
                return_quality = 'Poor'

            print(f"\n{portfolio} Portfolio:")
            print(f"  Risk Level: {risk_level} (Vol: {volatility:.1f}%)")
            print(f"  Return Quality: {return_quality} (Sharpe: {sharpe:.2f})")
            print(f"  Potential Daily Loss: {var:.1f}% (95% confidence)")

## 5. Real-Time Forecast Accuracy

Test the accuracy of recent forecasts against actual market movements.

In [None]:
# Test recent forecast accuracy
async def test_recent_forecasts():
    """Average the recent forecast accuracy metrics for bitcoin and ethereum.

    For each of the two symbols, ``forecasting_service.get_forecast_accuracy``
    is awaited with an argument of 30. Every dict-valued entry of the
    response contributes its ``'mae'`` and ``'mape'`` values (when present)
    to per-symbol averages, which are printed and collected.

    Returns:
        list[dict]: one entry per symbol that yielded both MAE and MAPE
        values, with the keys ``Symbol``, ``Forecasts_Analyzed``,
        ``Average_MAE`` and ``Average_MAPE``. Symbols with an error, an empty
        response, no usable metrics, or a raised exception are omitted.
    """
    print("🎯 Testing recent forecast accuracy...\n")

    collected = []

    for symbol in ('bitcoin', 'ethereum'):  # Focus on crypto for real-time testing
        try:
            print(f"📊 Analyzing recent forecasts for {symbol}")

            # Get forecast accuracy for last 30 days
            accuracy_result = await forecasting_service.get_forecast_accuracy(symbol, 30)

            if 'error' in accuracy_result or not accuracy_result:
                print(f"  ⚠️ No recent forecasts available or error: {accuracy_result.get('error', 'Unknown')}")
            else:
                # Only dict-valued entries carry per-forecast metrics
                per_forecast = [
                    entry for entry in accuracy_result.values()
                    if isinstance(entry, dict)
                ]
                mae_values = [entry['mae'] for entry in per_forecast if 'mae' in entry]
                mape_values = [entry['mape'] for entry in per_forecast if 'mape' in entry]

                if not (mae_values and mape_values):
                    print(f"  ⚠️ No valid accuracy metrics found")
                else:
                    mean_mae = np.mean(mae_values)
                    mean_mape = np.mean(mape_values)

                    collected.append({
                        'Symbol': symbol,
                        'Forecasts_Analyzed': len(mae_values),
                        'Average_MAE': mean_mae,
                        'Average_MAPE': mean_mape,
                    })

                    print(f"  ✅ {len(mae_values)} recent forecasts analyzed")
                    print(f"     Average MAE: {mean_mae:.2f}")
                    print(f"     Average MAPE: {mean_mape:.1f}%")

        except Exception as err:
            print(f"  ❌ Error analyzing {symbol}: {err}")

        print()

    return collected

# Test recent accuracy
recent_results = await test_recent_forecasts()

if recent_results:
    recent_df = pd.DataFrame(recent_results)
    print("📈 RECENT FORECAST ACCURACY SUMMARY")
    print("=" * 45)
    print(recent_df.to_string(index=False, float_format='%.2f'))
    
    # Overall assessment
    if len(recent_results) > 0:
        avg_mape = np.mean([r['Average_MAPE'] for r in recent_results])
        total_forecasts = sum([r['Forecasts_Analyzed'] for r in recent_results])
        
        print(f"\n🎯 Overall Recent Performance:")
        print(f"   Total Forecasts: {total_forecasts}")
        print(f"   Average MAPE: {avg_mape:.1f}%")
        
        if avg_mape < 10:
            print(f"   🏆 Excellent forecast accuracy!")
        elif avg_mape < 20:
            print(f"   ✅ Good forecast accuracy")
        else:
            print(f"   ⚠️ Forecast accuracy needs improvement")
else:
    print("ℹ️ No recent forecast data available for accuracy testing")

print("\n✅ Recent forecast accuracy testing completed!")

## 6. Comprehensive Summary and Recommendations

Generate final assessment and recommendations based on all backtesting results.

In [None]:
# Generate comprehensive summary: collect one-line assessments from the
# forecast, risk and recent-accuracy results, then print them in order.
print("📋 COMPREHENSIVE BACKTESTING SUMMARY")
print("=" * 50)

# Overall system assessment
print("\n🎯 SYSTEM PERFORMANCE ASSESSMENT")
print("-" * 35)

assessments = []

# Forecast accuracy assessment
if summary_df.empty:
    assessments.append("❌ Forecast accuracy assessment unavailable")
else:
    avg_mape = summary_df['MAPE (%)'].mean()
    avg_directional = summary_df['Directional Accuracy'].mean()

    # Letter grade from the average MAPE: <10 A, <15 B, <25 C, otherwise D
    if avg_mape < 10:
        forecast_grade = 'A'
    elif avg_mape < 15:
        forecast_grade = 'B'
    elif avg_mape < 25:
        forecast_grade = 'C'
    else:
        forecast_grade = 'D'
    assessments.append(f"📈 Forecast Accuracy: Grade {forecast_grade} (MAPE: {avg_mape:.1f}%, Dir: {avg_directional:.1%})")

    assessments.append(
        "✅ Strong trend prediction capability"
        if avg_directional > 0.6
        else "⚠️ Trend prediction needs improvement"
    )

# Risk calculation assessment
if not risk_results:
    assessments.append("❌ Risk calculation assessment unavailable")
else:
    risk_calculated = sum(1 for r in risk_results.values() if r)
    assessments.append(f"🛡️ Risk Calculations: {risk_calculated}/{len(risk_results)} portfolios successfully analyzed")

    # Check risk differentiation: the most volatile portfolio should exceed
    # the least volatile one by more than 50%
    volatilities = [r.get('annualized_volatility', 0) for r in risk_results.values() if r]
    if len(volatilities) <= 1 or max(volatilities) <= min(volatilities) * 1.5:
        assessments.append("⚠️ Risk differentiation may need calibration")
    else:
        assessments.append("✅ Risk metrics properly differentiate portfolio risk levels")

# Recent performance assessment
if not recent_results:
    assessments.append("ℹ️ Recent performance data not available")
else:
    recent_mape = np.mean([r['Average_MAPE'] for r in recent_results])
    assessments.append(f"🔄 Recent Performance: {recent_mape:.1f}% MAPE on latest forecasts")

    assessments.append(
        "✅ Current model performance is stable"
        if recent_mape < 15
        else "⚠️ Recent performance degradation detected"
    )

# Print assessments
for assessment in assessments:
    print(assessment)

# Recommendations: derive advice from the forecast summary and risk results,
# append the fixed operational items, then print everything as a numbered list
print("\n💡 RECOMMENDATIONS")
print("-" * 20)

recommendations = []

# Model-specific recommendations
if not summary_df.empty:
    # Models of the rows with the lowest and the highest MAPE
    best_model = summary_df.loc[summary_df['MAPE (%)'].idxmin(), 'Model']
    worst_model = summary_df.loc[summary_df['MAPE (%)'].idxmax(), 'Model']

    recommendations.append(f"🏆 Prioritize {best_model} model for production forecasts")

    if summary_df['MAPE (%)'].max() > 25:
        recommendations.append(f"🔧 Consider retuning or replacing {worst_model} model")

    # Data quality recommendations
    total_predictions = summary_df['Predictions'].sum()
    if total_predictions < 50:
        recommendations.append("📊 Increase historical data collection for more robust backtesting")

    # Ensemble recommendations (only meaningful with more than one model)
    if summary_df['Model'].unique().size > 1:
        recommendations.append("🔀 Consider ensemble forecasting to combine model strengths")

# Risk management recommendations
if risk_results:
    # Portfolios whose annualized volatility value exceeds 0.4
    high_vol_portfolios = []
    for name, metrics in risk_results.items():
        if metrics and metrics.get('annualized_volatility', 0) > 0.4:
            high_vol_portfolios.append(name)

    if high_vol_portfolios:
        recommendations.append(f"⚠️ Monitor high-volatility portfolios: {', '.join(high_vol_portfolios)}")

    recommendations.append("🛡️ Implement automated risk alerting based on VaR thresholds")

# Operational recommendations (always included)
recommendations += [
    "🔄 Run backtesting monthly to monitor model performance",
    "📈 Implement walk-forward validation for production forecasts",
    "🎯 Set MAPE targets: <10% excellent, <15% good, >20% needs improvement",
    "⏰ Consider real-time model performance monitoring",
]

# Print recommendations, numbered from 1
for i, rec in enumerate(recommendations, 1):
    print(f"{i:2d}. {rec}")

print("\n" + "=" * 50)
print("🎉 BACKTESTING ANALYSIS COMPLETE")
print("=" * 50)

# Save results summary: bundle the run configuration and every result
# collected by the earlier cells into one JSON document on disk.
print("\n💾 Saving results summary...")

# Create results summary
results_summary = {
    'backtest_date': datetime.now().isoformat(),
    'backtest_period': f"{BACKTEST_START.date()} to {BACKTEST_END.date()}",
    'symbols_tested': BACKTEST_SYMBOLS,
    'forecast_horizon': FORECAST_HORIZON,
    'forecast_performance': summary_df.to_dict('records') if not summary_df.empty else [],
    'risk_validation': risk_results,
    'recent_accuracy': recent_results,
    'assessments': assessments,
    'recommendations': recommendations
}

# Save to file
import json
import os

results_path = '../data/backtest_results.json'

# Create the output directory when it is missing; otherwise open() raises
# FileNotFoundError and the results of the whole run are lost.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, 'w') as f:
    # default=str: any value json cannot serialise natively is written as
    # its str() representation instead of raising TypeError
    json.dump(results_summary, f, indent=2, default=str)

print(f"✅ Results saved to {results_path}")
print("\n🚀 Treasury Risk Dashboard backtesting complete!")
# NOTE(review): this line is printed unconditionally, even when the
# assessments above report failures or missing data — confirm whether it
# should depend on the backtest outcome.
print("   Ready for production deployment with validated models.")