# Experiment Summary: Results Across All Experiments

This notebook compiles and visualizes results from all experiments (1-5) to:

1. Compare performance across experiments
2. Identify the best feature configurations
3. Analyze feature ablation impact relative to the Exp 1 baseline
4. Generate publication-ready summary tables

## Setup

In [None]:
# Mount Google Drive
# Colab-only step: makes the user's Drive available under /content/drive.
# NOTE(review): presumably the directories returned by ensure_dirs() live on
# this mount — confirm against etf_pipeline.utils.paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install dependencies
!pip install -q pandas numpy matplotlib seaborn

In [None]:
# Clone/update repository
# Makes sure a checkout of BRANCH exists at REPO_DIR and leaves the notebook's
# working directory inside it.
import os

REPO_URL = "https://github.com/mh122333/ETF-Dual-Foundation-Project-CC-Version.git"
REPO_DIR = "/content/ETF-Dual-Foundation-Project-CC-Version"
BRANCH = "claude/build-pipeline-sanity-exp-iVs65"

if os.path.exists(REPO_DIR):
    # Checkout already present: enter it, fetch, switch to BRANCH and pull its latest commits.
    %cd {REPO_DIR}
    !git fetch origin && git checkout {BRANCH} && git pull origin {BRANCH}
else:
    # No checkout yet: clone, enter the new directory, then switch to BRANCH.
    !git clone {REPO_URL} {REPO_DIR}
    %cd {REPO_DIR}
    !git checkout {BRANCH}

# NOTE: this echoes the configured BRANCH constant; it does not check that the
# git commands above actually succeeded.
print(f"\nOn branch: {BRANCH}")

In [None]:
# Put the repository's src directory first on sys.path before importing from it.
# NOTE(review): presumably this is where the etf_pipeline package lives; the
# path repeats the REPO_DIR value from the clone cell — keep the two in sync.
import sys
sys.path.insert(0, '/content/ETF-Dual-Foundation-Project-CC-Version/src')

from datetime import datetime
from pathlib import Path
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project helpers used by the cells below.
from etf_pipeline.utils.paths import ensure_dirs
from etf_pipeline.experiments.results import (
    collect_experiment_results,
    compare_experiments,
    create_results_summary,
    format_results_table,
    compute_feature_ablation_impact,
)

print("Imports successful!")

In [None]:
# Setup paths
# `paths` is indexed by name; this notebook only ever uses the "runs" entry,
# both as the location results are read from and where outputs are written.
# NOTE(review): the helper's name suggests it also creates missing
# directories — confirm in etf_pipeline.utils.paths.
paths = ensure_dirs()
runs_dir = paths["runs"]
print(f"Runs directory: {runs_dir}")

## 1. Collect All Results

In [None]:
# Load everything collect_experiment_results() finds for runs_dir into one DataFrame
results_df = collect_experiment_results(runs_dir)

if not results_df.empty:
    print(f"Collected {len(results_df)} result records")
    experiments_found = sorted(results_df['experiment_name'].unique())
    print(f"Experiments: {experiments_found}")
    symbols_found = sorted(results_df['symbol'].unique())
    print(f"Symbols: {symbols_found}")
else:
    # Nothing to summarize; the remaining cells check results_df.empty and skip themselves.
    print("No experiment results found!")
    print(f"Please run experiments 1-5 first. Looking in: {runs_dir}")

In [None]:
# Print the identifying columns and headline metrics of every collected record
if not results_df.empty:
    preferred_cols = (
        "experiment_name", "symbol", "run_id", "accuracy",
        "balanced_accuracy", "macro_f1", "cohen_kappa",
    )
    # Restrict to columns that are actually present so a missing metric does not raise
    display_cols = [col for col in preferred_cols if col in results_df.columns]
    results_text = results_df[display_cols].to_string(index=False)
    print(results_text)

## 2. Experiment Comparison

In [None]:
# Per-experiment comparison on balanced accuracy
if not results_df.empty:
    comparison = compare_experiments(
        results_df, metric="balanced_accuracy", group_by="experiment_name"
    )

    # Header, rule and table are emitted as three newline-separated pieces
    print(
        "\nExperiment Comparison (Balanced Accuracy):",
        "=" * 60,
        comparison.to_string(index=False),
        sep="\n",
    )

In [None]:
# Per-experiment comparison on macro F1
if not results_df.empty:
    comparison_f1 = compare_experiments(
        results_df, metric="macro_f1", group_by="experiment_name"
    )

    # Header, rule and table are emitted as three newline-separated pieces
    print(
        "\nExperiment Comparison (Macro F1):",
        "=" * 60,
        comparison_f1.to_string(index=False),
        sep="\n",
    )

In [None]:
# Overall summary: counts, per-metric statistics and the best run of each experiment
if not results_df.empty:
    summary = create_results_summary(results_df)

    print("\nOverall Summary:")
    print("=" * 60)
    headline_fields = (
        ("Total experiments", "n_experiments"),
        ("Total runs", "n_runs"),
        ("Symbols", "symbols"),
    )
    for label, key in headline_fields:
        print(f"{label}: {summary[key]}")

    # Both sections below are optional in the summary; a missing key prints only the heading
    print("\nMetrics Summary:")
    metric_stats = summary.get("metrics_summary", {})
    for metric_name in metric_stats:
        stats = metric_stats[metric_name]
        spread = f"range=[{stats['min']:.4f}, {stats['max']:.4f}]"
        print(f"  {metric_name}: mean={stats['mean']:.4f}, std={stats['std']:.4f}, {spread}")

    print("\nBest Runs by Experiment:")
    best_runs = summary.get("best_runs", {})
    for exp_name in best_runs:
        info = best_runs[exp_name]
        print(f"  {exp_name}: {info['run_id']} ({info['symbol']}) - BA: {info['balanced_accuracy']:.4f}")

## 3. Visualizations

In [None]:
# Bar chart of mean balanced accuracy per experiment, with std-dev error bars
if not results_df.empty and "balanced_accuracy" in results_df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))

    ba_by_experiment = results_df.groupby("experiment_name")["balanced_accuracy"]
    exp_means = ba_by_experiment.mean().sort_values(ascending=False)
    exp_stds = ba_by_experiment.std()
    positions = range(len(exp_means))

    # One shade of blue per bar, from lighter to darker
    colors = plt.cm.Blues(np.linspace(0.4, 0.8, len(exp_means)))

    bars = ax.bar(
        positions,
        exp_means.values,
        yerr=exp_stds[exp_means.index].values,  # align std devs with the sorted means
        capsize=5,
        color=colors,
        edgecolor='navy',
        linewidth=1.5,
    )

    ax.set_xticks(positions)
    ax.set_xticklabels(exp_means.index, rotation=45, ha='right')
    ax.set_ylabel('Balanced Accuracy')
    ax.set_title('Experiment Comparison: Balanced Accuracy')
    ax.set_ylim(0, 1)

    # Write each mean just above its bar
    for bar, val in zip(bars, exp_means.values):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 0.02
        ax.text(label_x, label_y, f'{val:.3f}', ha='center', va='bottom', fontsize=9)

    fig.tight_layout()
    chart_path = paths["runs"] / "experiment_comparison.png"
    fig.savefig(chart_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f"Saved to: {chart_path}")

In [None]:
# One boxplot panel per available metric, experiments ordered by their mean
if not results_df.empty:
    metrics = ["accuracy", "balanced_accuracy", "macro_f1"]
    available_metrics = [m for m in metrics if m in results_df.columns]

    if available_metrics:
        n_panels = len(available_metrics)
        # squeeze=False always yields a 2-D array of axes, so a single panel needs no special case
        fig, axes = plt.subplots(1, n_panels, figsize=(5*n_panels, 5), squeeze=False)

        for ax, metric in zip(axes[0], available_metrics):
            metric_means = results_df.groupby("experiment_name")[metric].mean()
            exp_order = metric_means.sort_values(ascending=False).index

            # NOTE(review): recent seaborn releases may warn when `palette` is
            # given without `hue` — confirm against the installed version.
            sns.boxplot(data=results_df, x="experiment_name", y=metric,
                        order=exp_order, ax=ax, palette="Blues_d")
            ax.set_title(metric.replace("_", " ").title())
            ax.set_xlabel("")
            ax.tick_params(axis='x', rotation=45)

        fig.tight_layout()
        fig.savefig(paths["runs"] / "metrics_comparison.png", dpi=150, bbox_inches='tight')
        plt.show()

## 4. Feature Ablation Impact

In [None]:
# Compute feature ablation impact relative to Exp 1 baseline.
#
# impact_df is always (re)assigned here, starting from an empty frame. Without
# this, a re-run in which the guard below is false would leave an impact_df
# from an earlier execution in the kernel, and the plotting cell would draw
# stale numbers.
impact_df = pd.DataFrame()

if not results_df.empty and "exp1" in results_df["experiment_name"].values:
    impact_df = compute_feature_ablation_impact(
        results_df,
        baseline_exp="exp1",
        metric="balanced_accuracy"
    )

    if not impact_df.empty:
        print("\nFeature Ablation Impact (vs Exp 1 Baseline):")
        print("=" * 70)
        print(impact_df.to_string(index=False))

        # Save
        impact_path = paths["runs"] / "feature_ablation_impact.csv"
        impact_df.to_csv(impact_path, index=False)
        print(f"\nSaved to: {impact_path}")
elif not results_df.empty:
    # Results exist but there is nothing to compare against; say so instead of skipping silently.
    print("No 'exp1' baseline results found; skipping feature ablation impact.")

In [None]:
# Horizontal bar chart of the mean relative change per experiment
# ('impact_df' in dir() guards against the ablation cell not having defined it)
if not results_df.empty and 'impact_df' in dir() and not impact_df.empty:
    fig, ax = plt.subplots(figsize=(10, 6))

    # Average the relative change within each experiment, largest first
    exp_impact = (
        impact_df.groupby("experiment")["relative_change_pct"]
        .mean()
        .sort_values(ascending=False)
    )
    rows = range(len(exp_impact))

    # Green for a gain over the baseline, red otherwise (zero counts as red)
    colors = ['green' if change > 0 else 'red' for change in exp_impact.values]

    ax.barh(rows, exp_impact.values, color=colors, alpha=0.7)
    ax.set_yticks(rows)
    ax.set_yticklabels(exp_impact.index)
    ax.set_xlabel('Relative Improvement vs Baseline (%)')
    ax.set_title('Impact of Feature Additions (vs Exp 1 Baseline)')
    ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)

    # Put each label just beyond the end of its bar, on the side the bar points to
    for row, change in enumerate(exp_impact.values):
        if change > 0:
            label_x, align = change + 0.5, 'left'
        else:
            label_x, align = change - 0.5, 'right'
        ax.text(label_x, row, f'{change:.1f}%', va='center', ha=align)

    fig.tight_layout()
    fig.savefig(paths["runs"] / "ablation_impact.png", dpi=150, bbox_inches='tight')
    plt.show()

## 5. Summary Tables

In [None]:
# Generate markdown summary table
if not results_df.empty:
    # Keep the latest run (highest run_id) for each (experiment, symbol) pair.
    #
    # GroupBy.last() is deliberately avoided: it returns the last *non-null*
    # value of every column independently, so a latest run with a missing
    # metric would be silently patched with a value from an older run.
    # drop_duplicates(keep="last") keeps whole rows instead.
    key_cols = ["experiment_name", "symbol"]
    latest_results = (
        results_df.sort_values("run_id")
        .drop_duplicates(subset=key_cols, keep="last")
        .set_index(key_cols)   # with reset_index below: key columns first,
        .sort_index()          # rows ordered by key, as the groupby produced
        .reset_index()
    )

    table_md = format_results_table(
        latest_results,
        metrics=["balanced_accuracy", "macro_f1", "cohen_kappa"],
        sort_by="balanced_accuracy"
    )

    print("\nMarkdown Results Table:")
    print("=" * 60)
    print(table_md)

    # Save (explicit encoding so the file does not depend on the platform default)
    table_path = paths["runs"] / "results_table.md"
    with open(table_path, "w", encoding="utf-8") as f:
        f.write("# Experiment Results Summary\n\n")
        f.write(f"Generated: {datetime.now().isoformat()}\n\n")
        f.write(table_md)
    print(f"\nSaved to: {table_path}")

In [None]:
# Human-readable description of what each experiment adds
experiment_descriptions = {
    "exp1": "Baseline: Causal features only (returns, volatility, ATR)",
    "exp2": "Baseline + Time-series forecast features (Chronos)",
    "exp3": "Baseline + Forecast + Context features (SPY/QQQ)",
    "exp4": "Full feature set with forecast error monitoring",
    "exp5": "Ablation studies across configurations",
}

# Print the heading, a rule, then one "name: description" line per experiment
description_lines = [
    f"{exp_name}: {text}" for exp_name, text in experiment_descriptions.items()
]
print("\nExperiment Descriptions:")
print("=" * 60)
print("\n".join(description_lines))

In [None]:
# Final summary report: best single run, then the mean balanced accuracy of
# exp1..exp4 with the change from the previous experiment.
if not results_df.empty:
    print("\n" + "=" * 70)
    print("FINAL EXPERIMENT SUMMARY")
    print("=" * 70)

    # Best overall.
    # idxmax() returns an index *label*. If results_df carries repeated labels
    # (e.g. frames concatenated without re-indexing), .loc[label] would return
    # several rows and the scalar formatting below would fail. Dropping the
    # index on the score column turns idxmax() into a row position for .iloc.
    ba_scores = results_df["balanced_accuracy"].reset_index(drop=True)
    best = results_df.iloc[ba_scores.idxmax()]

    print("\nBest Overall Configuration:")
    print(f"  Experiment: {best['experiment_name']}")
    print(f"  Symbol: {best['symbol']}")
    print(f"  Run ID: {best['run_id']}")
    print(f"  Balanced Accuracy: {best['balanced_accuracy']:.4f}")
    if 'macro_f1' in best:
        print(f"  Macro F1: {best['macro_f1']:.4f}")

    # Progressive improvement (exp5 is left out of this sequence)
    print("\nProgressive Improvement Across Experiments:")
    exp_order = ["exp1", "exp2", "exp3", "exp4"]
    prev_ba = None
    for exp in exp_order:
        exp_results = results_df[results_df["experiment_name"] == exp]
        if exp_results.empty:
            # Experiment not run; the next one is compared with the last one that was
            continue
        ba = exp_results["balanced_accuracy"].mean()
        if prev_ba is None:
            print(f"  {exp}: {ba:.4f} (baseline)")
        else:
            diff = ba - prev_ba
            print(f"  {exp}: {ba:.4f} (change: {diff:+.4f})")
        prev_ba = ba

    print("\n" + "=" * 70)

## 6. Export Results

In [None]:
# Save all results to CSV and a compact JSON summary
def _to_native(value):
    """Return NumPy scalars as plain Python objects; json cannot encode e.g. np.int64."""
    return value.item() if isinstance(value, np.generic) else value


if not results_df.empty:
    results_path = paths["runs"] / "all_experiment_results.csv"
    results_df.to_csv(results_path, index=False)
    print(f"All results saved to: {results_path}")

    # Recompute the best row here instead of relying on a `best` variable left
    # behind by the final-summary cell, so this cell also works when run on
    # its own. Selection is positional, so repeated index labels are harmless.
    ba_scores = results_df["balanced_accuracy"].reset_index(drop=True)
    best_row = results_df.iloc[ba_scores.idxmax()]
    experiment_means = results_df.groupby("experiment_name")["balanced_accuracy"].mean()

    # Save summary JSON
    summary_path = paths["runs"] / "experiment_summary.json"
    summary_data = {
        "generated_at": datetime.now().isoformat(),
        "n_experiments": int(results_df["experiment_name"].nunique()),
        "n_runs": len(results_df),
        "experiments": sorted(results_df["experiment_name"].unique().tolist()),
        "symbols": sorted(results_df["symbol"].unique().tolist()),
        "best_config": {
            "experiment": _to_native(best_row["experiment_name"]),
            "symbol": _to_native(best_row["symbol"]),
            "run_id": _to_native(best_row["run_id"]),
            "balanced_accuracy": float(best_row["balanced_accuracy"]),
        },
        "experiment_means": {name: float(mean) for name, mean in experiment_means.items()},
    }

    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary_data, f, indent=2)
    print(f"Summary saved to: {summary_path}")

---

**Experiment Summary Complete!**

All results have been compiled, visualized, and saved to Google Drive.