# Server Model Evaluation Pipeline for 4xA100 GPUs
Complete evaluation of all server models on the moral alignment dataset.

## 1. Setup and Configuration

In [None]:
# Install required packages if needed
!pip install -q torch transformers accelerate bitsandbytes vllm datasets huggingface-hub
!pip install -q pandas numpy tqdm loguru sqlalchemy jsonlines
!pip install -q matplotlib seaborn plotly kaleido scipy scikit-learn

In [None]:
import os
import sys
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import time
import gc
from tqdm.auto import tqdm
import logging

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from sklearn.metrics import confusion_matrix, classification_report

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Add server directory to path so the pipeline's own modules can be imported.
# The membership check keeps sys.path free of duplicate entries when this
# cell is re-run.
_PIPELINE_ROOT = '/data/storage_4_tb/moral-alignment-pipeline'
if _PIPELINE_ROOT not in sys.path:
    sys.path.append(_PIPELINE_ROOT)

# Import our modules (resolved through the path entry added above)
from server_model_runner import ServerModelRunner
from download_models import ModelDownloader

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Filesystem layout: every working directory hangs off one pipeline root.
BASE_DIR = Path("/data/storage_4_tb/moral-alignment-pipeline")
DATA_DIR = BASE_DIR.joinpath("data")
MODELS_DIR = BASE_DIR.joinpath("models")
OUTPUT_DIR = BASE_DIR.joinpath("outputs")
RESULTS_DIR = OUTPUT_DIR.joinpath("server_results")

# Make sure each directory exists; re-running the cell is harmless.
for dir_path in (DATA_DIR, MODELS_DIR, OUTPUT_DIR, RESULTS_DIR):
    dir_path.mkdir(parents=True, exist_ok=True)

# Echo the locations that matter most for the rest of the notebook.
print(
    f"Base directory: {BASE_DIR}",
    f"Models directory: {MODELS_DIR}",
    f"Results directory: {RESULTS_DIR}",
    sep="\n",
)

## 2. Check GPU Status

In [None]:
# Check available GPUs.
# n_gpus is always bound (0 when CUDA is unavailable) so that later cells can
# read it regardless of which branch ran here.
n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0

if n_gpus > 0:
    print(f"Number of GPUs: {n_gpus}")

    # Sum the per-device memory while listing each device.
    total_memory = 0
    for i in range(n_gpus):
        props = torch.cuda.get_device_properties(i)
        memory_gb = props.total_memory / (1024**3)  # bytes -> GiB
        total_memory += memory_gb
        print(f"GPU {i}: {props.name} - {memory_gb:.1f}GB")

    print(f"\nTotal GPU Memory: {total_memory:.1f}GB")
else:
    print("No GPUs available!")
    print("This notebook requires GPUs to run large models")

## 3. Download Models (if needed)

In [None]:
# Create the downloader rooted at the pipeline directory.
downloader = ModelDownloader(base_dir=str(BASE_DIR))

# Show the downloader's own status report.
status_report = downloader.get_status_report()
print(status_report)

In [None]:
# Download priority models (CRITICAL and HIGH priority)
# Uncomment to download
# results = downloader.download_priority_models(min_priority="HIGH")
# print(f"Downloaded: {results['success']} models")
# print(f"Failed: {results['failed']} models")

## 4. Load Test Dataset

In [None]:
# ================================================================
# CRITICAL: USE EXACT SAME SAMPLES AS LOCAL/API EVALUATION
# ================================================================

# Import the exact sample loader
from load_exact_samples import load_exact_samples

# Load the EXACT same samples as local/API evaluation
print("🎯 Loading EXACT samples (same as local/API evaluation)")
samples = load_exact_samples()

# Fail early with a clear message instead of an IndexError on samples[0].
if not samples:
    raise ValueError("load_exact_samples() returned no samples; nothing to evaluate")

# The evaluation cells read 'id' and 'prompt' from every sample, so check
# them up front rather than failing midway through a long model run.
_missing_fields = [
    idx for idx, sample in enumerate(samples)
    if 'id' not in sample or 'prompt' not in sample
]
if _missing_fields:
    raise ValueError(
        f"{len(_missing_fields)} samples lack 'id' or 'prompt' "
        f"(first offending index: {_missing_fields[0]})"
    )

print(f"✅ Loaded {len(samples)} EXACT samples")
print(f"📊 Sample format: {list(samples[0].keys())}")
print(f"🔍 First sample:")
print(f"   ID: {samples[0]['id']}")
print(f"   Question: {samples[0]['question']}")
print(f"   Country: {samples[0]['country']}")
print(f"   Human Response: {samples[0]['human_response']}")
print(f"   Prompt: {samples[0]['prompt'][:100]}...")

# ================================================================
# VERIFICATION: report only what this cell actually checked
# ================================================================
n_unique_ids = len({sample['id'] for sample in samples})

print(f"\n✅ VERIFICATION:")
print(f"   Total samples: {len(samples)}")
print(f"   Unique sample IDs: {n_unique_ids}")
print(f"   Every sample has 'id' and 'prompt': YES")
# Provenance cannot be checked from inside this notebook, so say so instead
# of printing an unconditional "YES".
print("   Sample provenance (same set as local/API runs, real WVS data) is "
      "determined by load_exact_samples() and is not re-checked here")

## 5. Initialize Model Runner

In [None]:
# Build the runner used to load and query models on this server.
runner = ServerModelRunner(
    base_dir=str(BASE_DIR),
    use_vllm=True,  # Use VLLM for faster inference
    tensor_parallel_size=4,  # Use all 4 GPUs
)

# Ask the runner which models it can see, and preview at most ten of them.
available_models = runner.get_available_models()
print(f"\nAvailable models on disk: {len(available_models)}")
preview = available_models[:10]
if preview:
    print("\n".join(f"  - {entry}" for entry in preview))

In [None]:
# Ask the runner for its recommendations for a 4-GPU machine.
recommendations = runner.get_recommended_models(max_gpus=4)

print("RECOMMENDED MODEL EVALUATION ORDER:")
print("=" * 50)

# One row per GPU tier:
# (heading, key in `recommendations`, candidates to consider, accepted priorities).
# Tiers are walked smallest first, which fixes the evaluation order.
gpu_tiers = (
    ("\n1. Single GPU Models (run in parallel):", '1_gpu', 8, ('CRITICAL', 'HIGH')),
    ("\n2. Dual GPU Models:", '2_gpu', 5, ('CRITICAL', 'HIGH')),
    ("\n3. Quad GPU Models:", '4_gpu', 3, ('CRITICAL', 'HIGH', 'MEDIUM')),
)

# Priority order for evaluation
evaluation_order = []

for heading, tier_key, limit, accepted_priorities in gpu_tiers:
    print(heading)
    for candidate in recommendations[tier_key][:limit]:
        if candidate['priority'] not in accepted_priorities:
            continue
        print(f"  - {candidate['name']} ({candidate['size_gb']}GB)")
        evaluation_order.append(candidate['name'])

print(f"\nTotal models to evaluate: {len(evaluation_order)}")

## 6. Run Evaluation

In [None]:
# Evaluation settings
MAX_SAMPLES = 1000  # Cap for test runs (use len(samples) for the full set)
BATCH_SIZE = 100  # Number of samples handled per batch

# Evaluate only the leading MAX_SAMPLES samples
eval_samples = samples[0:MAX_SAMPLES]
print(f"Evaluating {len(eval_samples)} samples")

In [None]:
# Function to run evaluation for a single model
def evaluate_model(model_name, samples, runner):
    """Evaluate a single model on all samples.

    Loads ``model_name`` through ``runner``, generates one response per
    sample, prints summary statistics and writes the per-sample results to a
    JSON file in ``RESULTS_DIR``. The model is always unloaded afterwards.

    Args:
        model_name: Name handed to ``runner.load_model``; also stored in
            every result and used to build the output file name.
        samples: Sequence of dicts providing at least ``'id'`` and
            ``'prompt'``. May be empty.
        runner: Object exposing ``load_model(name)``, ``generate(prompt)``
            (returning a mutable dict) and ``unload_model()``.

    Returns:
        A list with one result dict per sample, each tagged with
        ``'sample_id'`` and ``'model'``. If loading, generation or saving
        raises, the exception is not propagated; a single-element list
        holding ``{'model', 'error', 'success': False}`` is returned instead.
    """
    print(f"\n{'='*60}")
    print(f"Evaluating: {model_name}")
    print(f"{'='*60}")

    results = []
    start_time = time.time()

    try:
        # Load model
        runner.load_model(model_name)

        # BATCH_SIZE only sets the progress-bar granularity; generation is
        # still issued one prompt at a time.
        for i in tqdm(range(0, len(samples), BATCH_SIZE), desc=model_name):
            for sample in samples[i:i+BATCH_SIZE]:
                result = runner.generate(sample['prompt'])
                result['sample_id'] = sample['id']
                result['model'] = model_name
                results.append(result)

        # Calculate statistics
        total_time = time.time() - start_time
        successful = sum(1 for r in results if r.get('success', False))

        print(f"\nCompleted {model_name}:")
        print(f"  Total samples: {len(results)}")
        print(f"  Successful: {successful}")
        print(f"  Failed: {len(results) - successful}")
        print(f"  Total time: {total_time:.1f}s")
        if results:
            # Guarded so an empty sample set does not divide by zero.
            print(f"  Avg time/sample: {total_time/len(results):.2f}s")

        # Save results. A path separator inside the model name would make the
        # file name point into a non-existent subdirectory, so replace it.
        safe_name = model_name.replace('/', '__').replace('\\', '__')
        output_file = RESULTS_DIR / f"{safe_name}_results.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"  Saved to: {output_file}")

    except Exception as e:
        # Best-effort boundary: one failing model must not stop the run, so
        # report it through the return value instead of raising.
        print(f"ERROR evaluating {model_name}: {e}")
        results = [{
            'model': model_name,
            'error': str(e),
            'success': False
        }]

    finally:
        # Always unload model to free memory. Collect garbage before emptying
        # the CUDA cache so memory held by unreachable objects is released too.
        runner.unload_model()
        gc.collect()
        torch.cuda.empty_cache()

    return results

In [None]:
# Run evaluation for all models
all_results = []
failed_models = []
skipped_models = []    # requested but not present on disk
evaluated_models = []  # ran to completion

print(f"Starting evaluation of {len(evaluation_order)} models")
print("=" * 60)

for model_name in evaluation_order:
    # Skip if model not available on disk
    if model_name not in available_models:
        print(f"\nSkipping {model_name} - not downloaded yet")
        skipped_models.append(model_name)
        continue

    try:
        results = evaluate_model(model_name, eval_samples, runner)
    except Exception as e:
        print(f"Failed to evaluate {model_name}: {e}")
        failed_models.append(model_name)
        continue

    all_results.extend(results)

    # evaluate_model does not raise on failure: it returns a model-level
    # error record, which has an 'error' key and no 'sample_id'.
    if any('error' in r and 'sample_id' not in r for r in results):
        failed_models.append(model_name)
    else:
        evaluated_models.append(model_name)

print("\n" + "=" * 60)
print("EVALUATION COMPLETE")
print(f"Models evaluated: {len(evaluated_models)}")
print(f"Models skipped (not downloaded): {len(skipped_models)}")
print(f"Models failed: {len(failed_models)}")
if failed_models:
    print(f"Failed models: {failed_models}")

## 7. Analyze Results

In [None]:
# Load all results.
# Every *_results.json file in RESULTS_DIR is read, including files left
# there by earlier runs. Files are visited in sorted order so the row order
# of the combined data is reproducible.
combined_results = []

for result_file in sorted(RESULTS_DIR.glob("*_results.json")):
    try:
        with open(result_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except json.JSONDecodeError as e:
        # A truncated file (e.g. from an interrupted run) should not abort
        # the analysis, but it must not be dropped silently either.
        print(f"⚠️ Skipping unreadable results file {result_file}: {e}")
        continue
    combined_results.extend(results)

print(f"Total results loaded: {len(combined_results)}")

# Convert to DataFrame for analysis
df_results = pd.DataFrame(combined_results)

In [None]:
# COMPREHENSIVE DATA ANALYSIS
print("🔍 ANALYZING SERVER MODEL RESULTS")
print("=" * 60)

# Enhanced data processing
if len(combined_results) > 0:
    df_results = pd.DataFrame(combined_results)

    # Extract moral choices and scores.
    # NOTE(review): extract_moral_choice / extract_moral_score are not defined
    # in the visible part of this notebook; they must be defined or imported
    # by an earlier cell - TODO confirm.
    df_results['choice'] = df_results['response'].apply(extract_moral_choice)
    df_results['moral_score'] = df_results['response'].apply(extract_moral_score)

    print(f"Total results: {len(df_results)}")
    print(f"Models evaluated: {df_results['model'].nunique()}")
    print(f"Unique samples: {df_results['sample_id'].nunique()}")

    # Model performance summary.
    # Each choice rate is the mean of a boolean indicator column, so every
    # aggregation yields one scalar per model and the result has flat,
    # string-named columns.
    choice_flags = df_results.assign(
        is_acceptable=df_results['choice'] == 'acceptable',
        is_unacceptable=df_results['choice'] == 'unacceptable',
        is_unknown=df_results['choice'] == 'unknown',
    )
    model_stats = choice_flags.groupby('model').agg(
        success_rate=('success', 'mean'),
        n_results=('success', 'count'),
        mean_inference_time=('inference_time', 'mean'),
        acceptable_rate=('is_acceptable', 'mean'),
        unacceptable_rate=('is_unacceptable', 'mean'),
        unknown_rate=('is_unknown', 'mean'),
    ).round(4)

    print("\n📊 MODEL PERFORMANCE SUMMARY:")
    print("=" * 40)
    display(model_stats)

else:
    print("⚠️ No results found for analysis")
    df_results = pd.DataFrame()

In [None]:
# GENERATE ALL VISUALIZATIONS
def _export_figure(fig, html_name, png_name, **image_size):
    """Write `fig` to OUTPUT_DIR as HTML and PNG, then display it."""
    fig.write_html(str(OUTPUT_DIR / html_name))
    fig.write_image(str(OUTPUT_DIR / png_name), **image_size)
    fig.show()


if len(df_results) > 0:
    print("📈 GENERATING VISUALIZATIONS")
    print("=" * 40)

    # 1. Model Performance Plot (always produced)
    print("Creating model performance visualization...")
    perf_fig, perf_stats = create_model_performance_plot(df_results)
    _export_figure(perf_fig, "model_performance.html", "model_performance.png",
                   width=1200, height=800)

    # 2. Moral Question Analysis (needs the 'question' column)
    print("Creating moral question analysis...")
    if 'question' in df_results.columns:
        moral_fig, moral_analysis = create_moral_question_analysis(df_results)
        if moral_fig is not None:
            _export_figure(moral_fig, "moral_questions_heatmap.html",
                           "moral_questions_heatmap.png", width=1000, height=600)

    # 3. Human-Model Agreement Analysis (needs the 'human_response' column)
    print("Creating human-model comparison...")
    if 'human_response' in df_results.columns:
        human_fig, agreement_stats = create_comparison_with_humans(df_results)
        if human_fig is not None:
            _export_figure(human_fig, "human_model_agreement.html",
                           "human_model_agreement.png", width=800, height=500)

    # 4. Response Distribution Analysis
    print("Creating response distribution plots...")

    # One pie chart per model, laid out on a grid at most three columns wide
    models = df_results['model'].unique()
    n_models = len(models)
    n_cols = min(3, n_models)
    n_rows = (n_models + n_cols - 1) // n_cols  # ceiling division

    pie_row_spec = [{"type": "pie"}] * n_cols
    fig_dist = make_subplots(
        rows=n_rows, cols=n_cols,
        specs=[pie_row_spec for _ in range(n_rows)],
        subplot_titles=[f"{model}" for model in models]
    )

    for position, model in enumerate(models):
        # Grid cells are 1-based and filled row by row
        grid_row, grid_col = divmod(position, n_cols)

        choice_counts = df_results.loc[df_results['model'] == model, 'choice'].value_counts()
        pie = go.Pie(
            labels=choice_counts.index,
            values=choice_counts.values,
            name=model,
            showlegend=(position == 0),
        )
        fig_dist.add_trace(pie, row=grid_row + 1, col=grid_col + 1)

    fig_dist.update_layout(height=300 * n_rows, title_text="Response Distribution by Model")
    _export_figure(fig_dist, "response_distributions.html", "response_distributions.png",
                   width=1200, height=300*n_rows)

    print("✅ All visualizations saved to:", OUTPUT_DIR)

else:
    print("⚠️ No data available for visualization")

## 8. Save Final Results

In [None]:
# Save combined results
def _stats_to_jsonable(stats):
    """Return `stats` (a DataFrame) as a nested dict JSON can serialise.

    Multi-level column labels are joined with '_' so that every key is a
    string; the JSON round trip turns numpy scalars into plain Python
    numbers and NaN into null.
    """
    flat = stats.copy()
    flat.columns = [
        "_".join(str(part) for part in col) if isinstance(col, tuple) else str(col)
        for col in flat.columns
    ]
    return json.loads(flat.to_json())


timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
final_output = OUTPUT_DIR / f"server_evaluation_{timestamp}.json"

# Cells that define n_gpus / model_stats may not have run (or may have taken
# a branch that leaves them unset), so fall back to neutral values.
_model_stats = globals().get('model_stats')

final_data = {
    'metadata': {
        'timestamp': timestamp,
        'n_samples': len(eval_samples),
        # df_results is an empty, column-less frame when no results were found
        'n_models': int(df_results['model'].nunique()) if 'model' in df_results.columns else 0,
        'total_results': len(combined_results),
        'gpu_count': globals().get('n_gpus', 0),
        'base_dir': str(BASE_DIR)
    },
    'model_stats': _stats_to_jsonable(_model_stats) if _model_stats is not None else {},
    'results': combined_results
}

with open(final_output, 'w', encoding='utf-8') as f:
    json.dump(final_data, f, indent=2)

print(f"Final results saved to: {final_output}")
print(f"File size: {final_output.stat().st_size / (1024*1024):.1f} MB")

In [None]:
# COMPREHENSIVE STATISTICAL ANALYSIS
if len(df_results) > 0:
    print("📊 STATISTICAL ANALYSIS")
    print("=" * 40)

    # 1. Inter-model Agreement Analysis
    if df_results['model'].nunique() > 1:
        print("Calculating inter-model agreement...")

        models = [m for m in df_results['model'].unique() if pd.notna(m)]

        # One row per sample, one column per model. Pivoting aligns the
        # models' choices by sample_id, so the comparison below does not
        # depend on the order in which each model processed the samples.
        # If a (sample, model) pair occurs more than once, the first
        # occurrence is used.
        choice_table = (
            df_results.dropna(subset=['sample_id', 'model'])
            .drop_duplicates(subset=['sample_id', 'model'])
            .pivot(index='sample_id', columns='model', values='choice')
            .reindex(columns=models)
        )

        # Create model comparison matrix (NaN where two models share no sample)
        agreement_matrix = pd.DataFrame(np.nan, index=models, columns=models)

        for model1 in models:
            for model2 in models:
                if model1 == model2:
                    agreement_matrix.loc[model1, model2] = 1.0
                    continue

                # Keep only samples for which both models have a choice
                paired = choice_table[[model1, model2]].dropna()
                if len(paired) > 0:
                    agreement = (paired[model1] == paired[model2]).mean()
                    agreement_matrix.loc[model1, model2] = agreement

        # Convert to numeric
        agreement_matrix = agreement_matrix.astype(float)

        # Visualize inter-model agreement
        fig_agreement = go.Figure(data=go.Heatmap(
            z=agreement_matrix.values,
            x=agreement_matrix.columns,
            y=agreement_matrix.index,
            colorscale='RdYlGn',
            text=np.round(agreement_matrix.values, 3),
            texttemplate="%{text}",
            textfont={"size": 12}
        ))

        fig_agreement.update_layout(
            title='Inter-Model Agreement Matrix',
            xaxis_title='Model',
            yaxis_title='Model',
            height=500
        )

        fig_agreement.write_html(str(OUTPUT_DIR / "inter_model_agreement.html"))
        fig_agreement.write_image(str(OUTPUT_DIR / "inter_model_agreement.png"))
        fig_agreement.show()

    # 2. Response Time Analysis
    if 'inference_time' in df_results.columns:
        print("Analyzing inference times...")

        fig_time = go.Figure()

        # One box per model
        for model in df_results['model'].unique():
            model_times = df_results[df_results['model'] == model]['inference_time']
            fig_time.add_trace(go.Box(y=model_times, name=model))

        fig_time.update_layout(
            title='Inference Time Distribution by Model',
            yaxis_title='Inference Time (seconds)',
            xaxis_title='Model'
        )

        fig_time.write_html(str(OUTPUT_DIR / "inference_times.html"))
        fig_time.write_image(str(OUTPUT_DIR / "inference_times.png"))
        fig_time.show()

    # 3. Sample Difficulty Analysis
    if 'question' in df_results.columns:
        print("Analyzing question difficulty...")

        # Calculate "difficulty" as the proportion of models that find something unacceptable
        question_difficulty = df_results.groupby(['question', 'sample_id']).agg({
            'choice': lambda x: (x == 'unacceptable').mean(),
            'model': 'count'
        }).reset_index()

        question_difficulty = question_difficulty[question_difficulty['model'] >= 2]  # At least 2 models

        # Average the per-sample rates within each question, highest first
        difficulty_by_q = question_difficulty.groupby('question')['choice'].mean().sort_values(ascending=False)

        fig_diff = go.Figure([
            go.Bar(x=difficulty_by_q.index, y=difficulty_by_q.values)
        ])

        fig_diff.update_layout(
            title='Question "Difficulty" (Proportion Rated Unacceptable)',
            xaxis_title='Question',
            yaxis_title='Average Unacceptable Rate',
            xaxis_tickangle=45
        )

        fig_diff.write_html(str(OUTPUT_DIR / "question_difficulty.html"))
        fig_diff.write_image(str(OUTPUT_DIR / "question_difficulty.png"))
        fig_diff.show()

    print("✅ Statistical analysis completed")

else:
    print("⚠️ No data available for statistical analysis")

In [None]:
# GENERATE COMPREHENSIVE REPORT
if len(df_results) > 0:
    import html  # stdlib; escapes data-derived text embedded in the HTML report

    print("📄 GENERATING COMPREHENSIVE REPORT")
    print("=" * 50)

    # Detailed analysis
    total_evaluations = len(df_results)
    total_models = df_results['model'].nunique()
    total_samples = df_results['sample_id'].nunique()
    success_rate = df_results['success'].mean()

    # Performance metrics. Timing may be absent from the results, so fall
    # back to an empty series (mean -> NaN, sum -> 0) instead of a KeyError.
    has_timing = 'inference_time' in df_results.columns
    inference_times = df_results['inference_time'] if has_timing else pd.Series(dtype=float)
    avg_inference_time = inference_times.mean()
    total_inference_time = inference_times.sum()

    # Moral choice analysis
    choice_distribution = df_results['choice'].value_counts(normalize=True)

    # Generate detailed HTML report. Fragments are collected in a list and
    # joined once at the end. Sample counts come from the data rather than
    # being hard-coded, so the report stays correct for partial runs.
    html_parts = [f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Server Model Evaluation Report</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 40px; }}
            .header {{ background-color: #f0f8ff; padding: 20px; border-radius: 10px; }}
            .section {{ margin: 20px 0; }}
            .metric {{ background-color: #f9f9f9; padding: 10px; margin: 5px 0; border-left: 4px solid #007acc; }}
            .model-stats {{ background-color: #fff8dc; padding: 15px; border-radius: 5px; }}
            table {{ border-collapse: collapse; width: 100%; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; }}
        </style>
    </head>
    <body>
        <div class="header">
            <h1>🖥️ Server Model Evaluation Report</h1>
            <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            <p><strong>Server:</strong> 4x A100 GPUs</p>
            <p><strong>Dataset:</strong> {total_samples:,} samples from the same sample set as the Local/API evaluation</p>
        </div>
        
        <div class="section">
            <h2>📊 Executive Summary</h2>
            <div class="metric"><strong>Total Evaluations:</strong> {total_evaluations:,}</div>
            <div class="metric"><strong>Models Evaluated:</strong> {total_models}</div>
            <div class="metric"><strong>Unique Samples:</strong> {total_samples:,}</div>
            <div class="metric"><strong>Overall Success Rate:</strong> {success_rate:.2%}</div>
            <div class="metric"><strong>Average Inference Time:</strong> {avg_inference_time:.2f} seconds</div>
            <div class="metric"><strong>Total Processing Time:</strong> {total_inference_time/3600:.1f} hours</div>
        </div>
        
        <div class="section">
            <h2>🎯 Moral Choice Distribution</h2>
            <div class="model-stats">
    """]

    # str() keeps non-string choice labels from breaking .title()
    for choice, percentage in choice_distribution.items():
        label = html.escape(str(choice).title())
        html_parts.append(
            f'<div class="metric"><strong>{label}:</strong> {percentage:.1%}</div>\n'
        )

    html_parts.append("""
            </div>
        </div>
        
        <div class="section">
            <h2>🔍 Model Performance Details</h2>
            <table>
                <tr>
                    <th>Model</th>
                    <th>Total Evaluations</th>
                    <th>Success Rate</th>
                    <th>Avg Inference Time (s)</th>
                    <th>Acceptable Rate</th>
                    <th>Unacceptable Rate</th>
                </tr>
    """)

    # Add model details
    for model in df_results['model'].unique():
        model_data = df_results[df_results['model'] == model]
        model_success = model_data['success'].mean()
        model_time = model_data['inference_time'].mean() if has_timing else float('nan')
        model_acceptable = (model_data['choice'] == 'acceptable').mean()
        model_unacceptable = (model_data['choice'] == 'unacceptable').mean()

        html_parts.append(f"""
                <tr>
                    <td>{html.escape(str(model))}</td>
                    <td>{len(model_data):,}</td>
                    <td>{model_success:.1%}</td>
                    <td>{model_time:.2f}</td>
                    <td>{model_acceptable:.1%}</td>
                    <td>{model_unacceptable:.1%}</td>
                </tr>
        """)

    html_parts.append(f"""
            </table>
        </div>
        
        <div class="section">
            <h2>📈 Generated Outputs</h2>
            <ul>
                <li><strong>Interactive Plots:</strong> model_performance.html, moral_questions_heatmap.html</li>
                <li><strong>Static Images:</strong> PNG versions of all plots</li>
                <li><strong>Raw Data:</strong> server_evaluation_{timestamp}.json</li>
                <li><strong>Individual Results:</strong> {html.escape(str(RESULTS_DIR))}</li>
            </ul>
        </div>
        
        <div class="section">
            <h2>🔗 Comparison with Other Approaches</h2>
            <div class="model-stats">
                <div class="metric"><strong>Server Models:</strong> {total_models} models evaluated</div>
                <div class="metric"><strong>Local Models:</strong> 6 Ollama models (same samples)</div>
                <div class="metric"><strong>API Models:</strong> 11 OpenAI models (same samples)</div>
                <div class="metric"><strong>Dataset Consistency:</strong> ✅ All approaches draw from the same sample set ({total_samples:,} samples evaluated here)</div>
            </div>
        </div>
        
        <div class="section">
            <p><em>This report provides a comprehensive analysis of server model performance on the moral alignment evaluation task. 
            All visualizations and detailed data files are available in the outputs directory.</em></p>
        </div>
    </body>
    </html>
    """)

    html_report = "".join(html_parts)

    # Save HTML report
    report_file = OUTPUT_DIR / f"evaluation_report_{timestamp}.html"
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(html_report)

    print(f"✅ Comprehensive HTML report saved to: {report_file}")

    # Also create a simple text summary
    summary_parts = [f"""
SERVER MODEL EVALUATION SUMMARY
{'='*60}
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

OVERVIEW:
- Total Evaluations: {total_evaluations:,}
- Models Evaluated: {total_models}
- Unique Samples: {total_samples:,}
- Overall Success Rate: {success_rate:.2%}
- Average Inference Time: {avg_inference_time:.2f}s
- Total Processing Time: {total_inference_time/3600:.1f} hours

MORAL CHOICES:
"""]

    summary_parts.extend(
        f"- {str(choice).title()}: {percentage:.1%}\n"
        for choice, percentage in choice_distribution.items()
    )

    summary_parts.append(f"""
GENERATED FILES:
- HTML Report: evaluation_report_{timestamp}.html
- Interactive Plots: *.html files in outputs/
- Static Images: *.png files in outputs/
- Raw Data: server_evaluation_{timestamp}.json
- Individual Results: {RESULTS_DIR}/

DATASET CONSISTENCY:
✅ {total_samples:,} samples evaluated, drawn from the sample set shared by all approaches (Server, Local, API)
✅ Real World Values Survey data with 64 countries, 13 moral questions
✅ Perfect comparison capability with other evaluation approaches
""")

    text_summary = "".join(summary_parts)

    # The summary contains non-ASCII characters, so the encoding must be
    # explicit; the platform default may not be able to represent them.
    summary_file = OUTPUT_DIR / f"evaluation_summary_{timestamp}.txt"
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write(text_summary)

    print(f"✅ Text summary saved to: {summary_file}")
    print(f"✅ All outputs saved to: {OUTPUT_DIR}")

    # Display final summary
    print("\n" + "="*60)
    print("🎉 EVALUATION COMPLETE!")
    print("="*60)
    print(f"📊 Processed {total_evaluations:,} evaluations from {total_models} models")
    print(f"⏱️  Total time: {total_inference_time/3600:.1f} hours")
    print(f"📈 Success rate: {success_rate:.1%}")
    print(f"📁 All results, plots, and reports saved to: {OUTPUT_DIR}")
    print(f"🌐 View report: {report_file}")

else:
    print("⚠️ No results available for report generation")

## 8.1 Comprehensive Analysis & Visualizations

## 9. Cleanup (Optional)

In [None]:
# Clear GPU memory
if torch.cuda.is_available():
    # Collect garbage first: memory still referenced by unreachable Python
    # objects only becomes releasable once they have been collected.
    gc.collect()
    torch.cuda.empty_cache()
    print("GPU memory cleared")

# Check final GPU memory usage (the loop body never runs without a CUDA device)
for i in range(torch.cuda.device_count()):
    mem_alloc = torch.cuda.memory_allocated(i) / (1024**3)  # bytes -> GiB
    mem_reserved = torch.cuda.memory_reserved(i) / (1024**3)
    print(f"GPU {i}: Allocated={mem_alloc:.1f}GB, Reserved={mem_reserved:.1f}GB")