In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from pathlib import Path
import warnings
# Silence every warning category for the rest of the kernel session. Note that
# this also hides deprecation notices raised by any library imported above.
warnings.filterwarnings('ignore')

# Set style for matplotlib
plt.style.use('seaborn-v0_8')
# Default seaborn colour cycle for any seaborn/matplotlib figures drawn later.
sns.set_palette("husl")

In [None]:
# Pipeline Results Folder Selector
import ipywidgets as widgets
from IPython.display import display, clear_output
from pathlib import Path

def find_pipeline_results_folders(base_path="output"):
    """Collect pipeline run folders that contain an evaluation results file.

    Scans the immediate children of ``base_path`` for directories whose name
    starts with ``pipeline_results_``. A folder is kept only when it holds a
    ``llm_evaluation_results.json`` file; otherwise a warning is printed and
    the folder is skipped.

    Args:
        base_path: Directory to scan (string or path-like). Defaults to
            ``"output"``.

    Returns:
        A list of dicts with the keys ``name``, ``path`` and ``eval_path``
        (the latter two as strings), sorted by folder name in descending
        order. An empty list is returned (and a message printed) when
        ``base_path`` does not exist.
    """
    root = Path(base_path)
    if not root.exists():
        print(f"Output directory '{root}' not found!")
        return []

    found = []
    for entry in root.iterdir():
        is_run_folder = entry.is_dir() and entry.name.startswith("pipeline_results_")
        if not is_run_folder:
            continue

        evaluation_file = entry / "llm_evaluation_results.json"
        if not evaluation_file.exists():
            print(f"Warning: No evaluation file found in {entry.name}")
            continue

        found.append({
            'name': entry.name,
            'path': str(entry),
            'eval_path': str(evaluation_file),
        })

    # Descending name order puts the most recent run first
    # (presumably the folder names embed a sortable timestamp).
    found.sort(key=lambda record: record['name'], reverse=True)
    return found

# Find available pipeline results
available_folders = find_pipeline_results_folders()

# Always (re)bind BOTH selection globals before anything else, so that
# re-running this cell when no run folder exists can never leave a stale
# `results_path` from an earlier execution for the cells below to pick up.
selected_pipeline_folder = None
results_path = None

if not available_folders:
    print("❌ No pipeline_results folders found in the output directory!")
    print("Please run the evaluation pipeline first.")
else:
    print(f"✅ Found {len(available_folders)} pipeline_results folders:")
    for i, folder in enumerate(available_folders[:5]):  # Show first 5
        print(f"  {i+1}. {folder['name']}")

    # Dropdown options are (label, value) pairs; the value is the whole
    # folder record so the callback can read both 'path' and 'eval_path'.
    folder_options = [(folder['name'], folder) for folder in available_folders]

    # Pre-select the most recent folder (the list is sorted newest first)
    folder_selector = widgets.Dropdown(
        options=folder_options,
        value=available_folders[0],
        description='Pipeline Run:',
        style={'description_width': '120px'},
        layout=widgets.Layout(width='600px')
    )

    # Set button
    set_button = widgets.Button(
        description='📁 Set Pipeline Folder',
        button_style='success',
        icon='check',
        layout=widgets.Layout(width='200px')
    )

    output_area = widgets.Output()

    def set_pipeline_folder(button):
        """Button callback: publish the dropdown selection as notebook globals.

        Writes ``selected_pipeline_folder`` and ``results_path`` and reports
        the choice inside ``output_area``. ``button`` is the clicked widget
        and is not used.
        """
        global selected_pipeline_folder, results_path
        with output_area:
            clear_output()
            selected_folder = folder_selector.value

            if selected_folder:
                selected_pipeline_folder = selected_folder['path']
                results_path = selected_folder['eval_path']

                print(f"✅ Selected pipeline folder: {selected_folder['name']}")
                print(f"📊 Results file: {results_path}")
                print(f"📁 Full path: {selected_pipeline_folder}")
                print("\n🚀 You can now run the visualization cells below!")
            else:
                print("❌ No folder selected!")

    set_button.on_click(set_pipeline_folder)

    print(f"\n📋 Select a pipeline results folder:")
    display(widgets.HBox([folder_selector, set_button]))
    display(output_area)

    # Set default: the list is non-empty in this branch, so the most recent
    # run is used until the button is clicked (keeps "Run All" working
    # without any widget interaction).
    selected_pipeline_folder = available_folders[0]['path']
    results_path = available_folders[0]['eval_path']
    print(f"\n💡 Default: Using most recent run - {available_folders[0]['name']}")

In [None]:
def load_evaluation_results(results_path):
    """Read an evaluation results file and return its parsed JSON content.

    Args:
        results_path: Location of the UTF-8 encoded JSON file.

    Returns:
        The deserialized JSON value stored in the file.
    """
    with open(results_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def create_dataframe_from_results(results):
    """Flatten nested evaluation results into one row per evaluation mode.

    ``results`` is read as ``{document: {model: {annotation_type: metrics}}}``
    where ``metrics`` holds the sub-dicts ``'lenient'`` and ``'strict'`` (each
    with ``precision``, ``recall``, ``f1_score``, ``true_positives``,
    ``false_positives``, ``false_negatives``) plus ``'gold_count'`` and
    ``'predicted_count'``.

    Args:
        results: Nested mapping as described above.

    Returns:
        pandas.DataFrame with two rows (Lenient first, then Strict) per
        document/model/annotation-type combination. The column set is fixed,
        so an empty ``results`` yields an empty frame that still has every
        column and downstream column lookups do not raise ``KeyError``.

    Raises:
        KeyError: If a metrics entry lacks one of the expected keys.
    """
    columns = [
        'Document', 'Model', 'Annotation_Type', 'Evaluation_Mode',
        'Precision', 'Recall', 'F1_Score',
        'True_Positives', 'False_Positives', 'False_Negatives',
        'Gold_Count', 'Predicted_Count',
    ]
    # (label shown in the frame, key inside the metrics dict)
    evaluation_modes = (('Lenient', 'lenient'), ('Strict', 'strict'))

    rows = []
    for doc_name, doc_results in results.items():
        for model_name, model_results in doc_results.items():
            for ann_type, metrics in model_results.items():
                for mode_label, mode_key in evaluation_modes:
                    scores = metrics[mode_key]
                    rows.append({
                        'Document': doc_name,
                        'Model': model_name,
                        'Annotation_Type': ann_type,
                        'Evaluation_Mode': mode_label,
                        'Precision': scores['precision'],
                        'Recall': scores['recall'],
                        'F1_Score': scores['f1_score'],
                        'True_Positives': scores['true_positives'],
                        'False_Positives': scores['false_positives'],
                        'False_Negatives': scores['false_negatives'],
                        # Counts are mode-independent and repeated on both rows.
                        'Gold_Count': metrics['gold_count'],
                        'Predicted_Count': metrics['predicted_count'],
                    })

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

def visualize_single_run(results_path):
    """Build the eight-panel Plotly dashboard for a single evaluation run.

    Loads the evaluation JSON at ``results_path``, flattens it with
    ``create_dataframe_from_results`` and draws a 4x2 subplot grid:

    1. average lenient F1 per model and annotation type (bars)
    2. average lenient F1 per document and model (heatmap)
    3. lenient precision vs. recall, one point per document/annotation type
    4. average lenient F1 per document, one bar series per model
    5. average F1 per model, lenient vs. strict (bars)
    6. lenient F1 distribution per annotation type (box plots)
    7. average gold vs. average predicted annotation count per type (bars)
    8. per-model consistency, computed as 1 - coefficient of variation of the
       per-document mean F1

    Args:
        results_path: Path to the evaluation results JSON file. The name of
            its parent folder is shown in the figure title.

    Returns:
        The assembled plotly figure (the caller decides when to show it).
    """
    # Load and prepare data
    results = load_evaluation_results(results_path)
    df = create_dataframe_from_results(results)

    fig = make_subplots(
        rows=4, cols=2,
        subplot_titles=(
            'F1-Scores by Model and Annotation Type (Lenient)',
            'Document-Level Performance Heatmap',
            'Precision vs Recall by Model (with Document Points)',
            'Model Performance Across Documents',
            'Lenient vs Strict Evaluation Comparison',
            'Individual Document Analysis',
            'Annotation Type Performance Distribution',
            'Model Consistency Across Documents'
        ),
        specs=[[{"type": "bar"}, {"type": "heatmap"}],
               [{"type": "scatter"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "box"}],
               [{"type": "bar"}, {"type": "scatter"}]],
        vertical_spacing=0.08,
        horizontal_spacing=0.12
    )

    # Dynamic color palette for all available models
    color_palette = [
        '#E74C3C',   # Red
        '#3498DB',   # Blue
        '#2ECC71',   # Green
        '#F39C12',   # Orange
        '#9B59B6',   # Purple
        '#1ABC9C',   # Teal
        '#F1C40F',   # Yellow
        '#E67E22',   # Dark Orange
        '#95A5A6',   # Gray
        '#34495E',   # Dark Blue Gray
        '#16A085',   # Dark Teal
        '#8E44AD',   # Dark Purple
        '#C0392B',   # Dark Red
        '#2980B9',   # Dark Blue
        '#27AE60',   # Dark Green
        '#D35400'    # Dark Orange Red
    ]
    fallback_color = '#95A5A6'

    # Colours follow order of first appearance and wrap around once there are
    # more models than palette entries.
    model_colors = {
        model: color_palette[position % len(color_palette)]
        for position, model in enumerate(df['Model'].unique())
    }

    # Every panel except the lenient/strict comparison uses lenient scores.
    df_lenient = df[df['Evaluation_Mode'] == 'Lenient']
    models = df_lenient['Model'].unique()

    # 1. F1-Scores by Model and Annotation Type (Lenient)
    for model in models:
        model_data = df_lenient[df_lenient['Model'] == model]
        avg_f1_by_type = model_data.groupby('Annotation_Type')['F1_Score'].mean().reset_index()

        fig.add_trace(
            go.Bar(
                x=avg_f1_by_type['Annotation_Type'],
                y=avg_f1_by_type['F1_Score'],
                name=f"{model}",
                marker_color=model_colors.get(model, fallback_color),
                showlegend=True,
                legendgroup='models',
                hovertemplate=f'<b>{model}</b><br>Type: %{{x}}<br>Avg F1: %{{y:.3f}}<extra></extra>'
            ),
            row=1, col=1
        )

    # 2. Document-Level Performance Heatmap
    doc_model_performance = df_lenient.groupby(['Document', 'Model'])['F1_Score'].mean().reset_index()
    heatmap_pivot = doc_model_performance.pivot(index='Document', columns='Model', values='F1_Score')

    fig.add_trace(
        go.Heatmap(
            z=heatmap_pivot.values,
            x=heatmap_pivot.columns,
            y=heatmap_pivot.index,
            colorscale='RdYlBu_r',
            showscale=True,
            colorbar=dict(title="F1 Score", x=0.48),
            hovertemplate='Document: %{y}<br>Model: %{x}<br>F1 Score: %{z:.3f}<extra></extra>',
            showlegend=False
        ),
        row=1, col=2
    )

    # 3. Precision vs Recall scatter plot with document points
    for model in models:
        model_data = df_lenient[df_lenient['Model'] == model]
        fig.add_trace(
            go.Scatter(
                x=model_data['Recall'],
                y=model_data['Precision'],
                mode='markers',
                name=f"{model} (docs)",
                marker=dict(
                    size=8,
                    color=model_colors.get(model, fallback_color),
                    symbol='circle',
                    opacity=0.7
                ),
                text=model_data['Document'] + '<br>' + model_data['Annotation_Type'],
                hovertemplate='<b>%{text}</b><br>Recall: %{x:.3f}<br>Precision: %{y:.3f}<extra></extra>',
                showlegend=True,
                legendgroup='scatter'
            ),
            row=2, col=1
        )

    # 4. Model Performance Across Documents (one bar series per model)
    for model in models:
        model_data = df_lenient[df_lenient['Model'] == model]
        doc_performance = model_data.groupby('Document')['F1_Score'].mean().reset_index()

        fig.add_trace(
            go.Bar(
                x=doc_performance['Document'],
                y=doc_performance['F1_Score'],
                name=f"{model}",
                marker_color=model_colors.get(model, fallback_color),
                showlegend=False,
                opacity=0.8,
                hovertemplate=f'<b>{model}</b><br>Document: %{{x}}<br>Avg F1: %{{y:.3f}}<extra></extra>'
            ),
            row=2, col=2
        )

    # 5. Lenient vs Strict comparison
    df_comparison = df.groupby(['Model', 'Evaluation_Mode'])['F1_Score'].mean().reset_index()
    lenient_data = df_comparison[df_comparison['Evaluation_Mode'] == 'Lenient']
    strict_data = df_comparison[df_comparison['Evaluation_Mode'] == 'Strict']

    fig.add_trace(
        go.Bar(
            x=lenient_data['Model'],
            y=lenient_data['F1_Score'],
            name='Lenient Evaluation',
            marker_color='lightblue',
            showlegend=True,
            legendgroup='evaluation_modes',
            hovertemplate='<b>Lenient</b><br>Model: %{x}<br>F1: %{y:.3f}<extra></extra>'
        ),
        row=3, col=1
    )

    fig.add_trace(
        go.Bar(
            x=strict_data['Model'],
            y=strict_data['F1_Score'],
            name='Strict Evaluation',
            marker_color='darkblue',
            showlegend=True,
            legendgroup='evaluation_modes',
            hovertemplate='<b>Strict</b><br>Model: %{x}<br>F1: %{y:.3f}<extra></extra>'
        ),
        row=3, col=1
    )

    # 6. Individual Document Analysis (box plots showing variance)
    box_colors = px.colors.qualitative.Set1
    for position, annotation_type in enumerate(df_lenient['Annotation_Type'].unique()):
        ann_data = df_lenient[df_lenient['Annotation_Type'] == annotation_type]

        fig.add_trace(
            go.Box(
                y=ann_data['F1_Score'],
                name=annotation_type,
                boxpoints='all',
                jitter=0.3,
                pointpos=-1.8,
                marker_color=box_colors[position % len(box_colors)],
                showlegend=True,
                legendgroup='annotation_types',
                hovertemplate=f'<b>{annotation_type}</b><br>F1: %{{y:.3f}}<extra></extra>'
            ),
            row=3, col=2
        )

    # 7. Annotation Type Performance Distribution
    # Both counts are averaged over the same document x model rows. Taking
    # only the first row's gold count (as before) compared one arbitrary
    # document against an average over all of them.
    df_counts = df_lenient.groupby('Annotation_Type').agg({
        'Gold_Count': 'mean',
        'Predicted_Count': 'mean'
    }).reset_index()

    fig.add_trace(
        go.Bar(
            x=df_counts['Annotation_Type'],
            y=df_counts['Gold_Count'],
            name='Avg Gold Standard Count',
            marker_color='gold',
            showlegend=True,
            legendgroup='counts',
            hovertemplate='<b>Gold Standard</b><br>Type: %{x}<br>Avg Count: %{y:.1f}<extra></extra>'
        ),
        row=4, col=1
    )

    # No explicit `yaxis` here: add_trace(row=..., col=...) binds the trace to
    # that subplot's axes, so a hand-set axis id would be overwritten anyway.
    fig.add_trace(
        go.Bar(
            x=df_counts['Annotation_Type'],
            y=df_counts['Predicted_Count'],
            name='Avg Predicted Count',
            marker_color='silver',
            showlegend=True,
            legendgroup='counts',
            hovertemplate='<b>Predicted</b><br>Type: %{x}<br>Avg Count: %{y:.1f}<extra></extra>'
        ),
        row=4, col=1
    )

    # 8. Model Consistency Across Documents (Coefficient of Variation)
    model_consistency = []
    for model in models:
        model_data = df_lenient[df_lenient['Model'] == model]
        doc_scores = model_data.groupby('Document')['F1_Score'].mean()
        mean_score = doc_scores.mean()
        # Guard against dividing by a zero mean. With a single document the
        # sample std is NaN, so the point is simply not drawn.
        cv = doc_scores.std() / mean_score if mean_score > 0 else 0
        model_consistency.append({'Model': model, 'Consistency': 1 - cv, 'CV': cv})

    consistency_df = pd.DataFrame(model_consistency)

    fig.add_trace(
        go.Scatter(
            x=consistency_df['Model'],
            y=consistency_df['Consistency'],
            mode='markers+lines',
            name='Model Consistency',
            marker=dict(
                size=12,
                color=[model_colors.get(model, fallback_color) for model in consistency_df['Model']],
                symbol='diamond'
            ),
            line=dict(color='gray', dash='dash'),
            showlegend=True,
            legendgroup='consistency',
            hovertemplate='<b>%{x}</b><br>Consistency: %{y:.3f}<br>CV: %{customdata:.3f}<extra></extra>',
            customdata=consistency_df['CV']
        ),
        row=4, col=2
    )

    # Update layout with improved legend and styling
    fig.update_layout(
        height=1600,  # Increased height for 4 rows
        title=f'Enhanced LLM Evaluation Dashboard<br><sub>Pipeline Results: {Path(results_path).parent.name} | Document-Level Analysis</sub>',
        title_x=0.5,
        showlegend=True,
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02,
            bgcolor="rgba(255,255,255,0.8)",
            bordercolor="rgba(0,0,0,0.2)",
            borderwidth=1,
            font=dict(size=10)
        ),
        font=dict(size=11)
    )

    # Axis titles keyed by (row, col): (x title, y title)
    axis_titles = {
        (1, 1): ("Annotation Type", "Average F1 Score"),
        (1, 2): ("Model", "Document"),
        (2, 1): ("Recall", "Precision"),
        (2, 2): ("Document", "F1 Score"),
        (3, 1): ("Model", "F1 Score"),
        (3, 2): ("Annotation Type", "F1 Score Distribution"),
        (4, 1): ("Annotation Type", "Count"),
        (4, 2): ("Model", "Consistency Score"),
    }
    for (row, col), (x_title, y_title) in axis_titles.items():
        fig.update_xaxes(title_text=x_title, row=row, col=col, title_font_size=10)
        fig.update_yaxes(title_text=y_title, row=row, col=col, title_font_size=10)

    # Document names are long; tilt them on the per-document bar panel.
    fig.update_xaxes(tickangle=45, row=2, col=2)

    return fig

def create_document_comparison_plot(df_lenient):
    """Draw one line per model showing its mean F1 on every document.

    Args:
        df_lenient: Frame with at least the columns ``Document``, ``Model``
            and ``F1_Score``; the caller passes the lenient-mode rows.

    Returns:
        A plotly figure with one ``lines+markers`` trace per model. A document
        a model has no rows for is plotted at 0.
    """
    palette = [
        '#E74C3C',   # Red
        '#3498DB',   # Blue
        '#2ECC71',   # Green
        '#F39C12',   # Orange
        '#9B59B6',   # Purple
        '#1ABC9C',   # Teal
        '#F1C40F',   # Yellow
        '#E67E22',   # Dark Orange
        '#95A5A6',   # Gray
        '#34495E',   # Dark Blue Gray
        '#16A085',   # Dark Teal
        '#8E44AD',   # Dark Purple
        '#C0392B',   # Dark Red
        '#2980B9',   # Dark Blue
        '#27AE60',   # Dark Green
        '#D35400'    # Dark Orange Red
    ]

    document_names = df_lenient['Document'].unique()
    model_names = df_lenient['Model'].unique()

    # Palette entries are handed out in order of appearance and reused
    # cyclically when there are more models than colours.
    line_color = {
        name: palette[position % len(palette)]
        for position, name in enumerate(model_names)
    }

    doc_fig = go.Figure()
    for name in model_names:
        rows_for_model = df_lenient[df_lenient['Model'] == name]
        mean_f1 = rows_for_model.groupby('Document')['F1_Score'].mean()
        y_values = [
            mean_f1[doc] if doc in mean_f1.index else 0
            for doc in document_names
        ]
        trace = go.Scatter(
            x=document_names,
            y=y_values,
            mode='lines+markers',
            name=name,
            line=dict(color=line_color.get(name, '#95A5A6'), width=3),
            marker=dict(size=8),
            hovertemplate=f'<b>{name}</b><br>Document: %{{x}}<br>Avg F1: %{{y:.3f}}<extra></extra>'
        )
        doc_fig.add_trace(trace)

    legend_above_plot = dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
    doc_fig.update_layout(
        title='Model Performance Across Individual Documents',
        xaxis_title='Document',
        yaxis_title='Average F1 Score',
        height=400,
        hovermode='x unified',
        legend=legend_above_plot
    )
    doc_fig.update_xaxes(tickangle=45)

    return doc_fig

# Enhanced example usage with document-level analysis
# Use the selected pipeline folder from above
# `results_path` is a notebook global set by the folder-selector cell; the
# globals() check keeps this cell from raising NameError when that cell has
# not been run yet.
if 'results_path' in globals() and results_path and Path(results_path).exists():
    # Create main dashboard
    fig = visualize_single_run(results_path)
    fig.show()
    
    # Create additional document comparison plot
    # (visualize_single_run only returns the figure, so the results are loaded
    # and flattened a second time here for the plot and summary below)
    results = load_evaluation_results(results_path)
    df = create_dataframe_from_results(results)
    df_lenient = df[df['Evaluation_Mode'] == 'Lenient']
    
    doc_fig = create_document_comparison_plot(df_lenient)
    doc_fig.show()
    
    # Print summary statistics
    print("📊 EVALUATION SUMMARY:")
    print("=" * 50)
    
    # Model rankings
    # mean/std are taken over all lenient rows of a model, then sorted best first
    model_rankings = df_lenient.groupby('Model')['F1_Score'].agg(['mean', 'std']).round(3)
    model_rankings = model_rankings.sort_values('mean', ascending=False)
    print("\n🏆 MODEL RANKINGS (by average F1):")
    for i, (model, stats) in enumerate(model_rankings.iterrows(), 1):
        print(f"{i}. {model}: {stats['mean']:.3f} (±{stats['std']:.3f})")
    
    # Best performing annotation types
    ann_performance = df_lenient.groupby('Annotation_Type')['F1_Score'].agg(['mean', 'std']).round(3)
    ann_performance = ann_performance.sort_values('mean', ascending=False)
    print("\n📋 ANNOTATION TYPE PERFORMANCE:")
    for ann_type, stats in ann_performance.iterrows():
        print(f"• {ann_type}: {stats['mean']:.3f} (±{stats['std']:.3f})")
    
    # Document difficulty analysis
    # Sorted ascending by mean F1, so the first row is the lowest-scoring
    # document and the last row the highest-scoring one.
    doc_difficulty = df_lenient.groupby('Document')['F1_Score'].agg(['mean', 'std']).round(3)
    doc_difficulty = doc_difficulty.sort_values('mean')
    print(f"\n📄 DOCUMENT ANALYSIS:")
    print(f"Most challenging: {doc_difficulty.index[0]} (avg F1: {doc_difficulty.iloc[0]['mean']:.3f})")
    print(f"Easiest: {doc_difficulty.index[-1]} (avg F1: {doc_difficulty.iloc[-1]['mean']:.3f})")
    
    print("\n✅ Enhanced single run visualization created successfully!")
else:
    # Either nothing was selected or the selected file no longer exists;
    # explain which and list the result files that are available.
    print(f"❌ Results file not found or not selected!")
    if 'results_path' not in globals() or not results_path:
        print("Please run the pipeline folder selector at the top first.")
    else:
        print(f"File path: {results_path}")
    print("\nAvailable pipeline results:")
    output_dir = Path("output")
    if output_dir.exists():
        for folder in sorted(output_dir.glob("pipeline_results_*")):
            results_file = folder / "llm_evaluation_results.json"
            if results_file.exists():
                print(f"  - {results_file}")

In [None]:
def load_pipeline_timing_data(pipeline_folder):
    """Read the two JSON files of a pipeline run folder.

    The run summary is expected in ``<folder>/<folder name>.json`` and the
    evaluation results in ``<folder>/llm_evaluation_results.json``.

    Args:
        pipeline_folder: Run folder as a string or path-like object.

    Returns:
        dict with the parsed run summary under ``'pipeline_info'`` and the
        parsed evaluation results under ``'evaluation_results'``.
    """
    run_dir = Path(pipeline_folder)

    # Output key -> file name inside the run folder; the summary file is read first.
    sources = (
        ('pipeline_info', f"{run_dir.name}.json"),
        ('evaluation_results', "llm_evaluation_results.json"),
    )

    loaded = {}
    for key, file_name in sources:
        with open(run_dir / file_name, 'r', encoding='utf-8') as handle:
            loaded[key] = json.load(handle)
    return loaded

def _parse_duration_seconds(value):
    """Convert a ``str(timedelta)``-style duration into seconds.

    Accepts ``"H:MM:SS[.ffffff]"`` and the ``"N day(s), H:MM:SS[.ffffff]"``
    form that ``str(timedelta)`` produces for durations of 24 hours or more.
    Anything else (non-string values, unparseable text) yields ``0.0``,
    matching the previous fallback for unrecognised values.
    """
    if not isinstance(value, str):
        return 0.0

    text = value.strip()
    days = 0
    if ',' in text:
        day_part, text = text.split(',', 1)
        try:
            days = int(day_part.split()[0])
        except (ValueError, IndexError):
            return 0.0

    parts = text.strip().split(':')
    if len(parts) != 3:
        return 0.0
    try:
        hours = int(parts[0])
        minutes = int(parts[1])
        seconds = float(parts[2])
    except ValueError:
        return 0.0

    return days * 86400 + hours * 3600 + minutes * 60 + seconds


def calculate_runtime_metrics(timing_data, eval_df):
    """Combine the run's total processing time with the lenient evaluation scores.

    The pipeline summary only records one total duration, so the time is
    spread evenly over every model x document combination; all rows therefore
    share the same ``processing_time``.

    Args:
        timing_data: dict with a ``'pipeline_info'`` entry that provides
            ``'models_used'`` (list of model names) and, optionally,
            ``'total_processing_time'`` as a ``str(timedelta)``-style string.
        eval_df: Flat evaluation frame with the columns ``Evaluation_Mode``,
            ``Document``, ``Model``, ``F1_Score``, ``Precision``, ``Recall``,
            ``True_Positives``, ``False_Positives`` and ``False_Negatives``.

    Returns:
        dict with the keys ``merged_data`` (one row per document/model with
        the mean scores, the timing columns and the derived
        ``efficiency_score``, ``annotations_per_second`` and
        ``time_per_annotation``), ``total_pipeline_time`` (seconds),
        ``pipeline_info``, ``models_used``, ``documents_processed`` and
        ``avg_time_per_doc_per_model``.

    Raises:
        KeyError: If ``'pipeline_info'`` or ``'models_used'`` is missing.
    """
    pipeline_info = timing_data['pipeline_info']

    # Total wall-clock time of the run, in seconds.
    total_pipeline_time = _parse_duration_seconds(
        pipeline_info.get('total_processing_time', '0:00:00')
    )

    # Only lenient scores are used for the runtime analysis.
    eval_lenient = eval_df[eval_df['Evaluation_Mode'] == 'Lenient'].copy()

    # One row per (document, model): mean scores, summed match counts.
    eval_summary = eval_lenient.groupby(['Document', 'Model']).agg({
        'F1_Score': 'mean',
        'Precision': 'mean',
        'Recall': 'mean',
        'True_Positives': 'sum',
        'False_Positives': 'sum',
        'False_Negatives': 'sum'
    }).reset_index()

    # Individual model timings are not recorded, so the total time is
    # distributed evenly across all model-document combinations.
    models = pipeline_info['models_used']
    docs = eval_summary['Document'].unique()

    combinations = len(models) * len(docs)
    avg_time_per_doc_per_model = total_pipeline_time / combinations if combinations > 0 else 0

    timing_rows = []
    for doc in docs:
        for model in models:
            doc_eval = eval_summary[(eval_summary['Document'] == doc) & (eval_summary['Model'] == model)]
            if not doc_eval.empty:
                # Predicted annotations = true positives + false positives.
                annotation_count = doc_eval['True_Positives'].iloc[0] + doc_eval['False_Positives'].iloc[0]
            else:
                annotation_count = 1  # Default minimum

            timing_rows.append({
                'document': doc,
                'model': model,
                'processing_time': avg_time_per_doc_per_model,
                'annotation_count': max(annotation_count, 1),  # Ensure at least 1
                'actual': True
            })

    doc_timing = pd.DataFrame(timing_rows)

    # Left join keeps exactly the document/model pairs that were evaluated.
    merged_data = pd.merge(
        eval_summary,
        doc_timing,
        left_on=['Document', 'Model'],
        right_on=['document', 'model'],
        how='left'
    )

    # The small epsilon keeps the ratios finite when the recorded time is 0.
    merged_data['efficiency_score'] = merged_data['F1_Score'] / (merged_data['processing_time'] + 0.001)
    merged_data['annotations_per_second'] = merged_data['annotation_count'] / (merged_data['processing_time'] + 0.001)
    # annotation_count is already clamped to >= 1 above, so it can be used as
    # the divisor directly. The former "+ 1" understated the time per
    # annotation (by half for a single annotation).
    merged_data['time_per_annotation'] = merged_data['processing_time'] / merged_data['annotation_count']

    return {
        'merged_data': merged_data,
        'total_pipeline_time': total_pipeline_time,
        'pipeline_info': pipeline_info,
        'models_used': pipeline_info['models_used'],
        'documents_processed': len(docs),
        'avg_time_per_doc_per_model': avg_time_per_doc_per_model
    }

def create_runtime_dashboard(pipeline_folder):
    """Build the eight-panel runtime analysis dashboard for one pipeline run.

    Panels (4x2 grid): processing time per model and document, F1 vs. time,
    mean efficiency (F1/time) per model, annotations per second, time per
    annotation, a runtime/performance bubble chart, mean time per model and a
    document x model time heatmap.

    Args:
        pipeline_folder: Run folder as a string or ``Path``. It must contain
            the files read by ``load_pipeline_timing_data``.

    Returns:
        tuple: ``(fig, runtime_metrics)`` where ``fig`` is the plotly figure
        and ``runtime_metrics`` is the dict from ``calculate_runtime_metrics``.
    """
    # Normalise once: the folder selector stores paths as plain strings, and
    # joining a str with "/" would raise TypeError.
    pipeline_folder = Path(pipeline_folder)

    # Load data. The evaluation results are already part of the timing
    # bundle, so the JSON file is not read a second time.
    timing_data = load_pipeline_timing_data(pipeline_folder)
    eval_df = create_dataframe_from_results(timing_data['evaluation_results'])

    # Calculate runtime metrics
    runtime_metrics = calculate_runtime_metrics(timing_data, eval_df)
    merged_data = runtime_metrics['merged_data']

    # Create dashboard
    fig = make_subplots(
        rows=4, cols=2,
        subplot_titles=(
            'Processing Time by Model and Document',
            'F1 Score vs Processing Time (Performance vs Speed)',
            'Model Efficiency Score (F1/Time)',
            'Annotations per Second by Model',
            'Time per Annotation Distribution',
            'Runtime vs Performance Trade-off',
            'Model Speed Comparison',
            'Document Processing Complexity'
        ),
        specs=[[{"type": "bar"}, {"type": "scatter"}],
               [{"type": "bar"}, {"type": "box"}],
               [{"type": "violin"}, {"type": "scatter"}],
               [{"type": "bar"}, {"type": "heatmap"}]],
        vertical_spacing=0.08,
        horizontal_spacing=0.12
    )

    # Use dynamic colors based on available models (palette wraps around)
    models = merged_data['Model'].unique()
    color_palette = ['#E74C3C', '#3498DB', '#2ECC71', '#F39C12', '#9B59B6', '#1ABC9C',
                     '#F1C40F', '#E67E22', '#95A5A6', '#34495E', '#16A085', '#8E44AD']
    fallback_color = '#95A5A6'
    model_colors = {
        model: color_palette[position % len(color_palette)]
        for position, model in enumerate(models)
    }

    # 1. Processing Time by Model and Document
    for model in models:
        model_data = merged_data[merged_data['Model'] == model]
        fig.add_trace(
            go.Bar(
                x=model_data['Document'],
                y=model_data['processing_time'],
                name=f'{model}',
                marker_color=model_colors.get(model, fallback_color),
                showlegend=True,
                legendgroup='models',
                hovertemplate=f'<b>{model}</b><br>Document: %{{x}}<br>Time: %{{y:.2f}}s<extra></extra>'
            ),
            row=1, col=1
        )

    # 2. F1 Score vs Processing Time
    for model in models:
        model_data = merged_data[merged_data['Model'] == model]
        fig.add_trace(
            go.Scatter(
                x=model_data['processing_time'],
                y=model_data['F1_Score'],
                mode='markers',
                name=f'{model}',
                marker=dict(
                    size=12,
                    color=model_colors.get(model, fallback_color),
                    symbol='circle',
                    opacity=0.8
                ),
                text=model_data['Document'],
                showlegend=False,
                legendgroup='models',
                hovertemplate=f'<b>{model}</b><br>Time: %{{x:.2f}}s<br>F1: %{{y:.3f}}<br>Doc: %{{text}}<extra></extra>'
            ),
            row=1, col=2
        )

    # 3. Model Efficiency Score (ascending so the best model ends up on top
    #    of the horizontal bar chart)
    efficiency_data = merged_data.groupby('Model')['efficiency_score'].mean().reset_index()
    efficiency_data = efficiency_data.sort_values('efficiency_score', ascending=True)

    fig.add_trace(
        go.Bar(
            x=efficiency_data['efficiency_score'],
            y=efficiency_data['Model'],
            orientation='h',
            name='Efficiency',
            marker_color=[model_colors.get(model, fallback_color) for model in efficiency_data['Model']],
            showlegend=False,
            hovertemplate='<b>%{y}</b><br>Efficiency: %{x:.4f} F1/sec<extra></extra>'
        ),
        row=2, col=1
    )

    # 4. Annotations per Second by Model
    for model in models:
        model_data = merged_data[merged_data['Model'] == model]
        fig.add_trace(
            go.Box(
                y=model_data['annotations_per_second'],
                name=model,
                boxpoints='all',
                jitter=0.3,
                marker_color=model_colors.get(model, fallback_color),
                showlegend=False,
                hovertemplate=f'<b>{model}</b><br>Ann/sec: %{{y:.2f}}<extra></extra>'
            ),
            row=2, col=2
        )

    # 5. Time per Annotation Distribution
    for model in models:
        model_data = merged_data[merged_data['Model'] == model]
        fig.add_trace(
            go.Violin(
                y=model_data['time_per_annotation'],
                name=model,
                side='positive',
                marker_color=model_colors.get(model, fallback_color),
                showlegend=False,
                hovertemplate=f'<b>{model}</b><br>Time/Ann: %{{y:.3f}}s<extra></extra>'
            ),
            row=3, col=1
        )

    # 6. Runtime vs Performance Trade-off (Bubble chart)
    bubble_data = merged_data.groupby('Model').agg({
        'F1_Score': 'mean',
        'processing_time': 'mean',
        'annotation_count': 'sum'
    }).reset_index()

    fig.add_trace(
        go.Scatter(
            x=bubble_data['processing_time'],
            y=bubble_data['F1_Score'],
            mode='markers+text',
            text=bubble_data['Model'],
            textposition='top center',
            marker=dict(
                size=bubble_data['annotation_count'],
                color=[model_colors.get(model, fallback_color) for model in bubble_data['Model']],
                opacity=0.7,
                # The sizeref formula below (2 * max / desired_px ** 2) is the
                # one for area scaling; with 'diameter' it made the largest
                # bubble hundreds of pixels wide.
                sizemode='area',
                sizemin=15,
                sizeref=2.*max(bubble_data['annotation_count'])/(40.**2)
            ),
            name='Performance vs Speed',
            showlegend=False,
            hovertemplate='<b>%{text}</b><br>Time: %{x:.2f}s<br>F1: %{y:.3f}<br>Annotations: %{marker.size}<extra></extra>'
        ),
        row=3, col=2
    )

    # 7. Model Speed Comparison (fastest first)
    speed_data = merged_data.groupby('Model')['processing_time'].mean().reset_index()
    speed_data = speed_data.sort_values('processing_time')

    fig.add_trace(
        go.Bar(
            x=speed_data['Model'],
            y=speed_data['processing_time'],
            name='Avg Processing Time',
            marker_color=[model_colors.get(model, fallback_color) for model in speed_data['Model']],
            showlegend=False,
            hovertemplate='<b>%{x}</b><br>Avg Time: %{y:.2f}s<extra></extra>'
        ),
        row=4, col=1
    )

    # 8. Document Processing Complexity Heatmap
    complexity_pivot = merged_data.pivot(index='Document', columns='Model', values='processing_time')

    fig.add_trace(
        go.Heatmap(
            z=complexity_pivot.values,
            x=complexity_pivot.columns,
            y=complexity_pivot.index,
            colorscale='Viridis',
            showscale=True,
            hovertemplate='Document: %{y}<br>Model: %{x}<br>Time: %{z:.2f}s<extra></extra>',
            colorbar=dict(title="Processing Time (s)", x=1.02)
        ),
        row=4, col=2
    )

    # Update layout
    fig.update_layout(
        height=1600,
        title=f'🚀 Runtime Analysis Dashboard<br><sub>Pipeline: {pipeline_folder.name} | Total Time: {runtime_metrics["total_pipeline_time"]/60:.1f} minutes</sub>',
        title_x=0.5,
        showlegend=True,
        font=dict(size=11)
    )

    # Axis titles keyed by (row, col): (x title, y title)
    axis_titles = {
        (1, 1): ("Documents", "Processing Time (s)"),
        (1, 2): ("Processing Time (s)", "F1 Score"),
        (2, 1): ("Efficiency Score (F1/s)", "Model"),
        (2, 2): ("Model", "Annotations per Second"),
        (3, 1): ("Model", "Time per Annotation (s)"),
        (3, 2): ("Processing Time (s)", "F1 Score"),
        (4, 1): ("Model", "Processing Time (s)"),
        (4, 2): ("Model", "Document"),
    }
    for (row, col), (x_title, y_title) in axis_titles.items():
        fig.update_xaxes(title_text=x_title, row=row, col=col)
        fig.update_yaxes(title_text=y_title, row=row, col=col)

    # Document names are long; tilt them on the first panel.
    fig.update_xaxes(tickangle=45, row=1, col=1)

    return fig, runtime_metrics

def print_runtime_summary(runtime_metrics):
    """Print a text report of pipeline runtimes and return the tables behind it.

    Args:
        runtime_metrics: dict with the keys
            'merged_data' - DataFrame with the columns 'Model', 'Document',
                'processing_time', 'F1_Score', 'efficiency_score',
                'annotations_per_second' and 'time_per_annotation';
            'pipeline_info' - dict with 'processed_documents', 'models_used',
                'start_time' and 'end_time';
            'total_pipeline_time' - total runtime in seconds.

    Returns:
        dict with 'model_stats', 'speed_ranking', 'efficiency_ranking' and
        'document_stats'. When 'merged_data' has no rows only the pipeline
        overview is printed and all four entries are empty.
    """
    merged_data = runtime_metrics['merged_data']
    pipeline_info = runtime_metrics['pipeline_info']
    total_time = runtime_metrics['total_pipeline_time']
    
    print("\n🚀 RUNTIME ANALYSIS SUMMARY")
    print("=" * 60)
    
    # Pipeline overview
    print(f"\n⏱️ PIPELINE OVERVIEW:")
    print(f"Total processing time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
    print(f"Documents processed: {pipeline_info['processed_documents']}")
    print(f"Models used: {len(pipeline_info['models_used'])}")
    print(f"Start time: {pipeline_info['start_time']}")
    print(f"End time: {pipeline_info['end_time']}")
    
    # With no rows there is nothing to rank: indexing the first entry of an
    # empty ranking (and idxmax on an empty Series) would raise further down.
    if merged_data.empty:
        print("\n⚠️ No runtime rows available - skipping model and document breakdowns.")
        return {
            'model_stats': pd.DataFrame(),
            'speed_ranking': pd.Series(dtype=float),
            'efficiency_ranking': pd.Series(dtype=float),
            'document_stats': pd.DataFrame()
        }
    
    per_model = merged_data.groupby('Model')
    
    # Model performance summary
    model_stats = per_model.agg({
        'processing_time': ['mean', 'std', 'sum'],
        'F1_Score': 'mean',
        'efficiency_score': 'mean',
        'annotations_per_second': 'mean',
        'time_per_annotation': 'mean'
    }).round(3)
    
    print(f"\n📊 MODEL PERFORMANCE SUMMARY:")
    print(f"{'Model':<15} {'Avg Time (s)':<12} {'Total Time (s)':<14} {'Avg F1':<8} {'Efficiency':<10} {'Ann/sec':<8}")
    print("-" * 75)
    
    for model in model_stats.index:
        avg_time = model_stats.loc[model, ('processing_time', 'mean')]
        total_time_model = model_stats.loc[model, ('processing_time', 'sum')]
        avg_f1 = model_stats.loc[model, ('F1_Score', 'mean')]
        efficiency = model_stats.loc[model, ('efficiency_score', 'mean')]
        ann_per_sec = model_stats.loc[model, ('annotations_per_second', 'mean')]
        
        print(f"{model:<15} {avg_time:<12.2f} {total_time_model:<14.1f} {avg_f1:<8.3f} {efficiency:<10.4f} {ann_per_sec:<8.2f}")
    
    # Speed rankings (ascending time) and efficiency rankings (descending score)
    speed_ranking = per_model['processing_time'].mean().sort_values()
    efficiency_ranking = per_model['efficiency_score'].mean().sort_values(ascending=False)
    
    print(f"\n🏃 SPEED RANKINGS (fastest to slowest):")
    for i, (model, avg_seconds) in enumerate(speed_ranking.items(), 1):
        print(f"{i}. {model}: {avg_seconds:.2f}s avg per document")
    
    print(f"\n🎯 EFFICIENCY RANKINGS (F1/time, best to worst):")
    for i, (model, eff) in enumerate(efficiency_ranking.items(), 1):
        print(f"{i}. {model}: {eff:.4f} F1 per second")
    
    # Document analysis
    doc_stats = merged_data.groupby('Document').agg({
        'processing_time': ['mean', 'std'],
        'F1_Score': 'mean'
    }).round(3)
    
    print(f"\n📄 DOCUMENT PROCESSING ANALYSIS:")
    print(f"{'Document':<40} {'Avg Time (s)':<12} {'Std (s)':<10} {'Avg F1':<8}")
    print("-" * 70)
    
    for doc in doc_stats.index:
        doc_label = str(doc)  # tolerate non-string document identifiers
        doc_short = doc_label[:35] + "..." if len(doc_label) > 35 else doc_label
        avg_time = doc_stats.loc[doc, ('processing_time', 'mean')]
        std_time = doc_stats.loc[doc, ('processing_time', 'std')]  # NaN when a document has a single row
        avg_f1 = doc_stats.loc[doc, ('F1_Score', 'mean')]
        
        print(f"{doc_short:<40} {avg_time:<12.2f} {std_time:<10.2f} {avg_f1:<8.3f}")
    
    # Key insights
    fastest_model = speed_ranking.index[0]
    slowest_model = speed_ranking.index[-1]
    most_efficient = efficiency_ranking.index[0]
    fastest_time = speed_ranking.iloc[0]
    slowest_time = speed_ranking.iloc[-1]
    
    print(f"\n💡 KEY INSIGHTS:")
    print(f"• Fastest model: {fastest_model} ({fastest_time:.2f}s avg)")
    print(f"• Slowest model: {slowest_model} ({slowest_time:.2f}s avg)")
    print(f"• Most efficient: {most_efficient} ({efficiency_ranking.iloc[0]:.4f} F1/sec)")
    if fastest_time > 0:
        print(f"• Speed difference: {slowest_model} is {slowest_time/fastest_time:.1f}x slower than {fastest_model}")
    else:
        # A ratio against a zero (or missing) baseline would print inf/nan.
        print("• Speed difference: not available (fastest average time is not positive)")
    
    # Performance vs speed trade-offs
    best_f1_model = per_model['F1_Score'].mean().idxmax()
    print(f"• Best F1 model: {best_f1_model}")
    print(f"• Best trade-off: {most_efficient} (combines good performance with speed)")
    
    return {
        'model_stats': model_stats,
        'speed_ranking': speed_ranking,
        'efficiency_ranking': efficiency_ranking,
        'document_stats': doc_stats
    }

# Status line emitted once the definitions above have been executed.
print("✅ Runtime analysis functions updated successfully!")

In [None]:
print("RUNTIME ANALYSIS")
print("=" * 40)

# Start from an empty frame on every run. Without this, a runtime_df left in the
# kernel by an earlier run (e.g. for another folder) would survive the early-exit
# branches below and be silently reused by the cells that follow.
runtime_df = pd.DataFrame()

# Check if we have a selected pipeline folder
if 'selected_pipeline_folder' not in globals() or not selected_pipeline_folder:
    print("❌ No pipeline folder selected. Please run the pipeline folder selector cell first.")
else:
    # Set up the main file path: <folder>/<folder name>.json
    main_file = Path(selected_pipeline_folder) / f"{Path(selected_pipeline_folder).name}.json"
    
    if not main_file.exists():
        print(f"❌ Pipeline info file not found: {main_file}")
        print("Available files in the pipeline folder:")
        for file in Path(selected_pipeline_folder).glob("*.json"):
            print(f"  - {file.name}")
    else:
        print(f"✅ Using pipeline folder: {selected_pipeline_folder}")
        print(f"📄 Pipeline info file: {main_file}")
        
        # Extract actual runtime data from annotations
        model_runtime_data = []
        document_runtime_data = []

        # JSON files in the folder that are not per-document annotation exports.
        # The evaluation results file is stored in the same folder (the folder
        # selector looks for it there), so it has to be skipped explicitly.
        non_document_files = {"pipeline_info.json", "llm_evaluation_results.json", main_file.name}

        # Get runtime data for each model-document combination.
        # sorted() keeps the row order identical from run to run.
        for doc_file in sorted(main_file.parent.glob("*.json")):
            if doc_file.name in non_document_files:
                continue
            try:
                with open(doc_file, 'r', encoding='utf-8') as f:
                    doc_data = json.load(f)
                doc_name = doc_data.get('document_name', doc_file.stem)
                
                # One record per annotation; missing timings fall back to 0
                for annotation in doc_data.get('annotations', []):
                    model_runtime_data.append({
                        'model': annotation.get('model_name', 'Unknown'),
                        'document': doc_name,
                        'llm_runtime_seconds': annotation.get('llm_runtime_seconds', 0),
                        'model_runtime_seconds': annotation.get('model_runtime_seconds', 0)
                    })
            except Exception as e:
                # Best effort: report the unreadable file and keep going
                print(f"Warning: Could not process {doc_file.name}: {e}")

        # Convert to DataFrame for easier analysis
        runtime_df = pd.DataFrame(model_runtime_data)

        if not runtime_df.empty:
            # Aggregate runtime by model
            model_totals = runtime_df.groupby('model')['model_runtime_seconds'].sum().sort_values(ascending=False)
            
            # Aggregate runtime by document
            doc_totals = runtime_df.groupby('document')['model_runtime_seconds'].sum().sort_values(ascending=False)
            
            # Overall statistics
            total_runtime = runtime_df['model_runtime_seconds'].sum()
            avg_per_model = runtime_df.groupby('model')['model_runtime_seconds'].mean()
            avg_per_doc = runtime_df.groupby('document')['model_runtime_seconds'].mean()
            
            print(f"Total Processing Time: {total_runtime:.2f} seconds ({total_runtime/3600:.2f} hours)")
            print(f"Average per Model-Document: {runtime_df['model_runtime_seconds'].mean():.2f} seconds")
            print(f"Total Model-Document Combinations: {len(runtime_df)}")
            
            print("\nTOP 5 SLOWEST MODELS (Total Time):")
            for i, (model, time_sec) in enumerate(model_totals.head(5).items(), 1):
                print(f"  {i}. {model}: {time_sec:.2f}s ({time_sec/60:.1f}m)")
            
            print("\nTOP 5 FASTEST MODELS (Average Time per Document):")
            fastest_avg = avg_per_model.sort_values(ascending=True)
            for i, (model, time_sec) in enumerate(fastest_avg.head(5).items(), 1):
                print(f"  {i}. {model}: {time_sec:.2f}s avg")
            
            print("\nTOP 5 MOST TIME-CONSUMING DOCUMENTS:")
            for i, (doc, time_sec) in enumerate(doc_totals.head(5).items(), 1):
                print(f"  {i}. {doc}: {time_sec:.2f}s ({time_sec/60:.1f}m)")
            
            # Store runtime data for potential visualization
            model_runtime_summary = model_totals
            document_runtime_summary = doc_totals
            
            print(f"\nRuntime analysis complete! Found timing data for {len(runtime_df)} model-document combinations")
        else:
            print("No runtime data found in annotations")

In [None]:
print()
print("DETAILED MODEL RUNTIMES PER DOCUMENT")
print("=" * 50)

# Use the runtime data we collected in the previous cell
if 'runtime_df' in globals() and not runtime_df.empty:
    # Document x model table of runtimes (pivot_table averages duplicate pairs);
    # pairs that never occurred are filled with 0.
    runtime_pivot = runtime_df.pivot_table(
        index='document', 
        columns='model', 
        values='model_runtime_seconds',
        fill_value=0
    )
    
    # Display top 10 documents by total runtime
    doc_total_times = runtime_df.groupby('document')['model_runtime_seconds'].sum().sort_values(ascending=False)
    
    print("Top 10 Most Time-Consuming Documents:")
    print("-" * 40)
    for i, (doc, total_time) in enumerate(doc_total_times.head(10).items(), 1):
        hours = total_time / 3600
        print(f"{i:2d}. {doc}")
        print(f"    Total: {total_time:.1f}s ({hours:.1f}h)")
        
        # Show breakdown by model for this document
        doc_models = runtime_df[runtime_df['document'] == doc].sort_values('model_runtime_seconds', ascending=False)
        print("    Top 3 slowest models:")
        for _, row in doc_models.head(3).iterrows():
            model_time = row['model_runtime_seconds']
            # Both operands are plain Python numbers here, so a document whose
            # runtimes sum to 0 (missing timings default to 0) would raise
            # ZeroDivisionError; report a 0% share instead.
            pct = (model_time / total_time) * 100 if total_time else 0.0
            print(f"      - {row['model']}: {model_time:.1f}s ({pct:.1f}%)")
        print()
    
    # Show model performance consistency
    print("\nModel Performance Consistency:")
    print("-" * 35)
    model_stats = runtime_df.groupby('model')['model_runtime_seconds'].agg(['mean', 'std', 'min', 'max']).round(2)
    model_stats['cv'] = (model_stats['std'] / model_stats['mean']).round(3)  # Coefficient of variation
    model_stats = model_stats.sort_values('cv')  # Sort by consistency (lower CV = more consistent)
    
    print("Most Consistent Models (lowest coefficient of variation):")
    for i, (model, stats) in enumerate(model_stats.head(5).iterrows(), 1):
        print(f"{i}. {model}")
        print(f"   Avg: {stats['mean']:.1f}s, CV: {stats['cv']:.3f}")
        print(f"   Range: {stats['min']:.1f}s - {stats['max']:.1f}s")
    
    # Store the pivot table for potential use in visualizations
    model_document_runtimes = runtime_pivot
    print(f"\nData summary: {len(runtime_df)} model-document combinations analyzed")
    
else:
    print("No runtime data available. Please run the previous runtime analysis cell first.")

In [None]:
# Status line for this cell; note that it is printed before the function
# definition below has been executed.
print("✅ Runtime analysis and dashboard functions ready!")

def create_actual_runtime_dashboard():
    """Build and show a 3x2 Plotly dashboard from the measured runtimes.

    Reads the notebook globals ``selected_pipeline_folder`` and ``runtime_df``
    (columns 'model', 'document', 'model_runtime_seconds'), joins the runtimes
    with the lenient-mode scores loaded from ``llm_evaluation_results.json``
    and plots: total time per model, F1 vs. time, efficiency (F1 per 1000 s),
    the time distribution per model, the ten slowest model-document pairs and
    the per-model F1/time trade-off. A short text summary is printed afterwards.

    Returns:
        The Plotly figure, or None when no folder is selected, the runtime
        data is missing or empty, the evaluation file does not exist, or the
        merge of runtime and evaluation rows is empty.
    """
    
    def _shorten(text, limit):
        """Cut text to `limit` characters, adding an ellipsis only if it was cut."""
        text = str(text)
        return text[:limit] + "..." if len(text) > limit else text
    
    # Check if we have a selected pipeline folder
    if 'selected_pipeline_folder' not in globals() or not selected_pipeline_folder:
        print("❌ No pipeline folder selected. Please run the pipeline folder selector cell first.")
        return None
    
    # Check if runtime data exists from previous analysis
    try:
        # Reference the global runtime_df created in the previous cell
        runtime_data = runtime_df
        if runtime_data.empty:
            print("❌ Runtime data is empty. Please run the runtime analysis cell first.")
            return None
    except NameError:
        print("❌ No runtime data available. Please run the runtime analysis cell first.")
        return None
    
    # Set up the evaluation results path
    eval_results_path = Path(selected_pipeline_folder) / "llm_evaluation_results.json"
    if not eval_results_path.exists():
        print(f"❌ Evaluation results not found: {eval_results_path}")
        return None
    
    # Load evaluation results for F1 scores
    eval_results = load_evaluation_results(eval_results_path)
    eval_df = create_dataframe_from_results(eval_results)
    eval_lenient = eval_df[eval_df['Evaluation_Mode'] == 'Lenient'].copy()
    
    # One row of mean scores per (document, model) pair
    eval_summary = eval_lenient.groupby(['Document', 'Model']).agg({
        'F1_Score': 'mean',
        'Precision': 'mean',
        'Recall': 'mean'
    }).reset_index()
    
    # Merge with actual runtime data; the inner join keeps only pairs that
    # have both a runtime and an evaluation row
    merged_data = runtime_data.merge(
        eval_summary, 
        left_on=['document', 'model'], 
        right_on=['Document', 'Model'], 
        how='inner'
    )
    
    if merged_data.empty:
        print("❌ Could not merge runtime and evaluation data")
        return None
    
    # Dynamic color palette for all available models
    color_palette = [
        '#E74C3C',   # Red
        '#3498DB',   # Blue
        '#2ECC71',   # Green
        '#F39C12',   # Orange
        '#9B59B6',   # Purple
        '#1ABC9C',   # Teal
        '#F1C40F',   # Yellow
        '#E67E22',   # Dark Orange
        '#95A5A6',   # Gray
        '#34495E',   # Dark Blue Gray
        '#16A085',   # Dark Teal
        '#8E44AD',   # Dark Purple
        '#C0392B',   # Dark Red
        '#2980B9',   # Dark Blue
        '#27AE60',   # Dark Green
        '#D35400'    # Dark Orange Red
    ]
    
    # Assign colors in order of first appearance; the palette wraps around
    # when there are more models than colors
    unique_models = merged_data['model'].unique()
    model_colors = {
        model: color_palette[i % len(color_palette)]
        for i, model in enumerate(unique_models)
    }
    
    # Create dashboard
    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=(
            'Actual Processing Time by Model (seconds)',
            'F1 Score vs Processing Time',
            # The plotted values are scaled by 1000, so the title says so too
            'Model Efficiency (F1 Score per 1000 seconds)',
            'Processing Time Distribution by Model',
            'Top 10 Slowest Document-Model Combinations',
            'Model Performance vs Speed Trade-off'
        ),
        specs=[[{"type": "bar"}, {"type": "scatter"}],
               [{"type": "bar"}, {"type": "box"}],
               [{"type": "bar"}, {"type": "scatter"}]],
        vertical_spacing=0.12,
        horizontal_spacing=0.15
    )
    
    # Get model totals and averages
    model_totals = merged_data.groupby('model')['model_runtime_seconds'].agg(['sum', 'mean']).reset_index()
    model_totals.columns = ['model', 'total_time', 'avg_time']
    model_totals = model_totals.sort_values('total_time', ascending=False)
    
    # Per-model means, shared by the efficiency bars (3) and the trade-off plot (6)
    model_summary = merged_data.groupby('model').agg({
        'F1_Score': 'mean',
        'model_runtime_seconds': 'mean'
    }).reset_index()
    
    # 1. Total processing time by model
    fig.add_trace(
        go.Bar(
            x=model_totals['model'],
            y=model_totals['total_time'],
            marker_color=[model_colors.get(model, '#95A5A6') for model in model_totals['model']],
            name='Total Time',
            hovertemplate='<b>%{x}</b><br>Total Time: %{y:.1f}s (%{customdata:.1f}m)<extra></extra>',
            customdata=model_totals['total_time'] / 60
        ),
        row=1, col=1
    )
    
    # 2. F1 Score vs Processing Time scatter
    for model in unique_models:
        model_data = merged_data[merged_data['model'] == model]
        fig.add_trace(
            go.Scatter(
                x=model_data['model_runtime_seconds'],
                y=model_data['F1_Score'],
                mode='markers',
                name=model,
                marker=dict(
                    size=10,
                    color=model_colors.get(model, '#95A5A6'),
                    opacity=0.7
                ),
                text=model_data['document'],
                hovertemplate=f'<b>{model}</b><br>Document: %{{text}}<br>Time: %{{x:.1f}}s<br>F1: %{{y:.3f}}<extra></extra>',
                showlegend=False
            ),
            row=1, col=2
        )
    
    # 3. Model efficiency: mean F1 divided by mean runtime, per 1000 seconds
    model_efficiency = model_summary[['model']].copy()
    model_efficiency['efficiency'] = (
        model_summary['F1_Score'] / model_summary['model_runtime_seconds']
    ) * 1000
    model_efficiency = model_efficiency.sort_values('efficiency', ascending=False)
    
    fig.add_trace(
        go.Bar(
            x=model_efficiency['model'],
            y=model_efficiency['efficiency'],
            marker_color=[model_colors.get(model, '#95A5A6') for model in model_efficiency['model']],
            name='Efficiency',
            hovertemplate='<b>%{x}</b><br>F1 per 1000s: %{y:.3f}<extra></extra>'
        ),
        row=2, col=1
    )
    
    # 4. Processing time distribution by model (box plot)
    for model in unique_models:
        model_data = merged_data[merged_data['model'] == model]
        fig.add_trace(
            go.Box(
                y=model_data['model_runtime_seconds'],
                name=model,
                marker_color=model_colors.get(model, '#95A5A6'),
                showlegend=False
            ),
            row=2, col=2
        )
    
    # 5. Top 10 slowest combinations
    slowest_combinations = merged_data.nlargest(10, 'model_runtime_seconds')
    slowest_pairs = list(zip(
        slowest_combinations['model'],
        slowest_combinations['document'],
        slowest_combinations['model_runtime_seconds']
    ))
    fig.add_trace(
        go.Bar(
            x=[f"{_shorten(model, 15)}<br>{_shorten(doc, 20)}" for model, doc, _ in slowest_pairs],
            y=slowest_combinations['model_runtime_seconds'],
            marker_color=[model_colors.get(model, '#95A5A6') for model in slowest_combinations['model']],
            name='Slowest Combinations',
            # customdata[0] is the full label, customdata[1] the runtime in
            # minutes, so the value in parentheses really is minutes
            hovertemplate='<b>%{customdata[0]}</b><br>Time: %{y:.1f}s (%{customdata[1]:.1f}m)<extra></extra>',
            customdata=[[f"{model} - {doc}", float(seconds) / 60] for model, doc, seconds in slowest_pairs]
        ),
        row=3, col=1
    )
    
    # 6. Performance vs Speed trade-off (average by model)
    fig.add_trace(
        go.Scatter(
            x=model_summary['model_runtime_seconds'],
            y=model_summary['F1_Score'],
            mode='markers+text',
            text=model_summary['model'],
            textposition='top center',
            marker=dict(
                size=15,
                color=[model_colors.get(model, '#95A5A6') for model in model_summary['model']],
                opacity=0.8
            ),
            name='Model Average',
            hovertemplate='<b>%{text}</b><br>Avg Time: %{x:.1f}s<br>Avg F1: %{y:.3f}<extra></extra>',
            showlegend=False
        ),
        row=3, col=2
    )
    
    # Update layout
    fig.update_layout(
        height=1200,
        title=f'Actual Runtime Performance Dashboard<br><sub>Based on real timing data from {len(merged_data)} model-document combinations</sub>',
        title_x=0.5,
        showlegend=False
    )
    
    # Update axes labels
    fig.update_xaxes(title_text="Model", row=1, col=1)
    fig.update_yaxes(title_text="Total Time (seconds)", row=1, col=1)
    fig.update_xaxes(title_text="Processing Time (seconds)", row=1, col=2)
    fig.update_yaxes(title_text="F1 Score", row=1, col=2)
    fig.update_xaxes(title_text="Model", row=2, col=1)
    fig.update_yaxes(title_text="F1 per 1000 seconds", row=2, col=1)
    fig.update_xaxes(title_text="Model", row=2, col=2)
    fig.update_yaxes(title_text="Processing Time (seconds)", row=2, col=2)
    fig.update_xaxes(title_text="Model - Document", row=3, col=1)
    fig.update_yaxes(title_text="Processing Time (seconds)", row=3, col=1)
    fig.update_xaxes(title_text="Average Processing Time (seconds)", row=3, col=2)
    fig.update_yaxes(title_text="Average F1 Score", row=3, col=2)
    
    fig.show()
    
    # Print summary statistics. model_totals is ordered by total time, so the
    # "(avg)" lines need their own ordering by average time.
    by_avg_time = model_totals.sort_values('avg_time')
    fastest, slowest = by_avg_time.iloc[0], by_avg_time.iloc[-1]
    
    print(f"\nRUNTIME DASHBOARD SUMMARY")
    print("=" * 30)
    print(f"Total combinations analyzed: {len(merged_data)}")
    print(f"Fastest model (avg): {fastest['model']} ({fastest['avg_time']:.1f}s)")
    print(f"Slowest model (avg): {slowest['model']} ({slowest['avg_time']:.1f}s)")
    print(f"Most efficient: {model_efficiency.iloc[0]['model']} ({model_efficiency.iloc[0]['efficiency']:.3f} F1/1000s)")
    
    return fig

# Create the actual runtime dashboard.
# The result is None when the function exits early (no folder selected, no
# runtime data, missing evaluation file, or an empty merge).
actual_runtime_fig = create_actual_runtime_dashboard()