In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Global look-and-feel for matplotlib/seaborn figures.
# NOTE(review): every figure built in the visible code is a plotly figure, so
# these two calls only matter if matplotlib/seaborn plots are drawn elsewhere.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

def load_evaluation_results(results_path):
    """Parse the UTF-8 JSON file at ``results_path`` and return its content."""
    with open(results_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def create_dataframe_from_results(results):
    """Flatten nested evaluation results into one DataFrame row per mode.

    ``results`` is walked as document -> model -> annotation type -> metrics.
    For every annotation type two rows are produced, first the 'Lenient' one
    (read from ``metrics['lenient']``) and then the 'Strict' one (read from
    ``metrics['strict']``). Both rows carry the shared ``gold_count`` and
    ``predicted_count`` values of that metrics entry.
    """
    # (key inside the metrics dict, label stored in the Evaluation_Mode column)
    evaluation_modes = (('lenient', 'Lenient'), ('strict', 'Strict'))

    records = []
    for doc_name, per_model in results.items():
        for model_name, per_type in per_model.items():
            for ann_type, metrics in per_type.items():
                for mode_key, mode_label in evaluation_modes:
                    scores = metrics[mode_key]
                    records.append({
                        'Document': doc_name,
                        'Model': model_name,
                        'Annotation_Type': ann_type,
                        'Evaluation_Mode': mode_label,
                        'Precision': scores['precision'],
                        'Recall': scores['recall'],
                        'F1_Score': scores['f1_score'],
                        'True_Positives': scores['true_positives'],
                        'False_Positives': scores['false_positives'],
                        'False_Negatives': scores['false_negatives'],
                        'Gold_Count': metrics['gold_count'],
                        'Predicted_Count': metrics['predicted_count'],
                    })

    return pd.DataFrame(records)

def visualize_single_run(results_path):
    """Build the eight-panel Plotly dashboard for a single evaluation run.

    Args:
        results_path: Path to an ``llm_evaluation_results.json`` file. The
            name of its parent folder is shown in the figure subtitle.

    Returns:
        A plotly figure with a 4x2 subplot grid. Every panel except the
        lenient-vs-strict comparison (row 3, col 1) uses the lenient rows only.

    Notes:
        * Panel 7 plots the *average* gold count next to the *average*
          predicted count per annotation type, so both bars are averaged over
          the same rows.
        * Panel 8 plots ``1 - CV`` of the per-document mean F1. With a single
          document the standard deviation is NaN, so that model's point is NaN.
    """
    # Load and prepare data
    results = load_evaluation_results(results_path)
    df = create_dataframe_from_results(results)

    fig = make_subplots(
        rows=4, cols=2,
        subplot_titles=(
            'F1-Scores by Model and Annotation Type (Lenient)',
            'Document-Level Performance Heatmap',
            'Precision vs Recall by Model (with Document Points)',
            'Model Performance Across Documents',
            'Lenient vs Strict Evaluation Comparison',
            'Individual Document Analysis',
            'Annotation Type Performance Distribution',
            'Model Consistency Across Documents'
        ),
        specs=[[{"type": "bar"}, {"type": "heatmap"}],
               [{"type": "scatter"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "box"}],
               [{"type": "bar"}, {"type": "scatter"}]],
        vertical_spacing=0.08,
        horizontal_spacing=0.12
    )

    # Fixed colors for the known models; anything else falls back to gray
    model_colors = {
        'gemma3:1b': '#E74C3C',      # Red
        'gemma3:4b': '#3498DB',      # Blue
        'gemma3:12b': '#2ECC71',     # Green
        'mistral:latest': '#F39C12'  # Orange
    }
    fallback_color = '#95A5A6'

    df_lenient = df[df['Evaluation_Mode'] == 'Lenient']
    # Models in order of first appearance; reused by panels 1, 3, 4 and 8
    models = df_lenient['Model'].unique()

    # 1. F1-Scores by Model and Annotation Type (Lenient)
    for model in models:
        model_data = df_lenient[df_lenient['Model'] == model]
        avg_f1_by_type = model_data.groupby('Annotation_Type')['F1_Score'].mean().reset_index()

        fig.add_trace(
            go.Bar(
                x=avg_f1_by_type['Annotation_Type'],
                y=avg_f1_by_type['F1_Score'],
                name=f"{model}",
                marker_color=model_colors.get(model, fallback_color),
                showlegend=True,
                legendgroup='models',
                hovertemplate=f'<b>{model}</b><br>Type: %{{x}}<br>Avg F1: %{{y:.3f}}<extra></extra>'
            ),
            row=1, col=1
        )

    # 2. Document-Level Performance Heatmap (mean F1 per document and model)
    doc_model_performance = df_lenient.groupby(['Document', 'Model'])['F1_Score'].mean().reset_index()
    heatmap_pivot = doc_model_performance.pivot(index='Document', columns='Model', values='F1_Score')

    fig.add_trace(
        go.Heatmap(
            z=heatmap_pivot.values,
            x=heatmap_pivot.columns,
            y=heatmap_pivot.index,
            colorscale='RdYlBu_r',
            showscale=True,
            colorbar=dict(title="F1 Score", x=0.48),
            hovertemplate='Document: %{y}<br>Model: %{x}<br>F1 Score: %{z:.3f}<extra></extra>',
            showlegend=False
        ),
        row=1, col=2
    )

    # 3. Precision vs Recall: one point per (document, annotation type) row
    for model in models:
        model_data = df_lenient[df_lenient['Model'] == model]
        fig.add_trace(
            go.Scatter(
                x=model_data['Recall'],
                y=model_data['Precision'],
                mode='markers',
                name=f"{model} (docs)",
                marker=dict(
                    size=8,
                    color=model_colors.get(model, fallback_color),
                    symbol='circle',
                    opacity=0.7
                ),
                text=model_data['Document'] + '<br>' + model_data['Annotation_Type'],
                hovertemplate='<b>%{text}</b><br>Recall: %{x:.3f}<br>Precision: %{y:.3f}<extra></extra>',
                showlegend=True,
                legendgroup='scatter'
            ),
            row=2, col=1
        )

    # 4. Model Performance Across Documents (mean F1 per document, one bar series per model)
    for model in models:
        model_data = df_lenient[df_lenient['Model'] == model]
        doc_performance = model_data.groupby('Document')['F1_Score'].mean().reset_index()

        fig.add_trace(
            go.Bar(
                x=doc_performance['Document'],
                y=doc_performance['F1_Score'],
                name=f"{model}",
                marker_color=model_colors.get(model, fallback_color),
                showlegend=False,
                opacity=0.8,
                hovertemplate=f'<b>{model}</b><br>Document: %{{x}}<br>Avg F1: %{{y:.3f}}<extra></extra>'
            ),
            row=2, col=2
        )

    # 5. Lenient vs Strict comparison (mean F1 per model and evaluation mode)
    df_comparison = df.groupby(['Model', 'Evaluation_Mode'])['F1_Score'].mean().reset_index()
    for mode, bar_name, bar_color in (
        ('Lenient', 'Lenient Evaluation', 'lightblue'),
        ('Strict', 'Strict Evaluation', 'darkblue'),
    ):
        mode_data = df_comparison[df_comparison['Evaluation_Mode'] == mode]
        fig.add_trace(
            go.Bar(
                x=mode_data['Model'],
                y=mode_data['F1_Score'],
                name=bar_name,
                marker_color=bar_color,
                showlegend=True,
                legendgroup='evaluation_modes',
                hovertemplate=f'<b>{mode}</b><br>Model: %{{x}}<br>F1: %{{y:.3f}}<extra></extra>'
            ),
            row=3, col=1
        )

    # 6. Individual Document Analysis (box per annotation type, all points shown)
    box_palette = px.colors.qualitative.Set1
    for i, annotation_type in enumerate(df_lenient['Annotation_Type'].unique()):
        ann_data = df_lenient[df_lenient['Annotation_Type'] == annotation_type]

        fig.add_trace(
            go.Box(
                y=ann_data['F1_Score'],
                name=annotation_type,
                boxpoints='all',
                jitter=0.3,
                pointpos=-1.8,
                marker_color=box_palette[i % len(box_palette)],
                showlegend=True,
                legendgroup='annotation_types',
                hovertemplate=f'<b>{annotation_type}</b><br>F1: %{{y:.3f}}<extra></extra>'
            ),
            row=3, col=2
        )

    # 7. Annotation Type Performance Distribution.
    # Both counts are averaged over the same rows so the bars are comparable;
    # taking only the first gold count would describe a single document.
    df_counts = (
        df_lenient.groupby('Annotation_Type')[['Gold_Count', 'Predicted_Count']]
        .mean()
        .reset_index()
    )

    fig.add_trace(
        go.Bar(
            x=df_counts['Annotation_Type'],
            y=df_counts['Gold_Count'],
            name='Avg Gold Standard Count',
            marker_color='gold',
            showlegend=True,
            legendgroup='counts',
            hovertemplate='<b>Gold Standard</b><br>Type: %{x}<br>Avg Count: %{y:.1f}<extra></extra>'
        ),
        row=4, col=1
    )

    # No explicit yaxis here: add_trace(row=, col=) assigns the subplot axes.
    fig.add_trace(
        go.Bar(
            x=df_counts['Annotation_Type'],
            y=df_counts['Predicted_Count'],
            name='Avg Predicted Count',
            marker_color='silver',
            showlegend=True,
            legendgroup='counts',
            hovertemplate='<b>Predicted</b><br>Type: %{x}<br>Avg Count: %{y:.1f}<extra></extra>'
        ),
        row=4, col=1
    )

    # 8. Model Consistency Across Documents (1 - coefficient of variation)
    model_consistency = []
    for model in models:
        model_data = df_lenient[df_lenient['Model'] == model]
        doc_scores = model_data.groupby('Document')['F1_Score'].mean()
        mean_f1 = doc_scores.mean()
        # A zero mean would divide by zero, so CV is defined as 0 in that case
        cv = doc_scores.std() / mean_f1 if mean_f1 > 0 else 0
        model_consistency.append({'Model': model, 'Consistency': 1 - cv, 'CV': cv})

    consistency_df = pd.DataFrame(model_consistency)

    fig.add_trace(
        go.Scatter(
            x=consistency_df['Model'],
            y=consistency_df['Consistency'],
            mode='markers+lines',
            name='Model Consistency',
            marker=dict(
                size=12,
                color=[model_colors.get(model, fallback_color) for model in consistency_df['Model']],
                symbol='diamond'
            ),
            line=dict(color='gray', dash='dash'),
            showlegend=True,
            legendgroup='consistency',
            hovertemplate='<b>%{x}</b><br>Consistency: %{y:.3f}<br>CV: %{customdata:.3f}<extra></extra>',
            customdata=consistency_df['CV']
        ),
        row=4, col=2
    )

    # Overall layout: tall figure for 4 rows, legend to the right of the grid
    fig.update_layout(
        height=1600,
        title=f'Enhanced LLM Evaluation Dashboard<br><sub>Pipeline Results: {Path(results_path).parent.name} | Document-Level Analysis</sub>',
        title_x=0.5,
        showlegend=True,
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02,
            bgcolor="rgba(255,255,255,0.8)",
            bordercolor="rgba(0,0,0,0.2)",
            borderwidth=1,
            font=dict(size=10)
        ),
        font=dict(size=11)
    )

    # (row, col) -> (x-axis title, y-axis title)
    axis_titles = {
        (1, 1): ("Annotation Type", "Average F1 Score"),
        (1, 2): ("Model", "Document"),
        (2, 1): ("Recall", "Precision"),
        (2, 2): ("Document", "F1 Score"),
        (3, 1): ("Model", "F1 Score"),
        (3, 2): ("Annotation Type", "F1 Score Distribution"),
        (4, 1): ("Annotation Type", "Count"),
        (4, 2): ("Model", "Consistency Score"),
    }
    for (row, col), (x_title, y_title) in axis_titles.items():
        fig.update_xaxes(title_text=x_title, row=row, col=col, title_font_size=10)
        fig.update_yaxes(title_text=y_title, row=row, col=col, title_font_size=10)

    # Document names on the bar panel are tilted
    fig.update_xaxes(tickangle=45, row=2, col=2)

    return fig

def create_document_comparison_plot(df_lenient):
    """Plot each model's mean F1 per document as one line per model.

    Documents and models are taken in order of first appearance in
    ``df_lenient``. A document a model has no rows for is plotted at 0.
    Returns the plotly figure.
    """
    palette = {
        'gemma3:1b': '#E74C3C',
        'gemma3:4b': '#3498DB',
        'gemma3:12b': '#2ECC71',
        'mistral:latest': '#F39C12'
    }

    doc_names = df_lenient['Document'].unique()
    model_names = df_lenient['Model'].unique()

    doc_fig = go.Figure()
    for model_name in model_names:
        rows_for_model = df_lenient[df_lenient['Model'] == model_name]
        mean_f1_by_doc = rows_for_model.groupby('Document')['F1_Score'].mean()
        # Missing documents default to 0 so every line spans all documents
        y_values = [mean_f1_by_doc.get(doc_name, 0) for doc_name in doc_names]

        line_trace = go.Scatter(
            x=doc_names,
            y=y_values,
            mode='lines+markers',
            name=model_name,
            line=dict(color=palette.get(model_name, '#95A5A6'), width=3),
            marker=dict(size=8),
            hovertemplate=f'<b>{model_name}</b><br>Document: %{{x}}<br>Avg F1: %{{y:.3f}}<extra></extra>'
        )
        doc_fig.add_trace(line_trace)

    # Horizontal legend placed above the plot area, right-aligned
    legend_settings = dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
    doc_fig.update_layout(
        title='Model Performance Across Individual Documents',
        xaxis_title='Document',
        yaxis_title='Average F1 Score',
        height=400,
        hovermode='x unified',
        legend=legend_settings
    )
    doc_fig.update_xaxes(tickangle=45)

    return doc_fig

# Example usage: single-run dashboard, per-document comparison and a text summary
results_path = "output/pipeline_results_20250804_170535/llm_evaluation_results.json"

if not Path(results_path).exists():
    # Nothing to plot: report the problem and list the result files that do exist
    print(f"❌ Results file not found: {results_path}")
    print("Please update the path to your evaluation results file.")
    print("\nAvailable pipeline results:")
    output_dir = Path("output")
    if output_dir.exists():
        for folder in sorted(output_dir.glob("pipeline_results_*")):
            results_file = folder / "llm_evaluation_results.json"
            if results_file.exists():
                print(f"  - {results_file}")
else:
    # Main dashboard
    fig = visualize_single_run(results_path)
    fig.show()

    # Additional document comparison plot, built from the lenient rows only
    results = load_evaluation_results(results_path)
    df = create_dataframe_from_results(results)
    df_lenient = df[df['Evaluation_Mode'] == 'Lenient']

    doc_fig = create_document_comparison_plot(df_lenient)
    doc_fig.show()

    # Text summary
    print("📊 EVALUATION SUMMARY:")
    print("=" * 50)

    # Models ordered from best to worst mean F1
    model_rankings = (
        df_lenient.groupby('Model')['F1_Score']
        .agg(['mean', 'std'])
        .round(3)
        .sort_values('mean', ascending=False)
    )
    print("\n🏆 MODEL RANKINGS (by average F1):")
    for i, (model, stats) in enumerate(model_rankings.iterrows(), start=1):
        print(f"{i}. {model}: {stats['mean']:.3f} (±{stats['std']:.3f})")

    # Annotation types ordered from best to worst mean F1
    ann_performance = (
        df_lenient.groupby('Annotation_Type')['F1_Score']
        .agg(['mean', 'std'])
        .round(3)
        .sort_values('mean', ascending=False)
    )
    print("\n📋 ANNOTATION TYPE PERFORMANCE:")
    for ann_type, stats in ann_performance.iterrows():
        print(f"• {ann_type}: {stats['mean']:.3f} (±{stats['std']:.3f})")

    # Documents ordered from lowest to highest mean F1
    doc_difficulty = (
        df_lenient.groupby('Document')['F1_Score']
        .agg(['mean', 'std'])
        .round(3)
        .sort_values('mean')
    )
    hardest_doc = doc_difficulty.iloc[0]
    easiest_doc = doc_difficulty.iloc[-1]
    print("\n📄 DOCUMENT ANALYSIS:")
    print(f"Most challenging: {hardest_doc.name} (avg F1: {hardest_doc['mean']:.3f})")
    print(f"Easiest: {easiest_doc.name} (avg F1: {easiest_doc['mean']:.3f})")

    print("\n✅ Enhanced single run visualization created successfully!")

✅ Single run visualization created successfully!


In [None]:
def compare_multiple_runs(results_paths, run_labels=None):
    """
    Compare multiple evaluation runs to see evolution/differences with document-level insights.

    Args:
        results_paths: List of paths to llm_evaluation_results.json files
        run_labels: Optional list of labels for each run (defaults to folder names)

    Returns:
        ``(fig, combined_df)`` where ``fig`` is a 4x2 plotly dashboard and
        ``combined_df`` holds the rows of all loaded runs with two extra
        columns: ``Run`` (the label) and ``Run_Order`` (the position of the
        path in ``results_paths``). Returns ``(None, None)`` when no run could
        be loaded.

    Notes:
        Paths that do not exist, or whose file yields no rows, are skipped
        with a warning. ``Run_Order`` keeps the original position, so it can
        have gaps and still indexes ``run_labels`` correctly.
    """

    if run_labels is None:
        run_labels = [Path(path).parent.name for path in results_paths]

    # Load every run that exists and actually contains rows
    all_dfs = []
    for i, (path, label) in enumerate(zip(results_paths, run_labels)):
        if not Path(path).exists():
            print(f"⚠️ Warning: File not found: {path}")
            continue

        df = create_dataframe_from_results(load_evaluation_results(path))
        if df.empty:
            # An empty frame has no columns and would break the filters below
            print(f"⚠️ Warning: No evaluation rows in: {path}")
            continue

        df['Run'] = label
        df['Run_Order'] = i
        all_dfs.append(df)

    if not all_dfs:
        print("❌ No valid result files found!")
        return None, None

    # Combine all dataframes
    combined_df = pd.concat(all_dfs, ignore_index=True)

    fig = make_subplots(
        rows=4, cols=2,
        subplot_titles=(
            'F1-Score Evolution Across Runs (by Model)',
            'Document Performance Evolution',
            'Model Performance Comparison (Latest Run)',
            'Document Difficulty Ranking Changes',
            'Lenient vs Strict Evolution',
            'Precision-Recall Evolution by Run',
            'Annotation Type Performance Trends',
            'Model Consistency Analysis'
        ),
        specs=[[{"type": "scatter"}, {"type": "scatter"}],
               [{"type": "bar"}, {"type": "heatmap"}],
               [{"type": "scatter"}, {"type": "scatter"}],
               [{"type": "scatter"}, {"type": "box"}]],
        vertical_spacing=0.08,
        horizontal_spacing=0.12
    )

    # Fixed colors for known models / annotation types; others fall back to gray
    model_colors = {
        'gemma3:1b': '#E74C3C',      # Red
        'gemma3:4b': '#3498DB',      # Blue
        'gemma3:12b': '#2ECC71',     # Green
        'mistral:latest': '#F39C12'  # Orange
    }

    annotation_colors = {
        'Event': '#9B59B6',       # Purple
        'Event_who': '#E67E22',   # Orange
        'Event_when': '#1ABC9C',  # Teal
        'Event_what': '#E91E63'   # Pink
    }
    fallback_color = '#95A5A6'

    # Lenient rows drive every panel except 3 and 5
    df_lenient = combined_df[combined_df['Evaluation_Mode'] == 'Lenient']
    models = df_lenient['Model'].unique()

    # 1. F1-Score Evolution Across Runs (by Model), std shown as error bars
    for model in models:
        model_data = df_lenient[df_lenient['Model'] == model]
        model_evolution = model_data.groupby('Run_Order')['F1_Score'].agg(['mean', 'std']).reset_index()
        model_evolution['Run_Labels'] = [run_labels[order] for order in model_evolution['Run_Order']]

        fig.add_trace(
            go.Scatter(
                x=model_evolution['Run_Order'],
                y=model_evolution['mean'],
                mode='lines+markers',
                name=f"{model}",
                line=dict(color=model_colors.get(model, fallback_color), width=3),
                marker=dict(size=10),
                hovertemplate=f'<b>{model}</b><br>Run: %{{customdata}}<br>Avg F1: %{{y:.3f}}±%{{error_y.array:.3f}}<extra></extra>',
                customdata=model_evolution['Run_Labels'],
                error_y=dict(
                    type='data',
                    array=model_evolution['std'],
                    visible=True,
                    color=model_colors.get(model, fallback_color),
                    thickness=1.5
                ),
                showlegend=True,
                legendgroup='models'
            ),
            row=1, col=1
        )

    # 2. Document Performance Evolution (first 5 documents only, for clarity)
    documents = df_lenient['Document'].unique()
    for doc in documents[:5]:
        doc_evolution = df_lenient[df_lenient['Document'] == doc].groupby('Run_Order')['F1_Score'].mean().reset_index()
        doc_evolution['Run_Labels'] = [run_labels[order] for order in doc_evolution['Run_Order']]

        fig.add_trace(
            go.Scatter(
                x=doc_evolution['Run_Order'],
                y=doc_evolution['F1_Score'],
                mode='lines+markers',
                name=f"Doc: {doc[:15]}..." if len(doc) > 15 else f"Doc: {doc}",
                line=dict(width=2, dash='dash'),
                marker=dict(size=6),
                hovertemplate=f'<b>{doc}</b><br>Run: %{{customdata}}<br>Avg F1: %{{y:.3f}}<extra></extra>',
                customdata=doc_evolution['Run_Labels'],
                showlegend=True,
                legendgroup='documents',
                opacity=0.7
            ),
            row=1, col=2
        )

    # 3. Model Performance Comparison (Latest Run), lenient and strict side by side
    latest_run_data = combined_df[combined_df['Run_Order'] == combined_df['Run_Order'].max()]
    for mode, bar_name, bar_color in (
        ('Lenient', 'Lenient (Latest)', 'lightblue'),
        ('Strict', 'Strict (Latest)', 'darkblue'),
    ):
        latest_mode = latest_run_data[latest_run_data['Evaluation_Mode'] == mode]
        model_avg = latest_mode.groupby('Model')['F1_Score'].mean()

        fig.add_trace(
            go.Bar(
                x=model_avg.index,
                y=model_avg.values,
                name=bar_name,
                marker_color=bar_color,
                showlegend=True,
                legendgroup='evaluation_latest',
                hovertemplate=f'<b>{mode}</b><br>Model: %{{x}}<br>F1: %{{y:.3f}}<extra></extra>'
            ),
            row=2, col=1
        )

    # 4. Document Difficulty Ranking Changes.
    # One Document x Run_Order table keeps values and labels aligned and
    # leaves NaN where a run has no rows for a document.
    doc_run_f1 = (
        df_lenient.groupby(['Document', 'Run_Order'])['F1_Score']
        .mean()
        .unstack('Run_Order')
    )

    if not doc_run_f1.empty:
        fig.add_trace(
            go.Heatmap(
                z=doc_run_f1.values,
                x=[f"Run {order}" for order in doc_run_f1.columns],
                y=doc_run_f1.index,
                colorscale='RdYlBu_r',
                showscale=True,
                colorbar=dict(title="Avg F1", x=0.48),
                hovertemplate='Run: %{x}<br>Document: %{y}<br>Avg F1: %{z:.3f}<extra></extra>',
                showlegend=False
            ),
            row=2, col=2
        )

    # 5. Lenient vs Strict Evolution, std shown as error bars
    for mode in ['Lenient', 'Strict']:
        mode_evolution = combined_df[combined_df['Evaluation_Mode'] == mode].groupby('Run_Order')['F1_Score'].agg(['mean', 'std']).reset_index()
        mode_evolution['Run_Labels'] = [run_labels[order] for order in mode_evolution['Run_Order']]

        fig.add_trace(
            go.Scatter(
                x=mode_evolution['Run_Order'],
                y=mode_evolution['mean'],
                mode='lines+markers',
                name=f'{mode} Evaluation',
                line=dict(width=4, dash='solid' if mode == 'Lenient' else 'dash'),
                marker=dict(size=12),
                error_y=dict(
                    type='data',
                    array=mode_evolution['std'],
                    visible=True
                ),
                hovertemplate=f'<b>{mode}</b><br>Run: %{{customdata}}<br>Avg F1: %{{y:.3f}}±%{{error_y.array:.3f}}<extra></extra>',
                customdata=mode_evolution['Run_Labels'],
                showlegend=True,
                legendgroup='evaluation_modes'
            ),
            row=3, col=1
        )

    # 6. Precision-Recall Evolution by Run: one marker per model, one trace per run
    for run_order in sorted(df_lenient['Run_Order'].unique()):
        run_data = df_lenient[df_lenient['Run_Order'] == run_order]
        model_pr = run_data.groupby('Model')[['Precision', 'Recall']].mean().reset_index()
        run_label = run_labels[run_order]

        fig.add_trace(
            go.Scatter(
                x=model_pr['Recall'],
                y=model_pr['Precision'],
                mode='markers+text',
                name=f"{run_label}",
                marker=dict(
                    size=12,
                    symbol='circle',
                    opacity=0.7
                ),
                text=model_pr['Model'],
                textposition="top center",
                hovertemplate=f'<b>Run: {run_label}</b><br>Model: %{{text}}<br>Recall: %{{x:.3f}}<br>Precision: %{{y:.3f}}<extra></extra>',
                showlegend=True,
                legendgroup='runs'
            ),
            row=3, col=2
        )

    # 7. Annotation Type Performance Trends, std shown as error bars
    for ann_type in df_lenient['Annotation_Type'].unique():
        ann_evolution = df_lenient[df_lenient['Annotation_Type'] == ann_type].groupby('Run_Order')['F1_Score'].agg(['mean', 'std']).reset_index()
        ann_evolution['Run_Labels'] = [run_labels[order] for order in ann_evolution['Run_Order']]

        fig.add_trace(
            go.Scatter(
                x=ann_evolution['Run_Order'],
                y=ann_evolution['mean'],
                mode='lines+markers',
                name=f"{ann_type}",
                line=dict(color=annotation_colors.get(ann_type, fallback_color), width=3),
                marker=dict(size=8),
                error_y=dict(
                    type='data',
                    array=ann_evolution['std'],
                    visible=True,
                    color=annotation_colors.get(ann_type, fallback_color)
                ),
                hovertemplate=f'<b>{ann_type}</b><br>Run: %{{customdata}}<br>Avg F1: %{{y:.3f}}±%{{error_y.array:.3f}}<extra></extra>',
                customdata=ann_evolution['Run_Labels'],
                showlegend=True,
                legendgroup='annotations'
            ),
            row=4, col=1
        )

    # 8. Model Consistency Analysis: box of every lenient F1 value per model
    for model in models:
        model_all_runs = df_lenient[df_lenient['Model'] == model]['F1_Score']

        fig.add_trace(
            go.Box(
                y=model_all_runs,
                name=f"{model}",
                boxpoints='all',
                jitter=0.3,
                pointpos=-1.8,
                marker_color=model_colors.get(model, fallback_color),
                showlegend=True,
                legendgroup='model_consistency',
                hovertemplate=f'<b>{model}</b><br>F1: %{{y:.3f}}<extra></extra>'
            ),
            row=4, col=2
        )

    # Overall layout: tall figure for 4 rows, legend to the right of the grid
    fig.update_layout(
        height=1600,
        title='Enhanced Multi-Run LLM Evaluation Comparison<br><sub>Document-Level Analysis & Performance Evolution</sub>',
        title_x=0.5,
        showlegend=True,
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02,
            bgcolor="rgba(255,255,255,0.8)",
            bordercolor="rgba(0,0,0,0.2)",
            borderwidth=1,
            font=dict(size=10)
        ),
        font=dict(size=11)
    )

    # (row, col) -> (x-axis title, y-axis title)
    axis_titles = {
        (1, 1): ("Run Number", "Average F1 Score"),
        (1, 2): ("Run Number", "F1 Score"),
        (2, 1): ("Model", "F1 Score"),
        (2, 2): ("Run", "Document"),
        (3, 1): ("Run Number", "Average F1 Score"),
        (3, 2): ("Recall", "Precision"),
        (4, 1): ("Run Number", "F1 Score"),
        (4, 2): ("Model", "F1 Score Distribution"),
    }
    for (row, col), (x_title, y_title) in axis_titles.items():
        fig.update_xaxes(title_text=x_title, row=row, col=col, title_font_size=10)
        fig.update_yaxes(title_text=y_title, row=row, col=col, title_font_size=10)

    # Panels with Run_Order on the x-axis show the run labels as ticks;
    # long labels are tilted so they do not overlap.
    tick_angle = 45 if max(len(label) for label in run_labels) > 10 else 0
    for row, col in [(1, 1), (1, 2), (3, 1), (4, 1)]:
        fig.update_xaxes(
            tickmode='array',
            tickvals=list(range(len(run_labels))),
            ticktext=run_labels,
            row=row, col=col,
            tickangle=tick_angle
        )

    return fig, combined_df

def create_detailed_comparison_report(combined_df, run_labels):
    """Print a multi-run comparison report and return per-document stability.

    Only rows whose ``Evaluation_Mode`` equals ``'Lenient'`` are analysed.
    The report covers the change in mean F1 between the earliest and the
    latest run, the best run for every model, and how stable every document's
    mean F1 is across runs (coefficient of variation, std / mean).

    Args:
        combined_df: DataFrame providing the columns ``Evaluation_Mode``,
            ``Run_Order``, ``Model``, ``Document`` and ``F1_Score``.
        run_labels: Sequence of display labels indexed by ``Run_Order``.
            A run without a matching label is reported as ``unlabeled``.

    Returns:
        DataFrame with the columns ``Document``, ``CV`` and ``Avg_F1``,
        sorted by ascending ``CV``. It is empty when ``combined_df`` has no
        lenient rows. ``CV`` is NaN for documents seen in a single run only,
        because the sample standard deviation is undefined there.
    """
    print("\n📊 DETAILED MULTI-RUN COMPARISON REPORT")
    print("=" * 60)

    df_lenient = combined_df[combined_df['Evaluation_Mode'] == 'Lenient']
    if df_lenient.empty:
        # Nothing to aggregate; sorting an empty frame by 'CV' would raise.
        print("\n⚠️ No lenient evaluation results available - nothing to compare.")
        return pd.DataFrame(columns=['Document', 'CV', 'Avg_F1'])

    # Overall improvement analysis. The earliest available run is the
    # baseline, so a missing run 0 does not turn the whole report into NaN.
    run_order = df_lenient['Run_Order']
    first_run = df_lenient.loc[run_order == run_order.min(), 'F1_Score'].mean()
    last_run = df_lenient.loc[run_order == run_order.max(), 'F1_Score'].mean()

    print("\n🚀 OVERALL IMPROVEMENT:")
    print(f"First run average F1: {first_run:.3f}")
    print(f"Last run average F1: {last_run:.3f}")
    if pd.notna(first_run) and pd.notna(last_run) and first_run != 0:
        improvement = ((last_run - first_run) / first_run) * 100
        print(f"Improvement: {improvement:+.1f}%")
    else:
        # A relative change against a zero/undefined baseline is meaningless.
        print("Improvement: n/a (first run average F1 is zero or undefined)")

    # Best performing run per model
    print("\n🏆 BEST PERFORMING RUNS BY MODEL:")
    for model in df_lenient['Model'].unique():
        model_means = (
            df_lenient[df_lenient['Model'] == model]
            .groupby('Run_Order')['F1_Score']
            .mean()
            .dropna()
        )
        if model_means.empty:
            print(f"• {model}: no F1 data available")
            continue
        best_run = model_means.idxmax()
        best_score = model_means.max()
        try:
            best_label = run_labels[int(best_run)]
        except (IndexError, TypeError, ValueError):
            # Fewer labels than runs (or a non-integer run id) must not abort
            # the report.
            best_label = "unlabeled"
        print(f"• {model}: Run {best_run} ({best_label}) - F1: {best_score:.3f}")

    # Document performance stability
    print("\n📄 DOCUMENT PERFORMANCE STABILITY:")
    doc_stability = []
    for doc in df_lenient['Document'].unique():
        doc_scores = (
            df_lenient[df_lenient['Document'] == doc]
            .groupby('Run_Order')['F1_Score']
            .mean()
        )
        avg_f1 = doc_scores.mean()
        cv = doc_scores.std() / avg_f1 if avg_f1 > 0 else 0
        doc_stability.append({'Document': doc, 'CV': cv, 'Avg_F1': avg_f1})

    doc_stability_df = pd.DataFrame(
        doc_stability, columns=['Document', 'CV', 'Avg_F1']
    ).sort_values('CV')

    # Rank only documents with a defined CV; NaN rows sort last and would
    # otherwise be announced as the "most variable" document.
    ranked = doc_stability_df.dropna(subset=['CV'])
    if ranked.empty:
        print("Not enough runs per document to measure stability.")
    else:
        most_stable = ranked.iloc[0]
        most_variable = ranked.iloc[-1]
        print(f"Most stable: {str(most_stable['Document'])[:30]}... (CV: {most_stable['CV']:.3f})")
        print(f"Most variable: {str(most_variable['Document'])[:30]}... (CV: {most_variable['CV']:.3f})")

    return doc_stability_df

def create_summary_table(combined_df):
    """Aggregate F1, precision and recall per run, model and evaluation mode.

    Returns a flat DataFrame with the grouping columns ``Run``, ``Model`` and
    ``Evaluation_Mode`` followed by ``F1_Mean``, ``F1_Std``,
    ``Precision_Mean`` and ``Recall_Mean``, all rounded to three decimals.
    """
    grouped = combined_df.groupby(['Run', 'Model', 'Evaluation_Mode'])
    # Named aggregation yields the final flat column names directly.
    stats = grouped.agg(
        F1_Mean=('F1_Score', 'mean'),
        F1_Std=('F1_Score', 'std'),
        Precision_Mean=('Precision', 'mean'),
        Recall_Mean=('Recall', 'mean'),
    )
    return stats.round(3).reset_index()

# Example driver: compare several evaluation runs, or explain how to set one up
results_paths = [
    "output/pipeline_results_20250804_170535/llm_evaluation_results.json",
    # Append further result files to enable the comparison, e.g.:
    # "output/pipeline_results_20250805_120000/llm_evaluation_results.json",
    # "output/pipeline_results_20250806_150000/llm_evaluation_results.json"
]

run_labels = [
    "Baseline Run",
    # Append the matching labels, e.g.:
    # "Improved Prompts",
    # "Fine-tuned Models"
]

if len(results_paths) <= 1:
    # A single run cannot be compared: list what is on disk and show a sample setup.
    print("📝 For multi-run comparison, add more result file paths to 'results_paths' list above.")
    print("\n🔍 Available pipeline results:")
    output_dir = Path("output")
    if not output_dir.exists():
        print("  ❌ No output directory found.")
    else:
        available_results = []
        for folder in sorted(output_dir.glob("pipeline_results_*")):
            results_file = folder / "llm_evaluation_results.json"
            if not results_file.exists():
                continue
            available_results.append(str(results_file))
            print(f"  ✓ {results_file}")

        if len(available_results) > 1:
            print("\n💡 Copy these paths to compare multiple runs:")
            for path in available_results:
                print(f'    "{path}",')

    print("\n🎯 Example configuration for multiple runs:")
    # Built line by line so the trailing space in the sample stays explicit.
    print("\n".join([
        'results_paths = [',
        '    "output/pipeline_results_20250804_170535/llm_evaluation_results.json",',
        '    "output/pipeline_results_20250805_120000/llm_evaluation_results.json",',
        '    "output/pipeline_results_20250806_150000/llm_evaluation_results.json"',
        ']',
        '',
        'run_labels = [',
        '    "Baseline",',
        '    "Improved Prompts", ',
        '    "Fine-tuned Models"',
        ']',
    ]))
else:
    print("🔄 Analyzing multiple evaluation runs...")
    fig_comparison, df_comparison = compare_multiple_runs(results_paths, run_labels)

    if fig_comparison is None:
        print("❌ Failed to create comparison - check file paths.")
    else:
        fig_comparison.show()

        # Aggregated statistics per run / model / evaluation mode
        summary_table = create_summary_table(df_comparison)
        print("\n📊 STATISTICAL SUMMARY TABLE:")
        print(summary_table.to_string(index=False))

        # Textual report: improvement, best runs, document stability
        doc_stability = create_detailed_comparison_report(df_comparison, run_labels)

        print("\n✅ Enhanced multi-run comparison visualization created successfully!")