In [None]:
import json
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
from pathlib import Path
import ipywidgets as widgets
from IPython.display import display, clear_output

In [None]:
# Load and process evaluation data from multiple pipeline results

import os
import json
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

# Find all pipeline_results folders
results_folder = "output"
if os.path.isdir(results_folder):
    # Sort once so the printed list and the selector options share one stable order.
    pipeline_folders = sorted(
        f for f in os.listdir(results_folder)
        if f.startswith("pipeline_results_") and os.path.isdir(os.path.join(results_folder, f))
    )
else:
    # os.listdir() would raise FileNotFoundError here; report the problem and
    # continue with an empty selector so the rest of the cell still runs.
    print(f"Warning: results folder '{results_folder}' does not exist")
    pipeline_folders = []

# Filter folders that have evaluation files
valid_folders = []
for folder in pipeline_folders:
    evaluation_file = os.path.join(results_folder, folder, "llm_evaluation_results.json")
    if os.path.isfile(evaluation_file):
        valid_folders.append(folder)
    else:
        print(f"Warning: No evaluation file found in {folder}")

print(f"Found {len(valid_folders)} pipeline_results folders:")
for folder in valid_folders:
    print(f"  - {folder}")

# Create multi-select widget for runs (label and value are both the folder name)
run_selector = widgets.SelectMultiple(
    options=[(folder, folder) for folder in valid_folders],
    description='Select runs:',
    disabled=False,
    layout=widgets.Layout(width='50%', height='150px')
)

# Auto-select first 3 runs for testing
# if len(valid_folders) >= 3:
#     run_selector.value = tuple(sorted(valid_folders)[:3])
#     print(f"\nAuto-selected first 3 runs: {list(run_selector.value)}")
# else:
#     run_selector.value = tuple(sorted(valid_folders))
#     print(f"\nAuto-selected all available runs: {list(run_selector.value)}")

# Load selected data
def load_selected_runs(selected_runs=None, base_folder=None):
    """Load the evaluation JSON of every selected run.

    On a non-empty selection the module-level ``evaluation_data`` is replaced
    by the freshly loaded mapping; on an empty selection it is left untouched.

    Args:
        selected_runs: Sequence of run folder names to load. When omitted, the
            current value of the ``run_selector`` widget is used, so existing
            zero-argument calls keep working. Passing it explicitly lets the
            load be reproduced from code without touching the widget.
        base_folder: Directory that contains the run folders. When omitted,
            the module-level ``results_folder`` is used.

    Returns:
        dict: ``{run_name: parsed_json}`` for each run that loaded
        successfully; an empty dict when nothing is selected. Runs whose file
        cannot be read or parsed are reported and skipped.
    """
    global evaluation_data

    if selected_runs is None:
        selected_runs = run_selector.value
    if base_folder is None:
        base_folder = results_folder
    selected_runs = list(selected_runs)

    if not selected_runs:
        print("❌ No runs selected!")
        # Always hand back a dict so callers never have to special-case None.
        return {}

    print(f"Loading {len(selected_runs)} selected runs...")

    loaded = {}
    for run_name in selected_runs:
        evaluation_file = os.path.join(base_folder, run_name, "llm_evaluation_results.json")

        try:
            # JSON is UTF-8 by specification; do not depend on the platform default encoding.
            with open(evaluation_file, 'r', encoding='utf-8') as f:
                loaded[run_name] = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file.
            # ValueError: json.JSONDecodeError and UnicodeDecodeError.
            print(f"❌ Error loading {run_name}: {e}")
        else:
            print(f"✅ Loaded {run_name}")

    evaluation_data = loaded

    print(f"\n✅ Successfully loaded {len(evaluation_data)} runs")
    return evaluation_data

# Attempt an initial load using whatever the selector currently holds
evaluation_data = load_selected_runs()

display(run_selector)

# Output pane plus a button so the data can be (re)loaded after changing the selection
output_area = widgets.Output()
load_button = widgets.Button(
    icon='check',
    button_style='success',
    disabled=False,
    description='Load Selected Data'
)


def on_button_click(b):
    """Button callback: clear the output pane, then reload the selected runs inside it."""
    with output_area:
        clear_output()
        load_selected_runs()


load_button.on_click(on_button_click)

display(load_button)
display(output_area)

In [None]:
# Annotation types reported individually and summed into the 'Overall' rows.
_DEFAULT_ANNOTATION_TYPES = ('Event', 'Event_who', 'Event_when', 'Event_what')
# Evaluation modes, in the order their records are emitted.
_EVALUATION_MODES = ('lenient', 'strict')
# Raw counts that are accumulated to compute the 'Overall' metrics.
_COUNT_KEYS = ('true_positives', 'false_positives', 'false_negatives')
# Every metric copied from a mode entry into a record.
_METRIC_KEYS = ('precision', 'recall', 'f1_score') + _COUNT_KEYS
# Fixed column order; also guarantees the columns exist when there are no records.
_RECORD_COLUMNS = [
    'run', 'model', 'document', 'annotation_type', 'evaluation_mode',
    'precision', 'recall', 'f1_score',
    'true_positives', 'false_positives', 'false_negatives',
    'gold_count', 'predicted_count',
]


def _extract_mode_metrics(ann_data, mode):
    """Return the metrics stored under ``ann_data[mode]``, with 0 for anything missing."""
    mode_metrics = ann_data.get(mode)
    if not isinstance(mode_metrics, dict):
        mode_metrics = {}
    return {key: mode_metrics.get(key, 0) for key in _METRIC_KEYS}


def _overall_scores(tp, fp, fn):
    """Compute (precision, recall, f1) from summed counts; a zero denominator yields 0."""
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1


def process_evaluation_data(evaluation_data, annotation_types=_DEFAULT_ANNOTATION_TYPES):
    """Flatten nested evaluation results into one DataFrame row per metric set.

    The input is walked as ``{run: {document: {model: {annotation_type:
    {'lenient'/'strict': metrics, 'gold_count': ..., 'predicted_count': ...}}}}}``.
    Entries that are not dicts at any of those levels are skipped.

    For every (run, document, model) that has at least one of the requested
    annotation types, the function emits:
      * one 'lenient' and one 'strict' row per annotation type present, and
      * one 'lenient' and one 'strict' row with ``annotation_type='Overall'``,
        whose precision/recall/F1 are recomputed from the summed
        true-positive / false-positive / false-negative counts.

    Args:
        evaluation_data: Mapping of run name to that run's parsed results.
            ``None`` or an empty mapping yields an empty frame.
        annotation_types: Annotation type keys to extract, in output order.

    Returns:
        pandas.DataFrame with the columns run, model, document,
        annotation_type, evaluation_mode, precision, recall, f1_score,
        true_positives, false_positives, false_negatives, gold_count and
        predicted_count. The columns are present even when there are no rows,
        so downstream column filters do not raise KeyError.
    """
    records = []

    for run_name, run_data in (evaluation_data or {}).items():
        if not isinstance(run_data, dict):
            continue

        for doc_name, doc_data in run_data.items():
            if not isinstance(doc_data, dict):
                continue

            for model_name, model_data in doc_data.items():
                if not isinstance(model_data, dict):
                    continue

                # Running count totals per mode, used for the 'Overall' rows.
                totals = {mode: dict.fromkeys(_COUNT_KEYS, 0) for mode in _EVALUATION_MODES}
                found_annotation_type = False

                for ann_type in annotation_types:
                    ann_data = model_data.get(ann_type)
                    if not isinstance(ann_data, dict):
                        continue
                    found_annotation_type = True

                    for mode in _EVALUATION_MODES:
                        metrics = _extract_mode_metrics(ann_data, mode)
                        for key in _COUNT_KEYS:
                            totals[mode][key] += metrics[key]

                        record = {
                            'run': run_name,
                            'model': model_name,
                            'document': doc_name,
                            'annotation_type': ann_type,
                            'evaluation_mode': mode,
                        }
                        record.update(metrics)
                        record['gold_count'] = ann_data.get('gold_count', 0)
                        record['predicted_count'] = ann_data.get('predicted_count', 0)
                        records.append(record)

                if not found_annotation_type:
                    continue

                for mode in _EVALUATION_MODES:
                    counts = totals[mode]
                    precision, recall, f1 = _overall_scores(
                        counts['true_positives'],
                        counts['false_positives'],
                        counts['false_negatives'],
                    )
                    records.append({
                        'run': run_name,
                        'model': model_name,
                        'document': doc_name,
                        'annotation_type': 'Overall',
                        'evaluation_mode': mode,
                        'precision': precision,
                        'recall': recall,
                        'f1_score': f1,
                        'true_positives': counts['true_positives'],
                        'false_positives': counts['false_positives'],
                        'false_negatives': counts['false_negatives'],
                        'gold_count': 0,  # Not meaningful for overall
                        'predicted_count': 0,  # Not meaningful for overall
                    })

    return pd.DataFrame(records, columns=_RECORD_COLUMNS)

def create_annotation_type_heatmap(df, evaluation_mode='lenient', metric='f1_score'):
    """Build a heatmap of the mean `metric` per (annotation type, model) pair.

    Only rows whose 'evaluation_mode' equals `evaluation_mode` are used and the
    'Overall' annotation type is left out. Values are averaged over all
    remaining rows of each pair; pairs without data are shown as 0.

    Returns:
        plotly.graph_objects.Figure with annotation types on the y-axis and
        models on the x-axis.
    """
    metric_label = metric.replace('_', ' ').title()

    # Rows for the requested mode, without the aggregated 'Overall' entries
    wanted_rows = (df['evaluation_mode'] == evaluation_mode) & (df['annotation_type'] != 'Overall')
    per_type_rows = df[wanted_rows].copy()

    # Annotation types as rows, models as columns, mean of the metric in each cell
    grid = per_type_rows.pivot_table(
        index='annotation_type',
        columns='model',
        values=metric,
        aggfunc='mean'
    ).fillna(0)

    # Show the annotation types in a fixed, readable order
    row_order = [
        type_name
        for type_name in ('Event', 'Event_who', 'Event_when', 'Event_what')
        if type_name in grid.index
    ]
    grid = grid.reindex(row_order)

    title_text = f"Annotation Type Performance - {metric_label} ({evaluation_mode.title()})"
    run_total = len(df['run'].unique())
    if run_total > 1:
        title_text += f" (Average across {run_total} runs)"

    heatmap = go.Heatmap(
        z=grid.values,
        x=grid.columns,
        y=grid.index,
        colorscale='RdYlGn',
        zmin=0,
        zmax=1,
        text=grid.values.round(3),
        texttemplate="%{text}",
        textfont={"size": 12},
        colorbar=dict(title=metric_label)
    )
    fig = go.Figure(data=heatmap)

    fig.update_layout(
        title=dict(text=title_text, font=dict(size=18, color='#2E2E2E'), x=0.5),
        xaxis=dict(title="Models", tickfont=dict(size=12)),
        yaxis=dict(title="Annotation Types", tickfont=dict(size=12)),
        height=400,
        width=max(800, len(grid.columns) * 120),
        template='plotly_white'
    )

    return fig

def create_annotation_type_difficulty_chart(df, evaluation_mode='lenient'):
    """Rank annotation types by mean F1-score as a horizontal bar chart.

    Uses the rows of `df` for `evaluation_mode`, excluding the 'Overall'
    annotation type. Bars are sorted by ascending mean F1, carry the F1
    standard deviation as error bars, and expose mean precision, mean recall
    and the number of data points on hover.

    Returns:
        plotly.graph_objects.Figure
    """
    wanted_rows = (df['evaluation_mode'] == evaluation_mode) & (df['annotation_type'] != 'Overall')
    scored_rows = df[wanted_rows].copy()

    # Per-annotation-type summary statistics
    aggregations = {
        'f1_score': ['mean', 'std', 'min', 'max', 'count'],
        'precision': 'mean',
        'recall': 'mean'
    }
    summary = scored_rows.groupby('annotation_type').agg(aggregations).round(4)

    # Flatten the two-level column index into names such as 'f1_score_mean'
    summary.columns = ['_'.join(level_pair).strip() for level_pair in summary.columns]
    summary = summary.reset_index().sort_values('f1_score_mean', ascending=True)

    mean_f1 = summary['f1_score_mean']
    hover_values = np.column_stack(
        (summary['precision_mean'], summary['recall_mean'], summary['f1_score_count'])
    )
    hover_template = (
        '<b>%{y}</b><br>'
        'F1-Score: %{x:.3f}<br>'
        'Precision: %{customdata[0]:.3f}<br>'
        'Recall: %{customdata[1]:.3f}<br>'
        'Data Points: %{customdata[2]}<extra></extra>'
    )

    bars = go.Bar(
        x=mean_f1,
        y=summary['annotation_type'],
        orientation='h',
        error_x=dict(type='data', array=summary['f1_score_std'], visible=True),
        marker=dict(
            color=mean_f1,
            colorscale='RdYlGn',
            cmin=0,
            cmax=1,
            colorbar=dict(title="Average F1-Score")
        ),
        text=mean_f1.round(3),
        textposition='auto',
        width=0.6,  # Make bars thinner
        customdata=hover_values,
        hovertemplate=hover_template
    )

    fig = go.Figure()
    fig.add_trace(bars)

    fig.update_layout(
        title=dict(
            text=f"Annotation Type Difficulty Ranking ({evaluation_mode.title()} Evaluation)",
            font=dict(size=16, color='#2E2E2E'),
            x=0.5
        ),
        xaxis=dict(title="Average F1-Score", range=[0, 1]),
        yaxis=dict(title="Annotation Types"),
        height=300,
        template='plotly_white'
    )

    return fig

def create_model_comparison_by_annotation_type(df, evaluation_mode='lenient'):
    """Compare models on precision, recall and F1, one subplot per annotation type.

    Uses the rows of `df` for `evaluation_mode`, excluding the 'Overall'
    annotation type. Each of the (up to four) known annotation types present
    in the data gets one cell of a 2x2 grid with grouped bars per model showing
    the mean of each metric.

    Args:
        df: DataFrame with at least the columns 'evaluation_mode',
            'annotation_type', 'model', 'precision', 'recall' and 'f1_score'.
        evaluation_mode: Value of 'evaluation_mode' to plot.

    Returns:
        plotly.graph_objects.Figure
    """
    # Filter data for the specified evaluation mode and exclude 'Overall'
    df_filtered = df[(df['evaluation_mode'] == evaluation_mode) & (df['annotation_type'] != 'Overall')]

    # Keep the canonical display order, restricted to types that actually occur
    annotation_types = ['Event', 'Event_who', 'Event_when', 'Event_what']
    present_types = set(df_filtered['annotation_type'].unique())
    available_types = [t for t in annotation_types if t in present_types]

    fig = make_subplots(rows=2, cols=2, subplot_titles=available_types)

    # (column, legend label, bar colour) for each plotted metric
    metric_styles = [
        ('precision', 'Precision', '#FF6B6B'),
        ('recall', 'Recall', '#4ECDC4'),
        ('f1_score', 'F1-Score', '#45B7D1'),
    ]
    metric_columns = [column for column, _, _ in metric_styles]

    for i, ann_type in enumerate(available_types):
        row = (i // 2) + 1
        col = (i % 2) + 1

        # Mean of each metric per model for this annotation type
        type_data = df_filtered[df_filtered['annotation_type'] == ann_type]
        model_means = type_data.groupby('model')[metric_columns].mean().round(4).reset_index()

        for column, name, color in metric_styles:
            # No explicit bar `width` here: in 'group' mode a fixed width wider
            # than one group slot makes the three metric bars overlap. Bar
            # thickness is controlled through `bargap` in the layout instead.
            fig.add_trace(go.Bar(
                name=name,
                x=model_means['model'],
                y=model_means[column],
                marker_color=color,
                text=model_means[column].round(3),
                textposition='auto',
                showlegend=(i == 0),  # Only show legend for first subplot
                legendgroup=name
            ), row=row, col=col)

    title_text = f"Model Performance by Annotation Type ({evaluation_mode.title()} Evaluation)"

    fig.update_layout(
        title=dict(
            text=title_text,
            font=dict(size=18, color='#2E2E2E'),
            x=0.5
        ),
        height=600,
        template='plotly_white',
        barmode='group',
        bargap=0.4  # each model's bar group uses 60% of its slot
    )

    # Same score range and tick angle on every subplot
    fig.update_yaxes(range=[0, 1])
    fig.update_xaxes(tickangle=45)

    return fig

In [None]:
# Summarise which runs are loaded and how their results will be presented
if 'evaluation_data' not in globals() or not evaluation_data:
    print("❌ No evaluation data loaded yet.")
else:
    print("📋 LOADED RUNS INFORMATION")
    print("="*50)
    print(f"Number of runs loaded: {len(evaluation_data)}")
    print("Runs included in analysis:")
    for i, run_name in enumerate(evaluation_data, start=1):
        print(f"  {i}. {run_name}")

    print("\n🔄 DATA PRESENTATION METHOD:")
    if len(evaluation_data) != 1:
        print("- Multiple runs: Results shown SEPARATELY for each run")
        print("- Each run is displayed in its own section/subplot")
        print("- NO AVERAGING - you can see the evolution across runs!")
    else:
        print("- Single run: Results shown directly without aggregation")
    print("- This allows you to compare how performance changes with different settings")
    print("="*50)

In [None]:
# Turn the loaded evaluation data into DataFrames used by the charts below
if 'evaluation_data' not in globals() or not evaluation_data:
    print("❌ No evaluation data to process. Please load data first.")
else:
    print("🔄 Processing evaluation data...")
    df = process_evaluation_data(evaluation_data)

    # The main visualizations use only the aggregated 'Overall' rows in 'lenient' mode
    is_overall = df['annotation_type'] == 'Overall'
    is_lenient = df['evaluation_mode'] == 'lenient'
    df_main = df[is_overall & is_lenient].copy()

    print(f"✅ Processed data into DataFrame:")
    print(f"   - Total records: {len(df)}")
    print(f"   - Main visualization records: {len(df_main)}")
    print(f"   - Runs: {df['run'].nunique()}")
    print(f"   - Models: {df['model'].nunique()}")
    print(f"   - Documents: {df['document'].nunique()}")
    print(f"   - Annotation types: {sorted(df['annotation_type'].unique())}")

In [None]:
def create_model_comparison_chart(df):
    """Create a grouped bar chart of mean precision, recall and F1 per model.

    Every row of `df` for a model contributes to that model's mean; the
    standard deviation over the same rows is drawn as an error bar. Models
    are ordered by descending mean F1-score.

    Args:
        df: DataFrame with at least the columns 'model', 'precision',
            'recall', 'f1_score', 'document' and 'run'.

    Returns:
        plotly.graph_objects.Figure
    """
    # Calculate statistics for each model
    model_stats = df.groupby('model').agg({
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1_score': ['mean', 'std'],
        'document': 'count'
    }).round(4)

    # Flatten the two-level column index into names such as 'f1_score_mean'
    model_stats.columns = ['_'.join(col).strip() for col in model_stats.columns]
    model_stats = model_stats.reset_index()
    model_stats = model_stats.sort_values('f1_score_mean', ascending=False)

    fig = go.Figure()

    # (mean column, std column, legend label, bar colour) for each metric
    metric_styles = [
        ('precision_mean', 'precision_std', 'Precision', '#FF6B6B'),
        ('recall_mean', 'recall_std', 'Recall', '#4ECDC4'),
        ('f1_score_mean', 'f1_score_std', 'F1-Score', '#45B7D1'),
    ]

    for metric, error_col, name, color in metric_styles:
        # No explicit bar `width` here: in 'group' mode a fixed width wider
        # than one group slot makes the three metric bars overlap. Bar
        # thickness is controlled through `bargap` in the layout instead.
        fig.add_trace(go.Bar(
            name=name,
            x=model_stats['model'],
            y=model_stats[metric],
            error_y=dict(
                type='data',
                array=model_stats[error_col],
                visible=True
            ),
            marker_color=color,
            text=model_stats[metric].round(3),
            textposition='auto'
        ))

    num_runs = len(df['run'].unique())
    title_text = "Model Performance Comparison"
    if num_runs > 1:
        title_text += f" (Average across {num_runs} runs)"

    fig.update_layout(
        title=dict(
            text=title_text,
            font=dict(size=18, color='#2E2E2E'),
            x=0.5
        ),
        xaxis=dict(
            title="Models",
            tickangle=45
        ),
        yaxis=dict(
            title="Score",
            range=[0, 1]
        ),
        height=500,
        template='plotly_white',
        barmode='group',
        bargap=0.4  # each model's bar group uses 60% of its slot
    )

    return fig

def create_document_difficulty_chart(df):
    """Rank documents by mean F1-score (lowest first) as a horizontal bar chart.

    Bars carry the F1 standard deviation as error bars and expose mean
    precision, mean recall and the number of data points on hover. The figure
    height grows with the number of documents.

    Returns:
        plotly.graph_objects.Figure
    """
    # Per-document summary statistics
    aggregations = {
        'f1_score': ['mean', 'std', 'min', 'max', 'count'],
        'precision': 'mean',
        'recall': 'mean'
    }
    summary = df.groupby('document').agg(aggregations).round(4)

    # Flatten the two-level column index into names such as 'f1_score_mean'
    summary.columns = ['_'.join(level_pair).strip() for level_pair in summary.columns]
    summary = summary.reset_index().sort_values('f1_score_mean', ascending=True)  # Hardest first

    mean_f1 = summary['f1_score_mean']
    hover_values = np.column_stack(
        (summary['precision_mean'], summary['recall_mean'], summary['f1_score_count'])
    )
    hover_template = (
        '<b>%{y}</b><br>'
        'F1-Score: %{x:.3f}<br>'
        'Precision: %{customdata[0]:.3f}<br>'
        'Recall: %{customdata[1]:.3f}<br>'
        'Data Points: %{customdata[2]}<extra></extra>'
    )

    bars = go.Bar(
        x=mean_f1,
        y=summary['document'],
        orientation='h',
        error_x=dict(type='data', array=summary['f1_score_std'], visible=True),
        marker=dict(
            color=mean_f1,
            colorscale='RdYlGn',
            cmin=0,
            cmax=1,
            colorbar=dict(title="Average F1-Score")
        ),
        text=mean_f1.round(3),
        textposition='auto',
        width=0.6,  # Make bars thinner
        customdata=hover_values,
        hovertemplate=hover_template
    )

    fig = go.Figure()
    fig.add_trace(bars)

    title_text = "Document Difficulty Ranking (Hardest First)"
    run_total = len(df['run'].unique())
    if run_total > 1:
        title_text += f" (Average across {run_total} runs)"

    fig.update_layout(
        title=dict(text=title_text, font=dict(size=16, color='#2E2E2E'), x=0.5),
        xaxis=dict(title="Average F1-Score", range=[0, 1]),
        yaxis=dict(title="Documents"),
        height=max(400, len(summary) * 30),
        template='plotly_white'
    )

    return fig

In [None]:
def _model_document_heatmap(run_df, metric, title_suffix, font_size, showscale=True):
    """Build the model x document heatmap trace for one run; returns (trace, pivot table)."""
    pivot_data = run_df.pivot_table(
        index='model',
        columns='document',
        values=metric,
        aggfunc='first'  # first value found for each (model, document) pair
    ).fillna(0)

    heatmap_kwargs = dict(
        z=pivot_data.values,
        x=pivot_data.columns,
        y=pivot_data.index,
        colorscale='RdYlGn',
        zmin=0,
        zmax=1,
        text=pivot_data.values.round(3),
        texttemplate="%{text}",
        textfont={"size": font_size},
        showscale=showscale,
    )
    if showscale:
        heatmap_kwargs['colorbar'] = dict(title=title_suffix)

    return go.Heatmap(**heatmap_kwargs), pivot_data


def create_performance_heatmap(df, metric, title_suffix):
    """Create a heatmap of model performance across documents, one panel per run.

    With a single run a plain heatmap is returned. With several runs each run
    gets its own subplot (at most three per row) with shared axes and a single
    colour bar. (model, document) pairs without data are shown as 0.

    Args:
        df: DataFrame with at least the columns 'run', 'model', 'document'
            and the column named by `metric`.
        metric: Name of the column whose values fill the heatmap cells.
        title_suffix: Human-readable metric name used in the title and the
            colour bar.

    Returns:
        plotly.graph_objects.Figure

    Raises:
        ValueError: If `df` contains no runs.
    """
    runs = sorted(df['run'].unique())
    if not runs:
        # Without this guard the subplot grid computation divides by zero.
        raise ValueError("create_performance_heatmap requires data for at least one run")

    if len(runs) == 1:
        # Single run - create simple heatmap
        trace, pivot_data = _model_document_heatmap(df, metric, title_suffix, font_size=10)
        fig = go.Figure(data=trace)

        title_text = f"Model Performance Heatmap - {title_suffix} ({runs[0]})"
        fig.update_layout(
            title=dict(text=title_text, font=dict(size=20, color='#2E2E2E'), x=0.5),
            xaxis=dict(title="Documents", tickangle=45, tickfont=dict(size=10)),
            yaxis=dict(title="Models", tickfont=dict(size=12)),
            height=max(400, len(pivot_data.index) * 50),
            width=max(800, len(pivot_data.columns) * 80),
            template='plotly_white'
        )
        return fig

    # Multiple runs - create subplots for each run
    cols = min(3, len(runs))  # Max 3 columns
    rows = (len(runs) + cols - 1) // cols  # Calculate rows needed

    # make_subplots rejects vertical_spacing above 1 / (rows - 1); shrink the
    # gap for tall grids so that at least half the height goes to the plots.
    vertical_spacing = 0.15 if rows == 1 else min(0.15, 0.5 / (rows - 1))

    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=runs,
        shared_xaxes=True,
        shared_yaxes=True,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=0.1
    )

    for i, run in enumerate(runs):
        trace, _ = _model_document_heatmap(
            df[df['run'] == run],
            metric,
            title_suffix,
            font_size=8,
            showscale=(i == 0),  # Only show colorbar for first subplot
        )
        fig.add_trace(trace, row=i // cols + 1, col=i % cols + 1)

    title_text = f"Model Performance Comparison Across Runs - {title_suffix}"

    fig.update_layout(
        title=dict(
            text=title_text,
            font=dict(size=18, color='#2E2E2E'),
            x=0.5
        ),
        height=max(400, rows * 300),
        width=max(1200, cols * 400),
        template='plotly_white'
    )

    return fig

In [None]:
# Render the three main charts from the 'Overall' / lenient subset
if 'df_main' not in globals() or df_main.empty:
    print("❌ Please load and process evaluation data first.")
else:
    print("🎯 Creating Performance Visualizations...")

    # 1. Heatmap of F1-score for every model/document pair
    print("📊 1. Model Performance Heatmap (F1-Score)")
    f1_heatmap = create_performance_heatmap(df_main, 'f1_score', 'F1-Score')
    f1_heatmap.show()

    # 2. Mean precision/recall/F1 per model, with error bars
    print("📈 2. Model Performance Comparison")
    model_comparison = create_model_comparison_chart(df_main)
    model_comparison.show()

    # 3. Documents ordered from hardest to easiest
    print("📋 3. Document Difficulty Ranking")
    doc_difficulty = create_document_difficulty_chart(df_main)
    doc_difficulty.show()

In [None]:
def create_run_comparison_dashboard(df):
    """Build a 2x2 Plotly dashboard comparing F1 performance across runs.

    Only rows with ``annotation_type == 'Overall'`` and
    ``evaluation_mode == 'lenient'`` are used. The four panels are:

    1. (top left) mean F1 per run as bars, with the per-run std as error bars;
    2. (top right) one line per model showing its mean F1 in each run;
    3. (bottom left) change in mean F1 between consecutive runs, with runs
       taken in sorted order;
    4. (bottom right) up to 10 documents whose per-run mean F1 varies the
       most (highest std) across runs.

    Args:
        df: DataFrame with at least the columns ``run``, ``annotation_type``,
            ``evaluation_mode``, ``model``, ``document``, ``f1_score``,
            ``precision`` and ``recall``.

    Returns:
        The assembled figure, or ``None`` (after printing a message) when
        ``df`` contains at most one run or has no Overall/lenient rows.
    """

    # A comparison needs at least two distinct runs in the input.
    runs = df['run'].unique()
    if len(runs) <= 1:
        print("Only one run available - no comparison needed.")
        return None

    # Filter for overall metrics and lenient evaluation
    df_overall = df[(df['annotation_type'] == 'Overall') & (df['evaluation_mode'] == 'lenient')].copy()

    if df_overall.empty:
        print("No overall metrics found for comparison.")
        return None

    # Create subplots: 2 rows, 2 columns
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            "Average F1-Score by Run",
            "Model Performance Across Runs",
            "Run-to-Run Performance Change",
            "Document Difficulty Across Runs"
        ],
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]],
        vertical_spacing=0.12,
        horizontal_spacing=0.1
    )

    # 1. Average F1-Score by Run (Top Left)
    run_stats = df_overall.groupby('run').agg({
        'f1_score': ['mean', 'std'],
        'precision': 'mean',
        'recall': 'mean'
    }).round(4)
    run_stats.columns = ['f1_mean', 'f1_std', 'precision_mean', 'recall_mean']
    run_stats = run_stats.reset_index()

    fig.add_trace(go.Bar(
        x=run_stats['run'],
        y=run_stats['f1_mean'],
        error_y=dict(type='data', array=run_stats['f1_std']),
        name='F1-Score',
        marker_color='#45B7D1',
        text=run_stats['f1_mean'].round(3),
        textposition='auto',
        showlegend=False
    ), row=1, col=1)

    # 2. Model Performance Across Runs (Top Right)
    models = sorted(df_overall['model'].unique())
    # The palette is finite; cycle through it so that having more models than
    # palette entries reuses colors instead of raising IndexError.
    palette = px.colors.qualitative.Set1

    for i, model in enumerate(models):
        model_data = df_overall[df_overall['model'] == model]
        model_run_stats = model_data.groupby('run')['f1_score'].mean().reset_index()

        fig.add_trace(go.Scatter(
            x=model_run_stats['run'],
            y=model_run_stats['f1_score'],
            mode='lines+markers',
            name=model,
            line=dict(color=palette[i % len(palette)], width=3),
            marker=dict(size=8),
            showlegend=(i < 5)  # Limit legend entries
        ), row=1, col=2)

    # 3. Run-to-Run Performance Change (Bottom Left)
    # One groupby yields the (unrounded) mean F1 per run, ordered by run key;
    # diff() then gives each run's change relative to the previous one.
    # Only runs that actually have Overall/lenient rows take part, so no
    # undefined (NaN) transitions are plotted.
    run_means = df_overall.groupby('run')['f1_score'].mean()
    run_labels = list(run_means.index)
    changes = run_means.diff().iloc[1:].tolist()
    run_pairs = [
        f"{prev_run} → {curr_run}"
        for prev_run, curr_run in zip(run_labels, run_labels[1:])
    ]
    colors_change = ['green' if x >= 0 else 'red' for x in changes]

    fig.add_trace(go.Bar(
        x=run_pairs,
        y=changes,
        marker_color=colors_change,
        text=[f"{x:+.3f}" for x in changes],
        textposition='auto',
        showlegend=False,
        name='Performance Change'
    ), row=2, col=1)

    # 4. Document Difficulty Across Runs (Bottom Right)
    doc_run_stats = df_overall.groupby(['document', 'run'])['f1_score'].mean().reset_index()
    doc_variance = doc_run_stats.groupby('document')['f1_score'].agg(['mean', 'std']).reset_index()
    # A document present in a single run has an undefined (NaN) std; leave it
    # out rather than plotting a blank bar among the "most variable" documents.
    doc_variance = doc_variance.dropna(subset=['std'])
    doc_variance = doc_variance.sort_values('std', ascending=False).head(10)  # Top 10 most variable

    fig.add_trace(go.Bar(
        x=doc_variance['document'],
        y=doc_variance['std'],
        marker=dict(
            color=doc_variance['std'],
            colorscale='Reds',
            colorbar=dict(title="Std Dev")
        ),
        text=doc_variance['std'].round(3),
        textposition='auto',
        showlegend=False,
        name='Performance Variance'
    ), row=2, col=2)

    # Update layout
    fig.update_layout(
        title=dict(
            text=f"Run Comparison Dashboard ({len(runs)} runs)",
            font=dict(size=20, color='#2E2E2E'),
            x=0.5
        ),
        height=800,
        width=1400,
        template='plotly_white'
    )

    # Update axes
    fig.update_xaxes(title_text="Runs", row=1, col=1, tickangle=45)
    fig.update_yaxes(title_text="Average F1-Score", row=1, col=1, range=[0, 1])

    fig.update_xaxes(title_text="Runs", row=1, col=2, tickangle=45)
    fig.update_yaxes(title_text="F1-Score", row=1, col=2, range=[0, 1])

    fig.update_xaxes(title_text="Run Transitions", row=2, col=1, tickangle=45)
    fig.update_yaxes(title_text="F1-Score Change", row=2, col=1)

    fig.update_xaxes(title_text="Documents", row=2, col=2, tickangle=45)
    fig.update_yaxes(title_text="Performance Std Dev", row=2, col=2)

    return fig

In [None]:
# Run-comparison dashboard, precision/recall heatmaps and summary tables.
# All tables are computed on the same Overall + lenient slice that the charts
# use, so rankings are not skewed by the other annotation types / evaluation
# modes that `df` may contain.
if 'df' in globals() and not df.empty:

    # Computed once; decides whether the run-level comparisons are shown.
    has_multiple_runs = len(df['run'].unique()) > 1

    # 4. Run Comparison (if multiple runs selected)
    if has_multiple_runs:
        print("🔄 4. Run Comparison Dashboard")
        run_comparison = create_run_comparison_dashboard(df)
        # The dashboard builder returns None when there is nothing to compare.
        if run_comparison is not None:
            run_comparison.show()

    # 5. Additional Heatmaps for Precision and Recall
    print("📊 5. Additional Performance Heatmaps")

    # Filter for main visualization data (Overall + lenient)
    df_main = df[(df['annotation_type'] == 'Overall') & (df['evaluation_mode'] == 'lenient')].copy()

    if df_main.empty:
        # Nothing to plot or rank without Overall/lenient rows.
        print("⚠️ No Overall/lenient rows found - skipping heatmaps and summary tables.")
    else:
        # Precision heatmap
        precision_heatmap = create_performance_heatmap(df_main, 'precision', 'Precision')
        precision_heatmap.show()

        # Recall heatmap
        recall_heatmap = create_performance_heatmap(df_main, 'recall', 'Recall')
        recall_heatmap.show()

        # 6. Summary Statistics Table
        print("\n" + "="*80)
        print("📊 PERFORMANCE SUMMARY")
        print("="*80)

        # Best model per document: the row with the highest F1 in each document group
        print("\n🏆 BEST MODEL PER DOCUMENT (by F1-Score):")
        best_rows = df_main.groupby('document')['f1_score'].idxmax()
        best_per_doc = df_main.loc[best_rows, ['document', 'model', 'f1_score', 'precision', 'recall']]
        best_per_doc = best_per_doc.sort_values('f1_score', ascending=False)
        display(best_per_doc)

        # Overall model ranking
        print("\n🥇 OVERALL MODEL RANKING:")
        model_ranking = df_main.groupby('model').agg({
            'f1_score': ['mean', 'std', 'min', 'max'],
            'precision': 'mean',
            'recall': 'mean',
            # Distinct documents, not rows: a document appears once per run.
            'document': 'nunique'
        }).round(4)
        model_ranking.columns = ['F1_Mean', 'F1_Std', 'F1_Min', 'F1_Max', 'Precision_Mean', 'Recall_Mean', 'Documents']
        model_ranking = model_ranking.sort_values('F1_Mean', ascending=False)
        display(model_ranking)

        # Document difficulty ranking (lowest mean F1 first)
        print("\n📋 DOCUMENT DIFFICULTY RANKING (Hardest First):")
        doc_ranking = df_main.groupby('document').agg({
            'f1_score': ['mean', 'std', 'min', 'max'],
            # Distinct models, not rows: a model appears once per run.
            'model': 'nunique'
        }).round(4)
        doc_ranking.columns = ['F1_Mean', 'F1_Std', 'F1_Min', 'F1_Max', 'Models_Tested']
        doc_ranking = doc_ranking.sort_values('F1_Mean', ascending=True)
        display(doc_ranking)

        if has_multiple_runs:
            print("\n🔄 RUN COMPARISON:")
            run_summary = df_main.groupby('run').agg({
                'f1_score': ['mean', 'std'],
                'precision': 'mean',
                'recall': 'mean',
                'model': 'nunique',
                'document': 'nunique'
            }).round(4)
            run_summary.columns = ['F1_Mean', 'F1_Std', 'Precision_Mean', 'Recall_Mean', 'Models', 'Documents']
            run_summary = run_summary.sort_values('F1_Mean', ascending=False)
            display(run_summary)

else:
    print("❌ Please load and process evaluation data first.")