In [1]:
import json
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
from pathlib import Path
import ipywidgets as widgets
from IPython.display import display, clear_output

In [None]:
def find_pipeline_results_folders(base_path="output"):
    """Find all pipeline_results folders in the output directory"""
    pipeline_folders = []
    base_path = Path(base_path)
    
    if not base_path.exists():
        print(f"Output directory '{base_path}' not found!")
        return []
    
    for item in base_path.iterdir():
        if item.is_dir() and item.name.startswith("pipeline_results_"):
            # Check for evaluation file (try both possible names)
            eval_json_path1 = item / "llm_evaluation_results.json"
            eval_json_path2 = item / "llm_evaluation.json"
            
            eval_path = None
            if eval_json_path1.exists():
                eval_path = eval_json_path1
            elif eval_json_path2.exists():
                eval_path = eval_json_path2
            
            if eval_path:
                pipeline_folders.append({
                    'name': item.name,
                    'path': str(item),
                    'eval_path': str(eval_path)
                })
            else:
                print(f"Warning: No evaluation file found in {item.name}")
    
    return pipeline_folders

def load_evaluation_data(eval_path):
    """Load evaluation data from JSON file"""
    try:
        with open(eval_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading {eval_path}: {e}")
        return None

# Find available pipeline results
available_folders = find_pipeline_results_folders()

if not available_folders:
    print("No pipeline_results folders found in the output directory!")
else:
    print(f"Found {len(available_folders)} pipeline_results folders:")
    for folder in available_folders:
        print(f"  - {folder['name']}")

# Create selection widget
folder_options = [(folder['name'], folder) for folder in available_folders]
folder_selector = widgets.SelectMultiple(
    options=folder_options,
    description='Select runs:',
    disabled=False,
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%', height='150px')
)

load_button = widgets.Button(
    description='Load Selected Data',
    button_style='success',
    icon='check'
)

output_area = widgets.Output()

def load_data(button):
    with output_area:
        clear_output()
        selected_folders = list(folder_selector.value)
        
        if not selected_folders:
            print("Please select at least one folder!")
            return
        
        global evaluation_data
        evaluation_data = {}
        
        for folder in selected_folders:
            print(f"Loading data from {folder['name']}...")
            data = load_evaluation_data(folder['eval_path'])
            if data:
                evaluation_data[folder['name']] = data
        
        if evaluation_data:
            print(f"\nSuccessfully loaded data from {len(evaluation_data)} runs:")
            for run_name in evaluation_data.keys():
                print(f"  ✓ {run_name}")
        else:
            print("No data could be loaded!")

load_button.on_click(load_data)

display(folder_selector)
display(load_button)
display(output_area)

Found 2 pipeline_results folders:
  - pipeline_results_20250725_111753
  - pipeline_results_20250808_145025


SelectMultiple(description='Select runs:', layout=Layout(height='150px', width='50%'), options=(('pipeline_res…

Button(button_style='success', description='Load Selected Data', icon='check', style=ButtonStyle())

Output()

In [3]:
def process_evaluation_data(evaluation_data):
    """Process evaluation data into a structured format for visualization"""
    processed_data = []
    
    for run_name, data in evaluation_data.items():
        # The data structure is: {document: {model: {entity_type: {lenient/strict: metrics}}}}
        for doc_name, doc_data in data.items():
            if not isinstance(doc_data, dict):
                continue
                
            for model_name, model_data in doc_data.items():
                if not isinstance(model_data, dict):
                    continue
                
                # Calculate overall metrics across all entity types for this model-document combination
                total_tp_lenient = 0
                total_fp_lenient = 0
                total_fn_lenient = 0
                
                entity_count = 0
                
                for entity_type, entity_data in model_data.items():
                    if not isinstance(entity_data, dict):
                        continue
                    
                    entity_count += 1
                    
                    # Use lenient metrics as primary
                    if 'lenient' in entity_data and isinstance(entity_data['lenient'], dict):
                        lenient_metrics = entity_data['lenient']
                        total_tp_lenient += lenient_metrics.get('true_positives', 0)
                        total_fp_lenient += lenient_metrics.get('false_positives', 0)
                        total_fn_lenient += lenient_metrics.get('false_negatives', 0)
                
                if entity_count > 0:
                    # Calculate overall precision, recall, f1 using lenient metrics
                    precision = total_tp_lenient / (total_tp_lenient + total_fp_lenient) if (total_tp_lenient + total_fp_lenient) > 0 else 0
                    recall = total_tp_lenient / (total_tp_lenient + total_fn_lenient) if (total_tp_lenient + total_fn_lenient) > 0 else 0
                    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
                    
                    processed_data.append({
                        'run': run_name,
                        'model': model_name,
                        'document': doc_name,
                        'precision': precision,
                        'recall': recall,
                        'f1_score': f1_score,
                        'entity_types': entity_count
                    })
    
    return pd.DataFrame(processed_data)

def create_performance_heatmap(df, metric='f1_score', title_suffix='F1-Score'):
    """Create a heatmap showing model performance across documents for each run separately"""
    
    # Check if we have multiple runs
    num_runs = len(df['run'].unique())
    
    if num_runs == 1:
        # Single run - create simple heatmap
        pivot_data = df.pivot_table(
            index='model', 
            columns='document', 
            values=metric, 
            aggfunc='first'  # Take the single value
        ).fillna(0)
        
        title_text = f"Model Performance Heatmap - {title_suffix} ({df['run'].iloc[0]})"
        
        fig = go.Figure(data=go.Heatmap(
            z=pivot_data.values,
            x=pivot_data.columns,
            y=pivot_data.index,
            colorscale='RdYlGn',
            zmin=0,
            zmax=1,
            text=pivot_data.values.round(3),
            texttemplate="%{text}",
            textfont={"size": 10},
            colorbar=dict(title=title_suffix)
        ))
        
        fig.update_layout(
            title=dict(text=title_text, font=dict(size=20, color='#2E2E2E'), x=0.5),
            xaxis=dict(title="Documents", tickangle=45, tickfont=dict(size=10)),
            yaxis=dict(title="Models", tickfont=dict(size=12)),
            height=max(400, len(pivot_data.index) * 50),
            width=max(800, len(pivot_data.columns) * 80),
            template='plotly_white'
        )
        
        return fig
    
    else:
        # Multiple runs - create subplots for each run
        from plotly.subplots import make_subplots
        
        runs = sorted(df['run'].unique())
        cols = min(3, len(runs))  # Max 3 columns
        rows = (len(runs) + cols - 1) // cols  # Calculate rows needed
        
        fig = make_subplots(
            rows=rows, cols=cols,
            subplot_titles=runs,
            shared_xaxes=True,
            shared_yaxes=True,
            vertical_spacing=0.15,
            horizontal_spacing=0.1
        )
        
        for i, run in enumerate(runs):
            row = i // cols + 1
            col = i % cols + 1
            
            run_data = df[df['run'] == run]
            pivot_data = run_data.pivot_table(
                index='model', 
                columns='document', 
                values=metric, 
                aggfunc='first'
            ).fillna(0)
            
            fig.add_trace(
                go.Heatmap(
                    z=pivot_data.values,
                    x=pivot_data.columns,
                    y=pivot_data.index,
                    colorscale='RdYlGn',
                    zmin=0,
                    zmax=1,
                    text=pivot_data.values.round(3),
                    texttemplate="%{text}",
                    textfont={"size": 8},
                    showscale=(i == 0),  # Only show colorbar for first subplot
                    colorbar=dict(title=title_suffix) if i == 0 else None
                ),
                row=row, col=col
            )
        
        title_text = f"Model Performance Comparison Across Runs - {title_suffix}"
        
        fig.update_layout(
            title=dict(
                text=title_text,
                font=dict(size=18, color='#2E2E2E'),
                x=0.5
            ),
            height=max(400, rows * 300),
            width=max(1200, cols * 400),
            template='plotly_white'
        )
        
        return fig

def create_model_comparison_chart(df):
    """Create a chart comparing models across all documents, showing each run separately"""
    
    num_runs = len(df['run'].unique())
    
    if num_runs == 1:
        # Single run - original behavior
        model_stats = df.groupby('model').agg({
            'precision': ['mean', 'std'],
            'recall': ['mean', 'std'], 
            'f1_score': ['mean', 'std'],
            'document': 'count'
        }).round(4)
        
        model_stats.columns = ['_'.join(col).strip() for col in model_stats.columns]
        model_stats = model_stats.reset_index()
        
        fig = go.Figure()
        
        metrics = ['precision_mean', 'recall_mean', 'f1_score_mean']
        metric_names = ['Precision', 'Recall', 'F1-Score']
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
        
        for i, (metric, name, color) in enumerate(zip(metrics, metric_names, colors)):
            error_metric = metric.replace('_mean', '_std')
            
            fig.add_trace(go.Bar(
                name=name,
                x=model_stats['model'],
                y=model_stats[metric],
                error_y=dict(type='data', array=model_stats[error_metric], visible=True),
                marker_color=color,
                text=model_stats[metric].round(3),
                textposition='auto'
            ))
        
        title_text = f"Model Performance Comparison ({df['run'].iloc[0]})"
        
    else:
        # Multiple runs - show each run side by side
        fig = go.Figure()
        
        runs = sorted(df['run'].unique())
        metrics = ['precision', 'recall', 'f1_score']
        metric_names = ['Precision', 'Recall', 'F1-Score']
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
        
        models = sorted(df['model'].unique())
        
        # Create data for each metric and run combination
        for metric_idx, (metric, metric_name, color) in enumerate(zip(metrics, metric_names, colors)):
            for run_idx, run in enumerate(runs):
                run_data = df[df['run'] == run]
                model_means = run_data.groupby('model')[metric].mean().reindex(models, fill_value=0)
                
                # Create x positions for grouped bars
                x_positions = [f"{model}_{run}" for model in models]
                
                fig.add_trace(go.Bar(
                    name=f"{metric_name} ({run})",
                    x=x_positions,
                    y=model_means.values,
                    marker_color=color,
                    opacity=0.7 + (run_idx * 0.3 / len(runs)),  # Vary opacity by run
                    text=model_means.round(3),
                    textposition='auto',
                    offsetgroup=metric_idx,
                    legendgroup=metric_name,
                    showlegend=(run_idx == 0)  # Only show legend for first run of each metric
                ))
        
        title_text = f"Model Performance Comparison Across {len(runs)} Runs"
    
    fig.update_layout(
        title=dict(text=title_text, font=dict(size=18, color='#2E2E2E'), x=0.5),
        xaxis=dict(title="Models" if num_runs == 1 else "Models by Run", tickangle=45),
        yaxis=dict(title="Score", range=[0, 1]),
        barmode='group',
        template='plotly_white',
        height=500,
        showlegend=True
    )
    
    return fig

def create_document_difficulty_chart(df):
    """Create a chart showing which documents are hardest/easiest for models"""
    
    # Calculate average F1 score per document across all models
    doc_stats = df.groupby('document').agg({
        'f1_score': ['mean', 'std', 'min', 'max'],
        'model': 'count'
    }).round(4)
    
    doc_stats.columns = ['_'.join(col).strip() for col in doc_stats.columns]
    doc_stats = doc_stats.reset_index()
    doc_stats = doc_stats.sort_values('f1_score_mean', ascending=True)
    
    fig = go.Figure()
    
    # Add bar chart with error bars
    fig.add_trace(go.Bar(
        x=doc_stats['f1_score_mean'],
        y=doc_stats['document'],
        orientation='h',
        error_x=dict(
            type='data',
            array=doc_stats['f1_score_std'],
            visible=True
        ),
        marker=dict(
            color=doc_stats['f1_score_mean'],
            colorscale='RdYlGn',
            cmin=0,
            cmax=1,
            colorbar=dict(title="Average F1-Score")
        ),
        text=doc_stats['f1_score_mean'].round(3),
        textposition='auto'
    ))
    
    fig.update_layout(
        title=dict(
            text="Document Difficulty Ranking (by Average F1-Score Across Models)",
            font=dict(size=16, color='#2E2E2E'),
            x=0.5
        ),
        xaxis=dict(
            title="Average F1-Score",
            range=[0, 1]
        ),
        yaxis=dict(
            title="Documents"
        ),
        height=max(400, len(doc_stats) * 25),
        template='plotly_white'
    )
    
    return fig

def create_run_comparison_dashboard(df):
    """Create a dashboard comparing different runs"""
    
    if len(df['run'].unique()) < 2:
        print("Need at least 2 runs for comparison")
        return None
    
    # Create subplots
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'F1-Score by Run', 
            'Precision by Run',
            'Recall by Run', 
            'Model Performance Variation'
        ),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )
    
    runs = df['run'].unique()
    colors = px.colors.qualitative.Set1[:len(runs)]
    
    # Calculate stats by run
    run_stats = df.groupby('run').agg({
        'f1_score': ['mean', 'std'],
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std']
    }).round(4)
    
    run_stats.columns = ['_'.join(col).strip() for col in run_stats.columns]
    run_stats = run_stats.reset_index()
    
    # Add traces for each metric
    metrics = [
        ('f1_score_mean', 'f1_score_std', (1, 1)),
        ('precision_mean', 'precision_std', (1, 2)),
        ('recall_mean', 'recall_std', (2, 1))
    ]
    
    for metric_mean, metric_std, (row, col) in metrics:
        fig.add_trace(
            go.Bar(
                name=metric_mean.split('_')[0].title(),
                x=run_stats['run'],
                y=run_stats[metric_mean],
                error_y=dict(
                    type='data',
                    array=run_stats[metric_std],
                    visible=True
                ),
                text=run_stats[metric_mean].round(3),
                textposition='auto',
                showlegend=False
            ),
            row=row, col=col
        )
    
    # Model performance variation (coefficient of variation)
    model_variation = df.groupby(['run', 'model'])['f1_score'].mean().groupby('run').std()
    
    fig.add_trace(
        go.Bar(
            name='F1 Std Dev',
            x=model_variation.index,
            y=model_variation.values,
            text=model_variation.values.round(3),
            textposition='auto',
            showlegend=False
        ),
        row=2, col=2
    )
    
    fig.update_layout(
        title=dict(
            text="Run Comparison Dashboard",
            font=dict(size=20, color='#2E2E2E'),
            x=0.5
        ),
        height=800,
        template='plotly_white'
    )
    
    return fig

# Check if evaluation_data exists and process it
if 'evaluation_data' in globals() and evaluation_data:
    df = process_evaluation_data(evaluation_data)
    
    if not df.empty:
        print(f"✅ Data processed successfully!")
        print(f"📊 Data shape: {df.shape}")
        print(f"🏃 Runs: {list(df['run'].unique())}")
        print(f"🤖 Models: {list(df['model'].unique())}")
        print(f"📄 Documents: {len(df['document'].unique())} documents")
        print(f"📈 Average F1-Score: {df['f1_score'].mean():.3f}")
    else:
        print("❌ No data to process. Please load evaluation data first.")
else:
    print("❌ No evaluation data loaded. Please run the previous cell first.")

✅ Data processed successfully!
📊 Data shape: (237, 7)
🏃 Runs: ['pipeline_results_20250725_111753', 'pipeline_results_20250808_145025']
🤖 Models: ['gemma3:1b', 'gemma3:4b', 'gemma3:12b', 'mistral:latest']
📄 Documents: 30 documents
📈 Average F1-Score: 0.216


In [4]:
# Display information about loaded runs and data aggregation
if 'evaluation_data' in globals() and evaluation_data:
    print("📋 LOADED RUNS INFORMATION")
    print("="*50)
    print(f"Number of runs loaded: {len(evaluation_data)}")
    print("Runs included in analysis:")
    for i, run_name in enumerate(evaluation_data.keys(), 1):
        print(f"  {i}. {run_name}")
    
    print("\n🔄 DATA PRESENTATION METHOD:")
    if len(evaluation_data) == 1:
        print("- Single run: Results shown directly without aggregation")
    else:
        print("- Multiple runs: Results shown SEPARATELY for each run")
        print("- Each run is displayed in its own section/subplot")
        print("- NO AVERAGING - you can see the evolution across runs!")
    print("- This allows you to compare how performance changes with different settings")
    print("="*50)
else:
    print("❌ No evaluation data loaded yet.")

📋 LOADED RUNS INFORMATION
Number of runs loaded: 2
Runs included in analysis:
  1. pipeline_results_20250725_111753
  2. pipeline_results_20250808_145025

🔄 DATA PRESENTATION METHOD:
- Multiple runs: Results shown SEPARATELY for each run
- Each run is displayed in its own section/subplot
- NO AVERAGING - you can see the evolution across runs!
- This allows you to compare how performance changes with different settings


In [5]:
if 'df' in globals() and not df.empty:
    print("🎯 Creating Performance Visualizations...")
    
    # 1. F1-Score Heatmap - Shows model performance across documents at a glance
    print("📊 1. Model Performance Heatmap (F1-Score)")
    f1_heatmap = create_performance_heatmap(df, 'f1_score', 'F1-Score')
    f1_heatmap.show()
    
    # 2. Model Comparison Chart - Shows average performance with error bars
    print("📈 2. Model Performance Comparison")
    model_comparison = create_model_comparison_chart(df)
    model_comparison.show()
    
    # 3. Document Difficulty Ranking
    print("📋 3. Document Difficulty Ranking")
    doc_difficulty = create_document_difficulty_chart(df)
    doc_difficulty.show()
    
else:
    print("❌ Please load and process evaluation data first.")

🎯 Creating Performance Visualizations...
📊 1. Model Performance Heatmap (F1-Score)


📈 2. Model Performance Comparison


📋 3. Document Difficulty Ranking


In [6]:
if 'df' in globals() and not df.empty:
    
    # 4. Run Comparison (if multiple runs selected)
    if len(df['run'].unique()) > 1:
        print("🔄 4. Run Comparison Dashboard")
        run_comparison = create_run_comparison_dashboard(df)
        if run_comparison:
            run_comparison.show()
    
    # 5. Additional Heatmaps for Precision and Recall
    print("📊 5. Additional Performance Heatmaps")
    
    # Precision heatmap
    precision_heatmap = create_performance_heatmap(df, 'precision', 'Precision')
    precision_heatmap.show()
    
    # Recall heatmap  
    recall_heatmap = create_performance_heatmap(df, 'recall', 'Recall')
    recall_heatmap.show()
    
    # 6. Summary Statistics Table
    print("\n" + "="*80)
    print("📊 PERFORMANCE SUMMARY")
    print("="*80)
    
    # Best model per document
    print("\n🏆 BEST MODEL PER DOCUMENT (by F1-Score):")
    best_per_doc = df.loc[df.groupby('document')['f1_score'].idxmax()][['document', 'model', 'f1_score', 'precision', 'recall']]
    best_per_doc = best_per_doc.sort_values('f1_score', ascending=False)
    display(best_per_doc)
    
    # Overall model ranking
    print("\n🥇 OVERALL MODEL RANKING:")
    model_ranking = df.groupby('model').agg({
        'f1_score': ['mean', 'std', 'min', 'max'],
        'precision': 'mean',
        'recall': 'mean',
        'document': 'count'
    }).round(4)
    model_ranking.columns = ['F1_Mean', 'F1_Std', 'F1_Min', 'F1_Max', 'Precision_Mean', 'Recall_Mean', 'Documents']
    model_ranking = model_ranking.sort_values('F1_Mean', ascending=False)
    display(model_ranking)
    
    # Document difficulty ranking
    print("\n📋 DOCUMENT DIFFICULTY RANKING (Hardest First):")
    doc_ranking = df.groupby('document').agg({
        'f1_score': ['mean', 'std', 'min', 'max'],
        'model': 'count'
    }).round(4)
    doc_ranking.columns = ['F1_Mean', 'F1_Std', 'F1_Min', 'F1_Max', 'Models_Tested']
    doc_ranking = doc_ranking.sort_values('F1_Mean', ascending=True)
    display(doc_ranking)
    
    if len(df['run'].unique()) > 1:
        print("\n🔄 RUN COMPARISON:")
        run_summary = df.groupby('run').agg({
            'f1_score': ['mean', 'std'],
            'precision': 'mean',
            'recall': 'mean',
            'model': 'nunique',
            'document': 'nunique'
        }).round(4)
        run_summary.columns = ['F1_Mean', 'F1_Std', 'Precision_Mean', 'Recall_Mean', 'Models', 'Documents']
        run_summary = run_summary.sort_values('F1_Mean', ascending=False)
        display(run_summary)
    
else:
    print("❌ Please load and process evaluation data first.")

🔄 4. Run Comparison Dashboard


📊 5. Additional Performance Heatmaps



📊 PERFORMANCE SUMMARY

🏆 BEST MODEL PER DOCUMENT (by F1-Score):


Unnamed: 0,document,model,f1_score,precision,recall
215,CAN v. TURKEY,gemma3:12b,0.625954,0.61194,0.640625
191,YAKUSHEV v. UKRAINE,gemma3:12b,0.597938,0.604167,0.591837
139,PANYUSHKINY v. RUSSIA,gemma3:12b,0.581818,0.703297,0.496124
203,PAKHTUSOV v. RUSSIA,gemma3:12b,0.577778,0.65,0.52
10,SIDOROVA v. RUSSIA,gemma3:12b,0.571429,0.551724,0.592593
75,STANA v. ROMANIA,gemma3:12b,0.542373,0.5,0.592593
147,SOLCAN v. ROMANIA,gemma3:12b,0.533333,0.685714,0.436364
51,OTGON v. THE REPUBLIC OF MOLDOVA,gemma3:12b,0.521127,0.402174,0.74
18,MOSKALEV v. RUSSIA,gemma3:12b,0.515464,0.471698,0.568182
228,LOZOVYYE v. RUSSIA,mistral:latest,0.503311,0.487179,0.520548



🥇 OVERALL MODEL RANKING:


Unnamed: 0_level_0,F1_Mean,F1_Std,F1_Min,F1_Max,Precision_Mean,Recall_Mean,Documents
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gemma3:12b,0.3277,0.1783,0.0,0.626,0.4569,0.2873,59
mistral:latest,0.2283,0.171,0.0,0.5278,0.2705,0.2295,60
gemma3:4b,0.1937,0.1499,0.0,0.5378,0.2904,0.1697,59
gemma3:1b,0.1141,0.1088,0.0,0.35,0.2885,0.0868,59



📋 DOCUMENT DIFFICULTY RANKING (Hardest First):


Unnamed: 0_level_0,F1_Mean,F1_Std,F1_Min,F1_Max,Models_Tested
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
S.N. v. RUSSIA,0.0173,0.0273,0.0,0.0748,8
KOSAITE - CYPIENE AND OTHERS v. LITHUANIA,0.0504,0.0776,0.0,0.2335,8
YERMAKOVICH v. RUSSIA,0.0729,0.1026,0.0,0.3158,8
CABUCAK v. GERMANY,0.1058,0.1492,0.0,0.3983,8
BIGUN v. UKRAINE,0.1259,0.1116,0.0,0.2877,8
BELYAYEV AND OTHERS v. UKRAINE,0.1277,0.0816,0.0,0.2609,8
VISY v. SLOVAKIA,0.1443,0.121,0.0,0.3093,8
H_INESS v. NORWAY (1),0.151,0.1867,0.0,0.4369,5
MURUZHEVA v. RUSSIA,0.1515,0.1774,0.0,0.417,8
S.V. v. ITALY,0.1635,0.2084,0.0,0.4493,8



🔄 RUN COMPARISON:


Unnamed: 0_level_0,F1_Mean,F1_Std,Precision_Mean,Recall_Mean,Models,Documents
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
pipeline_results_20250808_145025,0.2718,0.1645,0.3673,0.2501,4,30
pipeline_results_20250725_111753,0.1587,0.1599,0.2843,0.1354,4,30


In [7]:
def create_per_document_charts(df):
    """Create individual charts for each document showing all models with all metrics"""
    
    documents = df['document'].unique()
    
    for doc in documents:
        doc_data = df[df['document'] == doc].copy()
        
        if doc_data.empty:
            continue
            
        # Sort models by F1-score for better visualization
        doc_data = doc_data.sort_values('f1_score', ascending=True)
        
        fig = go.Figure()
        
        # Add bars for each metric
        metrics = ['precision', 'recall', 'f1_score']
        metric_names = ['Precision', 'Recall', 'F1-Score']
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
        
        for i, (metric, name, color) in enumerate(zip(metrics, metric_names, colors)):
            fig.add_trace(go.Bar(
                name=name,
                x=doc_data[metric],
                y=doc_data['model'],
                orientation='h',
                marker_color=color,
                text=doc_data[metric].round(3),
                textposition='auto',
                offsetgroup=i,
                width=0.25  # Make bars thinner so they don't overlap
            ))
        
        # Find the best model for this document
        best_model = doc_data.loc[doc_data['f1_score'].idxmax()]
        
        fig.update_layout(
            title=dict(
                text=f"Model Performance on Document: {doc}<br><span style='font-size:14px'>Best Model: {best_model['model']} (F1: {best_model['f1_score']:.3f})</span>",
                font=dict(size=16, color='#2E2E2E'),
                x=0.5
            ),
            xaxis=dict(
                title="Score",
                range=[0, 1],
                tickfont=dict(size=12)
            ),
            yaxis=dict(
                title="Models",
                tickfont=dict(size=12)
            ),
            barmode='group',
            template='plotly_white',
            height=max(300, len(doc_data) * 40),
            width=800,
            showlegend=True,
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
            )
        )
        
        fig.show()

if 'df' in globals() and not df.empty:
    print("\n" + "="*80)
    print("📄 DETAILED PERFORMANCE BY DOCUMENT")
    print("="*80)
    print("Individual charts for each document showing all models with precision, recall, and F1-score")
    print()
    
    create_per_document_charts(df)
    
else:
    print("❌ Please load and process evaluation data first.")


📄 DETAILED PERFORMANCE BY DOCUMENT
Individual charts for each document showing all models with precision, recall, and F1-score



In [8]:
def create_comprehensive_overview(df):
    """Create a comprehensive overview chart with all models and documents for each run"""
    
    runs = sorted(df['run'].unique())
    num_runs = len(runs)
    
    if num_runs == 1:
        # Single run - create 3-panel view (Precision, Recall, F1-Score)
        fig = make_subplots(
            rows=1, cols=3,
            subplot_titles=('Precision', 'Recall', 'F1-Score'),
            shared_yaxes=True,
            horizontal_spacing=0.08
        )
        
        models = sorted(df['model'].unique())
        documents = sorted(df['document'].unique())
        colors = px.colors.qualitative.Set1[:len(models)]
        if len(models) > len(colors):
            colors = colors * (len(models) // len(colors) + 1)
        
        metrics = ['precision', 'recall', 'f1_score']
        
        for col_idx, metric in enumerate(metrics, 1):
            pivot_data = df.pivot_table(
                index='document', 
                columns='model', 
                values=metric, 
                aggfunc='first'
            ).fillna(0)
            
            for model_idx, model in enumerate(models):
                if model in pivot_data.columns:
                    values = pivot_data[model].values
                    fig.add_trace(
                        go.Bar(
                            name=model if col_idx == 1 else None,
                            x=values,
                            y=documents,
                            orientation='h',
                            marker_color=colors[model_idx],
                            text=[f"{val:.3f}" if val > 0 else "" for val in values],
                            textposition='auto',
                            textfont=dict(size=8),
                            showlegend=(col_idx == 1),
                            offsetgroup=model_idx,
                            width=0.8/len(models)
                        ),
                        row=1, col=col_idx
                    )
        
        title_text = f"📊 Complete Performance Overview - {runs[0]}"
        height = max(600, len(documents) * 25)
        width = 1400
        
    else:
        # Multiple runs - create grid with runs as rows and metrics as columns
        fig = make_subplots(
            rows=num_runs, cols=3,
            subplot_titles=[f"{run} - {metric}" for run in runs for metric in ['Precision', 'Recall', 'F1-Score']],
            shared_yaxes=True,
            vertical_spacing=0.15,
            horizontal_spacing=0.08
        )
        
        models = sorted(df['model'].unique())
        documents = sorted(df['document'].unique())
        colors = px.colors.qualitative.Set1[:len(models)]
        if len(models) > len(colors):
            colors = colors * (len(models) // len(colors) + 1)
        
        metrics = ['precision', 'recall', 'f1_score']
        
        for run_idx, run in enumerate(runs, 1):
            run_data = df[df['run'] == run]
            
            for col_idx, metric in enumerate(metrics, 1):
                pivot_data = run_data.pivot_table(
                    index='document', 
                    columns='model', 
                    values=metric, 
                    aggfunc='first'
                ).fillna(0)
                
                for model_idx, model in enumerate(models):
                    if model in pivot_data.columns:
                        values = pivot_data[model].values
                        show_legend = (run_idx == 1 and col_idx == 1)  # Only show legend on first subplot
                        
                        fig.add_trace(
                            go.Bar(
                                name=model if show_legend else None,
                                x=values,
                                y=documents,
                                orientation='h',
                                marker_color=colors[model_idx],
                                text=[f"{val:.3f}" if val > 0 else "" for val in values],
                                textposition='auto',
                                textfont=dict(size=6),
                                showlegend=show_legend,
                                offsetgroup=model_idx,
                                width=0.8/len(models),
                                legendgroup=model  # Group legend items by model
                            ),
                            row=run_idx, col=col_idx
                        )
        
        title_text = f"📊 Performance Comparison Across {num_runs} Runs"
        height = max(800, num_runs * 300)
        width = 1400
    
    # Update layout
    fig.update_layout(
        title=dict(
            text=title_text,
            font=dict(size=20, color='#2E2E2E'),
            x=0.5
        ),
        height=height,
        width=width,
        template='plotly_white',
        barmode='group',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(size=10)
        )
    )
    
    # Update x-axes for all subplots
    for row in range(1, (num_runs if num_runs > 1 else 1) + 1):
        for col in range(1, 4):
            fig.update_xaxes(
                title="Score",
                range=[0, 1],
                tickfont=dict(size=10),
                row=row, col=col
            )
    
    # Update y-axis (only for first column since shared)
    for row in range(1, (num_runs if num_runs > 1 else 1) + 1):
        fig.update_yaxes(
            title="Documents" if row == 1 else "",
            tickfont=dict(size=10),
            row=row, col=1
        )
    
    return fig

if 'df' in globals() and not df.empty:
    print("\n" + "="*80)
    print("🎯 COMPREHENSIVE OVERVIEW")
    print("="*80)
    
    # Show which runs are included
    unique_runs = df['run'].unique()
    if len(unique_runs) == 1:
        print(f"📊 Displaying results for run: {unique_runs[0]}")
        print("Single run analysis with detailed metrics breakdown")
    else:
        print(f"📊 Comparing {len(unique_runs)} runs side by side:")
        for i, run in enumerate(unique_runs, 1):
            print(f"    {i}. {run}")
        print("🔍 Each run shown separately - NO AVERAGING!")
        print("📈 Perfect for seeing how performance evolves across different settings")
    
    print("Complete view of all models across all documents for all metrics")
    print()
    
    overview_fig = create_comprehensive_overview(df)
    overview_fig.show()
    
else:
    print("❌ Please load and process evaluation data first.")


🎯 COMPREHENSIVE OVERVIEW
📊 Comparing 2 runs side by side:
    1. pipeline_results_20250725_111753
    2. pipeline_results_20250808_145025
🔍 Each run shown separately - NO AVERAGING!
📈 Perfect for seeing how performance evolves across different settings
Complete view of all models across all documents for all metrics

