# LLM Translation Metrics Aggregation

This notebook aggregates translation metrics (BLEU, chrF, TER, BERT) from LLM experiments and generates comparison visualizations.

In [1]:
import json
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Configuration
CLEANED_DATA_DIR = Path('cleaned_data')
OUTPUT_DIR = Path('aggregated_metrics')
CHARTS_DIR = OUTPUT_DIR / 'comparison_charts'

# Metrics to aggregate
METRICS = ['bleu_score', 'chrF_score', 'ter_test', 'bert_score_f1']

# Create output directories
OUTPUT_DIR.mkdir(exist_ok=True)
CHARTS_DIR.mkdir(exist_ok=True)

## 1. Load and Aggregate Data

In [3]:
def load_json_file(filepath):
    """Load a JSON file and return its contents."""
    with open(filepath, 'r') as f:
        return json.load(f)

def compute_averages(data):
    """Compute average metrics from a list of prompt results."""
    if not data:
        return None
    
    averages = {}
    for metric in METRICS:
        values = [item.get(metric, 0) for item in data if metric in item]
        if values:
            averages[f'avg_{metric}'] = sum(values) / len(values)
        else:
            averages[f'avg_{metric}'] = 0
    
    averages['sample_count'] = len(data)
    return averages

def get_round_number(filename):
    """Extract round number from filename (e.g., '1.json' -> 1)."""
    return int(Path(filename).stem)

In [4]:
def process_all_data():
    """Process all LLM data and return aggregated results."""
    all_results = []
    
    # Get all LLM folders
    llm_folders = [d for d in CLEANED_DATA_DIR.iterdir() if d.is_dir()]
    
    for llm_folder in sorted(llm_folders):
        llm_name = llm_folder.name
        print(f"Processing {llm_name}...")
        
        # Walk through all JSON files
        for json_file in llm_folder.rglob('*.json'):
            # Get relative path for category info
            rel_path = json_file.relative_to(llm_folder)
            category = str(rel_path.parent)
            round_num = get_round_number(json_file.name)
            
            # Load and compute averages
            data = load_json_file(json_file)
            averages = compute_averages(data)
            
            if averages:
                result = {
                    'llm': llm_name,
                    'category': category,
                    'round': round_num,
                    **averages
                }
                all_results.append(result)
                
                # Save individual JSON file
                output_path = OUTPUT_DIR / llm_name / category
                output_path.mkdir(parents=True, exist_ok=True)
                
                output_file = output_path / f'round_{round_num}_avg.json'
                with open(output_file, 'w') as f:
                    json.dump(result, f, indent=2)
    
    return all_results

# Process all data
all_results = process_all_data()
print(f"\nTotal aggregated results: {len(all_results)}")

Processing claude3.5...
Processing gemini2.5...
Processing gpt5...
Processing llama3_1b...
Processing llama3_8b...
Processing mistral...
Processing phi3_14b...
Processing phi3_8b...
Processing qwen_14b...

Total aggregated results: 153


## 2. Create Summary DataFrame

In [5]:
# Convert to DataFrame for easier analysis
df = pd.DataFrame(all_results)
df.head(10)

Unnamed: 0,llm,category,round,avg_bleu_score,avg_chrF_score,avg_ter_test,avg_bert_score_f1,sample_count
0,claude3.5,grammatical_induction,4,35.278291,56.182666,58.686869,0.921532,5
1,claude3.5,grammatical_induction,2,22.707722,51.472255,57.777778,0.907586,5
2,claude3.5,grammatical_induction,1,54.284941,66.221288,73.428571,0.944407,5
3,claude3.5,grammatical_induction,3,28.994246,45.267076,60.111111,0.90195,5
4,claude3.5,grammatical_induction,5,14.38946,45.614222,100.181993,0.912811,5
5,claude3.5,zero_shot,1,3.603309,9.903597,203.333333,0.866242,5
6,claude3.5,morphological_induction,1,57.512442,74.502619,79.152237,0.93514,15
7,claude3.5,few_shot/translation_question,4,31.674833,49.879452,54.0,0.926734,5
8,claude3.5,few_shot/translation_question,2,26.648216,37.530204,56.095238,0.894184,5
9,claude3.5,few_shot/translation_question,1,25.645595,29.837534,100.0,0.90062,5


In [6]:
# Aggregate by LLM and round (across all categories)
summary_df = df.groupby(['llm', 'round']).agg({
    'avg_bleu_score': 'mean',
    'avg_chrF_score': 'mean',
    'avg_ter_test': 'mean',
    'avg_bert_score_f1': 'mean',
    'sample_count': 'sum'
}).reset_index()

summary_df

Unnamed: 0,llm,round,avg_bleu_score,avg_chrF_score,avg_ter_test,avg_bert_score_f1,sample_count
0,claude3.5,1,31.331993,45.867087,137.849495,0.903978,40
1,claude3.5,2,36.39287,53.785397,61.513228,0.91304,20
2,claude3.5,3,40.553548,56.485917,59.291005,0.923499,20
3,claude3.5,4,33.984375,55.50265,66.451178,0.927902,20
4,claude3.5,5,33.342549,54.278335,83.949553,0.926096,20
5,gemini2.5,1,51.105065,56.854163,53.590476,0.930417,40
6,gemini2.5,2,35.895857,52.772853,52.550265,0.907173,20
7,gemini2.5,3,36.999485,54.552528,89.243386,0.911891,20
8,gemini2.5,4,34.546093,49.567766,71.858586,0.858855,20
9,gemini2.5,5,36.26392,52.279148,84.999293,0.909837,20


In [7]:
# Save summary to CSV
summary_csv_path = CHARTS_DIR / 'all_models_summary.csv'
summary_df.to_csv(summary_csv_path, index=False)
print(f"Saved summary to {summary_csv_path}")

Saved summary to aggregated_metrics/comparison_charts/all_models_summary.csv


## 3. Generate Refined Comparison Charts

In [8]:
# Set style
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6), 'figure.dpi': 100})

def generate_refined_charts(df):
    # Metrics mapping
    metrics_map = {
        'avg_bleu_score': 'BLEU Score',
        'avg_chrF_score': 'chrF Score',
        'avg_ter_test': 'TER Score',
        'avg_bert_score_f1': 'BERTScore F1'
    }
    
    unique_rounds = sorted(df['round'].unique())
    unique_llms = sorted(df['llm'].unique())
    
    print(f"Generating per-round charts for {len(unique_rounds)} rounds...")
    
    # 1. Per-Round, Per-Metric Charts
    for r in unique_rounds:
        round_df = df[df['round'] == r]
        
        for metric_col, metric_name in metrics_map.items():
            plt.figure(figsize=(10, 6))
            
            # Create bar plot
            ax = sns.barplot(
                data=round_df,
                x='llm',
                y=metric_col,
                order=unique_llms,
                palette='viridis',
                hue='llm',
                legend=False
            )
            
            # Customization
            plt.title(f'Round {r} - {metric_name} Comparison')
            plt.xlabel('LLM Model')
            plt.ylabel(metric_name)
            plt.xticks(rotation=45)
            
            # Add value labels
            for container in ax.containers:
                ax.bar_label(container, fmt='%.2f', padding=3)
            
            plt.tight_layout()
            
            # Save file
            # Clean up filename for bert
            safe_metric_name = metric_col.replace('avg_', '').replace('_score', '').replace('_test', '')
            if 'bert' in safe_metric_name:
                safe_metric_name = safe_metric_name.replace('_f1', '')
                
            filename = f"round_{r}_{safe_metric_name}.png"
            output_path = CHARTS_DIR / filename
            plt.savefig(output_path)
            plt.close()
            print(f"  Saved: {output_path}")

    # 2. Overall Summary (Average of Round Averages)
    print("\nGenerating overall summary (Average of Round Averages)...")
    
    # Group by LLM and calculate mean of the round-level averages
    # We use the existing summary_df which is already averaged by round
    overall_avg_df = df.groupby('llm')[list(metrics_map.keys())].mean().reset_index()
    
    # Save this summary to CSV
    overall_csv_path = CHARTS_DIR / 'overall_average_of_rounds.csv'
    overall_avg_df.to_csv(overall_csv_path, index=False)
    print(f"  Saved summary data: {overall_csv_path}")
    
    # Generate charts for overall averages
    for metric_col, metric_name in metrics_map.items():
        plt.figure(figsize=(10, 6))
        
        ax = sns.barplot(
            data=overall_avg_df,
            x='llm',
            y=metric_col,
            order=unique_llms,
            palette='magma',
            hue='llm',
            legend=False
        )
        
        plt.title(f'Overall Average (Across Rounds) - {metric_name}')
        plt.xlabel('LLM Model')
        plt.ylabel(f'Average {metric_name}')
        plt.xticks(rotation=45)
        
        for container in ax.containers:
            ax.bar_label(container, fmt='%.2f', padding=3)
            
        plt.tight_layout()
        
        safe_metric_name = metric_col.replace('avg_', '').replace('_score', '').replace('_test', '')
        if 'bert' in safe_metric_name:
            safe_metric_name = safe_metric_name.replace('_f1', '')
            
        filename = f"overall_average_{safe_metric_name}.png"
        output_path = CHARTS_DIR / filename
        plt.savefig(output_path)
        plt.close()
        print(f"  Saved: {output_path}")

# Run the generation
generate_refined_charts(summary_df)

Generating per-round charts for 5 rounds...


  Saved: aggregated_metrics/comparison_charts/round_1_bleu.png


  Saved: aggregated_metrics/comparison_charts/round_1_chrF.png


  Saved: aggregated_metrics/comparison_charts/round_1_ter.png


  Saved: aggregated_metrics/comparison_charts/round_1_bert.png


  Saved: aggregated_metrics/comparison_charts/round_2_bleu.png


  Saved: aggregated_metrics/comparison_charts/round_2_chrF.png


  Saved: aggregated_metrics/comparison_charts/round_2_ter.png


  Saved: aggregated_metrics/comparison_charts/round_2_bert.png


  Saved: aggregated_metrics/comparison_charts/round_3_bleu.png


  Saved: aggregated_metrics/comparison_charts/round_3_chrF.png


  Saved: aggregated_metrics/comparison_charts/round_3_ter.png


  Saved: aggregated_metrics/comparison_charts/round_3_bert.png


  Saved: aggregated_metrics/comparison_charts/round_4_bleu.png


  Saved: aggregated_metrics/comparison_charts/round_4_chrF.png


  Saved: aggregated_metrics/comparison_charts/round_4_ter.png


  Saved: aggregated_metrics/comparison_charts/round_4_bert.png


  Saved: aggregated_metrics/comparison_charts/round_5_bleu.png


  Saved: aggregated_metrics/comparison_charts/round_5_chrF.png


  Saved: aggregated_metrics/comparison_charts/round_5_ter.png


  Saved: aggregated_metrics/comparison_charts/round_5_bert.png

Generating overall summary (Average of Round Averages)...
  Saved summary data: aggregated_metrics/comparison_charts/overall_average_of_rounds.csv


  Saved: aggregated_metrics/comparison_charts/overall_average_bleu.png


  Saved: aggregated_metrics/comparison_charts/overall_average_chrF.png


  Saved: aggregated_metrics/comparison_charts/overall_average_ter.png


  Saved: aggregated_metrics/comparison_charts/overall_average_bert.png
