# Evaluation Results

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

## WER

In [None]:
def load_mean_wer(models, languages, root="outputs"):
    """Collect the mean per-sample WER for every (model, language) pair.

    Reads ``{root}/{language}/transcription/{model}.json`` and averages
    ``sample["metrics"]["wer"]`` over the entries of the file's ``"samples"``
    list.

    Args:
        models: Model names; each is used as the JSON file stem.
        languages: Language codes; each is used as the sub-directory name.
        root: Directory that contains the per-language output folders.

    Returns:
        A list of ``{"model": ..., "language": ..., "WER": ...}`` dicts in
        iteration order (models outer, languages inner). Pairs whose file is
        missing, unreadable, malformed or contains no samples are skipped,
        and a notice is printed so the gap is visible in the cell output.
    """
    records = []

    for model in models:
        for language in languages:
            file_path = f"{root}/{language}/transcription/{model}.json"

            try:
                with open(file_path, encoding="utf-8") as f:
                    data = json.load(f)

                wers = [sample["metrics"]["wer"] for sample in data["samples"]]
                # An empty sample list has no meaningful average.
                mean_wer = sum(wers) / len(wers) if wers else None
            except OSError as exc:
                # Missing or unreadable file: not every pair has been evaluated.
                print(f"Skipping {file_path}: {exc}")
                continue
            except (ValueError, KeyError, TypeError) as exc:
                # Invalid JSON (JSONDecodeError is a ValueError) or an
                # unexpected schema. Stay best-effort, but say so.
                print(f"Skipping {file_path}: malformed results ({exc!r})")
                continue

            if mean_wer is None:
                print(f"Skipping {file_path}: no samples")
                continue

            records.append({"model": model, "language": language, "WER": mean_wer})

    return records


results = load_mean_wer(
    ["whisper-large-v3", "whisper-large-v3-turbo"],
    ["en", "de", "cs", "pl", "hu"],
)

In [38]:
# Display the collected records: one dict per (model, language) pair that was
# loaded, holding the mean WER over that file's samples.
results

[{'model': 'whisper-large-v3', 'language': 'en', 'WER': 0.08998028571428571},
 {'model': 'whisper-large-v3', 'language': 'de', 'WER': 0.16997333333333334},
 {'model': 'whisper-large-v3', 'language': 'cs', 'WER': 0.12373633125556546},
 {'model': 'whisper-large-v3', 'language': 'pl', 'WER': 0.09633522665210269},
 {'model': 'whisper-large-v3', 'language': 'hu', 'WER': 0.18015702702702702},
 {'model': 'whisper-large-v3-turbo',
  'language': 'de',
  'WER': 0.1891611280487805},
 {'model': 'whisper-large-v3-turbo',
  'language': 'cs',
  'WER': 0.1319052537845058},
 {'model': 'whisper-large-v3-turbo',
  'language': 'pl',
  'WER': 0.10391556526488258},
 {'model': 'whisper-large-v3-turbo',
  'language': 'hu',
  'WER': 0.20494153153153152}]

In [None]:
# Grouped bar chart of mean WER: one group per language, one bar per model.

# Organize data by model and language. dict.fromkeys de-duplicates while
# keeping first-seen order, so bar order, colors and legend entries are the
# same on every run (iteration order of a set of strings is not stable
# across kernel restarts).
models = list(dict.fromkeys(r["model"] for r in results))
languages = list(dict.fromkeys(r["language"] for r in results))

# Create a lookup for WER values
wer_lookup = {(r["model"], r["language"]): r["WER"] for r in results}

# Set up the plot
x = np.arange(len(languages))
# 0.35 is the bar width for up to two models; with more models the bars are
# narrowed so that neighbouring language groups never overlap.
width = min(0.35, 0.8 / max(len(models), 1))
fig, ax = plt.subplots(figsize=(10, 6))

# Plot bars for each model
colors = ["#2ecc71", "#e74c3c"]
for i, model in enumerate(models):
    # A missing (model, language) pair is drawn as a zero-height bar.
    wer_values = [wer_lookup.get((model, lang), 0) for lang in languages]
    # Centre the cluster of bars on each tick position.
    offset = width * (i - len(models) / 2 + 0.5)
    bars = ax.bar(
        x + offset,
        wer_values,
        width,
        label=model,
        # Cycle through the palette instead of indexing past its end.
        color=colors[i % len(colors)],
        edgecolor="white",
    )

    # Add value labels on bars (skipped for the zero-height placeholders)
    for bar, val in zip(bars, wer_values):
        if val > 0:
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_height() + 0.005,
                f"{val:.2%}",
                ha="center",
                va="bottom",
                fontsize=9,
            )

# Customize the plot
ax.set_xlabel("Language", fontsize=12, fontweight="bold")
ax.set_ylabel("Word Error Rate (WER)", fontsize=12, fontweight="bold")
ax.set_title("WER Comparison by Model and Language", fontsize=14, fontweight="bold")
ax.set_xticks(x)
ax.set_xticklabels([lang.upper() for lang in languages], fontsize=11)
if models:
    ax.legend(title="Model", loc="upper right")
# Leave 20% headroom for the value labels; keep the default limits when there
# is nothing to plot (max() of an empty sequence would raise).
max_wer = max((r["WER"] for r in results), default=0)
if max_wer > 0:
    ax.set_ylim(0, max_wer * 1.2)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))

plt.tight_layout()
plt.show()


## Translation evaluations

### German to English

In [None]:
# Load the LLM-judge results for the German (de) outputs.
# The path is relative to the working directory, like the transcription paths
# used for the WER results, so the notebook does not depend on one user's
# home directory. UTF-8 is requested explicitly because the source texts
# contain non-ASCII characters.
with open(
    "outputs/de/evals/en_whisper-large-v3_gpt-4o-mini_llm_judge.json",
    encoding="utf-8",
) as f:
    data = json.load(f)

for k, v in data.items():




{'20130910-0900-PLENARY-12-de_20130910-16:01:47_6': {'pair_id': '20130910-0900-PLENARY-12-de_20130910-16:01:47_6',
  'source_text': ' um den tatsächlichen Bedarf bis Ende dieses Jahres abzudecken.',
  'translated_text': 'to cover the actual demand by the end of this year.',
  'num_errors': 1,
  'errors': [{'error': 'No errors found in the translation.',
    'classification': 'no-error',
    'severity': 1,
    'snippet': 'to cover the actual demand by the end of this year.'}]},
 '20090324-0900-PLENARY-12-de_20090324-19:34:22_0': {'pair_id': '20090324-0900-PLENARY-12-de_20090324-19:34:22_0',
  'source_text': ' Ja, schönen Dank, Frau Präsidentin.',
  'translated_text': 'Yes, thank you very much, Madam President.',
  'num_errors': 1,
  'errors': [{'error': 'No errors detected in the translation.',
    'classification': 'no-error',
    'severity': 1,
    'snippet': 'Yes, thank you very much, Madam President.'}]},
 '20130910-0900-PLENARY-12-de_20130910-16:01:47_3': {'pair_id': '20130910-0900

In [None]:
from collections import Counter


def _error_class(error):
    """Return the class label for one judge error entry, or None to skip it.

    String entries are taken to be the class name itself. Dict entries are
    looked up under "classification" (the key present in the loaded judge
    output) and then under the legacy "type" / "class" / "category" keys;
    dicts with none of these are labelled "unknown". Any other entry type is
    ignored.
    """
    if isinstance(error, str):
        return error
    if isinstance(error, dict):
        for key in ("classification", "type", "class", "category"):
            if error.get(key):
                return error[key]
        return "unknown"
    return None


# Aggregate errors by class
# NOTE(review): the judge output appears to record a clean translation as a
# single "no-error" entry with num_errors == 1, so both the total and the
# chart include those entries. Confirm whether they should be excluded.
error_counts = Counter()
total_errors = 0

for sample_id, sample_data in data.items():
    # `or` also covers keys that are present but null.
    total_errors += sample_data.get("num_errors") or 0

    for error in sample_data.get("errors") or []:
        error_class = _error_class(error)
        if error_class is not None:
            error_counts[error_class] += 1

print(f"Total Errors: {total_errors}")
print("\nErrors by Class:")
for error_class, count in error_counts.most_common():
    print(f"  {error_class}: {count}")

# Create bar chart
if error_counts:
    fig, ax = plt.subplots(figsize=(10, 6))

    # most_common() yields (class, count) pairs sorted by count, descending.
    classes, counts = zip(*error_counts.most_common())

    colors = plt.cm.Reds(np.linspace(0.4, 0.8, len(classes)))
    bars = ax.bar(classes, counts, color=colors, edgecolor="white")

    # Add value labels on bars
    for bar, count in zip(bars, counts):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.5,
            str(count),
            ha="center",
            va="bottom",
            fontsize=10,
            fontweight="bold",
        )

    ax.set_xlabel("Error Class", fontsize=12, fontweight="bold")
    ax.set_ylabel("Count", fontsize=12, fontweight="bold")
    ax.set_title(
        f"Error Distribution by Class (Total: {total_errors})",
        fontsize=14,
        fontweight="bold",
    )
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()
else:
    print("No errors found to plot.")


In [None]:
from dotenv import load_dotenv

In [15]:
# Load variables from a local .env file into the process environment.
# NOTE(review): nothing in the visible cells reads these variables afterwards;
# confirm this call is still needed.
load_dotenv()

True

In [None]:
# Load the translation output stored under outputs/hu for the
# whisper-large-v3 + gpt-4o-mini pipeline.
# The path is relative to the working directory, like the transcription paths
# used for the WER results, so the notebook does not depend on one user's
# home directory. UTF-8 is requested explicitly to avoid platform-dependent
# decoding.
with open(
    "outputs/hu/translation/en_whisper-large-v3_gpt-4o-mini.json",
    encoding="utf-8",
) as f:
    data = json.load(f)


In [None]:
# Peek at the audio id of the first sample in the loaded translation file.
data["samples"][0]["audio_info"]["audio_id"]

'20170213-0900-PLENARY-11-hu_20170213-18:36:37_1'