# Eval Cv22
- Legacy: the code in this notebook has since been refactored into the eval scripts.
- This notebook was used for the initial exploration of the CV22 dataset and for building the eval pipeline.

## Variables, file utils

In [3]:
from dotenv import load_dotenv
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import json

# Resolve the notebook's parent directory and register it on sys.path (once)
# so that imports can resolve against it.
REPO_ROOT = Path('..').resolve()
if sys.path.count(str(REPO_ROOT)) == 0:
    sys.path.append(str(REPO_ROOT))

# Load .env with override=True, then echo the TEST variable as a quick check
# that the file was actually read.
load_dotenv(override=True)
print(os.getenv("TEST"))


Ok. Reading from .env file


## CV22 manager class
- simple class to easily work with Common Voice dataset

In [26]:
from audio_eval import CommonVoiceDataset


In [26]:
# Collect and print the test-split statistics of every language found in CV22
cv = CommonVoiceDataset(os.getenv("CV22_PATH"))
all_stats = {}
for lang in cv.languages:
    # Store the stats under the language code and echo them in the same pass.
    stats = all_stats[lang] = cv.get_language_stats(lang, split="test")
    print(f"{lang}: {stats}")

Found 10 languages: ['ru', 'it', 'en', 'es', 'ja']...
ru: {'total_samples': 10244, 'available_splits': ['train', 'dev', 'test', 'validated', 'invalidated', 'other', 'reported'], 'unique_speakers': 2033, 'avg_up_votes': 2.0807301835220615, 'high_quality_samples': 10244}
it: {'total_samples': 15177, 'available_splits': ['train', 'dev', 'test', 'validated', 'invalidated', 'other', 'reported'], 'unique_speakers': 3848, 'avg_up_votes': 2.1311853462476114, 'high_quality_samples': 15177}
en: {'total_samples': 16396, 'available_splits': ['train', 'dev', 'test', 'validated', 'invalidated', 'other', 'reported'], 'unique_speakers': 12082, 'avg_up_votes': 2.2098072700658697, 'high_quality_samples': 16396}
es: {'total_samples': 15893, 'available_splits': ['train', 'dev', 'test', 'validated', 'invalidated', 'other', 'reported'], 'unique_speakers': 6550, 'avg_up_votes': 2.0812936512930222, 'high_quality_samples': 15893}
ja: {'total_samples': 8004, 'available_splits': ['train', 'dev', 'test', 'validat

In [None]:
from pprint import pprint

# Build a dataset handle and show which languages it exposes
cv_dataset = CommonVoiceDataset(os.getenv("CV22_PATH"))
print(cv_dataset.languages)

# Request five samples from the English test split
samples_df = cv_dataset.get_samples("en", split="test", n_samples=5)

# Expand the first returned row into a record that carries the audio path and metadata
sample_with_audio = cv_dataset.get_sample_with_audio("en", samples_df.iloc[0])

# Show where the clip lives and the text stored for it
pprint(sample_with_audio["audio_path"])
pprint(sample_with_audio["text"])

## Sample transcription endpoints

In [34]:
# Install the provider SDKs exercised by the cells below, plus jiwer, using uv; -q keeps the output quiet.
!uv pip install openai groq speechmatics-python elevenlabs jiwer -q

In [10]:
from dotenv import load_dotenv

load_dotenv(override=True)

# Reference clip and the sentence expected from it, shared by every provider cell below.
# NOTE(review): this is an absolute, machine-specific path — consider deriving it
# from CV22_PATH once the expected directory layout is confirmed.
SAMPLE_AUDIO_FILE = (
    "/root/data/common_voice_22/cv-corpus-22.0-2025-06-20"
    "/en/clips/common_voice_en_77702.mp3"
)
SAMPLE_TEXT = "He was thinking about omens, and someone had appeared."

### OpenAI

In [30]:
from openai import OpenAI

client = OpenAI()

# Open the clip inside a context manager so the handle is closed as soon as the
# request returns, instead of staying open for the lifetime of the kernel.
with open(SAMPLE_AUDIO_FILE, "rb") as audio_file:
    transcription = client.audio.transcriptions.create(model="gpt-4o-transcribe", file=audio_file)

print(transcription.text)

He was thinking about omens, and someone had appeared.


### Groq

In [31]:
import os
from groq import Groq

client = Groq()
filename = SAMPLE_AUDIO_FILE

# The clip is read fully into memory and sent as a (name, bytes) pair.
with open(filename, "rb") as file:
    transcription = client.audio.transcriptions.create(
        model="whisper-large-v3-turbo",
        response_format="verbose_json",
        file=(filename, file.read()),
    )

# The file is no longer needed once the response is back.
print(transcription.text)

 He was thinking about omens and someone had appeared.


### ElevenLabs

In [None]:
from elevenlabs import ElevenLabs

# Create the client with the key taken from the environment
client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

# Hand the open clip to the speech-to-text endpoint
with open(SAMPLE_AUDIO_FILE, "rb") as f:
    transcript = client.speech_to_text.convert(
        model_id="scribe_v1",  # or "scribe_v1_experimental"
        language_code="en",  # optional
        file=f,
    )

# A previous attempt was rejected with this quota error:
# 'This request exceeds your API key quota of 10. You have 0 credits remaining, while 5 credits are required for this request.'
print(transcript.text)

### Speechmatics


In [None]:
# https://github.com/speechmatics/speechmatics-python-sdk

from speechmatics.models import ConnectionSettings, BatchTranscriptionConfig
from speechmatics.batch_client import BatchClient
from httpx import HTTPStatusError

API_KEY = os.getenv("SPEECHMATICS_API_KEY")
PATH_TO_FILE = SAMPLE_AUDIO_FILE
LANGUAGE = "en"

# Connection settings for the batch (non-realtime) endpoint
settings = ConnectionSettings(
    auth_token=API_KEY,
    url="https://asr.api.speechmatics.com/v2",  # Batch API endpoint
)

with BatchClient(settings) as client:
    try:
        # Queue the clip for transcription
        job_id = client.submit_job(PATH_TO_FILE, BatchTranscriptionConfig(language=LANGUAGE))
        print(f"Job {job_id} submitted. Waiting for completion...")

        # Block until the job finishes; other formats include json-v2, srt, etc.
        transcript = client.wait_for_completion(job_id, transcription_format="txt")
        print("Transcript:\n", transcript)

    except HTTPStatusError as e:
        # Only 400 and 401 get a friendly message; anything else propagates.
        if e.response.status_code not in (400, 401):
            raise e
        if e.response.status_code == 401:
            print("Invalid API key – check your API_KEY.")
        else:
            print("Bad request:", e.response.json().get("detail"))


### Gladia

In [None]:
import requests
import time
import os

import mimetypes

API_KEY = os.getenv("GLADIA_API_KEY")
AUDIO_FILE = SAMPLE_AUDIO_FILE

REQUEST_TIMEOUT_S = 60  # per-request cap so a stalled connection cannot hang the cell
POLL_INTERVAL_S = 2  # pause between two status checks
POLL_TIMEOUT_S = 300  # stop waiting if the job has not finished after this long

headers = {"x-gladia-key": API_KEY}

# 1. Upload audio file
# Derive the MIME type from the file name: the sample clip is an .mp3, so the
# previously hardcoded "audio/wav" mislabelled the upload.
mime_type = mimetypes.guess_type(AUDIO_FILE)[0] or "application/octet-stream"
with open(AUDIO_FILE, "rb") as f:
    resp = requests.post(
        "https://api.gladia.io/v2/upload",
        headers=headers,
        files={"audio": (os.path.basename(AUDIO_FILE), f, mime_type)},
        timeout=REQUEST_TIMEOUT_S,
    )
resp.raise_for_status()
file_url = resp.json()["audio_url"]
print("Uploaded:", file_url)

# 2. Request transcription
payload = {"audio_url": file_url}
resp = requests.post(
    "https://api.gladia.io/v2/pre-recorded",
    headers={**headers, "Content-Type": "application/json"},
    json=payload,
    timeout=REQUEST_TIMEOUT_S,
)
resp.raise_for_status()
job = resp.json()
job_id, result_url = job["id"], job["result_url"]
print("Job ID:", job_id)

# 3. Poll until done, failed, or the deadline passes
deadline = time.monotonic() + POLL_TIMEOUT_S
while True:
    resp = requests.get(result_url, headers=headers, timeout=REQUEST_TIMEOUT_S)
    resp.raise_for_status()
    data = resp.json()
    status = data["status"]
    if status == "done":
        transcript = data["result"]["transcription"]["full_transcript"]
        print("\nTranscription:\n", transcript)
        break
    if status == "error":
        print("Error:", data)
        break
    # Any other status means the job is still in progress; bound the wait so a
    # stuck job cannot block the notebook forever.
    if time.monotonic() >= deadline:
        raise TimeoutError(f"Gladia job {job_id} still in status {status!r} after {POLL_TIMEOUT_S}s")
    time.sleep(POLL_INTERVAL_S)


## ✅ Transcription functions 

In [None]:
from audio_eval import (
    DEFAULT_SERVICE_MODELS,
    build_service_function_map,
    transcribe_elevenlabs,
    transcribe_gladia,
    transcribe_groq,
    transcribe_menlo,
    transcribe_openai,
    transcribe_speechmatics,
    transcribe_vllm,
)


## ✅ Text normalization functions
- Borrowed from the Hugging Face Open ASR Leaderboard repo

In [28]:
from audio_eval import (
    DEFAULT_NORMALIZER,
    english_normalizer,
    get_text_normalizer,
    multilingual_normalizer,
)

# Notebook-level handle to the default normalizer exported by audio_eval.
# NOTE(review): the evaluation cells further down pass get_text_normalizer instead —
# confirm whether this alias is still needed.
text_normalizer = DEFAULT_NORMALIZER


## Service Transcribe Functions

Define transcribe functions for each ASR service with the same interface: `transcribe(audio_path)` -> str

In [15]:
# Work on a private copy of the package's default service/model configuration
SERVICE_MODELS = [*DEFAULT_SERVICE_MODELS]

# Map each configured service/model to its transcribe function,
# then keep the map's keys as the list of service names
SERVICE_FUNCS = build_service_function_map(SERVICE_MODELS)
SERVICES = [*SERVICE_FUNCS.keys()]
print(f"SERVICES: {SERVICES}")
print(f"Len SERVICES: {len(SERVICES)}")


SERVICES: ['menlo_large-v3', 'vllm_large-v3', 'openai_gpt-4o-transcribe', 'openai_gpt-4o-mini-transcribe', 'openai_whisper-1', 'groq_whisper-large-v3', 'groq_whisper-large-v3-turbo', 'elevenlabs_scribe_v1', 'gladia_pre-recorded']
Len SERVICES: 9


## WER Evaluation Pipeline

Functions for running WER evaluation across multiple services with retry, error handling, and checkpointing.

In [24]:
from audio_eval import run_wer_evaluation, run_wer_evaluation_parallel


## Run WER Evaluation

Example usage of the WER evaluation pipeline.

In [29]:
# Smoke run: all services on a single English sample (PARALLEL VERSION)
from datetime import datetime

# One timestamp per run ties the results, checkpoint and log files together.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

print(f"Number of services: {len(SERVICES)}")
print(f"Services: {SERVICES}")
results = run_wer_evaluation_parallel(
    dataset_path=os.getenv("CV22_PATH"),
    languages=["en"],  # English only
    n_samples=1,  # 1 sample
    services=SERVICES,  # All services
    service_funcs=SERVICE_FUNCS,
    normalizer_resolver=get_text_normalizer,
    max_retries=2,  # Fewer retries for testing
    max_workers=8,  # Run up to 8 services in parallel per sample
    results_file=f"results/wer_results_{timestamp}.json",
    checkpoint_file=f"results/wer_checkpoint_{timestamp}.json",
    log_file=f"logs/wer_eval_{timestamp}.log",
)

# Summarise the returned metrics: one line per service, grouped by language
print("\nFinal results:")
for lang, data in results.items():
    print(f"{lang}:")
    for service, metrics in data.items():
        print(
            f"  {service}: WER={metrics['wer']:.4f}, "
            f"Avg Time={metrics['timing']:.2f}s, "
            f"Samples={metrics['n_samples']}"
        )


Processing language: en
Processing language: en
Processing language: en
Processing language: en
Processing language: en
Evaluating 8 services on en
Evaluating 8 services on en
Evaluating 8 services on en
Evaluating 8 services on en
Evaluating 8 services on en
Evaluating 8 services on en


Number of services: 8
Services: ['menlo_large-v3', 'openai_gpt-4o-transcribe', 'openai_gpt-4o-mini-transcribe', 'openai_whisper-1', 'groq_whisper-large-v3', 'groq_whisper-large-v3-turbo', 'elevenlabs_scribe_v1', 'gladia_pre-recorded']
Found 10 languages: ['ru', 'it', 'en', 'es', 'ja']...


menlo_large-v3 on en: WER=0.0000, Avg Time=0.50s, Samples=1
menlo_large-v3 on en: WER=0.0000, Avg Time=0.50s, Samples=1
menlo_large-v3 on en: WER=0.0000, Avg Time=0.50s, Samples=1
openai_gpt-4o-transcribe on en: WER=0.0000, Avg Time=1.29s, Samples=1
openai_gpt-4o-transcribe on en: WER=0.0000, Avg Time=1.29s, Samples=1
openai_gpt-4o-transcribe on en: WER=0.0000, Avg Time=1.29s, Samples=1
openai_gpt-4o-mini-transcribe on en: WER=0.0000, Avg Time=1.44s, Samples=1
openai_gpt-4o-mini-transcribe on en: WER=0.0000, Avg Time=1.44s, Samples=1
openai_gpt-4o-mini-transcribe on en: WER=0.0000, Avg Time=1.44s, Samples=1
openai_whisper-1 on en: WER=0.0000, Avg Time=1.14s, Samples=1
menlo_large-v3 on en: WER=0.0000, Avg Time=0.50s, Samples=1
menlo_large-v3 on en: WER=0.0000, Avg Time=0.50s, Samples=1
openai_gpt-4o-transcribe on en: WER=0.0000, Avg Time=1.29s, Samples=1
openai_gpt-4o-transcribe on en: WER=0.0000, Avg Time=1.29s, Samples=1
openai_gpt-4o-transcribe on en: WER=0.0000, Avg Time=1.29s, Sam


Final results:
en:
  menlo_large-v3: WER=0.0000, Avg Time=0.50s, Samples=1
  openai_gpt-4o-transcribe: WER=0.0000, Avg Time=1.29s, Samples=1
  openai_gpt-4o-mini-transcribe: WER=0.0000, Avg Time=1.44s, Samples=1
  openai_whisper-1: WER=0.0000, Avg Time=1.14s, Samples=1
  groq_whisper-large-v3: WER=0.0000, Avg Time=0.60s, Samples=1
  groq_whisper-large-v3-turbo: WER=0.0000, Avg Time=0.62s, Samples=1
  elevenlabs_scribe_v1: WER=0.0000, Avg Time=4.92s, Samples=1
  gladia_pre-recorded: WER=0.0000, Avg Time=18.64s, Samples=1


In [None]:
# Full run: every language, 50 samples each, all services (PARALLEL VERSION)
from datetime import datetime

# One timestamp per run ties the results, checkpoint and log files together.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Ask the dataset which languages are available
cv_dataset = CommonVoiceDataset(os.getenv("CV22_PATH"))
all_languages = cv_dataset.languages
print(f"Found {len(all_languages)} languages: {all_languages}")

print(f"Number of services: {len(SERVICES)}")
print(f"Services: {SERVICES}")

results = run_wer_evaluation_parallel(
    dataset_path=os.getenv("CV22_PATH"),
    languages=all_languages,  # All languages
    n_samples=50,  # 50 samples per language
    services=SERVICES,  # All services
    service_funcs=SERVICE_FUNCS,
    normalizer_resolver=get_text_normalizer,
    max_retries=2,  # Fewer retries for testing
    max_workers=8,  # Run up to 8 services in parallel per sample
    results_file=f"results/wer_results_all_{timestamp}.json",
    checkpoint_file=f"results/wer_checkpoint_all_{timestamp}.json",
    log_file=f"logs/wer_eval_all_{timestamp}.log",
)

# Summarise the returned metrics: one line per service, grouped by language
print("\nFinal results:")
for lang, data in results.items():
    print(f"\n{lang.upper()}:")
    for service, metrics in data.items():
        print(
            f"  {service}: WER={metrics['wer']:.4f}, "
            f"Avg Time={metrics['timing']:.2f}s, "
            f"Samples={metrics['n_samples']}"
        )

In [None]:
# Smoke-test the per-language normalizers on one sentence per language.
# NOTE(review): the recorded output for "ja" and "hi" shows marks being stripped
# (e.g. "誰かが" -> "誰かか", Devanagari vowel signs dropped) — confirm this is
# intended before relying on WER numbers for those languages.
test_texts = {
    "en": "He was thinking about omens, and someone had appeared.",
    "es": "Él estaba pensando en augurios, y alguien había aparecido.",
    "fr": "Il pensait aux présages, et quelqu'un était apparu.",
    "de": "Er dachte an Omen, und jemand war erschienen.",
    "ja": "彼は前兆について考えていて、誰かが現れた。",
    "zh": "他在思考预兆，有人出现了。",
    "ru": "Он думал о предзнаменованиях, и кто-то появился.",
    "ar": "كان يفكر في النذير، وظهر شخص ما.",
    "hi": "वह पूर्वाभास के बारे में सोच रहा था, और कोई प्रकट हुआ था।",
    "pt": "Ele estava pensando em presságios, e alguém havia aparecido.",
}

print("Testing language-specific normalizers:")
for lang, text in test_texts.items():
    # Resolve the normalizer for this language code and apply it to the sample.
    normalizer = get_text_normalizer(lang)
    normalized = normalizer(text)
    # Original line, normalized line, then a blank separator line.
    print(f"{lang.upper()}: {text}\n       -> {normalized}\n")

Testing language-specific normalizers:
EN: He was thinking about omens, and someone had appeared.
       -> he was thinking about omens and someone had appeared

ES: Él estaba pensando en augurios, y alguien había aparecido.
       -> el estaba pensando en augurios y alguien habia aparecido

FR: Il pensait aux présages, et quelqu'un était apparu.
       -> il pensait aux presages et quelqu un etait apparu

DE: Er dachte an Omen, und jemand war erschienen.
       -> er dachte an omen und jemand war erschienen

JA: 彼は前兆について考えていて、誰かが現れた。
       -> 彼は前兆について考えていて 誰かか現れた

ZH: 他在思考预兆，有人出现了。
       -> 他在思考预兆 有人出现了

RU: Он думал о предзнаменованиях, и кто-то появился.
       -> он думал о предзнаменованиях и кто то появился

AR: كان يفكر في النذير، وظهر شخص ما.
       -> كان يفكر في النذير وظهر شخص ما

HI: वह पूर्वाभास के बारे में सोच रहा था, और कोई प्रकट हुआ था।
       -> वह परव भ स क ब र म स च रह थ और क ई परकट हआ थ

PT: Ele estava pensando em presságios, e alguém havia aparecido.
       -> el