## 2 Whisper Baseline

Before fine-tuning our own model, we will first decode with the whisper model to get a baseline word error rate.

Imports and configuration

In [1]:
import os
import torch
import whisper
import pandas as pd
import numpy as np
from tqdm import tqdm
from jiwer import wer, cer
from bs4 import BeautifulSoup
import re
from pathlib import Path
import warnings
import setproctitle
warnings.filterwarnings("ignore")

PROCESS_NAME = "romansh-asr"
setproctitle.setproctitle(PROCESS_NAME)

DATA_PATH = "romansh-data/sursilvan-mini/"
WHISPER_MODEL = "medium"
BATCH_SIZE = 8
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
OUTPUT_FILE = "whisper_baseline_results.txt"

Checking GPU availability

In [2]:
print("="*60)
print("Whisper Baseline for Romansh ASR")
print("="*60)
print(f"Model: {WHISPER_MODEL}")
print(f"Device: {DEVICE}")
print(f"Data path: {DATA_PATH}")
print("="*60)

if DEVICE == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("CUDA not available - running on CPU (this will be slow!)")

Whisper Baseline for Romansh ASR
Model: medium
Device: cuda
Data path: romansh-data/sursilvan-mini/
GPU: NVIDIA GeForce RTX 3090
GPU Memory: 25.30 GB


Defining some helpful helper functions

In [7]:
def clean_html(text):
    """Remove HTML tags from text"""
    if pd.isna(text) or not isinstance(text, str):
        return ""
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def load_data(tsv_path):
    """Load and prepare data from TSV file"""
    print(f"Loading data from {tsv_path}")
    df = pd.read_csv(tsv_path, sep='\t')
    
    print("Cleaning HTML from sentences...")
    df['clean_sentence'] = df['sentence'].apply(clean_html)
    
    print("\nSample cleaning results:")
    for i in range(min(3, len(df))):
        print(f"Original: {df['sentence'].iloc[i][:100]}...")
        print(f"Cleaned:  {df['clean_sentence'].iloc[i][:100]}...")
        print()
    
    return df

def calculate_metrics(references, hypotheses):
    """Calculate WER and CER with better error handling"""
    # Filter out empty or invalid pairs
    valid_pairs = []
    for ref, hyp in zip(references, hypotheses):
        # Skip if reference is empty or just whitespace
        if not isinstance(ref, str) or not ref.strip():
            continue
        # Skip if hypothesis is empty (but still count these as errors if reference exists)
        if not isinstance(hyp, str):
            hyp = ""
        
        valid_pairs.append((ref.strip(), hyp.strip()))
    
    if not valid_pairs:
        print("Warning: No valid reference-hypothesis pairs found")
        return None, None
    
    refs, hyps = zip(*valid_pairs)
    
    try:
        wer_score = wer(refs, hyps)
        cer_score = cer(refs, hyps)
        return wer_score, cer_score
    except Exception as e:
        print(f"Warning: Error calculating metrics: {e}")
        print(f"First few references: {refs[:3]}")
        print(f"First few hypotheses: {hyps[:3]}")
        return None, None

Loading the whisper model

In [8]:
print(f"\nLoading Whisper {WHISPER_MODEL} model...")
model = whisper.load_model(WHISPER_MODEL, device=DEVICE)
print("Model loaded successfully!")


Loading Whisper medium model...


Model loaded successfully!


Decoding with the whisper model

In [9]:
def transcribe_all_with_language(model, audio_paths):
    """Transcribe all audio files with a single progress bar"""
    transcriptions = []
    detected_languages = []
    
    # Single progress bar for all files
    for audio_path in tqdm(audio_paths, desc="Transcribing audio files"):
        try:
            if not os.path.exists(audio_path):
                transcriptions.append("")
                detected_languages.append("missing")
                continue
            
            result = model.transcribe(
                audio_path,
                task="transcribe",
                fp16=torch.cuda.is_available(),
                verbose=False
            )
            
            transcriptions.append(result['text'].strip())
            detected_languages.append(result.get('language', 'unknown'))
            
        except Exception as e:
            transcriptions.append("")
            detected_languages.append("error")
    
    return transcriptions, detected_languages

Processing all splits

In [12]:
splits = ['train', 'validated', 'test']
all_results = {}
language_summary = {}

for split in splits:
    print(f"\n{'='*50}")
    print(f"Processing {split} split")
    print(f"{'='*50}")
    
    tsv_path = os.path.join(DATA_PATH, f"{split}.tsv")
    clips_path = os.path.join(DATA_PATH, "clips")
    
    if not os.path.exists(tsv_path):
        print(f"{split}.tsv not found, skipping...")
        continue
    
    df = load_data(tsv_path)
    
    audio_paths = [os.path.join(clips_path, path) for path in df['path']]
    
    existing_indices = [i for i, path in enumerate(audio_paths) if os.path.exists(path)]
    missing_count = len(audio_paths) - len(existing_indices)
    
    if missing_count > 0:
        print(f"{missing_count} audio files missing, filtering them out")
        df = df.iloc[existing_indices].reset_index(drop=True)
        audio_paths = [audio_paths[i] for i in existing_indices]
    
    print(f"Total utterances: {len(df)}")
    
    print(f"\nRunning Whisper transcription...")
    hypotheses, detected_langs = transcribe_all_with_language(model, audio_paths)
    
    df['detected_language'] = detected_langs
    
    references = df['clean_sentence'].tolist()
    wer_score, cer_score = calculate_metrics(references, hypotheses)
    
    all_results[split] = {
        'wer': wer_score,
        'cer': cer_score,
        'count': len(df),
        'references': references[:5],
        'hypotheses': hypotheses[:5],
        'languages': detected_langs[:5]
    }
    
    lang_counts = df['detected_language'].value_counts()
    language_summary[split] = lang_counts
    
    print(f"\nResults for {split}:")
    print(f"   Utterances: {len(df)}")
    if wer_score is not None:
        print(f"   WER: {wer_score:.4f} ({wer_score*100:.2f}%)")
        print(f"   CER: {cer_score:.4f} ({cer_score*100:.2f}%)")
    else:
        print("   No valid reference-hypothesis pairs")
    
    print(f"\nLanguage detection distribution:")
    for lang, count in lang_counts.items():
        print(f"   {lang}: {count} ({count/len(df)*100:.1f}%)")


Processing train split
Loading data from romansh-data/sursilvan-mini/train.tsv
Cleaning HTML from sentences...

Sample cleaning results:
Original: <p><span b="2442.334" e="2442.484" s="100" data-index="0" class>Il</span> <span b="2442.484" e="2442...
Cleaned:  Il davos temps vegn bia discutau dalla rolla dalla dunna ella baselgia catolica, co vesis vus quella...

Original: <p><span b="806.922" e="807.322" s="83" data-index="0" class>Suenter</span> <span b="807.352" e="807...
Cleaned:  Suenter ina prelecziun facultativa en dretg da bancas el studi ha quei plaschiu fetg bein a mi. Quei...

Original: <p>E per veser quei stuein nus far in viadi da rodund 80 tochen varga 100 kilometers sut la tiara. <...
Cleaned:  E per veser quei stuein nus far in viadi da rodund 80 tochen varga 100 kilometers sut la tiara. Schi...

Total utterances: 92

Running Whisper transcription...


Transcribing audio files:   0%|          | 0/92 [00:00<?, ?it/s]

Detected language: Slovenian




100%|██████████| 672/672 [00:01<00:00, 504.86frames/s]
Transcribing audio files:   1%|          | 1/92 [00:01<02:27,  1.62s/it]

Detected language: German


100%|██████████| 2392/2392 [00:02<00:00, 890.17frames/s]
Transcribing audio files:   2%|▏         | 2/92 [00:04<03:37,  2.41s/it]

Detected language: Albanian


100%|██████████| 2031/2031 [00:02<00:00, 709.38frames/s]
Transcribing audio files:   3%|▎         | 3/92 [00:07<04:05,  2.75s/it]

Detected language: Italian


100%|██████████| 1350/1350 [00:01<00:00, 791.22frames/s]
Transcribing audio files:   4%|▍         | 4/92 [00:09<03:36,  2.46s/it]

Detected language: Lithuanian


100%|██████████| 3063/3063 [00:03<00:00, 768.31frames/s]
Transcribing audio files:   5%|▌         | 5/92 [00:14<04:32,  3.13s/it]

Detected language: German


100%|██████████| 2325/2325 [00:03<00:00, 639.13frames/s]
Transcribing audio files:   7%|▋         | 6/92 [00:18<04:53,  3.41s/it]

Detected language: German


100%|██████████| 1049/1049 [00:02<00:00, 474.98frames/s]
Transcribing audio files:   8%|▊         | 7/92 [00:20<04:24,  3.11s/it]

Detected language: Portuguese


100%|██████████| 2580/2580 [00:03<00:00, 836.56frames/s]
Transcribing audio files:   9%|▊         | 8/92 [00:23<04:29,  3.20s/it]

Detected language: German


100%|██████████| 1952/1952 [00:09<00:00, 203.20frames/s]
Transcribing audio files:  10%|▉         | 9/92 [00:33<07:20,  5.31s/it]

Detected language: English


100%|██████████| 1868/1868 [00:01<00:00, 1032.66frames/s]
Transcribing audio files:  11%|█         | 10/92 [00:35<05:53,  4.32s/it]

Detected language: Dutch


100%|██████████| 2541/2541 [00:03<00:00, 659.77frames/s]
Transcribing audio files:  12%|█▏        | 11/92 [00:40<05:45,  4.26s/it]

Detected language: Slovenian


100%|██████████| 1934/1934 [00:03<00:00, 518.33frames/s]
Transcribing audio files:  13%|█▎        | 12/92 [00:44<05:35,  4.19s/it]

Detected language: Occitan


100%|██████████| 2365/2365 [00:03<00:00, 633.40frames/s]
Transcribing audio files:  14%|█▍        | 13/92 [00:48<05:26,  4.14s/it]

Detected language: German


100%|██████████| 579/579 [00:01<00:00, 376.02frames/s]
Transcribing audio files:  15%|█▌        | 14/92 [00:49<04:28,  3.45s/it]

Detected language: Occitan


100%|██████████| 1964/1964 [00:01<00:00, 999.85frames/s] 
Transcribing audio files:  16%|█▋        | 15/92 [00:52<03:57,  3.08s/it]

Detected language: German


100%|██████████| 751/751 [00:01<00:00, 586.79frames/s]
Transcribing audio files:  17%|█▋        | 16/92 [00:53<03:20,  2.63s/it]

Detected language: Italian


100%|██████████| 1459/1459 [00:02<00:00, 685.05frames/s]
Transcribing audio files:  18%|█▊        | 17/92 [00:56<03:12,  2.57s/it]

Detected language: German


100%|██████████| 2645/2645 [00:03<00:00, 724.32frames/s]
Transcribing audio files:  20%|█▉        | 18/92 [01:00<03:40,  2.98s/it]

Detected language: Slovenian


100%|██████████| 1548/1548 [00:02<00:00, 584.78frames/s]
Transcribing audio files:  21%|██        | 19/92 [01:03<03:36,  2.96s/it]

Detected language: Latin


100%|██████████| 1665/1665 [00:02<00:00, 709.67frames/s]
Transcribing audio files:  22%|██▏       | 20/92 [01:05<03:26,  2.86s/it]

Detected language: Slovenian


100%|██████████| 1059/1059 [00:01<00:00, 608.13frames/s]
Transcribing audio files:  23%|██▎       | 21/92 [01:07<03:05,  2.62s/it]

Detected language: German


100%|██████████| 2151/2151 [00:14<00:00, 146.82frames/s]
Transcribing audio files:  24%|██▍       | 22/92 [01:22<07:22,  6.32s/it]

Detected language: Albanian


100%|██████████| 1025/1025 [00:01<00:00, 719.31frames/s]
Transcribing audio files:  25%|██▌       | 23/92 [01:24<05:40,  4.94s/it]

Detected language: German


100%|██████████| 2154/2154 [00:02<00:00, 992.61frames/s]
Transcribing audio files:  26%|██▌       | 24/92 [01:26<04:44,  4.19s/it]

Detected language: German


100%|██████████| 2246/2246 [00:10<00:00, 206.95frames/s]
Transcribing audio files:  27%|██▋       | 25/92 [01:38<07:00,  6.27s/it]

Detected language: Slovenian


100%|██████████| 311/311 [00:00<00:00, 620.61frames/s]
Transcribing audio files:  28%|██▊       | 26/92 [01:38<05:05,  4.63s/it]

Detected language: Portuguese


  0%|          | 0/2781 [00:01<?, ?frames/s]
Transcribing audio files:  28%|██▊       | 26/92 [01:40<04:14,  3.86s/it]


KeyboardInterrupt: 

Save results to file

In [None]:
print(f"\nSaving results to {OUTPUT_FILE}")
with open(OUTPUT_FILE, 'w') as f:
    f.write("="*60 + "\n")
    f.write("Whisper Baseline Results\n")
    f.write(f"Model: {WHISPER_MODEL}\n")
    f.write(f"Date: {pd.Timestamp.now()}\n")
    f.write("="*60 + "\n\n")
    
    for split, results in all_results.items():
        f.write(f"\n{split.upper()} Split:\n")
        f.write(f"  Utterances: {results['count']}\n")
        if results['wer'] is not None:
            f.write(f"  WER: {results['wer']:.4f}\n")
            f.write(f"  CER: {results['cer']:.4f}\n")
        
        f.write("\n  Language Distribution:\n")
        for lang, count in language_summary[split].items():
            f.write(f"    {lang}: {count}\n")
        
        f.write("\n  Example transcriptions:\n")
        for i, (ref, hyp, lang) in enumerate(zip(results['references'], 
                                                results['hypotheses'], 
                                                results['languages'])):
            f.write(f"    Ref {i+1}: {ref}\n")
            f.write(f"    Hyp {i+1}: {hyp}\n")
            f.write(f"    Lang {i+1}: {lang}\n\n")

Show summary

In [None]:
print("\n" + "="*60)
print("Summary")
print("="*60)

total_utterances = 0
for split in splits:
    if split in all_results:
        results = all_results[split]
        print(f"\n{split.upper()}:")
        print(f"  Utterances: {results['count']}")
        total_utterances += results['count']
        if results['wer'] is not None:
            print(f"  WER: {results['wer']:.4f}")
            print(f"  CER: {results['cer']:.4f}")

print(f"\nTotal utterances processed: {total_utterances}")
print("="*60)
print("Done!")