## 4 Model Evaluation

This notebook loads the Whisper medium model finetuned on 8 hours of Sursilvan data and evaluates it on the test set.

In [3]:
# Cell 1: Imports
import os
import torch
import whisper
import pandas as pd
import numpy as np
from jiwer import wer, cer
from tqdm import tqdm
import warnings
from transformers import WhisperForConditionalGeneration, WhisperProcessor
warnings.filterwarnings("ignore")
import librosa
from torch.utils.data import DataLoader, Dataset
from collections import defaultdict
from constants import FOLDER_NAMES, DATA_ROOT
from helpers import get_idiom_name_by_folder

# Configuration
MODEL_PATH = "./whisper-medium-rm-all"  # Path to your fine-tuned model
TEST_FILE = "test.tsv"
CLIPS_DIR = "clips"
BATCH_SIZE = 16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_SAMPLES = None  # Set to a number for quick test, None for full test set

In [4]:
# Cell 2: Check GPU and Load Model (Fixed)
print("="*60)
print("Whisper Romansh Model Evaluation")
print("="*60)

print(f"Device: {DEVICE}")
if DEVICE == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Load your fine-tuned model (Hugging Face format)
print(f"\nüì• Loading fine-tuned model from {MODEL_PATH}...")

# Load processor (for feature extraction and tokenization)
processor = WhisperProcessor.from_pretrained(MODEL_PATH)

# Load model
model = WhisperForConditionalGeneration.from_pretrained(MODEL_PATH).to(DEVICE)

print("‚úÖ Model loaded successfully!")
print(f"Model parameters: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")

Whisper Romansh Model Evaluation
Device: cuda
GPU: NVIDIA GeForce RTX 3090

üì• Loading fine-tuned model from ./whisper-medium-rm-all...


Loading weights:   0%|          | 0/947 [00:00<?, ?it/s]

‚úÖ Model loaded successfully!
Model parameters: 763.9M


In [5]:
# Cell 3: Load Test Data from All Idioms

print("\nüìÇ Loading test data from all idioms...")

df_test = pd.DataFrame()
audio_paths = []
references = []
idioms = []

for idiom_folder in FOLDER_NAMES:
    idiom_path = os.path.join(DATA_ROOT, idiom_folder)
    idiom_name = get_idiom_name_by_folder(idiom_folder)
    test_tsv = os.path.join(idiom_path, "test.tsv")
    clips_path = os.path.join(idiom_path, "clips")
    
    if not os.path.exists(test_tsv):
        print(f"‚ö†Ô∏è No test.tsv found for {idiom_folder}, skipping...")
        continue
    
    print(f"\nüìÇ Processing {idiom_folder}...")
    
    df_idiom = pd.read_csv(test_tsv, sep='\t')
    
    valid_indices = []
    for idx, row in df_idiom.iterrows():
        audio_path = os.path.join(clips_path, row['path'])
        if os.path.exists(audio_path):
            valid_indices.append(idx)
            audio_paths.append(audio_path)
            references.append(row['sentence'])
            idioms.append(idiom_name)
    
    df_idiom_valid = df_idiom.loc[valid_indices].copy()
    df_idiom_valid['idiom'] = idiom_folder
    df_test = pd.concat([df_test, df_idiom_valid], ignore_index=True)
    
    print(f"  Added {len(df_idiom_valid)} samples from {idiom_folder}")

print("\n" + "="*60)
print("üìä Combined Test Dataset Statistics")
print("="*60)
print(f"Total test samples across all idioms: {len(df_test)}")

print(f"\n‚úÖ Total audio files to process: {len(audio_paths)}")

if NUM_SAMPLES:
    df_test = df_test.head(NUM_SAMPLES)
    audio_paths = audio_paths[:NUM_SAMPLES]
    print(f"Using first {NUM_SAMPLES} samples for quick test")


üìÇ Loading test data from all idioms...

üìÇ Processing rmsurmiran-cc-2021-12-23...
  Added 151 samples from rmsurmiran-cc-2021-12-23

üìÇ Processing rmsutsilv-cc-2022-05-18...
  Added 94 samples from rmsutsilv-cc-2022-05-18

üìÇ Processing rmputer-cc-2021-06-11...
  Added 114 samples from rmputer-cc-2021-06-11

üìÇ Processing rm-cc-2021-05-28...
  Added 81 samples from rm-cc-2021-05-28

üìÇ Processing rmvallader-cc-2021-05-28...
  Added 97 samples from rmvallader-cc-2021-05-28

üìÇ Processing rmsursilv-cc-2021-05-28...
  Added 94 samples from rmsursilv-cc-2021-05-28

üìä Combined Test Dataset Statistics
Total test samples across all idioms: 631

‚úÖ Total audio files to process: 631


In [6]:
# Cell 4: Batch Transcription (Fixed)

class AudioDataset(Dataset):
    def __init__(self, audio_paths, processor, device):
        self.audio_paths = audio_paths
        self.processor = processor
        self.device = device
    
    def __len__(self):
        return len(self.audio_paths)
    
    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        # Load audio
        audio_array, sr = librosa.load(audio_path, sr=16000)
        
        # Process to features
        input_features = self.processor(
            audio_array, 
            sampling_rate=16000, 
            return_tensors="pt"
        ).input_features[0]  # Remove batch dimension
        
        return input_features

def collate_fn(batch):
    """Custom collate function to handle different sized tensors"""
    # Find max length in batch
    max_len = max(features.shape[-1] for features in batch)
    
    # Pad all features to max length
    padded_batch = []
    for features in batch:
        pad_len = max_len - features.shape[-1]
        if pad_len > 0:
            # Pad with zeros along the time dimension
            padding = torch.zeros((features.shape[0], pad_len))
            padded = torch.cat([features, padding], dim=-1)
        else:
            padded = features
        padded_batch.append(padded)
    
    # Stack into batch
    return torch.stack(padded_batch)

print(f"\nüéôÔ∏è Transcribing {len(audio_paths)} test files...")

# Create dataset and dataloader with custom collate
dataset = AudioDataset(audio_paths, processor, DEVICE)
dataloader = DataLoader(
    dataset, 
    batch_size=8, 
    shuffle=False, 
    num_workers=0,  # Set to 0 to avoid multiprocessing issues
    collate_fn=collate_fn
)

transcriptions = []

for batch_features in tqdm(dataloader, desc="Transcribing"):
    # Move batch to device
    batch_features = batch_features.to(DEVICE)
    
    # Generate transcriptions for the batch
    with torch.no_grad():
        predicted_ids = model.generate(
            batch_features,
            max_length=225,
            num_beams=1,
            task="transcribe"
        )
    
    # Decode batch
    batch_transcriptions = processor.batch_decode(
        predicted_ids, 
        skip_special_tokens=True
    )
    transcriptions.extend(batch_transcriptions)

print(f"‚úÖ Transcribed {len(transcriptions)} files")


üéôÔ∏è Transcribing 631 test files...


Transcribing:   0%|          | 0/79 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.generate()` flags.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'tr

‚úÖ Transcribed 631 files





In [7]:
# Cell 5: Calculate WER and CER per idiom
print("\n" + "="*60)
print("üìä FINAL RESULTS - PER IDIOM")
print("="*60)

# Group by idiom
idiom_results = defaultdict(lambda: {"references": [], "hypotheses": []})

# Filter valid pairs and group by idiom
valid_pairs = []
for ref, hyp, idiom in zip(references, transcriptions, idioms):
    if ref and hyp:
        valid_pairs.append((ref, hyp, idiom))
        idiom_results[idiom]["references"].append(ref)
        idiom_results[idiom]["hypotheses"].append(hyp)

if not valid_pairs:
    print("‚ùå No valid reference-hypothesis pairs found!")
else:
    # Overall results
    all_refs = [p[0] for p in valid_pairs]
    all_hyps = [p[1] for p in valid_pairs]
    
    overall_wer = wer(all_refs, all_hyps)
    overall_cer = cer(all_refs, all_hyps)
    
    print("\n" + "="*50)
    print("üìà OVERALL RESULTS")
    print("="*50)
    print(f"Total test samples: {len(df_test)}")
    print(f"Valid pairs: {len(valid_pairs)}/{len(df_test)}")
    print(f"\nWord Error Rate (WER): {overall_wer:.4f} ({overall_wer*100:.2f}%)")
    print(f"Character Error Rate (CER): {overall_cer:.4f} ({overall_cer*100:.2f}%)")
    
    # Per-idiom results
    print("\n" + "="*50)
    print("üìä PER IDIOM RESULTS")
    print("="*50)
    
    # Store results for dataframe
    per_idiom_data = []
    
    for idiom, data in idiom_results.items():
        if len(data["references"]) > 0:
            idiom_wer = wer(data["references"], data["hypotheses"])
            idiom_cer = cer(data["references"], data["hypotheses"])
            
            print(f"\nüìÅ {idiom.upper()}")
            print(f"  Samples: {len(data['references'])}")
            print(f"  WER: {idiom_wer:.4f} ({idiom_wer*100:.2f}%)")
            print(f"  CER: {idiom_cer:.4f} ({idiom_cer*100:.2f}%)")
            
            per_idiom_data.append({
                "idiom": idiom,
                "samples": len(data["references"]),
                "wer": idiom_wer,
                "cer": idiom_cer
            })
    
    # Create summary dataframe
    summary_df = pd.DataFrame(per_idiom_data)
    print("\n" + "="*50)
    print("üìã SUMMARY TABLE")
    print("="*50)
    print(summary_df.to_string(index=False))


üìä FINAL RESULTS - PER IDIOM

üìà OVERALL RESULTS
Total test samples: 631
Valid pairs: 631/631

Word Error Rate (WER): 0.0167 (1.67%)
Character Error Rate (CER): 0.0071 (0.71%)

üìä PER IDIOM RESULTS

üìÅ SURMIRAN
  Samples: 151
  WER: 0.0127 (1.27%)
  CER: 0.0048 (0.48%)

üìÅ SUTSILVAN
  Samples: 94
  WER: 0.0063 (0.63%)
  CER: 0.0033 (0.33%)

üìÅ PUTER
  Samples: 114
  WER: 0.0061 (0.61%)
  CER: 0.0025 (0.25%)

üìÅ RG
  Samples: 81
  WER: 0.0343 (3.43%)
  CER: 0.0188 (1.88%)

üìÅ VALLADER
  Samples: 97
  WER: 0.0211 (2.11%)
  CER: 0.0092 (0.92%)

üìÅ SURSILVAN
  Samples: 94
  WER: 0.0217 (2.17%)
  CER: 0.0042 (0.42%)

üìã SUMMARY TABLE
    idiom  samples      wer      cer
 Surmiran      151 0.012748 0.004806
Sutsilvan       94 0.006293 0.003329
    Puter      114 0.006132 0.002499
       RG       81 0.034252 0.018803
 Vallader       97 0.021125 0.009180
Sursilvan       94 0.021665 0.004220


In [8]:
# Cell 6: Sample Transcriptions
print("\n" + "="*60)
print("üìù SAMPLE TRANSCRIPTIONS")
print("="*60)

# Show 5 random samples
import random
sample_indices = random.sample(range(len(valid_pairs)), min(5, len(valid_pairs)))

for i, idx in enumerate(sample_indices):
    print(f"\n--- Sample {i+1} ---")
    print(f"Reference: {references[idx][:200]}...")
    print(f"Hypothesis: {transcriptions[idx][:200]}...")
    
    # Calculate sample-level WER
    sample_wer = wer(references[idx], transcriptions[idx])
    print(f"Sample WER: {sample_wer:.4f}")
    print("-" * 40)


üìù SAMPLE TRANSCRIPTIONS

--- Sample 1 ---
Reference: Noua sa mossa la donna?...
Hypothesis: Noua sa mossa la donna?...
Sample WER: 0.0000
----------------------------------------

--- Sample 2 ---
Reference: La radunanza communala da Luzein ha approv√† il nov model da scola cun in lieu da scola communabel per la scolina e la scola primara a Pany. Quai a partir da l‚Äôonn da scola 2 1000 21 22.Er approv√† ha l...
Hypothesis: La radunanza communala da Luzein ha approv√† il nov model da scola cun in lieu da scola communabel per la scolina e la scola primara a Pany. Quai a partir da l‚Äôonn da scola 2 1000 21 22.Er approv√† ha l...
Sample WER: 0.0000
----------------------------------------

--- Sample 3 ---
Reference: Chegl capeta darar....
Hypothesis: Chegl capeta darar....
Sample WER: 0.0000
----------------------------------------

--- Sample 4 ---
Reference: E lura vegn que ad√ºna p√º concret, tuot es pront, uossa haun ils scienzios da la h. t. w. da strer vi dal glatsch. √ún glat