In [None]:
# This notebook was created to learn how speech-to-text (STT) results are evaluated.
# Some inference runs were tracked on wandb, and their evaluations can be explored further here.

In [None]:
import pandas as pd
# Read the saved inference results for the sample run and preview
# the first rows.
df_inf = pd.read_parquet(
    "../outputs/vhp-whisper-azure-sample/inference_results.parquet"
)
df_inf.head(5)

In [None]:
# Read the saved evaluation results for the same sample run and preview
# the first rows.
df_eval = pd.read_csv(
    "../outputs/vhp-whisper-azure-sample/evaluation_results.csv"
)
df_eval.head(5)

In [None]:
# All scripts for the experiments live under /scripts.
# The evaluation logic is in evaluate.py.
# Below we walk step by step through how the inference results (hypothesis) are compared against the ground truth (df_gt in the code).
# (Assume args.parquet is "../data/veterans_history_project_resources.parquet".)

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Ground-truth table holding the raw XML transcripts.
df_gt = pd.read_parquet(
    "../data/veterans_history_project_resources.parquet"
)

# Pick one example to walk through (file_id=0); rows are selected by position.
file_id = 0
gt_row = df_gt.iloc[file_id]

# Header, collection number, then a short preview of the raw XML.
print(
    f"=== GROUND TRUTH FOR FILE {file_id} ===",
    f"Collection: {gt_row['collection_number']}",
    "",
    "Raw XML transcript (first 500 chars):",
    gt_row['fulltext_file_str'][:500],
    sep="\n",
)

In [None]:
# Cell 2: Show the cleaning process step-by-step
def clean_raw_transcript_str(fulltext_file_str: str) -> str:
    """
    Clean a raw XML transcript from the VHP dataset into plain spoken text.

    This function is from scripts/evaluate.py, with prints added so the
    intermediate results can be inspected in the notebook.

    NOTE(review): the {...} annotation pattern below was corrected here (it
    previously only matched text ending in ")}"). If scripts/evaluate.py
    still uses the old pattern, results can differ for transcripts that
    contain curly-brace annotations -- confirm and fix there as well.

    Args:
        fulltext_file_str: Raw XML transcript. Every <sp> element is read as
            one speaker turn: its first <speaker> child supplies the speaker
            name and its first <p> child supplies the spoken text.

    Returns:
        The text of all turns joined together, with (...), [...] and {...}
        annotations, runs of dashes, ellipses and speaker tags removed and
        whitespace collapsed. Returns "" for empty, None or NaN input.
    """
    # `not x` covers None and ""; pd.isna additionally covers NaN/NA values.
    if not fulltext_file_str or pd.isna(fulltext_file_str):
        return ""

    l_transcript_lines = []

    # Parse the XML and pull out (speaker, text) for every <sp> turn.
    soup = BeautifulSoup(fulltext_file_str, 'xml')

    for sp in soup.find_all('sp'):
        # find() returns None when the child is missing; fall back to the
        # same defaults as before instead of swallowing every exception.
        speaker_tag = sp.find('speaker')
        speaker = speaker_tag.get_text(strip=True) if speaker_tag is not None else "speaker_unknown"

        # Only the first <p> of a turn is used (find() returns the first match).
        text_tag = sp.find('p')
        spoken_text = text_tag.get_text(strip=True) if text_tag is not None else ""

        # Wrap each turn in a pseudo speaker tag; the tags are stripped again below.
        l_transcript_lines.append(f"<{speaker}>{spoken_text}</{speaker}> ")

    # Merge all turns into one string.
    transcript_lines = ''.join(l_transcript_lines)
    print("STEP 1 - After parsing XML:")
    print(transcript_lines[:300])
    print()

    # Remove annotations: (), [], {}
    transcript_lines_stripped = re.sub(r'\([^)]*\)', '', transcript_lines)
    transcript_lines_stripped = re.sub(r'\[[^]]*\]', '', transcript_lines_stripped)
    # Fixed: the old pattern r'\{[^}]*\)\}' required a ')' right before '}',
    # so ordinary {...} annotations were never removed.
    transcript_lines_stripped = re.sub(r'\{[^}]*\}', '', transcript_lines_stripped)
    print("STEP 2 - After removing annotations:")
    print(transcript_lines_stripped[:300])
    print()

    # Remove runs of two or more dashes and ellipses (two or more dots).
    transcript_lines_stripped = re.sub(r'--+', '', transcript_lines_stripped)
    transcript_lines_stripped = re.sub(r'\.{2,}', '', transcript_lines_stripped)

    # Collapse every whitespace run to a single space and trim the ends.
    transcript_lines_stripped = re.sub(r'\s+', ' ', transcript_lines_stripped).strip()

    # Remove the pseudo speaker tags added while parsing.
    transcript_lines_stripped = re.sub(r'<[^>]*>', '', transcript_lines_stripped)
    print("STEP 3 - After removing speaker tags:")
    print(transcript_lines_stripped[:300])
    print()

    return transcript_lines_stripped

# Run the cleaning on the selected ground-truth row (this also prints the
# intermediate steps), then summarise the final reference text.
reference = clean_raw_transcript_str(gt_row['fulltext_file_str'])
print(
    "=== FINAL CLEANED REFERENCE ===",
    f"Length: {len(reference)} characters",
    f"First 500 chars: {reference[:500]}",
    sep="\n",
)

In [None]:
# Cell 3: Fetch the model output (hypothesis) for the same row position
hypothesis = df_inf['hypothesis'].iloc[file_id]

print(
    "=== HYPOTHESIS (Model Output) ===",
    f"Length: {len(hypothesis)} characters",
    f"First 500 chars: {hypothesis[:500]}",
    sep="\n",
)

In [None]:
# Cell 4: Show normalization process
import jiwer

def normalize(s):
    """
    Normalize text for WER calculation (from scripts/evaluate.py).

    Applies, in order: lower-casing, punctuation removal, stripping of
    leading/trailing whitespace, and collapsing of repeated spaces.
    """
    steps = [
        jiwer.ToLowerCase(),
        jiwer.RemovePunctuation(),
        jiwer.Strip(),
        jiwer.RemoveMultipleSpaces(),
    ]
    pipeline = jiwer.Compose(steps)
    return pipeline(s)

# Normalize both sides with the same pipeline so they are comparable.
ref_norm = normalize(reference)
hyp_norm = normalize(hypothesis)

print("=== NORMALIZATION PROCESS ===")
print()
# Show a 200-character preview of each text before and after normalization.
for label, text in (
    ("REFERENCE BEFORE:", reference),
    ("REFERENCE AFTER:", ref_norm),
    ("HYPOTHESIS BEFORE:", hypothesis),
    ("HYPOTHESIS AFTER:", hyp_norm),
):
    print(label)
    print(text[:200])
    print()
print("Why normalize? Removes case, punctuation, extra spaces so we compare WORDS only")

In [None]:
# Cell 5: Calculate WER step-by-step
m = jiwer.process_words(ref_norm, hyp_norm)

# Quantities reused in several of the lines printed below.
n_ref_words = len(ref_norm.split())
n_hyp_words = len(hyp_norm.split())
total_errors = m.substitutions + m.deletions + m.insertions

print("=== WER CALCULATION ===")
print()
print(f"Reference words: {n_ref_words}")
print(f"Hypothesis words: {n_hyp_words}")
print()
print(f"Substitutions: {m.substitutions} (wrong word)")
print(f"Deletions: {m.deletions} (model missed word)")
print(f"Insertions: {m.insertions} (model added extra word)")
print()
print(f"Total errors: {total_errors}")
print("WER = Total errors / Reference words")
print(f"WER = {total_errors} / {n_ref_words}")
print(f"WER = {m.wer:.3f} ({m.wer*100:.1f}%)")
print()
print("Interpretation:")
# Report the first band whose upper bound exceeds the WER; anything that
# falls through every band is reported as poor.
for upper_bound, verdict in (
    (0.1, "  → Excellent (< 10% error)"),
    (0.3, "  → Good (10-30% error)"),
    (0.5, "  → Fair (30-50% error)"),
):
    if m.wer < upper_bound:
        print(verdict)
        break
else:
    print("  → Poor (> 50% error)")

In [None]:
# Cell 6: Show alignment visualization (which words are wrong)
#
# The rows below follow jiwer's own alignment rather than pairing words by
# position. Pairing by position mislabels every word after the first
# insertion or deletion as a substitution, because the two sequences are
# shifted against each other from that point on.
alignment = jiwer.process_words(ref_norm, hyp_norm)

print("=== WORD ALIGNMENT (First 50 words) ===")
print()

max_rows = 50  # number of aligned rows to display
gap = "***"    # placeholder shown on the side that has no word

# process_words was called with one sentence pair, so index 0 holds its
# tokenised reference, tokenised hypothesis and list of alignment chunks.
ref_tokens = alignment.references[0]
hyp_tokens = alignment.hypotheses[0]

row = 0
for chunk in alignment.alignments[0]:
    if row >= max_rows:
        break

    ref_span = ref_tokens[chunk.ref_start_idx:chunk.ref_end_idx]
    hyp_span = hyp_tokens[chunk.hyp_start_idx:chunk.hyp_end_idx]

    if chunk.type == "equal":
        pairs, status = zip(ref_span, hyp_span), "✓"
    elif chunk.type == "substitute":
        pairs, status = zip(ref_span, hyp_span), "✗ SUB"
    elif chunk.type == "delete":
        # Reference words with no counterpart in the hypothesis.
        pairs, status = ((r, gap) for r in ref_span), "✗ DEL"
    else:
        # "insert": hypothesis words with no counterpart in the reference.
        pairs, status = ((gap, h) for h in hyp_span), "✗ INS"

    for r, h in pairs:
        if row >= max_rows:
            break
        print(f"{row:3d}: REF='{r:15s}' HYP='{h:15s}' {status}")
        row += 1

In [None]:
# Cell 7: Compare our numbers with the saved evaluation results
# (the row is selected by position, like the other tables).
eval_row = df_eval.iloc[file_id]

# Display label -> attribute / column name shared by both result sources.
count_fields = (
    ("Substitutions", "substitutions"),
    ("Deletions", "deletions"),
    ("Insertions", "insertions"),
)

print("=== VERIFICATION ===")
print()
print("Our calculation:")
print(f"  WER: {m.wer:.3f}")
for label, field in count_fields:
    print(f"  {label}: {getattr(m, field)}")
print()
print("evaluate.py results:")
print(f"  WER: {eval_row['wer']:.3f}")
for label, field in count_fields:
    print(f"  {label}: {eval_row[field]}")
print()
# The two WER values count as matching when they differ by less than 0.001.
wer_gap = abs(m.wer - eval_row['wer'])
print(f"Match: {wer_gap < 0.001}")