# CER/WER 101 (custom, HF evaluate, jiwer)
- Apply audio_eval/text_normalizer_utils.get_text_normalizer per language
- Tokenization now matches evaluation pipeline (whitespace only)
- Currently follows the kotoba-whisper repo's logic for normalizing Japanese and Chinese text:
  - https://github.com/kotoba-tech/kotoba-whisper/blob/649fe3d1427d9ad6027940e29e7651fd516ea590/run_short_form_eval.py#L200

## Text normalization challenges
- Japanese words can appear in hiragana, katakana, or kanji but mean the same thing.
- Example: Tokyo → とうきょう (hiragana), トウキョウ (katakana), 東京 (kanji).
- All forms sound identical, yet ASR evaluation would count them as different.
- Normalization needs to unify such variants while keeping true distinctions intact.

Other languages that, like Japanese, write the same sounds/words in multiple scripts or character sets, causing similar normalization headaches in ASR evaluation:

- Chinese → Simplified (认识) vs. Traditional (認識), same meaning/sound.  
- Korean → Hangul (한글) vs. Hanja (漢字); older texts often mix them.  
- Hindi and related Indic languages → Hindi in Devanagari (नमस्ते) vs. Romanized (namaste).  
- Arabic & Persian → Regional variants of the Arabic script; e.g., Persian and Urdu share sounds with Arabic but write some letters with different character forms (ی/ک vs. ي/ك).  
- Serbian → Can be written in Cyrillic (Србија) or Latin (Srbija), same word.  
- Mongolian → Traditional Mongolian script (used in Inner Mongolia) vs. Cyrillic script (used in Mongolia).  
- Kazakh / Uzbek / other Central Asian languages → Cyrillic vs. Latin vs. Arabic script versions, depending on region.  

These cases mirror Japanese: different scripts encode the same sound/meaning, so without normalization, ASR error rates look worse than the actual recognition quality.

In [11]:
import sys, importlib, subprocess

from pathlib import Path

def ensure(pkg):
    """Import ``pkg``; if that raises ImportError, pip-install it quietly.

    The install runs ``pip`` through the current interpreter
    (``sys.executable -m pip``). The module is not re-imported afterwards,
    so callers import it themselves.

    Args:
        pkg: Name used both as the import name and as the pip package name.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    try:
        importlib.import_module(pkg)
    except ImportError:
        install_command = [sys.executable, '-m', 'pip', 'install', '-q', pkg]
        subprocess.check_call(install_command)
    else:
        return

# Third-party metric packages are installed on demand.
ensure('evaluate')
ensure('jiwer')

# Walk up from the working directory until a directory containing
# `audio_eval` is found (or the filesystem root is reached), then make that
# directory importable so the `audio_eval` imports below resolve.
repo_root = Path.cwd().resolve()
while True:
    if repo_root == repo_root.parent or (repo_root / 'audio_eval').exists():
        break
    repo_root = repo_root.parent
if (repo_root / 'audio_eval').exists():
    if str(repo_root) not in sys.path:
        sys.path.append(str(repo_root))

import evaluate
import jiwer

from audio_eval.text_normalizer_utils import get_text_normalizer
from audio_eval.normalizer.normalizer import BasicTextNormalizer, BasicMultilingualTextNormalizer

# ------------------------------
# Custom metric utilities
# ------------------------------
def edit_distance(seq1, seq2):
    """Return the Levenshtein distance between two sequences.

    Insertions, deletions and substitutions each cost 1. Elements are
    compared with ``==``, so strings (character level) and token lists
    (word level) both work.

    Args:
        seq1: First sequence (must support ``len`` and iteration).
        seq2: Second sequence (must support ``len`` and iteration).

    Returns:
        The minimum number of single-element edits turning ``seq1`` into
        ``seq2``.
    """
    rows, cols = len(seq1), len(seq2)
    # previous[c] holds the distance between the seq1 prefix processed so far
    # and seq2[:c]; it starts as the cost of building seq2[:c] from nothing.
    previous = list(range(cols + 1))
    for row, left in enumerate(seq1, start=1):
        # Column 0: deleting all `row` elements of the seq1 prefix.
        current = [row]
        for col, right in enumerate(seq2, start=1):
            substitution = previous[col - 1] + (0 if left == right else 1)
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[cols] if rows else cols

def custom_cer(ref, hyp):
    """Return the character error rate of ``hyp`` against ``ref``.

    Computed as the character-level edit distance divided by the reference
    length; the divisor is clamped to 1 so an empty reference does not
    divide by zero.
    """
    ref_chars = list(ref)
    hyp_chars = list(hyp)
    denominator = len(ref) or 1
    return edit_distance(ref_chars, hyp_chars) / denominator

def custom_wer_tokens(ref_tokens, hyp_tokens):
    """Return the word error rate for pre-tokenized reference/hypothesis.

    Computed as the token-level edit distance divided by the number of
    reference tokens; the divisor is clamped to 1 for an empty reference.
    """
    errors = edit_distance(ref_tokens, hyp_tokens)
    denominator = len(ref_tokens) or 1
    return errors / denominator

# ------------------------------
# HF evaluate + jiwer
# ------------------------------
# The WER metric is required: a load failure propagates to the caller.
WER_METRIC = evaluate.load('wer')

# The CER metric is optional: if loading fails for any reason, CER_METRIC
# stays None and evaluate_cer falls back to the custom implementation.
CER_METRIC = None
try:
    CER_METRIC = evaluate.load('cer')
except Exception:
    # Deliberate best-effort: keep the None sentinel set above.
    pass

def evaluate_cer(ref, hyp):
    """Return the CER of ``hyp`` against ``ref`` using the loaded CER metric.

    When ``CER_METRIC`` is None (the metric could not be loaded), the score
    comes from ``custom_cer`` instead.
    """
    if CER_METRIC is None:
        return custom_cer(ref, hyp)
    score = CER_METRIC.compute(references=[ref], predictions=[hyp])
    return float(score)

def evaluate_wer_tokens(ref_tokens, hyp_tokens):
    """Return the WER from ``WER_METRIC`` for pre-tokenized inputs.

    The tokens are re-joined with single spaces and passed to the metric as
    one reference string and one prediction string.
    """
    reference_text, prediction_text = (
        ' '.join(tokens) for tokens in (ref_tokens, hyp_tokens)
    )
    score = WER_METRIC.compute(
        references=[reference_text],
        predictions=[prediction_text],
    )
    return float(score)

def jiwer_wer_tokens(ref_tokens, hyp_tokens):
    """Return the WER from ``jiwer.wer`` for pre-tokenized inputs.

    The tokens are re-joined with single spaces before being passed to jiwer.
    """
    return float(jiwer.wer(' '.join(ref_tokens), ' '.join(hyp_tokens)))

def jiwer_cer_score(ref, hyp):
    """Return the CER of ``hyp`` against ``ref`` via jiwer, with fallbacks.

    ``jiwer.cer`` is tried first. If that raises AttributeError, the result
    of ``jiwer.compute_measures`` is checked for a ``'cer'`` entry. If it has
    none, the score comes from ``custom_cer``.
    """
    try:
        return float(jiwer.cer(ref, hyp))
    except AttributeError:
        legacy_measures = jiwer.compute_measures(ref, hyp)
        if 'cer' not in legacy_measures:
            return custom_cer(ref, hyp)
        return float(legacy_measures['cer'])

def fmt(x):
    """Format a number as fixed-point text with three decimal places."""
    return format(x, '.3f')

# ------------------------------
# Token helpers
# ------------------------------
def tokens_whitespace(text):
    """Split ``text`` on runs of whitespace and return the token list.

    ``str.split()`` with no arguments never produces empty strings, so no
    extra filtering is needed.
    """
    return text.split()

def tokens_characters(text):
    """Return the characters of ``text`` as a list, dropping whitespace."""
    return list(filter(lambda symbol: not symbol.isspace(), text))

def display_table(title, pairs, tokenizer_selector):
    """Print a CER/WER comparison table across the three metric backends.

    For every language, CER is computed on the raw strings and WER on the
    tokens produced by the selected tokenizer, using the custom
    implementation, HF ``evaluate`` and ``jiwer`` in turn.

    Args:
        title: Heading printed above the table.
        pairs: Mapping of language name to a ``(reference, hypothesis)`` pair.
        tokenizer_selector: Callable that takes the language name and returns
            the tokenizer function applied to both strings for WER.
    """
    divider = '-------------------------------------'
    print(title)
    print('Lang       Method      CER      WER')
    print(divider)
    for lang, pair in pairs.items():
        ref, hyp = pair
        tokenize = tokenizer_selector(lang)
        ref_tokens, hyp_tokens = tokenize(ref), tokenize(hyp)
        # Score with every backend first, then print, so rows are emitted
        # only once all six numbers are available.
        custom_scores = (
            custom_cer(ref, hyp),
            custom_wer_tokens(ref_tokens, hyp_tokens),
        )
        evaluate_scores = (
            evaluate_cer(ref, hyp),
            evaluate_wer_tokens(ref_tokens, hyp_tokens),
        )
        jiwer_scores = (
            jiwer_cer_score(ref, hyp),
            jiwer_wer_tokens(ref_tokens, hyp_tokens),
        )
        print(f"{lang:<10} custom   {fmt(custom_scores[0])}  {fmt(custom_scores[1])}")
        print(f'          evaluate {fmt(evaluate_scores[0])}  {fmt(evaluate_scores[1])}')
        print(f'          jiwer    {fmt(jiwer_scores[0])}  {fmt(jiwer_scores[1])}')
        print(divider)
    print()

# ------------------------------
# Test sentences: (reference, hypothesis) pairs with one small edit each
#   English:  "you" -> "ya"
#   Japanese: て -> で (single character)
#   Chinese:  simplified 认识 -> traditional 認識
# ------------------------------
examples = {
    'English': ('Hello, nice to meet you.', 'Hello, nice to meet ya.'),
    'Japanese': ('こんにちは、はじめまして。', 'こんにちは、はじめましで。'),
    'Chinese': ('你好，很高兴认识你。', '你好，很高兴認識你。'),
}

# Language name -> code passed to get_text_normalizer
lang_codes = dict(English='en', Japanese='ja', Chinese='zh')

# One BasicTextNormalizer per language; diacritics are always kept, and
# split_letters is enabled only for Japanese and Chinese.
basic_text_normalizers = {
    lang: BasicTextNormalizer(remove_diacritics=False, split_letters=split_letters)
    for lang, split_letters in (
        ('English', False),
        ('Japanese', True),
        ('Chinese', True),
    )
}

# One default-configured BasicMultilingualTextNormalizer per language.
multilingual_text_normalizers = {
    lang: BasicMultilingualTextNormalizer()
    for lang in ('English', 'Japanese', 'Chinese')
}

# Normalized (reference, hypothesis) pairs per language, one table per
# normalizer; keys mirror those of `examples`.
normalized_examples = {}
basic_text_examples = {}
multilingual_examples = {}

print('Normalization preview')
print('-------------------------------------')
for lang, (ref, hyp) in examples.items():
    # Pick the three normalizers compared for this language.
    official_normalizer = get_text_normalizer(lang_codes[lang])
    basic_normalizer = basic_text_normalizers[lang]
    multilingual_normalizer = multilingual_text_normalizers[lang]

    # Normalize reference and hypothesis with each of them.
    norm_ref_off, norm_hyp_off = official_normalizer(ref), official_normalizer(hyp)
    norm_ref_basic, norm_hyp_basic = basic_normalizer(ref), basic_normalizer(hyp)
    norm_ref_multi, norm_hyp_multi = multilingual_normalizer(ref), multilingual_normalizer(hyp)

    # Keep the pairs for the metric tables printed afterwards.
    normalized_examples[lang] = norm_ref_off, norm_hyp_off
    basic_text_examples[lang] = norm_ref_basic, norm_hyp_basic
    multilingual_examples[lang] = norm_ref_multi, norm_hyp_multi

    # Show raw vs. normalized text; the trailing '' yields the blank
    # separator line between languages.
    print(
        f'{lang:<10} REF raw: {ref}',
        f'{lang:<10} REF official: {norm_ref_off}',
        f'{lang:<10} REF basic: {norm_ref_basic}',
        f'{lang:<10} REF multilingual: {norm_ref_multi}',
        f'{lang:<10} HYP raw: {hyp}',
        f'{lang:<10} HYP official: {norm_hyp_off}',
        f'{lang:<10} HYP basic: {norm_hyp_basic}',
        f'{lang:<10} HYP multilingual: {norm_hyp_multi}',
        '',
        sep='\n',
    )

# Normalized variants are all scored on whitespace tokens, whatever the language.
display_table(
    'Normalized (official pipeline)',
    normalized_examples,
    lambda _: tokens_whitespace,
)
display_table(
    'BasicTextNormalizer (JA/ZH split letters)',
    basic_text_examples,
    lambda _: tokens_whitespace,
)
display_table(
    'BasicMultilingualTextNormalizer (default)',
    multilingual_examples,
    lambda _: tokens_whitespace,
)
# Raw text: Japanese and Chinese are tokenized per character, everything
# else on whitespace.
display_table(
    'Raw text (char-level tokens for JA/ZH)',
    examples,
    lambda lang: tokens_whitespace if lang not in {'Japanese', 'Chinese'} else tokens_characters,
)


Normalization preview
-------------------------------------
English    REF raw: Hello, nice to meet you.
English    REF official: hello nice to meet you
English    REF basic: hello nice to meet you 
English    REF multilingual: hello nice to meet you
English    HYP raw: Hello, nice to meet ya.
English    HYP official: hello nice to meet ya
English    HYP basic: hello nice to meet ya 
English    HYP multilingual: hello nice to meet ya

Japanese   REF raw: こんにちは、はじめまして。
Japanese   REF official: こんにちは はしめまして
Japanese   REF basic: こ ん に ち は は じ め ま し て 
Japanese   REF multilingual: こんにちは はしめまして
Japanese   HYP raw: こんにちは、はじめましで。
Japanese   HYP official: こんにちは はしめまして
Japanese   HYP basic: こ ん に ち は は じ め ま し で 
Japanese   HYP multilingual: こんにちは はしめまして

Chinese    REF raw: 你好，很高兴认识你。
Chinese    REF official: 你好 很高兴认识你
Chinese    REF basic: 你 好 很 高 兴 认 识 你 
Chinese    REF multilingual: 你好 很高兴认识你
Chinese    HYP raw: 你好，很高兴認識你。
Chinese    HYP official: 你好 很高兴認識你
Chinese    HYP basic: 你 好 很 高 兴 