In [None]:
import json
import pandas as pd
from pathlib import Path
from pypinyin import pinyin, Style
import difflib
from dataclasses import dataclass
from typing import List, Dict, Optional

# Where the sample recordings and the reference phrase list live
DATA_DIR = Path("../data/sample_audio")
REFERENCE_FILE = Path("../data/reference_phrases.json")

# Parse the reference phrases; read explicitly as UTF-8 because the file holds Chinese text
reference_data = json.loads(REFERENCE_FILE.read_text(encoding='utf-8'))

✓ Setup complete


In [2]:
def text_to_pinyin(text, style=Style.TONE3):
    """
    Turn Chinese text into a flat list of pinyin strings

    Args:
        text: Chinese characters to convert
        style: pypinyin output style (default TONE3 = tone number appended,
               e.g. 'ni3', 'hao3')

    Returns:
        List with the first reading of every item pypinyin produced, in order
    """
    syllables = []
    # pinyin() yields one inner list per item; keep only its first entry
    for readings in pinyin(text, style=style, heteronym=False):
        syllables.append(readings[0])
    return syllables

def text_to_pinyin_display(text):
    """Return space-separated pinyin with tone marks, intended for display"""
    marked = pinyin(text, style=Style.TONE)
    first_readings = (readings[0] for readings in marked)
    return ' '.join(first_readings)

# Quick sanity check of both converters on a few known phrases
test_phrases = ["你好", "妈妈", "马", "骂"]

print("Pinyin Conversion Tests:\n")
for phrase in test_phrases:
    numeric_text = ' '.join(text_to_pinyin(phrase, Style.TONE3))
    display = text_to_pinyin_display(phrase)
    # Columns: characters → numeric-tone pinyin → tone-mark pinyin
    print(f"{phrase:6} → {numeric_text:15} → {display}")

Pinyin Conversion Tests:

你好     → ni3 hao3        → nǐ hǎo
妈妈     → ma1 ma1         → mā mā
马      → ma3             → mǎ
骂      → ma4             → mà


In [3]:
@dataclass
class Syllable:
    """
    One Mandarin syllable split into its components.

    Attributes:
        full: Full pinyin (e.g., 'ni3')
        initial: Initial consonant (e.g., 'n')
        final: Final vowel/ending (e.g., 'i')
        tone: Tone number as a string (e.g., '3')
    """
    full: str
    initial: str
    final: str
    tone: str
    
def decompose_pinyin(pinyin_syllable):
    """
    Split a numeric-tone pinyin syllable into initial, final and tone

    Structure assumed: (Initial) + Final + Tone
      - Tone: the trailing character when it is a digit; otherwise '5'
        (neutral) is used and the whole string is the base.
      - Initial: the first entry of the initials table that prefixes the
        base, with the two-letter initials (zh, ch, sh) tried before the
        single letters. 'y' and 'w' are treated as initials here.
      - Final: whatever remains of the base after the initial.

    Returns:
        Syllable with full (the input unchanged), initial ('' if none
        matched), final and tone (a one-character string).
    """
    # Two-letter initials must be tested first so 'zh' is not read as 'z' + 'h...'
    digraph_initials = ('zh', 'ch', 'sh')
    single_initials = ('b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
                       'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w')

    # Peel off the tone digit, if there is one
    ends_with_digit = bool(pinyin_syllable) and pinyin_syllable[-1].isdigit()
    if ends_with_digit:
        base, tone = pinyin_syllable[:-1], pinyin_syllable[-1]
    else:
        base, tone = pinyin_syllable, '5'  # Neutral tone

    # First matching prefix wins; no match means the syllable has no initial
    initial = next(
        (candidate for candidate in digraph_initials + single_initials
         if base.startswith(candidate)),
        ''
    )
    final = base[len(initial):]

    return Syllable(full=pinyin_syllable, initial=initial, final=final, tone=tone)

# Exercise the decomposer on a handful of representative syllables
test_syllables = ['ni3', 'hao3', 'ma1', 'ma3', 'ma4', 'zhi1', 'chi1']

# One shared layout for the header and every data row
row_layout = "{:<8} {:<8} {:<8} {:<6}"

print("Phoneme Decomposition:\n")
print(row_layout.format('Pinyin', 'Initial', 'Final', 'Tone'))
print("-" * 35)
for syl in test_syllables:
    decomposed = decompose_pinyin(syl)
    print(row_layout.format(decomposed.full, decomposed.initial,
                            decomposed.final, decomposed.tone))

Phoneme Decomposition:

Pinyin   Initial  Final    Tone  
-----------------------------------
ni3      n        i        3     
hao3     h        ao       3     
ma1      m        a        1     
ma3      m        a        3     
ma4      m        a        4     
zhi1     zh       i        1     
chi1     ch       i        1     


In [7]:
# Build the expected-pronunciation table from the reference phrases.
# (The Day 2 transcription results are merged in later, when the real recordings are assessed.)

def load_and_analyze_transcriptions():
    """
    Build a table of expected pronunciations from the reference phrases

    Reads the module-level reference_data and, for every entry under
    'phrases', records the filename, the Chinese text, the reference pinyin
    string, the pinyin syllables computed from the Chinese text, and their
    decomposition into Syllable objects.

    Returns:
        DataFrame with one row per reference phrase
    """
    rows = []

    for entry in reference_data['phrases']:
        # Copy the reference fields first, then derive the pinyin columns
        row = {
            'filename': entry['filename'],
            'expected_chinese': entry['chinese'],
            'expected_pinyin_str': entry['pinyin'],
        }
        syllables = text_to_pinyin(row['expected_chinese'], Style.TONE3)
        row['expected_pinyin'] = syllables
        row['expected_syllables'] = list(map(decompose_pinyin, syllables))
        rows.append(row)

    return pd.DataFrame(rows)

analysis_df = load_and_analyze_transcriptions()

# Show the expected pinyin and its initial-final-tone breakdown for every phrase
print("Expected Pronunciations:\n")
for idx, row in analysis_df.iterrows():
    breakdown = ''.join(
        f"[{part.initial}-{part.final}-{part.tone}] "
        for part in row['expected_syllables']
    )
    print(f"\n{row['expected_chinese']} ({row['filename']})")
    print(f"  Pinyin: {' '.join(row['expected_pinyin'])}")
    print("  Syllables: " + breakdown)

Expected Pronunciations:


你好 (ni_hao)
  Pinyin: ni3 hao3
  Syllables: [n-i-3] [h-ao-3] 

你好 (ni_hao_incorrect)
  Pinyin: ni3 hao3
  Syllables: [n-i-3] [h-ao-3] 

谢谢 (xie_xie)
  Pinyin: xie4 xie4
  Syllables: [x-ie-4] [x-ie-4] 

妈妈 (ma_ma)
  Pinyin: ma1 ma1
  Syllables: [m-a-1] [m-a-1] 

马 (ma_horse)
  Pinyin: ma3
  Syllables: [m-a-3] 

骂 (ma_scold)
  Pinyin: ma4
  Syllables: [m-a-4] 

我在学中文 (wo_zai_xue_zhong_wen)
  Pinyin: wo3 zai4 xue2 zhong1 wen2
  Syllables: [w-o-3] [z-ai-4] [x-ue-2] [zh-ong-1] [w-en-2] 

今天天气很好 (jin_tian_tian_qi_hen_hao)
  Pinyin: jin1 tian1 tian1 qi4 hen3 hao3
  Syllables: [j-in-1] [t-ian-1] [t-ian-1] [q-i-4] [h-en-3] [h-ao-3] 

知道 (zhi_dao)
  Pinyin: zhi1 dao4
  Syllables: [zh-i-1] [d-ao-4] 

吃饭 (chi_fan)
  Pinyin: chi1 fan4
  Syllables: [ch-i-1] [f-an-4] 


In [8]:
def compare_syllables(expected: Syllable, actual: Syllable) -> Dict:
    """
    Compare two syllables and identify differences

    Two syllables count as a match when their full pinyin strings are equal,
    or when initial, final and tone all agree. The second rule covers
    spellings that differ only in how the same syllable is written (for
    example 'ma' and 'ma5', which both decompose to neutral tone); without it
    such a pair would be reported as a mismatch with no error to show.

    Args:
        expected: Syllable the speaker should have produced
        actual: Syllable that was recognised

    Returns dict with:
        - match: bool (syllables are equivalent)
        - initial_match: bool
        - final_match: bool
        - tone_match: bool
        - feedback: str ('✓ Correct', or '✗ ' followed by each differing
          component as "expected → actual")
    """
    correct = {
        'match': True,
        'initial_match': True,
        'final_match': True,
        'tone_match': True,
        'feedback': '✓ Correct'
    }

    # Fast path: identical spelling
    if expected.full == actual.full:
        return correct

    initial_match = expected.initial == actual.initial
    final_match = expected.final == actual.final
    tone_match = expected.tone == actual.tone

    # Different spelling but every component agrees: still the same syllable
    if initial_match and final_match and tone_match:
        return correct

    # Generate specific feedback, one entry per differing component
    errors = []
    if not initial_match:
        errors.append(f"initial: '{expected.initial}' → '{actual.initial}'")
    if not final_match:
        errors.append(f"final: '{expected.final}' → '{actual.final}'")
    if not tone_match:
        tone_names = {
            '1': 'first tone (flat)',
            '2': 'second tone (rising)',
            '3': 'third tone (fall-rise)',
            '4': 'fourth tone (falling)',
            '5': 'neutral tone'
        }
        # Unknown tone values fall back to the raw value
        errors.append(
            f"tone: {tone_names.get(expected.tone, expected.tone)} → "
            f"{tone_names.get(actual.tone, actual.tone)}"
        )

    return {
        'match': False,
        'initial_match': initial_match,
        'final_match': final_match,
        'tone_match': tone_match,
        'feedback': "✗ " + ", ".join(errors)
    }

# Test comparison: (expected, actual) pinyin pairs
pinyin_pairs = [
    ('ma1', 'ma1'),    # Perfect match
    ('ma1', 'ma3'),    # Wrong tone
    ('ni3', 'li3'),    # Wrong initial
    ('hao3', 'hao4'),  # Wrong tone
]
test_cases = [
    (decompose_pinyin(want), decompose_pinyin(got))
    for want, got in pinyin_pairs
]

print("Syllable Comparison Tests:\n")
for expected, actual in test_cases:
    result = compare_syllables(expected, actual)
    print(f"{expected.full} vs {actual.full}: {result['feedback']}")

Syllable Comparison Tests:

ma1 vs ma1: ✓ Correct
ma1 vs ma3: ✗ tone: first tone (flat) → third tone (fall-rise)
ni3 vs li3: ✗ initial: 'n' → 'l'
hao3 vs hao4: ✗ tone: third tone (fall-rise) → fourth tone (falling)


In [12]:
def assess_pronunciation(expected_chinese: str, actual_chinese: str) -> Dict:
    """
    Complete pronunciation assessment

    Both texts are converted to numeric-tone pinyin, decomposed into
    initial/final/tone and compared position by position. No alignment is
    attempted: if the two texts yield a different number of syllables the
    result is a length mismatch with a score of 0.0 and no per-syllable
    details.

    Args:
        expected_chinese: What should have been said
        actual_chinese: What was actually said (from Whisper)

    Returns:
        Dictionary with:
            - overall_match: bool, True only if every syllable matched
            - score: float in 0.0-100.0, share of matching components
              (initial, final, tone), rounded to one decimal
            - expected_pinyin / actual_pinyin: space-joined pinyin strings
            - syllable_details: list of compare_syllables() results, each
              extended with 'position', 'expected' and 'actual'
            - summary: human-readable description of the outcome
    """
    # Convert to pinyin
    expected_pinyin = text_to_pinyin(expected_chinese, Style.TONE3)
    actual_pinyin = text_to_pinyin(actual_chinese, Style.TONE3)

    # Decompose into syllables
    expected_syllables = [decompose_pinyin(syl) for syl in expected_pinyin]
    actual_syllables = [decompose_pinyin(syl) for syl in actual_pinyin]

    # Handle length mismatch (score is a float here too, like every other path)
    if len(expected_syllables) != len(actual_syllables):
        return {
            'overall_match': False,
            'score': 0.0,
            'expected_pinyin': ' '.join(expected_pinyin),
            'actual_pinyin': ' '.join(actual_pinyin),
            'syllable_details': [],
            'summary': f"Length mismatch: expected {len(expected_syllables)} syllables, "
                      f"got {len(actual_syllables)}"
        }

    # Nothing on either side: do not report "perfect" for an empty comparison
    if not expected_syllables:
        return {
            'overall_match': False,
            'score': 0.0,
            'expected_pinyin': '',
            'actual_pinyin': '',
            'syllable_details': [],
            'summary': "No syllables to assess"
        }

    # Compare syllable by syllable
    syllable_comparisons = []
    for i, (exp_syl, act_syl) in enumerate(zip(expected_syllables, actual_syllables)):
        comparison = compare_syllables(exp_syl, act_syl)
        comparison['position'] = i
        comparison['expected'] = exp_syl.full
        comparison['actual'] = act_syl.full
        syllable_comparisons.append(comparison)

    # Calculate overall score: each syllable contributes initial + final + tone
    total_components = len(expected_syllables) * 3
    correct_components = sum(
        comp['initial_match'] + comp['final_match'] + comp['tone_match']
        for comp in syllable_comparisons
    )

    score = (correct_components / total_components) * 100
    overall_match = all(comp['match'] for comp in syllable_comparisons)

    return {
        'overall_match': overall_match,
        'score': round(score, 1),
        'expected_pinyin': ' '.join(expected_pinyin),
        'actual_pinyin': ' '.join(actual_pinyin),
        'syllable_details': syllable_comparisons,
        'summary': generate_feedback_summary(syllable_comparisons)
    }


def generate_feedback_summary(comparisons: List[Dict]) -> str:
    """
    Condense per-syllable comparison results into a one-line summary

    Args:
        comparisons: Dicts carrying 'match', 'tone_match', 'initial_match'
                     and 'final_match' flags

    Returns:
        "✓ Perfect pronunciation!" when no entry has a false 'match';
        otherwise "Issues: " followed by the non-zero error counts in the
        order tone, initial, final
    """
    has_mismatch = any(not c['match'] for c in comparisons)
    if not has_mismatch:
        return "✓ Perfect pronunciation!"

    # (flag to inspect, label used in the summary), in reporting order
    categories = (
        ('tone_match', "tone error(s)"),
        ('initial_match', "initial consonant error(s)"),
        ('final_match', "vowel/final error(s)"),
    )

    feedback_parts = []
    for flag, label in categories:
        count = len([c for c in comparisons if not c[flag]])
        if count > 0:
            feedback_parts.append(f"{count} {label}")

    return "Issues: " + ", ".join(feedback_parts)

# Try the end-to-end assessment on (expected, actual) text pairs
test_assessments = [
    ("你好", "你好"),      # Perfect
    ("妈妈", "妈妈"),      # Perfect
    ("马", "妈妈"),        # Wrong word
    ("骂", "妈妈"),        # Wrong word
]

print("Full Pronunciation Assessments:\n")
for expected, actual in test_assessments:
    result = assess_pronunciation(expected, actual)
    # One report per pair, preceded by a blank line
    report_lines = [
        f"\nExpected: {expected} → Actual: {actual}",
        f"Score: {result['score']}%",
        f"Expected pinyin: {result['expected_pinyin']}",
        f"Actual pinyin: {result['actual_pinyin']}",
        f"Summary: {result['summary']}",
    ]
    print("\n".join(report_lines))

Full Pronunciation Assessments:


Expected: 你好 → Actual: 你好
Score: 100.0%
Expected pinyin: ni3 hao3
Actual pinyin: ni3 hao3
Summary: ✓ Perfect pronunciation!

Expected: 妈妈 → Actual: 妈妈
Score: 100.0%
Expected pinyin: ma1 ma1
Actual pinyin: ma1 ma1
Summary: ✓ Perfect pronunciation!

Expected: 马 → Actual: 妈妈
Score: 0%
Expected pinyin: ma3
Actual pinyin: ma1 ma1
Summary: Length mismatch: expected 1 syllables, got 2

Expected: 骂 → Actual: 妈妈
Score: 0%
Expected pinyin: ma4
Actual pinyin: ma1 ma1
Summary: Length mismatch: expected 1 syllables, got 2


In [None]:
# Load transcription results from previous notebook
transcriptions_df = pd.read_csv("../data/transcriptions_df.csv")

# Load reference data for expected values
reference_df = pd.DataFrame(reference_data['phrases'])

# Merge to get expected Chinese characters.
# Left join: recordings without a reference entry are kept and get NaN for
# 'chinese', so the loop below reports them as skipped.
merged_df = transcriptions_df.merge(
    reference_df[['filename', 'chinese']],
    left_on='base_filename',
    right_on='filename',
    how='left',
    suffixes=('', '_ref')
)

# Rename for clarity
merged_df = merged_df.rename(columns={
    'chinese': 'expected',
    'transcription': 'actual'
})

print("Real Recording Assessments:\n")
print("="*80)

assessment_results = []

for idx, row in merged_df.iterrows():
    # Skip if we don't have both expected and actual
    if pd.isna(row['expected']) or pd.isna(row['actual']):
        print(f"\nFile: {row['base_filename']}.wav - SKIPPED (missing data)")
        print("-"*80)
        continue

    result = assess_pronunciation(row['expected'], row['actual'])

    # Store for later analysis
    assessment_results.append({
        'filename': row['base_filename'],
        'expected': row['expected'],
        'actual': row['actual'],
        'score': result['score'],
        'overall_match': result['overall_match']
    })

    print(f"\nFile: {row['base_filename']}.wav")
    print(f"Expected: {row['expected']} ({result['expected_pinyin']})")
    print(f"Actual:   {row['actual']} ({result['actual_pinyin']})")
    print(f"Score: {result['score']}% - {result['summary']}")

    if not result['overall_match']:
        print("\nDetailed Feedback:")
        for detail in result['syllable_details']:
            if not detail['match']:
                print(f"  Syllable {detail['position']+1}: {detail['feedback']}")

    print("-"*80)

# Summary statistics.
# Columns are named explicitly so the frame keeps its schema even when every
# row was skipped (a DataFrame built from an empty list has no columns).
assessment_df = pd.DataFrame(
    assessment_results,
    columns=['filename', 'expected', 'actual', 'score', 'overall_match']
)
print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Total recordings assessed: {len(assessment_df)}")
if assessment_df.empty:
    # Without this guard the statistics below would print NaN for an empty frame
    print("No recordings could be assessed (every row was missing expected or actual text)")
else:
    print(f"Perfect pronunciations: {assessment_df['overall_match'].sum()}")
    print(f"Average score: {assessment_df['score'].mean():.1f}%")
    print(f"Median score: {assessment_df['score'].median():.1f}%")
    print(f"Score range: {assessment_df['score'].min():.1f}% - {assessment_df['score'].max():.1f}%")

Real Recording Assessments:


File: chi_fan.wav
Expected: 吃饭 (chi1 fan4)
Actual:   吃饭 (chi1 fan4)
Score: 100.0% - ✓ Perfect pronunciation!
--------------------------------------------------------------------------------

File: jin_tian_tian_qi_hen_hao.wav
Expected: 今天天气很好 (jin1 tian1 tian1 qi4 hen3 hao3)
Actual:   今天天气很好 (jin1 tian1 tian1 qi4 hen3 hao3)
Score: 100.0% - ✓ Perfect pronunciation!
--------------------------------------------------------------------------------

File: ma_horse.wav
Expected: 马 (ma3)
Actual:   马 (ma3)
Score: 100.0% - ✓ Perfect pronunciation!
--------------------------------------------------------------------------------

File: ma_ma.wav
Expected: 妈妈 (ma1 ma1)
Actual:   妈妈 (ma1 ma1)
Score: 100.0% - ✓ Perfect pronunciation!
--------------------------------------------------------------------------------

File: ma_scold.wav
Expected: 骂 (ma4)
Actual:   妈 (ma1)
Score: 66.7% - Issues: 1 tone error(s)

Detailed Feedback:
  Syllable 1: ✗ tone: fourth tone (falling) → first tone (flat)

## Two Issues:

### 1. "Ni Hao" Romanization Problem

This is a genuine limitation. When Whisper can't confidently match audio to Chinese phonemes, it falls back to romanization. This creates a UX problem:

- **Root cause:** Whisper's confidence threshold. Poor pronunciation → low confidence → fallback to romanized output
- **Why this matters:** Beginners (our target market) will hit this constantly. A system that returns "Sorry, too unclear" isn't helpful.

**Possible solutions:**
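- (a) Force Chinese output mode: Whisper has no strict "Chinese characters only" switch, but setting `language="zh"` (optionally with a Chinese `initial_prompt`) biases decoding toward characters — worth testing first, though it may not eliminate romanized output for very unclear audio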
- (b) Fuzzy matching: If Whisper returns romanization, try to map "Ni Hao" → closest Chinese match from expected phrases
- (c) Use a different model for very poor pronunciation: Models specifically trained on learner speech (not Whisper)
- (d) Hybrid approach: If romanization detected, fall back to simpler "try again" feedback rather than detailed phoneme analysis

### 2. 骂 vs 妈 Confusion

This is likely **Whisper's language model bias**.

**Language models have priors.** Whisper was trained on natural speech where:
- 妈 (mother) appears 1000x more frequently than 骂 (scold)
- People rarely say 骂 in isolation
- Given ambiguous audio, the model defaults to the more common word

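**This is a fundamental challenge in speech recognition:** the acoustic evidence says "ma-ish sound with falling tone" → the model's language prior says "probably 妈, not 骂", because 妈 is far more likely when there is no surrounding context to disambiguate. (Whisper is a single end-to-end model, so this prior lives in its decoder rather than in a separate language model, but the effect is the same.)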