In [None]:
"""
Iteration: 1 - CORAL-Urdu-ASR - CORAL_Iteration1_Baseline_Evaluation.ipynb
===============================================
Paste this entire script into a Kaggle notebook cell and run.
Requires: Mozilla Common Voice Urdu dataset as input
"""

# ============================================================================
# PART 1: INSTALL DEPENDENCIES
# ============================================================================
print("Installing dependencies...")
import subprocess
import sys

# Invoke pip through an argument list instead of the IPython `!` shell escape.
# The shell form is split at the first space in the interpreter path (e.g.
# "C:\Users\First Last\...") and is not valid syntax outside a notebook.
_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "editdistance"],
    capture_output=True,
    text=True,
)
if _pip_result.returncode != 0:
    # Best effort, as before: report the failure and continue. If the package
    # is really missing, `import editdistance` below fails with a clear error.
    print(f"WARNING: 'pip install editdistance' exited with code {_pip_result.returncode}")
    print(_pip_result.stderr.strip())

# ============================================================================
# PART 2: IMPORTS
# ============================================================================
print("Loading libraries...")
import torch
import gc
import librosa
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Tuple, Dict
import warnings
import json
import csv
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import editdistance
from collections import defaultdict
from datetime import datetime

# Suppress every warning raised from this point on (applies to all categories).
warnings.filterwarnings('ignore')
# Use seaborn's "whitegrid" theme for all matplotlib figures created later.
sns.set_style("whitegrid")

# Hugging Face processor/model classes used by UrduASRWrapper to load the
# Whisper, wav2vec2 and MMS checkpoints.
from transformers import (
    WhisperProcessor, WhisperForConditionalGeneration,
    Wav2Vec2Processor, Wav2Vec2ForCTC,
    AutoProcessor, AutoModelForCTC
)

# Report whether a CUDA device is visible to PyTorch.
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

# ============================================================================
# PART 3: CONFIGURATION
# ============================================================================
# Run-wide settings read by run_iteration1().
CONFIG = {
    # Root of the Common Voice Urdu release; expected to contain a clips/
    # folder and a test.tsv / dev.tsv / validated.tsv metadata file.
    'DATASET_PATH': "/kaggle/input/common-voice-ur/cv-corpus-22.0-delta-2025-06-20/ur",
    'MAX_SAMPLES': 10,  # Start small
    'OUTPUT_DIR': './iteration1_results',
    # Models to evaluate (comment out slow models for quick testing)
    'MODELS': [
        "whisper-small",      # Fast
        "whisper-medium",     # Balanced
        "whisper-large",      # Accurate but slow
        "wav2vec2-urdu",      # Urdu-specific
        "mms-300m",           # Multilingual
    ],
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu',
    # When True, run gc + CUDA cache cleanup after each model is evaluated.
    'BATCH_CLEANUP': True,
}

print("\nConfiguration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# ============================================================================
# PART 4: ASR WRAPPER (Streamlined)
# ============================================================================

class UrduASRWrapper:
    """Transcribe a single audio file with one of several pretrained ASR models.

    ``word_probabilities`` loads the requested Hugging Face checkpoint,
    transcribes the file and returns ``(word, confidence)`` pairs. By default
    the model is released again before the call returns, so at most one model
    is resident at a time; pass ``keep_loaded=True`` to reuse the loaded model
    on the next call with the same ``model_name``.
    """

    # Short alias -> Hugging Face Hub checkpoint id.
    # NOTE(review): "facebook/mms-300m" looks like a pretraining-only checkpoint;
    # confirm it ships a CTC head and tokenizer before relying on its output.
    SUPPORTED_MODELS = {
        "whisper-large": "openai/whisper-large-v3",
        "whisper-medium": "openai/whisper-medium",
        "whisper-small": "openai/whisper-small",
        "mms-1b": "facebook/mms-1b-all",
        "mms-300m": "facebook/mms-300m",
        "wav2vec2-urdu": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
    }

    # Confidence reported for Whisper output when generate() returns no scores.
    _FALLBACK_CONFIDENCE = 0.8

    def __init__(self, device='cpu'):
        """Create a wrapper that runs models on ``device``; nothing is loaded yet."""
        self.device = device
        self.current_model = None
        self.processor = None
        self.current_model_name = None

    def _preprocess_audio(self, file_path: str, target_sr: int = 16000):
        """Load ``file_path`` as mono float32 at ``target_sr`` Hz, peak-normalised.

        Raises:
            ValueError: if the file decodes to zero samples.
        """
        audio, _ = librosa.load(file_path, sr=target_sr, mono=True)
        if audio.size == 0:
            # np.max() on an empty array raises an opaque ValueError; say why.
            raise ValueError(f"Audio file contains no samples: {file_path}")
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)
        peak = np.abs(audio).max()
        if peak > 0:  # leave pure silence untouched instead of dividing by zero
            audio = audio / peak
        return audio

    def _load_model(self, model_name: str):
        """Make ``model_name`` the resident model, loading it if necessary.

        Raises:
            ValueError: if ``model_name`` is not a key of ``SUPPORTED_MODELS``.
        """
        if model_name not in self.SUPPORTED_MODELS:
            raise ValueError(f"Model {model_name} not supported")

        # Already resident (only happens after a keep_loaded=True call).
        if self.current_model is not None and self.current_model_name == model_name:
            return

        # Release any other model first so two checkpoints never coexist.
        self._cleanup()

        model_id = self.SUPPORTED_MODELS[model_name]
        if "whisper" in model_name:
            processor_cls, model_cls = WhisperProcessor, WhisperForConditionalGeneration
        elif "mms" in model_name:
            processor_cls, model_cls = AutoProcessor, AutoModelForCTC
        elif "wav2vec2" in model_name:
            processor_cls, model_cls = Wav2Vec2Processor, Wav2Vec2ForCTC
        else:
            raise ValueError(f"Unknown model type: {model_name}")

        self.processor = processor_cls.from_pretrained(model_id)
        model = model_cls.from_pretrained(model_id).to(self.device)
        model.eval()
        self.current_model = model
        self.current_model_name = model_name

    def _extract_whisper_probs(self, audio_array) -> List[Tuple[str, float]]:
        """Transcribe with the loaded Whisper model.

        Every word gets the same utterance-level confidence: the mean, over
        decoding steps, of the highest softmax probability at that step.
        """
        input_features = self.processor(
            audio_array, sampling_rate=16000, return_tensors="pt"
        ).input_features.to(self.device)

        with torch.no_grad():
            output = self.current_model.generate(
                input_features, return_dict_in_generate=True, output_scores=True
            )

        transcription = self.processor.batch_decode(
            output.sequences, skip_special_tokens=True
        )[0]

        step_scores = getattr(output, 'scores', None)
        if step_scores:
            step_probs = [torch.softmax(score, dim=-1).max().item() for score in step_scores]
            avg_prob = float(np.mean(step_probs))
        else:
            avg_prob = self._FALLBACK_CONFIDENCE

        return [(word, avg_prob) for word in transcription.strip().split()]

    def _extract_ctc_probs(self, audio_array) -> List[Tuple[str, float]]:
        """Transcribe with the loaded CTC model (wav2vec2 / MMS) via greedy argmax.

        Every word gets the same confidence: the mean over all frames of the
        highest per-frame softmax probability.
        """
        inputs = self.processor(
            audio_array, sampling_rate=16000, return_tensors="pt", padding=True
        )
        input_values = inputs.input_values.to(self.device)

        with torch.no_grad():
            logits = self.current_model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)[0]

        words = transcription.strip().split()
        if not words:
            return []
        avg_conf = torch.softmax(logits, dim=-1).max(dim=-1).values.mean().item()
        return [(word, avg_conf) for word in words]

    def _cleanup(self):
        """Drop the resident model and processor and release their memory."""
        self.current_model = None
        self.processor = None
        # Reset the name too, so a stale name can never look like a loaded model.
        self.current_model_name = None
        # Collect first so freed tensors can actually be returned by empty_cache().
        gc.collect()
        if str(self.device).startswith("cuda"):  # also covers "cuda:0" etc.
            torch.cuda.empty_cache()

    def word_probabilities(self, audio_file_path: str, model_name: str,
                           *, keep_loaded: bool = False) -> List[Tuple[str, float]]:
        """Transcribe ``audio_file_path`` with ``model_name``.

        Args:
            audio_file_path: path to an audio file readable by librosa.
            model_name: a key of ``SUPPORTED_MODELS``.
            keep_loaded: when True, leave the model resident after a successful
                call so the next call with the same ``model_name`` skips the
                reload. Defaults to False (load and release on every call).

        Returns:
            A list of ``(word, confidence)`` tuples, possibly empty.

        Raises:
            RuntimeError: on any failure; the original exception is chained as
                ``__cause__``. The model is always released on failure.
        """
        try:
            # Validate before touching the disk so a typo fails fast.
            if model_name not in self.SUPPORTED_MODELS:
                raise ValueError(f"Model {model_name} not supported")

            audio_array = self._preprocess_audio(audio_file_path)
            self._load_model(model_name)

            if "whisper" in model_name:
                results = self._extract_whisper_probs(audio_array)
            else:
                results = self._extract_ctc_probs(audio_array)
        except Exception as e:
            self._cleanup()
            raise RuntimeError(f"Error: {str(e)}") from e

        if not keep_loaded:
            self._cleanup()
        return results

# ============================================================================
# PART 5: EVALUATION FUNCTIONS
# ============================================================================

def compute_wer(reference: str, hypothesis: str):
    """Return the word error rate of ``hypothesis`` against ``reference``.

    Both strings are split on whitespace; the result is the word-level edit
    distance divided by the number of reference words. With an empty
    reference the result is 0.0 for an empty hypothesis and 1.0 otherwise.
    """
    ref_tokens = reference.strip().split()
    hyp_tokens = hypothesis.strip().split()
    if ref_tokens:
        return editdistance.eval(ref_tokens, hyp_tokens) / len(ref_tokens)
    # Nothing to compare against: perfect only if nothing was predicted.
    return 1.0 if hyp_tokens else 0.0

def compute_cer(reference: str, hypothesis: str):
    """Return the character error rate of ``hypothesis`` against ``reference``.

    Both strings are stripped of surrounding whitespace; the result is the
    character-level edit distance divided by the reference length. With an
    empty reference the result is 0.0 for an empty hypothesis and 1.0 otherwise.
    """
    ref_symbols = list(reference.strip())
    hyp_symbols = list(hypothesis.strip())
    if ref_symbols:
        return editdistance.eval(ref_symbols, hyp_symbols) / len(ref_symbols)
    # Nothing to compare against: perfect only if nothing was predicted.
    return 1.0 if hyp_symbols else 0.0

def compute_ece(confidences, accuracies, n_bins=10):
    """Return the Expected Calibration Error over equal-width confidence bins.

    Confidences are grouped into ``n_bins`` bins spanning [0, 1]. For each
    non-empty bin the absolute gap between mean confidence and mean accuracy
    is weighted by the fraction of all items that fall in that bin.

    Args:
        confidences: array-like of predicted confidences.
        accuracies: array-like of 0/1 correctness values, same length.
        n_bins: number of equal-width bins.

    Returns:
        The ECE as a float; 0.0 for empty input. Confidences outside [0, 1]
        fall in no bin and contribute nothing.
    """
    confidences = np.asarray(confidences, dtype=float)
    accuracies = np.asarray(accuracies, dtype=float)
    if confidences.size == 0:
        return 0.0

    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        lower, upper = bin_boundaries[i], bin_boundaries[i + 1]
        # Bins are (lower, upper], except the first, which is closed on the
        # left so a confidence of exactly 0.0 is not silently dropped.
        above_lower = confidences >= lower if i == 0 else confidences > lower
        in_bin = above_lower & (confidences <= upper)
        if in_bin.any():
            gap = abs(confidences[in_bin].mean() - accuracies[in_bin].mean())
            ece += gap * in_bin.mean()
    return float(ece)

def _parse_duration(raw):
    """Return ``raw`` as a float, or 0.0 when it is missing, blank or non-numeric."""
    try:
        return float(raw)
    except (TypeError, ValueError):
        return 0.0


def load_test_samples(dataset_path, max_samples):
    """Load up to ``max_samples`` test items from a Common Voice style folder.

    The first of ``test.tsv``, ``dev.tsv``, ``validated.tsv`` found in
    ``dataset_path`` is used as metadata. Rows whose audio file is missing
    from ``clips/`` are skipped and do not count towards ``max_samples``.

    Args:
        dataset_path: folder containing the metadata file and ``clips/``.
        max_samples: maximum number of samples to return; ``None`` = no limit.

    Returns:
        A list of dicts with keys ``audio_id``, ``audio_path``, ``reference``
        and ``duration`` (0.0 when the metadata has no usable duration).

    Raises:
        FileNotFoundError: if none of the metadata files exists.
    """
    dataset_path = Path(dataset_path)

    # Find metadata file
    candidates = ("test.tsv", "dev.tsv", "validated.tsv")
    tsv_file = next(
        (dataset_path / name for name in candidates if (dataset_path / name).is_file()),
        None,
    )
    if tsv_file is None:
        raise FileNotFoundError(
            f"No metadata file ({', '.join(candidates)}) found in {dataset_path}"
        )

    clips_dir = dataset_path / "clips"
    samples = []
    # newline='' is what the csv module expects; QUOTE_NONE because Common
    # Voice TSVs are unquoted, so a sentence starting with '"' must not be
    # treated as the opening of a quoted field.
    with open(tsv_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Count loaded samples, not rows, so missing clips don't shrink the set.
            if max_samples is not None and len(samples) >= max_samples:
                break
            audio_path = clips_dir / row['path']
            # is_file() rather than exists(): a blank 'path' resolves to clips/ itself.
            if not audio_path.is_file():
                continue
            samples.append({
                'audio_id': row['path'],
                'audio_path': str(audio_path),
                'reference': row['sentence'],
                'duration': _parse_duration(row.get('duration')),
            })

    return samples

def evaluate_model(asr_wrapper, model_name, test_samples):
    """Transcribe every sample with ``model_name`` and score each one.

    For each sample the hypothesis is the space-joined words returned by
    ``asr_wrapper.word_probabilities``. A hypothesis word counts as correct
    for calibration only when it equals the reference word at the same
    position. Samples that raise are reported on stdout and skipped.

    Returns:
        A list with one dict per successfully processed sample holding
        ``audio_id``, ``model_name``, ``reference``, ``hypothesis``, ``wer``,
        ``cer``, ``avg_confidence``, ``ece`` and ``duration``.
    """
    results = []

    for sample in tqdm(test_samples, desc=model_name):
        try:
            word_probs = asr_wrapper.word_probabilities(sample['audio_path'], model_name)
            hyp_words = [word for word, _ in word_probs]
            hypothesis = ' '.join(hyp_words)
            reference = sample['reference']

            wer = compute_wer(reference, hypothesis)
            cer = compute_cer(reference, hypothesis)

            confidences = [prob for _, prob in word_probs]
            if word_probs:
                avg_conf = np.mean(confidences)
            else:
                avg_conf = 0.0

            # Calibration: position-wise exact match against the reference.
            ref_words = reference.split()
            n_ref = len(ref_words)
            accuracies = [
                float(idx < n_ref and word == ref_words[idx])
                for idx, word in enumerate(hyp_words)
            ]

            if confidences:
                ece = compute_ece(np.array(confidences), np.array(accuracies))
            else:
                ece = 0.0

            record = {
                'audio_id': sample['audio_id'],
                'model_name': model_name,
                'reference': reference,
                'hypothesis': hypothesis,
                'wer': wer,
                'cer': cer,
                'avg_confidence': avg_conf,
                'ece': ece,
                'duration': sample['duration'],
            }
            results.append(record)
        except Exception as e:
            # Best effort: one bad sample must not abort the whole model run.
            print(f"\nError on {sample['audio_id']}: {str(e)}")
            continue

    return results

# ============================================================================
# PART 6: VISUALIZATION
# ============================================================================

def _save_mean_barh(df, metric, color, xlabel, title, out_path):
    """Save a horizontal bar chart of the per-model mean of ``metric``."""
    means = df.groupby('model_name')[metric].mean().sort_values()
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.barh(means.index, means.values, color=color)
        ax.set_xlabel(xlabel)
        ax.set_title(title, fontweight='bold')
        ax.grid(axis='x', alpha=0.3)
        fig.tight_layout()
        fig.savefig(out_path, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)  # always release the figure, even if saving fails


def generate_plots(df, output_dir):
    """Write the three summary figures for an evaluation run.

    Args:
        df: per-sample results with at least the columns ``model_name``,
            ``wer`` and ``ece``.
        output_dir: folder that receives ``wer_comparison.png``,
            ``wer_distribution.png`` and ``calibration.png``; created if
            it does not exist.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # WER comparison
    _save_mean_barh(
        df, 'wer', 'steelblue',
        'Word Error Rate (WER)',
        'Model Comparison: Average WER',
        output_dir / 'wer_comparison.png',
    )

    # WER distribution
    # Pass our own axes: without `ax`, DataFrame.boxplot(by=...) opens a new
    # figure, which would ignore figsize and leave an empty figure unclosed.
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        df.boxplot(column='wer', by='model_name', ax=ax)
        ax.set_ylabel('WER')
        ax.set_title('WER Distribution by Model', fontweight='bold')
        fig.suptitle('')  # drop pandas' automatic "Boxplot grouped by ..." header
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()
        fig.savefig(output_dir / 'wer_distribution.png', dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)

    # Calibration
    _save_mean_barh(
        df, 'ece', 'coral',
        'Expected Calibration Error (ECE)',
        'Confidence Calibration by Model',
        output_dir / 'calibration.png',
    )

# ============================================================================
# PART 7: MAIN EXECUTION
# ============================================================================

def run_iteration1():
    """Run the full baseline evaluation described by ``CONFIG``.

    Steps: load the test samples, evaluate every model in ``CONFIG['MODELS']``,
    save per-sample and aggregate CSVs, generate the plots and write a text
    report, all under ``CONFIG['OUTPUT_DIR']``.

    Returns:
        ``(df, aggregate)``: the per-sample results DataFrame and the
        per-model aggregate DataFrame.

    Raises:
        RuntimeError: if no test samples could be loaded, or if no sample was
            transcribed successfully by any model.
    """
    print("\n" + "="*80)
    print("CORAL ITERATION 1: BASELINE EVALUATION")
    print("="*80)
    print(f"\nTimestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Setup
    output_dir = Path(CONFIG['OUTPUT_DIR'])
    output_dir.mkdir(exist_ok=True, parents=True)

    # Load data
    print(f"\n[1/5] Loading test dataset from {CONFIG['DATASET_PATH']}...")
    test_samples = load_test_samples(CONFIG['DATASET_PATH'], CONFIG['MAX_SAMPLES'])
    print(f"Loaded {len(test_samples)} test samples")
    if not test_samples:
        # Fail before any model is downloaded/loaded for nothing.
        raise RuntimeError(
            f"No test samples were loaded from {CONFIG['DATASET_PATH']}; "
            "check that clips/ contains the audio files listed in the .tsv file"
        )

    # Initialize
    print(f"\n[2/5] Initializing ASR wrapper on {CONFIG['DEVICE']}...")
    asr_wrapper = UrduASRWrapper(device=CONFIG['DEVICE'])

    # Evaluate
    print(f"\n[3/5] Evaluating {len(CONFIG['MODELS'])} models...")
    all_results = []

    for model in CONFIG['MODELS']:
        print(f"\n{'='*60}")
        print(f"Model: {model}")
        print(f"{'='*60}")
        model_results = evaluate_model(asr_wrapper, model, test_samples)
        all_results.extend(model_results)

        if CONFIG['BATCH_CLEANUP']:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    if not all_results:
        # An empty DataFrame has no 'model_name' column; the groupby calls
        # below would otherwise fail with an unhelpful KeyError.
        raise RuntimeError(
            "No sample was transcribed successfully by any model; "
            "see the per-sample errors printed above"
        )

    # Convert to DataFrame
    df = pd.DataFrame(all_results)

    # Save detailed results
    print(f"\n[4/5] Saving results...")
    df.to_csv(output_dir / 'detailed_results.csv', index=False, encoding='utf-8')

    # Compute aggregates
    aggregate = df.groupby('model_name').agg({
        'wer': ['mean', 'std', 'min', 'max'],
        'cer': ['mean', 'std'],
        'avg_confidence': ['mean', 'std'],
        'ece': ['mean', 'std'],
        'duration': 'sum'
    }).round(4)

    aggregate.to_csv(output_dir / 'aggregate_metrics.csv')

    print("\n" + "="*80)
    print("AGGREGATE METRICS")
    print("="*80)
    print(aggregate)

    # Generate visualizations
    print(f"\n[5/5] Generating visualizations...")
    generate_plots(df, output_dir)

    # Best model = lowest mean WER; computed once and reused below.
    mean_wer_by_model = df.groupby('model_name')['wer'].mean()
    best_model = mean_wer_by_model.idxmin()
    best_wer = mean_wer_by_model.min()

    # Generate report
    report_file = output_dir / 'ITERATION1_REPORT.txt'
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("="*80 + "\n")
        f.write("CORAL PROJECT - ITERATION 1 EVALUATION REPORT\n")
        f.write("="*80 + "\n\n")

        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Samples: {len(test_samples)}\n")
        f.write(f"Models: {len(CONFIG['MODELS'])}\n")
        f.write(f"Total Duration: {df['duration'].sum():.2f}s\n\n")

        f.write("-"*80 + "\n")
        f.write("BASELINE WER BY MODEL\n")
        f.write("-"*80 + "\n\n")
        f.write(df.groupby('model_name')['wer'].describe().to_string())
        f.write("\n\n")

        f.write(f"BEST MODEL: {best_model}\n")
        f.write(f"BASELINE WER: {best_wer:.4f} ({best_wer*100:.2f}%)\n\n")

        f.write("-"*80 + "\n")
        f.write("CALIBRATION ANALYSIS\n")
        f.write("-"*80 + "\n\n")
        f.write(df.groupby('model_name')['ece'].describe().to_string())
        f.write("\n\n")

        f.write("-"*80 + "\n")
        f.write("ITERATION 1 DELIVERABLES - COMPLETE\n")
        f.write("-"*80 + "\n\n")
        f.write("✓ ASR ensemble integrated\n")
        f.write("✓ Confidence extraction implemented\n")
        f.write("✓ Baseline WER established\n")
        f.write("✓ Calibration metrics computed\n")
        f.write("✓ Comparative analysis complete\n\n")

        f.write("-"*80 + "\n")
        f.write("NEXT STEPS (ITERATION 2)\n")
        f.write("-"*80 + "\n\n")
        f.write("• Develop LLM instruction prompts\n")
        f.write("• Test hypothesis fusion strategies\n")
        # Next-iteration goal: a 15% relative reduction of the best baseline WER.
        f.write(f"• Target WER: < {(best_wer*0.85)*100:.2f}%\n")
        f.write("• Begin prompt engineering experiments\n\n")

        f.write("="*80 + "\n")

    # Print summary
    print("\n" + "="*80)
    print("ITERATION 1 COMPLETE")
    print("="*80)
    print(f"\nResults saved to: {output_dir.absolute()}")
    print("\nGenerated files:")
    print("  • detailed_results.csv")
    print("  • aggregate_metrics.csv")
    print("  • wer_comparison.png")
    print("  • wer_distribution.png")
    print("  • calibration.png")
    print("  • ITERATION1_REPORT.txt")

    print("\n" + "-"*80)
    print("KEY FINDINGS")
    print("-"*80)
    print(f"Best Model: {best_model}")
    print(f"Baseline WER: {best_wer*100:.2f}%")
    print(f"Samples Evaluated: {len(df)}")
    print(f"Average ECE: {df['ece'].mean():.4f}")

    print("\n" + "-"*80)
    print("READY FOR ITERATION 2")
    print("-"*80)
    print(f"Target WER: < {(best_wer*0.85)*100:.2f}%")
    print("Next: Develop LLM-based hypothesis correction")
    print("="*80 + "\n")

    return df, aggregate

# ============================================================================
# RUN THE EVALUATION
# ============================================================================

# Entry point: run the evaluation, then print follow-up hints on success or a
# troubleshooting checklist (before re-raising) on failure.
if __name__ == "__main__":
    try:
        results_df, aggregate_metrics = run_iteration1()

        success_lines = (
            "\nEvaluation successful!",
            "Review the results in ./iteration1_results/",
            "\nTo analyze further:",
            "  results_df.head()  # View sample results",
            "  results_df.describe()  # Statistical summary",
            "  results_df.groupby('model_name')['wer'].mean()  # WER by model",
        )
        for line in success_lines:
            print(line)

    except Exception as e:
        failure_lines = (
            f"\n\nERROR: {str(e)}",
            "\nTroubleshooting steps:",
            "1. Verify DATASET_PATH in CONFIG",
            "2. Check dataset structure (clips/ folder and .tsv files)",
            "3. Reduce MAX_SAMPLES if running out of memory",
            "4. Comment out large models in CONFIG['MODELS']",
        )
        for line in failure_lines:
            print(line)
        # Re-raise so the notebook still shows the full traceback.
        raise

Installing dependencies...
Loading libraries...


'c:\Users\Nouman' is not recognized as an internal or external command,
operable program or batch file.
  from .autonotebook import tqdm as notebook_tqdm


Device: CPU

Configuration:
  DATASET_PATH: C:\Users\Nouman Hafeez\Desktop\FYP\dataset\cv-corpus-22.0-delta-2025-06-20\ur
  MAX_SAMPLES: 10
  OUTPUT_DIR: ./iteration1_results
  MODELS: ['whisper-small', 'whisper-medium', 'whisper-large', 'wav2vec2-urdu', 'mms-300m']
  DEVICE: cpu
  BATCH_CLEANUP: True

CORAL ITERATION 1: BASELINE EVALUATION

Timestamp: 2025-10-05 11:34:18

[1/5] Loading test dataset from C:\Users\Nouman Hafeez\Desktop\FYP\dataset\cv-corpus-22.0-delta-2025-06-20\ur...


ERROR: [Errno 2] No such file or directory: 'C:\\Users\\Nouman Hafeez\\Desktop\\FYP\\dataset\\cv-corpus-22.0-delta-2025-06-20\\ur\\validated.tsv'

Troubleshooting steps:
1. Verify DATASET_PATH in CONFIG
2. Check dataset structure (clips/ folder and .tsv files)
3. Reduce MAX_SAMPLES if running out of memory
4. Comment out large models in CONFIG['MODELS']


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Nouman Hafeez\\Desktop\\FYP\\dataset\\cv-corpus-22.0-delta-2025-06-20\\ur\\validated.tsv'