# In [None]:
"""
Iteration: 1 - CORAL-Urdu-ASR - CORAL_Iteration1_ASR_Ensemble.ipynb
Urdu ASR Wrapper for Multiple Models
Supports diverse ASR models for Urdu speech recognition
Optimized for Kaggle CPU/GPU notebooks with one-at-a-time loading
"""
# ============================================================================
# COMPLETE REAL-TIME URDU ASR SYSTEM FOR KAGGLE
# ============================================================================

from flask import Flask, render_template_string, request, jsonify, send_file
import os
import tempfile
import time
import json
import zipfile
import io
from pathlib import Path
from threading import Thread
from datetime import datetime
import torch
import gc
import librosa
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from transformers import (
    WhisperProcessor, 
    WhisperForConditionalGeneration,
    Wav2Vec2Processor, 
    Wav2Vec2ForCTC,
    SeamlessM4TForSpeechToText,
    SeamlessM4TProcessor,
    AutoProcessor,
    AutoModelForCTC
)

# Optional dependency used to convert Devanagari (Hindi) text to Urdu script.
# TRANSLITERATION_AVAILABLE records whether both imports succeeded.
try:
    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate
    TRANSLITERATION_AVAILABLE = True
except ImportError:
    TRANSLITERATION_AVAILABLE = False
    # Nothing in this block installs the package, so the message states what
    # actually happens and how to install it instead of claiming an install.
    print(
        "Warning: indic-transliteration not available; Devanagari output will "
        "not be converted to Urdu script. Install it with "
        "`pip install indic-transliteration`."
    )

# unicodedata is part of the Python standard library; the guard is kept so the
# UNICODE_AVAILABLE flag continues to exist for any code that reads it.
try:
    import unicodedata
    UNICODE_AVAILABLE = True
except ImportError:
    UNICODE_AVAILABLE = False

# ============================================================================
# ASR WRAPPER CLASS
# ============================================================================

class UrduASRWrapper:
    """Load one Urdu ASR model at a time and return word/confidence pairs.

    The public entry point is :meth:`word_probabilities`. It preprocesses an
    audio file, loads the requested model, runs inference and always releases
    the model afterwards, so at most one model is held by the wrapper.

    Confidences are utterance-level: every word of a transcription receives
    the same score, because none of the extractors aligns scores to words.
    """

    # Short model keys accepted by the public API -> Hugging Face model ids.
    # NOTE(review): "facebook/mms-300m" looks like a pretrained-only checkpoint;
    # confirm it ships a CTC head and tokenizer before relying on "mms-300m".
    SUPPORTED_MODELS = {
        "whisper-large": "openai/whisper-large-v3",
        "whisper-medium": "openai/whisper-medium",
        "whisper-small": "openai/whisper-small",
        "seamless-large": "facebook/seamless-m4t-v2-large",
        "seamless-medium": "facebook/seamless-m4t-medium",
        "mms-1b": "facebook/mms-1b-all",
        "mms-300m": "facebook/mms-300m",
        "wav2vec2-urdu": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu"
    }

    # Language codes tried, in order, when selecting the Urdu adapter of an MMS
    # checkpoint. The script-qualified code is preferred because this wrapper
    # targets Perso-Arabic output; the bare "urd" code is kept as a fallback.
    # NOTE(review): confirm the exact names against the checkpoint's tokenizer.
    _MMS_URDU_LANGS = ("urd-script_arabic", "urd")

    def _convert_to_urdu_script(self, text: str) -> str:
        """Convert Devanagari (Hindi) text to Perso-Arabic (Urdu) script.

        Args:
            text: Transcription produced by a model.

        Returns:
            ``text`` unchanged when it is empty/blank, already contains
            Arabic-script characters, contains no Devanagari characters, the
            transliteration package is unavailable, or transliteration fails;
            otherwise the transliterated string.
        """
        if not text or not text.strip():
            return text

        # Any character in U+0600-U+06FF or U+0750-U+077F means the text is
        # treated as already being in Arabic/Urdu script.
        if any('\u0600' <= char <= '\u06FF' or '\u0750' <= char <= '\u077F' for char in text):
            return text

        # Without Devanagari characters (U+0900-U+097F) there is nothing to
        # transliterate, so skip the library call entirely.
        if not any('\u0900' <= char <= '\u097F' for char in text):
            return text

        if not TRANSLITERATION_AVAILABLE:
            return text

        try:
            from indic_transliteration import sanscript
            # NOTE(review): confirm the installed indic-transliteration exposes
            # a URDU scheme; if it does not, the error is caught below and the
            # text is returned unconverted.
            return sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.URDU)
        except Exception as e:
            # Best effort: an unconverted transcription beats no transcription.
            print(f"Transliteration failed: {e}")
            return text

    def __init__(self, device: str = None):
        """Create a wrapper with no model loaded.

        Args:
            device: Torch device string. When ``None``, ``"cuda"`` is used if
                CUDA is available and ``"cpu"`` otherwise.
        """
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"ASR Wrapper initialized on: {self.device}")

        self.current_model = None
        self.processor = None
        self.current_model_name = None

    def _preprocess_audio(self, file_path: str, target_sr: int = 16000) -> np.ndarray:
        """Load an audio file as mono float32 samples, peak-normalised.

        Args:
            file_path: Path of the audio file to load.
            target_sr: Sample rate to resample to.

        Returns:
            One-dimensional float32 array divided by its absolute peak (left
            untouched when the peak is zero).

        Raises:
            ValueError: If the file cannot be loaded or contains no samples.
        """
        try:
            audio, sr = librosa.load(file_path, sr=target_sr, mono=True)

            if audio.size == 0:
                # Explicit message instead of numpy's opaque reduction error.
                raise ValueError("audio contains no samples")

            if audio.dtype != np.float32:
                audio = audio.astype(np.float32)

            max_val = np.abs(audio).max()
            if max_val > 0:
                audio = audio / max_val

            return audio

        except Exception as e:
            raise ValueError(f"Error loading audio file {file_path}: {str(e)}") from e

    def _load_model(self, model_name: str):
        """Load the processor and model for ``model_name`` onto ``self.device``.

        The model family is chosen by substring match on ``model_name``
        (whisper, seamless, mms, wav2vec2). On success the model is put in
        eval mode and ``current_model_name`` is set.

        Args:
            model_name: A key of ``SUPPORTED_MODELS``.

        Raises:
            ValueError: If ``model_name`` is not a supported key.
            RuntimeError: If loading the processor or model fails.
        """
        if model_name not in self.SUPPORTED_MODELS:
            raise ValueError(f"Model {model_name} not supported")

        model_id = self.SUPPORTED_MODELS[model_name]
        print(f"Loading {model_name} ({model_id})...")

        try:
            if "whisper" in model_name:
                self.processor = WhisperProcessor.from_pretrained(model_id)
                self.current_model = WhisperForConditionalGeneration.from_pretrained(model_id)

            elif "seamless" in model_name:
                self.processor = SeamlessM4TProcessor.from_pretrained(model_id)
                self.current_model = SeamlessM4TForSpeechToText.from_pretrained(model_id)

            elif "mms" in model_name:
                self.processor = AutoProcessor.from_pretrained(model_id)
                self.current_model = AutoModelForCTC.from_pretrained(model_id)

            elif "wav2vec2" in model_name:
                self.processor = Wav2Vec2Processor.from_pretrained(model_id)
                self.current_model = Wav2Vec2ForCTC.from_pretrained(model_id)

            else:
                # Guards against a future table entry with no matching loader.
                raise ValueError(f"No loader available for {model_name}")

            self.current_model = self.current_model.to(self.device)
            self.current_model.eval()
            self.current_model_name = model_name

            print(f"Model {model_name} loaded successfully")

        except Exception as e:
            raise RuntimeError(f"Failed to load model {model_name}: {str(e)}") from e

    @staticmethod
    def _uniform_word_confidences(transcription, step_scores, fallback):
        """Pair every word of ``transcription`` with one shared confidence.

        The confidence is the mean, over generation steps, of the largest
        softmax probability found in that step's score tensor. When no step
        scores are available, ``fallback`` is used instead.
        """
        words = transcription.strip().split()
        if not words:
            return []

        if step_scores:
            step_peaks = [
                torch.softmax(score, dim=-1).max().item() for score in step_scores
            ]
            confidence = float(np.mean(step_peaks))
        else:
            confidence = fallback

        return [(word, confidence) for word in words]

    def _extract_whisper_probabilities(self, audio_array: np.ndarray):
        """Transcribe with the loaded Whisper model and score the words.

        Args:
            audio_array: Audio samples, passed to the processor as 16 kHz.

        Returns:
            List of ``(word, confidence)`` tuples; the fallback confidence is
            0.8 when generation returns no scores.
        """
        input_features = self.processor(
            audio_array,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features.to(self.device)

        with torch.no_grad():
            generated = self.current_model.generate(
                input_features,
                language="urdu",  # Force Urdu language
                task="transcribe",  # Transcription task
                return_dict_in_generate=True,
                output_scores=True
            )

        transcription = self.processor.batch_decode(
            generated.sequences,
            skip_special_tokens=True
        )[0]

        # Convert to Urdu script if needed
        transcription = self._convert_to_urdu_script(transcription)

        return self._uniform_word_confidences(
            transcription, getattr(generated, 'scores', None), 0.8
        )

    def _select_mms_urdu_adapter(self):
        """Point the MMS tokenizer and model at the first usable Urdu code.

        Returns the language code that was selected. If the tokenizer rejects
        every candidate, the last ``ValueError`` it raised is re-raised.
        """
        tokenizer = self.processor.tokenizer
        last_error = None
        for lang in self._MMS_URDU_LANGS:
            try:
                tokenizer.set_target_lang(lang)
            except ValueError as err:
                last_error = err
                continue
            self.current_model.load_adapter(lang)
            return lang
        raise last_error

    def _extract_ctc_probabilities(self, audio_array: np.ndarray):
        """Transcribe with the loaded CTC model (MMS / Wav2Vec2) and score it.

        Args:
            audio_array: Audio samples, passed to the processor as 16 kHz.

        Returns:
            List of ``(word, confidence)`` tuples, where the shared confidence
            is the mean over all frames of the per-frame maximum softmax
            probability. Empty when the transcription has no words.
        """
        # MMS checkpoints need their Urdu vocabulary and adapter selected.
        if "mms" in self.current_model_name:
            self._select_mms_urdu_adapter()

        inputs = self.processor(
            audio_array,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )

        input_values = inputs.input_values.to(self.device)

        with torch.no_grad():
            logits = self.current_model(input_values).logits

        probs = torch.softmax(logits, dim=-1)
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)[0]

        # Convert to Urdu script if needed
        transcription = self._convert_to_urdu_script(transcription)

        words = transcription.strip().split()
        if not words:
            return []

        max_probs = probs.max(dim=-1).values.squeeze()
        avg_confidence = max_probs.mean().item()
        return [(word, avg_confidence) for word in words]

    def _extract_seamless_probabilities(self, audio_array: np.ndarray):
        """Transcribe with the loaded SeamlessM4T model and score the words.

        Args:
            audio_array: Audio samples, passed to the processor as 16 kHz.

        Returns:
            List of ``(word, confidence)`` tuples; the fallback confidence is
            0.7 when generation returns no scores.
        """
        audio_inputs = self.processor(
            audios=audio_array,
            sampling_rate=16000,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            output = self.current_model.generate(
                **audio_inputs,
                tgt_lang="urd",  # Force Urdu language
                return_dict_in_generate=True,
                output_scores=True
            )

        transcription = self.processor.decode(
            output.sequences[0].tolist(),
            skip_special_tokens=True
        )

        # Convert to Urdu script if needed
        transcription = self._convert_to_urdu_script(transcription)

        return self._uniform_word_confidences(
            transcription, getattr(output, 'scores', None), 0.7
        )

    def _cleanup(self):
        """Drop the loaded model and processor and release their memory."""
        self.current_model = None
        self.processor = None
        self.current_model_name = None

        # Collect first so objects freed by the collector are already gone
        # when the CUDA cache is emptied.
        gc.collect()
        # str() + startswith also covers "cuda:0" and torch.device values.
        if str(self.device).startswith("cuda"):
            torch.cuda.empty_cache()

    def word_probabilities(self, audio_file_path: str, model_name: str):
        """Transcribe an audio file and return word/confidence pairs.

        Args:
            audio_file_path: Path of the audio file to transcribe.
            model_name: A key of ``SUPPORTED_MODELS``.

        Returns:
            List of ``(word, confidence)`` tuples.

        Raises:
            RuntimeError: If preprocessing, model loading or inference fails;
                the original exception is attached as ``__cause__``.
        """
        try:
            print(f"\nProcessing: {Path(audio_file_path).name}")
            print(f"Model: {model_name}")

            print("Preprocessing audio...")
            audio_array = self._preprocess_audio(audio_file_path)
            print(f"Audio loaded: {len(audio_array)/16000:.2f} seconds")

            self._load_model(model_name)

            print("Running inference...")

            if "whisper" in model_name:
                results = self._extract_whisper_probabilities(audio_array)
            elif "mms" in model_name or "wav2vec2" in model_name:
                results = self._extract_ctc_probabilities(audio_array)
            elif "seamless" in model_name:
                results = self._extract_seamless_probabilities(audio_array)
            else:
                raise ValueError(f"Unknown model type: {model_name}")

            print(f"Transcription complete: {len(results)} words")

            return results

        except Exception as e:
            raise RuntimeError(f"Error processing audio with {model_name}: {str(e)}") from e

        finally:
            # Runs on success and failure alike so a model never stays loaded.
            self._cleanup()
            print("Memory cleaned")

# ============================================================================
# FLASK APP WITH ENHANCED REAL-TIME FEATURES
# ============================================================================

# Flask application object for this module.
app = Flask(__name__)
# Flask rejects request bodies larger than this many bytes.
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100MB

# Directory for the recorded dataset (presumably where saved recordings are
# written - confirm against the route handlers). parents=True lets start-up
# succeed even when the parent directory does not exist yet.
DATASET_DIR = Path("/kaggle/working/recorded_dataset")
DATASET_DIR.mkdir(parents=True, exist_ok=True)

HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Real-Time Urdu Speech Recognition</title>
    <style>
        * { margin: 0; padding: 0; box-sizing: border-box; }
        
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            padding: 20px;
        }
        
        .container {
            background: white;
            border-radius: 20px;
            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
            max-width: 1200px;
            margin: 0 auto;
            padding: 40px;
            animation: fadeIn 0.5s ease-in;
        }
        
        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(20px); }
            to { opacity: 1; transform: translateY(0); }
        }
        
        .header {
            text-align: center;
            margin-bottom: 40px;
        }
        
        .header h1 {
            color: #667eea;
            font-size: 2.5em;
            margin-bottom: 10px;
            font-weight: 700;
        }
        
        .header p {
            color: #666;
            font-size: 1.1em;
        }
        
        .tabs {
            display: flex;
            gap: 10px;
            margin-bottom: 30px;
            border-bottom: 2px solid #e0e0e0;
        }
        
        .tab {
            padding: 15px 30px;
            background: none;
            border: none;
            cursor: pointer;
            font-size: 1.1em;
            font-weight: 600;
            color: #666;
            transition: all 0.3s ease;
            border-bottom: 3px solid transparent;
        }
        
        .tab:hover { color: #667eea; }
        
        .tab.active {
            color: #667eea;
            border-bottom-color: #667eea;
        }
        
        .tab-content { display: none; }
        .tab-content.active {
            display: block;
            animation: fadeIn 0.3s ease-in;
        }
        
        .upload-section {
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            border-radius: 15px;
            padding: 40px;
            text-align: center;
            margin-bottom: 30px;
            border: 2px dashed #667eea;
            transition: all 0.3s ease;
            cursor: pointer;
        }
        
        .upload-section:hover {
            border-color: #764ba2;
            transform: translateY(-2px);
        }
        
        .upload-icon { font-size: 4em; margin-bottom: 20px; }
        
        .file-input { display: none; }
        
        .file-label {
            display: inline-block;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 15px 40px;
            border-radius: 50px;
            cursor: pointer;
            font-size: 1.1em;
            font-weight: 600;
            transition: all 0.3s ease;
        }
        
        .file-label:hover {
            transform: scale(1.05);
            box-shadow: 0 5px 20px rgba(102, 126, 234, 0.4);
        }
        
        .file-name {
            margin-top: 15px;
            color: #667eea;
            font-weight: 600;
            font-size: 1.1em;
        }
        
        .record-section {
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            border-radius: 15px;
            padding: 40px;
            text-align: center;
            margin-bottom: 30px;
        }
        
        .record-button {
            width: 120px;
            height: 120px;
            border-radius: 50%;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            border: none;
            color: white;
            font-size: 3em;
            cursor: pointer;
            transition: all 0.3s ease;
            box-shadow: 0 5px 20px rgba(102, 126, 234, 0.3);
            margin: 20px auto;
            display: block;
        }
        
        .record-button:hover {
            transform: scale(1.1);
            box-shadow: 0 8px 30px rgba(102, 126, 234, 0.5);
        }
        
        .record-button.recording {
            background: linear-gradient(135deg, #ff4444 0%, #cc0000 100%);
            animation: pulse 1.5s infinite;
        }
        
        @keyframes pulse {
            0%, 100% { transform: scale(1); }
            50% { transform: scale(1.05); }
        }
        
        .record-timer {
            font-size: 2em;
            color: #667eea;
            font-weight: 700;
            margin: 20px 0;
        }
        
        .audio-player {
            width: 100%;
            margin: 20px 0;
            display: none;
        }
        
        .audio-player.active { display: block; }
        
        .action-buttons {
            display: none;
            gap: 10px;
            justify-content: center;
            margin-top: 20px;
        }
        
        .action-buttons.active {
            display: flex;
        }
        
        .btn {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border: none;
            padding: 15px 40px;
            border-radius: 50px;
            font-size: 1.1em;
            font-weight: 600;
            cursor: pointer;
            transition: all 0.3s ease;
        }
        
        .btn:hover {
            transform: scale(1.05);
            box-shadow: 0 5px 20px rgba(102, 126, 234, 0.4);
        }
        
        .btn-success {
            background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
        }
        
        .model-section { margin-bottom: 30px; }
        
        .model-section h3 {
            color: #333;
            margin-bottom: 15px;
            font-size: 1.3em;
        }
        
        .model-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
            gap: 15px;
        }
        
        .model-card {
            background: white;
            border: 2px solid #e0e0e0;
            border-radius: 10px;
            padding: 15px;
            cursor: pointer;
            transition: all 0.3s ease;
            text-align: center;
        }
        
        .model-card:hover {
            border-color: #667eea;
            transform: translateY(-2px);
            box-shadow: 0 5px 15px rgba(102, 126, 234, 0.2);
        }
        
        .model-card.selected {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border-color: #667eea;
        }
        
        .model-card input[type="radio"] { display: none; }
        
        .model-name { font-weight: 600; font-size: 1em; }
        .model-desc { font-size: 0.85em; margin-top: 5px; opacity: 0.8; }
        
        .process-btn {
            width: 100%;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border: none;
            padding: 18px;
            border-radius: 50px;
            font-size: 1.2em;
            font-weight: 600;
            cursor: pointer;
            transition: all 0.3s ease;
            margin-bottom: 20px;
        }
        
        .process-btn:hover:not(:disabled) {
            transform: scale(1.02);
            box-shadow: 0 10px 30px rgba(102, 126, 234, 0.4);
        }
        
        .process-btn:disabled { opacity: 0.6; cursor: not-allowed; }
        
        .loading {
            display: none;
            text-align: center;
            padding: 30px;
        }
        
        .loading.active { display: block; }
        
        .spinner {
            border: 4px solid #f3f3f3;
            border-top: 4px solid #667eea;
            border-radius: 50%;
            width: 50px;
            height: 50px;
            animation: spin 1s linear infinite;
            margin: 0 auto 20px;
        }
        
        @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
        }
        
        .progress-steps {
            display: none;
            margin-top: 20px;
        }
        
        .progress-steps.active { display: block; }
        
        .step {
            display: flex;
            align-items: center;
            padding: 10px;
            margin: 5px 0;
            background: #f9f9f9;
            border-radius: 10px;
            transition: all 0.3s ease;
        }
        
        .step.active {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }
        
        .step.completed {
            background: #28a745;
            color: white;
        }
        
        .step-icon {
            font-size: 1.5em;
            margin-right: 15px;
        }
        
        .results {
            display: none;
            margin-top: 30px;
        }
        
        .results.active {
            display: block;
            animation: fadeIn 0.5s ease-in;
        }
        
        .results h3 {
            color: #333;
            margin-bottom: 20px;
            font-size: 1.5em;
        }
        
        .transcription-box {
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            border-radius: 15px;
            padding: 25px;
            margin-bottom: 20px;
            direction: rtl;
            text-align: right;
            font-size: 1.3em;
            line-height: 1.8;
            color: #333;
            font-weight: 500;
        }
        
        .word-list {
            background: #f9f9f9;
            border-radius: 15px;
            padding: 20px;
            max-height: 400px;
            overflow-y: auto;
        }
        
        .word-item {
            background: white;
            border-radius: 10px;
            padding: 15px 20px;
            margin-bottom: 10px;
            display: flex;
            justify-content: space-between;
            align-items: center;
            transition: all 0.3s ease;
            border-left: 4px solid #667eea;
        }
        
        .word-item:hover {
            transform: translateX(-5px);
            box-shadow: 0 3px 10px rgba(0, 0, 0, 0.1);
        }
        
        .word-text {
            font-size: 1.2em;
            font-weight: 600;
            color: #333;
            direction: rtl;
        }
        
        .confidence {
            display: flex;
            align-items: center;
            gap: 10px;
        }
        
        .confidence-bar {
            width: 100px;
            height: 8px;
            background: #e0e0e0;
            border-radius: 10px;
            overflow: hidden;
        }
        
        .confidence-fill {
            height: 100%;
            background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
            transition: width 0.5s ease;
        }
        
        .confidence-text {
            font-weight: 600;
            color: #667eea;
            min-width: 50px;
        }
        
        .stats {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 15px;
            margin-top: 20px;
        }
        
        .stat-card {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border-radius: 10px;
            padding: 20px;
            text-align: center;
        }
        
        .stat-value {
            font-size: 2em;
            font-weight: 700;
            margin-bottom: 5px;
        }
        
        .stat-label {
            font-size: 0.9em;
            opacity: 0.9;
        }
        
        .notification {
            position: fixed;
            top: 20px;
            right: 20px;
            padding: 15px 25px;
            border-radius: 10px;
            color: white;
            font-weight: 600;
            display: none;
            z-index: 1000;
            animation: slideIn 0.3s ease-in;
        }
        
        @keyframes slideIn {
            from { transform: translateX(400px); }
            to { transform: translateX(0); }
        }
        
        .notification.active { display: block; }
        .notification.error { background: #ff4444; }
        .notification.success { background: #28a745; }
        .notification.info { background: #667eea; }
        
        .dataset-info {
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            border-radius: 15px;
            padding: 25px;
            margin-top: 20px;
            text-align: center;
        }
        
        .dataset-info h4 {
            color: #667eea;
            font-size: 1.3em;
            margin-bottom: 15px;
        }
        
        .dataset-stat {
            font-size: 1.5em;
            font-weight: 700;
            color: #333;
            margin: 10px 0;
        }
        
        @media (max-width: 768px) {
            .container { padding: 20px; }
            .header h1 { font-size: 2em; }
            .model-grid { grid-template-columns: 1fr; }
            .tabs { overflow-x: auto; }
        }
    </style>
</head>
<body>
    <div class="notification" id="notification"></div>
    
    <div class="container">
        <div class="header">
            <h1>Real-Time Urdu Speech Recognition</h1>
            <p>Advanced AI-powered transcription with live recording & dataset collection</p>
        </div>

        <div class="tabs">
            <button class="tab active" onclick="switchTab('upload')">Upload Audio</button>
            <button class="tab" onclick="switchTab('record')">Record Audio</button>
        </div>

        <div class="tab-content active" id="upload-tab">
            <div class="upload-section" onclick="document.getElementById('audioFile').click()">
                <div class="upload-icon">📁</div>
                <input type="file" id="audioFile" class="file-input" accept="audio/*,video/*">
                <label for="audioFile" class="file-label">Choose Audio File</label>
                <div class="file-name" id="fileName">No file selected</div>
                <p style="margin-top: 15px; color: #666;">Supports MP3, MP4, WAV, and more</p>
            </div>
        </div>

        <div class="tab-content" id="record-tab">
            <div class="record-section">
                <h3 style="color: #667eea; margin-bottom: 20px;">Record Your Voice</h3>
                <p style="color: #666; margin-bottom: 20px;">Click the microphone to start recording</p>
                
                <button class="record-button" id="recordBtn" onclick="toggleRecording()">🎤</button>
                
                <div class="record-timer" id="recordTimer">00:00</div>
                
                <audio class="audio-player" id="audioPlayer" controls></audio>
                
                <div class="action-buttons" id="recordActions">
                    <button class="btn btn-success" onclick="saveToDataset()">
                        💾 Save to Dataset
                    </button>
                    <button class="btn" onclick="useForTranscription()">
                        🔄 Use for Transcription
                    </button>
                </div>
            </div>

            <div class="dataset-info">
                <h4>📊 Dataset Statistics</h4>
                <div class="dataset-stat" id="datasetCount">0 recordings saved</div>
                <button class="file-label" style="margin-top: 15px;" onclick="downloadDataset()">
                    ⬇️ Download Dataset
                </button>
            </div>
        </div>

        <div class="model-section">
            <h3>Select ASR Model</h3>
            <div class="model-grid">
                <div class="model-card" onclick="selectModel('whisper-small')">
                    <input type="radio" name="model" value="whisper-small" id="whisper-small">
                    <div class="model-name">Whisper Small</div>
                    <div class="model-desc">Fast & Efficient</div>
                </div>
                <div class="model-card" onclick="selectModel('whisper-medium')">
                    <input type="radio" name="model" value="whisper-medium" id="whisper-medium">
                    <div class="model-name">Whisper Medium</div>
                    <div class="model-desc">Balanced</div>
                </div>
                <div class="model-card" onclick="selectModel('whisper-large')">
                    <input type="radio" name="model" value="whisper-large" id="whisper-large">
                    <div class="model-name">Whisper Large</div>
                    <div class="model-desc">Most Accurate</div>
                </div>
                <div class="model-card" onclick="selectModel('wav2vec2-urdu')">
                    <input type="radio" name="model" value="wav2vec2-urdu" id="wav2vec2-urdu">
                    <div class="model-name">Wav2Vec2 Urdu</div>
                    <div class="model-desc">Specialized</div>
                </div>
                <div class="model-card" onclick="selectModel('mms-1b')">
                    <input type="radio" name="model" value="mms-1b" id="mms-1b">
                    <div class="model-name">MMS 1B</div>
                    <div class="model-desc">Multilingual</div>
                </div>
                <div class="model-card" onclick="selectModel('seamless-medium')">
                    <input type="radio" name="model" value="seamless-medium" id="seamless-medium">
                    <div class="model-name">Seamless M4T</div>
                    <div class="model-desc">Universal</div>
                </div>
            </div>
        </div>

        <button class="process-btn" onclick="processAudio()" id="processBtn">
            🚀 Start Transcription
        </button>

        <div class="loading" id="loading">
            <div class="spinner"></div>
            <p style="color: #667eea; font-size: 1.1em; font-weight: 600;">Processing your audio...</p>
        </div>

        <div class="progress-steps" id="progressSteps">
            <div class="step" id="step1">
                <span class="step-icon">🎵</span>
                <span>Loading audio file...</span>
            </div>
            <div class="step" id="step2">
                <span class="step-icon">🔧</span>
                <span>Preprocessing audio...</span>
            </div>
            <div class="step" id="step3">
                <span class="step-icon">🤖</span>
                <span>Loading AI model...</span>
            </div>
            <div class="step" id="step4">
                <span class="step-icon">🔄</span>
                <span>Running transcription...</span>
            </div>
            <div class="step" id="step5">
                <span class="step-icon">✅</span>
                <span>Finalizing results...</span>
            </div>
        </div>

        <div class="results" id="results">
            <h3>📝 Transcription Results</h3>
            <div class="transcription-box" id="transcription"></div>
            <div class="stats">
                <div class="stat-card">
                    <div class="stat-value" id="wordCount">0</div>
                    <div class="stat-label">Words</div>
                </div>
                <div class="stat-card">
                    <div class="stat-value" id="avgConfidence">0%</div>
                    <div class="stat-label">Avg Confidence</div>
                </div>
                <div class="stat-card">
                    <div class="stat-value" id="duration">0s</div>
                    <div class="stat-label">Duration</div>
                </div>
            </div>
            <h3 style="margin-top: 30px;">📊 Word-level Analysis</h3>
            <div class="word-list" id="wordList"></div>
        </div>
    </div>

    <script>
        let selectedModel = null;
        let selectedFile = null;
        let mediaRecorder = null;
        let audioChunks = [];
        let recordingStartTime = null;
        let timerInterval = null;
        let recordedBlob = null;

        function switchTab(tabName) {
            document.querySelectorAll('.tab').forEach(tab => tab.classList.remove('active'));
            document.querySelectorAll('.tab-content').forEach(content => content.classList.remove('active'));
            
            event.target.classList.add('active');
            document.getElementById(tabName + '-tab').classList.add('active');
        }

        document.getElementById('audioFile').addEventListener('change', function(e) {
            if (e.target.files.length > 0) {
                selectedFile = e.target.files[0];
                document.getElementById('fileName').textContent = selectedFile.name;
                showNotification('File selected: ' + selectedFile.name, 'success');
            }
        });

        async function toggleRecording() {
            if (mediaRecorder && mediaRecorder.state === 'recording') {
                stopRecording();
            } else {
                await startRecording();
            }
        }

        async function startRecording() {
            try {
                const stream = await navigator.mediaDevices.getUserMedia({ 
                    audio: {
                        echoCancellation: true,
                        noiseSuppression: true,
                        sampleRate: 16000
                    } 
                });
                
                mediaRecorder = new MediaRecorder(stream, {
                    mimeType: 'audio/webm;codecs=opus'
                });
                audioChunks = [];

                mediaRecorder.ondataavailable = (event) => {
                    if (event.data.size > 0) {
                        audioChunks.push(event.data);
                    }
                };

                mediaRecorder.onstop = () => {
                    recordedBlob = new Blob(audioChunks, { type: 'audio/webm' });
                    const audioUrl = URL.createObjectURL(recordedBlob);
                    const audioPlayer = document.getElementById('audioPlayer');
                    audioPlayer.src = audioUrl;
                    audioPlayer.classList.add('active');
                    document.getElementById('recordActions').classList.add('active');
                    showNotification('Recording stopped successfully!', 'success');
                };

                mediaRecorder.start(100);
                document.getElementById('recordBtn').classList.add('recording');
                document.getElementById('recordBtn').textContent = '⏹️';
                
                recordingStartTime = Date.now();
                timerInterval = setInterval(updateTimer, 100);
                showNotification('Recording started...', 'info');
            } catch (err) {
                showNotification('Microphone access denied: ' + err.message, 'error');
            }
        }

        function stopRecording() {
            if (mediaRecorder && mediaRecorder.state === 'recording') {
                mediaRecorder.stop();
                mediaRecorder.stream.getTracks().forEach(track => track.stop());
                document.getElementById('recordBtn').classList.remove('recording');
                document.getElementById('recordBtn').textContent = '🎤';
                clearInterval(timerInterval);
            }
        }

        function updateTimer() {
            const elapsed = Date.now() - recordingStartTime;
            const seconds = Math.floor(elapsed / 1000);
            const minutes = Math.floor(seconds / 60);
            const secs = seconds % 60;
            document.getElementById('recordTimer').textContent = 
                `${String(minutes).padStart(2, '0')}:${String(secs).padStart(2, '0')}`;
        }

        async function saveToDataset() {
            if (!recordedBlob) {
                showNotification('No recording to save', 'error');
                return;
            }

            const formData = new FormData();
            formData.append('audio', recordedBlob, 'recording.webm');

            try {
                showNotification('Saving to dataset...', 'info');
                const response = await fetch('/save_to_dataset', {
                    method: 'POST',
                    body: formData
                });
                const data = await response.json();
                
                if (data.success) {
                    showNotification('Recording saved! Total: ' + data.total_recordings, 'success');
                    document.getElementById('datasetCount').textContent = 
                        data.total_recordings + ' recordings saved';
                } else {
                    showNotification(data.error, 'error');
                }
            } catch (error) {
                showNotification('Error saving to dataset: ' + error.message, 'error');
            }
        }

        function useForTranscription() {
            if (!recordedBlob) {
                showNotification('No recording available', 'error');
                return;
            }
            selectedFile = new File([recordedBlob], 'recording.webm', { type: 'audio/webm' });
            showNotification('Recording loaded for transcription', 'success');
            processAudio();
        }

        async function downloadDataset() {
            try {
                showNotification('Preparing dataset download...', 'info');
                const response = await fetch('/download_dataset');
                const blob = await response.blob();
                const url = window.URL.createObjectURL(blob);
                const a = document.createElement('a');
                a.href = url;
                a.download = 'urdu_dataset_' + new Date().toISOString().split('T')[0] + '.zip';
                document.body.appendChild(a);
                a.click();
                window.URL.revokeObjectURL(url);
                document.body.removeChild(a);
                showNotification('Dataset downloaded successfully!', 'success');
            } catch (error) {
                showNotification('Error downloading dataset: ' + error.message, 'error');
            }
        }

        async function loadDatasetStats() {
            try {
                const response = await fetch('/dataset_stats');
                const data = await response.json();
                document.getElementById('datasetCount').textContent = 
                    data.count + ' recordings saved (' + data.total_size_mb + ' MB)';
            } catch (error) {
                console.error('Error loading dataset stats:', error);
            }
        }

        function selectModel(modelName) {
            selectedModel = modelName;
            document.querySelectorAll('.model-card').forEach(card => {
                card.classList.remove('selected');
            });
            document.querySelector(`#${modelName}`).closest('.model-card').classList.add('selected');
            showNotification('Model selected: ' + modelName, 'info');
        }

        async function processAudio() {
            if (!selectedFile) {
                showNotification('Please select or record an audio file', 'error');
                return;
            }
            if (!selectedModel) {
                showNotification('Please select a model', 'error');
                return;
            }

            const formData = new FormData();
            formData.append('audio', selectedFile);
            formData.append('model', selectedModel);

            document.getElementById('processBtn').disabled = true;
            document.getElementById('loading').classList.add('active');
            document.getElementById('progressSteps').classList.add('active');
            document.getElementById('results').classList.remove('active');

            const steps = ['step1', 'step2', 'step3', 'step4', 'step5'];
            let currentStep = 0;

            const stepInterval = setInterval(() => {
                if (currentStep > 0) {
                    document.getElementById(steps[currentStep - 1]).classList.remove('active');
                    document.getElementById(steps[currentStep - 1]).classList.add('completed');
                }
                if (currentStep < steps.length) {
                    document.getElementById(steps[currentStep]).classList.add('active');
                    currentStep++;
                }
            }, 800);

            try {
                const response = await fetch('/transcribe', {
                    method: 'POST',
                    body: formData
                });
                const data = await response.json();

                clearInterval(stepInterval);

                if (data.error) {
                    showNotification(data.error, 'error');
                } else {
                    steps.forEach(step => {
                        document.getElementById(step).classList.remove('active');
                        document.getElementById(step).classList.add('completed');
                    });
                    setTimeout(() => {
                        displayResults(data);
                        showNotification('Transcription completed successfully!', 'success');
                    }, 500);
                }
            } catch (error) {
                clearInterval(stepInterval);
                showNotification('Error processing audio: ' + error.message, 'error');
            } finally {
                document.getElementById('processBtn').disabled = false;
                document.getElementById('loading').classList.remove('active');
                setTimeout(() => {
                    document.getElementById('progressSteps').classList.remove('active');
                    steps.forEach(step => {
                        document.getElementById(step).classList.remove('active');
                        document.getElementById(step).classList.remove('completed');
                    });
                }, 2000);
            }
        }

        function displayResults(data) {
            const transcription = data.results.map(r => r.word).join(' ');
            const avgConf = (data.results.reduce((sum, r) => sum + r.probability, 0) / data.results.length * 100).toFixed(1);

            document.getElementById('transcription').textContent = transcription;
            document.getElementById('wordCount').textContent = data.results.length;
            document.getElementById('avgConfidence').textContent = avgConf + '%';
            document.getElementById('duration').textContent = (data.audio_duration || 0).toFixed(1) + 's';

            const wordListHtml = data.results.map((item, index) => `
                <div class="word-item" style="animation-delay: ${index * 0.05}s">
                    <div class="word-text">${item.word}</div>
                    <div class="confidence">
                        <div class="confidence-bar">
                            <div class="confidence-fill" style="width: ${item.probability * 100}%"></div>
                        </div>
                        <div class="confidence-text">${(item.probability * 100).toFixed(1)}%</div>
                    </div>
                </div>
            `).join('');

            document.getElementById('wordList').innerHTML = wordListHtml;
            document.getElementById('results').classList.add('active');
        }

        function showNotification(message, type) {
            const notification = document.getElementById('notification');
            notification.textContent = message;
            notification.className = 'notification ' + type + ' active';
            setTimeout(() => {
                notification.classList.remove('active');
            }, 4000);
        }

        loadDatasetStats();
    </script>
</body>
</html>
'''

@app.route('/')
def index():
    """Serve the single-page web UI rendered from ``HTML_TEMPLATE``."""
    page = render_template_string(HTML_TEMPLATE)
    return page

@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe an uploaded audio file and return per-word confidences.

    Expects a multipart form with an ``audio`` file part and an optional
    ``model`` field (defaults to ``'whisper-small'``).

    Returns:
        On success, JSON with ``success``, ``results`` (a list of
        ``{'word', 'probability'}`` objects), ``model`` and
        ``audio_duration`` (seconds, measured after decoding at 16 kHz).
        ``{'error': ...}`` with status 400 when the upload is missing or
        unnamed, or with status 500 when decoding/transcription raises.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        audio_file = request.files['audio']
        model_name = request.form.get('model', 'whisper-small')

        if audio_file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        # Keep the upload's own extension so the temp file is not mislabelled
        # (e.g. an uploaded .mp3 stored as .webm). The client-supplied name is
        # untrusted, so only a plain alphanumeric extension is accepted;
        # anything else falls back to the browser recorder's format.
        suffix = Path(audio_file.filename or '').suffix.lower()
        if not suffix[1:].isalnum():
            suffix = '.webm'

        # Reserve a unique path, then close the handle before writing to it
        # so the file is never opened twice at the same time.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_path = tmp_file.name

        try:
            # Saving happens inside the try so a failed write is cleaned up too.
            audio_file.save(tmp_path)

            # Decoded here only to measure the clip length for the UI.
            audio_data, sr = librosa.load(tmp_path, sr=16000)
            duration = len(audio_data) / sr

            wrapper = UrduASRWrapper()
            results = wrapper.word_probabilities(tmp_path, model_name)

            # float() makes the probabilities JSON-serialisable.
            formatted_results = [
                {'word': word, 'probability': float(prob)}
                for word, prob in results
            ]

            return jsonify({
                'success': True,
                'results': formatted_results,
                'model': model_name,
                'audio_duration': duration
            })

        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    except Exception as e:
        # Top-level boundary: report the failure to the client as JSON.
        return jsonify({'error': str(e)}), 500

@app.route('/save_to_dataset', methods=['POST'])
def save_to_dataset():
    """Store an uploaded recording and a JSON metadata sidecar in the dataset.

    Expects a multipart form with an ``audio`` file part. The recording is
    written to ``DATASET_DIR`` as ``urdu_audio_<timestamp>.webm`` and its
    metadata to ``<that name>.json``.

    Returns:
        JSON with ``success``, ``filename``, ``total_recordings`` and
        ``message`` on success; ``{'error': ...}`` with status 400 when no
        ``audio`` part was sent, or status 500 on any other failure.
    """
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        audio_file = request.files['audio']

        # One clock reading, so the filename timestamp and the ISO date in
        # the metadata always describe the same instant.
        now = datetime.now()
        timestamp = now.strftime('%Y%m%d_%H%M%S_%f')
        filename = f"urdu_audio_{timestamp}.webm"
        file_path = DATASET_DIR / filename

        # Harmless if the directory already exists; covers it having been
        # removed while the server was running.
        DATASET_DIR.mkdir(parents=True, exist_ok=True)
        audio_file.save(str(file_path))

        metadata = {
            'filename': filename,
            'timestamp': timestamp,
            'size': os.path.getsize(file_path),
            'format': 'webm',
            'date': now.isoformat()
        }

        metadata_path = DATASET_DIR / f"{filename}.json"
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

        # Count the same file types that /dataset_stats reports, so the
        # counter shown in the UI does not change depending on which
        # endpoint refreshed it last.
        total_recordings = sum(
            1
            for pattern in ('*.webm', '*.mp3')
            for _ in DATASET_DIR.glob(pattern)
        )

        return jsonify({
            'success': True,
            'filename': filename,
            'total_recordings': total_recordings,
            'message': 'Recording saved successfully!'
        })

    except Exception as e:
        # Top-level boundary: report the failure to the client as JSON.
        return jsonify({'error': str(e)}), 500

@app.route('/dataset_stats', methods=['GET'])
def dataset_stats():
    """Report how many recordings the dataset holds and their combined size.

    Returns:
        JSON with ``count`` (number of ``.webm`` and ``.mp3`` files in
        ``DATASET_DIR``), ``total_size_mb`` (rounded to two decimals) and
        ``dataset_path``; ``{'error': ...}`` with status 500 on failure.
    """
    try:
        recordings = [
            *DATASET_DIR.glob('*.webm'),
            *DATASET_DIR.glob('*.mp3'),
        ]

        size_bytes = 0
        for recording in recordings:
            size_bytes += recording.stat().st_size

        payload = {
            'count': len(recordings),
            'total_size_mb': round(size_bytes / (1024 * 1024), 2),
            'dataset_path': str(DATASET_DIR)
        }
        return jsonify(payload)

    except Exception as e:
        return jsonify({'error': str(e)}), 500

@app.route('/download_dataset', methods=['GET'])
def download_dataset():
    """Send the whole dataset as an in-memory ZIP attachment.

    The archive contains every ``.webm`` and ``.mp3`` recording and every
    ``.json`` metadata file found in ``DATASET_DIR``, plus a generated
    ``README.txt``.

    Returns:
        The ZIP file as a download named ``urdu_dataset_<YYYYMMDD>.zip``;
        ``{'error': ...}`` with status 500 on failure.
    """
    try:
        # Read the clock and list the directory once, so the README count,
        # the archive contents and the download name all describe the same
        # snapshot even if a recording is saved while the ZIP is being built.
        now = datetime.now()
        audio_files = [
            *DATASET_DIR.glob('*.webm'),
            *DATASET_DIR.glob('*.mp3'),
        ]
        metadata_files = list(DATASET_DIR.glob('*.json'))

        readme_content = f"""Urdu Audio Dataset
=====================
Total Recordings: {len(audio_files)}
Generated: {now.strftime('%Y-%m-%d %H:%M:%S')}

File Structure:
- *.webm/*.mp3: Audio recordings
- *.json: Metadata for each recording

This dataset was collected using the Real-Time Urdu Speech Recognition System.
"""

        memory_file = io.BytesIO()
        with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Entries are stored flat, under their bare file names.
            for dataset_file in (*audio_files, *metadata_files):
                zipf.write(dataset_file, dataset_file.name)
            zipf.writestr('README.txt', readme_content)

        # Rewind so send_file streams the archive from its first byte.
        memory_file.seek(0)

        return send_file(
            memory_file,
            mimetype='application/zip',
            as_attachment=True,
            download_name=f'urdu_dataset_{now.strftime("%Y%m%d")}.zip'
        )

    except Exception as e:
        # Top-level boundary: report the failure to the client as JSON.
        return jsonify({'error': str(e)}), 500

def run_flask():
    """Run the Flask app on all interfaces, port 5000.

    Debug mode and the auto-reloader are both switched off. The call
    blocks for as long as the server is running.
    """
    server_options = {
        'host': '0.0.0.0',
        'port': 5000,
        'debug': False,
        'use_reloader': False,
    }
    app.run(**server_options)

# ============================================================================
# KAGGLE NOTEBOOK EXECUTION
# ============================================================================

if __name__ == "__main__":
    # Script entry point: install optional dependencies, start the Flask
    # server in a background thread, expose it through an ngrok tunnel and
    # then keep the process alive until interrupted.
    print("Starting Real-Time Urdu ASR System...")
    print("="*70)

    # Install required package for script conversion
    print("Installing dependencies...")
    import importlib
    import subprocess
    import sys
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'indic-transliteration'], check=True)

    # Packages installed while the interpreter is running may be missed by
    # the import system's cached directory listings until they are cleared.
    importlib.invalidate_caches()

    # Import transliteration after installation. Bind the same names and
    # flag as the import guard at the top of the file, so a successful late
    # install is reflected there and not only in the message below.
    # NOTE(review): presumably the wrapper consults TRANSLITERATION_AVAILABLE
    # before converting script - confirm against the class body.
    try:
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        TRANSLITERATION_AVAILABLE = True
        print("✅ Urdu script conversion enabled")
    except ImportError:
        print("⚠️  Script conversion unavailable - text may appear in Hindi")

    # The ngrok credential must never be committed to source. Read it from
    # the environment and fail before starting any server if it is missing.
    ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or os.environ.get("NGROK_AUTH_TOKEN")
    if not ngrok_token:
        raise SystemExit(
            "ngrok auth token not found: set the NGROK_AUTHTOKEN environment "
            "variable (for example from a notebook secret) before running."
        )

    # Daemon thread: the server must not keep the process alive on its own.
    flask_thread = Thread(target=run_flask, daemon=True)
    flask_thread.start()

    # Give the server a moment to start listening before tunnelling to it.
    time.sleep(3)

    print("Installing pyngrok...")
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'pyngrok'], check=True)
    importlib.invalidate_caches()

    from pyngrok import ngrok

    print("Setting up ngrok tunnel...")
    ngrok.set_auth_token(ngrok_token)

    public_url = ngrok.connect(5000)

    print("\n" + "="*70)
    print("✅ REAL-TIME URDU ASR SYSTEM IS LIVE!")
    print("="*70)
    print(f"🌐 Public URL: {public_url}")
    print(f"📁 Dataset Directory: {DATASET_DIR}")
    print("="*70)
    print("\n📱 Features:")
    print("  • Real-time audio recording from browser")
    print("  • Live transcription with word-by-word analysis")
    print("  • Audio preprocessing and normalization")
    print("  • URDU SCRIPT OUTPUT (Perso-Arabic)")
    print("  • Save recordings to dataset")
    print("  • Download complete dataset as ZIP")
    print("  • 8 different ASR models")
    print("  • Step-by-step progress visualization")
    print("  • Word-level confidence scores")
    print("\n⚠️  Keep this notebook running to maintain the connection")
    print("="*70)

    # Keep the notebook running
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nShutting down server...")