# ASR (Automatic Speech Recognition) Testing Notebook

This notebook tests multiple ASR models:
1. **Whisper Small** - OpenAI's small model
2. **Whisper Large v3 Turbo** - OpenAI's turbo model
3. **Parakeet TDT 0.6B v3** - NVIDIA's multilingual speech recognition model

Features:
- Provide input audio (file upload in Colab, or a path to a local file)
- Save audio as WAV files
- Compare transcription quality and performance

In [None]:
# Check if running in Colab
try:
    import google.colab
    IN_COLAB = True
    print("✓ Running in Google Colab")
except:
    IN_COLAB = False
    print("✓ Running locally")

if IN_COLAB:
    print("\n" + "="*70)
    print("  GOOGLE COLAB SETUP")
    print("="*70)
    
    # Clone repository
    print("\n[1/3] Cloning repository...")
    !git clone https://github.com/ltruciosr-dev/utec-voice-assistant.git
    
    # Change to repo directory
    import os
    os.chdir('utec-voice-assistant')
    print("✓ Repository cloned")
    
    # Install dependencies
    print("\n[2/3] Installing dependencies...")
    !pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
    !pip install -q -r requirements.txt
    print("✓ Dependencies installed")
    
    # Verify GPU
    print("\n[3/3] Verifying GPU access...")
    import torch
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    
    print("\n" + "="*70)
    print("  SETUP COMPLETE!")
    print("="*70)
else:
    print("Skipping Colab setup (running locally)")

# 🚀 Google Colab Setup

**Run the setup cell above if you are using Google Colab. When run locally, the cell detects this and skips the setup.**

The setup cell will:
1. Clone the repository
2. Install dependencies
3. Verify GPU access to finish setting up the environment

## 1. Setup and Imports

In [None]:
import torch
import numpy as np
import soundfile as sf
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers import AutoModelForCTC, AutoTokenizer
import time
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Detect Colab: the google.colab package is only importable inside Colab.
try:
    import google.colab  # noqa: F401  (imported purely as an environment probe)
    IN_COLAB = True
except ImportError:
    # Narrow handler: unrelated failures should not be reported as "local".
    IN_COLAB = False

# Report the PyTorch / CUDA environment the models will run on
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is in bytes; divide by 1024**3 for the figure printed as GB
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

## 2. Microphone Recording Function

In [None]:
def record_audio(duration=5, sample_rate=16000, output_dir="recordings"):
    """
    Obtain an audio clip and store it under ``output_dir``.

    No microphone capture is performed. In Google Colab the user is prompted
    to upload a file, which is written to a timestamped path inside
    ``output_dir``. Outside Colab only instructions are printed.

    Args:
        duration: Unused; kept so existing callers keep working.
        sample_rate: Unused; kept so existing callers keep working.
        output_dir: Directory to save recordings. Created (including any
            missing parent directories) if it does not exist.

    Returns:
        Path to the saved file as a string, or None when no file was
        uploaded or when not running in Colab.
    """
    from pathlib import Path
    from datetime import datetime

    # Create output directory; parents=True lets nested paths such as
    # "data/recordings" work on first use.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Generate filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = output_path / f"recording_{timestamp}.wav"

    # Keep the try body to the one statement that signals "not in Colab".
    try:
        from google.colab import files
    except ImportError:
        # Local environment - provide instructions
        print("⚠️  Audio recording requires manual file upload in this environment")
        print("Please upload an audio file (WAV format preferred)")
        print(f"Expected location: {filename}")
        return None

    # The previous implementation built IPython.display.Audio without any
    # audio data, which raises ValueError and offers no record button anyway,
    # so the upload prompt is the only working input path.
    print("🎤 Microphone capture is not available in Colab; upload an audio file instead:")
    uploaded = files.upload()

    if not uploaded:
        print("⚠️  No file uploaded")
        return None

    # Only the first uploaded file is used.
    upload_filename, payload = next(iter(uploaded.items()))

    # Keep the real extension so an MP3/FLAC upload is not mislabeled as WAV.
    suffix = Path(upload_filename).suffix
    if suffix:
        filename = filename.with_suffix(suffix)

    filename.write_bytes(payload)
    print(f"✓ Audio saved to: {filename}")

    return str(filename)

# Alternative: Simple file upload for Colab
def upload_audio_file():
    """
    Prompt for an audio upload through Colab's file picker.

    Returns:
        Name of the first uploaded file, or None when nothing was uploaded
        or when the google.colab package cannot be imported.
    """
    try:
        from google.colab import files

        print("📁 Upload your audio file (WAV, MP3, etc.):")
        received = files.upload()

        # Guard clause: nothing selected in the picker.
        if not received:
            print("⚠️  No file uploaded")
            return None

        # Only the first uploaded entry is reported back to the caller.
        first_name = next(iter(received))
        print(f"✓ File uploaded: {first_name}")
        return first_name
    except ImportError:
        # Not running in Colab: there is no upload widget to show.
        print("⚠️  File upload widget not available")
        print("Please use the file browser to add your audio file")
        return None

# Confirm the cell ran and remind Colab users how to supply audio.
for message in (
    "✓ Audio functions defined",
    "\nFor Colab: Use upload_audio_file() to upload an audio file",
    "Example: audio_file = upload_audio_file()",
):
    print(message)

## 3. Whisper Models Setup

In [None]:
# Configuration: run on the GPU in half precision when CUDA is present,
# otherwise fall back to the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

print(f"Using device: {device}")
print(f"Data type: {torch_dtype}")

### 3.1 Whisper Small Model

In [None]:
print("Loading Whisper Small model...")
torch.cuda.empty_cache()

# Load the weights and move them to the selected device in one expression.
whisper_small_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-small",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies both the tokenizer and the feature extractor
# that the pipeline below is wired with.
whisper_small_processor = AutoProcessor.from_pretrained("openai/whisper-small")

whisper_small_pipe = pipeline(
    task="automatic-speech-recognition",
    model=whisper_small_model,
    tokenizer=whisper_small_processor.tokenizer,
    feature_extractor=whisper_small_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

print("✓ Whisper Small loaded")
if torch.cuda.is_available():
    # memory_allocated() is in bytes; divide by 1024**3 for the printed GB figure
    vram_gb = torch.cuda.memory_allocated() / 1024**3
    print(f"VRAM usage: {vram_gb:.2f} GB")

### 3.2 Whisper Large v3 Turbo Model

In [None]:
print("Loading Whisper Large v3 Turbo model...")
torch.cuda.empty_cache()

# Load the weights and move them to the selected device in one expression.
whisper_turbo_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v3-turbo",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies both the tokenizer and the feature extractor
# that the pipeline below is wired with.
whisper_turbo_processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

whisper_turbo_pipe = pipeline(
    task="automatic-speech-recognition",
    model=whisper_turbo_model,
    tokenizer=whisper_turbo_processor.tokenizer,
    feature_extractor=whisper_turbo_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

print("✓ Whisper Turbo loaded")
if torch.cuda.is_available():
    # memory_allocated() is in bytes; divide by 1024**3 for the printed GB figure
    vram_gb = torch.cuda.memory_allocated() / 1024**3
    print(f"VRAM usage: {vram_gb:.2f} GB")

### 3.3 NVIDIA Parakeet TDT 0.6B v3 Model

In [None]:
print("Loading NVIDIA Parakeet TDT 0.6B v3 model...")
torch.cuda.empty_cache()

# Load the Parakeet checkpoint through the CTC auto-class.
# NOTE(review): the model id names a "tdt" checkpoint while the loader class
# is AutoModelForCTC — confirm this checkpoint can actually be loaded this way
# with the installed transformers version before relying on this cell.
# trust_remote_code=True allows code shipped in the model repository to run
# locally; only keep it for repositories you trust.
parakeet_model = AutoModelForCTC.from_pretrained(
    "nvidia/parakeet-tdt-0.6b-v3",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    trust_remote_code=True
)
parakeet_model.to(device)

# The processor is expected to expose .tokenizer and .feature_extractor,
# both of which are handed to the pipeline below.
parakeet_processor = AutoProcessor.from_pretrained(
    "nvidia/parakeet-tdt-0.6b-v3",
    trust_remote_code=True
)

parakeet_pipe = pipeline(
    "automatic-speech-recognition",
    model=parakeet_model,
    tokenizer=parakeet_processor.tokenizer,
    feature_extractor=parakeet_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

print("✓ Parakeet TDT loaded")
if torch.cuda.is_available():
    # memory_allocated() is in bytes; divide by 1024**3 for the printed GB figure
    print(f"VRAM usage: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

## 4. Transcription Functions

In [None]:
def _timed_transcription(run_inference, model_name):
    """
    Run one transcription call, print a report, and return its metrics.

    Shared implementation behind ``transcribe_with_whisper`` and
    ``transcribe_with_parakeet`` so both measure time and memory identically.

    Args:
        run_inference: Zero-argument callable that performs the pipeline call
            and returns a mapping containing a ``"text"`` entry.
        model_name: Label used in the printed report and in the result dict.

    Returns:
        Dict with keys ``"model"``, ``"text"``, ``"inference_time"``
        (seconds) and ``"memory_used"`` (peak minus initial allocated CUDA
        memory, bytes / 1024**3; 0 when CUDA is unavailable).
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"Transcribing with {model_name}")
    print(banner)

    # Query CUDA once so the "before" and "after" measurements stay paired.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Reset the peak counter so it reflects this call only.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        initial_memory = torch.cuda.memory_allocated() / 1024**3

    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    result = run_inference()
    inference_time = time.perf_counter() - start_time

    memory_used = 0
    if use_cuda:
        peak_memory = torch.cuda.max_memory_allocated() / 1024**3
        memory_used = peak_memory - initial_memory

    # Display results
    text = result['text']
    print(f"\n📝 Transcription: {text}")
    print(f"\n⏱️  Inference Time: {inference_time:.3f}s")

    if use_cuda:
        print(f"💾 Memory Used: {memory_used:.3f} GB")
        print(f"📊 Peak Memory: {peak_memory:.3f} GB")

    return {
        "model": model_name,
        "text": text,
        "inference_time": inference_time,
        "memory_used": memory_used,
    }


def transcribe_with_whisper(audio_path, pipe, model_name, language="spanish"):
    """
    Transcribe audio using Whisper models.

    Args:
        audio_path: Audio input handed unchanged to ``pipe``.
        pipe: Callable ASR pipeline; invoked with ``generate_kwargs`` and
            ``return_timestamps=False``.
        model_name: Label used in the printed report and in the result dict.
        language: Language passed to generation alongside task "transcribe".

    Returns:
        Dict with keys ``"model"``, ``"text"``, ``"inference_time"`` and
        ``"memory_used"`` (0 when CUDA is unavailable).
    """
    def run_inference():
        return pipe(
            audio_path,
            generate_kwargs={"language": language, "task": "transcribe"},
            return_timestamps=False
        )

    return _timed_transcription(run_inference, model_name)


def transcribe_with_parakeet(audio_path, pipe, model_name):
    """
    Transcribe audio using Parakeet model.

    Args:
        audio_path: Audio input handed unchanged to ``pipe``.
        pipe: Callable ASR pipeline; invoked with ``audio_path`` only.
        model_name: Label used in the printed report and in the result dict.

    Returns:
        Dict with keys ``"model"``, ``"text"``, ``"inference_time"`` and
        ``"memory_used"`` (0 when CUDA is unavailable).
    """
    return _timed_transcription(lambda: pipe(audio_path), model_name)

## 5. Select Audio and Test All Models

In [None]:
# Choose the clip that every model below will transcribe.
# In Colab, upload one instead by uncommenting:
# audio_file = upload_audio_file()

# Otherwise point at a file that already exists on disk.
audio_file = "test_audio.wav"  # Replace with your file

# A single print call; sep="\n" reproduces the two separate lines.
print(
    f"Using audio file: {audio_file}",
    "\n💡 Tip: In Colab, run: audio_file = upload_audio_file()",
    sep="\n",
)

### Test Whisper Small

In [None]:
# Run Whisper Small on the selected clip and keep its metrics for the summary.
result_small = transcribe_with_whisper(
    audio_path=audio_file,
    pipe=whisper_small_pipe,
    model_name="Whisper Small",
    language="spanish",
)

### Test Whisper Large v3 Turbo

In [None]:
# Run Whisper Large v3 Turbo on the selected clip and keep its metrics for the summary.
result_turbo = transcribe_with_whisper(
    audio_path=audio_file,
    pipe=whisper_turbo_pipe,
    model_name="Whisper Large v3 Turbo",
    language="spanish",
)

### Test NVIDIA Parakeet TDT

In [None]:
# Run Parakeet on the selected clip and keep its metrics for the summary.
result_parakeet = transcribe_with_parakeet(
    audio_path=audio_file,
    pipe=parakeet_pipe,
    model_name="NVIDIA Parakeet TDT 0.6B v3",
)

## 6. Comparison Summary

In [None]:
import pandas as pd

# Gather the per-model result dicts produced by the three test cells
# and tabulate them.
results = [result_small, result_turbo, result_parakeet]
comparison_df = pd.DataFrame(results)

divider = "="*70

print("\n" + divider)
print("  MODEL COMPARISON SUMMARY")
print(divider)
print("\nTranscriptions:")
for entry in results:
    print(f"\n{entry['model']}:")
    print(f"  {entry['text']}")

print("\n" + divider)
print("Performance Metrics:")
print(divider)
metrics_table = comparison_df[['model', 'inference_time', 'memory_used']]
print(metrics_table.to_string(index=False))

# Lowest wall-clock inference time wins.
fastest = min(results, key=lambda item: item['inference_time'])
print(f"\n🏆 Fastest Model: {fastest['model']} ({fastest['inference_time']:.3f}s)")

# Memory figures are only recorded when CUDA is available, so only rank then.
if torch.cuda.is_available():
    most_efficient = min(results, key=lambda item: item['memory_used'])
    print(f"💾 Most Memory Efficient: {most_efficient['model']} ({most_efficient['memory_used']:.3f} GB)")

## 7. Batch Testing with Multiple Recordings

In [None]:
# Batch recording is not implemented here; print how to test several files
# one at a time instead.
for hint in (
    "⚠️  Batch recording disabled in Colab-friendly version",
    "Upload multiple files individually and test them:",
    "\nExample:",
    "  audio_file = upload_audio_file()",
    "  r_small = transcribe_with_whisper(audio_file, whisper_small_pipe, 'Whisper Small')",
    "  r_turbo = transcribe_with_whisper(audio_file, whisper_turbo_pipe, 'Whisper Turbo')",
    "  r_parakeet = transcribe_with_parakeet(audio_file, parakeet_pipe, 'Parakeet TDT')",
):
    print(hint)

## 8. Final Recommendations

In [None]:
# Static guidance text. The figures below are hard-coded estimates; they are
# not derived from the measurements collected earlier in this notebook.
print("\n" + "="*70)
print("  RECOMMENDATIONS FOR VOICE ASSISTANT")
print("="*70)

print("\n📊 Model Characteristics:")
print("\n1. Whisper Small:")
print("   - Fast inference (~2-3s)")
print("   - Low memory (~1.5-2GB)")
print("   - Good Spanish support")
print("   - ✅ Recommended for real-time voice assistant")

print("\n2. Whisper Large v3 Turbo:")
print("   - Better accuracy")
print("   - Moderate memory (~3-4GB)")
print("   - Slower than small")
print("   - Good for higher quality needs")

print("\n3. NVIDIA Parakeet TDT 0.6B v3:")
# TDT stands for Token-and-Duration Transducer, so the model is not CTC-based
# as the previous text claimed.
print("   - Very fast (TDT: Token-and-Duration Transducer)")
print("   - Low memory")
print("   - Multilingual support")
print("   - Alternative option for speed-critical applications")

print("\n💡 For <12GB VRAM constraint:")
print("   Choose: Whisper Small or Parakeet TDT")
print("   This leaves ~8-10GB for LLM and TTS")