# ASR (Automatic Speech Recognition) Testing Notebook

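This notebook tests multiple ASR models:
1. **Whisper Small** - OpenAI's small model
2. **Whisper Large v3 Turbo** - OpenAI's turbo model
3. **Whisper Large v3 Turbo (ES)** - the turbo model fine-tuned for Spanish (`adriszmar/whisper-large-v3-turbo-es`)
4. **Parakeet TDT 0.6B v3** - NVIDIA's specialized model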

Features:
- Record audio from microphone
- Save audio as WAV files
- Compare transcription quality and performance

In [1]:
# Check if running in Colab
try:
    import google.colab
    IN_COLAB = True
    print("✓ Running in Google Colab")
except:
    IN_COLAB = False
    print("✓ Running locally")

if IN_COLAB:
    print("\n" + "="*70)
    print("  GOOGLE COLAB SETUP")
    print("="*70)

    # Clone repository
    print("\n[1/3] Cloning repository...")
    !git clone https://github.com/ltruciosr-dev/utec-voice-assistant.git

    # Change to repo directory
    import os
    os.chdir('utec-voice-assistant')
    print("✓ Repository cloned")

    # Install dependencies
    print("\n[2/3] Installing dependencies...")
    !pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
    !pip install -q -r requirements.txt
    print("✓ Dependencies installed")

    # Verify GPU
    print("\n[3/3] Verifying GPU access...")
    import torch
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

    print("\n" + "="*70)
    print("  SETUP COMPLETE!")
    print("="*70)
else:
    print("Skipping Colab setup (running locally)")

✓ Running in Google Colab

  GOOGLE COLAB SETUP

[1/3] Cloning repository...
Cloning into 'utec-voice-assistant'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 64 (delta 26), reused 55 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (64/64), 60.77 KiB | 3.80 MiB/s, done.
Resolving deltas: 100% (26/26), done.
✓ Repository cloned

[2/3] Installing dependencies...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.3/85.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

# 🚀 Google Colab Setup

**Run this section if using Google Colab. Skip if running locally.**

This will:
1. Clone the repository
2. Install dependencies
3. Verify GPU access

## 1. Setup and Imports

In [14]:
# Standard library
import time
import warnings
from datetime import datetime
from pathlib import Path

# Third-party
import numpy as np
import scipy.io.wavfile as wavfile
import soundfile as sf
import torch
from transformers import AutoModelForCTC, AutoTokenizer
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

warnings.filterwarnings('ignore')

# Check if running in Colab.
# The Colab-only `files` helper is imported inside the probe so this cell
# also runs on a local kernel; `files` is None when Colab is unavailable.
try:
    import google.colab
    from google.colab import files
    IN_COLAB = True
except ImportError:
    files = None
    IN_COLAB = False

# Check CUDA availability
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA version: 12.6
GPU: Tesla T4
GPU Memory: 14.74 GB


## 2. Audio Upload and Microphone Recording Functions

In [15]:
def upload_audio(output_dir="recordings"):
    """
    Upload a WAV file through the Colab file picker and store it as a recording.

    The uploaded file is moved into ``output_dir`` and renamed to
    ``recording_<timestamp>.wav``. No format conversion is performed.

    Args:
        output_dir: Directory to save the file in (created if missing,
            including parent directories)

    Returns:
        Path (str) to the stored file, or None when not running in Colab
        or when nothing was uploaded.
    """
    # Import locally so the function degrades gracefully outside Colab,
    # matching record_audio()'s behaviour.
    try:
        from google.colab import files
    except ImportError:
        print("⚠️  This function requires Google Colab environment")
        return None

    import shutil

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print("📤 Please upload a WAV audio file")
    uploaded = files.upload()

    if not uploaded:
        print("⚠️  No file uploaded")
        return None

    # Only the first uploaded file is used; tell the user about the rest.
    upload_filename = next(iter(uploaded))
    if len(uploaded) > 1:
        print(f"⚠️  {len(uploaded)} files uploaded; using only: {upload_filename}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = output_path / f"recording_{timestamp}.wav"

    # Save with timestamp (the upload is expected in the working directory
    # under its dictionary key)
    shutil.move(upload_filename, str(filename))

    print(f"✓ Audio saved to: {filename}")
    return str(filename)

def record_audio(duration=5, sample_rate=16000, output_dir="recordings"):
    """
    Record audio from microphone in Colab and save as WAV file.

    The browser records the audio, the raw bytes are written to a temporary
    ``.webm`` file, and ffmpeg converts that file to a mono WAV at
    ``sample_rate``.

    Args:
        duration: Recording duration in seconds
        sample_rate: Sample rate (16kHz recommended for ASR)
        output_dir: Directory to save recordings (created if missing,
            including parent directories)

    Returns:
        Path (str) to the saved WAV file. If the ffmpeg conversion fails, the
        path to the unconverted temporary recording is returned instead.
        None when not running in Colab or when recording raises an error.
    """
    # Create output directory (parents=True so nested paths also work)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Generate filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = output_path / f"recording_{timestamp}.wav"

    try:
        from google.colab import output
        from IPython.display import Audio, Javascript, display
        from base64 import b64decode
        import subprocess

        print(f"🎤 Recording for {duration} seconds...")
        print("Please allow microphone access when prompted!")

        # JavaScript code to record audio in browser.
        # `record` is a plain async function, so a rejected getUserMedia()
        # (e.g. permission denied) propagates as an error instead of leaving
        # a promise that never settles. The `finally` block stops the
        # microphone tracks so the browser releases the device.
        RECORD_AUDIO = """
        const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
        const b2text = blob => new Promise(resolve => {
          const reader = new FileReader()
          reader.onloadend = () => resolve(reader.result)
          reader.readAsDataURL(blob)
        })

        var record = async time => {
          const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
          try {
            const recorder = new MediaRecorder(stream)
            const chunks = []
            recorder.ondataavailable = e => chunks.push(e.data)
            const stopped = new Promise(resolve => { recorder.onstop = resolve })
            recorder.start()
            await sleep(time)
            recorder.stop()
            await stopped
            return await b2text(new Blob(chunks))
          } finally {
            stream.getTracks().forEach(track => track.stop())
          }
        }
        """

        display(Javascript(RECORD_AUDIO))

        # Record audio (the JS side expects milliseconds)
        audio_data = output.eval_js(f'record({duration * 1000})')

        # The result is a data URL; the base64 payload follows the first comma
        audio_bytes = b64decode(audio_data.split(',')[1])

        # Save raw audio to temporary file
        temp_file = output_path / f"temp_{timestamp}.webm"
        with open(temp_file, 'wb') as f:
            f.write(audio_bytes)

        print("✓ Audio recorded!")

        # Convert to WAV using ffmpeg (installed by default in Colab)
        cmd = [
            'ffmpeg', '-i', str(temp_file),
            '-ar', str(sample_rate),  # resample to target rate
            '-ac', '1',  # mono
            '-y',  # overwrite
            str(filename)
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode == 0:
            # Clean up temp file
            temp_file.unlink()
            print(f"✓ Audio saved to: {filename}")

            # Display audio player
            display(Audio(str(filename)))

            return str(filename)
        else:
            # Best effort: keep the raw recording so it is not lost
            print(f"⚠️  Error converting audio: {result.stderr}")
            return str(temp_file)

    except ImportError:
        print("⚠️  This function requires Google Colab environment")
        return None
    except Exception as e:
        print(f"⚠️  Error recording audio: {e}")
        return None

## 3. Whisper Models Setup

In [18]:
# Configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

print(f"Using device: {device}")
print(f"Data type: {torch_dtype}")

Using device: cuda
Data type: torch.float16


### 3.1 Whisper Small Model

In [19]:
# Load the Whisper Small checkpoint and wrap it in an ASR pipeline.
print("Loading Whisper Small model...")
torch.cuda.empty_cache()

# Load the weights, then move the model to the configured device.
whisper_small_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-small",
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
).to(device)

# The processor bundles the tokenizer and the feature extractor.
whisper_small_processor = AutoProcessor.from_pretrained("openai/whisper-small")

whisper_small_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_small_model,
    feature_extractor=whisper_small_processor.feature_extractor,
    tokenizer=whisper_small_processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
)

print("✓ Whisper Small loaded")
if torch.cuda.is_available():
    # memory_allocated() is in bytes; 2**30 bytes per GiB
    print(f"VRAM usage: {torch.cuda.memory_allocated() / 2**30:.2f} GB")

Loading Whisper Small model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda


✓ Whisper Small loaded
VRAM usage: 0.47 GB


### 3.2 Whisper Large v3 Turbo Model

In [20]:
# Load the Whisper Large v3 Turbo checkpoint and wrap it in an ASR pipeline.
print("Loading Whisper Large v3 Turbo model...")
torch.cuda.empty_cache()

# Load the weights, then move the model to the configured device.
whisper_turbo_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v3-turbo",
    use_safetensors=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
).to(device)

# The processor bundles the tokenizer and the feature extractor.
whisper_turbo_processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

whisper_turbo_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_turbo_model,
    feature_extractor=whisper_turbo_processor.feature_extractor,
    tokenizer=whisper_turbo_processor.tokenizer,
    device=device,
    torch_dtype=torch_dtype,
)

print("✓ Whisper Turbo loaded")
if torch.cuda.is_available():
    # memory_allocated() is in bytes; 2**30 bytes per GiB
    print(f"VRAM usage: {torch.cuda.memory_allocated() / 2**30:.2f} GB")

Loading Whisper Large v3 Turbo model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda


✓ Whisper Turbo loaded
VRAM usage: 1.97 GB


### 3.3 Whisper Large v3 Turbo Model fine-tuned for ES

In [24]:
# Load the Spanish fine-tune of Whisper Large v3 Turbo and wrap it in an ASR
# pipeline. The log messages name this model explicitly so its output is not
# confused with the base Turbo cell above.
print("Loading Whisper Large v3 Turbo (ES) model...")
torch.cuda.empty_cache()

whisper_turbo_es_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "adriszmar/whisper-large-v3-turbo-es",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
whisper_turbo_es_model.to(device)

# The processor bundles the tokenizer and the feature extractor.
whisper_turbo_es_processor = AutoProcessor.from_pretrained("adriszmar/whisper-large-v3-turbo-es")

whisper_turbo_es_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_turbo_es_model,
    tokenizer=whisper_turbo_es_processor.tokenizer,
    feature_extractor=whisper_turbo_es_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

print("✓ Whisper Turbo (ES) loaded")
if torch.cuda.is_available():
    # memory_allocated() is in bytes; convert to GiB for display
    print(f"VRAM usage: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

Loading Whisper Large v3 Turbo model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda


✓ Whisper Turbo loaded
VRAM usage: 3.48 GB


## 4. Transcription Functions

In [25]:
def transcribe_with_whisper(audio_path, pipe, model_name, language="spanish"):
    """
    Transcribe audio using Whisper models and report timing/memory figures.

    Args:
        audio_path: Audio input handed to ``pipe`` unchanged
        pipe: Callable ASR pipeline; called with ``generate_kwargs`` and
            ``return_timestamps=False`` and expected to return a mapping
            with a ``'text'`` entry
        model_name: Label used in the printed report and in the result
        language: Language passed to generation (task is always "transcribe")

    Returns:
        dict with keys ``model``, ``text``, ``inference_time`` (seconds) and
        ``memory_used`` (GB: peak allocation during the call minus the
        allocation before it; 0 when CUDA is unavailable).
    """
    print(f"\n{'='*70}")
    print(f"Transcribing with {model_name}")
    print(f"{'='*70}")

    use_cuda = torch.cuda.is_available()

    # Clear cache and reset the peak counter so the peak reflects this call
    if use_cuda:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        initial_memory = torch.cuda.memory_allocated() / 1024**3

    # Transcribe; perf_counter is a monotonic clock meant for measuring
    # intervals, unlike time.time() which can jump if the system clock changes
    start_time = time.perf_counter()

    result = pipe(
        audio_path,
        generate_kwargs={"language": language, "task": "transcribe"},
        return_timestamps=False
    )

    inference_time = time.perf_counter() - start_time

    # Memory stats
    memory_used = 0
    if use_cuda:
        peak_memory = torch.cuda.max_memory_allocated() / 1024**3
        memory_used = peak_memory - initial_memory

    # Display results
    print(f"\n📝 Transcription: {result['text']}")
    print(f"\n⏱️  Inference Time: {inference_time:.3f}s")

    if use_cuda:
        print(f"💾 Memory Used: {memory_used:.3f} GB")
        print(f"📊 Peak Memory: {peak_memory:.3f} GB")

    return {
        "model": model_name,
        "text": result['text'],
        "inference_time": inference_time,
        "memory_used": memory_used
    }

## 5. Record Audio and Test All Models

In [33]:
# Record a short clip from the microphone; the returned path feeds the
# transcription cells below.
audio_file = record_audio(duration=5, sample_rate=16000)

print(f"Using audio file: {audio_file}")
# The upload helper defined in this notebook is upload_audio()
print("\n💡 Tip: In Colab, run: audio_file = upload_audio()")

🎤 Recording for 5 seconds...
Please allow microphone access when prompted!


<IPython.core.display.Javascript object>

✓ Audio recorded!
✓ Audio saved to: recordings/recording_20251028_003531.wav


Using audio file: recordings/recording_20251028_003531.wav

💡 Tip: In Colab, run: audio_file = upload_audio_file()


### Test Whisper Small

In [37]:
# Transcribe the recorded clip with Whisper Small.
result_small = transcribe_with_whisper(
    audio_path=audio_file,
    pipe=whisper_small_pipe,
    model_name="Whisper Small",
    language="spanish",
)


Transcribing with Whisper Small

📝 Transcription:  Hola, mañana se discutirá cuántos centros poblados tiene el ristrito de Comas en Lima.

⏱️  Inference Time: 0.648s
💾 Memory Used: 0.287 GB
📊 Peak Memory: 3.776 GB


### Test Whisper Large v3 Turbo

In [40]:
# Transcribe the recorded clip with Whisper Large v3 Turbo.
result_turbo = transcribe_with_whisper(
    audio_path=audio_file,
    pipe=whisper_turbo_pipe,
    model_name="Whisper Large v3 Turbo",
    language="spanish",
)


Transcribing with Whisper Large v3 Turbo

📝 Transcription:  Hola, mañana se discutirá cuántos centros poblados tienen el distrito de Comas en Lima.

⏱️  Inference Time: 0.673s
💾 Memory Used: 0.185 GB
📊 Peak Memory: 3.674 GB


### Test Whisper Large v3 Turbo - ES

In [41]:
# Transcribe the recorded clip with the Spanish fine-tune of Turbo.
result_turbo_es = transcribe_with_whisper(
    audio_path=audio_file,
    pipe=whisper_turbo_es_pipe,
    model_name="Whisper Large v3 Turbo - ES",
    language="spanish",
)


Transcribing with Whisper Large v3 Turbo - ES

📝 Transcription: O la mañana se discutiría cuántos centros poblados tiene el distrito de Comas en Lima.

⏱️  Inference Time: 0.558s
💾 Memory Used: 0.185 GB
📊 Peak Memory: 3.674 GB


## 6. Comparison Summary

In [42]:
import pandas as pd

# Gather the per-model result dicts and tabulate them.
results = [result_small, result_turbo, result_turbo_es]
comparison_df = pd.DataFrame(results)

# Header and the transcription of every model
print("\n" + "=" * 70, "  MODEL COMPARISON SUMMARY", "=" * 70, "\nTranscriptions:", sep="\n")
for entry in results:
    print(f"\n{entry['model']}:\n  {entry['text']}")

# Timing / memory table
print("\n" + "=" * 70, "Performance Metrics:", "=" * 70, sep="\n")
metric_columns = ['model', 'inference_time', 'memory_used']
print(comparison_df[metric_columns].to_string(index=False))

# Lowest inference time wins
fastest = min(results, key=lambda x: x['inference_time'])
print(f"\n🏆 Fastest Model: {fastest['model']} ({fastest['inference_time']:.3f}s)")

# Memory figures are only reported when a GPU was available
if torch.cuda.is_available():
    most_efficient = min(results, key=lambda x: x['memory_used'])
    print(f"💾 Most Memory Efficient: {most_efficient['model']} ({most_efficient['memory_used']:.3f} GB)")


  MODEL COMPARISON SUMMARY

Transcriptions:

Whisper Small:
   Hola, mañana se discutirá cuántos centros poblados tiene el ristrito de Comas en Lima.

Whisper Large v3 Turbo:
   Hola, mañana se discutirá cuántos centros poblados tienen el distrito de Comas en Lima.

Whisper Large v3 Turbo - ES:
  O la mañana se discutiría cuántos centros poblados tiene el distrito de Comas en Lima.

Performance Metrics:
                      model  inference_time  memory_used
              Whisper Small        0.647920     0.286950
     Whisper Large v3 Turbo        0.673096     0.185043
Whisper Large v3 Turbo - ES        0.557664     0.185042

🏆 Fastest Model: Whisper Large v3 Turbo - ES (0.558s)
💾 Most Memory Efficient: Whisper Large v3 Turbo - ES (0.185 GB)
