# Lab 3.4: Speech-to-Text with Whisper

**Objective**: Transcribe audio using OpenAI Whisper

**Duration**: 25 minutes

## Learning Outcomes
- Load Whisper model for transcription
- Handle audio files
- Detect language automatically

In [None]:
# Put "../../../src" first on the import search path. The path is relative to
# the current working directory, so this assumes the notebook is run from its
# own folder — TODO confirm. Presumably that directory holds hf_ecosystem.
import sys
sys.path.insert(0, "../../../src")
# This import must come after the path insert above.
from hf_ecosystem import __version__
# Printing the version doubles as a check that the import succeeded.
print(f"hf-ecosystem version: {__version__}")

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

## 1. Load Whisper Model and Processor

In [None]:
# Manual inference is used instead of the high-level pipeline because of a
# pipeline bug in transformers 5.0. The model and the processor are loaded
# from the same checkpoint id.
checkpoint = "openai/whisper-tiny"
model = WhisperForConditionalGeneration.from_pretrained(checkpoint)
processor = WhisperProcessor.from_pretrained(checkpoint)
print("Whisper tiny model loaded")

## 2. Transcribe Audio

In [None]:
# Fetch only the first validation example of the dummy LibriSpeech dataset.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation[:1]",
)
audio_decoder = ds[0]["audio"]

# Decode the whole clip (new torchcodec API), then drop the leading channel
# axis and convert to numpy. Assumes a single-channel clip — with more than
# one channel squeeze(0) would leave the array two-dimensional.
samples = audio_decoder.get_all_samples()
waveform = samples.data.squeeze(0).numpy()
audio = dict(array=waveform, sampling_rate=samples.sample_rate)

print(f"Audio sampling rate: {audio['sampling_rate']} Hz")
print(f"Audio shape: {audio['array'].shape}")

In [None]:
# Manual inference, step 1: turn the raw waveform into model input features.
inputs = processor(
    audio["array"],
    sampling_rate=audio["sampling_rate"],
    return_tensors="pt",
)

# Step 2: generate token ids. Language and task are fixed to English
# transcription here rather than left for the model to infer.
generated_ids = model.generate(
    inputs["input_features"],
    language="en",
    task="transcribe",
)

# Step 3: decode the ids back to text and keep the first (only) item.
decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
result = {"text": decoded_texts[0]}
print(f"Transcription: {result['text']}")

## Verification

In [None]:
def verify_lab():
    """Check that the transcription step produced non-empty text.

    Reads the module-level ``result`` dict and prints a success message when
    every check passes.

    Raises:
        AssertionError: If ``result`` has no ``"text"`` entry, or the text is
            empty or consists only of whitespace.
    """
    assert "text" in result, "result has no 'text' entry"
    # Whitespace alone is not a usable transcription, so strip before checking.
    assert result["text"].strip(), "transcription text is empty"
    print("✅ Lab completed successfully!")

# Run the checks now; a failed check raises AssertionError and stops the cell.
verify_lab()