# Faster Whisper for Small Scale Speech To Text Test

In [1]:
import sys
print(sys.prefix)
print(sys.executable)
import os
print(os.environ)

/opt/anaconda3
/opt/anaconda3/bin/python
environ({'COMMAND_MODE': 'unix2003', 'CONDA_DEFAULT_ENV': 'base', 'CONDA_EXE': '/opt/anaconda3/bin/conda', 'CONDA_PREFIX': '/opt/anaconda3', 'CONDA_PROMPT_MODIFIER': '(base) ', 'CONDA_PYTHON_EXE': '/opt/anaconda3/bin/python', 'CONDA_SHLVL': '1', 'GSETTINGS_SCHEMA_DIR': '/opt/anaconda3/share/glib-2.0/schemas', 'HOME': '/Users/hehvince', 'HOMEBREW_CELLAR': '/opt/homebrew/Cellar', 'HOMEBREW_PREFIX': '/opt/homebrew', 'HOMEBREW_REPOSITORY': '/opt/homebrew', 'INFOPATH': '/opt/homebrew/share/info:', 'LOGNAME': 'hehvince', 'MAMBA_EXE': '/Users/hehvince/.micromamba/bin/micromamba', 'MAMBA_ROOT_PREFIX': '/Users/hehvince/micromamba', 'MallocNanoZone': '0', 'OLDPWD': '/', 'ORIGINAL_XDG_CURRENT_DESKTOP': 'undefined', 'PATH': '/opt/anaconda3/bin:/Users/hehvince/micromamba/condabin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local

In [2]:

import time, statistics, math, os, tempfile, pathlib
from typing import List, Tuple
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf

from datasets import load_dataset, Audio, concatenate_datasets
from jiwer import wer
from faster_whisper import WhisperModel



In [3]:
# ---- Configure model ----
MODEL_NAME = "small.en"       # use "small" if you need multilingual
COMPUTE_TYPE = "int8"         # int8 for low RAM & good speed/accuracy balance
DEVICE = "auto"               # auto -> Metal on Apple Silicon; else CPU


model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
print(MODEL_NAME, "Loaded.")


small.en Loaded.


In [None]:
def transcribe_array_to_text(audio_array: np.ndarray, sample_rate: int, beam_size=5, vad_filter=True) -> Tuple[str, float]:
    """Write an audio array to a temp WAV and transcribe. Returns (text, processing_time_seconds)"""
    # Ensure mono float32 at 16k if needed:
    if audio_array.ndim > 1:
        audio_array = np.mean(audio_array, axis=1)
    if sample_rate != 16000:
        # librosa can resample but to avoid extra deps, let faster-whisper handle resampling from file.
        pass

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp:
        sf.write(tmp.name, audio_array, sample_rate)
        t0 = time.perf_counter()
        segments, info = model.transcribe(tmp.name, vad_filter=vad_filter, beam_size=beam_size)
        hyp = "".join(seg.text for seg in segments).strip()
        dt = time.perf_counter() - t0
    return hyp, dt

def eval_dataset(dataset, max_items=None, beam_size=5, vad_filter=True, verbose_every=25):
    """Evaluate a HuggingFace dataset that has fields: audio[array, sampling_rate], text.

    Returns dict with refs, hyps, proc_times, audio_durs, rtf_list, and WER.

    If max_items is None, processing entire split (can take long).
    """

    refs, hyps = [], []
    proc_times, audio_durs, rtf_list = [], [], []
    n = len(dataset) if hasattr(dataset, "__len__") else None
    count = 0
    iterator = dataset if not n else range(n)

    for i, item in enumerate(iterator):
        
        if i < len(dataset):
            rec = item if not n else dataset[i]
        else:
            print(f"Index {i} out of bounds")

        rec = item if not n else dataset[i]
        audio = rec["audio"]
        arr = audio["array"].astype(np.float32)
        sr = audio["sampling_rate"]
        duration = len(arr) / float(sr)
        ref = rec["text"].strip().lower()

        hyp, dt = transcribe_array_to_text(arr, sr, beam_size=beam_size, vad_filter=vad_filter)
        hyp = hyp.strip().lower()

        refs.append(ref); hyps.append(hyp)
        proc_times.append(dt); audio_durs.append(duration)
        rtf = dt / max(1e-9, duration)
        rtf_list.append(rtf)

        count += 1
        if verbose_every and (count % verbose_every == 0):
            print(f"[{count}] dur={duration:.1f}s  time={dt:.2f}s  RTF={rtf:.2f}")
        if max_items is not None and count >= max_items:
            break

    metric = wer(refs, hyps)
    return {
        "refs": refs,
        "hyps": hyps,
        "proc_times": proc_times,
        "audio_durs": audio_durs,
        "rtf_list": rtf_list,
        "wer": metric
    }

def plot_latency_histogram(rtf_list: List[float]):
    """Plot a simple histogram of Real-Time Factor (RTF). Lower is better (<1 == faster than real-time)."""
    plt.figure(figsize=(6,4))
    plt.hist(rtf_list, bins=20)
    plt.title("Per-file Real-Time Factor (RTF)")
    plt.xlabel("RTF (processing_time / audio_duration)")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()


In [8]:
# ===== Baseline on LibriSpeech: dev-clean (validation) and test-clean =====
from datasets import load_dataset, Audio, concatenate_datasets
import statistics, numpy as np

print("Downloading LibriSpeech splits (this can take minutes the first time)...")
ds_dev = load_dataset("openslr/librispeech_asr", "clean", split="validation")
ds_test = load_dataset("openslr/librispeech_asr", "clean", split="test")

# IMPORTANT: ensure we get NumPy arrays (not torch tensors)
ds_dev = ds_dev.cast_column("audio", Audio(sampling_rate=16000))
ds_test = ds_test.cast_column("audio", Audio(sampling_rate=16000))

# Merge the two clean splits (~10h total if you later run all items)
ds = concatenate_datasets([ds_dev, ds_test])

# Print the number of examples in the dataset (length of the dataset)
print(f"Number of items in the dataset: {len(ds)}")

# Print the column names to inspect the structure
print(f"Dataset columns: {ds.column_names}")

# You can keep this small for quick sanity checks and increase later
MAX_ITEMS = 50   # set to None to run all files in dev+test (≈10 hours total)
print("Evaluating... (MAX_ITEMS=", MAX_ITEMS, ")")
results = eval_dataset(ds, max_items=MAX_ITEMS, beam_size=5, vad_filter=True)

print("\n== LibriSpeech results ==")
print("WER:", results["wer"])
rtf = results["rtf_list"]
print(f"RTF  mean={statistics.mean(rtf):.2f}  median={statistics.median(rtf):.2f}  p90={np.percentile(rtf,90):.2f}")

# Tiny latency chart (RTF histogram)
plot_latency_histogram(results["rtf_list"])


Downloading LibriSpeech splits (this can take minutes the first time)...


Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Number of items in the dataset: 5323
Dataset columns: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id']
Evaluating... (MAX_ITEMS= 50 )


RuntimeError: Could not load libtorchcodec. Likely causes:
          1. FFmpeg is not properly installed in your environment. We support
             versions 4, 5, 6 and 7.
          2. The PyTorch version (2.8.0) is not compatible with
             this version of TorchCodec. Refer to the version compatibility
             table:
             https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.
          3. Another runtime dependency; see exceptions below.
        The following exceptions were raised as we tried to load libtorchcodec:
        
[start of libtorchcodec loading traceback]
FFmpeg version 7: dlopen(/opt/anaconda3/lib/python3.12/site-packages/torchcodec/libtorchcodec_core7.dylib, 0x0006): Library not loaded: @rpath/libavutil.59.dylib
  Referenced from: <7706FFE4-9BB3-3BCA-880F-8416ADA3BD4A> /opt/anaconda3/lib/python3.12/site-packages/torchcodec/libtorchcodec_core7.dylib
  Reason: tried: '/opt/anaconda3/lib/python3.12/lib-dynload/../../libavutil.59.dylib' (no such file), '/opt/anaconda3/bin/../lib/libavutil.59.dylib' (no such file)
FFmpeg version 6: dlopen(/opt/anaconda3/lib/python3.12/site-packages/torchcodec/libtorchcodec_core6.dylib, 0x0006): Library not loaded: @rpath/libavutil.58.dylib
  Referenced from: <724DA566-18E0-3A5D-A6EF-CD214BDD9FA1> /opt/anaconda3/lib/python3.12/site-packages/torchcodec/libtorchcodec_core6.dylib
  Reason: tried: '/opt/anaconda3/lib/python3.12/lib-dynload/../../libavutil.58.dylib' (no such file), '/opt/anaconda3/bin/../lib/libavutil.58.dylib' (no such file)
FFmpeg version 5: dlopen(/opt/anaconda3/lib/python3.12/site-packages/torchcodec/libtorchcodec_core5.dylib, 0x0006): Library not loaded: @rpath/libavutil.57.dylib
  Referenced from: <B30E8984-8BA6-39EC-8B21-07BBC8AECC80> /opt/anaconda3/lib/python3.12/site-packages/torchcodec/libtorchcodec_core5.dylib
  Reason: tried: '/opt/anaconda3/lib/python3.12/lib-dynload/../../libavutil.57.dylib' (no such file), '/opt/anaconda3/bin/../lib/libavutil.57.dylib' (no such file)
FFmpeg version 4: dlopen(/opt/anaconda3/lib/python3.12/site-packages/torchcodec/libtorchcodec_core4.dylib, 0x0006): Library not loaded: @rpath/libavutil.56.dylib
  Referenced from: <456702AB-221D-3ECD-860E-E97F76E6F6DC> /opt/anaconda3/lib/python3.12/site-packages/torchcodec/libtorchcodec_core4.dylib
  Reason: tried: '/opt/anaconda3/lib/python3.12/lib-dynload/../../libavutil.56.dylib' (no such file), '/opt/anaconda3/bin/../lib/libavutil.56.dylib' (no such file)
[end of libtorchcodec loading traceback].

In [None]:
# ===== Optional realism pass: People's Speech (MLCommons) ~10 hours via streaming =====
# This will *stream* the dataset and process until ~10 hours of audio are seen, without downloading everything.
# You can reduce TARGET_HOURS to 5 if you're short on time/disk.
TARGET_HOURS = 5
MAX_ITEMS = None   # or cap number of utterances to a fixed integer

print("Preparing People's Speech streaming loader...")
ds_ps = load_dataset("MLCommons/peoples_speech", "clean", split="train", streaming=True)
ds_ps = ds_ps.cast_column("audio", Audio(sampling_rate=16000))

refs, hyps, proc_times, audio_durs, rtf_list = [], [], [], [], []
seen_sec = 0.0
count = 0

for rec in ds_ps:
    arr = rec["audio"]["array"].astype(np.float32)
    sr = rec["audio"]["sampling_rate"]
    duration = rec.get("duration_ms", len(arr) / sr * 1000) / 1000.0
    ref = rec.get("text", "").strip().lower()

    hyp, dt = transcribe_array_to_text(arr, sr, beam_size=5, vad_filter=True)
    hyp = hyp.strip().lower()

    refs.append(ref); hyps.append(hyp)
    proc_times.append(dt); audio_durs.append(duration)
    rtf_list.append(dt / max(1e-9, duration))

    seen_sec += duration
    count += 1
    if count % 25 == 0:
        print(f"[{count}] cum_dur={seen_sec/3600:.2f}h  last_dur={duration:.1f}s  last_time={dt:.2f}s")

    if (MAX_ITEMS is not None and count >= MAX_ITEMS) or (seen_sec >= TARGET_HOURS * 3600):
        break

print("\n== People's Speech ~%.2f hours summary ==" % (seen_sec/3600))
try:
    print("WER:", wer(refs, hyps))
except Exception as e:
    print("WER could not be computed (maybe empty references). Error:", e)
if rtf_list:
    print(f"RTF  mean={statistics.mean(rtf_list):.2f}  median={statistics.median(rtf_list):.2f}")
else:
    print("No items processed.")
