# Swahili ASR — Offline Baseline (Colab)

**Goal:** Build a fast, privacy-preserving Kiswahili ASR baseline that runs offline on a single NVIDIA T4 (≤16 GB), produces `submission.csv`, and logs the inverse real-time factor (RTFx = audio duration ÷ processing time; higher is faster).

> ⚠️ **Use only Zindi-provided data** for training/evaluation. Pretrained *open* models are okay unless the challenge page states otherwise.


In [ ]:
#@title 1) Install dependencies (open-source only)
# First batch: faster-whisper (>= 1.0.0) and ctranslate2 for inference, soundfile for audio
# I/O, jiwer for WER, plus datasets/torch/torchaudio/pynvml; `--upgrade` pulls the newest
# release of each package listed on this line.
!pip -q install 'faster-whisper>=1.0.0' ctranslate2 datasets soundfile jiwer torch torchaudio pynvml --upgrade
# Second batch: none of these are imported by the cells visible in this notebook —
# presumably they serve the optional CTC fine-tuning script; confirm before trimming.
!pip -q install transformers accelerate evaluate sentencepiece pyctcdecode


In [ ]:
#@title 2) Config
import os, re, time, json, glob
import numpy as np
import pandas as pd
import soundfile as sf
from faster_whisper import WhisperModel

# Seed constant (defined here; not referenced by the cells visible in this notebook).
SEED = 42

# --- Input / output locations ---
TEST_WAV_GLOB = 'data/test/*.wav'  # put Zindi test wavs here
OUT_CSV = 'artifacts/submission.csv'

# --- Model selection ---
MODEL_NAME = 'small'          # tiny|base|small
COMPUTE_TYPE = 'int8_float16' # good balance of speed/memory on T4
LANGUAGE = 'sw'

# --- Decoding options forwarded to model.transcribe() ---
# NOTE(review): a patience below 1 presumably ends beam search early — confirm against
# the CTranslate2 documentation before tuning.
BEAM_SIZE, PATIENCE, TEMPERATURE = 5, 0.2, 0.0

# --- Voice-activity detection and context carry-over ---
VAD_FILTER = True
MIN_SIL_MS = 200
CONDITION_ON_PREV = False

# Ensure the output directory exists before any cell writes into it.
os.makedirs('artifacts', exist_ok=True)

def normalize_text(t: str) -> str:
  """Return *t* lower-cased, with punctuation blanked out and whitespace collapsed.

  Every character that is neither a word character (``\\w``, which includes the
  underscore) nor whitespace is replaced by a single space; runs of whitespace
  are then squeezed to one space and the ends are trimmed.
  """
  lowered = t.lower()
  without_punct = re.sub(r"[^\w\s]", " ", lowered)
  return re.sub(r"\s+", " ", without_punct).strip()


### (Optional) Data layout help
Place files like so:
```
data/
 ├── test/               # Zindi test WAV files
 ├── train/              # Zindi train WAV files
 └── splits/             # your own manifests (filename,text) for val
```


In [ ]:
#@title 3) Load model (faster-whisper / CTranslate2)
# Instantiate the Whisper model on the CUDA device with the size and compute type
# chosen in the Config cell, then echo the choice for the run log.
model = WhisperModel(
    MODEL_NAME,
    device='cuda',
    compute_type=COMPUTE_TYPE,
)
print('Loaded model:', MODEL_NAME, 'compute_type:', COMPUTE_TYPE)


In [ ]:
#@title 4) Inference on test set -> submission.csv
# Transcribe every file matched by TEST_WAV_GLOB, log per-file RTFx
# (audio seconds / wall-clock seconds, higher is faster) and write
# filename,text rows to OUT_CSV.
wav_files = sorted(glob.glob(TEST_WAV_GLOB))
if not wav_files:
  print('WARNING: no files matched', TEST_WAV_GLOB, '- the submission will contain no rows')

rows, rtfs = [], []
for p in wav_files:
  name = os.path.basename(p)
  # perf_counter() is monotonic and high-resolution, so it suits interval timing
  # better than the wall clock.
  t0 = time.perf_counter()
  # Hand the *path* to faster-whisper instead of a soundfile array: the library then
  # decodes the file itself into the 16 kHz mono float32 waveform the model expects.
  # sf.read() yields float64 samples at the file's native rate (2-D for multi-channel
  # audio), and an array passed to transcribe() is not resampled or down-mixed.
  segments, info = model.transcribe(
      p,
      language=LANGUAGE, beam_size=BEAM_SIZE, patience=PATIENCE,
      vad_filter=VAD_FILTER, vad_parameters=dict(min_silence_duration_ms=MIN_SIL_MS),
      condition_on_previous_text=CONDITION_ON_PREV, temperature=TEMPERATURE,
  )
  # `segments` is consumed lazily, so decoding happens during this join; the timer
  # must therefore stop only after the text has been assembled.
  hypo = ' '.join(s.text for s in segments).strip()
  elapsed = time.perf_counter() - t0
  # Duration of the full input clip in seconds, as reported by the decoder.
  clip_len = float(info.duration)
  # Guard against a zero elapsed time on extremely short clips.
  rtf = clip_len / max(elapsed, 1e-6)
  rtfs.append(rtf)
  rows.append({'filename': name, 'text': normalize_text(hypo)})
  print(name, f'len={clip_len:.2f}s time={elapsed:.2f}s RTFx={rtf:.2f}')

import pandas as pd
# Explicit columns keep the CSV header present even when no file was transcribed.
pd.DataFrame(rows, columns=['filename', 'text']).to_csv(OUT_CSV, index=False)
print('Saved submission ->', OUT_CSV)
# np.mean([]) would emit a RuntimeWarning and print nan; report n/a instead.
print('Mean RTFx:', np.mean(rtfs) if rtfs else 'n/a')


In [ ]:
#@title 5) (Optional) WER on a local validation split
# Score the model on a user-supplied manifest (columns: filename,text) whose audio
# lives under data/train/. Rows that cannot be scored are skipped and counted.
import jiwer
VAL_MANIFEST = 'data/splits/val_manifest.csv'  # filename,text
if os.path.exists(VAL_MANIFEST):
  df = pd.read_csv(VAL_MANIFEST)
  preds, refs = [], []
  n_missing, n_empty_ref = 0, 0
  for fname, text in zip(df['filename'], df['text']):
    wav_p = os.path.join('data', 'train', str(fname))
    if not os.path.exists(wav_p):
      n_missing += 1
      continue
    # A missing transcript is read by pandas as NaN; str(NaN) would become the bogus
    # reference "nan". An empty reference has no words to score against, so skip both.
    ref = '' if pd.isna(text) else normalize_text(str(text))
    if not ref:
      n_empty_ref += 1
      continue
    # Pass the path so faster-whisper decodes/resamples the audio itself; a raw
    # soundfile array (float64, native sample rate, possibly multi-channel) is not
    # converted to the 16 kHz mono input the model expects.
    segments, _ = model.transcribe(
        wav_p, language=LANGUAGE, beam_size=BEAM_SIZE, patience=PATIENCE,
        vad_filter=VAD_FILTER, vad_parameters=dict(min_silence_duration_ms=MIN_SIL_MS),
        condition_on_previous_text=CONDITION_ON_PREV, temperature=TEMPERATURE,
    )
    hypo = normalize_text(' '.join(s.text for s in segments).strip())
    preds.append(hypo); refs.append(ref)
  if n_missing or n_empty_ref:
    print('Skipped rows - missing audio:', n_missing, 'empty reference:', n_empty_ref)
  if refs:
    print('Validation WER:', jiwer.wer(refs, preds))
  else:
    # Nothing to score: avoid calling jiwer with empty lists.
    print('Skip: no usable validation pairs in', VAL_MANIFEST)
else:
  print('Skip: no validation manifest at', VAL_MANIFEST)


In [ ]:
#@title 6) Peak GPU memory (approx) — ensure < 16 GB
# NOTE: nvidia-smi reports the memory in use at the moment this cell runs (one line per
# GPU), not a true peak — run it right after inference while the model is still loaded.
import subprocess
try:
  # Call nvidia-smi directly with an argument list; no login shell is needed, and a
  # missing binary surfaces as a clear OSError instead of a bash exit code.
  out = subprocess.check_output(
      ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits']
  ).decode()
except (OSError, subprocess.CalledProcessError) as e:
  print('nvidia-smi not available:', e)
else:
  # Keep only purely numeric lines so a value such as "[N/A]" does not abort the
  # report or get misreported as a missing nvidia-smi.
  vals = [int(line) for line in (raw.strip() for raw in out.splitlines()) if line.isdigit()]
  print('Approx GPU memory used (MiB):', max(vals) if vals else 'n/a')


## (Optional) Fine-tune CTC (Wav2Vec2 XLS‑R)
Use `scripts/train_wav2vec2_ctc.py` with your own `data/splits/train_manifest.csv` and `val_manifest.csv`. Keep batches small to fit a single T4; prefer FP16.
