# Ambara — Colab Runner

Automated notebook for running **ingest** (extract + sync) and **iterate**
(export + train + re-draft) on Colab with a free GPU.

**Usage:** Fill in the configuration cell below, then choose **Runtime > Run All**.

- Clips are pulled from / pushed to Supabase automatically — no manual uploads.
- Trained models are pushed to HuggingFace Hub — no Google Drive needed.
- Google Drive is only used for the `.env` file (Supabase credentials).

In [None]:
# Notebook configuration. Every name below is read by a later cell, so keep
# the names unchanged and edit only the values.

# ---- Google Drive (only for .env) ----
# Folder name under "MyDrive"; the setup cell looks for `.env` inside it.
DRIVE_ROOT = "ambara"

# ---- Repository ----
# Git remote and branch that the setup cell clones into /content/ny-feoko.
REPO_URL = "https://github.com/ny-randriantsarafara/ny-feoko.git"
REPO_BRANCH = "main"

# ---- HuggingFace ----
# When non-empty, the setup cell passes this to huggingface_hub.login().
# NOTE(review): avoid saving/sharing the notebook with a real token filled in.
HF_TOKEN = ""  # Write token — required for --push-to-hub

# ---- Ingest: download + extract + sync (set False to skip) ----
INGEST_ENABLED = False
# Passed as the positional argument of `pipeline.cli ingest`.
INGEST_INPUT = "https://youtube.com/watch?v=..."  # YouTube URL or Drive file path
# Forwarded as `--label` only when non-empty.
INGEST_LABEL = "my-recording"
# Forwarded as `--whisper-hf` only when non-empty.
INGEST_WHISPER_HF = ""  # HuggingFace model ID for transcription (leave empty for stock Whisper)

# ---- Iterate: export + train + re-draft (set False to skip) ----
ITERATE_ENABLED = True
# Required when ITERATE_ENABLED is True: the iterate cell does not launch
# training while this is empty. Forwarded as `--label`.
ITERATE_LABEL = ""  # Run label in Supabase (from ingest)
# The four values below are forwarded as `--base-model`, `--epochs`,
# `--batch-size` and `--lr` respectively (numbers are converted with str()).
ITERATE_BASE_MODEL = "openai/whisper-small"
ITERATE_EPOCHS = 10
ITERATE_BATCH_SIZE = 4
ITERATE_LR = 1e-5
# Forwarded as `--push-to-hub` only when non-empty; also used by the results
# cell to print the model URL.
ITERATE_PUSH_TO_HUB = ""  # HuggingFace repo ID — strongly recommended on Colab

## Environment Setup

Mounts Drive (for `.env` only), clones or updates the repo, installs dependencies, and logs in to HuggingFace Hub when `HF_TOKEN` is set.

In [None]:
import os
import subprocess
import sys
from pathlib import Path

from google.colab import drive

# Environment setup: mount Drive, sync the repo checkout to REPO_BRANCH,
# link .env, install dependencies, log in to HuggingFace, print a summary.
# Reads DRIVE_ROOT, REPO_URL, REPO_BRANCH, HF_TOKEN, ITERATE_ENABLED and
# ITERATE_PUSH_TO_HUB from the configuration cell.

DRIVE_MOUNT = Path("/content/drive")
DRIVE_BASE = DRIVE_MOUNT / "MyDrive" / DRIVE_ROOT
REPO_DIR = Path("/content/ny-feoko")

# ---- Mount Google Drive (for .env) ----
# Skip the mount when "MyDrive" is already visible (e.g. the cell is re-run).
if not (DRIVE_MOUNT / "MyDrive").exists():
    drive.mount(str(DRIVE_MOUNT))
DRIVE_BASE.mkdir(parents=True, exist_ok=True)
print(f"Drive mounted at {DRIVE_MOUNT}")

# ---- Clone or update repo ----
if REPO_DIR.exists():
    # An existing checkout may be on a different branch than REPO_BRANCH
    # (the config can change between runs), so switch explicitly before
    # fast-forwarding instead of pulling whatever happens to be checked out.
    git_in_repo = ["git", "-C", str(REPO_DIR)]
    subprocess.run([*git_in_repo, "fetch", "origin"], check=True)
    subprocess.run([*git_in_repo, "checkout", REPO_BRANCH], check=True)
    subprocess.run(
        [*git_in_repo, "pull", "--ff-only", "origin", REPO_BRANCH],
        check=True,
    )
    print(f"Repo updated: {REPO_DIR}")
else:
    subprocess.run(
        ["git", "clone", "-b", REPO_BRANCH, REPO_URL, str(REPO_DIR)],
        check=True,
    )
    print(f"Repo cloned: {REPO_DIR}")

# Later cells invoke `python -m pipeline.cli` relative to the repo root.
os.chdir(REPO_DIR)

# ---- Symlink .env from Drive (if present) ----
env_drive = DRIVE_BASE / ".env"
env_local = REPO_DIR / ".env"
if env_drive.exists():
    # is_symlink() also catches a dangling link, for which exists() is False.
    if env_local.is_symlink() or env_local.exists():
        env_local.unlink()
    env_local.symlink_to(env_drive)
    print(f"  .env -> {env_drive}")
else:
    print("WARNING: No .env found on Drive. Place it at My Drive/ambara/.env")

# ---- Python version check ----
v = sys.version_info
print(f"\nPython {v.major}.{v.minor}.{v.micro}")
if v < (3, 10):
    print("WARNING: This project requires Python >= 3.10.")

# ---- Install dependencies ----
subprocess.run(["make", "colab-install"], check=True, cwd=str(REPO_DIR))

# ---- HuggingFace login ----
if HF_TOKEN:
    # Imported here because the package is only needed when a token is set.
    from huggingface_hub import login
    login(token=HF_TOKEN)
    print("Logged in to HuggingFace Hub.")
elif ITERATE_ENABLED and ITERATE_PUSH_TO_HUB:
    # Surface the likely misconfiguration now rather than after training.
    print(
        "WARNING: ITERATE_PUSH_TO_HUB is set but HF_TOKEN is empty; "
        "pushing to the Hub may fail unless credentials are provided another way."
    )

# ---- Environment summary ----
# Imported after `make colab-install` has run.
import torch

gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A"
print(f"\n{'=' * 50}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA:    {torch.cuda.is_available()} ({gpu_name})")
print(f"Repo:    {REPO_DIR}")
print(f"{'=' * 50}")
print("Setup complete.")

## Ingest

Downloads audio (if URL), extracts speech clips, and syncs everything to
Supabase in one shot. Skip this if you already ingested locally.

In [None]:
# Ingest step: run `pipeline.cli ingest` on INGEST_INPUT unless disabled.
# Reads INGEST_ENABLED, INGEST_INPUT, INGEST_LABEL and INGEST_WHISPER_HF
# from the configuration cell.
import shlex

if not INGEST_ENABLED:
    print("Ingest skipped (INGEST_ENABLED = False).")
else:
    cmd = [
        "python", "-m", "pipeline.cli", "ingest",
        INGEST_INPUT,
        "--device", "cuda",
        "--verbose",
    ]
    # Optional flags are only added when the corresponding setting is non-empty.
    if INGEST_LABEL:
        cmd += ["--label", INGEST_LABEL]
    if INGEST_WHISPER_HF:
        cmd += ["--whisper-hf", INGEST_WHISPER_HF]

    # shlex.join quotes arguments containing spaces or shell metacharacters
    # (URLs with `?`/`&`, Drive paths with spaces), so the echoed line is
    # unambiguous and safe to paste into a shell.
    print(f"Running: {shlex.join(cmd)}")
    # check=True: a failing ingest raises CalledProcessError and stops Run All.
    subprocess.run(cmd, check=True)

## Iterate

Exports corrected clips from Supabase, fine-tunes Whisper, and re-drafts
pending clips — all in one command. Clips are downloaded from Supabase
Storage automatically (no manual upload needed).

In [None]:
# Iterate step: run `pipeline.cli iterate` for ITERATE_LABEL unless disabled.
# Reads the ITERATE_* settings from the configuration cell.
#
# Raises:
#     ValueError: if iterate is enabled but ITERATE_LABEL is empty.
import shlex

if not ITERATE_ENABLED:
    print("Iterate skipped (ITERATE_ENABLED = False).")
else:
    if not ITERATE_LABEL:
        # Fail loudly: a printed message would let "Run All" continue to the
        # results cell and report next steps although nothing was trained.
        raise ValueError("Set ITERATE_LABEL to the run label from ingest.")

    cmd = [
        "python", "-m", "pipeline.cli", "iterate",
        "--label", ITERATE_LABEL,
        "--device", "cuda",
        "--base-model", ITERATE_BASE_MODEL,
        "--epochs", str(ITERATE_EPOCHS),
        "--batch-size", str(ITERATE_BATCH_SIZE),
        "--lr", str(ITERATE_LR),
    ]
    # Pushing to the Hub is optional; the flag is omitted when no repo is set.
    if ITERATE_PUSH_TO_HUB:
        cmd += ["--push-to-hub", ITERATE_PUSH_TO_HUB]

    # shlex.join keeps the echoed command unambiguous when a value contains
    # spaces or shell metacharacters.
    print(f"Running: {shlex.join(cmd)}")
    # check=True: a failing run raises CalledProcessError and stops Run All.
    subprocess.run(cmd, check=True)

## Results

Summary of what happened and suggested next steps.

In [None]:
# Results summary: report the local model size (if the directory exists),
# the Hub URL (if a repo was configured), and the suggested next steps.
from pathlib import Path

rule = "=" * 50
print(rule, "Results", rule, sep="\n")

model_dir = Path("models/whisper-mg-v1/model")
if model_dir.exists():
    # Add up the size of every regular file below the model directory.
    total_bytes = 0
    for entry in model_dir.rglob("*"):
        if entry.is_file():
            total_bytes += entry.stat().st_size
    size_mb = total_bytes / (1024 * 1024)
    print(f"\nTrained model: {model_dir} ({size_mb:.0f} MB)")

if ITERATE_PUSH_TO_HUB:
    print(f"  HuggingFace: https://huggingface.co/{ITERATE_PUSH_TO_HUB}")

print("\nNext steps:")
print("  1. Open the transcript editor: ./ambara editor")
print("  2. Correct the improved drafts")
print("  3. Run this notebook again to train on the new corrections")

if ITERATE_PUSH_TO_HUB:
    # Show how to point a local extract run at the pushed model.
    print("\nUse the model locally:")
    print("  ./ambara extract -i audio.wav -o data/output/ --device mps \\")
    print(f"      --whisper-hf {ITERATE_PUSH_TO_HUB}")