# 🎙️ TTS Worker — Kokoro GPU on Colab T4

This notebook runs on Google Colab with a T4 GPU to generate TTS narration audio files.

**How it works:**
1. Mounts Google Drive
2. Watches a job directory for incoming TTS requests (JSON files)
3. Generates WAV audio using Kokoro with GPU-accelerated onnxruntime
4. Writes WAV files back to Drive for the local machine to pick up

**Setup:** Runtime → Change runtime type → T4 GPU

## 1. Install Dependencies

In [None]:
# Kokoro ONNX bindings, plus soundfile which later cells use to read/write WAV files.
!pip install -q kokoro-onnx soundfile
# Install the GPU build of onnxruntime from the CUDA 12 package index.
# NOTE(review): pip does not uninstall a CPU `onnxruntime` wheel that is already
# present. If the provider check in section 4 does not list
# CUDAExecutionProvider, confirm the two wheels are not installed side by side.
!pip install -q onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
# Print GPU/driver status so a runtime without a GPU is noticed immediately.
!nvidia-smi

## 2. Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the job and model-cache directories
# configured in later cells both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

## 3. Configure Paths

The job directory structure on Google Drive:
```
My Drive/
  autonomous-recording/
    tts-jobs/
      <job-id>/
        request.json    ← local machine writes this
        audio/          ← worker writes WAVs here
        done.marker     ← worker writes when complete
        error.marker    ← worker writes if the job fails
```

In [None]:
import os

# Root folder on Google Drive that holds one sub-directory per TTS job.
DRIVE_BASE = "/content/drive/MyDrive/autonomous-recording/tts-jobs"
os.makedirs(DRIVE_BASE, exist_ok=True)
print(f"Job directory: {DRIVE_BASE}")

# The directory is guaranteed to exist after makedirs(), so decide between the
# listing and 'none' by emptiness rather than by an existence check that can
# never fail.
_existing_jobs = sorted(os.listdir(DRIVE_BASE))
print(f"Existing jobs: {_existing_jobs if _existing_jobs else 'none'}")

## 4. Verify GPU + ONNX Runtime

In [None]:
import onnxruntime as ort

# Ask onnxruntime which execution providers this runtime can use and note
# whether the CUDA one is among them.
providers = ort.get_available_providers()
has_cuda = 'CUDAExecutionProvider' in providers

print(f"Available ONNX providers: {providers}")
print(f"CUDA available: {has_cuda}")

# Without the CUDA provider, print the troubleshooting checklist line by line.
if not has_cuda:
    for _hint in (
        "‚ö†Ô∏è  CUDA not available. Check that:",
        "   1. Runtime type is set to GPU (T4)",
        "   2. onnxruntime-gpu installed correctly",
        "   Falling back to CPU (will be slower)",
    ):
        print(_hint)

## 5. Download Kokoro Model (cached on Drive)

In [None]:
import urllib.request

# Download sources for the Kokoro v1.0 model and voices files.
MODEL_URL = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"
VOICES_URL = "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"

# Drive folder used as the model cache, and the two file paths inside it.
MODEL_CACHE = "/content/drive/MyDrive/autonomous-recording/models"
KOKORO_MODEL, KOKORO_VOICES = (
    os.path.join(MODEL_CACHE, _filename)
    for _filename in ("kokoro-v1.0.onnx", "voices-v1.0.bin")
)

# Make sure the cache folder exists before anything is downloaded into it.
os.makedirs(MODEL_CACHE, exist_ok=True)

def download_if_missing(url, dest, label):
    """Download *url* to *dest* unless a non-empty copy is already there.

    The payload is fetched into a sibling ``<dest>.part`` file and renamed onto
    *dest* only after the transfer has finished, so an interrupted download is
    never mistaken for a cached file on the next run.

    Args:
        url: Source URL, handed to ``urllib.request.urlretrieve``.
        dest: Destination file path.
        label: Human-readable name used in the progress messages.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``os.replace`` raises; the
        partial file is removed before the exception propagates.
    """
    # A zero-byte file is the residue of a failed transfer, not a usable cache.
    if os.path.exists(dest) and os.path.getsize(dest) > 0:
        size_mb = os.path.getsize(dest) / (1024 * 1024)
        print(f"‚úì {label} already cached ({size_mb:.1f} MB)")
        return

    print(f"‚¨á Downloading {label}...")
    partial_path = f"{dest}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, dest)
    finally:
        # After a successful replace the partial file is gone and this is a
        # no-op; after a failure (or an interrupt) it removes the leftovers.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    size_mb = os.path.getsize(dest) / (1024 * 1024)
    print(f"‚úì {label} downloaded ({size_mb:.1f} MB)")

# Fetch the model file first, then the voices file, via download_if_missing().
for _url, _dest, _label in (
    (MODEL_URL, KOKORO_MODEL, "Kokoro model"),
    (VOICES_URL, KOKORO_VOICES, "Kokoro voices"),
):
    download_if_missing(_url, _dest, _label)

## 6. Initialize Kokoro TTS Engine

In [None]:
import time
from kokoro_onnx import Kokoro

# Build the shared Kokoro engine from the cached model and voices files. The
# module-level `kokoro` object is what process_tts_job() and the quick-test
# cell call into.
# NOTE(review): nothing here selects an execution provider explicitly; confirm
# that kokoro-onnx picks CUDAExecutionProvider on its own in this environment.
print("Loading Kokoro model...")
t0 = time.time()
kokoro = Kokoro(KOKORO_MODEL, KOKORO_VOICES)
print(f"‚úì Model loaded in {time.time() - t0:.1f}s")

# Run one short throwaway synthesis and time it; the returned audio and sample
# rate are discarded. Presumably this pays first-call overhead here rather than
# on the first real job.
print("Warming up GPU...")
t0 = time.time()
_, _ = kokoro.create("Hello world.", voice="am_michael", speed=1.0, lang="en-us")
print(f"‚úì Warm-up done in {time.time() - t0:.2f}s")

## 7. TTS Job Processor

Processes a single job directory. Called by the watcher loop or directly.

In [None]:
import json
import soundfile as sf
import tempfile


def process_tts_job(job_dir: str) -> dict:
    """Synthesize every narration step of one job directory.

    Layout of ``job_dir``:
      request.json - input, see the format below
      audio/       - output, one ``step-<id>.wav`` file per step
      done.marker  - JSON summary, written after all steps succeeded
      error.marker - JSON error report, written when the job failed

    request.json format:
    {
        "voice": "am_michael",
        "speed": 1.0,
        "language": "en-us",
        "steps": [
            {"id": "step-01", "narration": "Text to synthesize..."},
            ...
        ]
    }
    ``voice``, ``speed`` and ``language`` are optional and default to the
    values shown; ``steps`` defaults to an empty list.

    Args:
        job_dir: Path of the job directory.

    Returns:
        A dict whose ``status`` is one of:
          ``"already_done"`` - done.marker already exists; nothing was done.
          ``"no_request"``   - request.json is missing; nothing was done.
          ``"completed"``    - the summary that was written to done.marker.
          ``"error"``        - the report that was written to error.marker,
                               holding ``error`` (the message) and ``step``
                               (1-based index of the step reached when the
                               failure occurred, or -1 if no step was started,
                               e.g. for an unreadable request.json).
    """
    request_path = os.path.join(job_dir, "request.json")
    audio_dir = os.path.join(job_dir, "audio")
    done_marker = os.path.join(job_dir, "done.marker")
    error_marker = os.path.join(job_dir, "error.marker")

    # Skip already-completed jobs
    if os.path.exists(done_marker):
        return {"status": "already_done", "job_dir": job_dir}

    if not os.path.exists(request_path):
        return {"status": "no_request", "job_dir": job_dir}

    os.makedirs(audio_dir, exist_ok=True)

    # Index of the step being processed; stays -1 until the loop starts so the
    # error report can tell "bad request" apart from "step N failed".
    idx = -1

    try:
        # Parsing happens inside the try block so that a malformed request is
        # reported through error.marker instead of raising into the caller.
        with open(request_path, "r", encoding="utf-8") as f:
            request = json.load(f)

        voice = request.get("voice", "am_michael")
        speed = float(request.get("speed", 1.0))
        language = request.get("language", "en-us")
        steps = request.get("steps", [])

        results = []
        total_duration = 0.0

        print(f"\n{'='*60}")
        print(f"Processing job: {os.path.basename(job_dir)}")
        print(f"Voice: {voice} | Speed: {speed} | Steps: {len(steps)}")
        print(f"{'='*60}")

        for idx, step in enumerate(steps, 1):
            step_id = str(step["id"])
            narration = str(step["narration"]).strip()
            wav_path = os.path.join(audio_dir, f"step-{step_id}.wav")

            # Reuse a non-empty WAV left by an earlier (partial) run; it is
            # read back only to measure its duration.
            if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0:
                data, sr = sf.read(wav_path)
                duration = len(data) / sr
                print(f"  [{idx}/{len(steps)}] ‚ôª Reused {os.path.basename(wav_path)} ({duration:.2f}s)")
                results.append({"id": step_id, "duration": duration, "reused": True})
                total_duration += duration
                continue

            t0 = time.time()
            samples, sample_rate = kokoro.create(
                narration, voice=voice, speed=speed, lang=language
            )
            elapsed = time.time() - t0
            duration = len(samples) / sample_rate

            # Write to a temp file in the same directory, then rename, so the
            # final path never holds a half-written WAV.
            tmp_fd, tmp_path = tempfile.mkstemp(suffix=".wav", dir=audio_dir)
            os.close(tmp_fd)
            try:
                sf.write(tmp_path, samples, sample_rate)
                os.replace(tmp_path, wav_path)
            finally:
                # No-op after a successful rename; removes the orphan temp
                # file when the write or the rename failed.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)

            # Real-time factor: generation time divided by audio length.
            rtf = elapsed / duration if duration > 0 else 0
            print(f"  [{idx}/{len(steps)}] ‚úì {os.path.basename(wav_path)} ({duration:.2f}s audio, {elapsed:.2f}s gen, RTF={rtf:.2f})")
            results.append({"id": step_id, "duration": duration, "gen_time": elapsed})
            total_duration += duration

        # Write completion marker with metadata
        completion = {
            "status": "completed",
            "total_duration": total_duration,
            "steps_generated": len(results),
            "results": results,
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        }
        with open(done_marker, "w", encoding="utf-8") as f:
            json.dump(completion, f, indent=2)

        print(f"\n‚úì Job complete: {len(results)} steps, {total_duration:.2f}s total audio")
        return completion

    except Exception as e:
        # Job-level boundary: record the failure for the requester and hand the
        # report back instead of raising.
        error_info = {"status": "error", "error": str(e), "step": idx}
        with open(error_marker, "w", encoding="utf-8") as f:
            json.dump(error_info, f, indent=2)
        print(f"\n‚úó Job failed: {e}")
        return error_info

## 8. Job Watcher Loop

Polls the Drive job directory for new requests. Run this cell to start watching.

**To stop:** Interrupt the cell (⬛ stop button).

In [None]:
import datetime

# How long watch_for_jobs() sleeps between scans of the job directory.
POLL_INTERVAL = 5  # seconds between checks


def watch_for_jobs():
    """Poll DRIVE_BASE and process each new job directory once.

    A job is picked up when its directory contains request.json and it has not
    been handled during this run. Directories that already hold a done.marker
    or error.marker when the watcher starts are skipped. Jobs are handed to
    process_tts_job() in sorted name order.

    A failure while listing the job directory, or an exception escaping from a
    single job, is printed and the loop keeps running, so one bad job or a
    transient filesystem error does not stop the worker.

    Returns when the polling loop receives KeyboardInterrupt.
    """
    print(f"üëÄ Watching for jobs in: {DRIVE_BASE}")
    print(f"   Poll interval: {POLL_INTERVAL}s")
    print(f"   Press ‚¨õ to stop\n")

    # Names of job directories that must not be processed (again) in this run.
    processed = set()

    # Mark existing completed or failed jobs
    if os.path.exists(DRIVE_BASE):
        for name in os.listdir(DRIVE_BASE):
            job_dir = os.path.join(DRIVE_BASE, name)
            if not os.path.isdir(job_dir):
                continue
            marker_paths = (
                os.path.join(job_dir, "done.marker"),
                os.path.join(job_dir, "error.marker"),
            )
            if any(os.path.exists(path) for path in marker_paths):
                processed.add(name)

    print(f"   Skipping {len(processed)} already-processed job(s)")

    while True:
        try:
            if not os.path.exists(DRIVE_BASE):
                time.sleep(POLL_INTERVAL)
                continue

            try:
                names = sorted(os.listdir(DRIVE_BASE))
            except OSError as exc:
                # Listing can fail transiently; try again on the next poll.
                now = datetime.datetime.now().strftime("%H:%M:%S")
                print(f"[{now}] Could not list {DRIVE_BASE}: {exc}; retrying")
                time.sleep(POLL_INTERVAL)
                continue

            for name in names:
                if name in processed:
                    continue

                job_dir = os.path.join(DRIVE_BASE, name)
                if not os.path.isdir(job_dir):
                    continue

                # A directory without request.json is not ready yet; it is
                # looked at again on the next poll.
                request_path = os.path.join(job_dir, "request.json")
                if not os.path.exists(request_path):
                    continue

                # New job found
                now = datetime.datetime.now().strftime("%H:%M:%S")
                print(f"\n[{now}] üìã New job detected: {name}")

                try:
                    result = process_tts_job(job_dir)
                except Exception as exc:
                    # Keep the worker alive; do not retry the job every poll.
                    processed.add(name)
                    now = datetime.datetime.now().strftime("%H:%M:%S")
                    print(f"[{now}] Job {name} raised {exc!r}; skipping it")
                    continue
                processed.add(name)

                now = datetime.datetime.now().strftime("%H:%M:%S")
                print(f"[{now}] ‚úì Job {name} ‚Üí {result.get('status', 'unknown')}")

            time.sleep(POLL_INTERVAL)

        except KeyboardInterrupt:
            print("\n\nüõë Watcher stopped.")
            break


# Start the watcher; this call keeps polling until the cell is interrupted.
watch_for_jobs()

## 9. Manual Job Processing (Optional)

Process a specific job directory directly, without the watcher loop.

In [None]:
# Uncomment and set the job ID to process manually:
# JOB_ID = "my-job-id"
# result = process_tts_job(os.path.join(DRIVE_BASE, JOB_ID))
# print(json.dumps(result, indent=2))

## 10. Quick Test

Generate a single test audio to verify everything works.

In [None]:
# Quick test: synthesize one sentence, report timing, save it and play it inline.
test_text = "Welcome to this tutorial. Today we'll learn about bubble sort, a simple comparison-based sorting algorithm."

t0 = time.time()
samples, sr = kokoro.create(test_text, voice="am_michael", speed=1.0, lang="en-us")
elapsed = time.time() - t0
duration = len(samples) / sr

# Real-time factor = generation time / audio length. Guard the division so an
# empty synthesis result reports 0 instead of raising ZeroDivisionError.
rtf = elapsed / duration if duration > 0 else 0
print(f"Generated {duration:.2f}s of audio in {elapsed:.2f}s (RTF: {rtf:.2f})")

# Keep a copy on local disk, then play the in-memory samples in the notebook.
sf.write("/tmp/test_tts.wav", samples, sr)

from IPython.display import Audio, display
display(Audio(samples, rate=sr))

## 11. NVENC Video Encoding (Optional)

If you want to offload final video encoding to the T4 GPU, this cell sets up
FFmpeg with NVENC hardware encoding. Copy your assembled video to Drive,
then run encoding here for 5-10x speedup over CPU libx264.

In [None]:
# Report GPU status and whether the ffmpeg on PATH exposes any NVENC encoders.
import subprocess

# Show only the first 500 characters of the nvidia-smi report.
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print(result.stdout[:500])

# List ffmpeg's encoders and keep the lines that mention "nvenc" (case-insensitive).
result = subprocess.run(['ffmpeg', '-hide_banner', '-encoders'], capture_output=True, text=True)
nvenc_encoders = [l.strip() for l in result.stdout.split('\n') if 'nvenc' in l.lower()]
print(f'\nNVENC encoders available: {len(nvenc_encoders)}')
for enc in nvenc_encoders:
    print(f'  {enc}')

if not nvenc_encoders:
    print('\n‚ö†Ô∏è  NVENC not in default ffmpeg. Installing CUDA-enabled build...')
    # Installs the nasm and yasm packages with output silenced.
    # NOTE(review): this step does not itself install an NVENC-enabled ffmpeg;
    # the messages below direct the user to build one from source.
    !apt-get -qq update && apt-get -qq install -y nasm yasm > /dev/null 2>&1
    print('Note: You may need to build ffmpeg from source with --enable-nvenc')
    print('See: https://github.com/Glyx/colab-ffmpeg-cuda')
else:
    print('\n‚úì NVENC ready!')

In [None]:
def encode_with_nvenc(input_path: str, output_path: str, cq: int = 20) -> str:
    """Re-encode a video with ffmpeg's ``h264_nvenc`` encoder.

    Video is encoded with preset p7 in VBR mode at the requested
    constant-quality level; audio is encoded to AAC at 192k and the
    ``+faststart`` movflag is set.

    Args:
        input_path: Video file to read.
        output_path: File to write; ffmpeg is run with ``-y`` so an existing
            file is overwritten.
        cq: Value passed to ffmpeg's ``-cq`` option (lower means better).

    Returns:
        ``output_path`` when ffmpeg exits with status 0, otherwise ``''``.
    """
    import subprocess
    import time

    video_options = [
        '-c:v', 'h264_nvenc',
        '-preset', 'p7',   # highest-quality preset
        '-rc', 'vbr',
        '-cq', str(cq),
        '-b:v', '0',       # no bitrate target; quality is driven by -cq
    ]
    audio_options = ['-c:a', 'aac', '-b:a', '192k']
    ffmpeg_cmd = [
        'ffmpeg', '-y', '-hwaccel', 'cuda',
        '-i', input_path,
        *video_options,
        *audio_options,
        '-movflags', '+faststart',
        output_path,
    ]

    print(f'Encoding: {input_path}')
    print(f'Output:   {output_path}')
    print(f'Preset:   p7 (max quality) | CQ: {cq}')

    started = time.time()
    result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
    elapsed = time.time() - started

    if result.returncode == 0:
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        print(f'\n‚úì Done in {elapsed:.1f}s ({size_mb:.1f} MB)')
        return output_path

    # Show only the tail of ffmpeg's stderr, where the actual error is reported.
    print(f'\n‚úó Encoding failed:\n{result.stderr[-500:]}')
    return ''

# Example usage:
# encode_with_nvenc(
#     '/content/drive/MyDrive/autonomous-recording/video-to-encode.mp4',
#     '/content/drive/MyDrive/autonomous-recording/video-encoded-nvenc.mp4',
#     cq=20
# )