# 🎙️ F5-TTS Worker — Voice Cloning on Colab T4

This notebook uses F5-TTS for high-quality, natural-sounding narration with voice cloning.
Provide a 10-15s reference audio clip and F5-TTS will generate all narration in that voice.

**Advantages over Kokoro:**
- Voice cloning from a short reference clip
- More natural prosody, emphasis, and pacing
- GPU-bound model — T4 provides real speedup (RTF ~0.3-0.5)

**Setup:** Runtime → Change runtime type → T4 GPU

## 1. Install Dependencies

In [None]:
# Install F5-TTS plus soundfile (imported later as `sf` for WAV reads/writes).
!pip install -q f5-tts soundfile
# Verify GPU
!nvidia-smi

## 2. Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the job and voice-reference
# directories configured below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

## 3. Configure Paths

Job directory structure on Google Drive:
```
My Drive/
  autonomous-recording/
    f5-tts-jobs/          ← separate from kokoro jobs
      <job-id>/
        request.json      ← local machine writes this
        ref_audio.wav     ← reference voice clip (copied from settings)
        audio/            ← worker writes WAVs here
        done.marker       ← worker writes when complete
    voice-refs/           ← store your reference voice clips here
      teacher-voice.wav
```

In [None]:
import os

# Drive locations shared by the rest of the notebook.
DRIVE_BASE = "/content/drive/MyDrive/autonomous-recording/f5-tts-jobs"
ENCODE_DRIVE_BASE = "/content/drive/MyDrive/autonomous-recording/encode-jobs"
VOICE_REFS_DIR = "/content/drive/MyDrive/autonomous-recording/voice-refs"

# Create all three directories up front so later cells can assume they exist.
for path in (DRIVE_BASE, ENCODE_DRIVE_BASE, VOICE_REFS_DIR):
    os.makedirs(path, exist_ok=True)

print(f"Job directory: {DRIVE_BASE}")
print(f"Encode job directory: {ENCODE_DRIVE_BASE}")
print(f"Voice refs directory: {VOICE_REFS_DIR}")

# Collect the reference clips (files ending in .wav) currently on Drive.
if os.path.exists(VOICE_REFS_DIR):
    refs = [entry for entry in os.listdir(VOICE_REFS_DIR) if entry.endswith('.wav')]
else:
    refs = []

if not refs:
    print(f"\n‚ö†Ô∏è  No voice references found in {VOICE_REFS_DIR}")
    print("Upload a 10-15s WAV clip of the target voice to that directory.")
    print("The default F5-TTS reference voice will be used as fallback.")
else:
    print(f"\nAvailable voice references: {refs}")

## 4. Verify GPU + PyTorch

In [None]:
import torch

# Report the PyTorch build and, when CUDA is usable, the first GPU's details.
print(f"PyTorch version: {torch.__version__}")
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if not cuda_ok:
    print("‚ö†Ô∏è  No GPU detected. Check Runtime ‚Üí Change runtime type ‚Üí T4 GPU")
else:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is in bytes; convert to GiB for display.
    print(f"VRAM: {gpu_props.total_memory / 1024**3:.1f} GB")

## 5. Initialize F5-TTS Model

First load downloads the model (~1.2 GB). Subsequent runs use the cached version.

In [None]:
import time
from f5_tts.api import F5TTS

# Build the shared F5-TTS model instance used by every later cell.
print("Loading F5-TTS model (first run downloads ~1.2 GB)...")
t0 = time.time()
f5tts = F5TTS(
    model="F5TTS_v1_Base",
    ckpt_file="",
    device=None,  # auto-detect GPU
)
load_seconds = time.time() - t0
print(f"‚úì Model loaded in {load_seconds:.1f}s")

# Warm up: run one throwaway synthesis and discard the result.
print("Warming up GPU...")
warmup_kwargs = {
    "ref_file": "",  # uses built-in default reference
    "ref_text": "",
    "gen_text": "Hello, this is a warm up sentence for the GPU.",
    "seed": 42,
}
t0 = time.time()
_ = f5tts.infer(**warmup_kwargs)
warmup_seconds = time.time() - t0
print(f"‚úì Warm-up done in {warmup_seconds:.2f}s")

## 6. Upload a Voice Reference (Optional)

Upload a 10-15 second WAV clip of the voice you want to clone.
Place it in `My Drive/autonomous-recording/voice-refs/`.

**Tips for good reference audio:**
- 10-15 seconds of clear speech, no background noise
- Normal speaking pace (not too fast, not too slow)
- Conversational tone matching your tutorial style
- WAV format, 16kHz+ sample rate

In [None]:
# You can also upload directly from your local machine:
# from google.colab import files
# uploaded = files.upload()  # opens file picker
# for name, data in uploaded.items():
#     dest = os.path.join(VOICE_REFS_DIR, name)
#     with open(dest, 'wb') as f:
#         f.write(data)
#     print(f"Saved {name} to {dest}")

# Re-scan the voice-refs directory and report which .wav clips are available.
if os.path.exists(VOICE_REFS_DIR):
    refs = [entry for entry in os.listdir(VOICE_REFS_DIR) if entry.endswith('.wav')]
else:
    refs = []
print(f"Voice references: {refs if refs else 'none (will use F5-TTS default)'}")

## 7. F5-TTS Job Processor

In [None]:
import json
import soundfile as sf
import tempfile
import numpy as np


def process_f5_tts_job(job_dir: str) -> dict:
    """Synthesize every narration step of one F5-TTS job directory.

    Each step of ``request.json`` is rendered to ``audio/step-<id>.wav``.
    A step whose WAV already exists and is non-empty is reused instead of
    being regenerated. On success ``done.marker`` is written and any
    ``error.marker`` left by an earlier attempt is removed; on failure
    ``error.marker`` is written.

    request.json format:
    {
        "ref_audio": "teacher-voice.wav",     # filename in voice-refs/ or path
        "ref_text": "Transcription of the reference audio.",
        "speed": 1.0,
        "seed": 42,
        "nfe_step": 32,
        "steps": [
            {"id": "step-01", "narration": "Text to synthesize..."},
            ...
        ]
    }

    Args:
        job_dir: Path of the job directory.

    Returns:
        ``{"status": "already_done" | "no_request", "job_dir": job_dir}`` when
        nothing was processed; the completion record written to
        ``done.marker`` on success; or the error record written to
        ``error.marker`` on failure (its ``step`` is the 1-based index of the
        step in flight, or -1 when the failure happened before the first step).

    Raises:
        Exceptions from opening or parsing ``request.json`` itself (for
        example ``json.JSONDecodeError``) propagate; no marker is written in
        that case.
    """
    request_path = os.path.join(job_dir, "request.json")
    audio_dir = os.path.join(job_dir, "audio")
    done_marker = os.path.join(job_dir, "done.marker")
    error_marker = os.path.join(job_dir, "error.marker")

    if os.path.exists(done_marker):
        return {"status": "already_done", "job_dir": job_dir}

    if not os.path.exists(request_path):
        return {"status": "no_request", "job_dir": job_dir}

    os.makedirs(audio_dir, exist_ok=True)

    with open(request_path, "r", encoding="utf-8") as f:
        request = json.load(f)

    # 1-based index of the step in flight; -1 until the first step starts.
    idx = -1

    try:
        # Parameter parsing lives inside the try so that a well-formed JSON
        # file with invalid values produces error.marker instead of raising.
        ref_audio_name = request.get("ref_audio", "")
        ref_text = request.get("ref_text", "")
        speed = float(request.get("speed", 1.0))
        seed = request.get("seed", None)
        nfe_step = int(request.get("nfe_step", 32))
        steps = request.get("steps") or []  # tolerate "steps": null

        # Find reference audio file: the job directory wins over voice-refs.
        ref_file = ""
        if ref_audio_name:
            job_ref = os.path.join(job_dir, ref_audio_name)
            refs_ref = os.path.join(VOICE_REFS_DIR, ref_audio_name)
            if os.path.exists(job_ref):
                ref_file = job_ref
            elif os.path.exists(refs_ref):
                ref_file = refs_ref
            else:
                print(f"‚ö†Ô∏è  Reference audio '{ref_audio_name}' not found, using F5-TTS default")

        results = []
        total_duration = 0.0

        print(f"\n{'='*60}")
        print(f"Processing F5-TTS job: {os.path.basename(job_dir)}")
        print(f"Ref audio: {ref_file or '(F5-TTS default)'} | Speed: {speed} | Steps: {len(steps)}")
        # `is not None` so that an explicit seed of 0 is not reported as random.
        print(f"NFE steps: {nfe_step} | Seed: {seed if seed is not None else 'random'}")
        print(f"{'='*60}")

        for idx, step in enumerate(steps, 1):
            step_id = str(step["id"])
            narration = str(step["narration"]).strip()
            wav_path = os.path.join(audio_dir, f"step-{step_id}.wav")

            # Skip if already generated
            if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0:
                data, sr = sf.read(wav_path)
                duration = len(data) / sr
                print(f"  [{idx}/{len(steps)}] ‚ôª Reused {os.path.basename(wav_path)} ({duration:.2f}s)")
                results.append({"id": step_id, "duration": duration, "reused": True})
                total_duration += duration
                continue

            t0 = time.time()
            wav, sample_rate, _ = f5tts.infer(
                ref_file=ref_file,
                ref_text=ref_text,
                gen_text=narration,
                nfe_step=nfe_step,
                speed=speed,
                seed=seed,
            )
            elapsed = time.time() - t0
            duration = len(wav) / sample_rate

            # Write atomically: render to a temp file in the same directory,
            # then rename it over the final name.
            tmp_fd, tmp_path = tempfile.mkstemp(suffix=".wav", dir=audio_dir)
            os.close(tmp_fd)
            try:
                sf.write(tmp_path, wav, sample_rate)
                os.replace(tmp_path, wav_path)
            finally:
                # Only still present when the write or rename failed.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)

            rtf = elapsed / duration if duration > 0 else 0
            print(f"  [{idx}/{len(steps)}] ‚úì {os.path.basename(wav_path)} ({duration:.2f}s audio, {elapsed:.2f}s gen, RTF={rtf:.2f})")
            results.append({"id": step_id, "duration": duration, "gen_time": elapsed})
            total_duration += duration

        # Write completion marker
        completion = {
            "status": "completed",
            "engine": "f5-tts",
            "total_duration": total_duration,
            "steps_generated": len(results),
            "results": results,
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        }
        # Drop a stale error marker from an earlier failed attempt so the job
        # directory never carries both markers.
        if os.path.exists(error_marker):
            os.remove(error_marker)
        with open(done_marker, "w", encoding="utf-8") as f:
            json.dump(completion, f, indent=2)

        print(f"\n‚úì Job complete: {len(results)} steps, {total_duration:.2f}s total audio")
        return completion

    except Exception as e:
        error_info = {"status": "error", "error": str(e), "step": idx}
        with open(error_marker, "w", encoding="utf-8") as f:
            json.dump(error_info, f, indent=2)
        print(f"\n‚úó Job failed: {e}")
        import traceback
        traceback.print_exc()
        return error_info

## NVENC Video Encoding Setup

In [None]:
import shutil
import subprocess


def _run(cmd: list[str], check: bool = False) -> subprocess.CompletedProcess:
    result = subprocess.run(cmd, capture_output=True, text=True)
    if check and result.returncode != 0:
        raise RuntimeError(f"Command failed: {' '.join(cmd)}\n{result.stderr}")
    return result


def _run_ffmpeg(cmd: list[str]) -> None:
    print("FFmpeg command:")
    print("  " + " ".join(cmd))
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    assert proc.stdout is not None
    for line in proc.stdout:
        line = line.rstrip()
        if line:
            print(line)
    code = proc.wait()
    if code != 0:
        raise RuntimeError(f"ffmpeg failed with exit code {code}")


def has_h264_nvenc() -> bool:
    """Report whether the ``ffmpeg`` on PATH lists the ``h264_nvenc`` encoder.

    Returns:
        False when no ``ffmpeg`` executable is found or ``ffmpeg -encoders``
        fails; otherwise whether ``h264_nvenc`` appears in its output.
    """
    if shutil.which("ffmpeg") is None:
        return False
    listing = _run(["ffmpeg", "-hide_banner", "-encoders"])
    if listing.returncode != 0:
        return False
    return "h264_nvenc" in listing.stdout


def _install_nvenc_ffmpeg() -> None:
    """Install ffmpeg through apt if missing, then retry for NVENC support.

    Only the initial ffmpeg install is allowed to raise; the index refresh
    and the NVENC fallback installs are run without checking their result.
    """
    _run(["bash", "-lc", "apt-get -qq update"], check=False)

    ffmpeg_missing = shutil.which("ffmpeg") is None
    if ffmpeg_missing:
        print("ffmpeg not found, installing...")
        _run(["bash", "-lc", "apt-get -qq install -y ffmpeg"], check=True)

    if has_h264_nvenc():
        return

    print("h264_nvenc not found in current ffmpeg. Trying CUDA-related packages...")
    fallback_installs = (
        "apt-get -qq install -y nvidia-cuda-toolkit",
        "apt-get -qq install -y ffmpeg",
    )
    for shell_cmd in fallback_installs:
        _run(["bash", "-lc", shell_cmd], check=False)


# Run the installer, then report whether NVENC ended up available.
print("Checking ffmpeg + NVENC availability...")
_install_nvenc_ffmpeg()

if not has_h264_nvenc():
    print("WARNING: NVENC encoder still unavailable in this runtime ffmpeg build.")
    print("  Try a known CUDA-enabled ffmpeg binary or source build if needed.")
else:
    print("NVENC-enabled ffmpeg is available.")

# Show only the first line of `ffmpeg -version`.
print("ffmpeg version:")
version_probe = _run(["bash", "-lc", "ffmpeg -version | head -n 1"])
print(version_probe.stdout.strip())

In [None]:
from pathlib import Path


def _nvenc_video_args(fmt: dict) -> list[str]:
    codec = str(fmt.get("codec", "h264_nvenc"))
    preset = str(fmt.get("preset", "p7"))
    cq = int(fmt.get("cq", 20))
    vf_parts = []

    width = fmt.get("width")
    height = fmt.get("height")
    if width and height:
        vf_parts.append(f"scale={int(width)}:{int(height)}")

    user_filter = str(fmt.get("video_filter", "")).strip()
    if user_filter:
        vf_parts.append(user_filter)

    args = [
        "-c:v", codec,
        "-preset", preset,
        "-rc", "vbr",
        "-cq", str(cq),
        "-b:v", "0",
    ]
    if vf_parts:
        args += ["-vf", ",".join(vf_parts)]
    return args


def _audio_args(fmt: dict) -> list[str]:
    return [
        "-c:a", str(fmt.get("audio_codec", "aac")),
        "-b:a", str(fmt.get("audio_bitrate", "192k")),
    ]


def _concat_list_line(path: Path) -> str:
    """Format one ``file`` directive of an ffmpeg concat list for *path*."""
    # A single quote cannot be escaped inside a single-quoted run: close the
    # run, emit an escaped quote, and reopen it.
    escaped = str(path).replace("'", "'\\''")
    return f"file '{escaped}'\n"


def _encode_transcode(job_path: Path, op: dict, fmt: dict) -> None:
    """Re-encode ``op["input"]`` to ``op["output"]`` with the job's format."""
    inp = job_path / str(op["input"])
    out = job_path / str(op["output"])
    if not inp.exists():
        raise FileNotFoundError(f"Transcode input missing: {inp.name}")
    out.parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        "ffmpeg", "-y", "-hide_banner", "-hwaccel", "cuda", "-i", str(inp),
        *_nvenc_video_args(fmt),
        *_audio_args(fmt),
        "-movflags", "+faststart",
        str(out),
    ]
    _run_ffmpeg(cmd)


def _encode_mux_audio(job_path: Path, op: dict, fmt: dict) -> None:
    """Combine ``op["video"]`` and ``op["audio"]`` into ``op["output"]``."""
    video = job_path / str(op["video"])
    audio = job_path / str(op["audio"])
    out = job_path / str(op["output"])
    if not video.exists():
        raise FileNotFoundError(f"Mux video missing: {video.name}")
    if not audio.exists():
        raise FileNotFoundError(f"Mux audio missing: {audio.name}")
    out.parent.mkdir(parents=True, exist_ok=True)

    audio_delay_ms = int(op.get("audio_delay_ms", 0))
    if audio_delay_ms > 0:
        # Delay the audio through a filter graph and map its labelled output.
        delay = f"adelay={audio_delay_ms}|{audio_delay_ms}"
        filter_args = ["-filter_complex", f"[1:a]{delay}[aout]", "-map", "0:v:0", "-map", "[aout]"]
    else:
        filter_args = ["-map", "0:v:0", "-map", "1:a:0"]

    cmd = [
        "ffmpeg", "-y", "-hide_banner",
        "-hwaccel", "cuda", "-i", str(video),
        "-i", str(audio),
        *filter_args,
        *_nvenc_video_args(fmt),
        *_audio_args(fmt),
        "-shortest",
        "-movflags", "+faststart",
        str(out),
    ]
    _run_ffmpeg(cmd)


def _encode_concat(job_path: Path, op: dict, fmt: dict) -> None:
    """Concatenate ``op["inputs"]`` into ``op["output"]`` via a concat list."""
    inputs = [job_path / str(x) for x in op.get("inputs", [])]
    out = job_path / str(op["output"])
    for p in inputs:
        if not p.exists():
            raise FileNotFoundError(f"Concat input missing: {p.name}")
    if not inputs:
        raise ValueError("Concat operation requires at least one input")
    out.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as fp:
        concat_list = Path(fp.name)
        fp.writelines(_concat_list_line(p) for p in inputs)

    # The try covers command construction too, so the list file is removed
    # even if building the arguments raises.
    try:
        cmd = [
            "ffmpeg", "-y", "-hide_banner",
            "-hwaccel", "cuda",
            "-f", "concat", "-safe", "0", "-i", str(concat_list),
            *_nvenc_video_args(fmt),
        ]

        if bool(op.get("loudnorm", False)):
            cmd += ["-af", "loudnorm=I=-16:LRA=11:TP=-1.5"]

        cmd += [*_audio_args(fmt), "-movflags", "+faststart", str(out)]
        _run_ffmpeg(cmd)
    finally:
        if concat_list.exists():
            concat_list.unlink()


def process_encode_job(job_dir: str) -> dict:
    """Process a single encode job directory on Google Drive.

    Reads ``request.json`` (keys ``input_files``, ``operations`` and
    ``output_format``), checks that every declared input exists, and runs the
    operations in order. Supported operation types are ``transcode``,
    ``mux_audio`` and ``concat``; all paths in the request are resolved
    relative to the job directory. ``output_format`` entries override the
    built-in defaults. On success ``done.marker`` is written and any
    ``error.marker`` removed; on failure the reverse.

    Args:
        job_dir: Path of the job directory.

    Returns:
        ``{"status": "already_done" | "no_request", "job_dir": ...}`` when
        nothing was processed, the completion record on success, or the error
        record on failure.

    Raises:
        Exceptions from reading or parsing ``request.json`` propagate; no
        marker is written in that case.
    """
    job_path = Path(job_dir)
    request_path = job_path / "request.json"
    done_marker = job_path / "done.marker"
    error_marker = job_path / "error.marker"

    if done_marker.exists():
        return {"status": "already_done", "job_dir": str(job_path)}

    if not request_path.exists():
        return {"status": "no_request", "job_dir": str(job_path)}

    started = time.time()
    request = json.loads(request_path.read_text(encoding="utf-8"))
    input_files = [str(x) for x in request.get("input_files", [])]
    operations = request.get("operations", [])
    output_format = request.get("output_format") or {}

    # Defaults first, then the request's overrides on top.
    fmt = {
        "codec": "h264_nvenc",
        "preset": "p7",
        "cq": 20,
        "audio_codec": "aac",
        "audio_bitrate": "192k",
        "video_filter": "fps=30,format=yuv420p",
        "width": 1920,
        "height": 1080,
    }
    fmt.update(output_format)

    print(f"\n{'=' * 60}")
    print(f"Processing encode job: {job_path.name}")
    print(f"Declared inputs: {len(input_files)} | Operations: {len(operations)}")
    print(f"{'=' * 60}")

    handlers = {
        "transcode": _encode_transcode,
        "mux_audio": _encode_mux_audio,
        "concat": _encode_concat,
    }

    try:
        for name in input_files:
            if not (job_path / name).exists():
                raise FileNotFoundError(f"Missing input file from request.input_files: {name}")

        for idx, op in enumerate(operations, 1):
            op_type = str(op.get("type", "")).strip()
            print(f"\n[{idx}/{len(operations)}] Operation: {op_type}")

            handler = handlers.get(op_type)
            if handler is None:
                raise ValueError(f"Unsupported operation type: {op_type}")
            handler(job_path, op, fmt)

        elapsed = time.time() - started

        # Report each distinct declared output that actually exists on disk.
        output_names = sorted({str(op.get("output")) for op in operations if op.get("output")})
        outputs = {}
        total_bytes = 0
        for name in output_names:
            path = job_path / name
            if path.exists():
                size = path.stat().st_size
                outputs[name] = {"bytes": size}
                total_bytes += size

        completion = {
            "status": "completed",
            "elapsed_sec": elapsed,
            "operation_count": len(operations),
            "outputs": outputs,
            "total_output_bytes": total_bytes,
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        }
        error_marker.unlink(missing_ok=True)
        done_marker.write_text(json.dumps(completion, indent=2), encoding="utf-8")

        print(f"\nJob complete in {elapsed:.2f}s | outputs={len(outputs)} | total={total_bytes / (1024 * 1024):.2f} MB")
        return completion

    except Exception as exc:
        err = {
            "status": "error",
            "error": str(exc),
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        }
        done_marker.unlink(missing_ok=True)
        error_marker.write_text(json.dumps(err, indent=2), encoding="utf-8")
        print(f"\nJob failed: {exc}")
        return err

## NVENC Verification

In [None]:
# Show the head of the nvidia-smi report (capped at 1500 characters).
print("GPU info:")
smi_report = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
print(smi_report.stdout[:1500])

# Collect every encoder line from ffmpeg that mentions NVENC.
encoders_out = subprocess.run(["ffmpeg", "-hide_banner", "-encoders"], capture_output=True, text=True).stdout
nvenc_lines = []
for raw_line in encoders_out.splitlines():
    if "nvenc" in raw_line.lower():
        nvenc_lines.append(raw_line.strip())

print(f"NVENC encoders found: {len(nvenc_lines)}")
for line in nvenc_lines:
    print(f"  {line}")

h264_listed = any("h264_nvenc" in line for line in nvenc_lines)
if not h264_listed:
    print("WARNING: h264_nvenc not available. Encoding jobs may fail until ffmpeg supports NVENC.")

## NVENC Quick Test

In [None]:
import pathlib

# Scratch location for the synthetic round-trip.
test_dir = pathlib.Path("/tmp/nvenc-test")
test_dir.mkdir(parents=True, exist_ok=True)
src = test_dir.joinpath("bars-input.webm")
dst = test_dir.joinpath("bars-output.mp4")

# Step 1: synthesize a 10-second 1080p30 test pattern with a 1 kHz tone.
create_cmd = ["ffmpeg", "-y", "-hide_banner"]
create_cmd += ["-f", "lavfi", "-i", "testsrc2=size=1920x1080:rate=30"]
create_cmd += ["-f", "lavfi", "-i", "sine=frequency=1000:sample_rate=48000"]
create_cmd += ["-t", "10", "-c:v", "libvpx-vp9", "-c:a", "libopus", str(src)]
_run_ffmpeg(create_cmd)

before = src.stat().st_size

# Step 2: re-encode it with the same settings written out literally.
encode_cmd = ["ffmpeg", "-y", "-hide_banner", "-hwaccel", "cuda", "-i", str(src)]
encode_cmd += ["-c:v", "h264_nvenc", "-preset", "p7", "-rc", "vbr", "-cq", "20", "-b:v", "0"]
encode_cmd += ["-vf", "fps=30,format=yuv420p"]
encode_cmd += ["-c:a", "aac", "-b:a", "192k"]
encode_cmd += ["-movflags", "+faststart", str(dst)]

t0 = time.time()
_run_ffmpeg(encode_cmd)
elapsed = time.time() - t0
after = dst.stat().st_size

# Report sizes and throughput relative to the 10-second clip length.
print(f"Input size:  {before / (1024 * 1024):.2f} MB")
print(f"Output size: {after / (1024 * 1024):.2f} MB")
print(f"Encode time: {elapsed:.2f}s")
print(f"Speed:       {10.0 / elapsed:.2f}x realtime")

## 8. Job Watcher Loop

Polls both Drive job directories for F5-TTS and NVENC requests.

**To stop:** Interrupt the cell (⬛ stop button).

In [None]:
import datetime
import threading

POLL_INTERVAL = 5  # seconds


def _scan_finished_jobs(base_dir: str) -> set:
    """Return names of job directories that already hold a done/error marker."""
    finished = set()
    if not os.path.exists(base_dir):
        return finished
    for name in os.listdir(base_dir):
        job_dir = os.path.join(base_dir, name)
        if not os.path.isdir(job_dir):
            continue
        markers = (os.path.join(job_dir, "done.marker"), os.path.join(job_dir, "error.marker"))
        if any(os.path.exists(marker) for marker in markers):
            finished.add(name)
    return finished


def _watch_jobs(base_dir: str, tag: str, noun: str, process_job, stop_event=None) -> None:
    """Poll *base_dir* and hand each new job directory to *process_job*.

    Args:
        base_dir: Directory whose sub-directories are jobs.
        tag: Label used in the ``[TAG]`` prefix of log lines.
        noun: Job kind used in the "already-processed" summary line.
        process_job: Callable taking a job directory path and returning a
            result dict with a ``status`` key.
        stop_event: Optional ``threading.Event``; once set, the loop exits
            after the job in flight (if any) has finished. With ``None`` the
            loop runs until the process ends.
    """
    if stop_event is None:
        stop_event = threading.Event()  # never set: poll forever

    processed = _scan_finished_jobs(base_dir)
    print(f"  Skipping {len(processed)} already-processed {noun} job(s)")

    while not stop_event.is_set():
        try:
            if os.path.exists(base_dir):
                for name in sorted(os.listdir(base_dir)):
                    if stop_event.is_set():
                        break
                    if name in processed:
                        continue

                    job_dir = os.path.join(base_dir, name)
                    if not os.path.isdir(job_dir):
                        continue

                    # A job only counts as submitted once request.json exists.
                    request_path = os.path.join(job_dir, "request.json")
                    if not os.path.exists(request_path):
                        continue

                    now = datetime.datetime.now().strftime("%H:%M:%S")
                    print(f"[{now}] [{tag}] New job detected: {name}")
                    result = process_job(job_dir)
                    processed.add(name)
                    now = datetime.datetime.now().strftime("%H:%M:%S")
                    print(f"[{now}] [{tag}] Job {name} -> {result.get('status', 'unknown')}")

        except Exception as exc:
            # Keep the watcher alive; the job is retried on the next poll
            # because it was not added to `processed`.
            now = datetime.datetime.now().strftime("%H:%M:%S")
            print(f"[{now}] [{tag}] Watcher error: {exc}")

        # Sleeps for the poll interval but wakes at once when asked to stop.
        stop_event.wait(POLL_INTERVAL)


def watch_tts(stop_event: "threading.Event | None" = None) -> None:
    """Watch for new F5-TTS jobs.

    Args:
        stop_event: Optional event that ends the loop once set.
    """
    print(f"Watching F5-TTS jobs in: {DRIVE_BASE}")
    print(f"  Poll interval: {POLL_INTERVAL}s")
    # The lambda looks the processor up at call time, so re-running the cell
    # that defines it takes effect without restarting the watcher.
    _watch_jobs(DRIVE_BASE, "TTS", "TTS", lambda job_dir: process_f5_tts_job(job_dir), stop_event)


def watch_encode(stop_event: "threading.Event | None" = None) -> None:
    """Watch for new NVENC encode jobs.

    Args:
        stop_event: Optional event that ends the loop once set.
    """
    print(f"Watching encode jobs in: {ENCODE_DRIVE_BASE}")
    print(f"  Poll interval: {POLL_INTERVAL}s")
    _watch_jobs(ENCODE_DRIVE_BASE, "ENCODE", "encode", lambda job_dir: process_encode_job(job_dir), stop_event)


# One shared stop flag: interrupting the cell must also stop the background
# threads, otherwise re-running this cell would stack duplicate watchers.
_stop_watchers = threading.Event()

threading.Thread(target=watch_tts, args=(_stop_watchers,), daemon=True, name="f5-tts-watcher").start()
threading.Thread(target=watch_encode, args=(_stop_watchers,), daemon=True, name="encode-watcher").start()

try:
    while True:
        time.sleep(10)
except KeyboardInterrupt:
    # A job already in flight finishes first; no new job is started.
    _stop_watchers.set()
    print("Stopped.")

## 9. Quick Test

Generate a test audio to hear the voice quality.

In [None]:
# Quick test with default voice
test_text = "Welcome to this tutorial. Today we'll learn about bubble sort, a simple comparison-based sorting algorithm. It is not the fastest, but it is a great starting point."

# To test with YOUR voice, set ref_file and ref_text:
# ref_file = os.path.join(VOICE_REFS_DIR, "teacher-voice.wav")
# ref_text = "The transcription of what is said in the reference audio."
ref_file = ref_text = ""  # empty = use F5-TTS built-in default

infer_kwargs = {
    "ref_file": ref_file,
    "ref_text": ref_text,
    "gen_text": test_text,
    "seed": 42,
}

# Time a single synthesis and compare it with the length of audio produced.
t0 = time.time()
wav, sr, _ = f5tts.infer(**infer_kwargs)
elapsed = time.time() - t0
duration = len(wav) / sr

print(f"Generated {duration:.2f}s of audio in {elapsed:.2f}s (RTF: {elapsed/duration:.2f})")

# Keep a copy on disk and embed an audio player in the cell output.
sf.write("/tmp/test_f5tts.wav", wav, sr)

from IPython.display import Audio, display
player = Audio(wav, rate=sr)
display(player)

## 10. Test with Voice Clone

Test with a reference voice clip from your Drive.

In [None]:
# Uncomment and set your reference audio:
# ref_file = os.path.join(VOICE_REFS_DIR, "teacher-voice.wav")
# ref_text = "The exact words spoken in the reference audio file."
#
# test_text = "Welcome to this tutorial. Today we will learn about bubble sort."
#
# wav, sr, _ = f5tts.infer(
#     ref_file=ref_file,
#     ref_text=ref_text,
#     gen_text=test_text,
#     seed=42,
# )
#
# from IPython.display import Audio, display
# display(Audio(wav, rate=sr))