# Audiobook Generator (XTTS v2 voice cloning) — Notebook

This notebook generates an audiobook-style audio file from a **TXT** file using a **reference voice sample** (**MP3**) for speaker conditioning (voice cloning) via **Coqui TTS (XTTS v2)**.

**Folder assumption:** the reference voice MP3 and the input TXT are in the **same folder** as this notebook. To use a different folder, change `WORKDIR` in section 2.

**Ethics/safety:** Only clone a voice you own or have **explicit permission** to use.


## 1) Install dependencies

- `TTS` for XTTS v2
- `pydub` for audio conversion/concatenation
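- `ffmpeg` is required for MP3 I/O, speed control, loudness normalization, and silence trimming. It is a system program, not a Python package: the `%pip` cell below does not install it, and it must be available on your `PATH`.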

### Known dependency issue (BeamSearchScorer)
If you see `ImportError: cannot import name 'BeamSearchScorer' from 'transformers'`,
pin Transformers to `<4.57` (the install cell below already applies this pin).


In [None]:
# Install Python packages (run once per environment)
# Pin transformers to avoid BeamSearchScorer import break in transformers>=4.57
%pip install -q --upgrade "TTS" "transformers<4.57" "pydub"


## 2) Set inputs/outputs

- **VOICE_MP3**: reference voice sample in MP3 format (10–30s clean speech recommended)
- **TEXT_TXT**: input text file in TXT format
- **OUT_AUDIO**: output audiobook file (.wav or .mp3). MP3 requires FFmpeg.

### Output tuning
- **SPEED**: tempo change *without* pitch shift (FFmpeg `atempo`)
- **MP3_BITRATE**: used only for MP3 output
- **NORMALIZE_LOUDNESS**: applies EBU R128-style loudness normalization (FFmpeg `loudnorm`)
- **TRIM_SILENCE**: trims leading/trailing silence (FFmpeg `silenceremove`)


In [None]:
from pathlib import Path

# Base folder: the input files are looked up here and the output file is written here.
# If files are in the same folder as this notebook, keep WORKDIR as '.'
WORKDIR = Path(".")

VOICE_MP3 = WORKDIR / "voice_sample.mp3"   # <-- change filename
TEXT_TXT   = WORKDIR / "book.txt"          # <-- change filename

# The extension decides the output format at export time.
OUT_AUDIO  = WORKDIR / "audiobook.mp3"     # can be .wav or .mp3

# TTS options
# LANG is passed as `language` to the synthesizer; USE_GPU is passed as `gpu` when the model is loaded.
LANG = "en"          # e.g., 'en', 'vi', 'zh-cn'
USE_GPU = False      # set True if you have CUDA GPU properly set up
# Upper bound on the characters per synthesized chunk (passed to split_into_chunks).
MAX_CHARS_PER_CHUNK = 260

# Output tuning
SPEED = 1.0                 # 1.0 = normal, 1.1 = 10% faster, 0.9 = 10% slower
MP3_BITRATE = "192k"        # e.g. "96k", "128k", "192k", "256k"

# Loudness normalization settings, forwarded to FFmpeg's loudnorm filter at export.
NORMALIZE_LOUDNESS = True   # loudnorm
LOUDNORM_I = -16            # Integrated loudness target (LUFS). Common: -16 (podcast/audiobook)
LOUDNORM_TP = -1.5          # True peak (dBTP)
LOUDNORM_LRA = 11           # Loudness range

# Silence trimming settings, forwarded to FFmpeg's silenceremove filter at export.
TRIM_SILENCE = True         # silenceremove
SILENCE_THRESHOLD_DB = -45  # threshold in dB for silence detection
SILENCE_MIN_SEC = 0.20      # min silence duration to trim at start/end (seconds)

# Bare expression: in a notebook this echoes the three configured paths as the cell output.
VOICE_MP3, TEXT_TXT, OUT_AUDIO


## 3) Imports and helpers

In [None]:
import re
import subprocess
import shutil
from pathlib import Path
from pydub import AudioSegment

def assert_exists(path: Path, label: str):
    """Raise FileNotFoundError unless *path* exists.

    *label* is a human-readable name for the file; it prefixes the error
    message, which also contains the resolved (absolute) form of *path*.
    """
    if path.exists():
        return
    raise FileNotFoundError(f"{label} not found: {path.resolve()}")

def normalize_text(text: str) -> str:
    """Clean up raw text before it is split into chunks.

    Processing order: line endings are unified to LF, words hyphenated across
    a line break are re-joined, runs of spaces/tabs collapse to a single
    space, and three or more consecutive newlines shrink to one blank line.

    Returns the cleaned text with leading and trailing whitespace stripped.
    """
    # CRLF (Windows) and lone CR line endings would otherwise defeat the
    # "-\n" hyphenation pattern below and leave stray carriage returns behind.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Remove hyphenation at line breaks: "exam-\nple" -> "example"
    text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
    # Collapse horizontal whitespace; newlines are handled separately so that
    # paragraph breaks survive.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def split_into_chunks(text: str, max_chars: int = 260) -> list[str]:
    """Split *text* into chunks of at most *max_chars* characters.

    The text is first cut after sentence-ending punctuation (ASCII and CJK
    full-width forms) that is followed by whitespace. Consecutive sentences
    are then packed together, separated by a single space, for as long as the
    chunk stays within *max_chars*.

    A single sentence longer than *max_chars* is wrapped on whitespace so
    words are kept intact; only a word that is itself longer than *max_chars*
    is cut mid-word. Whitespace inside such a sentence is collapsed to single
    spaces.

    Returns the list of non-empty chunks in reading order (empty list for
    empty input).

    Raises:
        ValueError: if *max_chars* is smaller than 1.
    """
    # Local import: only this function needs it.
    import textwrap

    if max_chars < 1:
        raise ValueError("max_chars must be >= 1")

    sentences = re.split(r"(?<=[.!?。！？])\s+", text)
    sentences = [s.strip() for s in sentences if s.strip()]

    chunks: list[str] = []
    buf = ""
    for s in sentences:
        if len(s) > max_chars:
            # Flush what has been packed so far to preserve reading order.
            if buf:
                chunks.append(buf)
                buf = ""
            # Wrap on word boundaries; textwrap never returns empty lines, so
            # no blank chunk can reach the synthesizer.
            chunks.extend(
                textwrap.wrap(
                    " ".join(s.split()),
                    width=max_chars,
                    break_long_words=True,
                    break_on_hyphens=False,
                )
            )
            continue

        if not buf:
            buf = s
        elif len(buf) + 1 + len(s) <= max_chars:
            buf += " " + s
        else:
            chunks.append(buf)
            buf = s

    if buf:
        chunks.append(buf)

    return chunks

def mp3_to_wav(mp3_path: Path, wav_path: Path) -> Path:
    """Decode an MP3 with pydub (requires ffmpeg) and save it as a mono WAV.

    Reads *mp3_path*, reduces it to one channel, writes the result to
    *wav_path*, and returns *wav_path*.
    """
    # Mono often works better for speaker conditioning.
    mono = AudioSegment.from_file(mp3_path, format="mp3").set_channels(1)
    mono.export(wav_path, format="wav")
    return wav_path

def _atempo_chain(speed: float) -> str:
    """Build chained atempo filters to support speeds outside 0.5..2.0."""
    if speed <= 0:
        raise ValueError("SPEED must be > 0")

    parts = []
    x = float(speed)

    while x > 2.0:
        parts.append("atempo=2.0")
        x /= 2.0
    while x < 0.5:
        parts.append("atempo=0.5")
        x /= 0.5

    parts.append(f"atempo={x:.6f}".rstrip("0").rstrip("."))
    return ",".join(parts)

def _silenceremove_filter(threshold_db: float, min_sec: float) -> str:
    """Trim leading+trailing silence using silenceremove."""
    thr = f"{threshold_db}dB"
    d = max(0.0, float(min_sec))
    return (
        f"silenceremove="
        f"start_periods=1:start_duration={d}:start_threshold={thr}:"
        f"stop_periods=1:stop_duration={d}:stop_threshold={thr}"
    )

def _loudnorm_filter(I: float, TP: float, LRA: float) -> str:
    """EBU R128 loudness normalization (single-pass)."""
    return f"loudnorm=I={I}:TP={TP}:LRA={LRA}"

def export_with_ffmpeg_filters(
    audio: AudioSegment,
    out_path: Path,
    speed: float = 1.0,
    mp3_bitrate: str = "192k",
    trim_silence: bool = True,
    silence_threshold_db: float = -45,
    silence_min_sec: float = 0.20,
    normalize_loudness: bool = True,
    loudnorm_I: float = -16,
    loudnorm_TP: float = -1.5,
    loudnorm_LRA: float = 11,
):
    """Export *audio* to *out_path* through FFmpeg, applying optional filters.

    Filters are applied in this order when enabled: silence trimming, tempo
    change (skipped when *speed* is effectively 1.0), loudness normalization.

    Args:
        audio: the pydub segment to export.
        out_path: destination; the suffix must be ``.mp3`` or ``.wav``
            (case-insensitive).
        speed: tempo factor, must be > 0.
        mp3_bitrate: audio bitrate passed to FFmpeg for MP3 output only.
        trim_silence: enable silence trimming.
        silence_threshold_db: silence threshold in dB for trimming.
        silence_min_sec: duration setting in seconds for trimming.
        normalize_loudness: enable loudnorm.
        loudnorm_I, loudnorm_TP, loudnorm_LRA: loudnorm options.

    Raises:
        RuntimeError: ffmpeg is not on PATH.
        ValueError: unsupported output suffix, or invalid *speed*.
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
    """
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found on PATH. Install ffmpeg to use MP3 I/O and audio filtering.")

    out_path = Path(out_path)
    out_ext = out_path.suffix.lower()

    # Validate everything that can be rejected up front, before any file is
    # written, so a bad argument cannot leave a stray temp WAV behind.
    if out_ext == ".mp3":
        encode_args = ["-b:a", mp3_bitrate]
    elif out_ext == ".wav":
        encode_args = ["-c:a", "pcm_s16le"]
    else:
        raise ValueError("OUT_AUDIO must end with .wav or .mp3")

    filters = []
    if trim_silence:
        filters.append(_silenceremove_filter(silence_threshold_db, silence_min_sec))
    if abs(speed - 1.0) > 1e-6:
        filters.append(_atempo_chain(speed))
    if normalize_loudness:
        # NOTE(review): FFmpeg's loudnorm documentation says the filter may
        # upsample to 192 kHz; consider adding an explicit "-ar" if the output
        # sample rate matters — confirm against the FFmpeg version in use.
        filters.append(_loudnorm_filter(loudnorm_I, loudnorm_TP, loudnorm_LRA))

    # Write a temp WAV from pydub, then let FFmpeg filter+encode.
    tmp_wav = out_path.with_suffix(".tmp_export.wav")
    try:
        handle = audio.export(tmp_wav, format="wav")
        # pydub hands back the file object it wrote to; close it so the data
        # is flushed and the file can be deleted afterwards on every platform.
        if handle is not None:
            handle.close()

        cmd = ["ffmpeg", "-y", "-i", str(tmp_wav)]
        if filters:
            cmd += ["-filter:a", ",".join(filters)]
        cmd += [*encode_args, str(out_path)]

        print("Running:", " ".join(cmd))
        subprocess.run(cmd, check=True)
    finally:
        # Best-effort cleanup that also runs when the export or ffmpeg fails.
        try:
            tmp_wav.unlink()
        except FileNotFoundError:
            pass  # the temp file was never created
        except OSError as exc:
            print(f"Warning: could not remove temporary file {tmp_wav}: {exc}")


## 4) Load input files + convert voice MP3 → WAV

In [None]:
# Fail fast if either input file is missing (voice sample first, then text).
for required_path, description in (
    (VOICE_MP3, "Reference voice MP3"),
    (TEXT_TXT, "Input TXT"),
):
    assert_exists(required_path, description)

# The WAV copy lives next to the MP3 and is reused when it already exists.
voice_wav = VOICE_MP3.with_suffix(".wav")
if voice_wav.exists():
    print(f"Using existing WAV: {voice_wav.name}")
else:
    print(f"Converting {VOICE_MP3.name} -> {voice_wav.name}")
    mp3_to_wav(VOICE_MP3, voice_wav)

# Notebook display of the WAV path.
voice_wav


## 5) Read text and chunk it

In [None]:
# "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark, which
# some editors add. Plain "utf-8" would keep the BOM as an invisible first
# character that strip() does not remove, so it would reach the TTS model.
# errors="ignore" is kept as deliberate best-effort: undecodable bytes are dropped.
raw_text = TEXT_TXT.read_text(encoding="utf-8-sig", errors="ignore")
text = normalize_text(raw_text)

if not text:
    raise RuntimeError("No text found in the TXT file.")

chunks = split_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK)

print(f"Characters: {len(text):,}")
print(f"Chunks: {len(chunks):,} (max {MAX_CHARS_PER_CHUNK} chars each)")

# Preview first chunk
chunks[0][:500]


## 6) Load XTTS v2 model

In [None]:
from TTS.api import TTS

# Coqui TTS model identifier for XTTS v2.
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Constructing TTS loads the model; USE_GPU from the settings cell is forwarded as `gpu`.
# NOTE(review): presumably the model weights are downloaded on first use — confirm.
print(f"Loading model: {MODEL_NAME} (gpu={USE_GPU})")
tts = TTS(model_name=MODEL_NAME, gpu=USE_GPU)


## 7) Generate audio chunks and concatenate into a single audiobook

This will create a folder like `audiobook_chunks/` next to your output file.


In [None]:
# Working directory for chunk WAVs
chunk_dir = OUT_AUDIO.parent / f"{OUT_AUDIO.stem}_chunks"
chunk_dir.mkdir(parents=True, exist_ok=True)

chunk_files = []
total = len(chunks)

# Chunks that already exist on disk are reused, so an interrupted run can be
# resumed. Existing chunks are matched by index only: delete the chunk folder
# after changing the text, voice, or chunk size.
for i, chunk in enumerate(chunks, start=1):
    chunk_file = chunk_dir / f"chunk_{i:05d}.wav"
    if not chunk_file.exists():
        print(f"[{i}/{total}] Synthesizing -> {chunk_file.name}")
        # Synthesize under a temporary name and rename only on success, so an
        # interrupted run never leaves a truncated file that a later run would
        # mistake for a finished chunk. The name still ends in ".wav".
        partial_file = chunk_dir / f"chunk_{i:05d}.partial.wav"
        tts.tts_to_file(
            text=chunk,
            speaker_wav=str(voice_wav),
            language=LANG,
            file_path=str(partial_file),
        )
        partial_file.replace(chunk_file)
    else:
        print(f"[{i}/{total}] Skipping existing -> {chunk_file.name}")
    chunk_files.append(chunk_file)

print(f"Generated {len(chunk_files)} chunk WAV files in: {chunk_dir.resolve()}")


## 8) Export final audiobook (.wav or .mp3) with:
- **Silence trimming** (leading/trailing) via `silenceremove`
- **Speed control** (tempo only) via `atempo`
- **Volume normalization** via `loudnorm`
- **MP3 bitrate** via encoder settings

All of these use **FFmpeg**.


In [None]:
# Load every chunk WAV in order and join them end-to-end, starting from an
# empty segment.
combined = sum(
    (AudioSegment.from_wav(chunk_path) for chunk_path in chunk_files),
    AudioSegment.empty(),
)

# All tuning values come from the settings cell.
export_settings = dict(
    speed=SPEED,
    mp3_bitrate=MP3_BITRATE,
    trim_silence=TRIM_SILENCE,
    silence_threshold_db=SILENCE_THRESHOLD_DB,
    silence_min_sec=SILENCE_MIN_SEC,
    normalize_loudness=NORMALIZE_LOUDNESS,
    loudnorm_I=LOUDNORM_I,
    loudnorm_TP=LOUDNORM_TP,
    loudnorm_LRA=LOUDNORM_LRA,
)
export_with_ffmpeg_filters(audio=combined, out_path=OUT_AUDIO, **export_settings)

print(f"Done: {OUT_AUDIO.resolve()}")


## 9) Optional: cleanup chunk files

In [None]:
# Set to True if you want to delete intermediate chunks after a successful export.
DELETE_CHUNKS = False

if DELETE_CHUNKS:
    # Cleanup is best-effort: a file that cannot be removed is reported and
    # skipped. Only filesystem errors (OSError) are caught, so unrelated bugs
    # are not hidden.
    for cf in chunk_files:
        try:
            cf.unlink()
        except OSError as e:
            print(f"Could not delete {cf.name}: {e}")
    try:
        # rmdir only succeeds when the folder is empty.
        chunk_dir.rmdir()
    except OSError as e:
        print(f"Could not remove chunk dir: {e}")
    print("Cleanup complete.")
else:
    print(f"Keeping chunks in: {chunk_dir.resolve()}")
