# Audiobook Generator (XTTS v2 model) — Notebook

This notebook generates an audiobook-style audio file from a **TXT** file using a **reference voice sample** (**MP3**) for speaker conditioning via **Coqui TTS (XTTS v2)**.

**Folder assumption:** the reference voice MP3 and the input TXT are in the **same folder** as this notebook (or you can point to a different folder).

**Ethics/safety:** Only generate a voice you own or have **explicit permission** to use.


## 0. Install the dependencies: 
- `coqui-tts` for XTTS v2
- `ffmpeg` is required for MP3 I/O (conversion + final MP3 export)

If you don't have FFmpeg installed:
- Windows: install via `choco install ffmpeg` (Chocolatey) or download an official build ("ffmpeg-7.1.1-full_build-shared.7z") and add to PATH
- macOS: `brew install ffmpeg`
- Linux (Debian/Ubuntu): `sudo apt-get install ffmpeg`

Then run: `pip install -r requirements.txt`

Or, as I did on Windows:

```bash
python -m venv audiogen
audiogen/Scripts/activate
pip install ipykernel coqui-tts
pip install "transformers==5.0.0"
uv pip install torch torchaudio torchcodec --torch-backend=auto
git clone https://github.com/idiap/coqui-ai-TTS
cd coqui-ai-TTS
uv pip install -e .[notebooks]
```

## 1. Imports and helpers

In [None]:
import os
import re
import shutil
import subprocess
from pathlib import Path

import torch
from pydub import AudioSegment
from TTS.api import TTS
os.environ["COQUI_TOS_AGREED"] = "1"

# # Get the device if an NVIDIA GPU is present. AMD GPUs are not supported, or only with very limited support.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(TTS().list_models())

In [197]:
# Notebook configuration. Every folder is resolved against the current
# working directory and built with os.path.join so the notebook is not tied
# to the Windows "\\" separator (on Windows the resulting strings are the
# same as before).
WORKDIR = os.getcwd()

# Set the location of cached voices, unless the user already pointed
# XDG_DATA_HOME or TTS_HOME somewhere else.
if ("XDG_DATA_HOME" not in os.environ) and ("TTS_HOME" not in os.environ):
    os.environ["XDG_DATA_HOME"] = os.path.join(WORKDIR, "temp")
    os.environ["TTS_HOME"] = os.path.join(WORKDIR, "temp")

# Set location for input files
REF_VOICES = os.path.join(WORKDIR, "voices")
INPUT_TEXTS = os.path.join(WORKDIR, "texts")

# Set location for output files
OUTPUT_CHUNKS = os.path.join(WORKDIR, "chunks")
OUTPUT_AUDIOS = os.path.join(WORKDIR, "audios")

# Pick the file for now (names are given without extension)
VOICE_NAME = "Hor_Tuck_Loon"
FILE_NAME = "thought_out_thought"

# Output tuning
SPEED = 1.0                 # 1.0 = normal, 1.1 = 10% faster, 0.9 = 10% slower
MP3_BITRATE = "192k"        # e.g. "96k", "128k", "192k", "256k"

NORMALIZE_LOUDNESS = True   # loudnorm
LOUDNORM_I = -16            # Integrated loudness target (LUFS). Common: -16 (podcast/audiobook)
LOUDNORM_TP = -1.5          # True peak (dBTP)
LOUDNORM_LRA = 11           # Loudness range

TRIM_SILENCE = False        # silenceremove
SILENCE_THRESHOLD_DB = -45  # threshold in dB for silence detection
SILENCE_MIN_SEC = 0.20      # min silence duration to trim at start/end (seconds)

In [None]:
def normalize_text(text: str) -> str:
    """Clean up raw text and return it without surrounding whitespace.

    The substitutions run in this order:
      1. re-join words hyphenated across a line break
         ("exam-\\nple" -> "example");
      2. squeeze every run of spaces/tabs into a single space;
      3. cap runs of three or more newlines at two.
    """
    substitutions = (
        (r"(\w)-\n(\w)", r"\1\2"),  # undo line-break hyphenation
        (r"[ \t]+", " "),           # collapse horizontal whitespace
        (r"\n{3,}", "\n\n"),        # at most one blank line in a row
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def mp3_to_wav(mp3_path, wav_path):
    """Decode an MP3 with pydub (requires ffmpeg), downmix it to mono and
    write it to ``wav_path`` as WAV.

    Returns ``wav_path`` unchanged.
    """
    # Mono often works better for speaker conditioning.
    mono = AudioSegment.from_file(mp3_path, format="mp3").set_channels(1)
    mono.export(wav_path, format="wav")
    return wav_path


In [None]:
def _atempo_chain(speed: float) -> str:
    """Build chained atempo filters to support speeds outside 0.5..2.0."""
    if speed <= 0:
        raise ValueError("SPEED must be > 0")

    parts = []
    x = float(speed)

    while x > 2.0:
        parts.append("atempo=2.0")
        x /= 2.0
    while x < 0.5:
        parts.append("atempo=0.5")
        x /= 0.5

    parts.append(f"atempo={x:.6f}".rstrip("0").rstrip("."))
    return ",".join(parts)

def _silenceremove_filter(threshold_db: float, min_sec: float) -> str:
    """Trim leading+trailing silence using silenceremove."""
    thr = f"{threshold_db}dB"
    d = max(0.0, float(min_sec))
    return (
        f"silenceremove="
        f"start_periods=1:start_duration={d}:start_threshold={thr}:"
        f"stop_periods=1:stop_duration={d}:stop_threshold={thr}"
    )

def _loudnorm_filter(I: float, TP: float, LRA: float) -> str:
    """EBU R128 loudness normalization (single-pass)."""
    return f"loudnorm=I={I}:TP={TP}:LRA={LRA}"

def export_with_ffmpeg_filters(
    audio: AudioSegment,
    out_path: Path,
    speed: float = 1.0,
    mp3_bitrate: str = "192k",
    trim_silence: bool = True,
    silence_threshold_db: float = -45,
    silence_min_sec: float = 0.20,
    normalize_loudness: bool = True,
    loudnorm_I: float = -16,
    loudnorm_TP: float = -1.5,
    loudnorm_LRA: float = 11,
):
    """Export AudioSegment applying FFmpeg filters: silenceremove, atempo, loudnorm, and bitrate.

    The segment is first written to a temporary WAV next to ``out_path``;
    ffmpeg then filters and encodes that file into ``out_path``. The
    temporary file is removed afterwards, whether or not ffmpeg succeeded.

    Args:
        audio: Segment to export.
        out_path: Destination file; must end with ``.mp3`` or ``.wav``.
            Missing parent directories are created.
        speed: Tempo multiplier; the atempo filter is skipped when it is 1.0.
        mp3_bitrate: Audio bitrate passed to ffmpeg for MP3 output.
        trim_silence: Whether to add the silenceremove filter.
        silence_threshold_db: Silence threshold in dB for silenceremove.
        silence_min_sec: Duration in seconds for silenceremove.
        normalize_loudness: Whether to add the loudnorm filter.
        loudnorm_I: loudnorm ``I`` option.
        loudnorm_TP: loudnorm ``TP`` option.
        loudnorm_LRA: loudnorm ``LRA`` option.

    Raises:
        RuntimeError: If ffmpeg is not on PATH.
        ValueError: If ``out_path`` has an unsupported extension (or the
            speed is rejected by ``_atempo_chain``).
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found on PATH. Install ffmpeg to use MP3 I/O and audio filtering.")

    out_path = Path(out_path)
    out_ext = out_path.suffix.lower()

    # Validate everything before touching the disk, so a bad argument cannot
    # leave a stray temporary file behind.
    if out_ext == ".mp3":
        codec_args = ["-b:a", mp3_bitrate]
    elif out_ext == ".wav":
        codec_args = ["-c:a", "pcm_s16le"]
    else:
        raise ValueError("OUT_AUDIO must end with .wav or .mp3")

    filters = []
    if trim_silence:
        filters.append(_silenceremove_filter(silence_threshold_db, silence_min_sec))
    if abs(speed - 1.0) > 1e-6:
        filters.append(_atempo_chain(speed))
    if normalize_loudness:
        filters.append(_loudnorm_filter(loudnorm_I, loudnorm_TP, loudnorm_LRA))

    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_wav = out_path.with_suffix(".tmp_export.wav")

    cmd = ["ffmpeg", "-y", "-i", str(tmp_wav)]
    if filters:
        cmd += ["-filter:a", ",".join(filters)]
    cmd += codec_args + [str(out_path)]

    try:
        # Write a temp WAV from pydub, then let FFmpeg filter+encode.
        audio.export(tmp_wav, format="wav")
        print("Running:", " ".join(cmd))
        subprocess.run(cmd, check=True)
    finally:
        # Best-effort cleanup: a failed delete must not mask the real error.
        try:
            tmp_wav.unlink()
        except OSError:
            pass

In [None]:

# Load the book text. The encoding is explicit because the platform default
# (e.g. cp1252 on Windows) can garble or reject non-ASCII characters such as
# curly quotes; "utf-8-sig" additionally drops a leading BOM if present.
file_path = os.path.join(INPUT_TEXTS, f"{FILE_NAME}.txt")
with open(file_path, "r", encoding="utf-8-sig") as file:
    raw_text = file.read()

# Normalize the whole text at once: the de-hyphenation rule in
# normalize_text() matches across a line break, so it can never apply when
# lines are cleaned one at a time. Each remaining non-blank line becomes one
# paragraph to synthesize.
contents = [
    line.strip()
    for line in normalize_text(raw_text).split("\n")
    if line.strip()
]

# i=1
# for para in contents:
# 	print(f"{i}. {para}")
# 	i += 1

## 2. Provide reference audio files and generate a simple sample speech with that voice

In [87]:
# Initialize the XTTS v2 model and run it on the GPU when CUDA is available;
# otherwise it stays on the CPU, exactly as before.
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
# Sample speech
# reference_files = ["reference_voice.mp3"]
# tts.tts_to_file(
#     text="This is the audio edition of the book . \"Thought . Our Sole Universe\"",
#     speaker_wav=reference_files,
#     language="en",
#     file_path="output.wav"
# )

'output.wav'

## 3. Cache reference voices for easy reuse

In [199]:
# Reference recording that conditions the synthesized voice. Built with
# os.path.join so the path is not tied to the Windows "\\" separator.
reference_files = os.path.join(REF_VOICES, f"{VOICE_NAME}.mp3")

# Paths of the WAV chunks that will later be concatenated; filled by the
# generation cells below.
chunk_files = []

In [None]:
# Intro chunk (output0.wav), stored in a per-book folder under OUTPUT_CHUNKS.
chunk_dir = os.path.join(OUTPUT_CHUNKS, FILE_NAME)
os.makedirs(chunk_dir, exist_ok=True)  # the folder may not exist on a first run
file_name = os.path.join(chunk_dir, "output0.wav")

# Synthesize only when the chunk is not already on disk.
# NOTE(review): passing speaker_wav together with a speaker name presumably
# caches the voice under that name for the later speaker-only calls - confirm
# against the Coqui TTS documentation.
if not os.path.exists(file_name):
    tts.tts_to_file(
        text="This is the audio edition of the book - “Thought - Our.. Sole Universe”",
        speaker_wav=reference_files,
        speaker="HorTuckLoon",
        language="en",
        file_path=file_name
    )

# Register the chunk even when it was reused from an earlier run; otherwise a
# cached intro would be missing from the final audio. The membership test
# keeps a re-run of this cell from adding it twice.
if file_name not in chunk_files:
    chunk_files.append(file_name)

# print(chunk_files)

The next cell writes one WAV chunk per paragraph into a per-book folder such as `chunks/thought_out_thought/`.

In [None]:
# Synthesize one WAV per paragraph as output1.wav, output2.wav, ... in the
# per-book chunk folder. Chunks already on disk are not regenerated.
chunk_dir = os.path.join(OUTPUT_CHUNKS, FILE_NAME)
os.makedirs(chunk_dir, exist_ok=True)  # the folder may not exist on a first run

registered = set(chunk_files)  # O(1) duplicate check when the cell is re-run
for i, paragraph in enumerate(contents, start=1):
    file_name = os.path.join(chunk_dir, f"output{i}.wav")
    if not os.path.exists(file_name):
        tts.tts_to_file(
            text=paragraph,
            speaker="HorTuckLoon",
            language="en",
            file_path=file_name
        )
    # Register the chunk whether it was just generated or reused from an
    # earlier run, so cached chunks still end up in the final audio.
    if file_name not in registered:
        chunk_files.append(file_name)
        registered.add(file_name)

In [205]:
def natural_sort_key(s):
    """Return a sort key that orders embedded numbers by value.

    The string is split into alternating runs of digits and non-digits.
    Digit runs become ``int`` and the other runs are lower-cased, so
    "output2.wav" sorts before "output10.wav".

    ``str.isdecimal`` is used instead of ``str.isdigit`` because ``isdigit``
    is also true for characters such as "²", which ``int()`` cannot parse.

    Note: keys of two strings can only be compared when their number/text
    runs line up (e.g. names sharing a common prefix); comparing an ``int``
    run against a text run raises ``TypeError``.
    """
    return [
        int(part) if part.isdecimal() else part.lower()
        for part in re.findall(r"\d+|\D+", s)
    ]

# Put the chunks in numeric order (output2 before output10), keeping the same
# list object, then show the result.
chunk_files[:] = sorted(chunk_files, key=natural_sort_key)
print(chunk_files)

['d:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output0.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output1.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output2.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output3.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output4.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output5.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output6.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output7.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output8.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output9.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output10.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output11.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output12.wav', 'd:\\Dhamma\\AudioFileTool\\chunks\\thought_out_thought\\output13.wav', '

## 4. Concatenate into a single audio file



In [194]:
# Concatenate the chunk WAVs, in list order, into a single MP3.
audio_file = os.path.join(OUTPUT_AUDIOS, f"{FILE_NAME}.mp3")

if not chunk_files:
    # Nothing was generated or registered, so do not write an empty MP3.
    print("No chunk files to combine - run the generation cells first.")
else:
    os.makedirs(OUTPUT_AUDIOS, exist_ok=True)  # the folder may not exist yet

    combined = AudioSegment.empty()
    for chunk_path in chunk_files:
        combined += AudioSegment.from_wav(chunk_path)

    try:
        # Honor the configured MP3_BITRATE instead of a hard-coded value.
        combined.export(audio_file, format="mp3", bitrate=MP3_BITRATE)
    except Exception as exc:
        # Keep the notebook running, but show why the export failed.
        print(f"Can not combine chunk files! ({exc})")
    else:
        # Report success only when the export actually succeeded.
        print(f"Generated {audio_file}!")

Generated d:\Dhamma\AudioFileTool\audios\thought_out_thought.mp3!
