# Audiobook Generator (XTTS v2 model) — Notebook

This notebook generates an audiobook-style audio file from a **TXT** file using a **reference voice sample** (**MP3**) for speaker conditioning via **Coqui TTS (XTTS v2)**.

**Folder assumption:** the reference voice MP3 and the input TXT are in the **same folder** as this notebook (or you can point to a different folder).

**Ethics/safety:** Only generate a voice you own or have **explicit permission** to use.


## 0. Install the dependencies: 
- `coqui-tts` for XTTS v2
- `ffmpeg` is required for MP3 I/O (conversion + final MP3 export)

If you don't have FFmpeg installed:
- Windows: install via `choco install ffmpeg` (Chocolatey) or download an official build ("ffmpeg-7.1.1-full_build-shared.7z") and add to PATH
- macOS: `brew install ffmpeg`
- Linux (Debian/Ubuntu): `sudo apt-get install ffmpeg`

Then run: `pip install -r requirements.txt`

or like me (Windows):

```bash
python -m venv audiogen
audiogen/Scripts/activate
pip install ipykernel coqui-tts
pip install "transformers==5.0.0"
uv pip install torch torchaudio torchcodec --torch-backend=auto
git clone https://github.com/idiap/coqui-ai-TTS
cd coqui-ai-TTS
uv pip install -e .[notebooks]
```

## 1. Imports and helpers

In [1]:
import os
import re
# import torch
import shutil
import subprocess
from pathlib import Path
from pydub import AudioSegment
from TTS.api import TTS
os.environ["COQUI_TOS_AGREED"] = "1"

# # Get device if an NVIDIA GPU is present. AMD GPUs are not supported or have very limited support.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(TTS().list_models())

In [31]:
"""
Need research more about this
"""
# WAV_FILE = filename = librosa.example('vibeace')
# from TTS.config import BaseAudioConfig
# from TTS.utils.audio import AudioProcessor
# conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
# ap = AudioProcessor(**conf)
# wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
# pitch = ap.compute_f0(wav)

'\nNeed research more about this\n'

In [2]:
# Base folder: every input/output location below is resolved against the
# directory the notebook was started from.
WORKDIR = os.getcwd()

# Set the location of cached voice data, but only when the user has not
# already chosen one through either environment variable.
# os.path.join is used instead of a hard-coded "\\" so the same cell yields
# valid paths on Windows, macOS and Linux.
if ("XDG_DATA_HOME" not in os.environ) and ("TTS_HOME" not in os.environ):
    os.environ["XDG_DATA_HOME"] = os.path.join(WORKDIR, "temp")
    os.environ["TTS_HOME"] = os.path.join(WORKDIR, "temp")

# Set location for input files
REF_VOICES = os.path.join(WORKDIR, "voices")
INPUT_TEXTS = os.path.join(WORKDIR, "texts")

# Set location for output files
OUTPUT_CHUNKS = os.path.join(WORKDIR, "chunks")
OUTPUT_AUDIOS = os.path.join(WORKDIR, "audios")

# Pick the file for now
# Should loop through all the file later
# VOICE_NAME = "HorTuckLoon_Talk"
# VOICE_NAME = "HorTuckLoon_GP"

FILE_NAME = "preface"
# FILE_NAME = "thought_out_thought"
# FILE_NAME = "way_of_thought"
# FILE_NAME = "weight_of_thought"


# Output tuning
SPEED = 0.9                 # 1.0 = normal, 1.1 = 10% faster, 0.9 = 10% slower
MP3_BITRATE = "192k"        # e.g. "96k", "128k", "192k", "256k"

NORMALIZE_LOUDNESS = True   # loudnorm
LOUDNORM_I = -16            # Integrated loudness target (LUFS). Common: -16 (podcast/audiobook)
LOUDNORM_TP = -1.5          # True peak (dBTP)
LOUDNORM_LRA = 11           # Loudness range

TRIM_SILENCE = False         # silenceremove
SILENCE_THRESHOLD_DB = -45  # threshold in dB for silence detection
SILENCE_MIN_SEC = 0.20      # min silence duration to trim at start/end (seconds)

In [None]:
# Normalise text file
# Could write another one for different purposes
# def normalize_text(text: str) -> str:
# 	# Remove hyphenation at line breaks: "exam-\nple" -> "example"
# 	text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
# 	# Collapse whitespace
# 	text = re.sub(r"[ \t]+", " ", text)
# 	text = re.sub(r"\n{3,}", "\n\n", text)
# 	return text.strip()

def load_paragraphs(txt_path: str) -> list[str]:
    """Read a text file and return its paragraphs, one string per paragraph.

    1) Paragraphs are separated by one or more blank lines. A line that
       contains only spaces/tabs also counts as blank, so stray trailing
       whitespace cannot glue two paragraphs together.
    2) Line breaks inside a paragraph are replaced by a single space so each
       paragraph becomes one line, and runs of spaces/tabs are collapsed.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The non-empty paragraphs in file order: [para1, para2, ...].
        An empty or whitespace-only file yields an empty list.
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise end up as an invisible character in paragraph 1.
    # Undecodable bytes are still skipped (best effort), as before.
    text = Path(txt_path).read_text(encoding="utf-8-sig", errors="ignore")

    # A paragraph break is a newline, optional whitespace (including further
    # blank lines), then another newline.
    blocks = re.split(r"\n\s*\n", text.strip())

    paragraphs = []
    for b in blocks:
        # Collapse internal newlines (and the whitespace around them) into single spaces
        one_line = re.sub(r"\s*\r?\n\s*", " ", b.strip())
        # Collapse multiple spaces/tabs
        one_line = re.sub(r"[ \t]{2,}", " ", one_line)
        if one_line:
            paragraphs.append(one_line)

    return paragraphs

In [4]:
# Prepare method for wav conversion as wav is lossless, no compression, better data for training
def mp3_to_wav(mp3_path, wav_path):
    """Decode an MP3 with pydub (ffmpeg required), downmix it to mono, save as WAV.

    Args:
        mp3_path: Location of the source MP3.
        wav_path: Destination of the WAV file to write.

    Returns:
        ``wav_path``, exactly as it was passed in.
    """
    # Mono often works better for speaker conditioning, so downmix right after decoding.
    mono = AudioSegment.from_file(mp3_path, format="mp3").set_channels(1)
    mono.export(wav_path, format="wav")
    return wav_path

In [5]:
# Audio processing functions
def _atempo_chain(speed: float) -> str:
    """Build chained atempo filters to support speeds outside 0.5..2.0.

    The requested factor is split into stages that each stay within
    0.5..2.0; multiplied together the stages equal ``speed``.

    Args:
        speed: Playback speed factor (1.0 = unchanged).

    Returns:
        Comma-separated filter string, e.g. ``"atempo=2.0,atempo=1.5"`` for 3.0.

    Raises:
        ValueError: If ``speed`` is not a finite number greater than zero.
    """
    x = float(speed)
    # The chained comparison is False for zero, negatives, NaN and +inf alike.
    # Infinity must be rejected explicitly: halving it never ends the loop below.
    if not (0.0 < x < float("inf")):
        raise ValueError("SPEED must be a finite number > 0")

    parts = []

    # Peel off full 2.0x stages while the remainder is still too fast...
    while x > 2.0:
        parts.append("atempo=2.0")
        x /= 2.0
    # ...or full 0.5x stages while it is still too slow.
    while x < 0.5:
        parts.append("atempo=0.5")
        x /= 0.5

    # Final stage carries the remainder; trailing zeros (and a bare ".") are stripped.
    parts.append(f"atempo={x:.6f}".rstrip("0").rstrip("."))
    return ",".join(parts)

def _silenceremove_filter(threshold_db: float, min_sec: float) -> str:
    """Trim leading+trailing silence using silenceremove.

    Only the two ends of the audio are trimmed. A positive ``stop_periods``
    is deliberately NOT used: it makes silenceremove stop producing output at
    the first qualifying silence, which would cut a narration off at its
    first pause. Instead the leading-silence trim is applied twice, the
    second time on the reversed audio, and the audio is reversed back.

    Args:
        threshold_db: Level in dB below which audio counts as silence.
        min_sec: Duration in seconds passed as ``start_duration``; negative
            values are clamped to 0.

    Returns:
        A comma-separated filter chain usable inside ``-filter:a``.
    """
    thr = f"{threshold_db}dB"
    d = max(0.0, float(min_sec))
    trim_start = f"silenceremove=start_periods=1:start_duration={d}:start_threshold={thr}"
    # trim head -> reverse -> trim (former) tail -> reverse back
    return f"{trim_start},areverse,{trim_start},areverse"

def _loudnorm_filter(I: float, TP: float, LRA: float) -> str:
    """Return the single-pass ``loudnorm`` (EBU R128) filter string.

    Args:
        I: Value placed in the ``I=`` option (integrated loudness target).
        TP: Value placed in the ``TP=`` option (true peak).
        LRA: Value placed in the ``LRA=`` option (loudness range).
    """
    options = (("I", I), ("TP", TP), ("LRA", LRA))
    return "loudnorm=" + ":".join(f"{key}={value}" for key, value in options)

def export_with_ffmpeg_filters(
    audio: AudioSegment,
    out_path: Path,
    speed: float = 1.0,
    mp3_bitrate: str = "192k",
    trim_silence: bool = True,
    silence_threshold_db: float = -45,
    silence_min_sec: float = 0.20,
    normalize_loudness: bool = True,
    loudnorm_I: float = -16,
    loudnorm_TP: float = -1.5,
    loudnorm_LRA: float = 11,
):
    """Export AudioSegment applying FFmpeg filters: silenceremove, atempo, loudnorm, and bitrate.

    The segment is first written to a temporary WAV next to ``out_path``;
    ffmpeg then filters and encodes that file into ``out_path``. The temporary
    file is removed afterwards, whether or not the export succeeded.

    Args:
        audio: Audio to export.
        out_path: Destination file; must end with ``.mp3`` or ``.wav``.
        speed: Tempo factor; the atempo filter is skipped when it is 1.0.
        mp3_bitrate: Value passed to ``-b:a`` for MP3 output.
        trim_silence: Add the silenceremove filter when True.
        silence_threshold_db: Silence threshold in dB for silenceremove.
        silence_min_sec: Duration in seconds for silenceremove.
        normalize_loudness: Add the loudnorm filter when True.
        loudnorm_I: ``I`` option of loudnorm.
        loudnorm_TP: ``TP`` option of loudnorm.
        loudnorm_LRA: ``LRA`` option of loudnorm.

    Raises:
        RuntimeError: If ffmpeg is not on PATH.
        ValueError: If ``out_path`` has an unsupported extension, or ``speed``
            is rejected while building the atempo chain.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found on PATH. Install ffmpeg to use MP3 I/O and audio filtering.")

    out_path = Path(out_path)
    out_ext = out_path.suffix.lower()

    # Validate everything that can be rejected BEFORE anything is written to
    # disk, so a bad argument cannot leave a stray temporary WAV behind.
    if out_ext == ".mp3":
        codec_args = ["-b:a", mp3_bitrate]
    elif out_ext == ".wav":
        codec_args = ["-c:a", "pcm_s16le"]
    else:
        raise ValueError("OUT_AUDIO must end with .wav or .mp3")

    # Filter order matters: trim first, then change tempo, then normalise.
    filters = []
    if trim_silence:
        filters.append(_silenceremove_filter(silence_threshold_db, silence_min_sec))
    if abs(speed - 1.0) > 1e-6:
        filters.append(_atempo_chain(speed))
    if normalize_loudness:
        filters.append(_loudnorm_filter(loudnorm_I, loudnorm_TP, loudnorm_LRA))

    # Write a temp WAV from pydub, then let FFmpeg filter+encode.
    tmp_wav = out_path.with_suffix(".tmp_export.wav")

    cmd = ["ffmpeg", "-y", "-i", str(tmp_wav)]
    if filters:
        cmd += ["-filter:a", ",".join(filters)]
    cmd += [*codec_args, str(out_path)]

    try:
        audio.export(tmp_wav, format="wav")
        print("Running:", " ".join(cmd))
        subprocess.run(cmd, check=True)
    finally:
        # Best-effort cleanup: never let a failed delete mask the real outcome,
        # but do not hide it completely either.
        try:
            tmp_wav.unlink(missing_ok=True)
        except OSError as exc:
            print(f"Warning: could not remove temporary file {tmp_wav}: {exc}")

## 2. Extract text from provided text files:

In [36]:
# # Old method to extract text files

# file_path = f"{INPUT_TEXTS}\\{FILE_NAME}.txt"
# contents = []
# with open(file_path, 'r') as file:
# 	for line in file:
# 		if line.strip():
# 			contents.append(normalize_text(line))

# i=1
# for para in contents:
# 	print(f"{i}. {para}")
# 	i += 1

In [None]:
# # Parse ONE text file into `contents` object
# file_path = f"{INPUT_TEXTS}\\{FILE_NAME}.txt"
# contents = load_paragraphs(file_path)

# # Print out chunk texts for monitoring text quality before TTS
# i=1
# for para in contents:
# 	print(f"{i}. {para}")
# 	i += 1

In [19]:
# Parse all text files into `contents` object: {file name without ".txt": [paragraph, ...]}
contents: dict[str, list[str]] = {}

# sorted() gives a reproducible processing order; iterdir() order is arbitrary.
for file in sorted(Path(INPUT_TEXTS).iterdir()):
    # Only regular *.txt files are inputs (extension compared case-insensitively).
    if not file.is_file() or file.suffix.lower() != ".txt":
        continue
    # `stem` removes only the final extension, so "part.1.txt" -> "part.1".
    # Splitting on the first "." would map "part.1.txt" and "part.2.txt" to the same key.
    filename = file.stem
    # `file` already carries the folder, so there is no need to rebuild the path by hand.
    file_path = str(file)
    file_chunks = load_paragraphs(file_path)
    contents[filename] = file_chunks

# # Print out chunk texts for monitoring text quality before TTS

# for filename, content in contents.items():
# 	print(filename)
# 	i=1
# 	for para in content:
# 		print(f"{i}. {para}")
# 		i += 1

## 2. Load xTTSv2 model into `tts` instance

In [7]:
# First-time setup (kept for reference): initialize the TTS model by name,
# which downloads the xtts_v2 model to local storage.
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

# After the download, load the model from local files instead.
# NOTE(review): both paths are relative and use Windows "\\" separators, so they
# presumably resolve against the current working directory — confirm before
# running this cell from another folder or OS.
tts = TTS(model_path="models\\tts_models--multilingual--multi-dataset--xtts_v2\\model.pth",
		  config_path="models\\tts_models--multilingual--multi-dataset--xtts_v2\\config.json")

In [39]:
# def get_tts_paths(tts: "TTS") -> dict:
#     '''
# 	Support function to check location of model
#     '''
#     if tts.synthesizer is None:
#         raise RuntimeError("No TTS synthesizer loaded (did you load a VC model instead?)")

#     ckpt = Path(tts.synthesizer.tts_checkpoint)
#     cfg  = Path(tts.synthesizer.tts_config_path)

#     # If checkpoint is a file, model_root is its parent; if it's a dir, it is the dir itself.
#     model_root = ckpt if ckpt.is_dir() else ckpt.parent

#     return {
#         "model_name": tts.model_name,
#         "checkpoint_path": str(ckpt),
#         "config_path": str(cfg),
#         "model_root": str(model_root),
#         "voice_dir": str(tts.synthesizer.voice_dir),
#         "download_base_default": str(tts.manager.output_prefix),  # base folder for non-HF models
#     }

# paths = get_tts_paths(tts)
# paths

## 3. Cache reference voices for easy reuse

In [40]:
# Cache the voice from reference audios - Dont have to run everytime
# reference_files = []
# for file in Path(REF_VOICES).iterdir():
# 	reference_files.append(f"{REF_VOICES}\\{file.name}")
# output_voice = f"{OUTPUT_CHUNKS}\\test_voice.wav"

# if reference_files:
# 	tts.tts_to_file(
# 		text="This is the audio edition of the book - Thought - Our Sole Universe",
# 		speaker_wav=reference_files,
# 		# Assign `speakerID` to reuse later
# 		speaker="HorTuckLoon",
# 		language="en",
# 		file_path=output_voice
# 	)


## 4. Generate audio chunks

Create a temporary folder like `thought_out_thought/` with many chunk output files.

In [None]:
# Either reset the chunk_files object or leave it untouched on a new run
# chunk_files = []

In [None]:
# i = 1
# for paragraph in contents:
# 	# names of temporary chunks files
# 	file_name = f"{OUTPUT_CHUNKS}\\{FILE_NAME}\\output{i}.wav"
# 	if not os.path.exists(file_name):
# 		tts.tts_to_file(
# 			text=paragraph,
# 			speaker="HorTuckLoon",
# 			language="en",
# 			file_path=file_name
# 		)
# 	if file_name not in chunk_files:
# 		chunk_files.append(file_name)
# 	i += 1

In [None]:
# # Sort the list to make sure the chunks is in the right orders
# def natural_sort_key(s):
#     # Split the filename into parts of numbers and non-numbers
#     return [int(p) if p.isdigit() else p.lower() for p in re.findall(r'\d+|\D+', s)]

# chunk_files.sort(key=natural_sort_key)
# if len(chunk_files) != len(contents):
#     raise ValueError("Error while creating chunk files!")

In [20]:
# The chunk list is rebuilt from scratch on every run; WAV files that already
# exist on disk are reused rather than synthesised again.
chunk_files: dict[str, list[str]] = {}

for filename, content in contents.items():
    # Make sure the per-text chunk folder exists so it no longer has to be
    # created by hand before the first run.
    os.makedirs(f"{OUTPUT_CHUNKS}\\{filename}", exist_ok=True)

    chunk_files[filename] = []
    # Chunks are numbered from 1 in paragraph order: output1.wav, output2.wav, ...
    for i, paragraph in enumerate(content, start=1):
        filepath = f"{OUTPUT_CHUNKS}\\{filename}\\output{i}.wav"
        # Skip synthesis when this chunk is already present from an earlier run.
        if not os.path.exists(filepath):
            tts.tts_to_file(
                text=paragraph,
                speaker="HorTuckLoon",
                language="en",
                file_path=filepath
            )
        # The list was just reset and every path is unique, so no duplicate check is needed.
        chunk_files[filename].append(filepath)

In [21]:
# Sort each list to make sure the chunks are in the right order
def natural_sort_key(s):
    """Return a sort key that orders embedded numbers by value ("2" before "10").

    The string is split into alternating runs of digits and non-digits. Each
    run becomes a tagged pair: ``(0, int_value)`` for digits and
    ``(1, lowercased_text)`` for everything else. The tag keeps the key
    comparable even when one string has a number where another has text
    (e.g. "a1" vs "1a"); numbers sort before text in that case.

    Args:
        s: The string (typically a file name or path) to build a key for.

    Returns:
        A list of ``(tag, value)`` tuples intended for use as ``key=`` in sorting.
    """
    key = []
    # Group 1 is set only for digit runs, so the regex itself decides which
    # parts are numeric instead of str.isdigit(), which also accepts
    # characters such as "²" that int() cannot parse.
    for match in re.finditer(r"(\d+)|\D+", s):
        digits = match.group(1)
        if digits is not None:
            key.append((0, int(digits)))
        else:
            key.append((1, match.group(0).lower()))
    return key

# Put every text's chunk paths into natural order, then confirm that there is
# exactly one chunk path per paragraph of that text.
for filename, chunks in chunk_files.items():
    chunks.sort(key=natural_sort_key)
    if len(contents[filename]) == len(chunks):
        continue
    raise ValueError("Error while creating chunk files!")

## 5. Concatenate into a single audio file



In [None]:
# # Concatenate chunk WAVs into one AudioSegment
# combined = AudioSegment.empty()
# audio_file = f"{OUTPUT_AUDIOS}\\{FILE_NAME}.mp3"

# for file in chunk_files:
#     seg = AudioSegment.from_wav(file)
#     combined += seg

# try:
# 	export_with_ffmpeg_filters(
# 		audio=combined,
# 		out_path=audio_file,
# 		speed=SPEED,
# 		mp3_bitrate=MP3_BITRATE,
# 		trim_silence=TRIM_SILENCE,
# 		silence_threshold_db=SILENCE_THRESHOLD_DB,
# 		silence_min_sec=SILENCE_MIN_SEC,
# 		normalize_loudness=NORMALIZE_LOUDNESS,
# 		loudnorm_I=LOUDNORM_I,
# 		loudnorm_TP=LOUDNORM_TP,
# 		loudnorm_LRA=LOUDNORM_LRA,
# 	)
# 	print(f"Generated {audio_file}!")
# except:
#     print(f"Error couldn't combine chunk files!")
# try:
#     combined.export(audio_file, format="mp3", bitrate="192k")
# except:
#

Running: ffmpeg -y -i d:\Dhamma\AudioFileTool\audios\preface.tmp_export.wav -filter:a atempo=0.9,loudnorm=I=-16:TP=-1.5:LRA=11 -b:a 192k d:\Dhamma\AudioFileTool\audios\preface.mp3
Generated d:\Dhamma\AudioFileTool\audios\preface.mp3!


In [23]:
# Build one MP3 per text: join its chunk WAVs in order, then export the result
# through ffmpeg with the tuning constants defined in the settings cell.
for filename, chunks in chunk_files.items():
    # Concatenate chunk WAVs into one AudioSegment
    combined = AudioSegment.empty()
    audio_file = f"{OUTPUT_AUDIOS}\\{filename}.mp3"

    for file in chunks:
        seg = AudioSegment.from_wav(file)
        combined += seg

    try:
        export_with_ffmpeg_filters(
            audio=combined,
            out_path=audio_file,
            speed=SPEED,
            mp3_bitrate=MP3_BITRATE,
            trim_silence=TRIM_SILENCE,
            silence_threshold_db=SILENCE_THRESHOLD_DB,
            silence_min_sec=SILENCE_MIN_SEC,
            normalize_loudness=NORMALIZE_LOUDNESS,
            loudnorm_I=LOUDNORM_I,
            loudnorm_TP=LOUDNORM_TP,
            loudnorm_LRA=LOUDNORM_LRA,
        )
    except Exception as exc:
        # Keep going with the remaining texts, but say WHY this one failed.
        # `Exception` (not a bare except) lets Ctrl+C / kernel interrupt through.
        print(f"Error couldn't combine chunk files of {filename}! ({type(exc).__name__}: {exc})")
    else:
        print(f"Generated {audio_file}!")

Running: ffmpeg -y -i d:\Dhamma\AudioFileTool\audios\attention_and_thought.tmp_export.wav -filter:a atempo=0.9,loudnorm=I=-16:TP=-1.5:LRA=11 -b:a 192k d:\Dhamma\AudioFileTool\audios\attention_and_thought.mp3
Generated d:\Dhamma\AudioFileTool\audios\attention_and_thought.mp3!
Running: ffmpeg -y -i d:\Dhamma\AudioFileTool\audios\preface.tmp_export.wav -filter:a atempo=0.9,loudnorm=I=-16:TP=-1.5:LRA=11 -b:a 192k d:\Dhamma\AudioFileTool\audios\preface.mp3
Generated d:\Dhamma\AudioFileTool\audios\preface.mp3!
Running: ffmpeg -y -i d:\Dhamma\AudioFileTool\audios\thought_out_thought.tmp_export.wav -filter:a atempo=0.9,loudnorm=I=-16:TP=-1.5:LRA=11 -b:a 192k d:\Dhamma\AudioFileTool\audios\thought_out_thought.mp3
Generated d:\Dhamma\AudioFileTool\audios\thought_out_thought.mp3!
Running: ffmpeg -y -i d:\Dhamma\AudioFileTool\audios\way_of_thought.tmp_export.wav -filter:a atempo=0.9,loudnorm=I=-16:TP=-1.5:LRA=11 -b:a 192k d:\Dhamma\AudioFileTool\audios\way_of_thought.mp3
Generated d:\Dhamma\AudioF