# GFM Book Text-to-Speech Pipeline

Converts Quarto book chapters to audio using Google Cloud TTS.

**Prerequisites:**
- Google Cloud account with TTS API enabled
- `gcloud` CLI installed and authenticated
- `pandoc` installed and available on `PATH` (used to convert `.qmd` chapters to plain text)

**Cost:** Each chapter is roughly 62K characters, so the full book fits within the 1M characters/month free tier.

## 1. Setup

In [None]:
# Install dependencies
!pip install -q google-cloud-texttospeech

In [None]:
# Authenticate with GCP (run once, follow the browser prompt)
!gcloud auth application-default login

In [None]:
import re
import subprocess
from pathlib import Path
from google.cloud import texttospeech

print("Setup complete!")

## 2. Configuration

In [None]:
# Path to the local checkout of the gfm-book repository
BOOK_ROOT = Path("/root/gfm-book")  # Update this path as needed

# Chapter to convert (update for different chapters)
CHAPTER_FILE = BOOK_ROOT / "part_2" / "p2-ch05-representations.qmd"

# Output directory
OUTPUT_DIR = Path(".")  # Current directory, or set to BOOK_ROOT / "audio"
# parents=True lets a nested output path (e.g. "audio/part_2") be created in
# one step instead of raising FileNotFoundError on a missing parent folder.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Voice options (uncomment preferred voice)
VOICE_NAME = "en-US-Neural2-D"  # Male, natural (recommended)
# VOICE_NAME = "en-US-Neural2-F"  # Female, natural
# VOICE_NAME = "en-US-Studio-O"   # Female per Google's voice list (verify), highest quality
# VOICE_NAME = "en-US-Wavenet-D"  # Male, good balance

# Speaking rate (0.25 to 4.0, default 1.0)
SPEAKING_RATE = 0.95  # Slightly slower for technical content

# Echo the active settings so the notebook output records what was used.
print(f"Chapter: {CHAPTER_FILE.name}")
print(f"Voice: {VOICE_NAME}")
print(f"Rate: {SPEAKING_RATE}")

## 3. Preprocessing Functions

In [None]:
def convert_qmd_to_text(qmd_path: Path) -> str:
    """Convert a Quarto (.qmd) file to plain text by shelling out to pandoc.

    Args:
        qmd_path: Path of the Quarto source file handed to pandoc.

    Returns:
        Pandoc's stdout: the document rendered with the ``plain`` writer and
        line wrapping disabled (``--wrap=none``).

    Raises:
        RuntimeError: If pandoc exits with a non-zero status; the message
            carries pandoc's stderr.
        FileNotFoundError: Propagated from ``subprocess`` when the ``pandoc``
            executable cannot be found.
    """
    result = subprocess.run(
        ["pandoc", str(qmd_path), "-t", "plain", "--wrap=none"],
        capture_output=True,
        text=True,
        # Pandoc always writes UTF-8. Decoding with the locale default instead
        # garbles or rejects non-ASCII text on machines whose locale is not
        # UTF-8 (commonly Windows).
        encoding="utf-8",
    )
    if result.returncode != 0:
        raise RuntimeError(f"Pandoc failed: {result.stderr}")
    return result.stdout


def preprocess_for_tts(text: str) -> str:
    """Strip markup from chapter text so it reads naturally when spoken.

    Applies a fixed sequence of regex substitutions: math is removed (display
    math becomes ``[Equation omitted]``), Quarto cross-references and
    citations are replaced by short spoken stand-ins, callout/layout/HTML
    comment/code-block markup is dropped, Markdown emphasis, links, headers
    and list markers are unwrapped, and whitespace is normalized.

    Args:
        text: Chapter text, possibly still containing Markdown/Quarto markup.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Display math may span several lines, hence DOTALL.
    text = re.sub(r'\$\$.*?\$\$', ' [Equation omitted] ', text, flags=re.DOTALL)

    # Inline math. The delimiters follow pandoc's rule: the opening "$" is
    # followed by a non-space character, the closing "$" is preceded by a
    # non-space character and not followed by a digit. Without this,
    # currency such as "$5 and $10" is treated as math and the text between
    # the two amounts is deleted.
    text = re.sub(r'\$(?!\s)[^$]+(?<!\s)\$(?!\d)', '', text)

    # Expand cross-references (must run before the generic "@key" citation rule)
    text = re.sub(r'@sec-ch(\d+)-[\w-]+', r'Section \1', text)
    text = re.sub(r'@fig-[\w-]+', 'the figure', text)
    text = re.sub(r'@tbl-[\w-]+', 'the table', text)
    text = re.sub(r'@eq-[\w-]+', 'the equation', text)

    # Clean citations: bracketed groups first, then any remaining bare "@key"
    text = re.sub(r'\[@[\w_-]+(?:;\s*@[\w_-]+)*\]', '[citation]', text)
    text = re.sub(r'@[\w_-]+', '[citation]', text)

    # Remove callout markers but keep content ("[note]: ..." etc.)
    text = re.sub(r':::\s*\{\.callout-(\w+)[^}]*\}', r'[\1]: ', text)
    text = re.sub(r':::', '', text)

    # Clean figure references: keep the alt text, drop the image path.
    # Must run before the generic link rule below, which would also match.
    text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[Figure: \1]', text)

    # Remove Quarto layout directives
    text = re.sub(r'\{#[\w-]+[^}]*\}', '', text)
    text = re.sub(r'\{layout[^}]*\}', '', text)

    # Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Remove code blocks
    text = re.sub(r'```[\s\S]*?```', ' [Code block omitted] ', text)

    # Clean markdown headers
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)

    # Remove markdown emphasis.
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
    # A "*" only opens/closes emphasis when it hugs the text; this keeps
    # "* item" bullets (handled further down) and "a * b * c" intact.
    text = re.sub(r'\*(?!\s)([^*]+)(?<!\s)\*', r'\1', text)
    # Underscores inside a word are not emphasis; without the lookarounds
    # identifiers such as my_var_name are mangled into "myvarname".
    text = re.sub(r'(?<!\w)__([^_]+)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_([^_]+)_(?!\w)', r'\1', text)
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Remove markdown links, keeping the link text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # Clean list markers. Only spaces/tabs are allowed around the marker:
    # "\s" also matches newlines, which let the match start on the blank line
    # before a list and delete the paragraph break.
    text = re.sub(r'^[ \t]*[-*+][ \t]+', '  ', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '  ', text, flags=re.MULTILINE)

    # Normalize whitespace: at most one blank line, single spaces, and no
    # spaces hugging a line break.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r' \n', '\n', text)
    text = re.sub(r'\n ', '\n', text)

    return text.strip()


print("Preprocessing functions defined.")

## 4. TTS Generation Functions

In [None]:
def split_text_into_chunks(text: str, max_bytes: int = 4500) -> list:
    """Split text into chunks that fit within API byte limits."""
    chunks = []
    current_chunk = ""
    
    sentences = text.replace('\n\n', '\n.\n').split('. ')
    
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        
        if not sentence.endswith(('.', '?', '!')):
            sentence += '.'
        
        test_chunk = current_chunk + ' ' + sentence if current_chunk else sentence
        if len(test_chunk.encode('utf-8')) > max_bytes:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk = test_chunk
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks


def generate_audio(text: str, output_path: Path, voice_name: str, speaking_rate: float) -> None:
    """Synthesize text to an MP3 file with Google Cloud TTS.

    The text is split with split_text_into_chunks(), each chunk is sent to
    the API in turn, and the returned MP3 payloads are written back to back
    into a single file. Progress is printed per chunk.

    Args:
        text: Text to speak.
        output_path: Destination file; overwritten if it exists.
        voice_name: Cloud TTS voice name (requested with language "en-US").
        speaking_rate: Speaking rate passed through to the audio config.
    """
    tts_client = texttospeech.TextToSpeechClient()

    pieces = split_text_into_chunks(text)
    piece_count = len(pieces)
    char_total = sum(map(len, pieces))
    print(f"Processing {piece_count} chunks ({char_total:,} characters)...")

    voice_params = texttospeech.VoiceSelectionParams(language_code="en-US", name=voice_name)
    mp3_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        speaking_rate=speaking_rate,
        pitch=0.0,
    )

    # Collect every payload first; the file is only written once all chunks
    # have been synthesized.
    mp3_parts = []
    for number, piece in enumerate(pieces, start=1):
        print(f"  Chunk {number}/{piece_count}...", end=" ", flush=True)
        reply = tts_client.synthesize_speech(
            input=texttospeech.SynthesisInput(text=piece),
            voice=voice_params,
            audio_config=mp3_config,
        )
        mp3_parts.append(reply.audio_content)
        print("done")

    output_path.write_bytes(b"".join(mp3_parts))

    megabytes = output_path.stat().st_size / (1024 * 1024)
    print(f"\nSaved: {output_path} ({megabytes:.1f} MB)")


print("TTS functions defined.")

## 5. Run the Pipeline

In [None]:
# Step 1: Convert Quarto to plain text
# convert_qmd_to_text() shells out to pandoc and raises RuntimeError if it fails.
print(f"Converting {CHAPTER_FILE.name}...")
raw_text = convert_qmd_to_text(CHAPTER_FILE)
print(f"  Raw text: {len(raw_text):,} characters")

In [None]:
# Step 2: Preprocess for TTS
print("Preprocessing...")
clean_text = preprocess_for_tts(raw_text)
print(f"  Clean text: {len(clean_text):,} characters")
# Both tallies count the placeholder strings that preprocess_for_tts() inserts,
# giving a quick check of how much content was replaced.
print(f"  Equations omitted: {clean_text.count('[Equation omitted]')}")
print(f"  Citations: {clean_text.count('[citation]')}")

In [None]:
# Preview the cleaned text
# Shows only the first 2000 characters; a quick sanity check of the cleanup
# before the synthesis step sends the text to the API.
print("=" * 60)
print("PREVIEW (first 2000 chars):")
print("=" * 60)
print(clean_text[:2000])

In [None]:
# Step 3: Generate audio
# The MP3 is named after the chapter file. Dots in the stem are replaced with
# hyphens, presumably so ".mp3" is the only extension-like part of the name.
chapter_name = CHAPTER_FILE.stem.replace(".", "-")
output_file = OUTPUT_DIR / f"{chapter_name}.mp3"

print(f"Generating audio with voice: {VOICE_NAME}")
generate_audio(clean_text, output_file, VOICE_NAME, SPEAKING_RATE)

## 6. Batch Processing (Optional)

Generate audio for multiple chapters at once.

In [None]:
# Discover chapter sources under BOOK_ROOT and show at most the first ten
chapters = sorted(BOOK_ROOT.glob("part_*/p*-ch*.qmd"))
print(f"Found {len(chapters)} chapters:")
for position, chapter_path in enumerate(chapters[:10], start=1):
    print(f"  {position}. {chapter_path.relative_to(BOOK_ROOT)}")
# Summarize whatever did not fit in the ten-line preview.
if len(chapters) - 10 > 0:
    print(f"  ... and {len(chapters) - 10} more")

In [None]:
# Batch convert selected chapters (uncomment and modify as needed)
# WARNING: This will use API quota for each chapter

# chapters_to_convert = [
#     BOOK_ROOT / "part_2" / "p2-ch05-representations.qmd",
#     BOOK_ROOT / "part_2" / "p2-ch06-cnns.qmd",
#     BOOK_ROOT / "part_2" / "p2-ch07-attention.qmd",
# ]

# for chapter in chapters_to_convert:
#     print(f"\n{'='*60}")
#     print(f"Processing: {chapter.name}")
#     print(f"{'='*60}")
#     
#     raw = convert_qmd_to_text(chapter)
#     clean = preprocess_for_tts(raw)
#     output = OUTPUT_DIR / f"{chapter.stem}.mp3"
#     generate_audio(clean, output, VOICE_NAME, SPEAKING_RATE)

## 7. Voice Comparison (Optional)

Generate samples with different voices to compare quality.

In [None]:
# Sample text for voice comparison
# Requires clean_text from the preprocessing step earlier in the notebook.
sample_text = clean_text[:3000]  # First 3000 chars (the slice may end mid-sentence)

voices_to_test = [
    "en-US-Neural2-D",  # Male, natural
    "en-US-Neural2-F",  # Female, natural
    # "en-US-Studio-O",   # Female per Google's voice list (verify), studio (highest quality)
]

# Uncomment to generate samples (one MP3 per voice, named sample_<voice>.mp3;
# each call uses API quota)
# for voice in voices_to_test:
#     output = OUTPUT_DIR / f"sample_{voice}.mp3"
#     print(f"\nGenerating sample with {voice}...")
#     generate_audio(sample_text, output, voice, SPEAKING_RATE)

---

## Notes

**Voices:**
- `en-US-Neural2-D` (Male) - Natural, recommended for technical content
- `en-US-Neural2-F` (Female) - Natural, clear
- `en-US-Studio-O` (Female) - Highest quality, more expressive
- `en-US-Wavenet-D` (Male) - Good balance of quality and cost

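**Pricing:**
- Neural2/WaveNet: $16/1M chars (1M free/month)
- Standard: $4/1M chars (4M free/month)
- Studio voices are priced separately and are not covered by the figures above
- Full book (~900K chars) fits within the Neural2/WaveNet free tier
- Prices change; confirm current rates at https://cloud.google.com/text-to-speech/pricing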

**Full voice list:** https://cloud.google.com/text-to-speech/docs/voices