# 🎧 One-Click Audio Generator

Generate audio for **Trí Nhớ Dịu Dàng** with your cloned voice.

## ⚡ Quick Start
1. **Runtime → Change runtime type → T4 GPU**
2. Add `GITHUB_TOKEN` to Colab Secrets (🔑 sidebar)
3. **Run All** (Ctrl+F9)

That's it! ☕

In [None]:
#@title ⚡ ONE CLICK - Run Everything { display-mode: "form" }
#@markdown This cell does everything automatically:
#@markdown 1. Install dependencies
#@markdown 2. Clone repo & load voice profile
#@markdown 3. Generate audio for all chapters
#@markdown 4. Push to GitHub

#@markdown ---
#@markdown ### Settings
# Folder name of the book under the-lost-chapter/content/books/.
BOOK_ID = "gentle-mind" #@param {type:"string"}
# File stem of the voice profile: the-lost-chapter/voices/<VOICE_PROFILE>.pt.
VOICE_PROFILE = "default" #@param {type:"string"}
# Owner and name of the GitHub repository that is cloned and pushed to.
GITHUB_USERNAME = "nmnhut-it" #@param {type:"string"}
REPO_NAME = "english-learning-app" #@param {type:"string"}
# Branch that is cloned/pulled and later pushed.
BRANCH = "main" #@param {type:"string"}

import subprocess
import sys
import os

# ========== STEP 1: Install ==========
# Print the step banner, then pip-install the TTS/audio packages into the
# interpreter that is running this notebook.
print("="*50, "üì¶ STEP 1: Installing dependencies...", "="*50, sep="\n")
pip_packages = ["coqui-tts", "torchcodec", "soundfile", "pydub"]
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", *pip_packages],
    check=True,
)

# These imports come after the install step because some of the packages are
# only available once pip has finished.
import torch
import json
import re
import numpy as np
import soundfile as sf
from pathlib import Path
from datetime import datetime
from google.colab import userdata
from IPython.display import Audio, display, HTML

# Report which GPU (if any) the runtime exposes.
gpu_name = torch.cuda.get_device_name() if torch.cuda.is_available() else 'None'
print(f"‚úÖ Installed! GPU: {gpu_name}")

# ========== STEP 2: Clone Repo ==========
print("\n" + "="*50)
print("üì• STEP 2: Cloning repository...")
print("="*50)

# Read the token from Colab Secrets. If that fails for any reason, fall back
# to a prompt. `except Exception` (not a bare `except`) keeps Ctrl-C / kernel
# interrupts working, and getpass keeps the token out of the saved cell output.
try:
    GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
except Exception:
    from getpass import getpass
    GITHUB_TOKEN = getpass("Enter GitHub token: ")

# NOTE(security): the token is embedded in the remote URL, so it ends up in the
# clone's .git/config and in any git error output that echoes the URL.
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
REPO_DIR = Path(f"/content/{REPO_NAME}")

# Reuse an existing checkout (re-running the cell) or make a shallow clone.
if REPO_DIR.exists():
    os.chdir(REPO_DIR)
    subprocess.run(["git", "pull", "origin", BRANCH], check=True)
else:
    subprocess.run(["git", "clone", "--depth", "1", "-b", BRANCH, REPO_URL, str(REPO_DIR)], check=True)

# Commit identity used for the push at the end of the cell.
os.chdir(REPO_DIR)
subprocess.run(["git", "config", "user.email", "colab@thelostchapter.app"])
subprocess.run(["git", "config", "user.name", "TheLostChapter CMS"])

# Repository layout used by the later steps.
CONTENT_DIR = REPO_DIR / "the-lost-chapter" / "content" / "books"
VOICES_DIR = REPO_DIR / "the-lost-chapter" / "voices"
BOOK_DIR = CONTENT_DIR / BOOK_ID
AUDIO_DIR = BOOK_DIR / "audio"
AUDIO_DIR.mkdir(parents=True, exist_ok=True)

print(f"‚úÖ Repository ready!")

# ========== STEP 3: Load Model ==========
print("\n" + "="*50, "üöÄ STEP 3: Loading viXTTS model...", "="*50, sep="\n")

from huggingface_hub import hf_hub_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts import tokenizer as xtts_tokenizer

# Patch for Vietnamese: keep a handle on the stock preprocess_text so every
# other language still goes through it unchanged.
_orig_preprocess = xtts_tokenizer.VoiceBpeTokenizer.preprocess_text
def _patched(self, txt, lang):
    """Preprocess tokenizer input, with a lightweight path for ``lang == "vi"``.

    Vietnamese text only has double quotes removed and whitespace runs
    collapsed to single spaces, then is stripped. Any other language is
    delegated to the original ``preprocess_text``.
    """
    if lang != "vi":
        return _orig_preprocess(self, txt, lang)
    without_quotes = txt.replace('"', '')
    return re.sub(r'\s+', ' ', without_quotes).strip()
xtts_tokenizer.VoiceBpeTokenizer.preprocess_text = _patched

# Local cache for the viXTTS checkpoint files; only missing files are fetched,
# so re-running the cell does not download them again.
MODEL_DIR = Path("/content/models/vixtts")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
for f in ("config.json", "model.pth", "vocab.json"):
    if (MODEL_DIR / f).exists():
        continue
    hf_hub_download(repo_id="capleaf/viXTTS", filename=f, local_dir=str(MODEL_DIR))

# Build the model from its config, load the weights and vocabulary, and move
# it to the GPU (this step requires a CUDA runtime).
config = XttsConfig()
config.load_json(str(MODEL_DIR / "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=str(MODEL_DIR / "model.pth"),
    vocab_path=str(MODEL_DIR / "vocab.json"),
)
model.cuda()
print("‚úÖ Model loaded on GPU!")

# ========== STEP 4: Load Voice ==========
# Defines `gpt_cond_latent` and `speaker_embedding`, either from a saved
# profile in the repository or by cloning a freshly uploaded voice sample.
print("\n" + "="*50)
print("üé§ STEP 4: Loading voice profile...")
print("="*50)

voice_file = VOICES_DIR / f"{VOICE_PROFILE}.pt"
if voice_file.exists():
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only load profiles from a repository you trust.
    voice_data = torch.load(voice_file, weights_only=False)
    gpt_cond_latent = voice_data["gpt_cond_latent"].cuda()
    speaker_embedding = voice_data["speaker_embedding"].cuda()
    print(f"‚úÖ Voice profile loaded: {VOICE_PROFILE}")
    print(f"   Created: {voice_data.get('created', 'unknown')}")
else:
    print(f"‚ùå Voice profile not found: {voice_file}")
    print(f"\nAvailable profiles: {[f.stem for f in VOICES_DIR.glob('*.pt')]}")
    print("\nPlease upload a voice sample to create one.")
    from google.colab import files
    uploaded = files.upload()
    if not uploaded:
        # Without a sample the speaker latents are never defined and audio
        # generation would later fail with a NameError; stop here instead.
        raise RuntimeError(
            f"No voice sample uploaded; cannot create voice profile '{VOICE_PROFILE}'."
        )

    from pydub import AudioSegment
    # Only the first uploaded file is used as the speaker reference.
    uploaded_file = next(iter(uploaded))
    # Case-insensitive so that e.g. "sample.MP3" is converted as well.
    if uploaded_file.lower().endswith('.mp3'):
        audio = AudioSegment.from_mp3(uploaded_file)
        wav_path = "/content/speaker.wav"
        audio.set_frame_rate(22050).set_channels(1).export(wav_path, format="wav")
    else:
        wav_path = uploaded_file
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=wav_path)
    # Save profile (tensors moved to CPU first) so it can be committed and
    # reused on the next run.
    VOICES_DIR.mkdir(parents=True, exist_ok=True)
    torch.save({
        "gpt_cond_latent": gpt_cond_latent.cpu(),
        "speaker_embedding": speaker_embedding.cpu(),
        "source": uploaded_file,
        "created": datetime.now().isoformat()
    }, voice_file)
    print(f"‚úÖ Voice cloned and saved as: {VOICE_PROFILE}")

# ========== STEP 5: Generate Audio ==========
# Banner for the generation step; the helpers and the chapter loop follow.
print("\n" + "="*50, "üéµ STEP 5: Generating audio for all chapters...", "="*50, sep="\n")

def extract_vietnamese(text):
    """Return the Vietnamese portion of bilingual markdown as one string.

    Each line is stripped and then handled as follows:
    - lines wrapped in ``*...*`` (the italic English lines) are dropped;
    - blank lines and ``---`` rules are dropped;
    - heading lines lose their leading ``#`` characters;
    - of whatever remains, only the text before the first ``|`` is kept.

    The surviving fragments are joined with single spaces.
    """
    kept = []
    for raw in text.split('\n'):
        entry = raw.strip()
        # Italic lines carry the English translation.
        if entry.startswith('*') and entry.endswith('*'):
            continue
        # Separators and blank lines contribute nothing.
        if entry in ('---', ''):
            continue
        # Headings are spoken too, minus their markdown markers.
        if entry.startswith('#'):
            entry = entry.lstrip('#').strip()
        # "Vietnamese | English": keep the left-hand side only.
        vietnamese = entry.partition('|')[0].strip()
        if vietnamese:
            kept.append(vietnamese)
    return ' '.join(kept)

def generate_audio(text, output_path, pause=0.5, sample_rate=24000):
    """Synthesize ``text`` sentence by sentence and save audio plus timestamps.

    The text is split on ``.``, ``!`` and ``?``; fragments shorter than three
    characters are dropped. Each remaining sentence is synthesized with the
    module-level ``model`` using ``gpt_cond_latent`` / ``speaker_embedding``
    and is followed by ``pause`` seconds of silence. The combined waveform is
    written to ``output_path``, and a list of ``{"start", "end", "text"}``
    entries (seconds) is written next to it with a ``.json`` suffix.

    Args:
        text: Text to speak; passed to the model with language code ``"vi"``.
        output_path: Destination audio file (``str`` or ``Path``).
        pause: Seconds of silence appended after every sentence.
        sample_rate: Samples per second used for the duration math and for the
            written file. Defaults to 24000, the value previously hard-coded
            here; presumably the model's output rate -- confirm before changing.

    Returns:
        Duration of the written audio in seconds, trailing pause included.

    Raises:
        ValueError: If ``text`` contains no sentence of at least 3 characters.
    """
    output_path = Path(output_path)

    # Filter up front so the progress counter reflects what is actually spoken.
    fragments = (part.strip() for part in re.split(r'[.!?]', text))
    sentences = [s for s in fragments if len(s) >= 3]
    if not sentences:
        # Fail with a clear message instead of np.concatenate's
        # "need at least one array to concatenate".
        raise ValueError("No speakable sentences found in text; nothing to synthesize.")

    all_audio = []
    silence = np.zeros(int(sample_rate * pause))
    timestamps = []
    current_time = 0.0

    for i, sentence in enumerate(sentences, start=1):
        print(f"  [{i}/{len(sentences)}] {sentence[:40]}...")
        # The split removed the terminator; re-add a period for the model.
        out = model.inference(sentence + ".", "vi", gpt_cond_latent, speaker_embedding, temperature=0.7)
        audio_data = out["wav"]

        duration = len(audio_data) / sample_rate
        timestamps.append({
            "start": round(current_time, 2),
            "end": round(current_time + duration, 2),
            "text": sentence
        })
        # The next sentence starts after this one plus the inserted pause.
        current_time += duration + pause

        all_audio.append(audio_data)
        all_audio.append(silence)

    combined = np.concatenate(all_audio)
    sf.write(str(output_path), combined, sample_rate)

    # Save timestamps alongside the audio (same name, .json suffix).
    ts_path = output_path.with_suffix('.json')
    with open(ts_path, 'w', encoding='utf-8') as f:
        json.dump(timestamps, f, ensure_ascii=False, indent=2)

    return len(combined) / sample_rate

# Load book
# The book and chapter files hold Vietnamese text, so decode them explicitly
# as UTF-8 rather than relying on the platform default encoding.
with open(BOOK_DIR / "book.json", encoding="utf-8") as f:
    book = json.load(f)

print(f"\nüìñ Book: {book['title']}")
print(f"üìë Chapters: {book['chapters']}\n")

# book['chapters'] lists chapter ids; each id maps to chapters/<id>.json.
for chapter_id in book['chapters']:
    chapter_file = BOOK_DIR / "chapters" / f"{chapter_id}.json"
    with open(chapter_file, encoding="utf-8") as f:
        chapter = json.load(f)

    print(f"\n--- {chapter_id}: {chapter['title']} ---")

    # Collect Vietnamese text from the markdown sections only.
    all_text = []
    for section in chapter.get('sections', []):
        if section.get('type') == 'markdown':
            vi_text = extract_vietnamese(section.get('content', ''))
            if vi_text:
                all_text.append(vi_text)

    full_text = ' '.join(all_text)
    if not full_text.strip():
        print("  ‚ö† No Vietnamese text found, skipping...")
        continue

    # One WAV (plus its timestamp JSON) per chapter, named <chapter_id>-vi.wav.
    output_file = AUDIO_DIR / f"{chapter_id}-vi.wav"
    duration = generate_audio(full_text, output_file)
    print(f"  ‚úÖ Generated: {output_file.name} ({duration:.1f}s)")

# ========== STEP 6: Push to GitHub ==========
print("\n" + "="*50)
print("üöÄ STEP 6: Pushing to GitHub...")
print("="*50)

os.chdir(REPO_DIR)
# Stage generated audio/timestamps and any newly saved voice profile. These
# are deliberately unchecked: a missing directory only means nothing to add.
subprocess.run(["git", "add", "the-lost-chapter/content/"])
subprocess.run(["git", "add", "the-lost-chapter/voices/"])

# `git diff --cached --quiet` exits 0 when nothing is staged.
result = subprocess.run(["git", "diff", "--cached", "--quiet"])
if result.returncode == 0:
    print("‚ö† No changes to commit.")
else:
    git_steps = (
        ("commit", ["git", "commit", "-m", f"Generate audio for {BOOK_ID} with viXTTS"]),
        ("push", ["git", "push", "origin", BRANCH]),
    )
    # Run commit then push, stopping at the first failure so that success is
    # only reported when the push really went through.
    failed_step = None
    for step_name, git_cmd in git_steps:
        if subprocess.run(git_cmd).returncode != 0:
            failed_step = step_name
            break
    if failed_step is None:
        print(f"‚úÖ Pushed to GitHub!")
    else:
        print(f"git {failed_step} failed; changes were NOT pushed. See the git output above.")

# ========== DONE ==========
# Final banner, followed by where the results live locally and on GitHub.
print("\n" + "="*50, "üéâ ALL DONE!", "="*50, sep="\n")
print(f"\nüìÅ Audio files: {AUDIO_DIR}")
print(f"üåê GitHub: https://github.com/{GITHUB_USERNAME}/{REPO_NAME}")

# List generated files: every WAV in the audio folder, sorted, with its size
# converted from bytes to MB.
print("\nüìã Generated files:")
for f in sorted(AUDIO_DIR.glob("*.wav")):
    size = f.stat().st_size / (1024 * 1024)
    print(f"   {f.name} ({size:.1f} MB)")

In [None]:
#@title üîä Preview Audio
from pathlib import Path
from IPython.display import Audio, display

chapter = "ch01" #@param ["ch01", "ch02", "ch03"]

# Look up the chapter's generated WAV in AUDIO_DIR (defined by the main cell)
# and embed a player for it, or report that it is missing.
audio_file = AUDIO_DIR / f"{chapter}-vi.wav"
if not audio_file.exists():
    print(f"‚ùå File not found: {audio_file}")
else:
    print(f"üéß Playing: {audio_file.name}")
    display(Audio(str(audio_file)))

In [None]:
#@title üì• Download All Audio
import shutil
from google.colab import files

# Zip the whole audio folder and hand it to the browser for download.
# The archive base name is built directly instead of stripping ".zip" with
# str.replace, which would also remove a ".zip" occurring inside BOOK_ID.
# make_archive returns the path of the file it actually wrote.
archive_base = f"/content/{BOOK_ID}_audio"
zip_file = shutil.make_archive(archive_base, 'zip', AUDIO_DIR)
files.download(zip_file)
print(f"üì• Downloading: {BOOK_ID}_audio.zip")