# 🎧 One-Click Audio Generator

Generate audio for **Trí Nhớ Dịu Dàng** with your cloned voice.

## ⚡ Quick Start
1. **Runtime → Change runtime type → T4 GPU**
2. Add `GITHUB_TOKEN` to Colab Secrets (🔑 sidebar)
3. **Run All** (Ctrl+F9)

Your voice file (`my-voice.m4a`) is already in the repo! ✅

In [None]:
#@title ‚ö° ONE CLICK - Run Everything { display-mode: "form" }

#@markdown ### Settings
# Each assignment below carries a `#@param` marker (Colab form field), so keep
# one assignment per line and leave the trailing marker in place.

# Sub-directory of the-lost-chapter/content/books/ holding book.json,
# chapters/ and audio/ for the book to process.
BOOK_ID = "gentle-mind" #@param {type:"string"}
# Base name of the cached voice profile: the-lost-chapter/voices/<name>.pt.
VOICE_PROFILE = "default" #@param {type:"string"}
# When True, chapters whose "<chapter_id>-vi.wav" already exists are not
# regenerated.
SKIP_EXISTING_AUDIO = True #@param {type:"boolean"}
# GitHub account and repository used to build the clone URL.
GITHUB_USERNAME = "nmnhut-it" #@param {type:"string"}
REPO_NAME = "english-learning-app" #@param {type:"string"}
# Branch that is cloned/pulled at the start and pushed to at the end.
BRANCH = "main" #@param {type:"string"}

import subprocess, sys, os

# ========== STEP 1: Install ==========
# The pip install has to finish before the third-party imports below can
# resolve, which is why those imports live here and not at the top of the cell.
_banner = "=" * 50
print(_banner)
print("üì¶ STEP 1: Installing dependencies...")
print(_banner)

_pip_packages = ["coqui-tts", "torchcodec", "soundfile", "pydub"]
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", *_pip_packages],
    check=True,  # stop the cell if installation fails
)

import torch
import json
import re
import numpy as np
import soundfile as sf
from pathlib import Path
from datetime import datetime
from pydub import AudioSegment
from google.colab import userdata, files
from IPython.display import Audio, display, HTML

# Report the CUDA device name, or the literal text 'None' when CUDA is unavailable.
if torch.cuda.is_available():
    _gpu_name = torch.cuda.get_device_name()
else:
    _gpu_name = 'None'
print(f"‚úÖ Installed! GPU: {_gpu_name}")

# ========== STEP 2: Clone Repo ==========
print("\n" + "="*50)
print("üì• STEP 2: Cloning repository...")
print("="*50)

# Prefer the token stored in Colab Secrets and fall back to a prompt.
# `except Exception` (instead of a bare `except:`) still covers a missing or
# inaccessible secret but no longer swallows KeyboardInterrupt/SystemExit.
try:
    GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
except Exception:
    GITHUB_TOKEN = None
if not GITHUB_TOKEN:
    # getpass hides the typed value, so the token is not echoed into the
    # (saved and shareable) cell output the way input() would.
    from getpass import getpass
    GITHUB_TOKEN = getpass("Enter GitHub token: ")

REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
REPO_DIR = Path(f"/content/{REPO_NAME}")

if REPO_DIR.exists():
    # Cell re-run: update the existing checkout instead of cloning again.
    subprocess.run(["git", "pull", "origin", BRANCH], cwd=REPO_DIR, check=True)
else:
    try:
        subprocess.run(["git", "clone", "--depth", "1", "-b", BRANCH, REPO_URL, str(REPO_DIR)], check=True)
    except subprocess.CalledProcessError as err:
        # CalledProcessError's message contains the full command line, i.e.
        # the URL with the token in it. Re-raise without the original context
        # so the secret does not end up in the traceback.
        raise RuntimeError(
            f"git clone failed (exit code {err.returncode}); "
            "check the token, repository name and branch"
        ) from None

# Later steps run git from the working directory, so switch into the checkout.
os.chdir(REPO_DIR)
# Identity used for the commit created in the push step.
subprocess.run(["git", "config", "user.email", "colab@thelostchapter.app"], check=True)
subprocess.run(["git", "config", "user.name", "TheLostChapter CMS"], check=True)

CONTENT_DIR = REPO_DIR / "the-lost-chapter" / "content" / "books"
VOICES_DIR = REPO_DIR / "the-lost-chapter" / "voices"
VOICES_DIR.mkdir(parents=True, exist_ok=True)
BOOK_DIR = CONTENT_DIR / BOOK_ID
AUDIO_DIR = BOOK_DIR / "audio"
AUDIO_DIR.mkdir(parents=True, exist_ok=True)

# Show what we have: cached voice profiles (.pt) and raw voice recordings.
existing_profiles = list(VOICES_DIR.glob('*.pt'))
voice_samples = list(VOICES_DIR.glob('*.m4a')) + list(VOICES_DIR.glob('*.mp3')) + list(VOICES_DIR.glob('*.wav'))
print("‚úÖ Repository ready!")
print(f"üé§ Voice profiles (.pt): {[f.stem for f in existing_profiles] if existing_profiles else 'None'}")
print(f"üéµ Voice samples: {[f.name for f in voice_samples] if voice_samples else 'None'}")

# ========== STEP 3: Load Model ==========
print("\n" + "="*50)
print("üöÄ STEP 3: Loading viXTTS model...")
print("="*50)

from huggingface_hub import hf_hub_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts import tokenizer as xtts_tokenizer

# Patch for Vietnamese: for lang == "vi" the tokenizer's own preprocessing is
# skipped and the text is only stripped of double quotes and has its
# whitespace collapsed. Every other language goes to the original method.
#
# The patch has to be idempotent because notebook cells get re-run. If the
# tokenizer already carries our wrapper, recover the genuine original from the
# attribute stored on it. Otherwise `_orig_preprocess` would point at the
# previous wrapper, which resolves the same global name and would recurse
# forever for any non-"vi" language.
_current_preprocess = xtts_tokenizer.VoiceBpeTokenizer.preprocess_text
_orig_preprocess = getattr(_current_preprocess, "_vi_patch_original", _current_preprocess)


def _patched(self, txt, lang):
    """Replacement for ``VoiceBpeTokenizer.preprocess_text``.

    For ``lang == "vi"`` return *txt* with double quotes removed, whitespace
    runs collapsed to single spaces and the ends stripped. For any other
    language return whatever the original method returns.
    """
    if lang == "vi":
        txt = txt.replace('"', '')
        txt = re.sub(r'\s+', ' ', txt)
        return txt.strip()
    return _orig_preprocess(self, txt, lang)


# Remember the real original on the wrapper so a later re-run can find it.
_patched._vi_patch_original = _orig_preprocess
xtts_tokenizer.VoiceBpeTokenizer.preprocess_text = _patched

MODEL_DIR = Path("/content/models/vixtts")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Fetch each checkpoint file from the "capleaf/viXTTS" hub repo unless a copy
# already sits in MODEL_DIR.
for asset_name in ("config.json", "model.pth", "vocab.json"):
    if (MODEL_DIR / asset_name).exists():
        print(f"  ‚úì {asset_name} (cached)")
        continue
    print(f"  Downloading {asset_name}...")
    hf_hub_download(repo_id="capleaf/viXTTS", filename=asset_name, local_dir=str(MODEL_DIR))

# Build the model from its JSON config, load the weights and vocabulary, then
# move it to the GPU.
config = XttsConfig()
config.load_json(str(MODEL_DIR / "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=str(MODEL_DIR / "model.pth"),
    vocab_path=str(MODEL_DIR / "vocab.json"),
)
model.cuda()
print("‚úÖ Model loaded on GPU!")

# ========== STEP 4: Load or Clone Voice ==========
print("\n" + "="*50)
print("üé§ STEP 4: Loading voice profile...")
print("="*50)

def convert_to_wav(input_file):
    """Decode *input_file* with pydub and re-export it as mono 22050 Hz WAV.

    The result is always written to ``/content/speaker.wav`` (replacing any
    earlier conversion) and that path is returned as a string.
    """
    wav_path = "/content/speaker.wav"
    source = str(input_file)
    ext = Path(input_file).suffix.lower()

    # Choose the loader from the lower-cased extension; any other extension is
    # handed to from_file() without a format hint.
    loaders = {
        '.m4a': lambda: AudioSegment.from_file(source, format='m4a'),
        '.mp3': lambda: AudioSegment.from_mp3(source),
        '.wav': lambda: AudioSegment.from_wav(source),
    }
    load = loaders.get(ext, lambda: AudioSegment.from_file(source))

    audio = load()
    audio = audio.set_frame_rate(22050).set_channels(1)
    audio.export(wav_path, format="wav")
    # len() of a pydub segment is in milliseconds, hence the /1000.
    print(f"  ‚úì Converted {ext} ‚Üí wav ({len(audio)/1000:.1f}s)")
    return wav_path

voice_file = VOICES_DIR / f"{VOICE_PROFILE}.pt"


def _clone_and_save_voice(wav_path, source_name):
    """Compute voice conditioning from *wav_path* and cache it in ``voice_file``.

    The saved profile holds CPU copies of both tensors plus the source file
    name, the creation time and the model name. Returns the pair
    ``(gpt_cond_latent, speaker_embedding)`` moved to the GPU.
    """
    latent, embedding = model.get_conditioning_latents(audio_path=wav_path)
    torch.save({
        "gpt_cond_latent": latent.cpu(),
        "speaker_embedding": embedding.cpu(),
        "source": source_name,
        "created": datetime.now().isoformat(),
        "model": "viXTTS"
    }, voice_file)
    return latent.cuda(), embedding.cuda()


if voice_file.exists():
    # ===== REUSE EXISTING PROFILE =====
    print(f"‚úÖ Found saved profile: {VOICE_PROFILE}.pt")
    # NOTE(review): weights_only=False unpickles arbitrary objects from a file
    # that comes out of the git repo. Profiles written by this notebook only
    # contain tensors and strings, so weights_only=True is probably enough.
    # Confirm no other tool writes richer profiles before switching.
    voice_data = torch.load(voice_file, weights_only=False)
    gpt_cond_latent = voice_data["gpt_cond_latent"].cuda()
    speaker_embedding = voice_data["speaker_embedding"].cuda()
    print(f"   Source: {voice_data.get('source', 'unknown')}")
    print(f"   Created: {voice_data.get('created', 'unknown')}")
    print(f"   üîÑ Reusing saved voice (instant!)")

elif voice_samples:
    # ===== AUTO-CLONE FROM EXISTING SAMPLE IN REPO =====
    sample_file = voice_samples[0]  # Use first found sample
    print(f"üéµ Found voice sample in repo: {sample_file.name}")
    print(f"   üß¨ Auto-cloning voice...")

    wav_path = convert_to_wav(sample_file)
    gpt_cond_latent, speaker_embedding = _clone_and_save_voice(wav_path, sample_file.name)

    print(f"   ‚úÖ Voice cloned and saved as: {VOICE_PROFILE}.pt")
    print(f"   üìù Next time will be instant!")

else:
    # ===== UPLOAD NEW SAMPLE =====
    print(f"‚ö† No voice found. Please upload a sample (mp3/m4a/wav):")
    uploaded = files.upload()

    if not uploaded:
        raise RuntimeError("‚ùå No file uploaded!")

    # Only the first uploaded file is used.
    uploaded_file = next(iter(uploaded))
    try:
        wav_path = convert_to_wav(uploaded_file)
        gpt_cond_latent, speaker_embedding = _clone_and_save_voice(wav_path, uploaded_file)
        print(f"‚úÖ Voice cloned and saved!")
    finally:
        # Remove the uploaded original even when conversion or cloning fails.
        os.remove(uploaded_file)

# ========== STEP 5: Generate Audio ==========
print("\n" + "="*50)
print("üéµ STEP 5: Generating audio for all chapters...")
print("="*50)

def extract_vietnamese(text):
    """Flatten a markdown section into a single line of text to narrate.

    Per line (after trimming whitespace):
      * lines that both start and end with '*', '---' rules and blank lines
        are dropped;
      * leading '#' heading markers are removed;
      * only the part before the first '|' is kept (presumably the Vietnamese
        side of "vi | en" lines -- verify against the content format).
    The surviving pieces are joined with single spaces.
    """
    kept = []
    for raw in text.split('\n'):
        entry = raw.strip()
        if entry.startswith('*') and entry.endswith('*'):
            continue
        if entry == '---' or entry == '':
            continue
        if entry.startswith('#'):
            entry = entry.lstrip('#').strip()
        if '|' in entry:
            entry = entry.split('|')[0].strip()
        if entry:
            kept.append(entry)
    return ' '.join(kept)

def generate_audio(text, output_path, pause=0.5, sample_rate=24000):
    """Synthesize *text* sentence by sentence into one WAV plus a timestamp JSON.

    The text is split on '.', '!' and '?'; fragments of 3 characters or fewer
    are dropped. Each remaining sentence is synthesized with the module-level
    ``model`` / ``gpt_cond_latent`` / ``speaker_embedding`` and followed by
    *pause* seconds of silence.

    Args:
        text: Text to narrate.
        output_path: Destination ``.wav`` path (str or Path). A sibling file
            with the ``.json`` suffix receives a list of
            ``{"start", "end", "text"}`` entries, times in seconds.
        pause: Silence inserted after every sentence, in seconds.
        sample_rate: Sample rate used for durations and for the written file.
            NOTE(review): 24000 is assumed to match the model's output rate --
            confirm before changing.

    Returns:
        Total duration of the written audio in seconds, or ``0.0`` when no
        usable sentence was found (in which case nothing is written).
    """
    output_path = Path(output_path)  # accept plain strings as well
    sentences = [s.strip() for s in re.split(r'[.!?]', text) if len(s.strip()) > 3]

    if not sentences:
        # np.concatenate([]) would raise ValueError, so stop before touching
        # the model or the disk.
        print("  (no sentence longer than 3 characters; nothing written)")
        return 0.0

    all_audio, timestamps = [], []
    silence = np.zeros(int(sample_rate * pause))
    current_time = 0.0
    total = len(sentences)

    for i, sentence in enumerate(sentences, start=1):
        print(f"  [{i}/{total}] {sentence[:40]}...")
        # The split removed the terminator, so a '.' is re-appended.
        out = model.inference(sentence + ".", "vi", gpt_cond_latent, speaker_embedding, temperature=0.7)
        audio_data = out["wav"]

        duration = len(audio_data) / sample_rate
        timestamps.append({"start": round(current_time, 2), "end": round(current_time + duration, 2), "text": sentence})
        # The next sentence starts after this one plus its trailing silence.
        current_time += duration + pause

        all_audio.extend([audio_data, silence])

    combined = np.concatenate(all_audio)
    sf.write(str(output_path), combined, sample_rate)

    with open(output_path.with_suffix('.json'), 'w', encoding='utf-8') as f:
        json.dump(timestamps, f, ensure_ascii=False, indent=2)

    return len(combined) / sample_rate

# The book/chapter JSON files are expected to contain Vietnamese text, so
# decode them explicitly as UTF-8 instead of using the platform default.
with open(BOOK_DIR / "book.json", encoding="utf-8") as f:
    book = json.load(f)

print(f"\nüìñ Book: {book['title']}")
print(f"üìë Chapters: {book['chapters']}")
print(f"‚è≠Ô∏è Skip existing: {'ON' if SKIP_EXISTING_AUDIO else 'OFF'}\n")

generated, skipped = 0, 0

# book['chapters'] is iterated as a list of chapter ids; each id names both
# chapters/<id>.json (input) and audio/<id>-vi.wav (output).
for chapter_id in book['chapters']:
    output_file = AUDIO_DIR / f"{chapter_id}-vi.wav"

    if SKIP_EXISTING_AUDIO and output_file.exists():
        print(f"‚è≠Ô∏è {chapter_id}: exists, skipping...")
        skipped += 1
        continue

    chapter_file = BOOK_DIR / "chapters" / f"{chapter_id}.json"
    with open(chapter_file, encoding="utf-8") as f:
        chapter = json.load(f)

    print(f"\n--- {chapter_id}: {chapter['title']} ---")

    # Only sections typed 'markdown' contribute text; empty results are dropped.
    all_text = [extract_vietnamese(s.get('content', ''))
                for s in chapter.get('sections', []) if s.get('type') == 'markdown']
    full_text = ' '.join(filter(None, all_text))

    if not full_text.strip():
        print("  ‚ö† No text, skipping...")
        # Count it, so the summary line accounts for every chapter.
        skipped += 1
        continue

    duration = generate_audio(full_text, output_file)
    print(f"  ‚úÖ {output_file.name} ({duration:.1f}s)")
    generated += 1

print(f"\nüìä Done: {generated} generated, {skipped} skipped")

# ========== STEP 6: Push ==========
print("\n" + "="*50)
print("üöÄ STEP 6: Pushing to GitHub...")
print("="*50)

os.chdir(REPO_DIR)
subprocess.run(["git", "add", "the-lost-chapter/"])

# `git diff --cached --quiet` exits 0 when nothing is staged.
result = subprocess.run(["git", "diff", "--cached", "--quiet"])
if result.returncode == 0:
    print("‚ö† No changes to commit.")
else:
    # Only report success when both commit and push actually succeeded; the
    # push is skipped if the commit failed. A failure is reported instead of
    # raised so the summary below (and the download cell) stay usable.
    committed = subprocess.run(["git", "commit", "-m", f"Generate audio for {BOOK_ID}"]).returncode == 0
    pushed = committed and subprocess.run(["git", "push", "origin", BRANCH]).returncode == 0
    if pushed:
        print("‚úÖ Pushed!")
    else:
        print("Push failed - see the git output above. Generated files remain in the local checkout.")

print("\n" + "="*50)
print("üéâ ALL DONE!")
print("="*50)
# List every WAV in the book's audio directory with its size in MB.
for f in sorted(AUDIO_DIR.glob("*.wav")):
    print(f"   üîä {f.name} ({f.stat().st_size/1024/1024:.1f} MB)")

In [None]:
#@title üîä Preview Audio
chapter = "ch01" #@param ["ch01", "ch02", "ch03"]
audio_file = AUDIO_DIR / f"{chapter}-vi.wav"
if audio_file.exists():
    display(Audio(str(audio_file)))
else:
    print(f"‚ùå Not found: {audio_file}")

In [None]:
#@title üì• Download Audio
import shutil
shutil.make_archive(f"/content/{BOOK_ID}_audio", 'zip', AUDIO_DIR)
files.download(f"/content/{BOOK_ID}_audio.zip")