## 1️⃣ Setup Environment

Run this cell **once** to install dependencies and download models (~313 MB).

In [None]:
#@title üîß Install Dependencies & Download Models { display-mode: "form" }
#@markdown This will take 2-5 minutes on first run.

import os
import sys
from pathlib import Path

# Check if running on Colab
IN_COLAB = 'google.colab' in sys.modules
print(f"Running on: {'Google Colab' if IN_COLAB else 'Local Environment'}")

# Install kokoro package
print("\n[1/3] Installing kokoro package...")
!pip install -q kokoro>=0.9.2 soundfile scipy

# Install espeak-ng for phonemization
print("\n[2/3] Installing espeak-ng...")
if IN_COLAB:
    !apt-get -qq install espeak-ng > /dev/null 2>&1
    print("  espeak-ng installed")
else:
    print("  Please ensure espeak-ng is installed on your system")

# Step 3: the model files come from the HuggingFace Hub and are kept under ./models
print("\n[3/3] Downloading Kokoro-82M model and voices...")
from huggingface_hub import hf_hub_download

# Hub repository id and the local directory layout used by the rest of the notebook.
REPO = 'hexgrad/Kokoro-82M'
MODELS_DIR = Path('models')
KOKORO_DIR, VOICES_DIR = MODELS_DIR / 'kokoro-82m', MODELS_DIR / 'voices'

# Create both target directories (and any missing parents); no error if they exist.
os.makedirs(KOKORO_DIR, exist_ok=True)
os.makedirs(VOICES_DIR, exist_ok=True)

# Supported voices (32 total): voice id -> human-readable category.
# The id prefix encodes language and gender (e.g. `af` = American Female).
# Ids are expanded category by category, which fixes the dict's iteration order:
# American English (20), British English (8), Spanish (3), French (1).
VOICES = {
    voice_id: category
    for category, voice_ids in (
        ('American Female', (
            'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica', 'af_kore',
            'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
        )),
        ('American Male', (
            'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
            'am_onyx', 'am_puck', 'am_santa',
        )),
        ('British Female', ('bf_alice', 'bf_emma', 'bf_isabella', 'bf_lily')),
        ('British Male', ('bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis')),
        ('Spanish Female', ('ef_dora',)),
        ('Spanish Male', ('em_alex', 'em_santa')),
        ('French Female', ('ff_siwis',)),
    )
    for voice_id in voice_ids
}

# Base model: fetch each file only if it is not already in KOKORO_DIR.
for model_file in ('config.json', 'kokoro-v1_0.pth'):
    if (KOKORO_DIR / model_file).exists():
        print(f"  {model_file} already exists")
        continue
    print(f"  Downloading {model_file}...")
    hf_hub_download(REPO, model_file, local_dir=KOKORO_DIR)

# Voice packs: the repo-relative name `voices/<id>.pt` is mirrored under MODELS_DIR,
# so files end up in VOICES_DIR. Files that are already present are skipped.
voice_total = len(VOICES)
print(f"  Downloading {voice_total} voice files...")
for done, voice_id in enumerate(VOICES, start=1):
    remote_name = f'voices/{voice_id}.pt'
    if not (MODELS_DIR / remote_name).exists():
        hf_hub_download(REPO, remote_name, local_dir=MODELS_DIR)
    # Progress line after every eighth voice.
    if done % 8 == 0:
        print(f"    {done}/{voice_total} voices downloaded")

print("\n‚úÖ Setup complete! All models downloaded.")
print(f"\nüìä Available: {voice_total} voices across 4 languages")

## 2️⃣ Initialize TTS Engine

Load the Kokoro model and prepare for synthesis.

In [None]:
#@title üöÄ Initialize Kokoro TTS Engine { display-mode: "form" }

import torch
import soundfile as sf
from scipy.io import wavfile
from pathlib import Path
from kokoro import KPipeline
from IPython.display import Audio, display, HTML
import warnings
warnings.filterwarnings('ignore')

# Compute device: CUDA when torch reports a usable GPU, otherwise CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
if DEVICE == 'cuda':
    print(f"üéÆ Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("üíª Using CPU (GPU not available)")

# Language code -> (display name, code handed to the pipeline).
LANG_MAP = {
    code: (label, code)
    for code, label in (
        ('a', 'American English'),
        ('b', 'British English'),
        ('e', 'Spanish'),
        ('f', 'French'),
    )
}

# Two-letter voice-id prefix -> category heading used when listing voices.
VOICE_CATEGORIES = dict(
    af='American Female',
    am='American Male',
    bf='British Female',
    bm='British Male',
    ef='Spanish Female',
    em='Spanish Male',
    ff='French Female',
)

# Accepted output sample rates (Hz) and the inclusive speed bounds.
VALID_SAMPLE_RATES = [8000, 16000, 22050, 24000, 44100, 48000]
MIN_SPEED = 0.25
MAX_SPEED = 4.0

# Pipelines created so far, keyed by language code.
_pipelines = dict()

def get_pipeline(lang_code: str) -> KPipeline:
    """Return the pipeline for ``lang_code``, creating and caching it on first use.

    Pipelines are stored in the module-level ``_pipelines`` dict and are built
    on ``DEVICE``; later calls with the same code return the same object.
    """
    cached = _pipelines.get(lang_code)
    if cached is None:
        cached = KPipeline(lang_code=lang_code, device=DEVICE)
        _pipelines[lang_code] = cached
    return cached

def list_voices():
    """Print every voice id in ``VOICES``, grouped under its category heading.

    Groups follow the order of ``VOICE_CATEGORIES``; ids within a group are
    sorted alphabetically. Categories without any matching voice are omitted.
    """
    rule = "=" * 50
    print("\nüé§ Available Voices:")
    print(rule)
    for prefix, category in VOICE_CATEGORIES.items():
        matching = sorted(name for name in VOICES if name.startswith(prefix))
        if not matching:
            continue
        print(f"\n{category}:")
        for name in matching:
            print(f"  ‚Ä¢ {name}")
    print("\n" + rule)

def list_languages():
    """Print a fixed table of language codes, language names and voice counts."""
    rule = "=" * 50
    # Header, separator and one row per language; the counts are literal text.
    table = (
        "Code | Language          | Voices",
        "-----|-------------------|--------",
        "  a  | American English  | 20",
        "  b  | British English   | 8",
        "  e  | Spanish           | 3",
        "  f  | French            | 1",
    )
    print("\nüåç Supported Languages:")
    print(rule)
    for row in table:
        print(row)
    print(rule + "\n")

# Build the American English pipeline now so it is already cached when
# synthesize() is first called with its default language ('a').
print("\n‚è≥ Loading default pipeline (American English)...")
_ = get_pipeline(lang_code='a')
print("‚úÖ TTS Engine ready!\n")

# Finish the cell by printing the language table for reference.
list_languages()

## 3️⃣ Text-to-Speech Functions

Core functions for generating speech, modeled on the KTTS72 CLI.

In [None]:
#@title üîä Define TTS Functions { display-mode: "form" }

import numpy as np
from typing import Optional, Union

def _save_audio(audio_data, sample_rate: int, output_path: Path, verbose: bool) -> Path:
    """Write ``audio_data`` to ``output_path`` and return the path actually written.

    Any suffix other than ``.mp3`` is written directly with soundfile. For
    ``.mp3`` the audio is first written to a uniquely named temporary WAV next
    to the target and converted with ``ffmpeg``. If ffmpeg is missing or fails,
    the WAV is kept as ``<stem>.wav`` and that path is returned instead.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if output_path.suffix.lower() != '.mp3':
        sf.write(str(output_path), audio_data, sample_rate)
        if verbose:
            print(f"‚úÖ Saved: {output_path}")
        return output_path

    import os
    import subprocess
    import tempfile

    # A unique temp name is used instead of `<stem>.wav` so that a successful
    # MP3 export never overwrites (and then deletes) an existing WAV file.
    fd, temp_name = tempfile.mkstemp(suffix='.wav', dir=str(output_path.parent))
    os.close(fd)
    temp_wav = Path(temp_name)
    try:
        sf.write(str(temp_wav), audio_data, sample_rate)
        try:
            subprocess.run(
                ['ffmpeg', '-y', '-i', str(temp_wav), '-b:a', '192k', str(output_path)],
                capture_output=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            # ffmpeg failed or is not installed: keep the audio as a WAV file.
            fallback_wav = output_path.with_suffix('.wav')
            os.replace(temp_wav, fallback_wav)
            if verbose:
                print(f"‚ö†Ô∏è MP3 encoding failed, saved as WAV: {fallback_wav}")
            return fallback_wav
        if verbose:
            print(f"‚úÖ Saved: {output_path}")
        return output_path
    finally:
        # Remove the temp file on success and on unexpected errors; after the
        # fallback rename it no longer exists under this name.
        if temp_wav.exists():
            temp_wav.unlink()


def synthesize(
    text: str,
    voice: str = 'af_heart',
    lang: str = 'a',
    speed: float = 1.0,
    sample_rate: int = 24000,
    output_file: Optional[str] = None,
    play_audio: bool = True,
    verbose: bool = True
) -> Optional[np.ndarray]:
    """
    Synthesize speech from text.

    Args:
        text: Text to synthesize (non-empty, max 50,000 characters after stripping)
        voice: Voice name (e.g., 'af_heart', 'bm_lewis', 'ff_siwis'); must be a key of VOICES
        lang: Language code - 'a' (American), 'b' (British), 'e' (Spanish), 'f' (French)
        speed: Playback speed (0.25 - 4.0)
        sample_rate: Audio sample rate (8000, 16000, 22050, 24000, 44100, 48000)
        output_file: Optional output file path (.mp3 is converted with ffmpeg and
            falls back to .wav on failure; any other suffix is written directly)
        play_audio: Whether to play audio in notebook
        verbose: Whether to print progress info

    Returns:
        The synthesized audio as a numpy array. It is always returned, whether or
        not ``output_file`` is given.

    Raises:
        ValueError: If text is empty or too long, or voice, lang, speed or
            sample_rate is not an accepted value.
        FileNotFoundError: If the voice file is missing from VOICES_DIR.
        RuntimeError: If the pipeline produced no audio.
    """
    native_rate = 24000  # rate the pipeline output is assumed to be at

    # Validate inputs
    if not text or not text.strip():
        raise ValueError("Text cannot be empty")

    text = text.strip()
    if len(text) > 50000:
        raise ValueError(f"Text too long ({len(text)} chars). Maximum is 50,000 characters.")

    if voice not in VOICES:
        raise ValueError(f"Unknown voice '{voice}'. Use list_voices() to see available voices.")

    if lang not in LANG_MAP:
        raise ValueError(f"Unknown language '{lang}'. Valid: a, b, e, f")

    if not MIN_SPEED <= speed <= MAX_SPEED:
        raise ValueError(f"Speed must be between {MIN_SPEED} and {MAX_SPEED}")

    if sample_rate not in VALID_SAMPLE_RATES:
        raise ValueError(f"Invalid sample rate. Valid: {VALID_SAMPLE_RATES}")

    # The voice is passed to the pipeline as a path string to the downloaded .pt file
    voice_path = str(VOICES_DIR / f"{voice}.pt")
    if not Path(voice_path).exists():
        raise FileNotFoundError(f"Voice file not found: {voice_path}")

    if verbose:
        preview = text[:60] + ('...' if len(text) > 60 else '')
        print(f"üéôÔ∏è Synthesizing: '{preview}'")
        print(f"   Voice: {voice} ({VOICES[voice]})")
        print(f"   Language: {LANG_MAP[lang][0]}")
        print(f"   Speed: {speed}x, Sample Rate: {sample_rate} Hz")

    # Get pipeline
    pipeline = get_pipeline(lang)

    # Generate audio segment by segment
    audio_segments = []
    for _, _, audio in pipeline(text, voice=voice_path, speed=speed):
        if audio is None:
            # A segment without audio would make np.concatenate fail; skip it.
            continue
        if hasattr(audio, 'detach'):
            # Tensor-like segment (presumably a torch tensor): convert explicitly
            # so concatenation does not depend on implicit array conversion.
            audio = audio.detach().cpu().numpy()
        audio_segments.append(np.asarray(audio))

    if not audio_segments:
        raise RuntimeError("No audio generated")

    # Combine segments
    audio_data = np.concatenate(audio_segments)

    # Resample if needed (Kokoro outputs at 24000 Hz)
    if sample_rate != native_rate:
        from scipy import signal
        audio_data = signal.resample(audio_data, int(len(audio_data) * sample_rate / native_rate))

    # Save to file if requested
    if output_file:
        _save_audio(audio_data, sample_rate, Path(output_file), verbose)

    # Play audio in notebook
    if play_audio:
        display(Audio(audio_data, rate=sample_rate))

    return audio_data


def synthesize_file(
    text_file: str,
    output_file: str = 'output.wav',
    voice: str = 'af_heart',
    lang: str = 'a',
    speed: float = 1.0,
    sample_rate: int = 24000,
    play_audio: bool = True,
    verbose: bool = True,
) -> np.ndarray:
    """
    Synthesize speech from a text file.

    Reads the whole file as UTF-8 and forwards it to ``synthesize`` together
    with the remaining arguments.

    Args:
        text_file: Path to UTF-8 text file
        output_file: Output audio file path
        voice: Voice name
        lang: Language code
        speed: Playback speed
        sample_rate: Audio sample rate
        play_audio: Whether to play the audio in the notebook (default True,
            matching the previous fixed behaviour)
        verbose: Whether to print progress info (default True, matching the
            previous fixed behaviour)

    Returns:
        Audio data as numpy array

    Raises:
        FileNotFoundError: If ``text_file`` does not exist.
    """
    text_path = Path(text_file)
    if not text_path.exists():
        raise FileNotFoundError(f"Text file not found: {text_file}")

    text = text_path.read_text(encoding='utf-8')
    return synthesize(
        text=text,
        voice=voice,
        lang=lang,
        speed=speed,
        sample_rate=sample_rate,
        output_file=output_file,
        play_audio=play_audio,
        verbose=verbose
    )


def batch_synthesize(tasks: list, output_folder: str = 'generated_audio') -> list:
    """
    Run ``synthesize`` once per task and collect the output paths.

    A failing task is reported and skipped; it does not stop the batch.

    Args:
        tasks: List of dicts. ``text`` is required; ``voice``, ``lang``, ``speed``,
            ``sample_rate`` and ``filename`` are optional (filename defaults to
            ``output_<n>.wav`` with n counted from 1)
        output_folder: Folder to save generated audio files (created if missing)

    Returns:
        List of output file paths (as strings) for the tasks that did not raise
    """
    destination = Path(output_folder)
    destination.mkdir(parents=True, exist_ok=True)
    total = len(tasks)

    print(f"\nüéµ Batch synthesizing {total} files...")
    print(f"üìÅ Output folder: {destination.absolute()}")
    print("-" * 50)

    written = []
    for index, task in enumerate(tasks, start=1):
        try:
            print(f"\n[{index}/{total}] Processing...")
            target = destination / task.get('filename', f'output_{index}.wav')
            synthesize(
                text=task['text'],
                voice=task.get('voice', 'af_heart'),
                lang=task.get('lang', 'a'),
                speed=task.get('speed', 1.0),
                sample_rate=task.get('sample_rate', 24000),
                output_file=str(target),
                play_audio=False,
                verbose=True
            )
        except Exception as e:
            # Best effort: report the failure and continue with the next task.
            print(f"‚ùå Error: {e}")
        else:
            written.append(str(target))

    print(f"\n‚úÖ Completed! Generated {len(written)}/{total} files.")
    return written


# Confirmation banner followed by one bullet per helper defined in this notebook.
print("‚úÖ TTS functions loaded!")
print("\nüìö Available functions:")
print("\n".join(
    f"  ‚Ä¢ {entry}"
    for entry in (
        "synthesize(text, voice, lang, speed, sample_rate, output_file)",
        "synthesize_file(text_file, output_file, voice, lang, speed)",
        "batch_synthesize(tasks, output_folder)",
        "list_voices()",
        "list_languages()",
    )
))

---

## 4️⃣ Quick Start Examples

Try these examples to get started!

In [None]:
#@title üéØ Example 1: Simple Text-to-Speech { display-mode: "form" }

# Basic usage - just provide text!
# The trailing semicolon keeps the returned sample array from being echoed as the
# cell result; the audio player is still shown by synthesize() itself.
synthesize("Hello! This is a test of the Kokoro text to speech system.");

In [None]:
#@title üéØ Example 2: Different Voices and Languages { display-mode: "form" }

# American English - Male voice
print("\nüá∫üá∏ American English (Male):")
synthesize("Hello from New York City!", voice="am_adam", lang="a")

# British English - Female voice
# The trailing semicolon on the last call keeps the returned sample array from
# being echoed as the cell result.
print("\nüá¨üáß British English (Female):")
synthesize("Good afternoon from London!", voice="bf_emma", lang="b");

In [None]:
#@title üéØ Example 3: Spanish and French { display-mode: "form" }

# Spanish
# The sentence is written with its real accented/inverted characters; the
# mis-encoded form would be phonemized as garbage.
print("\nüá™üá∏ Spanish:")
synthesize("¡Hola! ¿Cómo estás? Bienvenido a nuestro sistema.", voice="ef_dora", lang="e")

# French
# The trailing semicolon on the last call keeps the returned sample array from
# being echoed as the cell result.
print("\nüá´üá∑ French:")
synthesize("Bonjour! Comment allez-vous aujourd'hui?", voice="ff_siwis", lang="f");

In [None]:
#@title üéØ Example 4: Adjust Speed { display-mode: "form" }

# Same sentence at two speeds, using the default voice and language.
text = "The quick brown fox jumps over the lazy dog."

print("\nüê¢ Slow (0.8x):")
synthesize(text, speed=0.8)

# The trailing semicolon on the last call keeps the returned sample array from
# being echoed as the cell result.
print("\nüèÉ Fast (1.3x):")
synthesize(text, speed=1.3);

In [None]:
#@title üéØ Example 5: Save to File { display-mode: "form" }

# Synthesize, write the result to a WAV file and also play it inline.
synthesize(
    "This audio will be saved to a file.",
    voice="af_heart",
    output_file="my_audio.wav",
    play_audio=True,
)

# On Colab, hand the saved file to the browser as a download.
if IN_COLAB:
    from google.colab import files
    files.download('my_audio.wav')

In [None]:
#@title üéØ Example 6: Batch Generation (Multiple Files) { display-mode: "form" }

# Define multiple synthesis tasks
tasks = [
    {
        "text": "Hello, this is a test of American English.",
        "voice": "af_heart",
        "lang": "a",
        "speed": 1.0,
        "sample_rate": 24000,
        "filename": "english_american.wav"
    },
    {
        "text": "Good afternoon, this is British English speaking.",
        "voice": "bm_lewis",
        "lang": "b",
        "speed": 0.9,
        "sample_rate": 22050,
        "filename": "english_british.wav"
    },
    {
        "text": "Hola, esto es una prueba en espa√±ol.",
        "voice": "ef_dora",
        "lang": "e",
        "speed": 1.1,
        "sample_rate": 44100,
        "filename": "spanish_test.wav"
    },
    {
        "text": "Bonjour, ceci est un test en fran√ßais.",
        "voice": "ff_siwis",
        "lang": "f",
        "speed": 1.0,
        "sample_rate": 48000,
        "filename": "french_test.wav"
    }
]

# Generate all files
generated_files = batch_synthesize(tasks, "generated_audio")

# Download as zip (on Colab)
if IN_COLAB:
    !zip -r generated_audio.zip generated_audio/
    from google.colab import files
    files.download('generated_audio.zip')

---

## 5️⃣ Interactive Mode

Use the form below to generate speech interactively!

In [None]:
#@title üéÆ Interactive TTS Generator { display-mode: "form" }

#@markdown ### Enter your text and settings:

text_input = "Hello! Welcome to KTTS72, the Kokoro Text to Speech system." #@param {type:"string"}

voice_select = "af_heart" #@param ["af_alloy", "af_aoede", "af_bella", "af_heart", "af_jessica", "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky", "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael", "am_onyx", "am_puck", "am_santa", "bf_alice", "bf_emma", "bf_isabella", "bf_lily", "bm_daniel", "bm_fable", "bm_george", "bm_lewis", "ef_dora", "em_alex", "em_santa", "ff_siwis"]

language_select = "a" #@param ["a", "b", "e", "f"] {type:"string"}

speed_slider = 1.0 #@param {type:"slider", min:0.25, max:4.0, step:0.05}

sample_rate_select = 24000 #@param [8000, 16000, 22050, 24000, 44100, 48000] {type:"raw"}

save_to_file = False #@param {type:"boolean"}
output_filename = "output.wav" #@param {type:"string"}

#@markdown ---
#@markdown ### Click ‚ñ∂Ô∏è to generate!

# Run synthesis with the form values; a file is written only when the
# "save_to_file" box is ticked, otherwise no output path is passed.
synthesize(
    text_input,
    voice_select,
    language_select,
    speed_slider,
    sample_rate_select,
    output_file=(output_filename if save_to_file else None),
    play_audio=True,
    verbose=True,
)

# On Colab, offer the saved file as a browser download.
if IN_COLAB and save_to_file:
    from google.colab import files
    files.download(output_filename)

---

## 📖 Reference

### Voice List

In [None]:
# Show all available voices: prints every id in VOICES grouped under its
# category heading (requires the setup and engine cells to have been run).
list_voices()

### Language Codes

| Code | Language | # Voices |
|------|----------|----------|
| `a` | American English | 20 |
| `b` | British English | 8 |
| `e` | Spanish | 3 |
| `f` | French | 1 |

### Parameters

| Parameter | Type | Default | Range/Options |
|-----------|------|---------|---------------|
| `text` | str | required | 1-50,000 chars |
| `voice` | str | `af_heart` | See voice list |
| `lang` | str | `a` | a, b, e, f |
| `speed` | float | 1.0 | 0.25 - 4.0 |
| `sample_rate` | int | 24000 | 8000, 16000, 22050, 24000, 44100, 48000 |
| `output_file` | str | None | .wav or .mp3 |

---

## 📜 Credits

- **Kokoro-82M Model**: [Hexgrad](https://huggingface.co/hexgrad/Kokoro-82M)
- **KTTS72**: Based on the kokoro_announce library