In [None]:
# RUN THIS FIRST
!pip install torch==2.6.0 torchvision torchaudio gradio

In [None]:
# THEN RUN THIS ONE. NOTE: THIS DOES NOT WORK ON COLAB BECAUSE NUMPY FAILS TO BUILD WHEELS.
!pip install chatterbox-tts

In [None]:
# Chatterbox TTS Gradio UI for Google Colab/RunPod
# Run this to create an interactive TTS interface

# Install dependencies
import subprocess
import sys

def install_packages(packages=None):
    """Install packages with pip, reporting the outcome of each one.

    Each requirement is installed in its own ``pip install`` invocation using
    the interpreter that is running this code, so a failure of one package
    does not stop the remaining ones from being attempted.

    Args:
        packages: Optional iterable of pip requirement strings. A single
            string is treated as one requirement. When omitted or ``None``,
            the default set used by this notebook is installed.

    Returns:
        A list with the requirement strings whose installation failed; the
        list is empty when every install succeeded.
    """
    if packages is None:
        packages = [
            "chatterbox-tts",
            "gradio",
            "torchaudio",
            "torch",
        ]
    elif isinstance(packages, str):
        # Iterating a bare string would try to install it character by character.
        packages = [packages]

    failed = []
    for package in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except subprocess.CalledProcessError:
            print(f"✗ Failed to install {package}")
            failed.append(package)
        else:
            print(f"✓ Installed {package}")
    return failed

# Uncomment the line below to install packages (run once)
# install_packages()

import gradio as gr
import torchaudio as ta
import torch
import tempfile
import os
from pathlib import Path
import numpy as np

# Import Chatterbox models
# Both model classes are required below; if the package is missing, print a
# hint and exit with status 1 rather than hitting a NameError later on.
try:
    from chatterbox.tts import ChatterboxTTS
    from chatterbox.mtl_tts import ChatterboxMultilingualTTS
except ImportError:
    print("Chatterbox not installed. Please run install_packages() first.")
    sys.exit(1)

class ChatterboxGradioInterface:
    """Holds the Chatterbox models and exposes the callbacks used by the UI.

    Models are loaded lazily through ``load_english_model`` and
    ``load_multilingual_model``; until then the corresponding attribute is
    ``None`` and the matching ``generate_speech_*`` method returns a status
    message asking the user to load the model.

    Every ``generate_speech_*`` method returns a ``(audio_path, status)``
    tuple where ``audio_path`` is ``None`` whenever no audio was produced.
    """

    # Multilingual input is cut to this many characters before synthesis
    # (as per the official demo).
    MAX_MULTILINGUAL_CHARS = 300

    def __init__(self):
        self.english_model = None
        self.multilingual_model = None

        # Supported languages for multilingual model: display name -> language code
        self.supported_languages = {
            "Arabic": "ar",
            "Danish": "da",
            "German": "de",
            "Greek": "el",
            "English": "en",
            "Spanish": "es",
            "Finnish": "fi",
            "French": "fr",
            "Hebrew": "he",
            "Hindi": "hi",
            "Italian": "it",
            "Japanese": "ja",
            "Korean": "ko",
            "Malay": "ms",
            "Dutch": "nl",
            "Norwegian": "no",
            "Polish": "pl",
            "Portuguese": "pt",
            "Russian": "ru",
            "Swedish": "sv",
            "Swahili": "sw",
            "Turkish": "tr",
            "Chinese": "zh",
        }

    @staticmethod
    def _select_device():
        """Return "cuda" when a CUDA device is available, otherwise "cpu"."""
        return "cuda" if torch.cuda.is_available() else "cpu"

    def load_english_model(self):
        """Load the English-only Chatterbox model (no-op if already loaded).

        Returns:
            A status message for the UI.
        """
        if self.english_model is None:
            print("Loading English Chatterbox model...")
            device = self._select_device()
            self.english_model = ChatterboxTTS.from_pretrained(device=device)
            print(f"✓ English model loaded on {device}")
        return "English model loaded successfully!"

    def load_multilingual_model(self):
        """Load the multilingual Chatterbox model (no-op if already loaded).

        Returns:
            A status message for the UI.
        """
        if self.multilingual_model is None:
            print("Loading Multilingual Chatterbox model...")
            device = self._select_device()
            self.multilingual_model = ChatterboxMultilingualTTS.from_pretrained(device=device)
            print(f"✓ Multilingual model loaded on {device}")
        return "Multilingual model loaded successfully!"

    def _synthesize(self, model, text, audio_prompt, exaggeration, cfg_weight,
                    temperature, **extra_kwargs):
        """Generate audio with ``model`` and save it to a temporary WAV file.

        Args:
            model: A loaded model exposing ``generate(text, **kwargs)`` and ``sr``.
            text: Text passed to ``model.generate``.
            audio_prompt: Path of a reference clip, or ``None`` for no reference.
            exaggeration, cfg_weight, temperature: Forwarded to ``model.generate``.
            **extra_kwargs: Additional keyword arguments for ``model.generate``
                (e.g. ``language_id``).

        Returns:
            Path of the WAV file that was written. The file is not deleted
            automatically.

        Raises:
            Exception: Whatever ``model.generate`` or ``ta.save`` raises.
        """
        generate_kwargs = {
            "exaggeration": exaggeration,
            "temperature": temperature,
            "cfg_weight": cfg_weight,
            **extra_kwargs,
        }

        # Add audio prompt if provided
        if audio_prompt is not None:
            generate_kwargs["audio_prompt_path"] = audio_prompt

        wav = model.generate(text, **generate_kwargs)

        # Reserve a unique path, then close the handle before writing so the
        # save also works on platforms that forbid reopening an open file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            output_path = temp_file.name

        try:
            ta.save(output_path, wav, model.sr)
        except Exception:
            # Do not leave an empty orphan file behind when saving fails.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise
        return output_path

    def generate_speech_english(self, text, audio_prompt, exaggeration, cfg_weight, temperature):
        """Generate speech using English model.

        Args:
            text: Text to synthesize; ``None``, empty or whitespace-only text
                is rejected with a status message.
            audio_prompt: Optional path to a voice reference clip.
            exaggeration, cfg_weight, temperature: Generation parameters.

        Returns:
            ``(audio_path, status)``; ``audio_path`` is ``None`` on any failure.
        """
        if not text or not text.strip():
            return None, "Please enter some text to synthesize."

        if self.english_model is None:
            return None, "Please load the English model first."

        try:
            output_path = self._synthesize(
                self.english_model, text, audio_prompt,
                exaggeration, cfg_weight, temperature,
            )
        except Exception as e:
            # UI boundary: report the problem in the status box instead of raising.
            return None, f"Error generating speech: {str(e)}"

        return output_path, f"Generated speech successfully! Sample rate: {self.english_model.sr}Hz"

    def generate_speech_multilingual(self, text, language, audio_prompt, exaggeration, cfg_weight, temperature):
        """Generate speech using multilingual model.

        Args:
            text: Text to synthesize; ``None``, empty or whitespace-only text
                is rejected. Only the first ``MAX_MULTILINGUAL_CHARS``
                characters are synthesized.
            language: Display name from ``supported_languages``; unknown names
                fall back to the ``"en"`` language code.
            audio_prompt: Optional path to a voice reference clip.
            exaggeration, cfg_weight, temperature: Generation parameters.

        Returns:
            ``(audio_path, status)``; ``audio_path`` is ``None`` on any
            failure. The status mentions truncation when it happened.
        """
        if not text or not text.strip():
            return None, "Please enter some text to synthesize."

        if self.multilingual_model is None:
            return None, "Please load the multilingual model first."

        language_code = self.supported_languages.get(language, "en")
        limit = self.MAX_MULTILINGUAL_CHARS
        was_truncated = len(text) > limit

        try:
            output_path = self._synthesize(
                self.multilingual_model, text[:limit], audio_prompt,
                exaggeration, cfg_weight, temperature,
                language_id=language_code,
            )
        except Exception as e:
            # UI boundary: report the problem in the status box instead of raising.
            return None, f"Error generating speech: {str(e)}"

        status = f"Generated {language} speech successfully! Sample rate: {self.multilingual_model.sr}Hz"
        if was_truncated:
            # Make the otherwise silent cut visible to the user.
            status += f" (text truncated to {limit} characters)"
        return output_path, status

# Create the interface instance
# Module-level singleton: create_interface() binds its button callbacks to this
# object's methods and fills the language dropdown from its supported_languages.
chatterbox_interface = ChatterboxGradioInterface()

# Create Gradio interface
def create_interface():
    """Build the Chatterbox TTS Gradio app.

    Lays out the model-loading controls, the English and Multilingual
    synthesis tabs, a tips tab and a footer, and connects the buttons to the
    callbacks of the module-level ``chatterbox_interface``.

    Returns:
        The constructed ``gr.Blocks`` object; launching it is left to the caller.
    """

    # Factories for the widgets that both synthesis tabs declare identically.
    def voice_reference_input():
        return gr.Audio(
            label="Voice Reference (Optional)",
            type="filepath",
            sources=["upload", "microphone"],
        )

    def tuning_sliders(cfg_hint):
        # First row: exaggeration + CFG weight; second row: temperature.
        with gr.Row():
            exaggeration = gr.Slider(
                label="Exaggeration/Intensity",
                minimum=0.25,
                maximum=2.0,
                value=0.5,
                step=0.05,
                info="Higher values = more expressive speech (extreme values can be unstable)",
            )
            cfg_weight = gr.Slider(
                label="CFG/Pace Weight",
                minimum=0.0,
                maximum=1.0,
                value=0.5,
                step=0.05,
                info=cfg_hint,
            )

        with gr.Row():
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.05,
                maximum=5.0,
                value=0.8,
                step=0.05,
                info="Controls randomness in generation",
            )

        return exaggeration, cfg_weight, temperature

    def result_widgets():
        with gr.Row():
            audio_out = gr.Audio(label="Generated Speech", type="filepath")
            status_out = gr.Textbox(label="Status", interactive=False)
        return audio_out, status_out

    with gr.Blocks(title="Chatterbox TTS - Resemble AI", theme=gr.themes.Soft()) as demo:

        # Header
        gr.Markdown("""
        # 🎙️ Chatterbox TTS - Resemble AI

        Production-grade open source TTS models with emotion control and multilingual support.

        **Features:**
        - 🌍 23 languages supported (Multilingual model)
        - 🎭 Emotion exaggeration control
        - 🔊 Voice cloning with reference audio
        - ⚡ Ultra-stable alignment-informed inference
        """)

        # Model selection and loading
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🚀 Model Loading")
                english_loader = gr.Button("Load English Model", variant="primary")
                multilingual_loader = gr.Button("Load Multilingual Model", variant="primary")
                loader_status = gr.Textbox(label="Model Status", interactive=False)

        # English TTS Tab
        with gr.Tab("English TTS"):
            with gr.Row():
                with gr.Column(scale=2):
                    english_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter text in English...",
                        lines=3,
                        value="Ezreal and Jinx teamed up with Ahri, Yasuo, and Teemo to take down the enemy's Nexus in an epic late-game pentakill.",
                    )

                with gr.Column(scale=1):
                    english_reference = voice_reference_input()

            english_exaggeration, english_cfg, english_temperature = tuning_sliders(
                "Controls generation guidance and pacing"
            )

            english_go = gr.Button("🎵 Generate English Speech", variant="primary", size="lg")

            english_audio, english_status = result_widgets()

        # Multilingual TTS Tab
        with gr.Tab("Multilingual TTS"):
            with gr.Row():
                with gr.Column(scale=2):
                    multi_text = gr.Textbox(
                        label="Text to Synthesize (max 300 chars)",
                        placeholder="Enter text in any supported language...",
                        lines=3,
                        value="Bonjour, comment ça va? Ceci est le modèle de synthèse vocale multilingue Chatterbox.",
                    )
                    multi_language = gr.Dropdown(
                        label="Language",
                        choices=list(chatterbox_interface.supported_languages.keys()),
                        value="French",
                        info="Select the language of your text",
                    )

                with gr.Column(scale=1):
                    multi_reference = voice_reference_input()

            multi_exaggeration, multi_cfg, multi_temperature = tuning_sliders(
                "Controls generation guidance (0 for language transfer)"
            )

            multi_go = gr.Button("🌍 Generate Multilingual Speech", variant="primary", size="lg")

            multi_audio, multi_status = result_widgets()

        # Tips and Examples
        with gr.Tab("💡 Tips & Examples"):
            gr.Markdown("""
            ## Usage Tips

            ### General Use (TTS and Voice Agents):
            - Default settings (`exaggeration=0.5`, `cfg_weight=0.5`) work well for most prompts
            - If the reference speaker has a fast speaking style, try lowering `cfg_weight` to around `0.3`

            ### Expressive or Dramatic Speech:
            - Try lower `cfg_weight` values (e.g. `~0.3`) and increase `exaggeration` to `0.7` or higher
            - Higher `exaggeration` tends to speed up speech; reducing `cfg_weight` helps compensate

            ### Voice References:
            - Upload a clean audio file (WAV recommended) of the voice you want to clone
            - Ensure the reference clip matches the specified language tag
            - For language transfer, set CFG weight to 0 to avoid accent inheritance

            ### Temperature Control:
            - Higher values (1.0+) = more varied and creative output
            - Lower values (0.5-) = more consistent and predictable output

            ## Example Texts

            **English:**
            - "Welcome to the future of AI-generated speech synthesis!"
            - "The weather today is absolutely beautiful with clear blue skies."

            **French:**
            - "Bonjour, comment allez-vous aujourd'hui?"
            - "Paris est une ville magnifique avec beaucoup d'histoire."

            **Spanish:**
            - "¡Hola! ¿Cómo estás? Espero que tengas un día maravilloso."

            **Chinese:**
            - "你好，今天天气真不错，希望你有一个愉快的周末。"

            **Japanese:**
            - "こんにちは、今日はとても良い天气ですね。"
            """.replace("天气ですね", "天気ですね"))

        # Event handlers: both loaders report into the shared status box
        english_loader.click(
            fn=chatterbox_interface.load_english_model,
            outputs=loader_status,
        )

        multilingual_loader.click(
            fn=chatterbox_interface.load_multilingual_model,
            outputs=loader_status,
        )

        # Input order must match the callback signatures
        english_inputs = [
            english_text,
            english_reference,
            english_exaggeration,
            english_cfg,
            english_temperature,
        ]
        english_go.click(
            fn=chatterbox_interface.generate_speech_english,
            inputs=english_inputs,
            outputs=[english_audio, english_status],
        )

        multi_inputs = [
            multi_text,
            multi_language,
            multi_reference,
            multi_exaggeration,
            multi_cfg,
            multi_temperature,
        ]
        multi_go.click(
            fn=chatterbox_interface.generate_speech_multilingual,
            inputs=multi_inputs,
            outputs=[multi_audio, multi_status],
        )

        # Footer
        gr.Markdown("""
        ---
        <div style="text-align: center; color: #666;">
        Made with ❤️ by <strong>Resemble AI</strong> |
        <a href="https://resemble.ai">Website</a> |
        <a href="https://github.com/resemble-ai/chatterbox">GitHub</a> |
        <a href="https://huggingface.co/spaces/ResembleAI/Chatterbox">HuggingFace</a>
        </div>
        """)

    return demo

# Build the UI and start serving it when run as a script / notebook cell
if __name__ == "__main__":
    # Launch settings for Colab/RunPod (public link enabled)
    _launch_options = {
        "share": True,  # Creates public link
        "server_name": "0.0.0.0",  # Allow external access
        "server_port": 7860,  # Default Gradio port
        "show_error": True,  # Show detailed errors
        "quiet": False,  # Show startup messages
    }

    demo = create_interface()
    demo.launch(**_launch_options)