In [None]:
# @title 1. Install Dependencies
# Install the main library (voxcpm) and the UI tools. modelscope, gradio,
# soundfile and torch are imported directly by the later cells; torchaudio and
# torchcodec are presumably runtime requirements of voxcpm (verify against its
# install docs). `-q` keeps pip's output quiet.
!pip install -q voxcpm modelscope gradio soundfile torch torchaudio torchcodec

# Install the system ffmpeg binary for audio processing.
# `-y` answers apt's confirmation prompt automatically so the cell never blocks.
!apt-get install -y ffmpeg

# The cells are meant to be run top to bottom; tell the user to continue.
print("Dependencies installed. Please run the next cell.")

In [None]:
# @title 2. Load Model (Fixed)
import os
import torch
import soundfile as sf
import numpy as np
from modelscope import snapshot_download

# --- THE FIX ---
# We monkeypatch torch.compile to prevent the CUDA Graph crash.
# This forces the model to run in standard (Eager) mode.
def no_op_compile(model=None, *args, **kwargs):
    """Stand-in for ``torch.compile`` that skips compilation entirely.

    Supports both ways ``torch.compile`` can be invoked:

    * ``torch.compile(model, ...)`` -- the object is returned unchanged.
    * ``torch.compile(...)`` with no positional argument (decorator-factory
      form, e.g. ``@torch.compile(mode="reduce-overhead")``) -- a decorator is
      returned that hands the decorated object back unchanged.

    Args:
        model: The module or callable that would have been compiled, or
            ``None`` when used as a decorator factory.
        *args: Ignored; accepted for signature compatibility.
        **kwargs: Ignored; accepted for signature compatibility.

    Returns:
        ``model`` itself, or a pass-through decorator when ``model`` is None.
    """
    if model is None:
        # Decorator-factory call: defer until the real target is supplied.
        def passthrough(target):
            return no_op_compile(target)

        return passthrough

    print(f"Skipping compilation for {type(model).__name__} to prevent Colab crash.")
    return model

# Apply the patch before `voxcpm` is imported below, so that any
# `torch.compile` lookup made afterwards resolves to the no-op.
# NOTE(review): presumably voxcpm calls torch.compile while loading the
# model -- confirm against the library source.
torch.compile = no_op_compile
# ----------------

from voxcpm import VoxCPM

# Check for GPU: fail fast with an actionable message instead of a later crash.
if not torch.cuda.is_available():
    raise SystemError("GPU not found. Please go to Runtime -> Change runtime type -> T4 GPU")

print("Downloading and loading VoxCPM1.5... (This may take a minute)")

# 1. Download/Load Main Model
# The model is mandatory: the Gradio cell reads the global `model`. If loading
# fails we log the reason and re-raise so the cell stops here, rather than
# printing "System Ready." and failing later with a NameError.
try:
    # Initialize the model
    model = VoxCPM.from_pretrained("openbmb/VoxCPM1.5")
except Exception as e:
    print(f"Error loading model: {e}")
    raise
else:
    print("✅ VoxCPM1.5 Loaded successfully (Eager Mode).")

# 2. Download Helper Models (for Denoising support)
# Best-effort only: a failure here is reported as a warning and the notebook
# continues, since these models are only needed for the optional denoise path.
try:
    print("Downloading ZipEnhancer for denoising support...")
    snapshot_download('iic/speech_zipenhancer_ans_multiloss_16k_base')
    snapshot_download('iic/SenseVoiceSmall')
    print("✅ Helper models downloaded.")
except Exception as e:
    print(f"Warning: Could not download helper models. Denoising might not work. Error: {e}")

print("System Ready.")

In [None]:
# @title 3. Run Gradio Interface
import gradio as gr
import uuid

def generate_speech(
    text,
    prompt_audio,
    prompt_text,
    cfg_value,
    inference_timesteps,
    normalize,
    denoise
):
    """Synthesize speech for ``text`` and return the path of the written WAV.

    Args:
        text: Text to speak. Must contain at least one non-whitespace character.
        prompt_audio: Optional reference audio for voice cloning. Either a file
            path (str) or a ``(sample_rate, data)`` tuple; a tuple is written
            to a temporary WAV because the model is given a file path.
        prompt_text: Optional transcription of the reference audio. Blank or
            whitespace-only values are passed to the model as ``None``.
        cfg_value: Guidance value; converted to ``float`` before use.
        inference_timesteps: Number of inference steps; converted to ``int``.
        normalize: Forwarded to ``model.generate`` as ``normalize``.
        denoise: Forwarded to ``model.generate`` as ``denoise``.

    Returns:
        The filename (relative to the working directory) of the generated WAV.

    Raises:
        gr.Error: If ``text`` is empty or if any step of generation fails.
    """
    # Reject missing/blank text up front; slicing None below would otherwise
    # raise a raw TypeError that never reaches the UI as a readable message.
    if not text or not text.strip():
        raise gr.Error("Generation failed: input text is empty.")

    # Create a unique filename for the output
    output_filename = f"output_{uuid.uuid4()}.wav"

    prompt_path = None
    # Set only when this call creates the prompt file, so cleanup never
    # deletes a file that was handed to us by Gradio.
    temp_prompt_path = None

    try:
        # Handle Prompt Audio (Voice Cloning)
        if prompt_audio is not None:
            # Gradio passes audio as (sample_rate, data) or filepath depending on config.
            # VoxCPM expects a filepath.
            if isinstance(prompt_audio, str):
                prompt_path = prompt_audio
            else:
                # If it's a tuple, save it temporarily
                sr, data = prompt_audio
                temp_prompt_path = f"temp_prompt_{uuid.uuid4()}.wav"
                sf.write(temp_prompt_path, data, sr)
                prompt_path = temp_prompt_path

        print(f"Generating: '{text[:30]}...' | CFG: {cfg_value} | Steps: {inference_timesteps}")

        # Run Generation
        wav = model.generate(
            text=text,
            prompt_wav_path=prompt_path,
            prompt_text=prompt_text if prompt_text and prompt_text.strip() != "" else None,
            cfg_value=float(cfg_value),
            inference_timesteps=int(inference_timesteps),
            normalize=normalize,
            denoise=denoise,
            retry_badcase=True
        )

        # Save output
        sf.write(output_filename, wav, model.tts_model.sample_rate)

        return output_filename

    except Exception as e:
        # Surface the failure in the Gradio UI, keeping the original traceback.
        raise gr.Error(f"Generation failed: {str(e)}") from e

    finally:
        # Cleanup temp prompt if created -- runs on success and on failure so
        # failed generations do not leave stray files behind.
        if temp_prompt_path is not None and os.path.exists(temp_prompt_path):
            os.remove(temp_prompt_path)

# Define the UI Layout
with gr.Blocks(title="VoxCPM 1.5 Demo", theme=gr.themes.Soft()) as demo:
    # Page header. The emoji was previously stored as mis-decoded bytes
    # (mojibake); it is restored to the intended microphone glyph.
    gr.Markdown(
        """
        # 🎙️ VoxCPM 1.5
        **Tokenizer-Free TTS for Context-Aware Speech Generation and True-to-Life Voice Cloning**
        """
    )

    # Two-column layout: inputs and settings on the left, result on the right.
    with gr.Row():
        with gr.Column():
            # Input Section
            txt_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text here (English or Chinese)...",
                lines=3,
                value="VoxCPM is an innovative end-to-end TTS model designed to generate highly expressive speech."
            )

            with gr.Accordion("Voice Cloning (Optional)", open=True):
                gr.Markdown("*Upload a short audio clip (3-10s) to clone the voice.*")
                # type="filepath" makes Gradio hand generate_speech a path string,
                # which is the form the model call expects.
                audio_prompt = gr.Audio(
                    label="Reference Audio",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                txt_prompt = gr.Textbox(
                    label="Reference Text (Optional)",
                    placeholder="Transcription of the reference audio. Improves cloning accuracy.",
                    info="If left empty, the model attempts to infer it, but providing text is better."
                )

            with gr.Accordion("Advanced Settings", open=False):
                slider_cfg = gr.Slider(
                    minimum=1.0, maximum=5.0, value=2.0, step=0.1,
                    label="CFG Value",
                    info="Higher = follows text/prompt closer. Lower = more expressive/random."
                )
                slider_steps = gr.Slider(
                    minimum=5, maximum=50, value=10, step=1,
                    label="Inference Timesteps",
                    info="Higher = better quality, slower speed."
                )
                chk_normalize = gr.Checkbox(
                    label="Normalize Text",
                    value=False,
                    info="Enable for standard text. Disable if using phonemes {HH AH0...}."
                )
                chk_denoise = gr.Checkbox(
                    label="Denoise Prompt",
                    value=False,
                    info="Removes noise from reference audio, but restricts output to 16kHz."
                )

            btn_gen = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            # Output Section
            audio_out = gr.Audio(label="Generated Audio")

    # Connect function
    # The order of `inputs` must match the positional parameters of
    # generate_speech(text, prompt_audio, prompt_text, cfg_value,
    # inference_timesteps, normalize, denoise).
    btn_gen.click(
        fn=generate_speech,
        inputs=[
            txt_input,
            audio_prompt,
            txt_prompt,
            slider_cfg,
            slider_steps,
            chk_normalize,
            chk_denoise
        ],
        outputs=[audio_out]
    )

# Launch the app.
# share=True requests a public Gradio share link (needed to reach the UI from
# outside a hosted notebook); debug=True keeps the cell running and prints
# errors to the cell output. Behavior as described in the Gradio launch() docs.
demo.launch(share=True, debug=True)