In [None]:
# Fetch the VibeVoice fork, install it in editable mode, then install the
# extra tools the Gradio UI below relies on (ffmpeg and gradio).
!git clone https://github.com/minipasila/VibeVoice.git
%cd VibeVoice
!pip install -e .
# -y pre-answers apt's confirmation prompt so the cell does not stall or abort
# waiting for input.
!apt install -y ffmpeg  # or brew install ffmpeg on macOS
!pip install gradio

In [None]:
import gradio as gr
import os
import subprocess
import tempfile
import shutil
from pathlib import Path
import json
import torch
import warnings
warnings.filterwarnings("ignore")

class VibeVoiceInterface:
    def __init__(self):
        """Set up model/directory settings, the preset voice table and the example texts.

        Side effects: creates the output directory if it is missing and reads
        the example text files via ``load_example_texts``.
        """
        self.model_path = "microsoft/VibeVoice-1.5B"
        self.output_dir = "./outputs"
        self.demo_dir = "./demo"
        self.voices_dir = self.demo_dir + "/voices"
        self.text_examples_dir = self.demo_dir + "/text_examples"

        # Generated audio is looked up here, so the directory must exist
        os.makedirs(self.output_dir, exist_ok=True)

        # (voice name, description) pairs for the presets in demo/voices;
        # the dropdown label is "<name> (<description>)" and maps back to <name>.
        presets = (
            ("Alice", "English Woman"),
            ("Carter", "English Man"),
            ("Frank", "English Man"),
            ("Mary", "English Woman with BGM"),
            ("Maya", "English Woman"),
            ("Samuel", "Indian Man"),
            ("Anchen", "Chinese Man with BGM"),
            ("Bowen", "Chinese Man"),
            ("Xinran", "Chinese Woman"),
        )
        self.available_voices = {
            f"{voice} ({description})": voice for voice, description in presets
        }

        # Example scripts offered in the UI, keyed by display title
        self.example_texts = self.load_example_texts()

    def load_example_texts(self):
        """Load example texts from the demo/text_examples directory.

        Every ``*.txt`` file in ``self.text_examples_dir`` is read as UTF-8 and
        stored under a display title derived from its file name (extension
        removed, underscores turned into spaces, title-cased).

        Returns:
            dict mapping display title -> stripped file content, in file-name
            order. Empty when the directory does not exist. Files that cannot
            be read or decoded are reported on stdout and skipped.
        """
        examples = {}
        if not os.path.isdir(self.text_examples_dir):
            return examples

        suffix = '.txt'
        # os.listdir order is arbitrary; sort so the UI lists examples in a
        # stable order from run to run.
        for file_name in sorted(os.listdir(self.text_examples_dir)):
            if not file_name.endswith(suffix):
                continue
            try:
                with open(os.path.join(self.text_examples_dir, file_name), 'r', encoding='utf-8') as f:
                    content = f.read().strip()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error loading {file_name}: {e}")
                continue
            # Drop only the trailing extension, not every ".txt" inside the name
            title = file_name[:-len(suffix)].replace('_', ' ').title()
            examples[title] = content
        return examples

    def setup_environment(self):
        """Check that the prerequisites for running VibeVoice are present.

        Verifies that the demo directory is reachable from the current working
        directory and that an ``ffmpeg`` executable is on ``PATH``.

        Returns:
            str: a message starting with ``"Error:"`` describing the first
            missing prerequisite, or ``"Environment setup complete!"``.
        """
        # The inference script and voices live under the demo directory
        if not os.path.isdir(self.demo_dir):
            return "Error: Please run this from the VibeVoice directory or clone the repository first."

        # shutil.which searches PATH in-process, so this works on platforms
        # that have no `which` command.
        if shutil.which("ffmpeg") is None:
            return "Error: ffmpeg not found. Please install it first."

        return "Environment setup complete!"

    def generate_speech(self, text, speaker1_voice, speaker2_voice, speaker3_voice, speaker4_voice,
                       example_text_selection, custom_voice_1, custom_voice_2, custom_voice_3, custom_voice_4,
                       progress=gr.Progress()):
        """Generate speech from text by running the VibeVoice file-inference script.

        Args:
            text: Script to synthesize. Lines shaped like ``Speaker N: ...`` are
                scanned to find which speaker numbers are in use.
            speaker1_voice, speaker2_voice, speaker3_voice, speaker4_voice:
                Preset voice labels (keys of ``self.available_voices``) or
                ``"None"`` for speakers 1-4.
            example_text_selection: Title of a loaded example; when it matches
                an entry of ``self.example_texts`` that example replaces ``text``.
            custom_voice_1, custom_voice_2, custom_voice_3, custom_voice_4:
                Optional uploaded voice samples. Each may be a path (``str`` /
                ``os.PathLike``) or an object exposing the path as ``.name``.
            progress: Gradio progress tracker, called with a fraction and a
                description at each stage.

        Returns:
            tuple: ``(audio_path, status_message)``. ``audio_path`` is ``None``
            whenever generation did not produce a usable file; any exception is
            converted into an ``"Error: ..."`` status instead of propagating.
        """
        temp_text_path = None
        try:
            progress(0.1, desc="Setting up...")

            # A selected example takes precedence over the textbox content
            if example_text_selection and example_text_selection != "None":
                if example_text_selection in self.example_texts:
                    text = self.example_texts[example_text_selection]

            if not text or not text.strip():
                return None, "Please provide text to convert to speech."

            # delete=False because the subprocess must reopen the file by name;
            # removal is handled in the `finally` clause below.
            with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False,
                                             encoding='utf-8') as temp_text_file:
                temp_text_path = temp_text_file.name
                temp_text_file.write(text)

            progress(0.3, desc="Preparing voices...")

            voice_map = {
                1: speaker1_voice,
                2: speaker2_voice,
                3: speaker3_voice,
                4: speaker4_voice
            }

            # Collect the speaker numbers used in "Speaker N:" line prefixes
            unique_speakers = set()
            for line in text.strip().split('\n'):
                label, separator, _ = line.partition(':')
                label = label.strip().lower()
                if not separator or not label.startswith('speaker'):
                    continue
                try:
                    unique_speakers.add(int(label[len('speaker'):].strip()))
                except ValueError:
                    # e.g. "Speaker A:" - not a numbered speaker line
                    continue

            # One voice name per detected speaker, in ascending speaker order.
            # Speakers without a usable preset selection are skipped.
            speakers = []
            for speaker_num in sorted(unique_speakers):
                selection = voice_map.get(speaker_num)
                if selection and selection != "None":
                    speakers.append(self.available_voices.get(selection, selection))

            # Default to Alice and Frank if no speakers detected
            if not speakers:
                speakers = ["Alice", "Frank"]

            progress(0.5, desc="Processing custom voices...")

            custom_voices = [custom_voice_1, custom_voice_2, custom_voice_3, custom_voice_4]
            for position, custom_voice in enumerate(custom_voices, 1):
                if custom_voice is None:
                    continue

                # Uploads may arrive as a plain path or as a file-like wrapper
                # that carries the path in `.name`; accept both.
                if isinstance(custom_voice, (str, os.PathLike)):
                    source_path = os.fspath(custom_voice)
                else:
                    source_path = custom_voice.name

                # Copy the sample into the voices directory; the voice is then
                # referred to by its file name without extension.
                file_name = os.path.basename(source_path)
                shutil.copy2(source_path, os.path.join(self.voices_dir, file_name))
                custom_voice_name = os.path.splitext(file_name)[0]

                # Override the voice at the same list position, or extend the
                # list when there are fewer speakers than this slot number.
                if position <= len(speakers):
                    speakers[position - 1] = custom_voice_name
                else:
                    speakers.append(custom_voice_name)

            cmd = [
                "python", "demo/inference_from_file.py",
                "--model_path", self.model_path,
                "--txt_path", temp_text_path,
                "--speaker_names"
            ] + speakers

            # Remember what was already in the output directory so the fallback
            # lookup below cannot hand back audio from an earlier run.
            previous_outputs = {}
            if os.path.isdir(self.output_dir):
                for name in os.listdir(self.output_dir):
                    if name.endswith("_generated.wav"):
                        previous_outputs[name] = os.path.getmtime(f"{self.output_dir}/{name}")

            progress(0.7, desc="Running VibeVoice inference...")

            result = subprocess.run(cmd, capture_output=True, text=True, cwd="./")

            if result.returncode != 0:
                return None, f"Error generating speech: {result.stderr}"

            progress(0.9, desc="Finalizing output...")

            # Expected output name: <temp file stem>_generated.wav
            base_name = os.path.splitext(os.path.basename(temp_text_path))[0]
            output_file = f"{self.output_dir}/{base_name}_generated.wav"

            if os.path.exists(output_file):
                progress(1.0, desc="Complete!")
                return output_file, "Speech generated successfully!"

            # Fallback: accept any *_generated.wav that is new or was rewritten
            # during this run, preferring the most recently modified one.
            fresh_outputs = []
            if os.path.isdir(self.output_dir):
                for name in os.listdir(self.output_dir):
                    if not name.endswith("_generated.wav"):
                        continue
                    candidate = f"{self.output_dir}/{name}"
                    if previous_outputs.get(name) != os.path.getmtime(candidate):
                        fresh_outputs.append(candidate)

            if fresh_outputs:
                progress(1.0, desc="Complete!")
                return max(fresh_outputs, key=os.path.getmtime), "Speech generated successfully!"

            return None, "No output file generated. Please check the logs."

        except Exception as e:
            # UI boundary: report the failure in the status box instead of raising
            return None, f"Error: {str(e)}"

        finally:
            # Always remove the temporary script, including on error paths
            if temp_text_path is not None:
                try:
                    os.unlink(temp_text_path)
                except OSError:
                    # Best-effort cleanup; a leftover temp file is not fatal
                    pass

    def load_example_text(self, selection):
        """Return the example text stored under ``selection``.

        An empty string is returned when nothing is selected, when the
        selection is the ``"None"`` placeholder, or when no example with that
        title has been loaded.
        """
        if not selection or selection == "None":
            return ""
        return self.example_texts.get(selection, "")

    def create_interface(self):
        """Build the Gradio Blocks UI and wire up its events.

        Layout, top to bottom: intro text, a row with the text input column
        and the voice selection column, the generate button, the audio/status
        outputs, two collapsible info sections and a set of quick examples.

        Returns:
            The assembled ``gr.Blocks`` object (not yet launched).
        """
        # Initial preset for speaker slots 1-4, in slot order
        default_presets = (
            "Alice (English Woman)",
            "Frank (English Man)",
            "Maya (English Woman)",
            "Carter (English Man)",
        )

        with gr.Blocks(title="VibeVoice - Advanced Text-to-Speech", theme=gr.themes.Soft()) as interface:
            gr.Markdown("""
            # 🎙️ VibeVoice - Advanced Multi-Speaker Text-to-Speech

            Generate high-quality speech with multiple speakers and natural conversations.

            **Instructions:**
            1. Choose from example texts or write your own
            2. Select voices for each speaker OR upload custom voice files
            3. Custom voices will override the selected preset voices
            4. Click "Generate Speech" to create audio

            **Text Format:** Use `Speaker 1:`, `Speaker 2:`, etc. for multi-speaker conversations.
            """)

            with gr.Row():
                # Left column: example picker and script textbox
                with gr.Column(scale=2):
                    gr.Markdown("### 📝 Text Input")

                    example_dropdown = gr.Dropdown(
                        choices=["None"] + list(self.example_texts),
                        value="None",
                        label="Load Example Text"
                    )
                    gr.Markdown("*Select a pre-made example or choose 'None' to use custom text*")

                    text_input = gr.Textbox(
                        lines=10,
                        placeholder="Enter your text here...\n\nExample:\nSpeaker 1: Hello, how are you today?\nSpeaker 2: I'm doing great, thanks for asking!\nSpeaker 3: What are your plans for the weekend?\nSpeaker 4: I'm thinking of going hiking in the mountains.",
                        label="Text to Convert"
                    )
                    gr.Markdown("*Use 'Speaker 1:', 'Speaker 2:', etc. for different speakers*")

                    # Picking an example copies its text into the textbox
                    example_dropdown.change(
                        fn=self.load_example_text,
                        inputs=[example_dropdown],
                        outputs=[text_input]
                    )

                # Right column: preset dropdowns followed by custom uploads
                with gr.Column(scale=1):
                    gr.Markdown("### 🎭 Voice Selection")
                    gr.Markdown("*Choose preset voices for each speaker*")

                    speaker_dropdowns = []
                    for slot, preset in enumerate(default_presets, start=1):
                        speaker_dropdowns.append(
                            gr.Dropdown(
                                choices=["None"] + list(self.available_voices),
                                value=preset,
                                label=f"Speaker {slot} Voice"
                            )
                        )
                    speaker1_voice, speaker2_voice, speaker3_voice, speaker4_voice = speaker_dropdowns

                    gr.Markdown("### 🎵 Custom Voices (Optional)")
                    gr.Markdown("*Upload voice samples to override preset voices. Custom voices take priority.*")

                    custom_voice_files = [
                        gr.File(label=f"Custom Voice for Speaker {slot}")
                        for slot in range(1, 5)
                    ]

            with gr.Row():
                generate_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg")

            with gr.Row():
                with gr.Column():
                    audio_output = gr.Audio(label="Generated Speech", type="filepath")
                    status_output = gr.Textbox(label="Status", interactive=False)

            # Preview of up to three loaded examples, each cut to 200 characters
            with gr.Accordion("📚 Available Example Texts", open=False):
                if self.example_texts:
                    previews = []
                    for title, body in list(self.example_texts.items())[:3]:
                        tail = '...' if len(body) > 200 else ''
                        previews.append(f"**{title}:**\n```\n{body[:200]}{tail}\n```")
                    gr.Markdown("\n\n".join(previews))
                else:
                    gr.Markdown("*No example texts found. Make sure you're running from the VibeVoice directory.*")

            # Static setup notes
            with gr.Accordion("⚙️ Setup Information", open=False):
                gr.Markdown("""
                ### Prerequisites:
                1. Clone the VibeVoice repository: `git clone https://github.com/minipasila/VibeVoice.git`
                2. Install dependencies: `pip install -e .`
                3. Install ffmpeg: `apt install ffmpeg` (Ubuntu) or `brew install ffmpeg` (macOS)
                4. Run this script from the VibeVoice directory

                ### Voice Files:
                - Demo voices are in `demo/voices/`
                - Custom voices should be WAV, MP3, or FLAC format
                - Upload custom voices to override the selected preset voices
                - Custom voice files should be clear, single-speaker recordings
                - Recommended: 3-10 second voice samples work best

                ### Model Information:
                - Using VibeVoice-1.5B model (suitable for T4 GPU)
                - For better quality, use VibeVoice-7B on more powerful hardware

                ### Multi-Voice Usage:
                - Mix and match: Use custom voices for some speakers, presets for others
                - Priority: Custom uploaded voices override preset selections
                - Fallback: If no custom voice uploaded, uses selected preset voice
                """)

            # Input order must match the parameter order of generate_speech:
            # text, four presets, example selection, four custom uploads.
            generate_btn.click(
                fn=self.generate_speech,
                inputs=[text_input, *speaker_dropdowns, example_dropdown, *custom_voice_files],
                outputs=[audio_output, status_output],
                show_progress=True
            )

            # Clickable sample rows: script text plus a preset for each speaker slot
            quick_examples = [
                ["Speaker 1: Welcome to VibeVoice!\nSpeaker 2: This is amazing technology for speech synthesis.",
                 "Alice (English Woman)", "Frank (English Man)", "None", "None"],
                ["Speaker 1: Can you tell me about the weather today?\nSpeaker 2: It's sunny and warm, perfect for a walk outside.\nSpeaker 3: That sounds lovely for our picnic!",
                 "Maya (English Woman)", "Carter (English Man)", "Alice (English Woman)", "None"],
                ["Speaker 1: 你好，今天天气怎么样？\nSpeaker 2: 今天天气很好，阳光明媚。",
                 "Xinran (Chinese Woman)", "Bowen (Chinese Man)", "None", "None"],
                ["Speaker 1: Let's discuss the quarterly results.\nSpeaker 2: Sales have increased by 15% this quarter.\nSpeaker 3: That's excellent progress!\nSpeaker 4: What are our targets for next quarter?",
                 "Frank (English Man)", "Maya (English Woman)", "Carter (English Man)", "Alice (English Woman)"],
            ]
            gr.Examples(
                examples=quick_examples,
                inputs=[text_input, speaker1_voice, speaker2_voice, speaker3_voice, speaker4_voice],
                label="Quick Examples"
            )

        return interface

def main():
    """Run the environment check, then build and launch the Gradio app.

    Setup problems are reported on stdout but do not prevent the UI from
    launching, so the user can still see the interface and its setup notes.
    """
    vv_interface = VibeVoiceInterface()

    # Report the outcome of the prerequisite check
    setup_result = vv_interface.setup_environment()
    print(setup_result)

    # Compare case-insensitively so a failure message is recognised whether it
    # spells the word "Error" or "error".
    if "error" in setup_result.lower():
        print("⚠️  Setup issues detected. Please resolve them before running.")
        print("Make sure you're in the VibeVoice directory and have installed dependencies.")

    interface = vv_interface.create_interface()

    # NOTE: share=True exposes the app through a public link
    interface.launch(
        share=True,            # Create a public link
        debug=True,            # Enable debug mode
        show_error=True        # Show detailed errors
    )

if __name__ == "__main__":
    main()