# 🎙️ VieNeu-TTS Voice Cloning System

Production-ready voice cloning notebook with automatic audio preprocessing.

**Features:**
- ✅ Auto-detect M4A files in project
- ✅ Multi-format support (M4A/MP3/WAV)
- ✅ Smart audio preprocessing
- ✅ Extract optimal segments (3-10s)
- ✅ Clean, maintainable code

## 1️⃣ System Check & Installation

In [None]:
# Check GPU availability.
# The leading `!` is the IPython shell escape: it runs `nvidia-smi` as a shell
# command and shows its output in the cell.
!nvidia-smi

In [None]:
# Install required packages
print("üì¶ Installing dependencies...")

# Python packages: the TTS library itself (with an extra package index for its
# llama-cpp-python CPU build), then the audio-processing libraries imported below.
!pip install -q vieneu --extra-index-url https://pnnbao97.github.io/llama-cpp-python-v0.3.16/cpu/
!pip install -q soundfile librosa pydub noisereduce
# System packages installed through apt.
!apt-get install -qq espeak-ng ffmpeg

# NOTE: this message is printed unconditionally; the exit status of the
# install commands above is not checked.
print("‚úÖ Installation complete!")

## 2️⃣ Configuration & Setup

In [None]:
# Import all required libraries
import os
import numpy as np
import soundfile as sf
import librosa
import noisereduce as nr
from pathlib import Path
from google.colab import files
from IPython.display import Audio, display, HTML
from pydub import AudioSegment
import warnings
warnings.filterwarnings('ignore')

# Constants
# Constants
class AudioConfig:
    """Tunable parameters shared by the audio preprocessing classes."""

    # Sample rate (Hz) used when converting, loading and writing audio.
    TARGET_SAMPLE_RATE = 24000
    # RMS level, in dB, that normalization scales the signal to
    # (converted to linear amplitude as 10 ** (dB / 20)).
    TARGET_DB = -20
    # Passed to noisereduce as `prop_decrease`.
    NOISE_REDUCTION_STRENGTH = 0.7
    # Bounds on the length of a candidate segment; compared against
    # durations expressed in seconds.
    MIN_SEGMENT_DURATION = 3
    MAX_SEGMENT_DURATION = 10
    # NOTE: despite the `_MS` suffix these two values are multiplied directly
    # by the sample rate to get a sample count, i.e. they are used as
    # seconds (0.025 s = 25 ms, 0.010 s = 10 ms).
    FRAME_LENGTH_MS = 0.025
    HOP_LENGTH_MS = 0.010
    # A frame counts as speech when its RMS exceeds mean(RMS) * this factor.
    RMS_THRESHOLD_MULTIPLIER = 0.5
    # Maximum number of candidate segments returned by the detector.
    MAX_SEGMENTS_TO_SHOW = 5

class Directories:
    """Names of the folders the notebook reads from and writes to."""

    UPLOADS = "uploads"
    PROCESSED = "processed"
    OUTPUTS = "outputs"
    PROJECT_VOICES = "/content/voices"  # Update this path if needed

    @classmethod
    def setup(cls):
        """Make sure the upload, processed and output folders exist.

        Existing folders are left untouched. PROJECT_VOICES is not created.
        """
        working_folders = (cls.UPLOADS, cls.PROCESSED, cls.OUTPUTS)
        for folder in working_folders:
            os.makedirs(folder, exist_ok=True)

# Create the working folders as soon as this cell runs, so later cells can
# write into them without checking.
Directories.setup()
print("‚úÖ Configuration loaded!")

## 3️⃣ Audio Processing Functions

In [None]:
class AudioConverter:
    """Converts arbitrary audio files to mono WAV at the target sample rate."""

    @staticmethod
    def convert_to_wav(input_path: str, output_path: str = None) -> str:
        """
        Decode an audio file and re-export it as WAV.

        The audio is down-mixed to one channel and resampled to
        ``AudioConfig.TARGET_SAMPLE_RATE`` before export.

        Args:
            input_path: Source audio file path
            output_path: Destination WAV path; when None the file is written
                to ``converted.wav`` inside the processed directory

        Returns:
            The path the WAV file was written to
        """
        if output_path is None:
            output_path = f"{Directories.PROCESSED}/converted.wav"

        # Extension without its leading dot, used only for the log line.
        input_format = Path(input_path).suffix[1:]
        print(f"üîÑ Converting {input_format.upper()} to WAV...")

        decoded = AudioSegment.from_file(str(input_path))
        mono = decoded.set_channels(1)
        resampled = mono.set_frame_rate(AudioConfig.TARGET_SAMPLE_RATE)
        resampled.export(output_path, format="wav")

        print(f"‚úÖ Converted: {output_path}")
        return output_path

In [None]:
class AudioNormalizer:
    """Loudness normalization and noise suppression helpers."""

    @staticmethod
    def normalize(audio_data: np.ndarray, target_db: float = None) -> np.ndarray:
        """
        Scale audio so its RMS matches a target level, then clip to [-1, 1].

        Args:
            audio_data: Audio samples as numpy array
            target_db: Desired RMS level in dB; falls back to
                ``AudioConfig.TARGET_DB`` when None

        Returns:
            The scaled and clipped samples. A signal whose RMS is not
            positive (e.g. all zeros) is only clipped, not scaled.
        """
        level_db = AudioConfig.TARGET_DB if target_db is None else target_db

        current_rms = np.sqrt(np.mean(audio_data**2))
        if not current_rms > 0:
            # Silent input: there is no level to scale from.
            return np.clip(audio_data, -1, 1)

        desired_rms = 10 ** (level_db / 20)
        scaled = audio_data * (desired_rms / current_rms)
        return np.clip(scaled, -1, 1)

    @staticmethod
    def reduce_noise(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
        """
        Suppress background noise with noisereduce.

        The first half second of the recording is handed to noisereduce as
        the noise sample.

        Args:
            audio_data: Audio samples
            sample_rate: Sample rate in Hz

        Returns:
            Noise-reduced audio
        """
        print("üîá Applying noise reduction...")
        profile_length = int(sample_rate * 0.5)
        noise_profile = audio_data[:profile_length]
        denoised = nr.reduce_noise(
            y=audio_data,
            sr=sample_rate,
            y_noise=noise_profile,
            prop_decrease=AudioConfig.NOISE_REDUCTION_STRENGTH
        )
        return denoised

In [None]:
class SpeechSegment:
    """A time span of audio together with its energy score."""

    def __init__(self, start: float, end: float, energy: float):
        # start/end are positions in seconds; energy is the ranking score.
        self.start, self.end, self.energy = start, end, energy

    @property
    def duration(self) -> float:
        """Length of the span, i.e. ``end - start``."""
        span = self.end - self.start
        return span

    def __repr__(self) -> str:
        """Compact summary showing start, end and duration to one decimal."""
        return f"SpeechSegment({self.start:.1f}s-{self.end:.1f}s, {self.duration:.1f}s)"


class SpeechSegmentDetector:
    """Detects speech segments in audio using frame-level RMS energy."""

    @staticmethod
    def find_segments(audio_data: np.ndarray, sample_rate: int) -> list[SpeechSegment]:
        """
        Find speech segments using energy-based detection

        A frame counts as speech when its RMS exceeds
        ``mean(RMS) * AudioConfig.RMS_THRESHOLD_MULTIPLIER``. Runs of
        consecutive speech frames become candidate segments; if none
        qualifies, fixed-size windows over the whole recording are used.

        Args:
            audio_data: Audio samples
            sample_rate: Sample rate

        Returns:
            At most ``AudioConfig.MAX_SEGMENTS_TO_SHOW`` SpeechSegment
            objects, highest energy first. Empty when ``audio_data`` is empty
            or no span reaches ``AudioConfig.MIN_SEGMENT_DURATION``.
        """
        print("üîç Analyzing audio for speech segments...")

        # Nothing to analyse; skip the feature extraction entirely.
        if len(audio_data) == 0:
            return []

        # The *_MS config values are fractions of a second, so multiplying by
        # the sample rate yields a length in samples.
        frame_length = int(sample_rate * AudioConfig.FRAME_LENGTH_MS)
        hop_length = int(sample_rate * AudioConfig.HOP_LENGTH_MS)

        rms = librosa.feature.rms(
            y=audio_data,
            frame_length=frame_length,
            hop_length=hop_length
        )[0]

        threshold = np.mean(rms) * AudioConfig.RMS_THRESHOLD_MULTIPLIER
        is_speech = rms > threshold
        times = librosa.frames_to_time(
            np.arange(len(rms)),
            sr=sample_rate,
            hop_length=hop_length
        )

        segments = SpeechSegmentDetector._extract_segments(
            times, is_speech, rms, audio_data, sample_rate
        )

        if not segments:
            segments = SpeechSegmentDetector._create_fixed_windows(
                audio_data, sample_rate
            )

        segments.sort(key=lambda x: x.energy, reverse=True)
        return segments[:AudioConfig.MAX_SEGMENTS_TO_SHOW]

    @staticmethod
    def _extract_segments(
        times: np.ndarray,
        is_speech: np.ndarray,
        rms: np.ndarray,
        audio_data: np.ndarray,
        sample_rate: int
    ) -> list[SpeechSegment]:
        """
        Turn runs of consecutive speech frames into segments.

        Args:
            times: Start time in seconds of each frame
            is_speech: Per-frame speech flag, same length as ``times``
            rms: Per-frame RMS values, same length as ``times``
            audio_data: Unused; kept for signature compatibility
            sample_rate: Unused; kept for signature compatibility

        Returns:
            Segments in chronological order; runs that are too short are dropped
        """
        segments = []
        run_start = None  # frame index where the current speech run began

        for i, speech in enumerate(is_speech):
            if speech and run_start is None:
                run_start = i
            elif not speech and run_start is not None:
                segment = SpeechSegmentDetector._create_segment(
                    times[run_start], times[i], rms, i, run_start
                )
                if segment is not None:
                    segments.append(segment)
                run_start = None

        # The recording ended while still inside a speech run.
        if run_start is not None:
            segment = SpeechSegmentDetector._create_segment(
                times[run_start], times[-1], rms, len(times), run_start
            )
            if segment is not None:
                segments.append(segment)

        return segments

    @staticmethod
    def _create_segment(
        start: float,
        end: float,
        rms: np.ndarray,
        index: int,
        start_index: int = None
    ) -> "SpeechSegment | None":
        """
        Create a segment if duration is valid

        Runs longer than ``AudioConfig.MAX_SEGMENT_DURATION`` are truncated
        to their first MAX_SEGMENT_DURATION seconds. The energy score is the
        mean RMS of the frames the (possibly truncated) segment covers.

        Args:
            start: Run start time in seconds
            end: Run end time in seconds
            rms: Per-frame RMS values for the whole recording
            index: Frame index one past the last frame of the run
            start_index: Frame index of the first frame of the run; when
                None it is estimated from ``index`` and the run duration

        Returns:
            The segment, or None when the run is shorter than
            ``AudioConfig.MIN_SEGMENT_DURATION``
        """
        duration = end - start
        if duration < AudioConfig.MIN_SEGMENT_DURATION:
            return None

        actual_end = min(end, start + AudioConfig.MAX_SEGMENT_DURATION)
        hop_seconds = AudioConfig.HOP_LENGTH_MS
        frame_count = max(1, int(round((actual_end - start) / hop_seconds)))

        if start_index is None:
            run_frames = int(round(duration / hop_seconds))
            start_index = max(0, index - run_frames)

        # Score the frames the segment actually spans. Counting back from
        # `index` would score the tail of the run, which lies outside the
        # segment whenever the run was truncated.
        stop_index = min(index, start_index + frame_count)
        energy = np.mean(rms[start_index:stop_index])
        return SpeechSegment(start, actual_end, energy)

    @staticmethod
    def _create_fixed_windows(
        audio_data: np.ndarray,
        sample_rate: int
    ) -> list[SpeechSegment]:
        """
        Create fixed-size windows as fallback

        Windows start every 5 seconds and last up to
        ``AudioConfig.MAX_SEGMENT_DURATION`` seconds; windows shorter than
        ``AudioConfig.MIN_SEGMENT_DURATION`` are skipped. Each window's
        energy is the RMS of its samples.

        Args:
            audio_data: Audio samples
            sample_rate: Sample rate

        Returns:
            Windows in chronological order (possibly empty)
        """
        print("‚ö†Ô∏è No clear speech segments found, creating fixed windows...")
        segments = []
        total_duration = len(audio_data) / sample_rate
        window_step = 5

        for start in np.arange(
            0,
            max(1, total_duration - AudioConfig.MIN_SEGMENT_DURATION),
            window_step
        ):
            end = min(start + AudioConfig.MAX_SEGMENT_DURATION, total_duration)
            if end - start >= AudioConfig.MIN_SEGMENT_DURATION:
                seg_data = audio_data[int(start * sample_rate):int(end * sample_rate)]
                energy = np.sqrt(np.mean(seg_data**2))
                segments.append(SpeechSegment(start, end, energy))

        return segments

    @staticmethod
    def extract_segment_audio(
        audio_data: np.ndarray,
        sample_rate: int,
        segment: SpeechSegment
    ) -> np.ndarray:
        """
        Extract audio data for a segment

        Args:
            audio_data: Audio samples of the full recording
            sample_rate: Sample rate
            segment: Span to cut out, with start/end in seconds

        Returns:
            The slice of ``audio_data`` between the segment's start and end
        """
        start_sample = int(segment.start * sample_rate)
        end_sample = int(segment.end * sample_rate)
        return audio_data[start_sample:end_sample]

In [None]:
class AudioPreprocessor:
    """Runs the complete preparation pipeline on one recording."""

    @staticmethod
    def process(
        input_path: str,
        apply_noise_reduction: bool = True
    ) -> tuple[np.ndarray, int, list[SpeechSegment]]:
        """
        Convert, load, denoise, normalize and segment a recording.

        Args:
            input_path: Path to input audio file
            apply_noise_reduction: Whether to apply noise reduction

        Returns:
            (audio_data, sample_rate, segments): the processed samples, the
            sample rate they were loaded at, and the candidate segments
        """
        print("\n" + "="*50)
        print("üéõÔ∏è AUDIO PREPROCESSING PIPELINE")
        print("="*50)

        # Anything that is not already a .wav goes through the converter.
        source = Path(input_path)
        is_wav = source.suffix.lower() == '.wav'
        wav_path = str(source) if is_wav else AudioConverter.convert_to_wav(str(source))

        # Load as mono at the configured sample rate.
        print("\nüìÇ Loading audio...")
        audio_data, sample_rate = librosa.load(
            wav_path,
            sr=AudioConfig.TARGET_SAMPLE_RATE,
            mono=True
        )
        print(f"   Duration: {len(audio_data)/sample_rate:.1f}s")
        print(f"   Sample rate: {sample_rate} Hz")

        # Denoising is optional and runs before level normalization.
        if apply_noise_reduction:
            audio_data = AudioNormalizer.reduce_noise(audio_data, sample_rate)

        print("üìä Normalizing audio levels...")
        audio_data = AudioNormalizer.normalize(audio_data)

        # Candidate segments are detected on the processed signal.
        segments = SpeechSegmentDetector.find_segments(audio_data, sample_rate)
        print(f"\n‚úÖ Found {len(segments)} candidate segments")

        return audio_data, sample_rate, segments

# Confirmation printed once the class definition in this cell has executed.
print("‚úÖ Audio processing functions loaded!")

## 4️⃣ Find Available Voice Files

In [None]:
class VoiceFileFinder:
    """Finds available voice files in the project"""

    # Lower-case file extensions (with leading dot) treated as voice files.
    SUPPORTED_FORMATS = ['.m4a', '.mp3', '.wav']

    @classmethod
    def find_all(cls, search_dirs: list[str] = None) -> list[Path]:
        """
        Search for voice files in specified directories

        Directories are searched recursively. Extensions are matched
        case-insensitively, and paths are resolved to absolute form so a
        file reachable through several search roots is reported only once.

        Args:
            search_dirs: List of directories to search (uses defaults if None)

        Returns:
            Sorted list of absolute Path objects for found voice files
        """
        if search_dirs is None:
            # Search Colab environment directories
            search_dirs = [
                "/content",  # Colab content directory
                "./uploads",  # Uploaded files directory
                ".",  # Current directory
            ]

        found = set()
        for search_dir in search_dirs:
            root = Path(search_dir)
            if not root.is_dir():
                continue
            try:
                for candidate in root.rglob("*"):
                    if candidate.suffix.lower() not in cls.SUPPORTED_FORMATS:
                        continue
                    if candidate.is_file():
                        found.add(candidate.resolve())
            except OSError:
                # Skip the rest of a directory tree we can't access
                # (PermissionError is a subclass of OSError).
                continue

        return sorted(found)

    @classmethod
    def display_found_files(cls) -> list[Path]:
        """
        Find and display all voice files

        Prints each file with its size in MB, or an upload hint when nothing
        was found.

        Returns:
            The list returned by ``find_all()`` for the default directories
        """
        print("üîç Searching for voice files in Colab environment...\n")
        # Named `voice_files` rather than `files` so it does not shadow the
        # `google.colab.files` module imported by the notebook.
        voice_files = cls.find_all()

        if voice_files:
            print(f"‚úÖ Found {len(voice_files)} voice file(s):\n")
            for i, file_path in enumerate(voice_files, 1):
                size_mb = file_path.stat().st_size / (1024 * 1024)
                print(f"{i}. {file_path}")
                print(f"   Size: {size_mb:.2f} MB")
                print()
        else:
            print("‚ùå No voice files found in Colab environment.")
            print("üí° You'll be prompted to upload your M4A/MP3/WAV file next.")

        return voice_files

# Search for existing voice files in the default directories and list them;
# the result is kept in a notebook-level variable.
available_files = VoiceFileFinder.display_found_files()

## 5️⃣ Load TTS Model

In [None]:
from vieneu import Vieneu

class TTSModelInfo:
    """Records which optional cloning APIs a loaded TTS model exposes."""

    def __init__(self, model):
        self.model = model
        # One boolean per optional API, named has_<api>.
        for api_name in ('clone_voice', 'encode_reference', 'create_voice'):
            setattr(self, f"has_{api_name}", hasattr(model, api_name))

    def display_capabilities(self):
        """Print one line per optional API showing whether it is present."""
        print("üîç Model capabilities:")
        flags = (
            ("clone_voice", self.has_clone_voice),
            ("encode_reference", self.has_encode_reference),
            ("create_voice", self.has_create_voice),
        )
        for label, supported in flags:
            mark = '‚úÖ' if supported else '‚ùå'
            print(f"   {label}: {mark}")

    def list_all_methods(self):
        """Print every attribute of the model whose name has no leading underscore."""
        public_names = filter(lambda name: not name.startswith('_'), dir(self.model))
        methods = list(public_names)
        print("\nüìã All available methods:")
        for method in methods:
            print(f"   - {method}")

    def get_preset_voices(self) -> list[str]:
        """
        Return the model's preset voices.

        When the model returns a list of tuples, the second element of each
        tuple is taken as the voice name; any other non-empty list is
        returned as is. An empty result, a non-list result, or any error
        while querying yields an empty list (errors are printed).
        """
        try:
            listing = self.model.list_preset_voices()
            if not isinstance(listing, list) or not listing:
                return []
            if isinstance(listing[0], tuple):
                return [name for _, name in listing]
            return listing
        except Exception as e:
            print(f"   Could not list voices: {e}")
            return []

# Load model with its default constructor arguments; `tts` is the shared
# model instance used by the cloning and generation cells below.
print("üîÑ Loading VieNeu-TTS model...")
tts = Vieneu()
print("‚úÖ Model loaded!\n")

# Get model info: probe which optional cloning APIs this model object has.
model_info = TTSModelInfo(tts)
model_info.display_capabilities()

# Get preset voices (an empty list if the model cannot list them).
print("\nüì¢ Available preset voices:")
preset_voices = model_info.get_preset_voices()
for voice in preset_voices:
    print(f"   - {voice}")

## 6️⃣ Select or Upload Voice File

In [None]:
# Upload voice file from local machine
print("üì§ Please upload your voice file (M4A/MP3/WAV)")
print("   Select: the-lost-chapter/voices/my-voice.m4a\n")

# `uploaded` is used below as a mapping of file name -> file content (bytes).
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is used; any others are ignored.
    uploaded_filename = list(uploaded.keys())[0]
    selected_file = Path(Directories.UPLOADS) / uploaded_filename

    # Write the uploaded content into the uploads directory.
    with open(selected_file, "wb") as f:
        f.write(uploaded[uploaded_filename])

    size_mb = len(uploaded[uploaded_filename]) / (1024 * 1024)
    print(f"\n‚úÖ Uploaded: {uploaded_filename}")
    print(f"   Size: {size_mb:.2f} MB")
    print(f"   Saved to: {selected_file}")
else:
    # Stop here: the following cells need `selected_file` to be defined.
    raise Exception("No file uploaded. Please run this cell again and select a file.")

## 7️⃣ Preprocess Audio

In [None]:
# Process the audio
# The `# @param` marker exposes this flag as a Colab form field.
apply_noise_reduction = True  # @param {type:"boolean"}

# Runs conversion, loading, optional denoising, normalization and segment
# detection on the uploaded file.
audio_data, sample_rate, segments = AudioPreprocessor.process(
    str(selected_file),
    apply_noise_reduction=apply_noise_reduction
)

# Save full processed audio
processed_full_path = f"{Directories.PROCESSED}/full_processed.wav"
sf.write(processed_full_path, audio_data, sample_rate)

# Inline player for the whole processed recording.
print("\nüîä Full processed audio:")
display(Audio(processed_full_path))

## 8️⃣ Preview Segments

In [None]:
# Extract and preview segments
print("üéµ Extracting candidate segments:\n")

# Path of the WAV written for each candidate, in the same order as `segments`
# (the selection cell indexes both lists with the same 1-based number).
segment_paths = []

for i, segment in enumerate(segments, 1):
    print("="*50)
    print(f"üìç Segment {i}")
    print(f"   Time: {segment.start:.1f}s - {segment.end:.1f}s")
    print(f"   Duration: {segment.duration:.1f}s")

    # Cut the candidate out of the processed recording and save it on its own.
    segment_audio = SpeechSegmentDetector.extract_segment_audio(
        audio_data, sample_rate, segment
    )
    segment_path = f"{Directories.PROCESSED}/segment_{i}.wav"
    sf.write(segment_path, segment_audio, sample_rate)
    segment_paths.append(segment_path)

    display(Audio(segment_path))
    print()

## 9️⃣ Select Best Segment

In [None]:
# Select the best segment
# 1-based index into the candidates shown in the preview cell.
selected_segment_index = 1  # @param {type:"integer"}

# With no candidates there is nothing to fall back to; fail with a clear
# message instead of an IndexError on the lookups below.
if not segments:
    raise RuntimeError(
        "No candidate segments available. "
        "Re-run the preprocessing cells with a longer recording."
    )

# Out-of-range selections fall back to the first (highest-energy) candidate.
if selected_segment_index < 1 or selected_segment_index > len(segments):
    print("‚ö†Ô∏è Invalid selection! Using segment 1")
    selected_segment_index = 1

selected_segment_path = segment_paths[selected_segment_index - 1]
selected_segment_info = segments[selected_segment_index - 1]

print(f"‚úÖ Selected Segment {selected_segment_index}")
print(f"   Time: {selected_segment_info.start:.1f}s - {selected_segment_info.end:.1f}s")
print(f"   Duration: {selected_segment_info.duration:.1f}s")
print("\nüîä Selected audio:")
display(Audio(selected_segment_path))

## 🔟 Clone Voice

In [None]:
# Enter transcript for the selected segment
# (Colab form field; this text is later passed to the cloner as the
# transcript of the reference audio.)
sample_transcript = "Xin ch√†o, ƒë√¢y l√† gi·ªçng n√≥i c·ªßa t√¥i."  # @param {type:"string"}

# Show the transcript next to the audio it is meant to describe.
print(f"üìù Transcript: {sample_transcript}")
print(f"üìÅ Audio file: {selected_segment_path}")
display(Audio(selected_segment_path))

In [None]:
class VoiceCloner:
    """Clones a voice by trying each API the installed model might offer."""

    def __init__(self, tts_model, model_info: TTSModelInfo):
        self.tts = tts_model
        self.info = model_info

    def clone(self, audio_path: str, transcript: str, voice_name: str = "MyVoice"):
        """
        Produce voice data for inference from a reference recording.

        Strategies are attempted in a fixed order and the first one that
        yields a value wins; if all of them fail, the first preset voice is
        used instead.

        Args:
            audio_path: Path to reference audio
            transcript: Transcript of reference audio
            voice_name: Name for cloned voice

        Returns:
            Whatever the successful strategy produced: a model voice object,
            the audio path itself, or a dict with ``ref_audio``/``ref_text``.
            None when every strategy fails and no preset voice exists.
        """
        print("üîÑ Cloning voice...\n")

        strategies = (
            self._try_clone_voice,
            self._try_create_voice,
            self._try_encode_reference,
            self._try_direct_path,
            self._try_ref_audio,
        )

        for attempt in strategies:
            outcome = attempt(audio_path, transcript, voice_name)
            if outcome is None:
                continue
            return outcome

        print("‚ö†Ô∏è All cloning methods failed. Using preset voice.")
        return self._get_preset_voice()

    def _try_clone_voice(self, audio_path: str, transcript: str, voice_name: str):
        """Use the model's clone_voice API, if present; None on absence or error."""
        if self.info.has_clone_voice:
            try:
                print("Trying clone_voice method...")
                cloned = self.tts.clone_voice(
                    audio_path=audio_path,
                    text=transcript,
                    name=voice_name
                )
                print("‚úÖ clone_voice succeeded!")
                return cloned
            except Exception as exc:
                print(f"‚ùå clone_voice failed: {exc}")
        return None

    def _try_create_voice(self, audio_path: str, transcript: str, voice_name: str):
        """Use the model's create_voice API, if present; None on absence or error."""
        if self.info.has_create_voice:
            try:
                print("Trying create_voice method...")
                created = self.tts.create_voice(
                    audio_path=audio_path,
                    text=transcript
                )
                print("‚úÖ create_voice succeeded!")
                return created
            except Exception as exc:
                print(f"‚ùå create_voice failed: {exc}")
        return None

    def _try_encode_reference(self, audio_path: str, transcript: str, voice_name: str):
        """Use the model's encode_reference API, if present; None on absence or error."""
        if self.info.has_encode_reference:
            try:
                print("Trying encode_reference method...")
                encoded = self.tts.encode_reference(audio_path)
                print("‚úÖ encode_reference succeeded!")
                return encoded
            except Exception as exc:
                print(f"‚ùå encode_reference failed: {exc}")
        return None

    def _try_direct_path(self, audio_path: str, transcript: str, voice_name: str):
        """Probe whether infer() accepts the audio path as ``voice``; return the path if so."""
        usable = None
        try:
            print("Testing direct path method...")
            # Trial synthesis; the audio it produces is discarded.
            self.tts.infer(text="Test", voice=audio_path)
            print("‚úÖ Direct path works!")
            usable = audio_path
        except Exception as exc:
            print(f"‚ùå Direct path failed: {exc}")
        return usable

    def _try_ref_audio(self, audio_path: str, transcript: str, voice_name: str):
        """Probe whether infer() accepts ref_audio/ref_text; return them as a dict if so."""
        usable = None
        try:
            print("Testing ref_audio parameter...")
            # Trial synthesis; the audio it produces is discarded.
            self.tts.infer(
                text="Test",
                ref_audio=audio_path,
                ref_text=transcript
            )
            print("‚úÖ ref_audio parameter works!")
            usable = {"ref_audio": audio_path, "ref_text": transcript}
        except Exception as exc:
            print(f"‚ùå ref_audio failed: {exc}")
        return usable

    def _get_preset_voice(self):
        """Return the model's first preset voice, or None when there is none."""
        presets = self.info.get_preset_voices()
        if not presets:
            return None
        voice_name = presets[0]
        print(f"Using preset voice: {voice_name}")
        return self.tts.get_preset_voice(voice_name)

# Clone the voice from the selected segment and its transcript.
# `cloned_voice` may be None if every strategy failed and no preset exists.
cloner = VoiceCloner(tts, model_info)
cloned_voice = cloner.clone(selected_segment_path, sample_transcript)

# The printed type shows which strategy produced the voice data.
print(f"\n‚úÖ Voice setup complete!")
print(f"   Voice type: {type(cloned_voice)}")

## 1️⃣1️⃣ Generate Speech

In [None]:
class SpeechGenerator:
    """Generates speech from text using cloned voice"""

    def __init__(self, tts_model):
        self.tts = tts_model

    def generate(
        self,
        text: str,
        voice_data,
        output_path: str,
        sample_rate: int = None
    ) -> bool:
        """
        Generate speech audio file

        Args:
            text: Text to synthesize
            voice_data: Cloned voice data
            output_path: Path to save audio
            sample_rate: Output sample rate (uses config default if None)

        Returns:
            True if audio was produced and written, False if every inference
            method failed (nothing is written in that case)
        """
        if sample_rate is None:
            sample_rate = AudioConfig.TARGET_SAMPLE_RATE

        # Try different inference methods
        audio = self._try_inference(text, voice_data)

        if audio is not None:
            sf.write(output_path, audio, sample_rate)
            return True

        return False

    def _try_inference(self, text: str, voice_data):
        """
        Try different inference methods

        Each method is attempted in order. A method that raises, or that
        returns None because it does not apply to this kind of
        ``voice_data``, is skipped so the remaining methods still get a
        chance.

        Returns:
            The first non-None audio result, or None when no method succeeds
            (the last error, if any, is printed)
        """
        attempts = (
            self._infer_with_voice,
            self._infer_with_ref_audio,
            self._infer_with_speaker_wav,
            self._infer_simple,
        )

        last_error = None
        for attempt in attempts:
            try:
                audio = attempt(text, voice_data)
            except Exception as exc:
                # Best effort: remember why it failed and move on.
                last_error = exc
                continue
            if audio is not None:
                return audio

        if last_error is not None:
            print(f"   Last inference error: {last_error}")
        return None

    def _infer_with_voice(self, text: str, voice_data):
        """
        Standard inference with voice parameter

        A dict containing ``ref_audio`` is passed as ref_audio/ref_text;
        anything else is passed as ``voice`` with explicit sampling settings.
        """
        if isinstance(voice_data, dict) and "ref_audio" in voice_data:
            return self.tts.infer(
                text=text,
                ref_audio=voice_data["ref_audio"],
                ref_text=voice_data["ref_text"]
            )
        else:
            return self.tts.infer(
                text=text,
                voice=voice_data,
                temperature=1.0,
                top_k=50
            )

    def _infer_with_ref_audio(self, text: str, voice_data):
        """Inference with ref_audio parameter; returns None unless voice_data is a dict"""
        if isinstance(voice_data, dict):
            return self.tts.infer(
                text=text,
                ref_audio=voice_data["ref_audio"],
                ref_text=voice_data["ref_text"]
            )
        return None

    def _infer_with_speaker_wav(self, text: str, voice_data):
        """Inference with speaker_wav (XTTS-style); returns None unless voice_data is a str"""
        if isinstance(voice_data, str):
            return self.tts.infer(text=text, speaker_wav=voice_data)
        return None

    def _infer_simple(self, text: str, voice_data):
        """Simple inference without extra params"""
        return self.tts.infer(text=text, voice=voice_data)

# Create generator bound to the loaded model; reused by every generation
# cell below.
generator = SpeechGenerator(tts)
print("‚úÖ Speech generator ready!")

In [None]:
# Generate speech with cloned voice
# (Colab form field holding the text to synthesize.)
text_to_speak = "Xin ch√†o m·ªçi ng∆∞·ªùi. T√¥i l√† tr·ª£ l√Ω ·∫£o ƒë∆∞·ª£c t·∫°o b·ªüi VieNeu TTS. R·∫•t vui ƒë∆∞·ª£c g·∫∑p c√°c b·∫°n."  # @param {type:"string"}

print(f"üìù Text: {text_to_speak}")
print("\nüîÑ Generating speech...")

output_path = f"{Directories.OUTPUTS}/cloned_speech.wav"

if generator.generate(text_to_speak, cloned_voice, output_path):
    print("\n‚úÖ Speech generated!")
    print("\nüîä Your cloned voice:")
    display(Audio(output_path))
else:
    # Fallback: call the model without any voice argument and write the
    # result to the same output path.
    print("\n‚ùå Generation failed")
    print("\nTrying with default voice...")
    try:
        audio = tts.infer(text=text_to_speak)
        sf.write(output_path, audio, AudioConfig.TARGET_SAMPLE_RATE)
        print("\nüîä Generated with default voice:")
        display(Audio(output_path))
    except Exception as e:
        print(f"Default voice also failed: {e}")

## 1️⃣2️⃣ Batch Generation

In [None]:
# Generate multiple samples
# Sentences synthesized one after another with the cloned voice.
example_texts = [
    "H√¥m nay th·ªùi ti·∫øt r·∫•t ƒë·∫πp, ch√∫ng ta ƒëi d·∫°o c√¥ng vi√™n nh√©.",
    "Ch√†o bu·ªïi s√°ng! B·∫°n ƒë√£ ƒÉn s√°ng ch∆∞a?",
    "T√¥i c√≥ th·ªÉ gi√∫p b·∫°n ƒë·ªçc s√°ch, t·∫°o chatbot, ho·∫∑c l√†m tr·ª£ l√Ω ·∫£o.",
    "Hello! I can also speak English with Vietnamese accent."
]

print("üéµ Generating samples...\n")

for i, text in enumerate(example_texts, 1):
    print("="*50)
    print(f"üìç Sample {i}: {text}")

    # One output file per sample, numbered from 1; a failed sample is
    # reported and the loop continues with the next one.
    output_path = f"{Directories.OUTPUTS}/sample_{i}.wav"
    if generator.generate(text, cloned_voice, output_path):
        display(Audio(output_path))
    else:
        print("‚ùå Failed to generate")
    print()

## 1️⃣3️⃣ Interactive Mode

In [None]:
# Interactive TTS - run this cell multiple times with different text
# (Colab form field; edit the text and re-run.)
your_text = "Nh·∫≠p vƒÉn b·∫£n c·ªßa b·∫°n v√†o ƒë√¢y!"  # @param {type:"string"}

print(f"üîÑ Generating: {your_text}")
# The same output file is overwritten on every run.
output_path = f"{Directories.OUTPUTS}/interactive.wav"

if generator.generate(your_text, cloned_voice, output_path):
    print("\nüîä Result:")
    display(Audio(output_path))
else:
    print("‚ùå Generation failed")

## 1️⃣4️⃣ Download Results

In [None]:
import zipfile

# Create ZIP with all outputs
zip_path = "vieneu_outputs.zip"

# Bundle every WAV from the outputs and processed directories, stored under
# "outputs/" and "processed/" inside the archive.
with zipfile.ZipFile(zip_path, 'w') as zipf:
    for file in Path(Directories.OUTPUTS).glob("*.wav"):
        zipf.write(file, f"outputs/{file.name}")
    for file in Path(Directories.PROCESSED).glob("*.wav"):
        zipf.write(file, f"processed/{file.name}")

# Re-open the archive read-only to list what was actually stored.
print("üì¶ Files in ZIP:")
with zipfile.ZipFile(zip_path, 'r') as zipf:
    for name in zipf.namelist():
        print(f"   - {name}")

# Hand the archive to the Colab `files` helper for download.
print("\nüì• Downloading...")
files.download(zip_path)
print("‚úÖ Download complete!")

## 1️⃣5️⃣ Cleanup

In [None]:
# Clean up resources
# NOTE(review): assumes the model object exposes close() — confirm against
# the installed vieneu version. After this call `tts` should not be reused.
tts.close()
print("‚úÖ Resources cleaned up!")
print("\nüéâ Voice cloning session complete!")