In [1]:
# AI Voice Converter Pro - Gradio Version (Fastest Performance)
# Optimized for Google Colab deployment

# ===============================

!pip install -q gradio yt-dlp librosa soundfile scipy numpy torch
!apt-get update -qq && apt-get install -y -qq ffmpeg

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.0/176.0 kB 10.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 75.4 MB/s eta 0:00:00
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [None]:
import gradio as gr
import torch
import librosa
import soundfile as sf
import numpy as np
import yt_dlp
import os
import warnings
import gc
import time
import hashlib
import tempfile
import shutil
from pathlib import Path
from scipy.signal import butter, filtfilt, hilbert
from scipy import ndimage

# Silence library warnings for the whole notebook session.
warnings.filterwarnings('ignore')

# Working directories: raw/temporary downloads, rendered results, and the
# per-URL download cache.
TEMP_DIR = "/tmp/voice_temp"
OUTPUT_DIR = "/tmp/voice_output"
CACHE_DIR = "/tmp/voice_cache"

# Create each directory (and any missing parents); existing ones are fine.
for directory in (TEMP_DIR, OUTPUT_DIR, CACHE_DIR):
    Path(directory).mkdir(parents=True, exist_ok=True)

# ==============================
# Enhanced Voice Effect Modeling
# ==============================
class OptimizedVoiceModeler:
    """Stateless vocal effect chains built on librosa and SciPy filters.

    Each public ``*_processing`` method takes mono samples ``vocals`` (a 1-D
    NumPy array is expected) and the sample rate ``sr`` in Hz, and returns the
    processed samples after ``librosa.util.normalize`` and a fixed headroom
    gain.  Effects that time-stretch return a signal whose length differs
    from the input.
    """

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _clamp_frequency(freq, sr):
        """Clamp a cutoff (Hz) into the range ``[1, sr/2 - 1]``."""
        nyquist = sr / 2
        return max(1, min(freq, nyquist - 1))

    @staticmethod
    def _clamp_band(low, high, sr):
        """Clamp band edges (Hz) below Nyquist while keeping ``low < high``.

        Returns:
            ``[low, high]`` suitable for ``butter(..., btype='band', fs=sr)``.
        """
        nyquist = sr / 2
        low = max(1, min(low, nyquist - 2))
        high = max(low + 1, min(high, nyquist - 1))
        return [low, high]

    @staticmethod
    def _linear_sweep(length, sr, f_start=80.0, f_span=800.0):
        """Return a unit sine whose frequency rises linearly over ``length`` samples.

        The instantaneous frequency goes from ``f_start`` to
        ``f_start + f_span`` Hz.  The phase is the integral of that frequency
        (``f_start*t + f_span*t**2 / (2*T)``); multiplying the time-varying
        frequency by ``t`` directly would double the effective sweep range.
        """
        t = np.arange(length) / sr
        duration = length / sr
        phase = 2 * np.pi * (f_start * t + 0.5 * f_span * t ** 2 / duration)
        return np.sin(phase)

    @staticmethod
    def _stamp(track, start, sound):
        """Add ``sound`` into ``track`` at ``start`` (in place) if it ends before the track does."""
        end = start + len(sound)
        if end < len(track):
            track[start:end] += sound

    # ------------------------------------------------------------------
    # Effects
    # ------------------------------------------------------------------
    @staticmethod
    def animated_voice_processing(vocals, sr):
        """Bright, energetic animated voice (formerly female voice).

        Chain: +9 semitone pitch shift, 1.15x speed-up, 2-5 kHz band boost,
        >4 kHz "sparkle" boost, 300 Hz high-pass, 6 Hz amplitude modulation
        and tanh saturation.

        Args:
            vocals: Mono samples (1-D NumPy array expected).
            sr: Sample rate in Hz.  The fixed 5 kHz band edge requires
                ``sr`` above 10 kHz, otherwise ``butter`` raises ValueError.

        Returns:
            Processed samples, normalised and scaled by 0.9.
        """
        # 1. Higher pitch for animated character
        vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=9)

        # 2. Slight speed increase for energy
        vocals = librosa.effects.time_stretch(vocals, rate=1.15)

        # 3. Boost high-mid frequencies for brightness.  filtfilt preserves the
        #    signal length, so the filtered copy can be added back directly.
        b_bright, a_bright = butter(2, [2000, 5000], btype='band', fs=sr)
        bright_boost = filtfilt(b_bright, a_bright, vocals) * 1.6
        vocals = vocals + bright_boost * 0.35

        # 4. Add sparkle for animation
        b_sparkle, a_sparkle = butter(1, 4000, btype='high', fs=sr)
        sparkle = filtfilt(b_sparkle, a_sparkle, vocals) * 1.4
        vocals = vocals + sparkle * 0.25

        # 5. Reduce muddy low frequencies
        b_clean, a_clean = butter(3, 300, btype='high', fs=sr)
        vocals = filtfilt(b_clean, a_clean, vocals)

        # 6. Animated character "vibrato" (implemented as a slow amplitude
        #    modulation of the signal)
        vibrato_freq = 6.0
        vibrato_depth = 0.025
        t = np.arange(len(vocals)) / sr
        vibrato = np.sin(2 * np.pi * vibrato_freq * t) * vibrato_depth
        vocals = vocals * (1 + vibrato * 0.4)

        # 7. Energetic compression
        vocals = np.tanh(vocals * 1.4) * 0.88

        return librosa.util.normalize(vocals) * 0.9

    @staticmethod
    def chipmunk_processing(vocals, sr):
        """High-pitched squeaky chipmunk voice - reduced music bleed.

        Chain: +16 semitone pitch shift, 1.4x speed-up, 3.5 kHz high-pass,
        4-8 kHz boost, 1 kHz high-pass, hard tanh compression and 8.5 Hz
        amplitude modulation.

        Args:
            vocals: Mono samples (1-D NumPy array expected).
            sr: Sample rate in Hz.  The fixed 8 kHz band edge requires
                ``sr`` above 16 kHz, otherwise ``butter`` raises ValueError.

        Returns:
            Processed samples, normalised and scaled by 0.85.
        """
        # 1. Very high pitch shift
        vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=16)

        # 2. Speed up for cartoon effect
        vocals = librosa.effects.time_stretch(vocals, rate=1.4)

        # 3. Focus only on very high frequencies to reduce music
        b_squeak, a_squeak = butter(4, 3500, btype='high', fs=sr)
        vocals = filtfilt(b_squeak, a_squeak, vocals)

        # 4. Boost ultra-high frequencies
        b_ultra, a_ultra = butter(2, [4000, 8000], btype='band', fs=sr)
        ultra_boost = filtfilt(b_ultra, a_ultra, vocals) * 2.0
        vocals = vocals + ultra_boost * 0.6

        # 5. Completely remove low frequencies to eliminate music bleed
        b_highpass, a_highpass = butter(6, 1000, btype='high', fs=sr)
        vocals = filtfilt(b_highpass, a_highpass, vocals)

        # 6. Sharp compression for cartoon effect
        vocals = np.tanh(vocals * 3.5) * 0.75

        # 7. Chipmunk "vibrato" (amplitude modulation)
        vibrato_freq = 8.5
        vibrato_depth = 0.04
        t = np.arange(len(vocals)) / sr
        vibrato = np.sin(2 * np.pi * vibrato_freq * t) * vibrato_depth
        vocals = vocals * (1 + vibrato)

        return librosa.util.normalize(vocals) * 0.85

    @staticmethod
    def slowed_reverb_processing(vocals, sr):
        """Improved slowed + reverb - less slow, more atmospheric.

        Chain: -1 semitone pitch shift, 0.90x slow-down, six decaying echo
        taps, 180 Hz-4 kHz warmth boost, 20 ms chorus tap and gentle tanh
        saturation.

        Args:
            vocals: Mono samples (1-D NumPy array expected).
            sr: Sample rate in Hz.  The fixed 4 kHz band edge requires
                ``sr`` above 8 kHz, otherwise ``butter`` raises ValueError.

        Returns:
            Processed samples, normalised and scaled by 0.87.
        """
        # 1. Gentle pitch down for warmth (reduced from -2 to -1)
        vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=-1)

        # 2. Less extreme slowdown for better delivery (0.90 instead of 0.82)
        vocals = librosa.effects.time_stretch(vocals, rate=0.90)

        # 3. Echo-chamber style reverb: sum of delayed copies, each tap
        #    quieter than the previous one.
        n = len(vocals)
        reverb_vocals = vocals.copy()
        delay_seconds = (0.04, 0.08, 0.15, 0.28, 0.42, 0.6)
        for i, seconds in enumerate(delay_seconds):
            delay = int(seconds * sr)
            if delay < n:
                delayed = np.pad(vocals, (delay, 0))[:n]
                decay = 0.6 * (0.75 ** i)  # Stronger initial reverb
                reverb_vocals += delayed * decay

        # 4. More atmospheric mix with enhanced reverb
        vocals = vocals * 0.45 + reverb_vocals * 0.55

        # 5. Warm, spacious filtering with enhanced low-mid warmth
        b_warm, a_warm = butter(2, [180, 4000], btype='band', fs=sr)
        warm_vocals = filtfilt(b_warm, a_warm, vocals) * 1.2
        vocals = vocals + warm_vocals * 0.3

        # 6. Add subtle chorus effect for depth (single 20 ms delayed copy)
        chorus_delay = int(0.02 * sr)
        if chorus_delay < len(vocals):
            chorus = np.pad(vocals, (chorus_delay, 0))[:len(vocals)]
            vocals = vocals * 0.8 + chorus * 0.2

        # 7. Gentle saturation for warmth
        vocals = np.tanh(vocals * 1.2) * 0.9

        return librosa.util.normalize(vocals) * 0.87

    @staticmethod
    def party_banger_processing(vocals, sr):
        """Party Mashup: overlay a synthesised backing track on the vocals.

        The vocals are left unprocessed and mixed with generated kick, snare,
        bass, lead, pad, pluck and sweep layers.  The mix is then notch- and
        low-pass filtered, soft-limited and normalised.  The snare layer uses
        ``np.random.normal``, so the output is not deterministic unless the
        NumPy global seed is fixed.

        Args:
            vocals: Mono samples (1-D NumPy array expected).
            sr: Sample rate in Hz.

        Returns:
            The mixed signal, same length as ``vocals``, normalised and
            scaled by 0.85.
        """
        modeler = OptimizedVoiceModeler
        n = len(vocals)
        two_pi = 2 * np.pi

        # ============ RHYTHM SECTION ============

        # 1. KICK DRUM every 0.5 s (low frequencies only).  The waveform is
        #    identical for every hit, so it is rendered once.
        kick_drums = np.zeros_like(vocals)
        kick_length = int(0.15 * sr)
        t_kick = np.arange(kick_length) / sr
        kick_hit = ((np.sin(two_pi * 55 * t_kick) * 0.8 +
                     np.sin(two_pi * 80 * t_kick) * 0.4)
                    * np.exp(-8 * t_kick) * np.hanning(kick_length)) * 0.8
        for start in range(0, n, int(0.5 * sr)):
            modeler._stamp(kick_drums, start, kick_hit)

        # 2. SNARE DRUM every 1 s: band-limited noise burst (upper edge kept
        #    at 4 kHz to prevent chirping).  Filter design and decay envelope
        #    do not depend on the hit, only the noise does.
        snare_drums = np.zeros_like(vocals)
        snare_length = int(0.08 * sr)
        b_snare, a_snare = butter(2, modeler._clamp_band(300, 4000, sr), btype='band', fs=sr)
        snare_decay = np.exp(-8 * np.arange(snare_length) / sr)
        for start in range(int(1.0 * sr), n, int(1.0 * sr)):
            if start + snare_length < n:
                snare_noise = np.random.normal(0, 0.4, snare_length)
                snare = filtfilt(b_snare, a_snare, snare_noise) * snare_decay
                snare_drums[start:start + snare_length] += snare * 0.6

        # 3. Hi-hats intentionally omitted (they were the main source of
        #    chirping).

        # ============ BASS AND MELODIC ELEMENTS ============

        # 4. BASS LINE: fundamental plus octave, cycling through four notes.
        bass_line = np.zeros_like(vocals)
        bass_length = int(0.35 * sr)
        t_bass = np.arange(bass_length) / sr
        bass_window = np.hanning(bass_length)
        bass_notes = [
            (np.sin(two_pi * freq * t_bass) * 0.7 +
             np.sin(two_pi * freq * 2 * t_bass) * 0.3) * bass_window * 0.6
            for freq in (65, 82, 73, 55)
        ]
        for i, start in enumerate(range(0, n, int(0.8 * sr))):
            modeler._stamp(bass_line, start, bass_notes[i % len(bass_notes)])

        # 5. LEAD MELODY: plain windowed sines, C-D-E-F, no vibrato.
        lead_melody = np.zeros_like(vocals)
        lead_length = int(0.3 * sr)
        t_lead = np.arange(lead_length) / sr
        lead_window = np.hanning(lead_length)
        lead_notes = [
            np.sin(two_pi * freq * t_lead) * lead_window * 0.35
            for freq in (262, 294, 330, 349)
        ]
        for i, start in enumerate(range(int(0.4 * sr), n, int(1.2 * sr))):
            modeler._stamp(lead_melody, start, lead_notes[i % len(lead_notes)])

        # 6. CHORD PADS: 4 s three-tone chord every 8 s with 1.5 s fades.
        #    The first pad starts at 0, so if it does not fit none will.
        pad_synth = np.zeros_like(vocals)
        pad_length = int(4 * sr)
        if pad_length < n:
            t_pad = np.arange(pad_length) / sr
            pad_chord = (np.sin(two_pi * 110 * t_pad) +
                         np.sin(two_pi * 138.5 * t_pad) +
                         np.sin(two_pi * 165 * t_pad)) * 0.06
            envelope = np.ones(pad_length)
            fade_time = int(1.5 * sr)
            envelope[:fade_time] = np.linspace(0, 1, fade_time)
            envelope[-fade_time:] = np.linspace(1, 0, fade_time)
            pad_hit = pad_chord * envelope
            for start in range(0, n, int(8 * sr)):
                modeler._stamp(pad_synth, start, pad_hit)

        # 7. SIMPLE PLUCK: decaying sines, A-B-C#, every 2 s.
        pluck_synth = np.zeros_like(vocals)
        pluck_length = int(0.2 * sr)
        t_pluck = np.arange(pluck_length) / sr
        pluck_shape = np.exp(-6 * t_pluck) * np.hanning(pluck_length)
        pluck_notes = [
            np.sin(two_pi * freq * t_pluck) * pluck_shape * 0.25
            for freq in (220, 247, 277)
        ]
        for i, start in enumerate(range(int(1.0 * sr), n, int(2.0 * sr))):
            modeler._stamp(pluck_synth, start, pluck_notes[i % len(pluck_notes)])

        # ============ CONTROLLED EFFECTS ============

        # 8. GENTLE SWEEPS: 3 s rising tone (80 Hz -> 880 Hz) every 16 s.
        sweep_effects = np.zeros_like(vocals)
        sweep_length = int(3 * sr)
        if sweep_length < n:
            sweep_envelope = (np.linspace(0, 1, sweep_length) *
                              np.linspace(1, 0, sweep_length)) ** 2
            sweep_hit = modeler._linear_sweep(sweep_length, sr) * 0.1 * sweep_envelope
            for start in range(0, n, int(16 * sr)):
                modeler._stamp(sweep_effects, start, sweep_hit)

        # ============ FINAL MIX ============
        party_mix = (vocals * 1.0 +
                     kick_drums * 0.8 +
                     snare_drums * 0.6 +
                     bass_line * 0.7 +
                     lead_melody * 0.5 +
                     pad_synth * 0.4 +
                     pluck_synth * 0.4 +
                     sweep_effects * 0.3)

        # ============ POST-PROCESSING TO REMOVE ARTIFACTS ============

        # Local import: the file's top-level scipy.signal import does not
        # include iirnotch.
        from scipy.signal import iirnotch

        # Narrow notches at frequencies where chirping was observed.
        for notch_freq in [4500, 5500, 6500]:
            if notch_freq < sr / 2:
                b_notch, a_notch = iirnotch(notch_freq, 30, sr)
                party_mix = filtfilt(b_notch, a_notch, party_mix)

        # Gentle low-pass filter to remove any high-frequency noise
        cutoff_freq = modeler._clamp_frequency(8000, sr)
        b_lpf, a_lpf = butter(2, cutoff_freq, btype='low', fs=sr)
        party_mix = filtfilt(b_lpf, a_lpf, party_mix)

        # Final limiting
        party_mix = np.tanh(party_mix * 1.1) * 0.9

        return librosa.util.normalize(party_mix) * 0.85



# ==============================
# Downloader (unchanged)
# ==============================
class FastDownloader:
    """Fetch the audio of a URL as a WAV file, with a small on-disk cache.

    Several yt-dlp configurations ("strategies") are tried in order until one
    succeeds.  Successful downloads are re-encoded to 22050 Hz, limited to
    ``max_duration`` seconds, and cached in ``cache_dir`` under a key derived
    from the URL.
    """

    def __init__(self):
        # Maximum number of seconds of audio kept from any download.
        self.max_duration = 90
        self.cache_dir = CACHE_DIR

    def download_audio_fast(self, url, filename="audio", progress_callback=None):
        """Download ``url`` (or reuse the cached copy) to ``TEMP_DIR/<filename>.wav``.

        Args:
            url: Media URL handed to yt-dlp.
            filename: Base name (no extension) of the file written to TEMP_DIR.
            progress_callback: Optional ``callable(fraction, message)`` used to
                report progress.

        Returns:
            ``(output_path, title)``; the title is at most 30 characters, or
            ``"Cached Audio"`` when a cached file has no readable title.

        Raises:
            Exception: ``"Download failed: ..."`` wrapping whatever went wrong;
                the original error is attached as ``__cause__``.
        """
        def report(fraction, message):
            if progress_callback:
                progress_callback(fraction, message)

        try:
            report(0.1, "Starting download...")

            # MD5 is used only to derive a short cache file name.
            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
            cache_path = os.path.join(self.cache_dir, f"{url_hash}.wav")
            title_path = os.path.join(self.cache_dir, f"{url_hash}_title.txt")
            output_path = os.path.join(TEMP_DIR, f"{filename}.wav")

            if os.path.exists(cache_path):
                report(0.3, "Loading from cache...")
                shutil.copy2(cache_path, output_path)
                try:
                    with open(title_path, 'r', encoding='utf-8') as f:
                        title = f.read().strip()
                except (OSError, UnicodeError):
                    # The title file is optional; fall back to a generic label.
                    title = "Cached Audio"
                return output_path, title

            strategies = [
                self._try_embedded_client,
                self._try_android_client,
                self._try_web_client,
                self._try_basic_client
            ]

            last_error = None
            for i, strategy in enumerate(strategies):
                try:
                    report(0.1 + i*0.05, f"Trying method {i+1}...")

                    title = strategy(url, filename)
                    report(0.25, f"Downloaded: {title}")

                    # Normalise whatever ffmpeg produced: mono, 22050 Hz,
                    # truncated to max_duration seconds.
                    y, sr = librosa.load(output_path, sr=22050, duration=self.max_duration)
                    sf.write(output_path, y, sr)

                    try:
                        shutil.copy2(output_path, cache_path)
                        with open(title_path, 'w', encoding='utf-8') as f:
                            f.write(title)
                    except (OSError, UnicodeError):
                        # Caching is best-effort; the download itself succeeded.
                        pass

                    return output_path, title

                except Exception as e:
                    # Remember the failure so the final error can expose it,
                    # then fall through to the next strategy.
                    last_error = e
                    report(0.1 + i*0.05, f"Method {i+1} failed, trying next...")

            raise Exception("All download methods failed") from last_error

        except Exception as e:
            raise Exception(f"Download failed: {str(e)}") from e

    @staticmethod
    def _title_from_info(info):
        """Return the media title truncated to 30 chars, or ``'Unknown'``.

        Handles a missing info dict as well as a missing, ``None`` or empty
        ``'title'`` entry, so a successful download is never reported with a
        falsy title.
        """
        title = (info or {}).get('title') or 'Unknown'
        return title[:30]

    def _postprocessor_args(self, resample=True):
        """Build the ffmpeg arguments: duration limit and optional 22050 Hz resample."""
        args = ['-t', str(self.max_duration)]
        if resample:
            args += ['-ar', '22050']
        return args

    def _download(self, url, filename, audio_format, quality='64', resample=True, **extra_options):
        """Run one yt-dlp download with WAV extraction and return the title.

        Args:
            url: Media URL.
            filename: Base name of the output file inside TEMP_DIR.
            audio_format: yt-dlp ``format`` selector.
            quality: ``preferredquality`` passed to FFmpegExtractAudio.
            resample: Whether to ask ffmpeg to resample to 22050 Hz.
            **extra_options: Additional yt-dlp options (headers, extractor args).
        """
        ydl_opts = {
            'quiet': True,
            'no_warnings': True,
            'format': audio_format,
            'outtmpl': os.path.join(TEMP_DIR, f"{filename}.%(ext)s"),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': quality,
            }],
            'postprocessor_args': self._postprocessor_args(resample),
            **extra_options,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
        return self._title_from_info(info)

    def _try_embedded_client(self, url, filename):
        """Strategy 1: embedded player clients with a desktop browser User-Agent."""
        return self._download(
            url, filename, 'worstaudio[filesize<15M]/worst',
            http_headers={
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
            },
            extractor_args={
                'youtube': {
                    'player_client': ['web_embedded_player', 'android_embedded_player'],
                    'skip_age_gate': True,
                }
            },
        )

    def _try_android_client(self, url, filename):
        """Strategy 2: Android player clients with the Android app User-Agent."""
        return self._download(
            url, filename, 'worstaudio/worst',
            http_headers={
                'User-Agent': 'com.google.android.youtube/18.11.34',
            },
            extractor_args={
                'youtube': {
                    'player_client': ['android', 'android_music'],
                    'skip_age_gate': True,
                }
            },
        )

    def _try_web_client(self, url, filename):
        """Strategy 3: yt-dlp defaults, smallest audio-only stream."""
        return self._download(url, filename, 'worstaudio/worst')

    def _try_basic_client(self, url, filename):
        """Strategy 4: smallest stream of any kind, lower quality, no resampling."""
        return self._download(url, filename, 'worst', quality='48', resample=False)

# ==============================
# Enhanced Voice Effect Converter
# ==============================
class OptimizedVoiceConverter:
    """End-to-end pipeline: load audio, estimate vocals, apply an effect, remix.

    ``styles`` maps the user-facing style names to method names on
    ``OptimizedVoiceModeler``.  Some code paths also recognise a "Chipmunk"
    style that is not listed in ``styles``; for it no effect is applied but the
    style-specific separation and mixing settings are still used.
    """

    # Spectral gain settings used to emphasise the vocal band, per style:
    # (low_hz, high_hz, in_band_gain, below_band_gain, above_band_gain).
    # The stereo table is used for the mid channel of 2-channel input, the
    # mono table for 1-channel input (which leaves the top band untouched).
    _STEREO_GAINS = {
        "Chipmunk": (200, 6000, 2.5, 0.1, 0.3),
        "Party Mashup": (100, 5500, 2.2, 0.4, 0.7),
    }
    _STEREO_GAINS_DEFAULT = (80, 5000, 2.0, 0.3, 0.5)
    _MONO_GAINS = {
        "Chipmunk": (200, 6000, 2.2, 0.2, 1.0),
        "Party Mashup": (100, 5500, 1.9, 0.5, 1.0),
    }
    _MONO_GAINS_DEFAULT = (80, 5000, 1.6, 0.4, 1.0)

    def __init__(self):
        self.voice_modeler = OptimizedVoiceModeler()
        self.styles = {
            "Animated Voice": "animated_voice_processing",
            "Slowed + Reverb": "slowed_reverb_processing",
            "Party Mashup": "party_banger_processing",  # Updated name
        }

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------
    def convert_voice_effect(self, youtube_url, voice_style, vocal_vol, music_vol, progress_fn):
        """Download ``youtube_url`` and render it with ``voice_style``.

        Args:
            youtube_url: URL passed to ``FastDownloader``.
            voice_style: Style name (see ``get_styles``).
            vocal_vol: Gain applied to the processed vocals.
            music_vol: Gain applied to the instrumental estimate.
            progress_fn: ``callable(fraction, message)`` for progress updates.

        Returns:
            ``(output_path, title, processing_time_seconds)``.

        Raises:
            Exception: ``"Conversion failed: ..."`` with the original error
                attached as ``__cause__``.
        """
        start_time = time.time()
        temp_id = self._new_temp_id()

        try:
            downloader = FastDownloader()
            audio_path, title = downloader.download_audio_fast(youtube_url, f"input_{temp_id}", progress_fn)

            progress_fn(0.35, "Advanced vocal separation...")

            y, sr = librosa.load(audio_path, sr=22050, mono=False)
            output_path = self._run_pipeline(
                y, sr, voice_style, vocal_vol, music_vol, progress_fn, 0.55, temp_id
            )

            processing_time = time.time() - start_time
            progress_fn(1.0, f"{voice_style} effect complete! ({processing_time:.1f}s)")

            return output_path, title, processing_time

        except Exception as e:
            raise Exception(f"Conversion failed: {str(e)}") from e
        finally:
            # Remove the downloaded input (and any partial downloads) whether
            # or not the conversion succeeded.
            self._remove_temp_files(temp_id)
            gc.collect()

    def convert_voice_effect_from_file(self, audio_file_path, voice_style, vocal_vol, music_vol, progress_fn):
        """Render an existing audio file with ``voice_style``.

        Only the first 90 seconds of the file are loaded.  Arguments other
        than ``audio_file_path`` match ``convert_voice_effect``.

        Returns:
            ``(output_path, "Uploaded Audio", processing_time_seconds)``.

        Raises:
            Exception: ``"Conversion failed: ..."`` with the original error
                attached as ``__cause__``.
        """
        start_time = time.time()
        temp_id = self._new_temp_id()

        try:
            progress_fn(0.1, "Loading uploaded audio...")

            y, sr = librosa.load(audio_file_path, sr=22050, mono=False, duration=90)
            output_path = self._run_pipeline(
                y, sr, voice_style, vocal_vol, music_vol, progress_fn, 0.4, temp_id
            )

            processing_time = time.time() - start_time
            progress_fn(1.0, f"{voice_style} effect complete! ({processing_time:.1f}s)")

            return output_path, "Uploaded Audio", processing_time

        except Exception as e:
            raise Exception(f"Conversion failed: {str(e)}") from e
        finally:
            gc.collect()

    def get_styles(self):
        """Return the selectable style names in definition order."""
        return list(self.styles.keys())

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _new_temp_id():
        """Return a random identifier for this request's temp and output files.

        A timestamp-derived id repeats for requests arriving in the same
        second, which would make concurrent conversions overwrite each other.
        """
        return os.urandom(6).hex()

    @staticmethod
    def _remove_temp_files(temp_id):
        """Best-effort removal of every file in TEMP_DIR whose name contains ``temp_id``."""
        for leftover in Path(TEMP_DIR).glob(f"*{temp_id}*"):
            try:
                leftover.unlink()
            except OSError:
                # Cleanup must never mask the real result or error.
                pass

    @staticmethod
    def _frequency_gains(freqs, settings, dtype):
        """Build one gain per STFT frequency bin from a gain-table entry.

        Args:
            freqs: Bin centre frequencies in Hz.
            settings: ``(low_hz, high_hz, in_band, below, above)`` gains.
            dtype: Floating dtype of the returned array.
        """
        low, high, in_band, below, above = settings
        gains = np.ones(len(freqs), dtype=dtype)
        gains[(freqs >= low) & (freqs <= high)] *= in_band
        gains[freqs < low] *= below
        gains[freqs > high] *= above
        return gains

    def _run_pipeline(self, y, sr, voice_style, vocal_vol, music_vol, progress_fn, effect_fraction, temp_id):
        """Separate, process, mix and write the result; return the output path.

        ``effect_fraction`` is the progress value reported when the effect
        stage starts (it differs between the URL and the file entry points).
        """
        vocals, instrumental = self._separate_vocals_optimized(y, sr, voice_style)

        progress_fn(effect_fraction, f"Applying {voice_style} effect...")
        converted_vocals = self._apply_voice_effect(vocals, sr, voice_style)

        progress_fn(0.75, "Professional mixing...")
        final_mix = self._optimized_mix(converted_vocals, instrumental, vocal_vol, music_vol, sr, voice_style)

        output_path = os.path.join(OUTPUT_DIR, f"final_{temp_id}.wav")
        sf.write(output_path, final_mix, sr)
        return output_path

    def _separate_vocals_optimized(self, y, sr, voice_style):
        """Split ``y`` into a vocal estimate and an instrumental estimate.

        This is a heuristic, not true source separation: the vocal band of the
        spectrum is boosted and the rest attenuated, and the instrumental is
        the source minus a fraction of that vocal estimate.

        Args:
            y: Audio samples, shape ``(n,)`` or ``(channels, n)``; for
                multi-channel input only the first two channels are used.
            sr: Sample rate in Hz.
            voice_style: Style name selecting the gain settings.

        Returns:
            ``(vocals, instrumental)`` as 1-D arrays of equal length.
        """
        # A single-channel 2-D array has no second channel to form mid/side
        # from; treat it as plain mono.
        if y.ndim == 2 and y.shape[0] == 1:
            y = y[0]

        freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)

        if y.ndim == 2:
            left, right = y[0], y[1]

            # Mid carries centred content, side carries the stereo difference.
            mid = (left + right) / 2
            side = (left - right) / 2

            S_mid = librosa.stft(mid, n_fft=2048, hop_length=512)
            settings = self._STEREO_GAINS.get(voice_style, self._STEREO_GAINS_DEFAULT)
            gains = self._frequency_gains(freqs, settings, S_mid.real.dtype)

            # Apply the per-bin gains to every frame (broadcast over time).
            vocals_enhanced = librosa.istft(S_mid * gains[:, np.newaxis], hop_length=512)

            min_len = min(len(vocals_enhanced), len(side), len(mid))
            vocals = (vocals_enhanced[:min_len] * 0.8 + side[:min_len] * 0.2)
            instrumental = mid[:min_len] - vocals * 0.35

        else:
            S = librosa.stft(y, n_fft=2048)
            settings = self._MONO_GAINS.get(voice_style, self._MONO_GAINS_DEFAULT)
            gains = self._frequency_gains(freqs, settings, S.real.dtype)

            vocals = librosa.istft(S * gains[:, np.newaxis])

            min_len = min(len(y), len(vocals))
            vocals = vocals[:min_len]
            instrumental = y[:min_len] - vocals * 0.3

        return vocals, instrumental

    def _apply_voice_effect(self, vocals, sr, style_name):
        """Run the modeler method registered for ``style_name``.

        Unknown styles return ``vocals`` unchanged.
        """
        if style_name not in self.styles:
            return vocals

        processing_method = getattr(self.voice_modeler, self.styles[style_name])
        return processing_method(vocals, sr)

    def _optimized_mix(self, vocals, instrumental, vocal_vol, music_vol, sr, voice_style):
        """Mix processed vocals with a style-dependent, "ducked" instrumental.

        Both inputs are truncated to the shorter length and scaled by their
        volume.  A band around the vocal range is partially subtracted from
        the instrumental, then the sum is peak-limited and soft-clipped.

        If a filter cannot be designed or applied (``ValueError`` from SciPy,
        e.g. a band edge above Nyquist or a too-short signal) the style's
        simple gain fallback is used instead.

        Args:
            vocals: Processed vocal samples (NumPy array).
            instrumental: Instrumental samples (NumPy array).
            vocal_vol: Gain for the vocals.
            music_vol: Gain for the instrumental.
            sr: Sample rate in Hz.
            voice_style: Style name selecting the mixing recipe.

        Returns:
            The mixed 1-D signal.
        """
        min_len = min(len(vocals), len(instrumental))
        vocals = vocals[:min_len] * vocal_vol
        instrumental = instrumental[:min_len] * music_vol

        # filtfilt preserves length, so filtered copies line up with
        # ``instrumental`` without further truncation.
        if voice_style == "Chipmunk":
            # Reduce music more aggressively for chipmunk
            try:
                b_notch, a_notch = butter(3, [800, 4000], btype='band', fs=sr)
                notch = filtfilt(b_notch, a_notch, instrumental)
                instrumental_ducked = instrumental - notch * 0.4
            except ValueError:
                instrumental_ducked = instrumental * 0.7
        elif voice_style == "Party Mashup":
            # Boost bass and treble for an electronic feel and duck only the
            # mid frequencies where vocals sit.
            try:
                b_bass, a_bass = butter(2, 150, btype='low', fs=sr)
                bass_boost = filtfilt(b_bass, a_bass, instrumental) * 1.3

                b_treble, a_treble = butter(2, 8000, btype='high', fs=sr)
                treble_boost = filtfilt(b_treble, a_treble, instrumental) * 1.2

                b_notch, a_notch = butter(2, [800, 3500], btype='band', fs=sr)
                notch = filtfilt(b_notch, a_notch, instrumental)

                instrumental_ducked = (instrumental - notch * 0.2 +
                                       bass_boost * 0.15 + treble_boost * 0.1)
            except ValueError:
                instrumental_ducked = instrumental * 0.95
        elif voice_style == "Slowed + Reverb":
            try:
                # Gentle high-cut on instrumental for warmth
                b_warm, a_warm = butter(2, 6000, btype='low', fs=sr)
                warm_inst = filtfilt(b_warm, a_warm, instrumental)

                # Light ducking in vocal range
                b_notch, a_notch = butter(1, [1000, 2800], btype='band', fs=sr)
                notch = filtfilt(b_notch, a_notch, instrumental)
                instrumental_ducked = warm_inst - notch * 0.15
            except ValueError:
                instrumental_ducked = instrumental * 0.85
        else:
            # Standard ducking
            try:
                b_notch, a_notch = butter(2, [1200, 3000], btype='band', fs=sr)
                notch = filtfilt(b_notch, a_notch, instrumental)
                instrumental_ducked = instrumental - notch * 0.25
            except ValueError:
                instrumental_ducked = instrumental

        min_len = min(len(vocals), len(instrumental_ducked))
        mixed = vocals[:min_len] + instrumental_ducked[:min_len]

        # Final mastering: scale peaks down to a ceiling, then soft-clip with
        # tanh.  Party Mashup uses a lower ceiling and a harder drive.
        if voice_style == "Party Mashup":
            ceiling, drive, out_gain = 0.85, 1.15, 0.92
        else:
            ceiling, drive, out_gain = 0.9, 1.05, 0.95

        max_val = np.max(np.abs(mixed))
        if max_val > ceiling:
            mixed = mixed / max_val * ceiling
        return np.tanh(mixed * drive) * out_gain

# ==============================
# Gradio Interface
# ==============================
# One converter instance shared by every request handled by the interface.
converter = OptimizedVoiceConverter()


def process_with_input(youtube_url, uploaded_file, voice_style, vocal_vol, music_vol, progress=gr.Progress()):
    """Gradio click handler: convert an uploaded file or a YouTube URL.

    An uploaded file takes precedence over the URL.  Any failure is reported
    through the status message instead of being raised.

    Returns:
        ``(audio_path_or_None, status_message, track_info)``.
    """
    def relay(fraction, text):
        # Adapt the converter's (fraction, message) callback to gr.Progress.
        progress(fraction, desc=text)

    try:
        if uploaded_file is None:
            # No upload: a non-blank URL is required.
            if not (youtube_url and youtube_url.strip()):
                return None, "❌ Error: Please provide either a YouTube URL or upload an audio file", ""
            result = converter.convert_voice_effect(
                youtube_url.strip(), voice_style, vocal_vol, music_vol, relay
            )
        else:
            result = converter.convert_voice_effect_from_file(
                uploaded_file, voice_style, vocal_vol, music_vol, relay
            )

        output_path, title, processing_time = result
        status = f"✅ {voice_style} effect applied in {processing_time:.1f}s"
        return output_path, status, f"🎵 {title}"
    except Exception as e:
        return None, f"❌ Error: {str(e)}", ""

# Build the Gradio interface. Components are declared in the order they are
# laid out, so statement order inside this block is significant.
with gr.Blocks(
    title="AI Voice Effect Converter",
    theme=gr.themes.Soft(),
    css="""
        .effect-card { background: linear-gradient(45deg, #667eea, #764ba2); color: white;
                      padding: 1rem; border-radius: 10px; margin: 0.5rem; text-align: center; }
        .feature { background: #f8f9fa; padding: 1rem; border-radius: 8px; margin: 0.5rem;
                  border-left: 4px solid #28a745; }
    """
) as app:

    # Page header banner.
    gr.HTML("""
    <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea, #764ba2); color: white; border-radius: 15px; margin-bottom: 2rem;">
        <h1 style="margin: 0; font-size: 2.5em;">🎤 Enhanced Voice Effect Converter</h1>
        <p style="margin: 1rem 0; font-size: 1.3em;"> Improved Slowed Reverb • Party Mashup • Animated Voice</p>
    </div>
    """)

    with gr.Row():
        # Left column: inputs (source audio, style, levels) and the trigger.
        with gr.Column(scale=3):
            youtube_input = gr.Textbox(
                label="🎵 YouTube URL",
                placeholder="Paste YouTube URL for voice transformation...",
                lines=2
            )

            # type="filepath" hands the handler a path to the uploaded file.
            audio_file = gr.Audio(
                label="📁 Or Upload Audio File",
                type="filepath",
                sources=["upload"]
            )

            # Choices come from the converter so the dropdown always matches
            # the styles it can actually apply.
            voice_style = gr.Dropdown(
                choices=converter.get_styles(),
                label="🎤 Voice Effect Style",
                value="Animated Voice",
                info="Choose from 3 voice effects"
            )

            # Slider positional arguments: minimum, maximum, initial value, step.
            with gr.Row():
                vocal_vol = gr.Slider(0.8, 2.0, 1.3, 0.1, label="🎙️ Voice Level")
                music_vol = gr.Slider(0.2, 1.0, 0.5, 0.1, label="🎵 Music Level")

            convert_btn = gr.Button("🎤 TRANSFORM VOICE", variant="primary", size="lg")

        # Right column: status text, track info and the rendered audio.
        with gr.Column(scale=2):
            status_display = gr.Textbox(label="🔄 Processing Status", lines=3)
            file_info_display = gr.Textbox(label="📁 Track Info", lines=2)
            audio_output = gr.Audio(label="🎧 Voice Effect Output")



    # Clickable examples that fill in the URL and style inputs.
    # NOTE(review): the third URL ("watch?v=example1") looks like a
    # placeholder rather than a real video id — confirm or replace.
    gr.Examples([
        ["https://www.youtube.com/watch?v=L_jWHffIx5E", "Animated Voice"],
        ["https://www.youtube.com/watch?v=JGwWNGJdvx8", "Slowed + Reverb"],
        ["https://www.youtube.com/watch?v=example1", "Party Mashup"],
    ], inputs=[youtube_input, voice_style], label="🎯 Try These Examples")

    # The three handler return values map, in order, onto the three outputs.
    convert_btn.click(
        fn=process_with_input,
        inputs=[youtube_input, audio_file, voice_style, vocal_vol, music_vol],
        outputs=[audio_output, status_display, file_info_display]
    )

    # Page footer banner.
    gr.HTML("""
    <div style="margin-top: 2rem; padding: 1rem; background: linear-gradient(45deg, #ff6b6b, #4ecdc4); border-radius: 10px; text-align: center;">
        <p style="margin: 0; color: #2c3e50;">
            🎯 <strong>Enhanced Voice Effects</strong> • Party Mashup with electronic elements • Improved Slowed + Reverb timing
        </p>
    </div>
    """)

# Start the web server only when this file is executed directly (a notebook
# cell also runs with __name__ == "__main__").
if __name__ == "__main__":
    app.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://70ea88810db53096c3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
