In [None]:
!pip install f5-tts gradio ruaccent runorm transformers torch torchaudio huggingface_hub soundfile pydub

Collecting f5-tts
  Downloading f5_tts-1.1.7-py3-none-any.whl.metadata (10 kB)
Collecting ruaccent
  Downloading ruaccent-1.5.8.3-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting bitsandbytes>0.37.0 (from f5-tts)
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting cached_path (from f5-tts)
  Downloading cached_path-1.7.3-py3-none-any.whl.metadata (19 kB)
Collecting ema_pytorch>=0.5.2 (from f5-tts)
  Downloading ema_pytorch-0.7.7-py3-none-any.whl.metadata (689 bytes)
Collecting gradio
  Downloading gradio-5.35.0-py3-none-any.whl.metadata (16 kB)
Collecting hydra-core>=1.3.0 (from f5-tts)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting numpy<=1.26.4 (from f5-tts)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.

In [None]:
# PRESS "RESTART SESSION" before running the next cell

In [None]:
#!/usr/bin/env python3
import time
import re
import os
import gc
import tempfile
import traceback
from pydub import AudioSegment
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torchaudio
from huggingface_hub import hf_hub_download, snapshot_download
from ruaccent import RUAccent
from runorm import RUNorm
from f5_tts.infer.utils_infer import (
    infer_process,
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
    save_spectrogram,
    tempfile_kwargs,
)
from f5_tts.model import DiT

# Imports needed for Whisper (speech recognition)
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Built-in checkpoints that can be fetched from the Hugging Face Hub.
# Each entry names the repository, the checkpoint and vocabulary files inside
# it, and the keyword arguments used to build the DiT backbone.
MODELS_CONFIG = {
    "ESpeech-TTS-1_SFT-95K": {
        "repo": "ESpeech/ESpeech-TTS-1_SFT-95K",
        "model_file": "espeech_tts_95k.pt",
        "vocab_file": "vocab.txt",
        "model_cfg": {
            "dim": 1024, "depth": 22, "heads": 16,
            "ff_mult": 2, "text_dim": 512, "conv_layers": 4,
        },
    },
    "ESpeech-TTS-1_RL-V1": {
        "repo": "ESpeech/ESpeech-TTS-1_RL-V1",
        "model_file": "espeech_tts_rlv1.pt",
        "vocab_file": "vocab.txt",
        "model_cfg": {
            "dim": 1024, "depth": 22, "heads": 16,
            "ff_mult": 2, "text_dim": 512, "conv_layers": 4,
        },
    },
    "ESpeech-TTS-1_RL-V2": {
        "repo": "ESpeech/ESpeech-TTS-1_RL-V2",
        "model_file": "espeech_tts_rlv2.pt",
        "vocab_file": "vocab.txt",
        "model_cfg": {
            "dim": 1024, "depth": 22, "heads": 16,
            "ff_mult": 2, "text_dim": 512, "conv_layers": 4,
        },
    },
    "F5TTS_v1_Base_v2_Misha24-10": {
        "repo": "Misha24-10/F5-TTS_RUSSIAN",
        "model_file": "F5TTS_v1_Base_v2/model_last_inference.safetensors",
        "vocab_file": "F5TTS_v1_Base/vocab.txt",
        "model_cfg": {
            "dim": 1024, "depth": 22, "heads": 16,
            "ff_mult": 2, "text_dim": 512, "conv_layers": 4,
        },
    },
    "F5TTS_v1_Base_1.25M_SWivid": {
        "repo": "SWivid/F5-TTS",
        "model_file": "F5TTS_v1_Base/model_1250000.safetensors",
        "vocab_file": "F5TTS_v1_Base/vocab.txt",
        "model_cfg": {
            "dim": 1024, "depth": 22, "heads": 16,
            "ff_mult": 2, "text_dim": 512, "conv_layers": 4,
        },
    },
}

# Global state shared by the model-handling functions
loaded_models = {}  # cache: model name -> loaded model object
loaded_vocoder = None  # vocoder instance, filled in lazily by ensure_vocoder()
current_model_name = None  # name of the model most recently loaded
stop_generation = False  # cancellation flag polled by the loading/generation code
remember_seed = False  # mirrors the "remember seed" checkbox passed to synthesize()
last_seed = -1  # -1 means "no seed chosen yet"

# NEW: registry of locally added models (same entry layout as MODELS_CONFIG)
local_models_config = {}

def check_cuda_availability():
    """Print the active CUDA device name, or raise if CUDA is unavailable.

    Raises:
        RuntimeError: when ``torch.cuda.is_available()`` is false.
    """
    if torch.cuda.is_available():
        print(f"CUDA is available. Using device: {torch.cuda.get_device_name()}")
        return
    raise RuntimeError("CUDA is not available! This application requires GPU with CUDA support.")

def add_local_model(model_path, vocab_path, model_name):
    """Register a local checkpoint so it can be selected like a built-in model.

    Args:
        model_path: Path to the model checkpoint file on disk.
        vocab_path: Path to the vocabulary file on disk.
        model_name: Name under which the model is registered.

    Returns:
        Always a 2-tuple of ``gr.update`` objects: (model selector update,
        status message update). On a validation failure the first element is
        an empty update (no change) so that every path yields the same number
        of outputs; previously the failure paths returned a single update
        while the success path returned two.
    """
    def failure(message):
        # Leave the selector untouched and only show the error message.
        return gr.update(), gr.update(value=message, visible=True)

    if not model_path or not vocab_path or not model_name:
        return failure("‚ùå –ó–∞–ø–æ–ª–Ω–∏—Ç–µ –≤—Å–µ –ø–æ–ª—è")

    # Both files must exist before the model is registered.
    if not os.path.exists(model_path):
        return failure("‚ùå –§–∞–π–ª –º–æ–¥–µ–ª–∏ –Ω–µ –Ω–∞–π–¥–µ–Ω")

    if not os.path.exists(vocab_path):
        return failure("‚ùå –§–∞–π–ª —Å–ª–æ–≤–∞—Ä—è –Ω–µ –Ω–∞–π–¥–µ–Ω")

    # "repo": "local" marks the entry as a local model; the file entries are
    # plain filesystem paths. The architecture settings are the same ones the
    # built-in models use.
    local_models_config[model_name] = {
        "repo": "local",
        "model_file": model_path,
        "vocab_file": vocab_path,
        "model_cfg": dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
    }

    # Offer built-in and local models together; local entries win on a name clash.
    updated_models_config = {**MODELS_CONFIG, **local_models_config}

    return gr.update(
        choices=list(updated_models_config.keys()),
        value=model_name
    ), gr.update(value=f"‚úÖ –õ–æ–∫–∞–ª—å–Ω–∞—è –º–æ–¥–µ–ª—å '{model_name}' –¥–æ–±–∞–≤–ª–µ–Ω–∞", visible=True)

def load_model_with_progress(model_name, progress_callback=None):
    """Load a TTS model (local or from the Hugging Face Hub), cache and return it.

    Args:
        model_name: Key of the model in MODELS_CONFIG or local_models_config.
        progress_callback: Optional callable ``(message, fraction)`` invoked
            at the main stages of loading.

    Returns:
        The loaded model, or None when the global ``stop_generation`` flag is
        found set at one of the checkpoints.

    Raises:
        ValueError: if ``model_name`` is empty or not a known model.
        RuntimeError: if CUDA is not available (from check_cuda_availability).
        FileNotFoundError: if the checkpoint or vocabulary file cannot be found.
    """
    global current_model_name

    def report(message, fraction):
        # Forward progress only when the caller supplied a callback.
        if progress_callback:
            progress_callback(message, fraction)

    # Honour a pending stop request before doing any work.
    if stop_generation:
        return None

    # Built-in and local models share one namespace; local entries win on a clash.
    all_models_config = {**MODELS_CONFIG, **local_models_config}

    if not model_name or model_name not in all_models_config:
        raise ValueError(f"Unknown model: {model_name}")

    # Already cached: return it as is.
    if model_name in loaded_models:
        return loaded_models[model_name]

    config = all_models_config[model_name]

    # The model is moved to "cuda" below regardless of where it comes from,
    # so verify CUDA for local models too (previously only the Hub branch
    # performed this check).
    check_cuda_availability()

    if config['repo'] == "local":
        # Local model: the config entries are filesystem paths.
        model_path = config['model_file']
        vocab_path = config['vocab_file']

        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Local model file not found: {model_path}")
        if not os.path.exists(vocab_path):
            raise FileNotFoundError(f"Local vocab file not found: {vocab_path}")

        print(f"Loading local model from: {model_path}")

    else:
        # Hugging Face model: download the two files individually first.
        report("üîç –ü–æ–∏—Å–∫ –º–æ–¥–µ–ª–∏...", 0.1)

        model_path = None
        vocab_path = None

        report("üì• –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏...", 0.3)

        try:
            model_path = hf_hub_download(repo_id=config['repo'], filename=config['model_file'])
            vocab_path = hf_hub_download(repo_id=config['repo'], filename=config['vocab_file'])
            print(f"Downloaded model to {model_path}")
            print(f"Downloaded vocab to {vocab_path}")
        except Exception as e:
            # Best effort: the snapshot fallback below gets a chance.
            print("hf_hub_download failed:", e)

        if stop_generation:
            return None

        # Fallback: download the whole repository and look for the files in it.
        if model_path is None or vocab_path is None:
            try:
                local_dir = f"cache_{config['repo'].replace('/', '_')}"
                report("üì¶ –°–∫–∞—á–∏–≤–∞–Ω–∏–µ –ø–æ–ª–Ω–æ–π –º–æ–¥–µ–ª–∏...", 0.5)
                snapshot_dir = snapshot_download(repo_id=config['repo'], cache_dir=None, local_dir=local_dir)
                possible_model = os.path.join(snapshot_dir, config['model_file'])
                possible_vocab = os.path.join(snapshot_dir, config['vocab_file'])
                if os.path.exists(possible_model):
                    model_path = possible_model
                if os.path.exists(possible_vocab):
                    vocab_path = possible_vocab
            except Exception as e:
                # Reported below as FileNotFoundError if the files are still missing.
                print("snapshot_download failed:", e)

        if stop_generation:
            return None

        # Both download strategies may have failed; make that explicit.
        if not model_path or not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found: {model_path}")
        if not vocab_path or not os.path.exists(vocab_path):
            raise FileNotFoundError(f"Vocab file not found: {vocab_path}")

    # Common path for local and downloaded models.
    report("üîÑ –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ –≤ –ø–∞–º—è—Ç—å...", 0.7)

    print(f"Loading model from: {model_path}")
    model = load_model(DiT, config['model_cfg'], model_path, vocab_file=vocab_path)

    # A stop requested during loading discards the freshly loaded model.
    if stop_generation:
        return None

    report("üöÄ –ü–µ—Ä–µ–Ω–æ—Å –º–æ–¥–µ–ª–∏ –Ω–∞ GPU...", 0.9)

    device = torch.device("cuda")
    model.to(device)
    print(f"Model loaded and moved to CUDA: {device}")

    # Cache the model so later calls return immediately.
    loaded_models[model_name] = model
    current_model_name = model_name

    report("‚úÖ –ú–æ–¥–µ–ª—å –∑–∞–≥—Ä—É–∂–µ–Ω–∞!", 1.0)

    return model

def ensure_model(model_name, progress_callback=None):
    """Return the requested model, loading it first when it is not cached.

    Args:
        model_name: Key of the model in MODELS_CONFIG or local_models_config.
        progress_callback: Optional ``(message, fraction)`` callable that is
            passed on to load_model_with_progress().

    Raises:
        ValueError: if ``model_name`` is empty or not a known model.
    """
    if not model_name:
        raise ValueError("Model name must be specified")

    # Built-in and local models are looked up in one merged mapping.
    known_models = {**MODELS_CONFIG, **local_models_config}
    if model_name not in known_models:
        raise ValueError(f"Unknown model: {model_name}")

    # Cache miss: load now (with progress reporting); otherwise reuse the cached model.
    if model_name not in loaded_models:
        return load_model_with_progress(model_name, progress_callback)
    return loaded_models[model_name]

def ensure_vocoder():
    """Return the shared vocoder, loading it and moving it to CUDA on first use.

    Returns:
        The cached vocoder instance.

    Raises:
        RuntimeError: if CUDA is not available (from check_cuda_availability).
        Exception: any error from loading the vocoder or moving it to the
            GPU is logged and re-raised unchanged.
    """
    global loaded_vocoder
    if loaded_vocoder is not None:
        return loaded_vocoder

    check_cuda_availability()

    print("‚è≥ Loading vocoder...")

    try:
        # Work on a local name and publish it to the global cache only after
        # the move to the GPU succeeded; otherwise a failure in .to() would
        # leave a half-initialised vocoder cached for all later calls.
        vocoder = load_vocoder()
        vocoder.to(torch.device("cuda"))
    except Exception as e:
        print(f"‚ùå Vocoder loading failed: {e}")
        raise

    loaded_vocoder = vocoder
    print("‚úÖ Vocoder loaded successfully")
    return loaded_vocoder

def stop_generation_process():
    """Request cancellation of the running generation and free memory.

    Sets the global ``stop_generation`` flag, which the loading and
    generation code polls, then releases memory.

    Returns:
        The status message shown to the user.
    """
    global stop_generation
    stop_generation = True
    # Collect garbage first so that tensors which just became unreachable are
    # freed; only then can empty_cache() hand their cached blocks back.
    gc.collect()
    torch.cuda.empty_cache()
    return "üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞"

def reset_stop_flag():
    """Clear the global cancellation flag set by stop_generation_process()."""
    global stop_generation
    stop_generation = False

# --- Load RUNorm (text normalizer) ---
print("Loading RUNorm (text normalizer)...")
try:
    normalizer = RUNorm()
    normalizer.load(
        model_size="medium",
        device="cuda" if torch.cuda.is_available() else "cpu",
        workdir="./local_cache"  # directory used to cache the normalizer's models
    )
    print("RUNorm loaded.")
except Exception as e:
    print(f"Failed to load RUNorm: {e}")
    # exit() is a convenience helper installed by the `site` module (and
    # replaced by IPython in notebooks); raising SystemExit directly behaves
    # the same in every environment.
    raise SystemExit(1) from e

# --- Load RUAccent (stress-mark placement, used by process_text_with_accent) ---
print("Loading RUAccent...")
accentizer = RUAccent()
accentizer.load(omograph_model_size='turbo3.1', use_dictionary=True, tiny_mode=False)
print("RUAccent loaded.")


# --- Load the Whisper ASR model used by transcribe_audio() ---
# Loading is best effort: on any failure both globals are set to None and
# transcribe_audio() then returns an empty string.
print("Loading Whisper ASR model (supports Russian and English)...")
try:
    # NOTE: these names are also imported at the top of the file.
    from transformers import WhisperProcessor, WhisperForConditionalGeneration

    ASR_MODEL_NAME = "openai/whisper-medium"  # Fast and accurate. Alternatives: "openai/whisper-base", "medium", "large-v3"
    asr_processor = WhisperProcessor.from_pretrained(ASR_MODEL_NAME)
    asr_model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_NAME)
    # Use the GPU when present, otherwise run on the CPU.
    asr_model.to("cuda" if torch.cuda.is_available() else "cpu")
    print("Whisper model loaded. Supports Russian, English, and many other languages.")
except Exception as e:
    print(f"Failed to load Whisper model: {e}")
    asr_processor = None
    asr_model = None

def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcription with its first character upper-cased and a final
        period added when it does not already end in sentence punctuation.
        Returns "" when no path is given, when the ASR model is unavailable,
        or when nothing was recognised. Returns a fixed error message when
        transcription raises.
    """
    if not audio_path or asr_processor is None or asr_model is None:
        return ""

    try:
        # 1. Load the audio
        waveform, sample_rate = torchaudio.load(audio_path)

        # 2. Resample to 16 kHz (the rate passed to the processor below)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)

        # 3. Down-mix multi-channel audio to mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # 4. Convert to numpy
        input_audio = waveform.squeeze().numpy()

        # 5. Feature extraction
        inputs = asr_processor(
            input_audio,
            sampling_rate=16000,
            return_tensors="pt",
        )

        input_features = inputs.input_features

        # Force the last axis to exactly 3000 frames: zero-pad shorter
        # inputs, crop longer ones.
        current_length = input_features.shape[-1]
        if current_length < 3000:
            pad_length = 3000 - current_length
            input_features = torch.nn.functional.pad(input_features, (0, pad_length), mode='constant', value=0)
        elif current_length > 3000:
            input_features = input_features[..., :3000]

        # Original author's note: expected to be [1, 80, 3000]
        print("Input features shape:", input_features.shape)

        # 6. Move the features to the model's device
        input_features = input_features.to(asr_model.device)

        # 7. Generate token ids
        with torch.no_grad():
            generated_ids = asr_model.generate(
                input_features=input_features,
                language=None,
                task="transcribe",
            )

        # 8. Decode
        transcription = asr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        text = transcription.strip()
        if not text:
            # Nothing recognised: do not fabricate a lone "." as the result.
            return ""
        # Upper-case only the first character; str.capitalize() would also
        # lower-case the rest and destroy names and abbreviations.
        text = text[0].upper() + text[1:]
        # Avoid a doubled period when the text already ends a sentence.
        if text[-1] not in ".!?…":
            text += "."
        return text

    except Exception as e:
        print("ASR transcription failed:", e)
        return "–û—à–∏–±–∫–∞ —Ä–∞—Å–ø–æ–∑–Ω–∞–≤–∞–Ω–∏—è."
# --- End of transcribe_audio ---

def validate_and_transcribe_audio(audio_path):
    """Transcribe the given reference audio and build the handler outputs.

    Returns:
        A 3-tuple ``(audio_path, transcription, message_update)``. Without a
        path, or when transcription raises, the first two elements are
        ``None`` and ``""``; the message is only made visible on an error.
    """
    no_message = {"value": "", "visible": False}

    if audio_path:
        try:
            # No checks on the audio itself: just transcribe and pass the
            # file through unchanged.
            return audio_path, transcribe_audio(audio_path), gr.update(**no_message)
        except Exception as e:
            print("Validation failed:", e)
            return None, "", gr.update(value="‚ö†Ô∏è **–û—à–∏–±–∫–∞ –ø—Ä–∏ –ø—Ä–æ–≤–µ—Ä–∫–µ –∞—É–¥–∏–æ**", visible=True)

    return None, "", gr.update(**no_message)

# Check CUDA availability at startup and abort when there is no usable GPU
print("Checking CUDA availability...")
try:
    check_cuda_availability()
except RuntimeError as e:
    print(f"FATAL ERROR: {e}")
    # exit() is a convenience helper installed by the `site` module (and
    # replaced by IPython in notebooks); raising SystemExit directly behaves
    # the same in every environment.
    raise SystemExit(1) from e

# --- Text processing: FIRST runorm (normalization), THEN ruaccent (stress marks) ---
def process_text_with_accent(text):
    """Normalize text with RUNorm and add stress marks with RUAccent.

    Text that already contains '+' stress marks inside words skips
    normalization. Any text containing a '+' after that step skips automatic
    accenting. Failures of either tool are logged and the text is passed
    through unchanged. Empty or whitespace-only input is returned as is.
    """
    if not text or not text.strip():
        return text

    # A '+' directly between two word characters (e.g. "priv+et") counts as a
    # manual stress mark; a spaced "5 + 3" does not match.
    manually_accented = re.search(r'\w\+\w', text) is not None

    if not manually_accented:
        try:
            # Hide every '+' behind a placeholder so the normalizer leaves it
            # alone, then restore it afterwards.
            shielded = text.replace("+", "‚ö°")
            text = normalizer.norm(shielded).replace("‚ö°", "+")
        except Exception as e:
            print(f"runorm.norm() failed: {e}")

    # Automatic accenting only applies to text without any '+'.
    if '+' not in text:
        try:
            return accentizer.process_all(text)
        except Exception as e:
            print(f"ruaccent failed: {e}")
    return text

def process_texts_only(ref_text, gen_text):
    """Run both texts through process_text_with_accent().

    A reference text that looks like a file path (contains one of the
    markers below) is replaced by an empty string first.

    Returns:
        A tuple ``(processed_ref_text, processed_gen_text)``.
    """
    path_markers = ("/tmp/", ".wav", ".mp3", ".ogg", ".flac", "/home/", "/root/")

    if ref_text and isinstance(ref_text, str):
        for marker in path_markers:
            if marker in ref_text:
                ref_text = ""
                break

    processed_ref = process_text_with_accent(ref_text)
    processed_gen = process_text_with_accent(gen_text)
    return processed_ref, processed_gen

def get_current_seed_display():
    """Return the seed value to display, drawing a random one on first use."""
    global last_seed
    if last_seed != -1:
        return last_seed

    # -1 marks "not set yet": draw a random seed and keep it.
    last_seed = np.random.randint(0, 2**31 - 1)
    print(f"üé≤ –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω –Ω–∞—á–∞–ª—å–Ω—ã–π seed: {last_seed}")
    return last_seed

# FIXED FUNCTION: refresh the model loading status
def update_model_loading_status(model_name):
    """Build the status update describing whether ``model_name`` is loaded.

    Returns:
        A ``gr.update`` with an HTML snippet: an error when the name is empty
        or unknown, otherwise "loaded" or "not loaded" together with the
        model's origin (local or Hugging Face).
    """
    def show(html):
        return gr.update(value=html, visible=True)

    if not model_name:
        return show("<div class='model-status model-error'>‚ùå –ú–æ–¥–µ–ª—å –Ω–µ –≤—ã–±—Ä–∞–Ω–∞</div>")

    # Built-in and local models are looked up in one merged mapping.
    known_models = {**MODELS_CONFIG, **local_models_config}
    if model_name not in known_models:
        return show(f"<div class='model-status model-error'>‚ùå –ú–æ–¥–µ–ª—å {model_name} –Ω–µ –Ω–∞–π–¥–µ–Ω–∞</div>")

    # The origin label is the same for both outcomes, so compute it once.
    if known_models[model_name]['repo'] == "local":
        model_type = "üîß –õ–æ–∫–∞–ª—å–Ω–∞—è"
    else:
        model_type = "üåê HuggingFace"

    if model_name in loaded_models:
        return show(f"<div class='model-status model-loaded'>‚úÖ {model_name} ({model_type}) –∑–∞–≥—Ä—É–∂–µ–Ω–∞</div>")
    return show(f"<div class='model-status model-loading'>üîÑ {model_name} ({model_type}) –Ω–µ –∑–∞–≥—Ä—É–∂–µ–Ω–∞</div>")

# FIXED FUNCTION: load a model and refresh its status
def load_model_with_status(model_name, progress=gr.Progress()):
    """Load ``model_name`` (if necessary) and return the resulting status update.

    Args:
        model_name: Name of the model to load.
        progress: Gradio progress tracker; receives the loading stages.

    Returns:
        A ``gr.update`` with the status HTML: the regular status after a
        successful load or when the model was already cached, otherwise a
        "cancelled" or error message.
    """
    if not model_name:
        return gr.update(value="<div class='model-status model-error'>‚ùå –ú–æ–¥–µ–ª—å –Ω–µ –≤—ã–±—Ä–∞–Ω–∞</div>", visible=True)

    # Nothing to do for a cached model: just report its status.
    if model_name in loaded_models:
        return update_model_loading_status(model_name)

    def forward_progress(msg, p):
        # Adapt the (message, fraction) callback to the Gradio progress API.
        return progress(p, desc=msg)

    try:
        progress(0.1, desc="üîÑ –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏...")

        model = load_model_with_progress(model_name, forward_progress)

        if not model:
            # The loader returns None when a stop was requested.
            return gr.update(value="<div class='model-status model-error'>‚ùå –ó–∞–≥—Ä—É–∑–∫–∞ –æ—Ç–º–µ–Ω–µ–Ω–∞</div>", visible=True)
        return update_model_loading_status(model_name)

    except Exception as e:
        error_msg = f"‚ùå –û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {str(e)}"
        print(f"Model loading error: {traceback.format_exc()}")
        return gr.update(value=f"<div class='model-status model-error'>{error_msg}</div>", visible=True)

def on_model_select(model_name):
    """Model-selection handler: show the selected model's status without loading it.

    Validation (empty name, unknown model) and the status rendering are both
    done by update_model_loading_status(), which returns exactly the error
    updates this handler used to build itself, so it simply delegates.

    Returns:
        The ``gr.update`` produced by update_model_loading_status().
    """
    return update_model_loading_status(model_name)

def synthesize(
    ref_audio,
    ref_text,
    gen_text,
    remove_silence,
    seed_input_value,
    remember_seed_checkbox,
    model_choice,
    cross_fade_duration=0.15,
    nfe_step=32,
    speed=1.0,
    sway_sampling_coef=-1,
    cfg_strength=2,
    audio_format="wav",
    bitrate="192k",
    progress=gr.Progress()
):
    global stop_generation, last_seed, remember_seed
    
    # üî¥ –û–ë–ù–û–í–õ–Ø–ï–ú –§–õ–ê–ì –ó–ê–ü–û–ú–ò–ù–ê–ù–ò–Ø –°–ò–î–ê
    remember_seed = remember_seed_checkbox
    
    reset_stop_flag()
    
    # üî¥ –°–†–ê–ó–£ –û–ë–ù–û–í–õ–Ø–ï–ú –°–¢–ê–¢–£–° –ú–û–î–ï–õ–ò
    current_model_status = update_model_loading_status(model_choice)
    
    # üî¥ –†–ê–ù–ù–Ø–Ø –ü–†–û–í–ï–†–ö–ê STOP_FLAG
    if stop_generation:
        gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
        return None, None, ref_text, gen_text, gr.update(value=get_current_seed_display()), current_model_status
    
    # üîÑ –û–ë–ù–û–í–õ–Ø–ï–ú –°–¢–ê–¢–£–° –ó–ê–ì–†–£–ó–ö–ò
    progress(0.05, desc="üîÑ –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞...")
    
    # üéØ –ü–†–û–í–ï–†–ö–ê –í–•–û–î–ù–´–• –î–ê–ù–ù–´–•
    if not ref_audio:
        gr.Warning("‚ùå –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –ø—Ä–µ–¥–æ—Å—Ç–∞–≤—å—Ç–µ —Ä–µ—Ñ–µ—Ä–µ–Ω—Å–Ω–æ–µ –∞—É–¥–∏–æ")
        return None, None, ref_text, gen_text, gr.update(value=get_current_seed_display()), current_model_status
    
    if not gen_text or not gen_text.strip():
        gr.Warning("‚ùå –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –≤–≤–µ–¥–∏—Ç–µ —Ç–µ–∫—Å—Ç –¥–ª—è –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏")
        return None, None, ref_text, gen_text, gr.update(value=get_current_seed_display()), current_model_status
        
    if not ref_text or not ref_text.strip():
        gr.Warning("‚ùå –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –ø—Ä–µ–¥–æ—Å—Ç–∞–≤—å—Ç–µ —Ä–µ—Ñ–µ—Ä–µ–Ω—Å–Ω—ã–π —Ç–µ–∫—Å—Ç")
        return None, None, ref_text, gen_text, gr.update(value=get_current_seed_display()), current_model_status

    # üîê –ó–ê–©–ò–¢–ê –û–¢ –ü–£–¢–ï–ô –ö –§–ê–ô–õ–ê–ú
    if gen_text and isinstance(gen_text, str):
        if any(x in gen_text for x in ["/tmp/", ".wav", ".mp3", ".ogg", ".flac"]):
            gr.Warning("‚ö†Ô∏è –ü–æ–ª–µ 'Text to Generate' —Å–æ–¥–µ—Ä–∂–∏—Ç –ø—É—Ç—å –∫ —Ñ–∞–π–ª—É. –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –≤–≤–µ–¥–∏—Ç–µ —Ç–µ–∫—Å—Ç –≤—Ä—É—á–Ω—É—é.")
            return None, None, ref_text, gen_text, gr.update(value=get_current_seed_display()), current_model_status

    # üî¥ –ü–†–û–í–ï–†–ö–ê STOP_FLAG
    if stop_generation:
        gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
        return None, None, ref_text, gen_text, gr.update(value=get_current_seed_display()), current_model_status

    # üé≤ –£–°–¢–ê–ù–û–í–ö–ê SEED - –ö–õ–Æ–ß–ï–í–û–ï –ò–ó–ú–ï–ù–ï–ù–ò–ï!
    current_seed = seed_input_value
    if seed_input_value is None or seed_input_value < 0 or seed_input_value > 2**31 - 1:
        # üî¥ –ì–ï–ù–ï–†–ò–†–£–ï–ú –°–õ–£–ß–ê–ô–ù–´–ô –°–ò–î
        current_seed = np.random.randint(0, 2**31 - 1)
        print(f"üé≤ –°–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω —Å–ª—É—á–∞–π–Ω—ã–π seed: {current_seed}")
    
    # üî¥ –ó–ê–ü–û–ú–ò–ù–ê–ï–ú –°–ò–î –ï–°–õ–ò –í–ö–õ–Æ–ß–ï–ù–ê –ì–ê–õ–û–ß–ö–ê
    if remember_seed:
        last_seed = current_seed
        print(f"üíæ Seed —Å–æ—Ö—Ä–∞–Ω–µ–Ω: {current_seed}")
    
    torch.manual_seed(int(current_seed))
    
    # üî¥ –í–´–í–û–î–ò–ú –ò–ù–§–û–†–ú–ê–¶–ò–Æ –û –°–ò–î–ï
    print(f"üé≤ –ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è seed: {current_seed} (–∑–∞–ø–æ–º–Ω–∏—Ç—å: {remember_seed})")

    # üìù –û–ë–†–ê–ë–û–¢–ö–ê –¢–ï–ö–°–¢–ê
    progress(0.08, desc="üìù –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞...")

    # --- ‚úÖ –ò—Å–ø–æ–ª—å–∑—É–µ–º —Ç–µ–∫—Å—Ç—ã –∫–∞–∫ –µ—Å—Ç—å ---
    processed_ref_text = ref_text
    processed_gen_text = gen_text

    # --- ‚úÖ –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è processed_ref_text_final –î–û try ---
    processed_ref_text_final = ref_text  

    try:
        # --- 1. –û–¢–õ–û–ñ–ï–ù–ù–ê–Ø –ó–ê–ì–†–£–ó–ö–ê –ú–û–î–ï–õ–ï–ô ---
        progress(0.1, desc="üîç –ü—Ä–æ–≤–µ—Ä–∫–∞ –º–æ–¥–µ–ª–µ–π...")
        
        # üî¥ –ü–†–û–í–ï–†–ö–ê STOP_FLAG
        if stop_generation:
            gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status
        
        # üì• –ó–ê–ì–†–£–ó–ö–ê –û–°–ù–û–í–ù–û–ô –ú–û–î–ï–õ–ò
        progress(0.15, desc="üì• –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏ TTS...")
        model = ensure_model(model_choice, lambda msg, p: progress(p, desc=msg))
        
        # üî¥ –û–ë–ù–û–í–õ–Ø–ï–ú –°–¢–ê–¢–£–° –ü–û–°–õ–ï –ó–ê–ì–†–£–ó–ö–ò –ú–û–î–ï–õ–ò
        if model:
            current_model_status = update_model_loading_status(model_choice)
        
        # üî¥ –ü–†–û–í–ï–†–ö–ê STOP_FLAG
        if stop_generation:
            gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status
        
        # üîä –ó–ê–ì–†–£–ó–ö–ê –í–û–ö–û–î–ï–†–ê (–û–¢–õ–û–ñ–ï–ù–ù–ê–Ø)
        progress(0.2, desc="üîä –ó–∞–≥—Ä—É–∑–∫–∞ –≤–æ–∫–æ–¥–µ—Ä–∞...")
        vocoder = ensure_vocoder()
        
        # üî¥ –ü–†–û–í–ï–†–ö–ê STOP_FLAG
        if stop_generation:
            gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status
            
    except Exception as e:
        error_msg = f"‚ùå –û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –º–æ–¥–µ–ª–µ–π: {str(e)}"
        gr.Warning(error_msg)
        print(f"Model loading error: {traceback.format_exc()}")
        return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status

    # üéØ –ü–†–û–í–ï–†–ö–ê –£–°–ü–ï–®–ù–û–ô –ó–ê–ì–†–£–ó–ö–ò
    if model is None or vocoder is None:
        gr.Warning("‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –º–æ–¥–µ–ª–∏")
        return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status

    device = torch.device("cuda")

    try:
        # --- 2. –ü–†–ï–î–û–ë–†–ê–ë–û–¢–ö–ê –ê–£–î–ò–û –ò –¢–ï–ö–°–¢–ê ---
        progress(0.3, desc="üîß –ü—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞ –∞—É–¥–∏–æ...")
        
        # üî¥ –ü–†–û–í–ï–†–ö–ê STOP_FLAG
        if stop_generation:
            gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status
        
        try:
            ref_audio_proc, processed_ref_text_final = preprocess_ref_audio_text(
                ref_audio,
                processed_ref_text,
                show_info=gr.Info
            )
        except Exception as e:
            error_msg = f"‚ùå –û—à–∏–±–∫–∞ –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∏: {str(e)}"
            gr.Warning(error_msg)
            traceback.print_exc()
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status
        
        # --- 3. –ì–ï–ù–ï–†–ê–¶–ò–Ø –ê–£–î–ò–û ---
        progress(0.5, desc="üéµ –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –∞—É–¥–∏–æ...")
        
        # üî¥ –ü–†–û–í–ï–†–ö–ê STOP_FLAG
        if stop_generation:
            gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status
        
        try:
            final_wave, final_sample_rate, combined_spectrogram = infer_process(
                ref_audio_proc,
                processed_ref_text_final,
                processed_gen_text,
                model,
                vocoder,
                cross_fade_duration=cross_fade_duration,
                nfe_step=nfe_step,
                speed=speed,
                sway_sampling_coef=sway_sampling_coef,
                cfg_strength=cfg_strength,
                show_info=gr.Info,
                progress=progress,
            )
        except Exception as e:
            error_msg = f"‚ùå –û—à–∏–±–∫–∞ –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏ –∞—É–¥–∏–æ: {str(e)}"
            gr.Warning(error_msg)
            traceback.print_exc()
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status
        
        # üî¥ –ü–†–û–í–ï–†–ö–ê STOP_FLAG
        if stop_generation:
            gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status

        # --- 4. –£–î–ê–õ–ï–ù–ò–ï –¢–ò–®–ò–ù–´ (–û–ü–¶–ò–û–ù–ê–õ–¨–ù–û) ---
        if remove_silence:
            progress(0.7, desc="üîá –£–¥–∞–ª–µ–Ω–∏–µ —Ç–∏—à–∏–Ω—ã...")
            
            # üî¥ –ü–†–û–í–ï–†–ö–ê STOP_FLAG
            if stop_generation:
                gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
                return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status
            
            try:
                with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
                    temp_path = f.name
                    sf.write(temp_path, final_wave, final_sample_rate)
                    remove_silence_for_generated_wav(temp_path)
                    final_wave_tensor, _ = torchaudio.load(temp_path)
                    final_wave = final_wave_tensor.squeeze().cpu().numpy()
            except Exception as e:
                print("‚ö†Ô∏è –£–¥–∞–ª–µ–Ω–∏–µ —Ç–∏—à–∏–Ω—ã –Ω–µ —É–¥–∞–ª–æ—Å—å:", e)
                # –ù–µ –ø—Ä–µ—Ä—ã–≤–∞–µ–º –≤—ã–ø–æ–ª–Ω–µ–Ω–∏–µ, –ø—Ä–æ–¥–æ–ª–∂–∞–µ–º —Å –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–º –∞—É–¥–∏–æ

        # --- 5. –≠–ö–°–ü–û–†–¢ –í –§–û–†–ú–ê–¢ ---
        progress(0.8, desc="üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞...")
        
        # üî¥ –ü–†–û–í–ï–†–ö–ê STOP_FLAG
        if stop_generation:
            gr.Warning("üõë –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∞")
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status

        # üéµ –ü–û–î–ì–û–¢–û–í–ö–ê –ê–£–î–ò–û –î–õ–Ø –≠–ö–°–ü–û–†–¢–ê
        safe_kwargs = {k: v for k, v in tempfile_kwargs.items() if k != 'delete'}

        with tempfile.NamedTemporaryFile(suffix=f".{audio_format}", delete=False, **safe_kwargs) as tmp:
            temp_output_path = tmp.name

        wave = final_wave
        if wave.ndim == 1:
            channels = 1
        else:
            channels = min(wave.shape[0], 2)
            wave = wave[:channels].T

        # üîß –ù–û–†–ú–ê–õ–ò–ó–ê–¶–ò–Ø –ê–£–î–ò–û
        if np.max(np.abs(wave)) == 0:
            wave_int16 = np.zeros(wave.shape, dtype=np.int16)
        else:
            wave_int16 = np.int16(wave / np.max(np.abs(wave)) * 32767)

        # üíæ –°–û–•–†–ê–ù–ï–ù–ò–ï –í –í–´–ë–†–ê–ù–ù–û–ú –§–û–†–ú–ê–¢–ï
        audio_segment = AudioSegment(
            wave_int16.tobytes(),
            frame_rate=final_sample_rate,
            sample_width=2,
            channels=channels
        )

        try:
            if audio_format == "mp3":
                audio_segment.export(temp_output_path, format="mp3", bitrate=bitrate)
            elif audio_format == "ogg":
                audio_segment.export(temp_output_path, format="ogg", bitrate=bitrate)
            elif audio_format == "flac":
                audio_segment.export(temp_output_path, format="flac")
            else:
                audio_segment.export(temp_output_path, format="wav")
        except Exception as e:
            error_msg = f"‚ùå –û—à–∏–±–∫–∞ —ç–∫—Å–ø–æ—Ä—Ç–∞ –∞—É–¥–∏–æ: {str(e)}"
            gr.Warning(error_msg)
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status

        # üìÅ –°–û–ó–î–ê–ù–ò–ï –§–ò–ù–ê–õ–¨–ù–û–ì–û –§–ê–ô–õ–ê
        timestamp = int(time.time())
        safe_text = f"audio_{timestamp}"
        final_output_path = os.path.join(os.path.dirname(temp_output_path), f"{safe_text}.{audio_format}")

        try:
            if os.path.exists(final_output_path):
                os.remove(final_output_path)
            os.rename(temp_output_path, final_output_path)
        except Exception as e:
            error_msg = f"‚ùå –û—à–∏–±–∫–∞ —Å–æ–∑–¥–∞–Ω–∏—è —Ñ–∞–π–ª–∞: {str(e)}"
            gr.Warning(error_msg)
            return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status

        # --- 6. –°–û–•–†–ê–ù–ï–ù–ò–ï –°–ü–ï–ö–¢–†–û–ì–†–ê–ú–ú–´ ---
        progress(0.9, desc="üìä –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Å–ø–µ–∫—Ç—Ä–æ–≥—Ä–∞–º–º—ã...")
        
        spectrogram_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
                spectrogram_path = tmp_spectrogram.name
                save_spectrogram(combined_spectrogram, spectrogram_path)
        except Exception as e:
            print("‚ö†Ô∏è –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Å–ø–µ–∫—Ç—Ä–æ–≥—Ä–∞–º–º—ã –Ω–µ —É–¥–∞–ª–æ—Å—å:", e)
            # –ù–µ –ø—Ä–µ—Ä—ã–≤–∞–µ–º –≤—ã–ø–æ–ª–Ω–µ–Ω–∏–µ, –ø—Ä–æ–¥–æ–ª–∂–∞–µ–º –±–µ–∑ —Å–ø–µ–∫—Ç—Ä–æ–≥—Ä–∞–º–º—ã

        progress(1.0, desc="‚úÖ –ì–æ—Ç–æ–≤–æ!")
        
        # üéâ –í–û–ó–í–†–ê–¢ –£–°–ü–ï–®–ù–û–ì–û –†–ï–ó–£–õ–¨–¢–ê–¢–ê
        return (
            final_output_path,
            spectrogram_path,
            processed_ref_text_final,
            processed_gen_text,
            gr.update(value=current_seed),  # üî¥ –í–û–ó–í–†–ê–©–ê–ï–ú –†–ï–ê–õ–¨–ù–´–ô –°–ò–î
            current_model_status  # üî¥ –í–û–ó–í–†–ê–©–ê–ï–ú –ê–ö–¢–£–ê–õ–¨–ù–´–ô –°–¢–ê–¢–£–° –ú–û–î–ï–õ–ò
        )

    except Exception as e:
        error_msg = f"‚ùå –ù–µ–æ–∂–∏–¥–∞–Ω–Ω–∞—è –æ—à–∏–±–∫–∞: {str(e)}"
        print(f"Unexpected error in synthesize: {traceback.format_exc()}")
        gr.Warning(error_msg)
        return None, None, processed_ref_text, processed_gen_text, gr.update(value=current_seed), current_model_status

    finally:
        # üßπ –û–ß–ò–°–¢–ö–ê –ü–ê–ú–Ø–¢–ò (–ï–°–õ–ò –ù–ï –ë–´–õ–û –û–°–¢–ê–ù–û–í–ö–ò)
        if not stop_generation:
            try:
                torch.cuda.empty_cache()
                gc.collect()
            except Exception as e:
                print("‚ö†Ô∏è –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ—á–∏—Å—Ç–∫–µ –ø–∞–º—è—Ç–∏:", e)

# Build the Gradio interface. Components are laid out in the order they are
# created inside this context, so statement order below is significant.
# The inline CSS defines the classes used by the status/error elements
# (e.g. "error-markdown" on the reference-audio warning, "model-status" banners).
with gr.Blocks(title="ESpeech-TTS", css="""
    .error-markdown {
        color: red !important;
        font-weight: bold;
        text-align: center;
        font-size: 18px !important;
        margin: 10px 0;
        animation: fadeIn 0.3s, fadeOut 0.3s 4.7s forwards;
    }
    .model-status {
        padding: 10px;
        border-radius: 5px;
        margin: 10px 0;
        text-align: center;
        font-weight: bold;
    }
    .model-loaded {
        background: #d4edda;
        color: #155724;
        border: 1px solid #c3e6cb;
    }
    .model-error {
        background: #f8d7da;
        color: #721c24;
        border: 1px solid #f5c6cb;
    }
    .model-loading {
        background: #fff3cd;
        color: #856404;
        border: 1px solid #ffeaa7;
    }
    .loading-status {
        padding: 10px;
        margin: 10px 0;
        border-radius: 5px;
        text-align: center;
    }
    .local-model-input {
        background: #f8f9fa;
        padding: 15px;
        border-radius: 8px;
        border: 1px solid #dee2e6;
        margin: 10px 0;
    }
    @keyframes fadeIn { from { opacity: 0; } to { opacity: 1; } }
    @keyframes fadeOut { from { opacity: 1; } to { opacity: 0; } }
    """) as app:
    # Page title and usage hints shown at the top of the page.
    gr.Markdown("# ESpeech-TTS")
    gr.Markdown("üí° **–°–æ–≤–µ—Ç:** –î–æ–±–∞–≤—å—Ç–µ —Å–∏–º–≤–æ–ª '+' –¥–ª—è —É–¥–∞—Ä–µ–Ω–∏—è (–Ω–∞–ø—Ä–∏–º–µ—Ä, '–ø—Ä–∏–≤+–µ—Ç')")
    gr.Markdown("üé≤ **–°–æ–≤–µ—Ç:** Seed –æ—Ç–≤–µ—á–∞–µ—Ç –∑–∞ –≤–∞—Ä–∏–∞—Ç–∏–≤–Ω–æ—Å—Ç—å –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏. –í—ã –º–æ–∂–µ—Ç–µ –ø–æ–¥–±–∏—Ä–∞—Ç—å –¥–ª—è —Å–µ–±—è —É–¥–∞—á–Ω–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ –∏–ª–∏ –∫–∞–∂–¥—ã–π —Ä–∞–∑ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å —Å–ª—É—á–∞–π–Ω—ã–π seed")
    gr.Markdown("üöÄ **CUDA Required:** This application requires GPU with CUDA support")

    # New section: registering a local (or hf://) model.
    # The three textboxes are passed to add_local_model by the button handler
    # wired further below.
    with gr.Accordion("üîß –î–æ–±–∞–≤–∏—Ç—å –ª–æ–∫–∞–ª—å–Ω—É—é –º–æ–¥–µ–ª—å", open=False):
        with gr.Row():
            local_model_path = gr.Textbox(
                label="–ü—É—Ç—å –∫ —Ñ–∞–π–ª—É –º–æ–¥–µ–ª–∏",
                placeholder="hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors –∏–ª–∏ /path/to/model.safetensors",
                scale=3
            )
            local_vocab_path = gr.Textbox(
                label="–ü—É—Ç—å –∫ —Ñ–∞–π–ª—É —Å–ª–æ–≤–∞—Ä—è", 
                placeholder="hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt –∏–ª–∏ /path/to/vocab.txt",
                scale=3
            )
            local_model_name = gr.Textbox(
                label="–ò–º—è –º–æ–¥–µ–ª–∏",
                placeholder="My Local Model",
                scale=2
            )
        with gr.Row():
            add_local_model_btn = gr.Button("‚ûï –î–æ–±–∞–≤–∏—Ç—å –ª–æ–∫–∞–ª—å–Ω—É—é –º–æ–¥–µ–ª—å", variant="secondary")
            # Hidden until a handler makes it visible.
            local_model_status = gr.HTML(visible=False)

    # --- Model status banner ---
    # update_model_loading_status() is evaluated once at build time; its result
    # is indexed by "value" and "visible" to seed the banner's initial state.
    initial_model = "ESpeech-TTS-1_SFT-95K"
    initial_status = update_model_loading_status(initial_model)

    model_status = gr.HTML(
        value=initial_status["value"],
        visible=initial_status["visible"]
    )

    with gr.Row():
        with gr.Column():
            # --- Model selection ---
            # Choices are the keys of MODELS_CONFIG as they exist when the UI is built.
            model_choice = gr.Dropdown(
                choices=list(MODELS_CONFIG.keys()),
                value=initial_model,
                label="–í—ã–±–µ—Ä–∏—Ç–µ –º–æ–¥–µ–ª—å",
                info="üåê HuggingFace –º–æ–¥–µ–ª–∏ –∏–ª–∏ üîß –ª–æ–∫–∞–ª—å–Ω—ã–µ –º–æ–¥–µ–ª–∏"
            )
            
            # --- Button that explicitly loads the selected model ---
            load_model_btn = gr.Button("üîÑ –ó–∞–≥—Ä—É–∑–∏—Ç—å –≤—ã–±—Ä–∞–Ω–Ω—É—é –º–æ–¥–µ–ª—å", variant="secondary")
            
        with gr.Column():
            # Reference audio is handed to handlers as a file path.
            ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
            ref_text_input = gr.Textbox(
                label="Reference Text",
                lines=2,
                placeholder="Text corresponding to reference audio"
            )
            # New: error text for the reference audio (hidden by default,
            # styled by the "error-markdown" CSS class above).
            ref_audio_warning = gr.Markdown("", visible=False, elem_classes="error-markdown")
        with gr.Column():
            gen_text_input = gr.Textbox(
                label="Text to Generate",
                lines=5,
                max_lines=20,
                placeholder="Enter text to synthesize..."
            )

    # --- Model loading handlers ---
    def on_model_select(model_name):
        """Return the status-banner update for the model chosen in the dropdown.

        A falsy selection produces a visible error banner; any other value is
        delegated to ``update_model_loading_status`` and its result is returned
        unchanged.
        """
        if not model_name:
            no_model_html = "<div class='model-status model-error'>‚ùå –ú–æ–¥–µ–ª—å –Ω–µ –≤—ã–±—Ä–∞–Ω–∞</div>"
            return gr.update(value=no_model_html, visible=True)
        return update_model_loading_status(model_name)

    # New handler: register a local model.
    # add_local_model receives (model path, vocab path, display name) and its
    # return values update the model dropdown and the local-model status HTML.
    add_local_model_btn.click(
        fn=add_local_model,
        inputs=[local_model_path, local_vocab_path, local_model_name],
        outputs=[model_choice, local_model_status]
    )

    # On dropdown change only the status banner is refreshed (status only;
    # the handler used here is on_model_select, not the loader).
    model_choice.change(
        fn=on_model_select,
        inputs=model_choice,
        outputs=model_status
    )

    # Button for explicitly (force) loading the selected model.
    load_model_btn.click(
        fn=load_model_with_status,
        inputs=[model_choice],
        outputs=model_status
    )

    # --- Hook up automatic recognition of the reference audio ---
    # NOTE(review): this handler is triggered by ref_audio_input and also
    # writes ref_audio_input back as an output — confirm this cannot
    # re-trigger the change event in a loop.
    ref_audio_input.change(
        fn=validate_and_transcribe_audio,
        inputs=ref_audio_input,
        outputs=[ref_audio_input, ref_text_input, ref_audio_warning]
    )

    # Text-normalization button; its click handler is attached further below.
    process_text_btn = gr.Button("‚úèÔ∏è –ù–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è —Ç–µ–∫—Å—Ç–∞", variant="secondary")

    # Advanced generation/export settings, collapsed by default.
    # Every component created here is passed to `synthesize` by the
    # generate-button wiring later in the file, so the variable names must stay.
    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            # Fix: show the real current seed as the initial value
            seed_input = gr.Number(label="Seed (random)", value=get_current_seed_display(), precision=0)
            remember_seed_checkbox = gr.Checkbox(label="üíæ –ó–∞–ø–æ–º–Ω–∏—Ç—å —ç—Ç–æ—Ç seed", value=False)
            remove_silence = gr.Checkbox(label="Remove Silences", value=False)
        with gr.Row():
            speed_slider = gr.Slider(label="Speed", minimum=0.3, maximum=2.0, value=1.0, step=0.1)
            nfe_slider = gr.Slider(label="NFE Steps", minimum=4, maximum=64, value=48, step=2)
        with gr.Row():
            cross_fade_slider = gr.Slider(label="Cross-Fade Duration (s)", minimum=0.0, maximum=1.0, value=0.15, step=0.01)
            sway_sampling_slider = gr.Slider(label="Sway Sampling Coef", minimum=-1, maximum=1, value=-1, step=0.1)
        with gr.Row():
            cfg_strength_slider = gr.Slider(label="CFG Strength", minimum=0.5, maximum=5.0, value=2.0, step=0.1)
        with gr.Row():
            audio_format = gr.Radio(["wav", "mp3", "ogg", "flac"], label="–§–æ—Ä–º–∞—Ç", value="wav")
            # `visible` is a static flag, not a callback: the previous lambda was
            # simply truthy, so the bitrate selector was shown even for "wav".
            # Start hidden to match the initial format above; the
            # audio_format.change handler toggles it for mp3/ogg afterwards.
            bitrate = gr.Radio(["128k", "192k", "320k"], label="–ë–∏—Ç—Ä–µ–π—Ç (mp3/ogg)", value="192k", visible=False)

    # --- Audio format change handler ---
    def update_bitrate_visibility(audio_format):
        """Return an update that shows the bitrate selector only for mp3/ogg."""
        needs_bitrate = audio_format in ["mp3", "ogg"]
        return gr.update(visible=needs_bitrate)

    # Toggle the bitrate selector whenever the output format changes.
    audio_format.change(
        update_bitrate_visibility,
        inputs=audio_format,
        outputs=bitrate
    )

    # --- Added: generate and stop buttons side by side ---
    with gr.Row():
        generate_btn = gr.Button("üé§ Generate Speech", variant="primary", size="lg")
        stop_btn = gr.Button("üõë Stop Generation", variant="stop", size="lg")

    # Result widgets; both receive file paths from synthesize.
    with gr.Row():
        audio_output = gr.Audio(label="üéß –ê—É–¥–∏–æ", type="filepath")
        spectrogram_output = gr.Image(label="Spectrogram", type="filepath")

    # Text normalization rewrites both text boxes in place.
    process_text_btn.click(
        process_texts_only,
        inputs=[ref_text_input, gen_text_input],
        outputs=[ref_text_input, gen_text_input]
    )

    # --- Added: stop handler ---
    # No outputs are updated by this handler.
    # NOTE(review): queue=False presumably lets the stop request bypass the
    # queue while a generation is running — confirm against Gradio docs.
    stop_btn.click(
        fn=stop_generation_process,
        outputs=None,
        queue=False
    )

    # Synthesis call, which also refreshes the model status banner.
    # The inputs list is passed positionally, so its order has to match the
    # parameter order of synthesize (defined earlier in the file).
    generate_btn.click(
        synthesize,
        inputs=[
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            remove_silence,
            seed_input,
            remember_seed_checkbox,
            model_choice,
            cross_fade_slider,
            nfe_slider,
            speed_slider,
            sway_sampling_slider,
            cfg_strength_slider,
            audio_format,
            bitrate,
        ],
        # Six outputs, matching the 6-tuple that synthesize returns.
        outputs=[
            audio_output, 
            spectrogram_output, 
            ref_text_input, 
            gen_text_input, 
            seed_input,
            model_status  # also update the model status banner
        ]
    )

# Script entry point: start the Gradio app only when run directly.
if __name__ == "__main__":
    # share=True asks Gradio for a shareable link in addition to the local server.
    app.launch(share=True)

Checking CUDA availability...
CUDA is available. Using device: Tesla T4
Preloading model...
CUDA is available. Using device: Tesla T4
Trying to download model file 'espeech_tts_rlv2.pt' and 'vocab.txt' from hub 'ESpeech/ESpeech-TTS-1_RL-V2'


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloaded model to /root/.cache/huggingface/hub/models--ESpeech--ESpeech-TTS-1_RL-V2/snapshots/f582b6e5897fe8a5835059405a8439d13bdf7684/espeech_tts_rlv2.pt
Downloaded vocab to /root/.cache/huggingface/hub/models--ESpeech--ESpeech-TTS-1_RL-V2/snapshots/f582b6e5897fe8a5835059405a8439d13bdf7684/vocab.txt
Loading model from: /root/.cache/huggingface/hub/models--ESpeech--ESpeech-TTS-1_RL-V2/snapshots/f582b6e5897fe8a5835059405a8439d13bdf7684/espeech_tts_rlv2.pt

vocab :  /root/.cache/huggingface/hub/models--ESpeech--ESpeech-TTS-1_RL-V2/snapshots/f582b6e5897fe8a5835059405a8439d13bdf7684/vocab.txt
token :  custom
model :  /root/.cache/huggingface/hub/models--ESpeech--ESpeech-TTS-1_RL-V2/snapshots/f582b6e5897fe8a5835059405a8439d13bdf7684/espeech_tts_rlv2.pt 

Model loaded and moved to CUDA: cuda
Model preloaded.
Loading RUAccent...
RUAccent loaded.
Preloading vocoder...
CUDA is available. Using device: Tesla T4
Loading vocoder...
Download Vocos from huggingface charactr/vocos-mel-24khz
Vocoder