## Пример запуска модели [XTTS](https://github.com/coqui-ai/TTS), обученной на транскрипции для русского языка


Установка XTTS

In [1]:
!pip install TTS==0.22.0



Скачиваем веса модели XTTS с [Hugging Face](https://huggingface.co/omogr/XTTS-ru-ipa).
Устанавливаем [транскриптор](https://github.com/omogr/omogre).

In [2]:
!mkdir model
!git clone https://huggingface.co/omogr/XTTS-ru-ipa model
!pip install git+https://github.com/omogr/omogre.git

Cloning into 'model'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 10 (delta 0), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (10/10), 196.71 KiB | 1.07 MiB/s, done.


In [14]:
import os
import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

from omogre import Transcriptor
import IPython.display as ipd

Скачиваем веса моделей транскриптора. Инициализируем XTTS и транскриптор.

In [22]:
# Local directory that holds the XTTS-ru-ipa files cloned from Hugging Face.
model_dir = 'model'

def clear_gpu_cache():
    """Empty the PyTorch CUDA cache; do nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Module-level handle to the XTTS model; stays None until load_model() assigns it.
XTTS_MODEL = None

def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
    """Load an XTTS model and publish it in the module-level ``XTTS_MODEL``.

    The model is built in a local variable and assigned to ``XTTS_MODEL`` only
    after the checkpoint has loaded, so a failed load leaves ``XTTS_MODEL`` as
    ``None`` instead of a partially initialised model.

    Args:
        xtts_checkpoint: Path of the checkpoint, passed as ``checkpoint_path``.
        xtts_config: Path of the JSON config read by ``XttsConfig.load_json``.
        xtts_vocab: Path of the vocabulary file, passed as ``vocab_path``.

    Returns:
        True once the model has been loaded.

    Raises:
        AssertionError: If any of the three paths is empty.
    """
    global XTTS_MODEL
    assert xtts_checkpoint and xtts_config and xtts_vocab, (
        "xtts_checkpoint, xtts_config and xtts_vocab must all be non-empty")

    # Drop any previously loaded model first so that its memory can be
    # reclaimed before the new one is created.
    XTTS_MODEL = None
    clear_gpu_cache()

    config = XttsConfig()
    config.load_json(xtts_config)
    model = Xtts.init_from_config(config)
    print("Loading XTTS model...")
    # NOTE(review): speaker_file_path='-' looks like a placeholder for a path
    # that does not exist, presumably so no speaker file is loaded - confirm
    # against the TTS version in use.
    model.load_checkpoint(config, checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab, use_deepspeed=False, speaker_file_path='-')
    if torch.cuda.is_available():
        model.cuda()

    # Publish only a fully loaded model.
    XTTS_MODEL = model
    print(" ... model loaded!")
    return True


def run_tts(tts_text, gpt_cond_latent, speaker_embedding):
    """Synthesize speech for ``tts_text`` with the global ``XTTS_MODEL``.

    The language is fixed to ``'ru'`` and the sampling settings are read from
    the model's config.

    Args:
        tts_text: Text passed to the model as ``text``.
        gpt_cond_latent: Conditioning latent forwarded to ``inference``.
        speaker_embedding: Speaker embedding forwarded to ``inference``.

    Returns:
        The ``"wav"`` entry of the inference result converted to a tensor,
        with a leading dimension added by ``unsqueeze(0)``.
    """
    model = XTTS_MODEL
    cfg = model.config
    sampling = {
        "temperature": cfg.temperature,
        "length_penalty": cfg.length_penalty,
        "repetition_penalty": cfg.repetition_penalty,
        "top_k": cfg.top_k,
        "top_p": cfg.top_p,
    }
    result = model.inference(
        text=tts_text,
        language='ru',
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        **sampling,
    )

    # Store the converted waveform back in the result and return it.
    result["wav"] = torch.tensor(result["wav"]).unsqueeze(0)
    return result["wav"]


class XttsInference:
    """Text-to-speech pipeline: transcribe the text, then synthesize it with XTTS.

    Construction creates the transcriptor, loads the XTTS model through
    ``load_model`` and computes conditioning latents from
    ``reference_audio.wav`` found in the model directory. Calling the instance
    turns source text into a ``(transcription, audio)`` pair.
    """

    def __init__(self, transcriptor_data_path='omogre_data',
                 xtts_model_path='model'):
        """Prepare the transcriptor, the XTTS model and the speaker latents.

        Args:
            transcriptor_data_path: Passed to ``Transcriptor`` as ``data_path``.
                Per the original note, the transcriptor data is downloaded
                into this directory.
            xtts_model_path: Directory expected to contain ``model.pth``,
                ``config.json``, ``vocab.json`` and ``reference_audio.wav``.

        Raises:
            FileNotFoundError: If any of the required model files is missing.
        """
        xtts_checkpoint = os.path.join(xtts_model_path, "model.pth")
        xtts_config = os.path.join(xtts_model_path, "config.json")
        xtts_vocab = os.path.join(xtts_model_path, "vocab.json")
        reference_audio = os.path.join(xtts_model_path, "reference_audio.wav")

        # Fail fast, before any download or model loading. A joined path is
        # never empty, so the files themselves must be checked; __init__ can
        # only signal failure by raising, not by returning a value.
        required = (xtts_checkpoint, xtts_config, xtts_vocab, reference_audio)
        missing = [path for path in required if not os.path.isfile(path)]
        if missing:
            raise FileNotFoundError(
                "Missing XTTS model file(s): " + ", ".join(missing))

        clear_gpu_cache()
        self.transcriptor = Transcriptor(data_path=transcriptor_data_path)
        load_model(xtts_checkpoint, xtts_config, xtts_vocab)

        # Latents computed once from the reference recording and reused for
        # every synthesis call.
        self.gpt_cond_latent, self.speaker_embedding = XTTS_MODEL.get_conditioning_latents(
            audio_path=reference_audio,
            gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
            max_ref_length=XTTS_MODEL.config.max_ref_len,
            sound_norm_refs=XTTS_MODEL.config.sound_norm_refs
        )

    def __call__(self, src_text):
        """Transcribe ``src_text`` and synthesize audio for the transcription.

        Args:
            src_text: Source text handed to the transcriptor as a one-item list.

        Returns:
            A ``(tts_text, audio)`` tuple: the transcriptor output joined with
            spaces, and the value returned by ``run_tts`` for that text.
        """
        tts_text = ' '.join(self.transcriptor([src_text]))
        audio = run_tts(tts_text, self.gpt_cond_latent, self.speaker_embedding)
        return tts_text, audio


# Build the pipeline from the configured model directory instead of relying on
# the constructor default, so changing `model_dir` actually takes effect.
xtts_inference = XttsInference(xtts_model_path=model_dir)


  state_dict = torch.load(weights_path, map_location='cpu')


Loading XTTS model...


  return torch.load(f, map_location=map_location, **kwargs)


 ... model loaded!


Пример порождения аудио для одной фразы

In [23]:
# Synthesize one sentence and print both the source text and the transcription.
src_text = 'МИД Турции официально заявил, что Турция заинтересована во вступлении в БРИКС.'
print('src_text', src_text)
tts_text, audio = xtts_inference(src_text)
print('Speech generated!', tts_text)

# Save the result to disk and embed a player in the cell output.
output_file = 'audio.wav'
sample_rate = 24000  # one rate for both the saved file and the player
torchaudio.save(output_file, audio, sample_rate)
playable_audio = audio.detach().to('cpu')
ipd.display(ipd.Audio(playable_audio, rate=sample_rate))


src_text МИД Турции официально заявил, что Турция заинтересована во вступлении в БРИКС.
Speech generated! mʲ`it t`urtsɨɪ ɐfʲɪtsɨ`alʲnə zəjɪvʲ`iɫ, ʂt`o t`urtsɨjə zəɪnʲtʲɪrʲɪs`ovənə v`o fstʊplʲ`enʲɪɪ v brʲ`iks.
