# This notebook is used to do custom voice TTS using [Coqui](https://github.com/coqui-ai/TTS) YourTTS

#### Download and install Coqui as per yourtts_readme.txt

Complete steps 1 and 2 of `yourtts_readme.txt` before running the cells below.

In [None]:
!pip install -q torchaudio ipywidgets

In [None]:
# Ask the installed `tts` command-line tool to list its models.
!tts --list_models

Get the YourTTS model and configs


In [None]:
# Download the YourTTS model archive (v0.6.1 release path) into the current
# directory. The zip is not unpacked by this cell.
!wget https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip

Follow Step 3 in the readme

Imports

In [None]:
import sys
TTS_PATH = "TTS/"

# add libraries into environment
sys.path.append(TTS_PATH) # set this if TTS is not installed globally

import os
import string
import time
import argparse
import json

import numpy as np
import IPython
from IPython.display import Audio

from glob import glob

import torch

from TTS.tts.utils.synthesis import synthesis

# This line was picked from https://colab.research.google.com/drive/1r0NDBxxW5RZjQ1Jy99XohnY6thYWNBCd?usp=sharing#scrollTo=2akFqoi7UiD4
# This import was not resolved
# from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.text.tokenizer import TTSTokenizer
# AudioProcessor is used below to build `ap` from the model's audio config.
# A plain import is enough: there is no alternate module path to fall back to,
# and a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
from TTS.utils.audio import AudioProcessor


from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *

Paths to the downloaded model and config files (in `yourtts_models/`)

In [None]:
# --- output location -------------------------------------------------------
# Synthesized audio is written here; the folder is created if it is missing.
OUT_PATH = 'yourtts_out/'
os.makedirs(OUT_PATH, exist_ok=True)

# --- YourTTS model files ---------------------------------------------------
CONFIG_PATH = './yourtts_models/config.json'
MODEL_PATH = './yourtts_models/model_file.pth'
# Speakers file handed to the model as its `d_vector_file`.
TTS_SPEAKERS = "./yourtts_models/speakers.json"

# --- speaker encoder (SE) files --------------------------------------------
CONFIG_SE_PATH = "./yourtts_models/config_se.json"
SE_MODEL_PATH = "./yourtts_models/model_se.pth"

print(CONFIG_PATH)

# Use the GPU only when torch reports that CUDA is available.
USE_CUDA = torch.cuda.is_available()

Restore model

In [None]:
# load the config, config path was defined above
# Parse the model config from CONFIG_PATH (set in the paths cell).
C = load_config(CONFIG_PATH)

# Audio processor built from the `audio` section of that config.
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# Override the model arguments with the local speaker / speaker-encoder files.
# This must happen before setup_model(C) reads the config.
for _arg_name, _arg_value in (
    ('d_vector_file', TTS_SPEAKERS),
    ('use_speaker_encoder_as_loss', False),
    ('speaker_encoder_config_path', CONFIG_SE_PATH),
    ('speaker_encoder_model_path', SE_MODEL_PATH),
):
    C.model_args[_arg_name] = _arg_value

model = setup_model(C)
# Left disabled: TTS_LANGUAGES is not defined in the cells visible here.
# model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# print(model.language_manager.num_languages, model.embedded_language_dim)
# print(model.emb_l)

# Read the checkpoint onto the CPU; it is moved to the GPU later if needed.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))



Important: strip the speaker-encoder weights from the checkpoint before loading it into the model

In [None]:

# remove speaker encoder
# Work on a copy of the checkpoint's state dict and drop every entry whose
# name mentions the speaker encoder.
model_weights = cp['model'].copy()
speaker_encoder_keys = [name for name in model_weights if "speaker_encoder" in name]
for name in speaker_encoder_keys:
    del model_weights[name]

model.load_state_dict(model_weights)

# Switch to evaluation mode and move to the GPU when CUDA is available.
model.eval()
if USE_CUDA:
    model = model.cuda()

# synthesize voice
# NOTE(review): the synthesis cell passes a literal for use_griffin_lim and
# does not read this variable.
use_griffin_lim = False

#### Speaker Encoder Setup

install helper libraries

In [None]:
# pydub is imported in the next cell; ffmpeg-normalize (pinned to 1.21.0) is the
# command-line tool this notebook later calls via `!` to normalize the wavs.
! pip install -q pydub ffmpeg-normalize==1.21.0

Imports

In [None]:
from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

load the Speaker Encoder

In [None]:
# Speaker manager wrapping the speaker-encoder checkpoint and its config;
# used below to compute an embedding from the reference clips.
SE_speaker_manager = SpeakerManager(
    encoder_model_path=SE_MODEL_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)

Define a helper function

In [None]:
def compute_spec(ref_file):
  """Return the spectrogram of an audio file as a float tensor.

  The file is loaded with librosa at ``ap.sample_rate``, converted with
  ``ap.spectrogram`` and wrapped in a ``torch.FloatTensor`` with one extra
  leading dimension added by ``unsqueeze(0)``.

  Args:
    ref_file: path of the audio file to load.

  Returns:
    The spectrogram tensor with a leading dimension of size 1.
  """
  # librosa also returns the sample rate; it is not needed here.
  waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
  return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)

#### Voice Conversion

##### Upload, normalize and resample your wav files

Upload wav files - I have saved the wavs in LibriSpeech format (directory `MyTTSDataset/en`); note that the cell below reads the reference clips from `./MyTTSDataset/wavs/`.

In [None]:
WAVS_PATH = "./MyTTSDataset/wavs/"
reference_files = os.listdir(WAVS_PATH)
new_list = [ WAVS_PATH+s for s in reference_files]

#print(new_list)
for sample in new_list:
    !ffmpeg-normalize $sample -nt rms -t=-27 -o $sample -ar 16000 -f
    

Compute Embedding

In [None]:
# Embed the reference clips with the speaker encoder; the result is later
# passed to synthesis() as `d_vector`.
# NOTE(review): given a list of paths this presumably returns one combined
# embedding — confirm against SpeakerManager.compute_embedding_from_clip.
reference_emb = SE_speaker_manager.compute_embedding_from_clip(new_list)

In [None]:
# Inspect the computed embedding (this prints the entire value).
print(reference_emb)

define inference variables

In [None]:
# Sentence to be spoken in the cloned voice.
text = "It took me quite a long time to develop a voice and now that I have it I am not going to be silent."

# Noise variance applied to the random z vector at inference.
model.inference_noise_scale = 0.3
# Noise variance applied to the duration predictor's z vector at inference.
model.inference_noise_scale_dp = 0.3
# Scale factor for the duration predictor: larger values give slower speech.
model.length_scale = 1

In [None]:
# Display the language ids held by the model's language manager so that a
# language_id can be chosen in the next cell.
model.language_manager.ids

In [None]:
# Id passed to synthesis() as `language_id`.
# NOTE(review): 0 is English according to the original note — verify it
# against model.language_manager.ids for the loaded model.
language_id = 0 # english

#### Synthesis

In [None]:
from typing import Any


print(" > text: {}".format(text))

# synthesis() returns a dict. Read the entries by key instead of unpacking
# `.values()` positionally: positional unpacking breaks as soon as the dict
# gains, loses or reorders a key, and it also rebinds IPython's `_`.
synthesis_outputs = synthesis(
    model,
    text,
    C,
    use_cuda=USE_CUDA,
    speaker_id=None,
    style_wav=None,
    use_griffin_lim=True,
    do_trim_silence=False,
    d_vector=reference_emb,   # embedding of the reference voice
    language_id=language_id,
)
wav = synthesis_outputs["wav"]
alignment = synthesis_outputs["alignments"]

# Play the result inline at the model's sample rate.
print("Generated Audio")
IPython.display.display(Audio(wav, rate=ap.sample_rate))

# Save the waveform under OUT_PATH.
file_name = "text1.wav"
out_path = os.path.join(OUT_PATH, file_name)
print(" > Saving output to {}".format(out_path))
ap.save_wav(wav, out_path)

In [None]:
# This did not work - stores an empty file
# 
# emb_file = "emb_file.json"
# print(SE_speaker_manager.get_speakers())
# SE_speaker_manager.save_embeddings_to_file(os.path.join(OUT_PATH, emb_file))