#### Finetune YourTTS model for my voice dataset

In this notebook I fine-tune the YourTTS model on a small dataset of my own voice.

##### Setup

* Run `pip install -e .[all] tts==0.7.1` to get the correct version of TTS.

In [None]:
!pip install -q torchaudio ipywidgets

In [None]:
import os
import torch
from tqdm import tqdm

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsArgs
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.utils.managers import save_file

In [None]:
!wget https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip

Create a directory named `YourTTS_models` and extract the contents of the zip file downloaded above into it.

In [None]:
# Where training artefacts are written, and where the unzipped YourTTS
# reference files are read from.
OUT_PATH = 'TTS_train/'
REF_PATH = './YourTTS_models/'

# Make sure the output directory is there (no error if it already exists).
os.makedirs(OUT_PATH, exist_ok=True)

# Pretrained model files, all located directly inside REF_PATH.
MODEL_PATH = f"{REF_PATH}model_file.pth"
CONFIG_PATH = f"{REF_PATH}config.json"
TTS_LANGUAGES = f"{REF_PATH}language_ids.json"
TTS_SPEAKERS = f"{REF_PATH}speakers.json"

# Speaker encoder (SE) checkpoint and its config.
SE_MODEL_PATH = f"{REF_PATH}model_se.pth"
CONFIG_SE_PATH = f"{REF_PATH}config_se.json"

print(CONFIG_PATH)

# True when a CUDA device is available to torch.
USE_CUDA = torch.cuda.is_available()

##### Compute Embeddings from new custom dataset

* Modify config_se.json to match the new dataset

The format is:
```
    /MyTTSDataset
    | /en  
        | -> metadata.txt
        | -> /wavs
            | -> audio1.wav
            | -> audio2.wav
            | ...
```

In [None]:
from TTS.tts.datasets import load_tts_samples
# custom formatter implementation
def customFormatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
    """Build sample records from a pipe-separated metadata file.

    Each non-blank line is expected to look like ``<filename>|<transcription>``,
    where ``<filename>`` is the stem of a clip stored as
    ``<root_path>/wavs/<filename>.wav``.

    Args:
        root_path: Dataset directory holding the manifest and the ``wavs`` folder.
        manifest_file: Manifest file name, relative to ``root_path``.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        A list of dicts with the keys ``text``, ``audio_file``,
        ``speaker_name`` and ``audio_unique_name``, one per non-blank line.

    Raises:
        IndexError: If a non-blank line contains no ``|`` separator.
    """
    txt_file = os.path.join(root_path, manifest_file)
    speaker_name = "shiva"
    items = []
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            # Remove the line terminator so it does not end up inside the
            # transcription, and skip blank lines (e.g. an empty last line).
            line = line.strip()
            if not line:
                continue
            cols = line.split("|")
            file_stem = cols[0].strip()
            items.append(
                {
                    "text": cols[1].strip(),
                    "audio_file": os.path.join(root_path, "wavs", file_stem + ".wav"),
                    "speaker_name": speaker_name,
                    "audio_unique_name": file_stem,
                }
            )
    return items


Most of the code below is taken from compute_embeddings.py

In [None]:
# Describe the custom dataset: an English, single-speaker corpus located under
# the training directory and parsed by customFormatter.
output_path = "TTS_train"

dataset_config = BaseDatasetConfig()
dataset_config.path = os.path.join(output_path, "Shiva-1.0/en")
dataset_config.meta_file_train = "metadata.txt"
dataset_config.formatter = customFormatter
dataset_config.dataset_name = "shiva"
dataset_config.speaker_name = "shiva"
dataset_config.language = "en"

# Read the manifest and split it, holding out eval_split_size (0.1) for evaluation.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_size=0.1,
    formatter=customFormatter,
)



In [None]:
# Pool the train and eval splits into a single list of samples.
samples = [*train_samples, *eval_samples]
print(samples)

# Speaker manager backed by the speaker-encoder checkpoint; no precomputed
# d-vector file is supplied.
encoder_manager = SpeakerManager(
    encoder_model_path=SE_MODEL_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    d_vectors_file_path=None,
    use_cuda=USE_CUDA,
)

In [None]:

# Show the class_name_key value stored in the loaded encoder config.
encoder_cfg = encoder_manager.encoder_config
print(encoder_cfg.class_name_key)

In [None]:

class_name_key = encoder_manager.encoder_config.class_name_key
print(class_name_key)

# Key under which the computed embedding is stored in the mapping file.
embedding_key = "shiva_en.wav"

# Collect the path of every clip in the pooled sample list.
new_list = [fields["audio_file"] for fields in samples]
if not new_list:
    raise ValueError("No audio files found in `samples`; cannot compute a speaker embedding.")

# Extract one embedding from the full list of audio files.
# NOTE(review): presumably the encoder averages over the clips when given a
# list - confirm against SpeakerManager.compute_embedding_from_clip.
embedd = encoder_manager.compute_embedding_from_clip(new_list)

# NOTE(review): the trailing "\n" in the name is kept exactly as before;
# presumably it mirrors the naming used in the pretrained speakers.json - confirm.
speaker_mapping = {
    embedding_key: {
        "name": dataset_config.dataset_name + "-en-1\n",
        "embedding": embedd,
    }
}

# Resolve the target path unconditionally and make sure its directory exists.
# Previously mapping_file_path was only assigned when REF_PATH already existed,
# so a missing directory surfaced as a NameError at save time.
mapping_file_path = os.path.join(REF_PATH, "speakers_mod.json")
os.makedirs(REF_PATH, exist_ok=True)
save_file(speaker_mapping, mapping_file_path)
print("Speaker embeddings saved at:", mapping_file_path)

```
!!! Point `d_vector_file` in config.json to the newly written speakers_mod.json
```

##### Prepare the config

In [None]:
# load the config, config path was defined above
C = load_config(CONFIG_PATH)


# load the audio processor
ap = AudioProcessor(**C.audio)

speaker_embedding = None

C.datasets = dataset_config
C.model_args['d_vector_file'] = os.path.join(REF_PATH, "speakers_mod.json")
C.model_args['use_speaker_encoder_as_loss'] = False
C.model_args['speaker_encoder_config_path'] = CONFIG_SE_PATH
C.model_args['speaker_encoder_model_path'] = SE_MODEL_PATH

C.save_json(os.path.join(REF_PATH, "config_mod.json"))

##### Train: Get set go

```
model.eval() does not work

```

In [None]:

# Build the model described by the config and read the pretrained checkpoint
# onto the CPU.
model = setup_model(C)
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

# Work on a copy of the checkpoint weights and discard every entry whose key
# mentions the speaker encoder before loading them into the model.
model_weights = cp['model'].copy()
speaker_encoder_keys = [key for key in model_weights if "speaker_encoder" in key]
for key in speaker_encoder_keys:
    del model_weights[key]
model.load_state_dict(model_weights)

# set languages
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# Switch to eval mode first, then move the model to the GPU when one is available.
model.eval()
if USE_CUDA:
    model = model.cuda()

# synthesize voice
use_griffin_lim = False

```
This system command does not go through

```

In [None]:
# Join directory and file name as separate components so the result no longer
# depends on REF_PATH ending with a path separator.
NEW_CONFIG_PATH = os.path.join(REF_PATH, "config_mod.json")
command = f"python ./TTS/TTS/bin/train_tts.py --config_path {NEW_CONFIG_PATH} --restore_path {MODEL_PATH}"

# os.system reports failure only through its return value, so check it
# explicitly instead of letting a failed training run pass silently.
exit_status = os.system(command)
if exit_status != 0:
    raise RuntimeError(f"Training command failed with exit status {exit_status}: {command}")

##### Tensorboard 

In [None]:
!pip install tensorboard
!tensorboard --logdir=TTS_train/<run_dir>

##### Test the model

In [None]:
import glob, os
output_path = "TTS_train"
# Build the paths with os.path.join: plain string concatenation dropped the
# separator and produced "TTS_trainbest_model.pth" / "TTS_trainconfig.json".
# NOTE(review): if training writes into a per-run subdirectory, point
# output_path at that run folder - confirm against the trainer output.
ckpt = os.path.join(output_path, "best_model.pth")
config = os.path.join(output_path, "config.json")

In [None]:
!tts --text "Is it recognize speech or wreck a nice beach?" \
      --model_path $ckpt \
      --config_path $config \
      --out_path test1.wav

##### Listen to the synthesized speech

In [None]:
# Render an inline audio player for the synthesized file.
from IPython.display import Audio
Audio("test1.wav")