This is a noteboook used to generate the speaker embeddings with the GE2E speaker encoder model for multi-speaker training.

Before running this script please DON'T FORGET: 
- to set file paths.
- to download related model files from TTS.
- download or clone related repos, linked below.
- setup the repositories. ```python setup.py install```
- to checkout right commit versions (given next to the model) of TTS.
- to set the right paths in the cell below.

Repository:
- TTS: https://github.com/mozilla/TTS

In [1]:
%load_ext autoreload
%autoreload 2
import os
import importlib
import random
import librosa
import torch

import numpy as np
from tqdm import tqdm
from TTS.utils.speakers import save_speaker_mapping, load_speaker_mapping

# you may need to change this depending on your system
os.environ['CUDA_VISIBLE_DEVICES']='0'


from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config

You should also adjust all the path constants to point at the relevant locations for you locally

In [12]:
MODEL_RUN_PATH = "../../Mozilla-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/"
MODEL_PATH = MODEL_RUN_PATH + "best_model.pth.tar"
CONFIG_PATH = MODEL_RUN_PATH + "config.json"


DATASETS_NAME = ['brspeech'] # list the datasets
DATASETS_PATH = ['../../../datasets/BRSpeech-2.0-beta8']
DATASETS_METAFILE = ['TTS_metadata_brspeech2+cv_all_valited_lines.csv']

USE_CUDA = True

In [13]:
!ls -1 $MODEL_RUN_PATH

libritts_360-half-September-28-2019_10+46AM-8565c50


In [16]:
#Preprocess dataset
meta_data = []
for i in range(len(DATASETS_NAME)):
    preprocessor = importlib.import_module('TTS.datasets.preprocess')
    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())
    meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])
      
meta_data= list(meta_data)

In [20]:
c = load_config(CONFIG_PATH)
ap = AudioProcessor(**c['audio'])

model = SpeakerEncoder(**c.model)
model.load_state_dict(torch.load(MODEL_PATH)['model'])
model.eval()
if USE_CUDA:
    model.cuda()

embeddings_dict = {}
len_meta_data= len(meta_data)

for i in tqdm(range(len_meta_data)):
    _, wav_file, speaker_id = meta_data[i]
    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T
    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
    if USE_CUDA:
        mel_spec = mel_spec.cuda()
    embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)
    if speaker_id in embeddings_dict.keys():
        # append speaker embedding in speaker embedding list
        embeddings_dict[speaker_id].append(np.array(embedd))
    else:# if speaker dont have speaker embedding list
        embeddings_dict[speaker_id]= [np.array(embedd)] 
    #np.save(output_files[idx], embedd.detach().cpu().numpy())
    
for speaker_id in embeddings_dict.keys():
    embeddings_dict[speaker_id] = np.mean(np.array(embeddings_dict[speaker_id]), axis=0)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > num_mels:40
 | > min_level_db:-100
 | > frame_shift_ms:12.5
 | > frame_length_ms:50
 | > ref_level_db:20
 | > num_freq:1025
 | > power:None
 | > preemphasis:0.98
 | > griffin_lim_iters:None
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > sound_norm:False
 | > n_fft:2048
 | > hop_length:200
 | > win_length:800


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location='cpu' to map your storages to the CPU.

In [24]:
!nvidia-smi


CommandNotFoundError: No command 'conda activated'.
Did you mean 'conda activate'?



In [26]:
torch.cuda.is_available()



False

In [None]:
# create and export speakers.json
speaker_mapping = {name: {'id': i, 'embedding':embeddings_dict[name].reshape(-1).tolist()} for i, name in enumerate(embeddings_dict.keys())}
save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)


In [None]:
#test load integrity
speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)
assert speaker_mapping == speaker_mapping_load
print("The file speakers.json has been exported to ",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')