In [1]:
import sys
from matplotlib import rcParams 
rcParams["figure.figsize"] = (16,5)
sys.path.append('')

import librosa
import librosa.display

from TTS.tts.models.tacotron2 import Tacotron2 
from TTS.tts.utils import *
from TTS.tts.utils.generic_utils import setup_model
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.tts.utils.text import text_to_sequence, phoneme_to_sequence
from TTS.tts.utils.text.symbols import symbols, phonemes
import torch
from TTS.tts.utils.synthesis import synthesis

In [2]:
# Set constants

# MODEL_PATH = '/content/drive/MyDrive/Mestrado/TTS/GST_rosana_only/checkpoint_120000.pth.tar'
CONFIG_PATH = './experiments/debug_gst_logits/config.json'

# MODEL_PATH ='/content/drive/MyDrive/Mestrado/TTS/GST_3speaker_CPQD/best_model.pth.tar'
# CONFIG_PATH = '/content/drive/MyDrive/Mestrado/TTS/GST_3speaker_CPQD/config.json'

# Speaker / style counts handed to setup_model in this cell (named instead of bare literals)
NUM_SPEAKERS = 2
NUM_STYLES = 2

CONFIG = load_config(CONFIG_PATH)
# CONFIG['datasets'][0]['path'] = './'
# CONFIG['output_path'] = './'
# CONFIG['audio']['signal_norm'] = False
# CONFIG['audio']['stats_path'] = ''
# CONFIG['use_phonemes'] = False
# CONFIG['save_step'] = 500
use_cuda = True

# Build the audio processor from the `audio` section of the config
ap = AudioProcessor(**CONFIG.audio)

# Vocabulary size depends on whether the config selects phonemes or plain symbols
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)

# Instantiate the model; no checkpoint is loaded in this cell (MODEL_PATH is commented out)
model = setup_model(num_chars, NUM_SPEAKERS, NUM_STYLES, CONFIG, None)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:
 | > hop_length:256
 | > win_length:1024
 > Using model: Tacotron2
Training with 2 speakers and 2 styles
Use style target = True
Use semi supervised = True


In [3]:
import os
import pandas as pd

In [4]:
import argparse
import glob
import os
import sys
import time
import traceback

import numpy as np
import torch

from random import randrange
from torch.utils.data import DataLoader
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.layers.losses import TacotronLoss
from TTS.tts.utils.distribute import (DistributedSampler,
                                      apply_gradient_allreduce,
                                      init_distributed, reduce_tensor)
from TTS.tts.utils.generic_utils import setup_model, check_config_tts
from TTS.tts.utils.io import save_best_model, save_checkpoint
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import (get_speakers, load_speaker_mapping,
                                    save_speaker_mapping)
from TTS.tts.utils.styles import (get_styles, load_style_mapping,
                                    save_style_mapping)
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
                                     create_experiment_folder, get_git_branch,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import (NoamLR, adam_weight_decay, check_update,
                                gradual_training_scheduler, set_weight_decay,
                                setup_torch_training_env)

In [5]:
# Initialise the torch training environment. This rebinds `use_cuda` (set by hand in an
# earlier cell) and provides `num_gpus`, which setup_loader reads as a global.
# NOTE(review): the two positional flags are presumably (cudnn_enable, cudnn_benchmark) —
# confirm against TTS.utils.training.setup_torch_training_env.
use_cuda, num_gpus = setup_torch_training_env(True, False)

def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None, style_mapping = None):
    """Build the DataLoader for either the training or the evaluation split.

    Reads the notebook-level globals ``c`` (config), ``meta_data_train``,
    ``meta_data_eval`` and ``num_gpus``.

    Args:
        ap: Audio processor handed to the dataset.
        r: First positional argument of ``MyDataset`` (the notebook passes
            ``model.decoder.r``).
        is_val: Select the evaluation split together with its batch size and
            worker count.
        verbose: Forwarded to the dataset.
        speaker_mapping: Forwarded to the dataset only when both
            ``c.use_speaker_embedding`` and
            ``c.use_external_speaker_embedding_file`` are set.
        style_mapping: Accepted for call-site symmetry; not used here.

    Returns:
        A ``DataLoader``, or ``None`` when ``is_val`` is set while
        ``c.run_eval`` is off.
    """
    # Evaluation disabled in the config: there is nothing to load.
    if is_val and not c.run_eval:
        return None

    # Per-split settings, resolved once up front.
    if is_val:
        split_meta = meta_data_eval
        group_size = 0
        batch_size = c.eval_batch_size
        worker_count = c.num_val_loader_workers
    else:
        split_meta = meta_data_train
        group_size = c.batch_group_size * c.batch_size
        batch_size = c.batch_size
        worker_count = c.num_loader_workers

    external_speakers = c.use_speaker_embedding and c.use_external_speaker_embedding_file

    dataset = MyDataset(
        r,
        c.text_cleaner,
        compute_linear_spec=c.model.lower() == 'tacotron',
        meta_data=split_meta,
        ap=ap,
        tp=c.characters if 'characters' in c.keys() else None,
        batch_group_size=group_size,
        min_seq_len=c.min_seq_len,
        max_seq_len=c.max_seq_len,
        phoneme_cache_path=c.phoneme_cache_path,
        use_phonemes=c.use_phonemes,
        phoneme_language=c.phoneme_language,
        enable_eos_bos=c.enable_eos_bos_chars,
        verbose=verbose,
        speaker_mapping=speaker_mapping if external_speakers else None)

    # A distributed sampler is only attached on multi-GPU runs.
    sampler = None
    if num_gpus > 1:
        sampler = DistributedSampler(dataset)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=sampler,
        num_workers=worker_count,
        pin_memory=False)

def format_data(data, speaker_mapping=None, style_mapping = None):
    """Unpack one collated batch into the tensors the model and the loss consume.

    Relies on the notebook-level globals ``c`` (config), ``OUT_PATH`` and
    ``use_cuda``.

    Args:
        data: Indexable batch produced by the loader. Positions read here:
            0 text, 1 text lengths, 2 speaker names, 3 linear spectrogram
            (kept only when ``c.model`` is "Tacotron"), 4 mel, 5 mel lengths,
            6 stop targets, 8 external speaker embeddings, 10 style labels
            (keys of ``style_mapping``).
        speaker_mapping: Speaker name -> id. Loaded from ``OUT_PATH`` when
            omitted and internal speaker embeddings are enabled.
        style_mapping: Style label -> id. Loaded from ``OUT_PATH`` when
            omitted and style embeddings are enabled.

    Returns:
        Tuple ``(text_input, text_lengths, mel_input, mel_lengths,
        linear_input, stop_targets, speaker_ids, speaker_embeddings,
        avg_text_length, avg_spec_length, style_targets)``. Entries that do
        not apply to the current configuration are ``None``.
    """
    if speaker_mapping is None and c.use_speaker_embedding and not c.use_external_speaker_embedding_file:
        speaker_mapping = load_speaker_mapping(OUT_PATH)
    # The flag is spelled `use_style_embedding` everywhere else in this notebook;
    # the previous plural spelling failed as soon as no mapping was passed in.
    if style_mapping is None and c.use_style_embedding:
        style_mapping = load_style_mapping(OUT_PATH)

    # setup input data
    text_input = data[0]
    text_lengths = data[1]
    speaker_names = data[2]
    linear_input = data[3] if c.model in ["Tacotron"] else None
    mel_input = data[4]
    mel_lengths = data[5]
    stop_targets = data[6]
    style_targets = data[10]
    avg_text_length = torch.mean(text_lengths.float())
    avg_spec_length = torch.mean(mel_lengths.float())

    speaker_ids = None
    speaker_embeddings = None
    if c.use_speaker_embedding:
        if c.use_external_speaker_embedding_file:
            speaker_embeddings = data[8]
        else:
            speaker_ids = torch.LongTensor(
                [speaker_mapping[speaker_name] for speaker_name in speaker_names])

    if c.use_style_embedding:
        style_ids = [style_mapping[style_target] for style_target in style_targets]
        if c.use_one_hot_style:
            # One-hot rows; the style mapped to index 0 is deliberately left as
            # an all-zero row (it acts as the "no style" class).
            one_hot = torch.zeros(len(style_ids), len(style_mapping), dtype=torch.float)
            for row, style_id in enumerate(style_ids):
                if style_id != 0:
                    one_hot[row, style_id] = 1
            style_targets = one_hot
        else:
            # Plain class indices
            style_targets = torch.LongTensor(style_ids)
    else:
        style_targets = None

    # set stop targets view, we predict a single stop token per iteration:
    # group the frame-level targets in chunks of `c.r` and flag a chunk as
    # "stop" when any frame inside it is.
    stop_targets = stop_targets.view(text_input.shape[0],
                                     stop_targets.size(1) // c.r, -1)
    stop_targets = (stop_targets.sum(2) > 0.0).float()

    # dispatch data to GPU
    if use_cuda:
        text_input = text_input.cuda(non_blocking=True)
        text_lengths = text_lengths.cuda(non_blocking=True)
        mel_input = mel_input.cuda(non_blocking=True)
        mel_lengths = mel_lengths.cuda(non_blocking=True)
        linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron"] else None
        stop_targets = stop_targets.cuda(non_blocking=True)
        if speaker_ids is not None:
            speaker_ids = speaker_ids.cuda(non_blocking=True)
        if speaker_embeddings is not None:
            speaker_embeddings = speaker_embeddings.cuda(non_blocking=True)
        if style_targets is not None:
            style_targets = style_targets.cuda(non_blocking=True)

    return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length, style_targets


 > Using CUDA:  True
 > Number of GPUs:  1


In [6]:
# Experiment directory; later cells read and write the speaker/style mappings here.
experiment_folder = './experiments/debug_gst_logits/'

# Config file of that experiment (the same path as CONFIG_PATH in the constants cell),
# loaded into the short global `c` that setup_loader and format_data rely on.
config_path = './experiments/debug_gst_logits/config.json'
c = load_config(config_path)

In [7]:
# Directory used for the speaker/style mapping files: format_data loads them from here
# when no mapping is passed, and the parsing cell saves them here.
OUT_PATH = experiment_folder

In [8]:
# Audio processor built from the `audio` section of `c`; this rebinds the `ap`
# created earlier from CONFIG.
ap = AudioProcessor(**c.audio)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:
 | > hop_length:256
 | > win_length:1024


In [9]:
# Train / eval metadata for the datasets listed in the config; setup_loader reads
# both names as globals.
meta_data_train, meta_data_eval = load_meta_data(c.datasets)

 | > Found 10 files in D:\Mestrado\Emotion Audio Synthesis (TTS)\repo_final\pt_etts\experiments\debug_gst_logits


In [10]:
# Rebuild the symbol tables when the config defines its own character set
# (this rebinds the imported `symbols` / `phonemes`).
if 'characters' in c.keys():
    symbols, phonemes = make_symbols(**c.characters)

# DISTRIBUTED
if num_gpus > 1:
    # No cell shown in this notebook defines `args`; fail with an explicit message
    # instead of a bare NameError when the multi-GPU path is taken without it.
    if 'args' not in globals():
        raise RuntimeError(
            "Multi-GPU setup needs `args.rank` and `args.group_id`, "
            "but `args` is not defined in this notebook.")
    init_distributed(args.rank, num_gpus, args.group_id,
                     c.distributed["backend"], c.distributed["url"])

# Vocabulary size depends on whether the config selects phonemes or plain symbols
num_chars = len(phonemes) if c.use_phonemes else len(symbols)

In [11]:
# On the first run this stays None so the speaker/style mapping JSON files get created.
restore_path = None
# restore_path = experiment_folder

# parse speakers
if c.use_speaker_embedding:
    speakers = get_speakers(meta_data_train)
    if restore_path:
        # Mappings of a restored run are looked up next to the checkpoint.
        # (Previously the external-embedding branch read `args.restore_path`,
        # but this notebook only defines the local `restore_path`.)
        prev_out_path = os.path.dirname(restore_path)
        if c.use_external_speaker_embedding_file: # if restore checkpoint and use External Embedding file
            speaker_mapping = load_speaker_mapping(prev_out_path)
            if not speaker_mapping:
                print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file")
                speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
                if not speaker_mapping:
                    raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file")
            # Embedding size is read off the first entry of the mapping
            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
        else: # if restore checkpoint and don't use External Embedding file
            speaker_mapping = load_speaker_mapping(prev_out_path)
            speaker_embedding_dim = None
            assert all(speaker in speaker_mapping
                       for speaker in speakers), ("As of now you, you cannot "
                                                  "introduce new speakers to "
                                                  "a previously trained model.")
    else: # if start new train and don't use External Embedding file
        # NOTE(review): this branch is also taken for a fresh run with
        # use_external_speaker_embedding_file enabled — confirm that is intended.
        speaker_mapping = {name: i for i, name in enumerate(speakers)}
        speaker_embedding_dim = None
    save_speaker_mapping(OUT_PATH, speaker_mapping)
    num_speakers = len(speaker_mapping)
    print("Training with {} speakers: {}".format(num_speakers,
                                                 ", ".join(speakers)))
else:
    num_speakers = 0
    speaker_embedding_dim = None
    speaker_mapping = None


# parse styles
if c.use_style_embedding:
    styles = get_styles(meta_data_train)
    if restore_path:
        prev_out_path = os.path.dirname(restore_path)
        style_mapping = load_style_mapping(prev_out_path)
        style_embedding_dim = None
        assert all(style in style_mapping
                   for style in styles), ("As of now you, you cannot "
                                          "introduce new styles to "
                                          "a previously trained model.")
    else: # if start new train
        style_mapping = {name: i for i, name in enumerate(styles)}
        style_embedding_dim = None
    save_style_mapping(OUT_PATH, style_mapping)
    num_styles = len(style_mapping)
    print("Training with {} styles: {}".format(num_styles,
                                                 ", ".join(styles)))
else:
    num_styles = 0
    style_embedding_dim = None
    style_mapping = None

# Build the model with the speaker/style counts derived above
model = setup_model(num_chars, num_speakers,num_styles, c, speaker_embedding_dim)

Training with 2 speakers: marco, rosana
Training with 2 styles: happy, neutral
 > Using model: Tacotron2
Training with 2 speakers and 2 styles
Use style target = True
Use semi supervised = True


In [12]:
# Inspect the style label -> index mapping built in the previous cell
style_mapping

{'happy': 0, 'neutral': 1}

In [13]:
# Build the training loader; the dataset's verbose output is enabled only for epoch 0.
epoch = 0
data_loader = setup_loader(
    ap=ap,
    r=model.decoder.r,
    is_val=False,
    verbose=(epoch == 0),
    speaker_mapping=speaker_mapping,
    style_mapping=style_mapping,
)


 > DataLoader initialization
 | > Use phonemes: False
 | > Number of instances : 10
 | > Max length sequence: 23
 | > Min length sequence: 2
 | > Avg length sequence: 13.1
 | > Num. instances discarded by max-min (max=153, min=6) seq limits: 1
 | > Batch group size: 128.


In [14]:

# Run a single forward pass on the first batch only (the loop exits through `break`).
for data in data_loader:
    (text_input, text_lengths, mel_input, mel_lengths, linear_input,
     stop_targets, speaker_ids, speaker_embeddings, avg_text_length,
     avg_spec_length, style_targets) = format_data(data, speaker_mapping, style_mapping)

    # format_data only moves the batch to the GPU when `use_cuda` is set, so the
    # model follows the same flag instead of being moved unconditionally.
    if use_cuda:
        model.cuda()

    outputs = model(
        text_input, text_lengths, mel_input, mel_lengths,
        speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)

    if c.bidirectional_decoder or c.double_decoder_consistency:
        # These configurations return two extra outputs for the backward decoder.
        (decoder_output, postnet_output, alignments, stop_tokens,
         decoder_backward_output, alignments_backward, logits) = outputs
    else:
        decoder_output, postnet_output, alignments, stop_tokens, logits = outputs
        decoder_backward_output = None
        alignments_backward = None

    break

In [15]:
# Style targets of the first batch, as returned by format_data
style_targets

tensor([0, 1, 0, 1, 0, 0, 1, 0, 0], device='cuda:0')

In [16]:
# Style logits returned by the model for the first batch
logits

tensor([[[ 0.1603, -0.1403]],

        [[ 0.1808, -0.1226]],

        [[ 0.1722, -0.1222]],

        [[ 0.1613, -0.1395]],

        [[ 0.1957, -0.1344]],

        [[ 0.2262, -0.1109]],

        [[ 0.1780, -0.1305]],

        [[ 0.1151, -0.1461]],

        [[ 0.2864, -0.2490]]], device='cuda:0', grad_fn=<AddBackward0>)

In [19]:
from torch import nn

# Reproduce the style-classification part of the loss on the first batch.
gst_logits_target = style_targets
gst_logits = logits
loss = 0
return_dict = {}

# -100 matches none of the class indices in the targets, so no sample is masked.
# Switching this to a real class index (e.g. 0) excludes that class from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = -100)

if c.gst_style_loss:
    # The logits come out as (batch, 1, num_styles); flatten to (batch, num_styles)
    # explicitly. The former squeeze(0).squeeze(1) chain squeezed the batch axis,
    # which only gave the right shape by accident.
    flat_logits = gst_logits.reshape(-1, gst_logits.size(-1))
    gst_style_loss = criterion(flat_logits, gst_logits_target)
    loss += gst_style_loss
    return_dict['gst_logits_loss'] = gst_style_loss

In [18]:
return_dict  # loss with targets equal to 0 ignored (presumably a run of the loss cell with ignore_index=0 — confirm)

{'gst_logits_loss': tensor(0.8568, device='cuda:0', grad_fn=<NllLossBackward>)}

In [20]:
return_dict  # loss with ignore_index=-100, i.e. no target is ignored (no semi-supervised masking)

{'gst_logits_loss': tensor(0.6437, device='cuda:0', grad_fn=<NllLossBackward>)}