In [None]:
import numpy as np
import pyworld as pw
import librosa
import os
import re
import matplotlib.pyplot as plt

In [None]:
import pandas as pd

# Column labels passed to read_csv via `names=` for the correspondence table.
# NOTE(review): the meaning of 'un1'..'un4' is not visible in this notebook — confirm.
ch_columns = ["orig_name", "file_name", "style", "un1", "un2", "un3", 'un4']

# One row per clip: original recording name, clip file name and raw style code.
ch_df = pd.read_csv(
    "D:/Mestrado/CH Corpus/CH_Unicamp-20210309T013712Z-003/CH_Unicamp/base_recortes/recortes_silvano/correspondencias.csv",
    names=ch_columns,
    sep=';',
)

# Table later turned into the style-code -> emotion lookup.
emotions_corresp = pd.read_csv(
    "D:/Mestrado/CH Corpus/CH_Unicamp-20210309T013712Z-003/CH_Unicamp/base_recortes/recortes_silvano/emotions.csv",
    sep=';',
)

In [None]:
# Turn the emotions table into a lookup dict: dict() consumes every row of
# `.values` as one (key, value) pair, so emotions.csv must have exactly two columns.
# NOTE(review): presumably column 0 is the raw style code and column 1 the emotion name — confirm.
map_emotions = dict(emotions_corresp.values)

In [None]:
# Replace each raw style code by its entry in map_emotions; codes that are not
# keys of the dict become NaN. The column is overwritten in place, so re-running
# this cell maps the already translated values again — re-load ch_df before re-running.
ch_df['style'] = ch_df['style'].map(map_emotions)

In [None]:
# Flag every clip whose original file name contains the substring 'Neutra'
# (1 = flagged, 0 = not flagged) ...
is_neutral = [
    1 if re.search('Neutra', source_name) else 0
    for source_name in ch_df.orig_name
]

# ... and overwrite the style of the flagged rows with the label 'Neutral'.
ch_df['is_neutral'] = is_neutral
ch_df.loc[ch_df['is_neutral'] == 1, 'style'] = 'Neutral'

In [None]:
ch_df['style'].value_counts()

In [None]:
def get_pitch_range(audio_path, sr = None, pmin = 5, pmax = 95):
    '''
        Takes the audio (.wav) file path and returns its pitch range.

        The definition follows https://arxiv.org/pdf/2009.06775v1.pdf, evaluated here on
        the natural log of the pitch contour:

        pitch range = P<pmax>(log(pitch)) - P<pmin>(log(pitch))

        where pitch is the pyworld (dio + stonemask) f0 contour restricted to frames
        with f0 > 0.

        Args:
            audio_path: path handed to librosa.load.
            sr: sampling rate handed to librosa.load (None keeps the file's own rate).
            pmin: lower percentile, on np.percentile's 0-100 scale.
            pmax: upper percentile, on np.percentile's 0-100 scale.

        Returns:
            Difference between the upper and the lower percentile of log(f0).
    '''
    samples, fs = librosa.load(audio_path, sr=sr)
    samples = samples.astype(np.double)  # cast once; both pyworld calls receive doubles

    coarse_f0, frame_times = pw.dio(samples, fs)                    # raw pitch extractor
    refined_f0 = pw.stonemask(samples, coarse_f0, frame_times, fs)  # pitch refinement

    # Frames with f0 == 0 are dropped before the log is taken.
    log_f0 = np.log(refined_f0[refined_f0 > 0])
    lower_bound, upper_bound = np.percentile(log_f0, [pmin, pmax])

    return upper_bound - lower_bound

def get_logpitch_mean(audio_path, sr = None):
    '''
        Takes the audio (.wav) file path and returns the log pitch mean.

        Here, log pitch mean is defined as follows in https://arxiv.org/pdf/2009.06775v1.pdf

        log pitch mean = mean(log(pitch)), where pitch is the pyworld (dio + stonemask)
        f0 contour restricted to frames with f0 > 0.

        Args:
            audio_path: path handed to librosa.load.
            sr: sampling rate handed to librosa.load (None keeps the file's own rate).

        Returns:
            Arithmetic mean of the natural log of the positive f0 values. When no frame
            has f0 > 0, NumPy's mean of an empty array (nan) is returned.
    '''
    x, fs = librosa.load(audio_path, sr=sr)
    x = x.astype(np.double)              # cast once; both pyworld calls receive doubles
    _f0, t = pw.dio(x, fs)               # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)     # pitch refinement

    # The previous `.median()` call does not exist on ndarray (AttributeError) and
    # would not have matched the function's name anyway; the mean is what is wanted.
    logpitch_mean = np.log(f0[f0 > 0]).mean()

    return logpitch_mean

def get_logpitch_median(audio_path, sr = None):
    '''
        Takes the audio (.wav) file path and returns the log pitch median.

        log pitch median = median(log(pitch)), where pitch is the pyworld
        (dio + stonemask) f0 contour restricted to frames with f0 > 0. This is the
        median counterpart of the log pitch mean used in
        https://arxiv.org/pdf/2009.06775v1.pdf

        Args:
            audio_path: path handed to librosa.load.
            sr: sampling rate handed to librosa.load (None keeps the file's own rate).

        Returns:
            Median of the natural log of the positive f0 values.
    '''
    samples, fs = librosa.load(audio_path, sr=sr)
    samples = samples.astype(np.double)  # cast once; both pyworld calls receive doubles

    coarse_f0, frame_times = pw.dio(samples, fs)                    # raw pitch extractor
    refined_f0 = pw.stonemask(samples, coarse_f0, frame_times, fs)  # pitch refinement

    # Frames with f0 == 0 are dropped before the log is taken.
    positive_f0 = refined_f0[refined_f0 > 0]
    return np.median(np.log(positive_f0))

def get_pitch_p995(audio_path, sr = None):
    '''
        Takes the audio (.wav) file path and returns the 99.5th percentile of log pitch.

        p995 = P99.5(log(pitch)), where pitch is the pyworld (dio + stonemask) f0
        contour restricted to frames with f0 > 0.

        Args:
            audio_path: path handed to librosa.load.
            sr: sampling rate handed to librosa.load (None keeps the file's own rate).

        Returns:
            The 99.5th percentile of the natural log of the positive f0 values.
    '''
    x, fs = librosa.load(audio_path, sr=sr)
    x = x.astype(np.double)              # cast once; both pyworld calls receive doubles
    _f0, t = pw.dio(x, fs)               # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)     # pitch refinement

    # np.percentile takes q on a 0-100 scale: 99.5 is the intended upper percentile,
    # whereas .995 selects (almost) the minimum of the contour.
    pitch_p995 = np.percentile(np.log(f0[f0 > 0]), 99.5)

    return pitch_p995



def get_energy(audio_path, sr = None, top_level_db = 10, frame_length=1024, hop_length = 512):
    '''
        Takes the audio (.wav) file path and returns the mean speech energy.

        Here, speech energy is defined as follows in https://arxiv.org/pdf/2009.06775v1.pdf

        E = 20*log(mean(abs(x))), where x is audio amplitudes without silence

        Args:
            audio_path: path handed to librosa.load.
            sr: sampling rate handed to librosa.load (None keeps the file's own rate).
            top_level_db: `top_db` threshold handed to librosa.effects.split.
            frame_length: analysis frame length handed to librosa.effects.split.
            hop_length: hop length handed to librosa.effects.split.

        Returns:
            20 * natural log of the mean absolute amplitude of the non-silent samples.
            When librosa.effects.split reports no interval at all, the whole signal is used.

        NOTE(review): np.log is the natural logarithm; if the paper's formula is meant
        in dB (log10) the scale differs by a constant factor — confirm before comparing
        against externally computed statistics.
    '''

    x, fs = librosa.load(audio_path, sr=sr)

    # Getting the non silent partitions
    non_silent_partitions = librosa.effects.split(x, top_db=top_level_db, frame_length=frame_length, hop_length=hop_length)

    if len(non_silent_partitions) > 0:
        x_clean = np.concatenate([x[start:end] for start, end in non_silent_partitions])
    else:
        # Nothing was classified as non-silent: fall back to the unfiltered signal.
        x_clean = x

    # The energy must be measured on the silence-free signal (x_clean); it was
    # previously computed from the unfiltered x, which made the split above dead code.
    energy = 20*np.log(np.abs(x_clean).mean())

    return energy

def get_cpqd_lab_speaking_rate(audio_file, lab_path):
    '''
        Takes the audio (.wav) file path and lab (.lab) file path from CPqD environment.

        It counts the phones/duration. Which is a proxy for speaking rate.

        The phone count is read from the first line of the .lab file whose first 10
        characters contain "phones": everything after those 10 characters is split on
        whitespace after the '|' separators are removed.

        Args:
            audio_file: path handed to librosa.load (loaded at the file's own rate).
            lab_path: path of the latin-1 encoded .lab file.

        Returns:
            Number of phones divided by the audio duration in seconds, rounded to 3 decimals.

        Raises:
            ValueError: if the .lab file has no line carrying the "phones" tag.
    '''

    x , sr = librosa.load(audio_file, sr = None)
    duration_seconds = len(x)/sr

    phone_count = None
    with open(lab_path , 'r', encoding = 'latin-1') as f:
        for line in f:
            if "phones" in line[:10]:
                phone_count = len(line[10:].replace('|', '').split())
                break

    # Without this check a .lab file lacking the tag surfaced as an UnboundLocalError.
    if phone_count is None:
        raise ValueError('No "phones" line found in lab file: {}'.format(lab_path))

    return round(phone_count/duration_seconds, 3)

In [None]:
local_path = "D:/Mestrado/CH Corpus/CH_Unicamp-20210309T013712Z-003/CH_Unicamp/base_recortes/recortes_silvano/audios/"

In [None]:
# Pitch range of every clip listed in ch_df, in row order
# (clip path = local_path + file_name + '.wav').
pitchs = [
    get_pitch_range(local_path + clip_name + '.wav')
    for clip_name in ch_df['file_name'].values
]

ch_df['pitch_range'] = pitchs

In [None]:
# Central log-pitch value of every clip, in row order.
# NOTE: the values come from get_logpitch_median, so the column named
# 'pitch_logpitch_mean' actually stores the median of log(f0).
m_pitchs = [
    get_logpitch_median(local_path + clip_name + '.wav')
    for clip_name in ch_df['file_name'].values
]

ch_df['pitch_logpitch_mean'] = m_pitchs

In [None]:
# Upper log-pitch percentile (get_pitch_p995) of every clip, in row order.
p995 = [
    get_pitch_p995(local_path + clip_name + '.wav')
    for clip_name in ch_df['file_name'].values
]

ch_df['p995'] = p995

In [None]:
# Mean / min / max pitch range by emotion.
# The aggregations are requested by name ('mean', 'min', 'max'): passing the NumPy
# callables np.mean / np.min / np.max to groupby.agg is deprecated in recent pandas.
# A single named aggregation yields the columns: style, pitch_range (mean),
# pitch_min, pitch_max.
pitch_df = (
    ch_df.groupby('style')['pitch_range']
    .agg(pitch_range='mean', pitch_min='min', pitch_max='max')
    .reset_index()
)
pitch_df.sort_values(by='pitch_range')

In [None]:
# Mean / min / max of the per-clip 'pitch_logpitch_mean' column by emotion.
# The aggregations are requested by name ('mean', 'min', 'max'): passing the NumPy
# callables np.mean / np.min / np.max to groupby.agg is deprecated in recent pandas.
# Resulting columns: style, pitch_logpitch_mean (mean), pitch_min, pitch_max.
logpitch_df = (
    ch_df.groupby('style')['pitch_logpitch_mean']
    .agg(pitch_logpitch_mean='mean', pitch_min='min', pitch_max='max')
    .reset_index()
)
logpitch_df.sort_values(by='pitch_logpitch_mean')

In [None]:
ch_df

In [None]:
# Mean / min / max of the per-clip 'p995' column by emotion.
# This cell summarises 'p995', not energy, so the result and its columns are named
# after p995 (they used to be labelled energy_min / energy_max).
# The aggregations are requested by name: passing np.mean / np.min / np.max to
# groupby.agg is deprecated in recent pandas.
p995_df = (
    ch_df.groupby('style')['p995']
    .agg(p995='mean', p995_min='min', p995_max='max')
    .reset_index()
)
energy_df = p995_df  # previous variable name, kept bound to the same table
p995_df.sort_values(by='p995')

In [None]:
import os

In [None]:
os.listdir('../../../')

In [None]:
stats = pd.read_csv("../../../cpqd_aux_logpitchnorm/stats.csv")
stats

In [None]:
full_train = pd.read_csv("../../../cpqd_aux_logpitchnorm/full_df_pitch_norm_train.csv", sep='|', encoding='latin-1')
full_val = pd.read_csv("../../../cpqd_aux_logpitchnorm/full_df_pitch_norm_val.csv", sep='|', encoding='latin-1')

In [None]:
full_train.head()

In [None]:
# Mean pitch range per (speaker embedding id, target style) in the training split.
# 'mean' is passed by name: handing np.mean to groupby.agg is deprecated in recent pandas.
full_train.groupby(['emb_id','style_target']).agg({'pitch_range': 'mean'}).reset_index()

In [None]:
# Mean pitch range per (speaker embedding id, target style) in the validation split.
# 'mean' is passed by name: handing np.mean to groupby.agg is deprecated in recent pandas.
full_val.groupby(['emb_id','style_target']).agg({'pitch_range': 'mean'}).reset_index()

In [None]:
import seaborn as sns

In [None]:
# Distribution of pitch_range for emb_id 'adriana' with style_target 't_eps_animado_rf'.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 (histplot / displot are
# the replacements) — confirm the installed seaborn version before migrating.
sns.distplot(full_train[(full_train['emb_id']=='adriana')&(full_train['style_target']=='t_eps_animado_rf')]['pitch_range'])

In [None]:
# Distribution of pitch_range for emb_id 'adriana' with style_target 't_neutro'.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 (histplot / displot are
# the replacements) — confirm the installed seaborn version before migrating.
sns.distplot(full_train[(full_train['emb_id']=='adriana')&(full_train['style_target']=='t_neutro')]['pitch_range'])

In [None]:
import seaborn as sns

In [None]:
# No earlier cell adds an 'energy' column to ch_df, so on a fresh kernel the
# normalisation below failed with a KeyError. Compute it on demand with get_energy.
if 'energy' not in ch_df.columns:
    ch_df['energy'] = [
        get_energy(local_path + clip_name + '.wav')
        for clip_name in ch_df['file_name'].values
    ]

# z-score each feature using the mean / std of the whole table
ch_df['norm_pitch_range'] = (ch_df['pitch_range'] - ch_df['pitch_range'].mean())/ch_df['pitch_range'].std()
ch_df['norm_energy'] = (ch_df['energy'] - ch_df['energy'].mean())/ch_df['energy'].std()

In [None]:
# Scatter of normalised pitch range vs. normalised energy, one series per style in
# anal_styles. Series are drawn in the order the styles first appear in ch_df.
anal_styles = ['Happy-for','Fear']
plt.figure(figsize=(10,10))
styles_to_plot = [candidate for candidate in ch_df['style'].unique() if candidate in anal_styles]
for style in styles_to_plot:
    filt_df = ch_df[ch_df['style'] == style]
    plt.scatter(
        filt_df['norm_pitch_range'],
        filt_df['norm_energy'],
        alpha = 0.5,
        label = f'Emotion = {style}',
    )
plt.legend()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
lb = LabelEncoder()

ch_df['target'] = lb.fit_transform(ch_df['style'])

In [None]:
df_nonneutral = ch_df[ch_df['style'] != 'Neutral']

In [None]:
# Stratified 80/20 split of the non-neutral clips on the two normalised features.
nonneutral_features = df_nonneutral[['norm_pitch_range','norm_energy']]
nonneutral_target = df_nonneutral['target']

x_train, x_val, y_train, y_val = train_test_split(
    nonneutral_features,
    nonneutral_target,
    test_size = 0.2,
    stratify = nonneutral_target,
    random_state = 42,
)

In [None]:
lr = LogisticRegression(random_state = 42)
rf = RandomForestClassifier(n_estimators=50, random_state=42)

In [None]:
lr.fit(x_train, y_train)
rf.fit(x_train, y_train)

In [None]:
lr.score(x_val, y_val)

In [None]:
rf.score(x_val, y_val)

In [None]:
y_val

In [None]:
audio_path = local_path + 'silvano0002.wav'

In [None]:
lab_local_path = "D:/Mestrado/CH Corpus/CH_Unicamp-20210309T013712Z-003/CH_Unicamp/base_recortes/recortes_silvano/cpqd_transcripts/"
lab_path = lab_local_path + 'silvano0002.lab'

In [None]:
get_cpqd_lab_speaking_rate(audio_path, lab_path)

In [None]:
audio_path = local_path + 'silvano0001.wav'

In [None]:
get_energy(audio_path)

In [None]:
audio_path = 'D:/Mestrado/CH Corpus/CH_Unicamp-20210309T013712Z-003/CH_Unicamp/base_recortes/recortes_silvano/audios/silvano0001.wav'

In [None]:
get_pitch_range(audio_path)

In [None]:
# Load the waveform here: `x` was otherwise only assigned by a later cell, so this
# cell raised NameError when the notebook was run top to bottom on a fresh kernel.
x, fs = librosa.load(audio_path, sr=None)

# (start, end) sample intervals that librosa.effects.split reports as non-silent
non_silent_partitions = librosa.effects.split(x, top_db=10, frame_length=1024, hop_length=512)

In [None]:
non_silent_partitions

In [None]:
# Stitch the non-silent intervals of x back together into one array.
kept_segments = [x[start:end] for start, end in non_silent_partitions]
x_clean = np.concatenate(kept_segments) if kept_segments else np.array([])

In [None]:
# E = 20*log(mean(abs(x))) on the silence-free signal. The absolute value is
# required: the signed samples average out near zero (possibly negative), which
# makes the log meaningless (nan / -inf).
20*np.log(np.abs(x_clean).mean())

In [None]:
x[non_silent_partitions[0][0]:non_silent_partitions[0][1]]

In [None]:
plt.plot(x[non_silent_partitions[0][0]:non_silent_partitions[0][1]])

In [None]:
plt.plot(x)

In [None]:
plt.plot(x_clean)

In [None]:
x, fs = librosa.load('/content/drive/MyDrive/CH_Unicamp/base_recortes/recortes_silvano/audios/silvano0002.wav', sr=None)

In [None]:
_f0, t = pw.dio(x.astype(np.double), fs)    # raw pitch extractor
f0 = pw.stonemask(x.astype(np.double), _f0, t, fs)  # pitch refinement

In [None]:
import pandas as pd

In [None]:
base = pd.read_csv("./experiments/debug_gst_logits/debug_meta_prosodic.csv", delimiter='|', encoding='latin-1')

In [None]:
# Stamp the same constant prosodic features on every row of the debug metadata,
# then write the table back over the file it was read from ('|'-separated, latin-1).
base = base.assign(pitch_range=1, speaking_rate=0.22, energy=0.2)

base.to_csv(
    "./experiments/debug_gst_logits/debug_meta_prosodic.csv",
    sep='|',
    encoding='latin-1',
    index = False,
)

In [None]:
# Re-read the debug metadata, set pitch_range to the constant 1 and export a
# pitch-only variant that keeps just the columns listed below.
base = pd.read_csv(
    "./experiments/debug_gst_logits/debug_meta_prosodic.csv",
    delimiter='|',
    encoding='latin-1',
).assign(pitch_range=1)

onlypitch_columns = ['wav_file','text','speakers','styles','pitch_range']
base.loc[:, onlypitch_columns].to_csv(
    "./experiments/debug_gst_logits/debug_meta_prosodic_onlypitch.csv",
    sep='|',
    encoding='latin-1',
    index = False,
)

In [1]:
import argparse
import glob
import os
import sys
import time
import traceback

import numpy as np
import torch

from random import randrange
from torch.utils.data import DataLoader
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.layers.losses import TacotronLoss
from TTS.tts.utils.distribute import (DistributedSampler,
                                      apply_gradient_allreduce,
                                      init_distributed, reduce_tensor)
from TTS.tts.utils.generic_utils import setup_model, check_config_tts
from TTS.tts.utils.io import save_best_model, save_checkpoint
from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import (get_speakers, load_speaker_mapping,
                                    save_speaker_mapping)
from TTS.tts.utils.styles import (get_styles, load_style_mapping,
                                    save_style_mapping)
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.audio import AudioProcessor
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
                                     create_experiment_folder, get_git_branch,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import (NoamLR, adam_weight_decay, check_update,
                                gradual_training_scheduler, set_weight_decay,
                                setup_torch_training_env)

In [2]:
use_cuda, num_gpus = setup_torch_training_env(True, False)

def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None, style_mapping = None):
    '''
        Build the DataLoader for the training or the evaluation split.

        Reads the notebook-level globals `c` (config), `meta_data_train`,
        `meta_data_eval` and `num_gpus`.

        Args:
            ap: audio processor forwarded to MyDataset.
            r: first positional argument of MyDataset (callers pass model.decoder.r).
            is_val: True selects the evaluation metadata, batch size and worker count.
            verbose: forwarded to MyDataset.
            speaker_mapping: forwarded to MyDataset only when the config enables both
                speaker embeddings and the external speaker embedding file.
            style_mapping: accepted but not used by this function.

        Returns:
            A DataLoader, or None when is_val is True and c.run_eval is falsy.
    '''
    # Evaluation loader requested while evaluation is switched off: nothing to build.
    if is_val and not c.run_eval:
        return None

    # Everything that differs between the two splits is resolved up front.
    if is_val:
        meta_data = meta_data_eval
        batch_group_size = 0
        batch_size = c.eval_batch_size
        num_workers = c.num_val_loader_workers
    else:
        meta_data = meta_data_train
        batch_group_size = c.batch_group_size * c.batch_size
        batch_size = c.batch_size
        num_workers = c.num_loader_workers

    wants_external_speakers = c.use_speaker_embedding and c.use_external_speaker_embedding_file

    dataset = MyDataset(
        r,
        c.text_cleaner,
        compute_linear_spec=c.model.lower() == 'tacotron',
        meta_data=meta_data,
        ap=ap,
        tp=c.characters if 'characters' in c.keys() else None,
        batch_group_size=batch_group_size,
        min_seq_len=c.min_seq_len,
        max_seq_len=c.max_seq_len,
        phoneme_cache_path=c.phoneme_cache_path,
        use_phonemes=c.use_phonemes,
        phoneme_language=c.phoneme_language,
        enable_eos_bos=c.enable_eos_bos_chars,
        verbose=verbose,
        speaker_mapping=speaker_mapping if wants_external_speakers else None)

    # A distributed sampler is only used when more than one GPU is available.
    sampler = None
    if num_gpus > 1:
        sampler = DistributedSampler(dataset)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=sampler,
        num_workers=num_workers,
        pin_memory=False)

def format_data(data, speaker_mapping=None, style_mapping = None):
    if speaker_mapping is None and c.use_speaker_embedding and not c.use_external_speaker_embedding_file:
        speaker_mapping = load_speaker_mapping(OUT_PATH)
    if style_mapping is None and c.use_style_embedding:
        style_mapping = load_style_mapping(OUT_PATH)

    # setup input data
    text_input = data[0]
    text_lengths = data[1]
    speaker_names = data[2]
    linear_input = data[3] if c.model in ["Tacotron"] else None
    mel_input = data[4]
    mel_lengths = data[5]
    stop_targets = data[6]
    style_targets = data[10]
    pitch_range = data[11]
    speaking_rate = data[12]
    energy = data[13]

    avg_text_length = torch.mean(text_lengths.float())
    avg_spec_length = torch.mean(mel_lengths.float())

    if c.use_speaker_embedding:
        if c.use_external_speaker_embedding_file:
            speaker_embeddings = data[8]
            speaker_ids = None
        else:
            speaker_ids = [
                speaker_mapping[speaker_name] for speaker_name in speaker_names
            ]
            speaker_ids = torch.LongTensor(speaker_ids)
            speaker_embeddings = None
    else:
        speaker_embeddings = None
        speaker_ids = None

    if c.use_style_embedding:
        style_targets = [
                style_mapping[style_target] for style_target in style_targets
            ]
        if c.use_one_hot_style: # Style target will be a one hotted vector
            style_targets_ = np.zeros((len(style_targets), len(style_mapping)-1))
            for i in range(len(style_targets_)):
                if(style_targets[i] != 0): # If we force the 0 mapped style to be the non
                    style_targets_[i][style_targets[i]-1] = 1 # For each position we one hot encode it
            
            style_targets = style_targets_
            
            style_targets = torch.FloatTensor(style_targets)
            
            del style_targets_
        else: # Style target will be just the indice 
            style_targets = torch.LongTensor(style_targets) # To use in CrossEntropyLoss need to be LongTensor
    else:
        style_targets = None

    # Prosodic features          
    if pitch_range is not None:
        pitch_range = torch.LongTensor(pitch_range)
    
    if speaking_rate is not None:
        speaking_rate = torch.LongTensor(speaking_rate)

    if energy is not None:
        energy = torch.LongTensor(energy)


    # set stop targets view, we predict a single stop token per iteration.
    stop_targets = stop_targets.view(text_input.shape[0],
                                     stop_targets.size(1) // c.r, -1)
    stop_targets = (stop_targets.sum(2) >
                    0.0).unsqueeze(2).float().squeeze(2)

    # dispatch data to GPU
    if use_cuda:
        text_input = text_input.cuda(non_blocking=True)
        text_lengths = text_lengths.cuda(non_blocking=True)
        mel_input = mel_input.cuda(non_blocking=True)
        mel_lengths = mel_lengths.cuda(non_blocking=True)
        linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron"] else None
        stop_targets = stop_targets.cuda(non_blocking=True)
        if speaker_ids is not None:
            speaker_ids = speaker_ids.cuda(non_blocking=True)
        if speaker_embeddings is not None:
            speaker_embeddings = speaker_embeddings.cuda(non_blocking=True)
        if style_targets is not None:
            style_targets = style_targets.cuda(non_blocking=True)

        # Prosodic features          
        if pitch_range is not None:
            pitch_range = pitch_range.cuda(non_blocking=True)
        
        if speaking_rate is not None:
            speaking_rate = speaking_rate.cuda(non_blocking=True)

        if energy is not None:
            energy = energy.cuda(non_blocking=True)

    return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, \
        speaker_embeddings, avg_text_length, avg_spec_length, style_targets, pitch_range, speaking_rate, energy


 > Using CUDA:  True
 > Number of GPUs:  1


In [3]:
config_path = './experiments/debug_prosodic_linear/config.json'
experiment_folder = './experiments/debug_prosodic_features/'
c = load_config(config_path)

# c['use_prosodic_linear'] = False
# c['prosodic_dim'] = 64

In [4]:
OUT_PATH = experiment_folder

In [5]:
ap = AudioProcessor(**c.audio)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:
 | > hop_length:256
 | > win_length:1024


In [6]:
meta_data_train, meta_data_eval = load_meta_data(c.datasets)

 | > Found 10 files in D:\Mestrado\Emotion Audio Synthesis (TTS)\repo_final\pt_etts\experiments\debug_gst_logits


In [7]:
meta_data_train

[['hi',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ001-0001.wav',
  'rosana',
  'neutral',
  1.0],
 ['my name is lucas',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ001-0002.wav',
  'marco',
  'neutral',
  1.0],
 ['and yours?',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ001-0003.wav',
  'rosana',
  'neutral',
  1.0],
 ['my name is jessica',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ001-0004.wav',
  'marco',
  'neutral',
  1.0],
 ['how are you doing?',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ001-0005.wav',
  'rosana',
  'happy',
  1.0],
 ['fine and you?',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ001-0006.wav',
  'marco',
  'happy',
  1.0],
 ['the same',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ001-0007.wav',
  'rosana',
  'happy',
  1.0],
 ['wanna hangout some day?',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ001-0008.wav',
  'marco',
  'happy',
  1.0],
 ['yes of course',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ001-0009.wav',
  'rosana',
  'happy',
  1.0],
 ['ok, see ya',
  './data/LJSpeech/LJSpeech-1.1/wavs/LJ0

In [8]:
# Rebuild the symbol / phoneme tables when the config carries a custom character set;
# otherwise the `symbols` / `phonemes` imported at the top of the notebook are used.
if 'characters' in c.keys():
    symbols, phonemes = make_symbols(**c.characters)

# DISTRIBUTED
# NOTE(review): `args` is not defined anywhere in the visible notebook (argparse is
# imported but no parser is built), so this branch would raise NameError whenever
# num_gpus > 1 — confirm how rank / group_id should be provided here.
if num_gpus > 1:
    init_distributed(args.rank, num_gpus, args.group_id,
                     c.distributed["backend"], c.distributed["url"])
# Size of the input vocabulary: phoneme set or character set, depending on the config.
num_chars = len(phonemes) if c.use_phonemes else len(symbols)

In [9]:
restore_path = None # first run with None so the mapping JSON files get created
# restore_path = experiment_folder

# parse speakers
# Outcome: speaker_mapping, num_speakers and speaker_embedding_dim are always bound.
if c.use_speaker_embedding:
    speakers = get_speakers(meta_data_train)
    if restore_path:
        # The previous output folder is derived from the notebook-level restore_path.
        # (This used to read `args.restore_path`, but no `args` object exists in this
        # notebook, so the external-embedding restore path raised NameError.)
        prev_out_path = os.path.dirname(restore_path)
        speaker_mapping = load_speaker_mapping(prev_out_path)
        if c.use_external_speaker_embedding_file: # if restore checkpoint and use External Embedding file
            if not speaker_mapping:
                print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file")
                speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
                if not speaker_mapping:
                    raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file")
            # Embedding size is read from the first entry of the mapping.
            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
        else: # if restore checkpoint and don't use External Embedding file
            speaker_embedding_dim = None
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), "As of now you, you cannot " \
                                                "introduce new speakers to " \
                                                "a previously trained model."
    else: # if start new train: ids are assigned in the order returned by get_speakers
        speaker_mapping = {name: i for i, name in enumerate(speakers)}
        speaker_embedding_dim = None
    save_speaker_mapping(OUT_PATH, speaker_mapping)
    num_speakers = len(speaker_mapping)
    print("Training with {} speakers: {}".format(num_speakers,
                                                 ", ".join(speakers)))
else:
    num_speakers = 0
    speaker_embedding_dim = None
    speaker_mapping = None
    
    
# parse styles
# Logical `or` instead of bitwise `|`: the flags are plain config values, and `|`
# raises TypeError as soon as one of them is None rather than a bool.
if c.use_style_embedding or c.use_style_lookup:
    styles = get_styles(meta_data_train)
    if restore_path:
        # Restoring: reuse the mapping saved next to the checkpoint and refuse new styles.
        prev_out_path = os.path.dirname(restore_path)
        style_mapping = load_style_mapping(prev_out_path)
        style_embedding_dim = None
        assert all([style in style_mapping
                    for style in styles]), "As of now you, you cannot " \
                                            "introduce new styles to " \
                                            "a previously trained model."
    else: # if start new train: ids are assigned in the order returned by get_styles
        style_mapping = {name: i for i, name in enumerate(styles)}
        style_embedding_dim = None
    save_style_mapping(OUT_PATH, style_mapping)
    num_styles = len(style_mapping)
    print("Training with {} styles: {}".format(num_styles,
                                                 ", ".join(styles)))
else:
    # NOTE(review): hard-coded fallback style count — confirm it matches the model config.
    num_styles = 3
    style_embedding_dim = None
    style_mapping = None

model = setup_model(num_chars, num_speakers, num_styles, c, speaker_embedding_dim)

Training with 2 speakers: marco, rosana
 > Using model: Tacotron2
Training with 2 speakers and 3 styles
Use style target = True
Use semi supervised = False


In [10]:
# checkpoint = torch.load("./experiments/debug_style_lookup/best_model.pth.tar", map_location='cpu')

In [11]:
# model.load_state_dict(checkpoint['model'])

In [12]:
print(model)

Tacotron2(
  (speaker_embedding): Embedding(2, 64)
  (prosodic_linear): Linear(in_features=1, out_features=64, bias=False)
  (embedding): Embedding(217, 512, padding_idx=0)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): ConvBNBlock(
        (convolution1d): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        (batch_normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (activation): ReLU()
      )
      (1): ConvBNBlock(
        (convolution1d): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        (batch_normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (activation): ReLU()
      )
      (2): ConvBNBlock(
        (convolution1d): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        (batch_normalization): BatchNorm1d(5

In [13]:
# print(model.linear_style_target_layer)

In [14]:
print(model.decoder)

Decoder(
  (prenet): Prenet(
    (linear_layers): ModuleList(
      (0): Linear(
        (linear_layer): Linear(in_features=80, out_features=256, bias=False)
      )
      (1): Linear(
        (linear_layer): Linear(in_features=256, out_features=256, bias=False)
      )
    )
  )
  (attention_rnn): LSTMCell(896, 1024)
  (attention): OriginalAttention(
    (query_layer): Linear(
      (linear_layer): Linear(in_features=1024, out_features=128, bias=False)
    )
    (inputs_layer): Linear(
      (linear_layer): Linear(in_features=640, out_features=128, bias=False)
    )
    (v): Linear(
      (linear_layer): Linear(in_features=128, out_features=1, bias=True)
    )
    (location_layer): LocationLayer(
      (location_conv1d): Conv1d(2, 32, kernel_size=(31,), stride=(1,), padding=(15,), bias=False)
      (location_dense): Linear(
        (linear_layer): Linear(in_features=32, out_features=128, bias=False)
      )
    )
  )
  (decoder_rnn): LSTMCell(1664, 1024)
  (linear_projection): Linear(

In [15]:
epoch = 0
data_loader = setup_loader(ap, model.decoder.r, is_val=False,
                           verbose=(epoch == 0), speaker_mapping=speaker_mapping, style_mapping = style_mapping)


 > DataLoader initialization
 | > Use phonemes: False
 | > Number of instances : 10
 | > Max length sequence: 23
 | > Min length sequence: 2
 | > Avg length sequence: 13.1
 | > Num. instances discarded by max-min (max=153, min=6) seq limits: 1
 | > Batch group size: 128.


In [16]:

# Move the model to the GPU once, and only when CUDA is in use. It used to be moved
# unconditionally on every iteration, which fails on a CPU-only machine even though
# format_data keeps the batch on the CPU when use_cuda is False.
if use_cuda:
    model.cuda()

for data in data_loader:
    text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length, style_targets, \
    pitch_range, speaking_rate, energy = format_data(data, speaker_mapping, style_mapping)
    # forward pass model (same call for both decoder configurations)
    outputs = model(
        text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings,
        pitch_range = pitch_range, speaking_rate=speaking_rate, energy=energy, style_ids = style_targets)
    if c.bidirectional_decoder or c.double_decoder_consistency:
        # These configurations return two extra outputs for the backward decoder.
        decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward, logits = outputs
    else:
        decoder_output, postnet_output, alignments, stop_tokens, logits = outputs
        decoder_backward_output = None
        alignments_backward = None

entrou pitch


In [17]:
data

(tensor([[ 69,  47,  60,  60,  47, 101,  54,  47,  60,  53,  61,  67,  66, 101,
           65,  61,  59,  51, 101,  50,  47,  71, 100],
         [ 59,  71, 101,  60,  47,  59,  51, 101,  55,  65, 101,  56,  51,  65,
           65,  55,  49,  47,   0,   0,   0,   0,   0],
         [ 54,  61,  69, 101,  47,  64,  51, 101,  71,  61,  67, 101,  50,  61,
           55,  60,  53, 100,   0,   0,   0,   0,   0],
         [ 59,  71, 101,  60,  47,  59,  51, 101,  55,  65, 101,  58,  67,  49,
           47,  65,   0,   0,   0,   0,   0,   0,   0],
         [ 52,  55,  60,  51, 101,  47,  60,  50, 101,  71,  61,  67, 100,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0],
         [ 71,  51,  65, 101,  61,  52, 101,  49,  61,  67,  64,  65,  51,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0],
         [ 47,  60,  50, 101,  71,  61,  67,  64,  65, 100,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,   0],
         [ 61,  57,  95, 101,  65,  51,  51, 101

In [None]:
x = torch.rand((9,1))
y = torch.rand((9,1,5))

In [None]:
torch.cat((x.unsqueeze(1),y), -1).shape

In [None]:
# model.linear_style_target_layer

In [None]:
pitch_range

In [None]:
mel_input.shape

In [None]:
print(pitch_range)

In [None]:
input_mask, output_mask = model.compute_masks(text_lengths, mel_lengths)
# B x D_embed x T_in_max
embedded_inputs = model.embedding(text_input).transpose(1, 2)
# B x T_in_max x D_en
encoder_outputs = model.encoder(embedded_inputs, text_lengths)

In [None]:
enc_out, gst_out, logits = model.compute_gst(encoder_outputs, mel_input, None, True, style_targets,style_targets,style_targets)

In [None]:
enc_out.shape, encoder_outputs.shape, gst_out.shape

In [None]:
style_targets.shape

In [None]:
style_targets.unsqueeze(1).shape

In [None]:
model._concat_speaker_embedding(enc_out, style_targets.unsqueeze(1)).shape

In [None]:
c

In [None]:
gst_outputs, logits = model.gst_layer(mel_input, None)

In [None]:
gst_outputs.shape

In [None]:
pitch_range.shape

In [None]:
torch.cat((gst_outputs, pitch_range.unsqueeze(1).unsqueeze(1)), -1)

In [None]:
from torch import nn
l = nn.Linear(1, 16).cuda()
l(pitch_range.unsqueeze(1).float()).shape

In [None]:
pitch_range.unsqueeze(1).unsqueeze(1).shape

In [None]:
pitch_range.unsqueeze(1)[0][0] = 2

In [None]:
pitch_range

In [None]:
pf = torch.zeros((9,2))

In [None]:
pf[:,0] = pitch_range

In [None]:
pf

In [None]:
pitch_range.shape[0]

In [None]:
pf[:].cuda()

In [None]:
pf[:,0].shape

In [None]:
l(pf[:,0].unsqueeze(1).cuda()).shape

In [None]:
pitch_range.device

In [None]:
l(pf[:,0].unsqueeze(1))