# Style Transfer Inference Demo for Flowtron on Google Colab

Original code is by:

Rafael Valle, Kevin Shih, Ryan Prenger and Bryan Catanzaro | NVIDIA

The Google Colaboratory style transfer code was written by:

Bence Halpern | PhD Student | University of Amsterdam, TU Delft, Netherlands Cancer Institute

**E-mail about info and discussions:** b.m.halpern[atttt]uva.nl

## Intro
This notebook requires a GPU runtime to run. Please select the menu option "**Runtime**" -> "**Change runtime type**", select "**Hardware Accelerator**" -> "**GPU**" and click "**SAVE**"

## Model Description

The TTS used in this colab is Flowtron. The original paper is:

- VALLE, Rafael, et al. Flowtron: an Autoregressive Flow-based Generative Network for Text-to-Speech Synthesis. arXiv preprint arXiv:2005.05957, 2020.

The style transfer method used is the robust reference audio-based method to perform emotional style transfer. To my knowledge, this was first done in the Tacotron 2 GST by Kwon et al. We use this method with Flowtron to get emotional audio. More detail about the reference audio-based method:

- KWON, Ohsung, et al. An Effective Style Token Weight Control Technique for End-to-End Emotional Speech Synthesis. IEEE Signal Processing Letters, 2019, 26.9: 1383-1387.

## Dataset

The happy and sad reference emotional signals are from the RAVDESS dataset. 

Please cite their work if you use the emotional data in your work:
- Livingstone SR, Russo FA (2018) The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English. PLoS ONE 13(5): e0196391. https://doi.org/10.1371/journal.pone.0196391.




## Data preparation

For your custom style transfer, you need to provide your own audio files and file lists. The easiest way you can learn how to do this is by mimicking the examples below. Upload the audio files and your file lists to your Google Drive and set it to public access.

Check that the printed downloaded size agrees with the original size. If not, you might have made a mistake in the download link or you forgot to make it public.

Don't forget to downsample your audio files. You can use the bash script included in happy.zip for that. Flowtron expects audio at 22050 Hz with 16-bit depth.

In [1]:
#from unidecode import unidecode

import sys
print(sys.executable)
import os
print(os.listdir())
import matplotlib
matplotlib.use("Agg")
import matplotlib.pylab as plt

import os
import argparse
import json
import sys
import numpy as np
import torch

# Matplotlib might need to be downgraded?
import unidecode

from flowtron import Flowtron
from torch.utils.data import DataLoader
from data import Data, load_wav_to_torch
from train import update_params

sys.path.insert(0, "tacotron2")
sys.path.insert(0, "tacotron2/waveglow")
from glow import WaveGlow
from scipy.io.wavfile import write
from torch.nn import ReplicationPad1d, ReflectionPad1d
from glob import glob
from torch.distributions import Normal

import IPython
from data import DataCollate

print("done")

C:\Users\nicol\anaconda3\envs\psyche\python.exe
['.git', '.gitmodules', '.ipynb_checkpoints', 'apex', 'audio_processing.py', 'config.json', 'data', 'data.py', 'distributed.py', 'Dockerfile', 'filelists', 'flowtron.py', 'flowtron_logger.py', 'flowtron_plotting_utils.py', 'inference.py', 'inference_style_transfer.ipynb', 'jupyter_in_psyche.bat', 'LICENSE', 'models', 'radam.py', 'README.md', 'README_nico.txt', 'requirements.txt', 'results', 'Style_Transfer_for_Flowtron.ipynb', 'tacotron2', 'text', 'train.py', '__pycache__']
done


In [2]:
def infer(flowtron_path, waveglow_path, text, speaker_id, n_frames, sigma,
          seed, emotion, utterance=None):
    """Synthesize ``text`` with Flowtron using a style prior built from reference audio.

    The reference utterances listed in
    ``filelists/<emotion>_reference_audios.txt`` are run through Flowtron's
    forward pass to obtain latent residuals. Every residual is stretched and
    cropped to ``n_frames`` frames, the results are averaged over the
    references, and that average is used as the mean of the Gaussian from
    which the style latent is sampled. The sampled latent is decoded by
    Flowtron and the resulting mel-spectrogram is vocoded by WaveGlow.

    The function reads the module-level ``model_config`` and ``data_config``
    dictionaries, requires a CUDA device, prints progress information and
    saves one ``sid{speaker_id}_sigma{sigma}_attnlayer{k}.png`` figure per
    attention layer in the current working directory.

    Args:
        flowtron_path: Path to the Flowtron checkpoint (a dict with a
            ``'state_dict'`` entry).
        waveglow_path: Path to the WaveGlow checkpoint (a dict with a
            ``'model'`` entry).
        text: Text to synthesize; encoded with ``trainset.get_text``.
        speaker_id: Speaker identifier passed to ``trainset.get_speaker_id``.
        n_frames: Number of latent frames to sample for the synthesis.
        sigma: Standard deviation of the Gaussian the style latent is
            sampled from.
        seed: Seed for the CPU and CUDA random number generators.
        emotion: Name used to select the reference file list.
        utterance: Unused; kept for backward compatibility.

    Returns:
        NumPy array holding the first waveform of the WaveGlow output,
        scaled so that its largest absolute sample is 1.

    Raises:
        ValueError: If the reference file list contains no entries.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Load WaveGlow in half precision; the invertible convolutions are kept
    # in float32. NOTE: torch.load unpickles the file, so only load
    # checkpoints from a trusted source.
    waveglow = torch.load(waveglow_path)['model'].cuda().half()
    for k in waveglow.convinv:
        k.float()
    waveglow.eval()

    # Load Flowtron.
    model = Flowtron(**model_config).cuda()
    state_dict = torch.load(flowtron_path, map_location='cpu')['state_dict']
    model.load_state_dict(state_dict)
    model.eval()
    print("Loaded checkpoint '{}')" .format(flowtron_path))

    # Both datasets share every data_config option except the file lists.
    ignore_keys = ['training_files', 'validation_files']
    data_kwargs = {k: v for k, v in data_config.items()
                   if k not in ignore_keys}
    trainset = Data(data_config['training_files'], **data_kwargs)
    styleset = Data("filelists/" + str(emotion) + "_reference_audios.txt",
                    **data_kwargs)

    print(len(styleset))
    # Feeding the dataset in one batch: modify if you have a larger dataset.
    batch_size = len(styleset)
    if batch_size == 0:
        raise ValueError(
            "no reference audio listed for emotion {!r}".format(emotion))

    # Sampler is assumed None for Colab, because n_gpu=1.
    style_loader = DataLoader(styleset, num_workers=1, shuffle=False,
                              sampler=None, batch_size=batch_size,
                              pin_memory=False, collate_fn=DataCollate())

    # Inputs describing what to synthesize, with a leading batch dimension.
    synth_speaker_vecs = trainset.get_speaker_id(speaker_id).cuda()[None]
    text = trainset.get_text(text).cuda()[None]

    with torch.no_grad():
        # batch_size == len(styleset), so this loop body runs exactly once.
        for batch in style_loader:
            mel, speaker_vecs, utt_text, in_lens, out_lens, gate_target = batch

            mel, speaker_vecs, utt_text = mel.cuda(), speaker_vecs.cuda(), utt_text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(), gate_target.cuda()

            residual, _, _, _, _, _, _ = model.forward(mel, speaker_vecs, utt_text, in_lens, out_lens)
            # Put the batch dimension first so residual[i] is one reference.
            residual = residual.permute(1, 2, 0)

        # At this stage the latent vectors are zero-padded, which is not
        # appropriate: it violates the assumption of a Gaussian latent space
        # and leads to artefacts. Rebuild every reference from its valid
        # frames only, at exactly n_frames frames.
        residual_accumulator = torch.zeros(
            (residual.shape[0], residual.shape[1], n_frames),
            device=residual.device)

        for i in range(residual.shape[0]):
            current_len = int(out_lens[i])
            # Always fill the row. The repeat factor is 1 for references that
            # already have at least n_frames frames, so those are simply
            # cropped; leaving them as all-zero rows would bias the mean
            # below toward zero.
            num_tile = int(np.ceil(n_frames / current_len))
            stretched = torch.repeat_interleave(
                residual[i, :, :current_len], repeats=num_tile, dim=1)
            residual_accumulator[i, :, :] = stretched[:, :n_frames]

        # Average over the references and restore the batch dimension.
        residual_accumulator = torch.mean(residual_accumulator, dim=0)[None, :, :]

        average_over_time = False
        if not average_over_time:
            dist = Normal(residual_accumulator, sigma)
            z_style = dist.sample()
        else:
            print(residual_accumulator.shape)
            residual_accumulator = residual_accumulator.mean(dim=2)
            dist = Normal(residual_accumulator, sigma)
            z_style = dist.sample((n_frames,)).permute(1, 2, 0)

        mels, attentions = model.infer(z_style, synth_speaker_vecs, text)

    # Save the mel-spectrogram next to each attention map.
    for k, layer_attention in enumerate(attentions):
        attention = torch.cat(layer_attention).cpu().numpy()
        fig, axes = plt.subplots(1, 2, figsize=(16, 4))
        # 'lower' is the documented value for a bottom-left origin; the
        # former 'bottom' is rejected by current matplotlib releases.
        axes[0].imshow(mels[0].cpu().numpy(), origin='lower', aspect='auto')
        axes[1].imshow(attention[:, 0].transpose(), origin='lower', aspect='auto')
        fig.savefig('sid{}_sigma{}_attnlayer{}.png'.format(speaker_id, sigma, k))
        plt.close("all")

    audio = waveglow.infer(mels.half(), sigma=0.8).float()
    audio = audio.cpu().numpy()[0]
    # normalize audio for now
    audio = audio / np.abs(audio).max()
    return audio

In [None]:
# Load config.json and run one style-transfer synthesis.
# `data_config` and `model_config` are defined at module level on purpose:
# `infer` reads them as globals. Globals are never nice, but we use them anyway.
with open("config.json") as f:
  data = f.read()

# NOTE: `global` at module level has no effect; the names below are already
# module-level.
global config
config = json.loads(data)
# Command-line overrides are not used in the notebook:
#update_params(config, args.params)

data_config = config["data_config"]
global model_config
model_config = config["model_config"]

# Use cuDNN, but with its benchmark mode switched off.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False

# Emotion parameter; "happy" and "sad" are implemented. It selects the
# reference list filelists/<emotion>_reference_audios.txt used by `infer`.
emotion="happy"
#emotion="sad"

# DEFAULT SETUP: LJS
speaker_id=0

# LIBRITTS SETUP - UNCOMMENT
#data_config["training_files"] = "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt"
#model_config["n_speakers"] = 123
#speaker_id=40 


# There are some utterances that don't work, this one is tested, feel free to
# experiment, but don't ask why it doesn't work! 

# Run the synthesis with the settings above.
# NOTE(review): the braces in the text presumably mark phoneme symbols for the
# text front-end -- confirm against the `text` package.
audio = infer('models/flowtron_ljs.pt', 'models/waveglow_256channels_v4.pt',"{H} {E} {L} {L}", speaker_id=speaker_id,n_frames=400, sigma=0.01, seed=1,emotion=emotion)

# Plot the synthesized waveform inline.
%matplotlib inline
plt.plot(audio[:])

# Last expression of the cell, so the notebook renders it as an audio player.
IPython.display.Audio(audio[:],rate=22050)

In [None]:
# An Audio object is only rendered automatically when it is the last
# expression of a cell. A print follows here, so display it explicitly.
IPython.display.display(IPython.display.Audio(audio[:], rate=22050))
print("done")

In [None]:
# Bare expression: as the last line of the cell, the notebook echoes the
# array assigned to `audio` above.
audio