In [1]:
# Move the working directory up one level so the relative paths used further
# down (config/, models/, notebooks/) and the project-local imports resolve.
# NOTE: not idempotent -- re-running this cell moves up one more directory.
%cd ..

/home/utanko/ondewo/ondewo-t2s


In [2]:
from inference.nemo_modules.inference_data_layer import InferenceDataLayer
import nemo
import nemo.collections.asr as nemo_asr
import nemo.collections.tts as nemo_tts
import wave
import numpy as np
from scipy.io.wavfile import write
import IPython
from pathlib import Path
from ruamel.yaml import YAML

from normalization.postprocessor import Postprocessor

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

[NeMo W 2020-09-29 16:44:14 audio_preprocessing:56] Could not import torchaudio. Some features might not work.
[NeMo W 2020-09-29 16:44:14 audio_preprocessing:61] Unable to import APEX. Mixed precision and distributed training will not work.
[NeMo W 2020-09-29 16:44:14 transformer_modules:36] Unable to import FusedLayerNorm  from APEX. Using regular LayerNorm instead.




In [3]:
from inference.nemo_synthesizer import NemoSynthesizer

# Read the T2S configuration with ruamel's "safe" loader.
yaml = YAML(typ="safe")
config_path = Path("config", "ondewo_t2s_config.yaml")
with config_path.open() as config_file:
    config = yaml.load(config_file)

# Build the synthesizer from the parsed config, with WaveGlow loading enabled.
nemo_synthesizer = NemoSynthesizer(
    config=config,
    load_waveglow=True,
)


Loaded Tacotron2 model.
Loaded WaveGlow model.
Loaded WaveGlow denoiser with strength 0.05.


In [4]:
# Sentences to synthesize. Only the active entry below is used; these earlier
# test inputs are kept for reference:
#   "Zu meiner Familie gehören vier Personen."
#   "Die Mutter bin ich und dann gehört natürlich mein Mann dazu."
#   "Wir haben zwei Kinder, einen Sohn, der sechs Jahre alt ist und eine dreijährige Tochter."
#   "Hast du etwas Zeit für mich? Dann singe ich ein Lied."
#   "Texts include specialised articles, biographies and summaries."
#   "I like to move it move it, I like to move it move it, we like to ... move it."
#   "Der gemeinsam mit dem BioWare-Autorenteam von Star Wars The Old Republic."
texts = [
    "When George Lucas finished the production of Star Wars VI, he cried.",
]

# make graph: the data layer feeds the texts to the model, using the label set
# and the special token ids exposed by the synthesizer
data_layer = InferenceDataLayer(
    texts=texts,
    labels=nemo_synthesizer.labels,
    bos_id=nemo_synthesizer.bos_id,
    eos_id=nemo_synthesizer.eos_id,
    pad_id=nemo_synthesizer.pad_id,
    batch_size=2,
    num_workers=1,
    shuffle=False,
)

# building inference pipeline:
# embedding -> encoder -> decoder -> postnet -> WaveGlow
transcript, transcript_len = data_layer()
transcript_embedded = nemo_synthesizer.tacotron_embedding(char_phone=transcript)
transcript_encoded = nemo_synthesizer.tacotron_encoder(
    char_phone_embeddings=transcript_embedded,
    embedding_length=transcript_len,
)
mel_decoder, gate, alignments, mel_len = nemo_synthesizer.tacotron_decoder(
    char_phone_encoded=transcript_encoded,
    encoded_length=transcript_len,
)
mel_postnet = nemo_synthesizer.tacotron_postnet(mel_input=mel_decoder)
audio_pred = nemo_synthesizer.waveglow(mel_spectrogram=mel_postnet)

# running the inference pipeline
requested_tensors = [mel_postnet, gate, alignments, audio_pred, mel_len]
evaluated_tensors = nemo_synthesizer.neural_factory.infer(tensors=requested_tensors)

# The last two entries are read as the audio and the mel lengths.
# (presumably results come back in request order -- TODO confirm)
audio_result, mel_len_result = evaluated_tensors[-2:]

In [5]:
# Factor that converts a mel length into an audio sample count. It does not
# change between samples, so it is looked up once instead of per sample.
n_stride = nemo_synthesizer.config['tacotron2']['config']["n_stride"]

# Collect one trimmed (and optionally denoised) waveform per synthesized text.
result = []
for batch_audio, batch_mel_len in zip(audio_result, mel_len_result):
    # Convert the batch to numpy once; the previous version repeated the
    # .cpu().numpy() conversion of the whole batch for every sample in it.
    batch_audio_np = batch_audio.cpu().numpy()

    for j in range(batch_audio_np.shape[0]):
        # Plain int so the slice bound does not depend on tensor __index__.
        sample_len = int(batch_mel_len[j]) * n_stride
        # Keep only the first sample_len samples of this waveform.
        sample = batch_audio_np[j][:sample_len]

        if nemo_synthesizer.is_denoiser_active:
            sample, _ = nemo_synthesizer.waveglow.denoise(
                sample, strength=nemo_synthesizer.denoiser_strength)

        result.append(sample)


In [6]:
# Output path for the WaveGlow result, relative to the current working
# directory. NOTE: the name save_file is reassigned by later cells.
save_file = "notebooks/kerstin_generated_waveglow.wav"

In [7]:
audio = Postprocessor.postprocess(result)

# conversion to 16-bit PCM
# Assumes float samples nominally in [-1, 1] -- TODO confirm against
# Postprocessor. Values outside that range are clipped first, because casting
# an out-of-range float to int16 wraps around and produces loud artifacts.
# The scaling is not done in place, so the array returned by the postprocessor
# is left untouched.
int16_max = np.iinfo(np.int16).max
audio = (np.clip(audio, -1.0, 1.0) * int16_max).astype("int16")

# save audio to file (the second argument is the sample rate)
write(save_file, 22050, audio)

In [8]:
# Render an inline audio player for the WAV file written by the previous cell.
IPython.display.Audio(save_file)

In [9]:
from inference.melgan import MelGAN


# MelGAN
# Settings handed to the project's MelGAN wrapper. The device is fixed to
# "cuda" and the checkpoint path is relative to the working directory.
melgan_conf = dict(
    input_size=80,
    ngf=32,
    n_residual_layers=3,
    device="cuda",
    load_path="models/melgan/multi_speaker.pt",
)
melgan = MelGAN(melgan_conf)

# Index 0 of the evaluated tensors is the first tensor requested from infer().
mel_out = evaluated_tensors[0]

# Only the first inference batch (mel_out[0]) is converted and vocoded here.
first_batch = mel_out[0]
mel_list = [spectrogram.cpu().numpy() for spectrogram in first_batch]

result_melgan = melgan.synthesize(mel_list)


In [10]:
# Output path for the MelGAN result; rebinds the save_file name that an
# earlier cell also assigns.
save_file = "notebooks/kerstin_generated_melgan.wav"

In [11]:
audio_melgan = Postprocessor.postprocess(result_melgan)

# conversion to 16-bit PCM
# Assumes float samples nominally in [-1, 1] -- TODO confirm against
# Postprocessor. Values outside that range are clipped first, because casting
# an out-of-range float to int16 wraps around and produces loud artifacts.
# The scaling is not done in place, so the array returned by the postprocessor
# is left untouched.
int16_max = np.iinfo(np.int16).max
audio_melgan = (np.clip(audio_melgan, -1.0, 1.0) * int16_max).astype("int16")

# save audio to file (the second argument is the sample rate)
write(save_file, 22050, audio_melgan)

In [12]:
# Sanity check: shape of the int16 MelGAN waveform that was just written.
audio_melgan.shape

(128152,)

In [13]:
# Render an inline audio player for the MelGAN WAV file written above.
IPython.display.Audio(save_file)

In [14]:
# MELGAN V2
import torch

# NOTE(security): torch.hub.load downloads and executes code from the given
# GitHub repository. Consider pinning a specific ref ('owner/repo:ref') so the
# executed code cannot change between runs.
vocoder = torch.hub.load('seungwonpark/melgan', 'melgan')

# Upstream usage example for the expected input:
#   mel = torch.randn(1, 80, 234)
# Stack the per-utterance mels and convert them to a tensor on every device.
# Previously the conversion only happened inside the CUDA branch, so a
# CPU-only run handed a numpy array to vocoder.inference. The guard also keeps
# the cell re-runnable once mel_list has been rebound to a tensor.
if not torch.is_tensor(mel_list):
    mel_list = torch.from_numpy(np.array(mel_list))

print('Input mel-spectrogram shape: {}'.format(tuple(mel_list.shape)))

if torch.cuda.is_available():
    print('Moving data & model to GPU')
    vocoder = vocoder.cuda()
    mel_list = mel_list.cuda()

# Inference only: no gradients are needed.
with torch.no_grad():
    audio = vocoder.inference(mel_list)

print('Output audio shape: {}'.format(audio.shape))

Input mel-spectrogram shape: (1, 80, 442)
Moving data & model to GPU
Output audio shape: torch.Size([113152])


Using cache found in /home/utanko/.cache/torch/hub/seungwonpark_melgan_master


In [15]:
# mel_list was rebound by the MelGAN V2 cell (it is no longer the Python list
# built earlier); inspect the shape of the batch that was fed to the vocoder.
mel_list.shape

torch.Size([1, 80, 442])

In [16]:
# Inspect the vocoder output as a host-side numpy array (the repr also shows
# its dtype).
audio.cpu().numpy()

array([-2,  4,  1, ...,  1,  5,  3], dtype=int16)

In [17]:
# Output path for the torch.hub MelGAN ("V2") result; rebinds the save_file
# name that earlier cells also assign.
save_file = "notebooks/kerstin_generated_melgan_v2.wav"

In [18]:
# The hub vocoder's output is written as-is. The array repr shown earlier
# reports dtype=int16, so neither the Postprocessor pass nor the
# float -> int16 scaling used for the other vocoders is applied here.
audio_melgan = audio.cpu().numpy()

# save audio to file
write(filename=save_file, rate=22050, data=audio_melgan)

In [19]:
# Display the samples that were written to the WAV file in the previous cell.
audio_melgan

array([-2,  4,  1, ...,  1,  5,  3], dtype=int16)

In [20]:
# Render an inline audio player for the MelGAN V2 WAV file written above.
IPython.display.Audio(save_file)

In [21]:
# Shape of every predicted mel spectrogram, across all inference batches.
mel_shapes = [tuple(mel.shape) for batch in mel_out for mel in batch]
print(mel_shapes)

[(80, 442)]


In [22]:
import torch

# mel_list holds a single spectrogram in this run (its leading dimension is
# 1), so 0 is the only valid index; indexing with 1 raised IndexError.
# torch.as_tensor accepts both a numpy array and an existing tensor, whereas
# torch.from_numpy rejects the tensor that mel_list is after the V2 cell.
torch.as_tensor(mel_list[0]).shape

torch.Size([80, 442])