In [None]:
import logging
import os
import sys

# Select the GPU before torch / NeMo are imported. CUDA reads these variables
# when it is initialised, so setting them after an import that has already
# touched CUDA would silently have no effect.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

# Suppress log records of every level for the rest of the notebook.
logging.disable(sys.maxsize)

from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder, TextToWaveform
from nemo.collections.tts.models import FastPitchModel, Tacotron2Model
from nemo.collections.tts.modules.hifigan_modules import Generator as Hifigan_generator

import IPython.display as ipd
import numpy as np
from PIL import Image
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
import torch
import json
import librosa
from nemo.collections.tts.models import TwoStagesModel
from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor
from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
from nemo.collections.asr.models import EncDecSpeakerLabelModel
from nemo.collections.tts.models import HifiGanModel

In [None]:
def infer_taco(spec_gen_model, vocoder_model, str_input, speaker = None):
    """Synthesize audio for a text with a generator that takes no speaker input.

    Args:
        spec_gen_model: Object providing ``parse(text)`` and
            ``generate_spectrogram(tokens=...)``.
        vocoder_model: Either a raw ``Hifigan_generator`` (called directly on a
            half-precision copy of the spectrogram) or an object providing
            ``convert_spectrogram_to_audio(spec=...)``.
        str_input: Text to synthesize.
        speaker: Accepted so the signature matches ``infer``; it is ignored
            because the spectrogram generator is called with tokens only.

    Returns:
        Tuple ``(spectrogram, audio)``. Torch tensors are moved to the CPU and
        converted to NumPy arrays; a 3-D spectrogram is reduced to its first
        item.
    """
    with torch.no_grad():
        tokens = spec_gen_model.parse(str_input)
        # No speaker tensor is built here: it was never passed to the model,
        # and creating it on CUDA made the call fail on CPU-only machines.
        spectrogram = spec_gen_model.generate_spectrogram(tokens=tokens)
        if isinstance(vocoder_model, Hifigan_generator):
            # The bare generator is fed a half-precision spectrogram.
            audio = vocoder_model(x=spectrogram.half()).squeeze(1)
        else:
            audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        if len(spectrogram.shape) == 3:
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio


def infer(spec_gen_model, vocoder_model, str_input, speaker = None):
    """Synthesize audio for a text, optionally conditioned on a speaker id.

    Args:
        spec_gen_model: Object providing ``parse(text)`` and
            ``generate_spectrogram(tokens=..., speaker=...)``.
        vocoder_model: Either a raw ``Hifigan_generator`` (called directly on a
            half-precision copy of the spectrogram) or an object providing
            ``convert_spectrogram_to_audio(spec=...)``.
        str_input: Text to synthesize.
        speaker: Optional speaker id. When given it is wrapped in a one-element
            long tensor; ``None`` is passed through unchanged.

    Returns:
        Tuple ``(spectrogram, audio)``. Torch tensors are moved to the CPU and
        converted to NumPy arrays; a 3-D spectrogram is reduced to its first
        item.
    """
    with torch.no_grad():
        tokens = spec_gen_model.parse(str_input)
        if speaker is not None:
            # Put the speaker id on the same device as the parsed tokens
            # instead of assuming CUDA; fall back to CUDA (the previous
            # behaviour) when the parser does not return a tensor.
            if isinstance(tokens, torch.Tensor):
                device = tokens.device
            else:
                device = torch.device("cuda")
            speaker = torch.tensor([speaker]).long().to(device)
        spectrogram = spec_gen_model.generate_spectrogram(tokens=tokens, speaker=speaker)
        if isinstance(vocoder_model, Hifigan_generator):
            # The bare generator is fed a half-precision spectrogram.
            audio = vocoder_model(x=spectrogram.half()).squeeze(1)
        else:
            audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        if len(spectrogram.shape) == 3:
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

def get_best_ckpt(experiment_base_dir, new_speaker_id, duration_mins, mixing_enabled, original_speaker_id):
    """Look up the checkpoints of one speaker-adaptation experiment.

    The experiment directory name is built from the original speaker, the new
    speaker, whether mixing was enabled, and the dataset duration in minutes.

    Returns:
        Whatever ``_get_best_ckpt_from_experiment`` returns for that directory.
    """
    if mixing_enabled:
        dir_template = "{}/{}_to_{}_mixing_{}_mins"
    else:
        dir_template = "{}/{}_to_{}_no_mixing_{}_mins"
    experiment_dir = dir_template.format(
        experiment_base_dir, original_speaker_id, new_speaker_id, duration_mins
    )
    return _get_best_ckpt_from_experiment(experiment_dir)
    

def _get_best_ckpt_from_experiment(exp_dir):
    """Collect the ``.ckpt`` files found anywhere under an experiment directory.

    The validation loss is parsed from file names of the form
    ``...v_loss=<float>-epoch...``.

    Args:
        exp_dir: Root directory to search recursively. A missing directory
            yields an empty result.

    Returns:
        Tuple ``(ckpt_candidates, last_ckpt)``:
          * ``ckpt_candidates`` is a list of ``(val_loss, path)`` tuples sorted
            by ascending loss. Files whose loss cannot be parsed get
            ``float("inf")`` so they sort after every scored checkpoint
            instead of appearing to be the best one.
          * ``last_ckpt`` is the path of a checkpoint whose file name contains
            ``"last"``, or ``None`` if there is none. When several exist, the
            one visited last in sorted walk order is returned.
    """
    ckpt_candidates = []
    last_ckpt = None
    for root, dirs, files in os.walk(exp_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # returned "last" checkpoint does not depend on filesystem ordering.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".ckpt"):
                continue
            ckpt_path = os.path.join(root, file)
            try:
                val_error = float(file.split("v_loss=")[1].split("-epoch")[0])
            except (IndexError, ValueError):
                # No "v_loss=" marker, or its value is not a number.
                val_error = float("inf")
            if "last" in file:
                last_ckpt = ckpt_path
            ckpt_candidates.append((val_error, ckpt_path))
    ckpt_candidates.sort()

    return ckpt_candidates, last_ckpt

# Shared audio front-end objects used by the validation cells below:
# `wav_featurizer.process(path)` loads a waveform and
# `mel_processor.get_features(...)` turns it into spectrogram features.
# Both are configured for 44100 Hz audio.
wav_featurizer = WaveformFeaturizer(
    sample_rate=44100,
    int_values=False,
    augmentor=None,
)

# Keyword arguments are grouped by theme; the values are unchanged.
mel_processor = AudioToMelSpectrogramPreprocessor(
    # --- sampling / windowing ---
    sample_rate=44100,
    window_size=None,
    window_stride=None,
    n_window_size=2048,
    n_window_stride=512,
    window="hann",
    n_fft=None,
    # --- feature layout ---
    features=80,
    lowfreq=0,
    highfreq=None,
    mag_power=1.0,
    # --- log compression ---
    log=True,
    log_zero_guard_type="add",
    log_zero_guard_value=1e-05,
    # --- signal conditioning ---
    normalize=None,
    preemph=None,
    dither=0.0,
    # --- padding / framing ---
    pad_to=1,
    pad_value=0,
    frame_splicing=1,
    exact_pad=False,
    stft_exact_pad=False,
    stft_conv=False,
)

In [None]:
# vocoder = Hifigan_generator(
#     resblock=1,
#     upsample_rates=[8, 8, 4, 2],
#     upsample_kernel_sizes=[16, 16, 4, 4],
#     upsample_initial_channel=512,
#     resblock_kernel_sizes=[3, 7, 11],
#     resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
# )
# nemo_gen_keys = [k for k in vocoder.state_dict().keys()]
# adlr_gen_ckpt = torch.load("/home/pneekhara/Checkpoints/g_01224000")['generator']
# adlr_gen_keys = adlr_gen_ckpt.keys()

# new_nemo_ckpt = {nemo_key: adlr_gen_ckpt[adlr_key] for adlr_key, nemo_key in zip(adlr_gen_keys, nemo_gen_keys)}
# vocoder.load_state_dict(new_nemo_ckpt)
# vocoder = vocoder.cuda().eval().half()

# Configuration for a two-stage vocoder: a mel pseudo-inverse stage followed by
# a Griffin-Lim stage, both referenced by their `_target_` class paths.
cfg = {
    'linvocoder': {
        '_target_': 'nemo.collections.tts.models.two_stages.GriffinLimModel',
        'cfg': {'n_iters': 64, 'n_fft': 2048, 'l_hop': 512},
    },
    'mel2spec': {
        '_target_': 'nemo.collections.tts.models.two_stages.MelPsuedoInverseModel',
        'cfg': {
            'sampling_rate': 44100,
            'n_fft': 2048,
            'mel_fmin': 0,
            'mel_fmax': None,
            'mel_freq': 80,
        },
    },
}
vocoder_gl = TwoStagesModel(cfg).eval().cuda()

# HiFi-GAN vocoder loaded from a training checkpoint; this is the `vocoder`
# used by the synthesis cells further down.
vocoder = HifiGanModel.load_from_checkpoint(
    "/home/pneekhara/PreTrainedModels/HifiGan--val_loss=0.08-epoch=899.ckpt"
).eval().cuda()

# HiFi-GAN vocoder restored from a packaged .nemo file.
vocoder22050 = HifiGanModel.restore_from(
    "/home/pneekhara/PreTrainedModels/hifigan_ljspeech.nemo"
)
# Kept as the trailing expression so the cell still displays the model.
vocoder22050.eval().cuda()

In [None]:
# Load the "last" Tacotron 2 checkpoint of the experiment and synthesize one
# sentence through the HiFi-GAN `vocoder`.
_, taco_model_ckpt = _get_best_ckpt_from_experiment("/home/pneekhara/TacoRough44100")
print(taco_model_ckpt)
spec_model = Tacotron2Model.load_from_checkpoint(taco_model_ckpt)
# spec_model = Tacotron2Model.restore_from("/home/pneekhara/PreTrainedModels/Tacotron2-8051-char.nemo")
# Switch to eval mode explicitly, as the other model-loading cells do, so
# inference does not run with training-time layers (e.g. dropout) active.
spec_model.eval().cuda()
spec, audio = infer_taco(spec_model, vocoder, "This is a tricky text to speech example for comparing Taco tron and Fast pitch models.")
ipd.display(ipd.Audio(audio, rate=44100))

In [None]:
# Load the "last" FastPitch checkpoint of the 1-minute, no-mixing experiment
# and synthesize the same sentence through the HiFi-GAN `vocoder`.
_, fastpitch_ckpt = _get_best_ckpt_from_experiment("/home/pneekhara/ExperimentsAutomatedResetPitch/8051_to_6097_no_mixing_1_mins")
print(fastpitch_ckpt)
spec_model = FastPitchModel.load_from_checkpoint(fastpitch_ckpt)
# spec_model = Tacotron2Model.restore_from("/home/pneekhara/PreTrainedModels/tts_en_tacotron2.nemo")
# Switch to eval mode explicitly, as the other model-loading cells do, so
# inference does not run with training-time layers (e.g. dropout) active.
spec_model.eval().cuda()
spec, audio = infer(spec_model, vocoder, "This is a tricky text to speech example for comparing Taco tron and Fast pitch models.", speaker = 1)
ipd.display(ipd.Audio(audio, rate=44100))

In [None]:
# Display the Tacotron 2 checkpoint path that was selected in an earlier cell.
taco_model_ckpt

In [None]:
# Baseline FastPitch model restored from a packaged .nemo file, in eval mode on the GPU.
spec_model_original = FastPitchModel.restore_from('/home/pneekhara/PreTrainedModels/FastPitch.nemo').eval().cuda()
# spec_model = FastPitchModel.load_from_checkpoint("/home/pneekhara/Experiments/8051to92Simple100/FastPitch/2021-07-19_09-30-44/checkpoints/FastPitch--v_loss=1.11-epoch=182.ckpt")

# Dataset root (manifests and audio) and the root of the adaptation experiments.
data_dir = "/home/pneekhara/Datasets/78419/Hi_Fi_TTS_v_0_backup"
experiment_base_dir = "/home/pneekhara/ExperimentsAutomatedResetPitch/"

# Manifest subset name per speaker id (used to build the manifest file name).
clean_other_mapping = dict.fromkeys((92, 6097), 'clean')

# Per-speaker checkpoints used for the "All" dataset-size setting.
full_data_ckpts = {
    92: '/home/pneekhara/Checkpoints/FastPitchSpeaker92Epoch999.ckpt',
    6097: '/home/pneekhara/Checkpoints/FastPitch6097Epoch999.ckpt',
}

# Number of validation records to load per speaker.
num_val = 1

for speaker in [6097, 92]:
    manifest_path = os.path.join(data_dir, "{}_manifest_{}_{}.json".format(speaker, clean_other_mapping[speaker], "dev"))
    val_records = []
    with open(manifest_path, "r") as f:
        for i, line in enumerate(f):
            val_records.append( json.loads(line) )
            if len(val_records) >= num_val:
                break
    print ("**** REAL VALIDATION *****")
    for vidx, val_record in enumerate(val_records):
        print("Audio path:", val_record['audio_filepath'] )
        audio_path = os.path.join( data_dir, val_record['audio_filepath'] )
        
        real_wav = wav_featurizer.process(audio_path)
        real_mel, _ = mel_processor.get_features(real_wav[None], torch.tensor([[real_wav.shape[0]]]).long() )
        real_mel = real_mel[0]
#         imshow(real_mel[0].cpu().numpy(), origin="lower", aspect = "auto")
#         plt.show()
        real_mel = real_mel.cuda()
        with torch.no_grad():
            # vocoded_audio_real = vocoder(x=real_mel.half()).squeeze(1)
            vocoded_audio_real = vocoder.convert_spectrogram_to_audio(spec=real_mel).cpu().numpy()

        # vocoded_audio_real = vocoded_audio_real.to('cpu').numpy()
        # audio, sr = librosa.load(audio_path, sr=None)
        print (vidx, val_record['text'])
        print("Ground Truth Audio for speaker:", speaker)
        ipd.display(ipd.Audio(real_wav, rate=44100))
        print("Ground truh spectrogram vocoded (HiFiGAN):", speaker)
        ipd.display(ipd.Audio(vocoded_audio_real, rate=44100))
        # print("Vocoded (GL) from real spectrogram:", speaker)
    print ("************************")
    print ("********Generated*********")
    for duration_mins in ["All", 60, 5, 1]:
        for mixing in [False, True]:
            if duration_mins == "All":
                if mixing:
                    continue
                last_ckpt = full_data_ckpts[speaker]
            else:
                model_ckpts, last_ckpt = get_best_ckpt(experiment_base_dir, speaker, duration_mins, mixing, 8051)
            if last_ckpt is None:
                print ("Checkpoint not found for:", "Speaker: {} | Dataset size: {} mins | Mixing:{}".format(speaker, duration_mins, mixing)) 
                continue
                
            # print(last_ckpt)
            spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)
            spec_model.eval().cuda()
            _speaker=None
            if mixing:
                _speaker = 1
            for val_record in val_records:
                print ("SYNTHESIZED FOR -- Speaker: {} | Dataset size: {} mins | Mixing:{} | Text: {}".format(speaker, duration_mins, mixing, val_record['text']))
                spec, audio = infer(spec_model, vocoder, val_record['text'], speaker = _speaker)
                ipd.display(ipd.Audio(audio, rate=44100))
                %matplotlib inline
                #if spec is not None:
#                 imshow(spec, origin="lower", aspect = "auto")
#                 plt.show()

In [None]:
# Only the normalised transcript belongs in this string: everything in it is
# handed to the model as text to speak, so manifest metadata (duration, raw
# text fields) must not be pasted in alongside it.
text_to_generate = "by nineteen fifty two this figure had become one hundred fifty four thousand two hundred seventy seven by virtue of minor changes"
spec, audio = infer(spec_model_original, vocoder, text_to_generate)
print("Original model trained on speaker 8051")
ipd.display(ipd.Audio(audio, rate=44100))

In [None]:
# Load the fine-tuned HiFi-GAN vocoder and the fine-tuned FastPitch model from
# the "last" checkpoint of their respective experiment directories.
_, vocoder_ckpt_path = _get_best_ckpt_from_experiment("/home/pneekhara/ExperimentsHiFiJon/JonFinetuningMixing")
print (vocoder_ckpt_path)
vocoder_jon = HifiGanModel.load_from_checkpoint(vocoder_ckpt_path)
vocoder_jon.eval().cuda()
_, last_ckpt = _get_best_ckpt_from_experiment("/home/pneekhara/ExperimentsMainBranch/JonFinetuningMixed")
print(last_ckpt)
# Switch to eval mode explicitly, as is done for the vocoder above, so
# inference does not run with training-time layers (e.g. dropout) active.
spec_model_custom = FastPitchModel.load_from_checkpoint(last_ckpt).eval().cuda()

# text_to_generate = "This is an experiment to see how good the model performs on real world data."
# spec, audio = infer(spec_model_custom, vocoder_jon, text_to_generate)
# print("Jon's synthetic voice - finetuned vocoder")
# ipd.display(ipd.Audio(audio, rate=44100))







In [None]:
# Walk the validation manifest (one JSON record per line) and, for every
# record, play: the real recording, the real spectrogram through the
# fine-tuned vocoder, and the text synthesized with both vocoders.
val_manifest_file = "/home/pneekhara/JonData/val_list.json"

with open(val_manifest_file) as f:
    all_lines = f.read().split("\n")

for line in all_lines:
    # Skip empty lines (e.g. the one after a trailing newline).
    if len(line) == 0:
        continue
    record = json.loads(line)

    print("Jon's actual voice")
    ipd.display(ipd.Audio(record['audio_filepath']))

    # Real waveform -> spectrogram features -> audio through the fine-tuned vocoder.
    real_wav = wav_featurizer.process(record['audio_filepath'])
    real_mel, _ = mel_processor.get_features(
        real_wav[None],
        torch.tensor([[real_wav.shape[0]]]).long(),
    )
    real_mel = real_mel[0].cuda()
    with torch.no_grad():
        vocoded_audio_real = vocoder_jon.convert_spectrogram_to_audio(spec=real_mel).cpu().numpy()
    print("Ground truh spectrogram vocoded (HiFiGAN):")
    ipd.display(ipd.Audio(vocoded_audio_real, rate=44100))

    # Same text, synthesized first with the fine-tuned vocoder and then with
    # the general-purpose one.
    text_to_generate = record['text']
    spec, audio = infer(spec_model_custom, vocoder_jon, text_to_generate, speaker=1)
    print("Jon's synthetic voice - finetuned vocoder")
    ipd.display(ipd.Audio(audio, rate=44100))

    spec, audio = infer(spec_model_custom, vocoder, text_to_generate, speaker = 1)
    print("Jon's synthetic voice - universal vocoder")
    ipd.display(ipd.Audio(audio, rate=44100))
    print("*********************")


In [None]:
# Synthesize a free-form sentence with the fine-tuned FastPitch model and the
# fine-tuned vocoder, using speaker id 1, and play the result.
text_to_generate = "This is Jon's voice. Does it sound like him?"
spec, audio = infer(
    spec_model_custom,
    vocoder_jon,
    text_to_generate,
    speaker=1,
)
print("Jon's synthetic voice - finetuned vocoder")
ipd.display(ipd.Audio(audio, rate=44100))
