In [None]:
import logging, sys
logging.disable(sys.maxsize)

import IPython.display as ipd
import numpy as np
import torch
import json
import librosa
import os

from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.modules.hifigan_modules import Generator as Hifigan_generator
from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor
from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
from nemo.collections.tts.models import HifiGanModel


# GPU selection for this kernel. "see issue #152" is the original author's pointer;
# the tracker it refers to is not identifiable from this notebook.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Expose only device index "0" to CUDA for this process.
# NOTE(review): torch is already imported above; presumably this still takes effect
# because CUDA is initialised lazily on first use — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
def infer(spec_gen_model, vocoder_model, str_input, speaker=None):
    """Synthesize speech for ``str_input`` and return ``(spectrogram, audio)``.

    Args:
        spec_gen_model: Spectrogram generator; must provide ``parse`` and
            ``generate_spectrogram``.
        vocoder_model: Either a bare ``Hifigan_generator`` module, which is called
            directly on the half-precision spectrogram, or a model exposing
            ``convert_spectrogram_to_audio``.
        str_input: Text to synthesize.
        speaker: Optional speaker id. ``None`` forwards no speaker to the
            spectrogram generator.

    Returns:
        Tuple ``(spectrogram, audio)``. Tensors are moved to the CPU and converted
        to NumPy arrays; a 3-D spectrogram is reduced to its first element.
    """
    with torch.no_grad():
        parsed = spec_gen_model.parse(str_input)
        if speaker is not None:
            speaker = torch.tensor([speaker]).long()
            # Follow the device of the parsed tokens instead of hard-coding CUDA so
            # the function also works for a model that lives on the CPU. If the
            # parser output exposes no device, keep the previous CUDA behaviour.
            token_device = getattr(parsed, "device", None)
            speaker = speaker.cuda() if token_device is None else speaker.to(token_device)
        spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, speaker=speaker)
        if isinstance(vocoder_model, Hifigan_generator):
            # Bare generator module: called directly with a half-precision input.
            audio = vocoder_model(x=spectrogram.half()).squeeze(1)
        else:
            audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        if len(spectrogram.shape) == 3:
            # Keep only the first element along the leading axis.
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio




def _get_best_ckpt_from_experiment(exp_dir):
    """Find the checkpoint with the lowest validation loss under ``exp_dir``.

    Walks ``exp_dir`` recursively and collects every ``*.ckpt`` file. The loss is
    parsed from the file name, which must contain ``v_loss=<float>-epoch``; other
    spellings are not recognised. Checkpoints whose loss cannot be parsed (or
    parses to NaN) are ranked last instead of being treated as a perfect 0.0.

    Args:
        exp_dir: Root directory of an experiment.

    Returns:
        ``(ckpt_candidates, best_path)`` where ``ckpt_candidates`` is a list of
        ``(val_error, path)`` tuples sorted ascending and ``best_path`` is the
        path of its first entry.

    Raises:
        FileNotFoundError: If no ``.ckpt`` file exists under ``exp_dir``.
    """
    unknown_loss = float("inf")
    ckpt_candidates = []
    for root, _dirs, files in os.walk(exp_dir):
        for file in files:
            if not file.endswith(".ckpt"):
                continue
            try:
                val_error = float(file.split("v_loss=")[1].split("-epoch")[0])
            except (IndexError, ValueError):
                # No "v_loss=" marker, or the text after it is not a number.
                val_error = unknown_loss
            if val_error != val_error:
                # NaN never compares consistently, which would make the sort
                # order arbitrary; rank it with the unknown-loss checkpoints.
                val_error = unknown_loss
            ckpt_candidates.append((val_error, os.path.join(root, file)))

    if not ckpt_candidates:
        raise FileNotFoundError(
            "No .ckpt files found under {!r}".format(exp_dir)
        )

    # Tuples sort by loss first and by path second, so ties are deterministic.
    ckpt_candidates.sort()
    return ckpt_candidates, ckpt_candidates[0][1]

# Waveform loader configured for a 44100 Hz sample rate, without augmentation.
wav_featurizer = WaveformFeaturizer(augmentor=None, int_values=False, sample_rate=44100)

# Keyword arguments for the mel-spectrogram preprocessor, grouped by concern.
_mel_processor_kwargs = dict(
    # Input signal
    sample_rate=44100,
    dither=0.0,
    preemph=None,
    # Analysis window (sizes given through the n_* arguments, the others unset)
    window="hann",
    window_size=None,
    window_stride=None,
    n_window_size=2048,
    n_window_stride=512,
    n_fft=None,
    # Filterbank
    features=80,
    lowfreq=0,
    highfreq=None,
    mag_power=1.0,
    # Log compression
    log=True,
    log_zero_guard_type="add",
    log_zero_guard_value=1e-05,
    # Post-processing, padding and STFT implementation switches
    normalize=None,
    frame_splicing=1,
    pad_to=1,
    pad_value=0,
    exact_pad=False,
    stft_exact_pad=False,
    stft_conv=False,
)
mel_processor = AudioToMelSpectrogramPreprocessor(**_mel_processor_kwargs)

In [None]:
def get_synthesis_models(fastpitch_model_path, hifigan_model_path):
    """Load a FastPitch / HiFi-GAN pair from checkpoints for GPU inference.

    Both checkpoint paths are printed first. Each model is then restored with
    ``load_from_checkpoint``, moved to CUDA and switched to eval mode.

    Returns:
        Tuple ``(spec_model, vocoder)``.
    """
    for ckpt_path in (fastpitch_model_path, hifigan_model_path):
        print(ckpt_path)

    spec_model = FastPitchModel.load_from_checkpoint(fastpitch_model_path)
    spec_model.cuda()
    spec_model.eval()

    vocoder = HifiGanModel.load_from_checkpoint(hifigan_model_path)
    vocoder.cuda()
    vocoder.eval()

    return spec_model, vocoder

def play_validation_samples(spec_model, vocoder,
                            val_manifest_file="/home/pneekhara/JonData/val_list.json",
                            max_samples=3):
    """Display real, re-vocoded and synthetic audio for validation records.

    For each record of the manifest three players are displayed:
      1. the recording at ``audio_filepath``,
      2. that recording's mel spectrogram passed through ``vocoder``,
      3. audio produced by ``infer`` from the record's ``text`` with speaker 0.

    Relies on the notebook-level ``wav_featurizer``, ``mel_processor`` and
    ``infer``.

    Args:
        spec_model: Spectrogram generator forwarded to ``infer``.
        vocoder: Model exposing ``convert_spectrogram_to_audio``.
        val_manifest_file: Manifest with one JSON record per line; each record
            needs ``text`` and ``audio_filepath``. Defaults to the path that
            used to be hard-coded here.
        max_samples: Maximum number of records to play; ``None`` plays all.
            The default of 3 matches the earlier behaviour for a manifest
            without blank lines.
    """
    num_played = 0
    with open(val_manifest_file) as f:
        for line in f:
            # Count played records rather than raw line indices so blank lines
            # cannot shift the stopping point.
            if max_samples is not None and num_played >= max_samples:
                break
            line = line.strip()
            if not line:
                continue

            record = json.loads(line)
            text_to_generate = record['text']
            print("Jon's actual voice")
            ipd.display(ipd.Audio(record['audio_filepath']))

            # Ground-truth mel spectrogram -> vocoder, to hear the vocoder alone.
            real_wav = wav_featurizer.process(record['audio_filepath'])
            real_mel, _ = mel_processor.get_features(real_wav[None], torch.tensor([[real_wav.shape[0]]]).long() )
            real_mel = real_mel[0]
            real_mel = real_mel.cuda()
            with torch.no_grad():
                vocoded_audio_real = vocoder.convert_spectrogram_to_audio(spec=real_mel).cpu().numpy()
                print("Ground truth spectrogram vocoded (HiFiGAN):")
                ipd.display(ipd.Audio(vocoded_audio_real, rate=44100))

            # Full text-to-speech path.
            spec, audio = infer(spec_model, vocoder, text_to_generate, speaker=0)
            print("Synthetic audio")
            ipd.display(ipd.Audio(audio, rate=44100))

            num_played += 1


In [None]:
# Fixed, pre-trained checkpoints.
fastpitch_8051_ckpt = "/home/pneekhara/PreTrainedModels/FastPitch.nemo"
hifigan_universal_ckpt = "/home/pneekhara/PreTrainedModels/HifiGan--val_loss=0.08-epoch=899.ckpt"

# Fine-tuned FastPitch checkpoints: index 1 of the helper's result is the selected path.
fastpitch_nomix_finetuned_ckpt = _get_best_ckpt_from_experiment(
    "/home/pneekhara/ExperimentsMainBranch/JonFinetuningv2"
)[1]
fastpitch_mix_finetuned_ckpt = _get_best_ckpt_from_experiment(
    "/home/pneekhara/ExperimentsMainBranch/JonFinetuningMixed"
)[1]

# Fine-tuned HiFi-GAN checkpoints.
hifigan_nomix_finetuned_ckpt = _get_best_ckpt_from_experiment(
    "/home/pneekhara/ExperimentsHiFiJon/JonFinetuning"
)[1]
hifigan_mix_finetuned_ckpt = _get_best_ckpt_from_experiment(
    "/home/pneekhara/ExperimentsHiFiJon/JonFinetuningMixing"
)[1]

# Show every resolved path, FastPitch first, then HiFi-GAN.
for ckpt_path in (
    fastpitch_8051_ckpt,
    fastpitch_nomix_finetuned_ckpt,
    fastpitch_mix_finetuned_ckpt,
    hifigan_universal_ckpt,
    hifigan_nomix_finetuned_ckpt,
    hifigan_mix_finetuned_ckpt,
):
    print(ckpt_path)

In [None]:
# Fixed, pre-trained checkpoints.
fastpitch_8051_ckpt = "/home/pneekhara/PreTrainedModels/FastPitch.nemo"
hifigan_universal_ckpt = "/home/pneekhara/PreTrainedModels/HifiGan--val_loss=0.08-epoch=899.ckpt"
# Alternative universal vocoder checkpoint, currently disabled:
# hifigan_universal_ckpt = "/home/pneekhara/PreTrainedModels/HifiGan--val_loss=0.09-epoch=202-last.ckpt"

# Fine-tuned FastPitch checkpoints: index 1 of the helper's result is the selected path.
fastpitch_nomix_finetuned_ckpt = _get_best_ckpt_from_experiment(
    "/home/pneekhara/ExperimentsMainBranch/JonFinetuningv2"
)[1]
fastpitch_mix_finetuned_ckpt = _get_best_ckpt_from_experiment(
    "/home/pneekhara/ExperimentsMainBranch/JonFinetuningMixed"
)[1]

# Fine-tuned HiFi-GAN checkpoints.
hifigan_nomix_finetuned_ckpt = _get_best_ckpt_from_experiment(
    "/home/pneekhara/ExperimentsHiFiJon/JonFinetuning"
)[1]
hifigan_mix_finetuned_ckpt = _get_best_ckpt_from_experiment(
    "/home/pneekhara/ExperimentsHiFiJon/JonFinetuningMixing"
)[1]

# Pair the non-mixed FastPitch fine-tune with the mixed HiFi-GAN fine-tune.
spec_model, vocoder = get_synthesis_models(
    fastpitch_model_path=fastpitch_nomix_finetuned_ckpt,
    hifigan_model_path=hifigan_mix_finetuned_ckpt,
)

In [None]:
# Synthesize one sentence with speaker id 1 and play the result at 44100 Hz.
text_to_generate = "Oh! This is a really good text to speech synthesis model."
spec, audio = infer(
    spec_gen_model=spec_model,
    vocoder_model=vocoder,
    str_input=text_to_generate,
    speaker=1,
)
ipd.display(ipd.Audio(data=audio, rate=44100))

In [None]:
# Same sentence and speaker id as the previous cell, synthesized and played again.
text_to_generate = "Oh! This is a really good text to speech synthesis model."
spec, audio = infer(
    spec_gen_model=spec_model,
    vocoder_model=vocoder,
    str_input=text_to_generate,
    speaker=1,
)
ipd.display(ipd.Audio(data=audio, rate=44100))

In [None]:
# Listen to validation samples with the currently loaded spec_model / vocoder pair.
play_validation_samples(spec_model, vocoder)

In [None]:
# For each duration-loss coefficient, load the FastPitch checkpoint selected from
# the matching experiment directory and play validation samples with it.
# Note: this rebinds the notebook-level spec_model / vocoder.
for dur_loss in [0.1]:
    print("dur_loss coeffecient", dur_loss)
    dur_loss_exp_dir = "/home/pneekhara/ExperimentsMainBranch/JonFinetuningDurLoss_{}".format(dur_loss)
    _fastpitch_ckpt = _get_best_ckpt_from_experiment(dur_loss_exp_dir)[1]
    spec_model, vocoder = get_synthesis_models(
        fastpitch_model_path=_fastpitch_ckpt,
        hifigan_model_path=hifigan_mix_finetuned_ckpt,
    )
    play_validation_samples(spec_model, vocoder)

In [None]:
def calculate_speaking_rate(mainfest_fp):
    """Return the mean characters-per-second over a JSON-lines manifest.

    Each non-empty line is parsed as a JSON record; its rate is
    ``len(record['text']) / record['duration']``.

    Args:
        mainfest_fp: Path to the manifest file (parameter name kept as-is for
            compatibility).

    Returns:
        ``np.mean`` of the per-record rates.
    """
    with open(mainfest_fp) as manifest:
        raw_lines = manifest.read().split("\n")

    # map/filter are lazy, so each record is parsed right before its rate is computed.
    parsed_records = map(json.loads, filter(None, raw_lines))
    rates = [len(entry['text']) / entry['duration'] for entry in parsed_records]
    return np.mean(rates)

In [None]:
# Mean characters-per-second for the train manifest, then the validation manifest.
for manifest_path in (
    "/home/pneekhara/JonData/train_list.json",
    "/home/pneekhara/JonData/val_list.json",
):
    print(calculate_speaking_rate(manifest_path))

In [None]:
# Mean characters-per-second for the 8051 clean training manifest, for comparison
# with the values printed for the JonData manifests.
print(calculate_speaking_rate("/home/pneekhara/Datasets/78419/Hi_Fi_TTS_v_0_backup/8051_manifest_clean_train.json"))
# Disabled: validation-manifest rate (already printed in the previous cell).
# print(calculate_speaking_rate("/home/pneekhara/JonData/val_list.json"))

In [None]:
from nemo.collections.asr.data import audio_to_text

In [None]:
from omegaconf import OmegaConf
# Load the FastPitch (alignment, 44100) example config from the local NeMo checkout.
conf = OmegaConf.load('/home/pneekhara/NeMo/examples/tts/conf/fastpitch_align_44100.yaml')

In [None]:
# Set the validation manifest and prior folder on the top level of the config.
# NOTE(review): presumably conf.model.validation_ds.dataset picks these up through
# config interpolation — confirm against the YAML.
conf.validation_datasets = "/home/pneekhara/JonData/val_list.json"
conf.prior_folder = "/home/pneekhara/dump"
# Print the dataset section that is instantiated in the next cell.
print (conf.model.validation_ds.dataset)

In [None]:
from hydra.utils import instantiate

# Build the validation dataset object from its config section.
val_dataset = instantiate(conf.model.validation_ds.dataset)

In [None]:

# Override the dataloader settings in the config: one sample per batch, one worker.
conf.model.validation_ds.dataloader_params.batch_size = 1
conf.model.validation_ds.dataloader_params.num_workers = 1
# The dataset supplies its own collate function; all other DataLoader arguments
# come from the config's dataloader_params section.
val_loader = torch.utils.data.DataLoader(val_dataset, collate_fn=val_dataset.collate_fn, **conf.model.validation_ds.dataloader_params)

In [None]:
# For every validation batch, play (a) the vocoder applied to the mel spectrogram
# of the real audio and (b) the vocoder applied to the mel spectrogram predicted by
# spec_model when it is given the real spectrogram, pitch and attention prior.
for batch in val_loader:
    with torch.no_grad():
        # Move every tensor of the batch to the GPU. Rebinding a loop variable
        # (`item = item.cuda()`) does not touch the unpacked names, which is why
        # `speakers` used to reach the model without being moved.
        audio, audio_lens, text, text_lens, attn_prior, pitch, speakers = [
            item.cuda() if item is not None else None for item in batch
        ]
        print(audio)

        mels, spec_len = spec_model.preprocessor(input_signal=audio, length=audio_lens)

        # (a) Ground-truth spectrogram through the vocoder.
        real_mel = mels
        vocoded_audio_real = vocoder.convert_spectrogram_to_audio(spec=real_mel).cpu().numpy()
        print("Ground truth spectrogram vocoded (HiFiGAN):")
        ipd.display(ipd.Audio(vocoded_audio_real, rate=44100))

        # (b) Spectrogram predicted by the model's forward pass. Note that the last
        # returned value rebinds `pitch`.
        mels_pred, _, _, log_durs_pred, pitch_pred, attn_soft, attn_logprob, attn_hard, attn_hard_dur, pitch = spec_model(
                text=text,
                durs=None,
                pitch=pitch,
                speaker=speakers,
                pace=1.0,
                spec=mels,
                attn_prior=attn_prior,
                mel_lens=spec_len,
                input_lens=text_lens)
        synthesized_audio = vocoder.convert_spectrogram_to_audio(spec=mels_pred)
        synthesized_audio = synthesized_audio.to('cpu').numpy()
        ipd.display(ipd.Audio(synthesized_audio, rate=44100))
        print (mels_pred)

In [None]:
# Re-run the vocoder on `mels_pred` left over from the last iteration of the loop
# above (requires that cell to have been executed first).
synthesized_audio = vocoder.convert_spectrogram_to_audio(spec=mels_pred)

In [None]:
# Display the loaded model's `learn_alignment` attribute.
spec_model.learn_alignment

In [None]:
# Display the DataLoader object itself (its repr, not its contents).
val_loader

In [None]:
# Print only the first batch produced by the validation loader.
for record in val_loader:
    print(record)
    break  # stop after one batch

In [None]:
# Print only the first batch produced by the validation loader
# (same code as the preceding cell).
for record in val_loader:
    print(record)
    break  # stop after one batch

In [None]:
# Print only the first batch produced by the validation loader
# (same code as the two preceding cells).
for record in val_loader:
    print(record)
    break  # stop after one batch