In [1]:
from pathlib import Path
import numpy as np
import librosa
from IPython.display import Audio

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

In [2]:
# Pretrained checkpoint locations; relative paths, so they resolve against the
# kernel's current working directory.
ENCODER_CHECKPOINT = Path("encoder/saved_models/pretrained.pt")
SYNTHESIZER_CHECKPOINT = Path("synthesizer/saved_models/pretrained/pretrained.pt")
VOCODER_CHECKPOINT = Path("vocoder/saved_models/pretrained/pretrained.pt")

# Load order is kept as encoder -> synthesizer -> vocoder. The encoder and
# vocoder are loaded through their modules' load_model functions, while the
# synthesizer is an object bound to a notebook-level name.
encoder.load_model(ENCODER_CHECKPOINT)
synthesizer = Synthesizer(SYNTHESIZER_CHECKPOINT)
vocoder.load_model(VOCODER_CHECKPOINT)

Loaded encoder "pretrained.pt" trained to step 1564501
Synthesizer using device: cpu
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at vocoder/saved_models/pretrained/pretrained.pt


In [3]:
def load_wav(path: str):
    """Load an audio file and pass it through the encoder's preprocessing.

    The file is decoded at its native sampling rate (``sr=None``). With
    librosa's default the audio would first be resampled to 22050 Hz, and
    only that already-resampled signal would reach
    ``encoder.preprocess_wav``. Decoding at the native rate removes that
    extra resampling step.

    Args:
        path: Location of the audio file; forwarded unchanged to
            ``librosa.load``.

    Returns:
        Whatever ``encoder.preprocess_wav`` returns when given the decoded
        samples and their sampling rate (presumably a waveform at the
        encoder's own rate — TODO confirm against the encoder module).
    """
    # sr=None keeps the file's own rate instead of librosa's 22050 Hz default.
    raw_wav, native_sampling_rate = librosa.load(path, sr=None)
    return encoder.preprocess_wav(raw_wav, native_sampling_rate)

# Clip that is fed to the encoder's embed_utterance in a later cell.
VOICE_CLIP = "data/real/milo-clean.mp3"
# Clip that is turned into a spectrogram and re-synthesised in later cells.
SOURCE_CLIP = "data/real/nick.mp3"

voice_wav = load_wav(VOICE_CLIP)
original_wav = load_wav(SOURCE_CLIP)



In [4]:
# Compute the encoder's embedding of the preprocessed reference voice clip.
# NOTE(review): voice_embedding is not consumed by any later cell visible in
# this notebook — confirm whether it was meant to condition the synthesizer.
voice_embedding = encoder.embed_utterance(voice_wav)
# Bare last expression so the cell displays the shape; the recorded run shows (256,).
voice_embedding.shape

(256,)

In [5]:
# Build a spectrogram straight from the preprocessed source clip. Note that this
# is called on the Synthesizer class itself, not on the `synthesizer` instance
# created when the checkpoints were loaded.
spectrogram = Synthesizer.make_spectrogram(original_wav)
# Displayed for inspection; the recorded run shows (80, 651)
# (presumably (channels, frames) — TODO confirm the axis order).
spectrogram.shape

(80, 651)

In [6]:
# Run the vocoder on the spectrogram to obtain a waveform. The recorded run
# prints a progress line (sample count, batch size, generation rate) while it works.
generated_wav = vocoder.infer_waveform(spectrogram)
# Displayed for inspection; the recorded run shows float64.
generated_wav.dtype

{| ████████████████ 142500/144000 | Batch Size: 15 | Gen Rate: 6.8kHz | }

dtype('float64')

In [7]:
# Render an inline audio player for the generated waveform.
# NOTE(review): the playback rate is taken from the encoder, while the samples
# come from the vocoder — confirm both use the same sampling rate, otherwise
# the clip plays back at the wrong speed and pitch.
Audio(generated_wav, rate=encoder.sampling_rate)