In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2022 Peter Wu
#  MIT License (https://opensource.org/licenses/MIT)

"""Demo for speech-to-EMA and EMA-to-speech models."""

import os
import IPython

import numpy as np
import soundfile as sf
import torch
import yaml

from tqdm import tqdm

from ats.bin.decode import ar_loop
from ats.utils import load_model


# Speech-to-EMA

In [15]:
# Load Speech-to-EMA model
inversion_checkpoint_path = "../exp/mocha_train_f8nema_w2mocha_hifi6/checkpoint-105000steps.pkl"
inversion_config_path = "../conf/w2mocha_hifi6.yaml"

# load config
with open(inversion_config_path) as f:
    inversion_config = yaml.load(f, Loader=yaml.Loader)

if torch.cuda.is_available():
    inversion_device = torch.device("cuda")
else:
    inversion_device = torch.device("cpu")
inversion_model = load_model(inversion_checkpoint_path, inversion_config)
inversion_model.remove_weight_norm()
inversion_model = inversion_model.eval().to(inversion_device)

In [24]:
# Predict EMA sequence for given speech
input_wav_path = '../downloads/emadata/cin_us_faet0/wav/faet0_009.wav'
# input_wav_path = '../downloads/cmu_arctic/cmu_us_awb_arctic/wav/arctic_a0001.wav'
# input_wav_path = '../downloads/cmu_arctic/cmu_us_ksp_arctic/wav/arctic_a0001.wav'
output_art_path = None
    # NOTE change None to file path if want to save predicted EMA sequence

audio, sr = sf.read(input_wav_path)

with torch.no_grad():
    audio = torch.tensor(audio, dtype=torch.float).to(inversion_device)
    pred = ar_loop(inversion_model, audio, inversion_config)
    if output_art_path is not None:
        np.save(output_art_path, pred.cpu().numpy())


In [20]:
pred.cpu().numpy().shape

(800, 13)

# EMA-to-Speech

In [25]:
# Load EMA-to-Speech model
synthesis_checkpoint_path = "../exp/k_mocha_train_f8nema_mocha2w_hifi/checkpoint-130000steps.pkl"
synthesis_config_path = "../exp/k_mocha_train_f8nema_mocha2w_hifi/config.yml" # "../conf/e2w_hifigan.yaml"

# load config
with open(synthesis_config_path) as f:
    synthesis_config = yaml.load(f, Loader=yaml.Loader)

if torch.cuda.is_available():
    synthesis_device = torch.device("cuda")
else:
    synthesis_device = torch.device("cpu")
synthesis_model = load_model(synthesis_checkpoint_path, synthesis_config)
synthesis_model.remove_weight_norm()
synthesis_model = synthesis_model.eval().to(synthesis_device)

In [26]:
# Predict waveform for given EMA sequence
output_wav_path = "temp3.wav"

with torch.no_grad():
    c = torch.tensor(pred, dtype=torch.float).to(synthesis_device)
    y = ar_loop(synthesis_model, c, synthesis_config)
    sf.write(
        output_wav_path,
        y.cpu().numpy(),
        synthesis_config["sampling_rate"],
        "PCM_16",
    )


  c = torch.tensor(pred, dtype=torch.float).to(synthesis_device)


In [20]:
IPython.display.Audio(output_wav_path)
