In [98]:
import os
from text import text_to_sequence
from synthesize import preprocess_english
from model import FastSpeech2
from text.symbols import symbols
import yaml
import torch
import numpy as np
import nltk
nltk.download('averaged_perceptron_tagger_eng')

ckpt_path = "/data/nzxyin/FastSPARC2/output/ckpt/LibriTTS_R_1/10000.pth.tar"
os.environ['CUDA_VISIBLE_DEVICES']='0'
device = "cuda:0" if torch.cuda.is_available() else 'cpu'

preprocess_config = yaml.load(open("config/LibriTTS-R/preprocess.yaml", "r"), Loader=yaml.FullLoader)
model_config = yaml.load(open("config/LibriTTS-R/model.yaml", "r"), Loader=yaml.FullLoader)
train_config = yaml.load(open("config/LibriTTS-R/train.yaml", "r"), Loader=yaml.FullLoader)

model = FastSpeech2(model_config, preprocess_config).to(device)
ckpt = torch.load(ckpt_path)
model.load_state_dict(ckpt["model"])

text = r"{sil} hello, this is john smith. may I ask who is this? {sil}"
text = np.array(text_to_sequence(text, preprocess_config["preprocessing"]["text"]["text_cleaners"]))
print(text)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/nzxyin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


[359  11  45  42  49  49  52   6  11  57  45  46  56  11  46  56  11  47
  52  45  51  11  56  50  46  57  45   7  11  50  38  62  11  46  11  38
  56  48  11  60  45  52  11  46  56  11  57  45  46  56  10  11 359]


In [103]:
texts = torch.from_numpy(text).unsqueeze(0).to(device)
text_lens = torch.tensor([texts.shape[1]]).to(device)
max_text_len = max(text_lens)
(
    lr_output,
    pitch_predictions,
    energy_predictions,
    log_duration_predictions,
    duration_rounded,
    periodicity_predictions,
    ema_prediction,
    src_masks,
    bn_masks,
    src_lens,
    bn_lens,
) = model(texts, text_lens, max_text_len)

In [104]:
import numpy as np
import os
os.makedirs("/data/nzxyin/FastSPARC2/sample_predictions/", exist_ok=True)
np.save("/data/nzxyin/FastSPARC2/sample_predictions/pitch_predictions.npy", pitch_predictions.detach().cpu().numpy()[0])
np.save("/data/nzxyin/FastSPARC2/sample_predictions/energy_predictions.npy", energy_predictions.detach().cpu().numpy()[0])
np.save("/data/nzxyin/FastSPARC2/sample_predictions/periodicity_predictions.npy", periodicity_predictions.detach().cpu().numpy()[0])
np.save("/data/nzxyin/FastSPARC2/sample_predictions/ema_predictions.npy", ema_prediction.detach().cpu().numpy()[0])

In [105]:
periodicity_predictions[0]

tensor([0.0581, 0.0403, 0.0361, 0.0295, 0.0277, 0.0393, 0.0286, 0.0446, 0.0525,
        0.0461, 0.1034, 0.0968, 0.6046, 0.5885, 0.6061, 0.5700, 0.5923, 0.6049,
        0.5900, 0.5973, 0.5675, 0.5791, 0.5839, 0.5790, 0.7149, 0.7633, 0.7332,
        0.7588, 0.7769, 0.7884, 0.7614, 0.7833, 0.7789, 0.7872, 0.7555, 0.7488,
        0.7481, 0.7639, 0.7644, 0.7590, 0.7650, 0.7289, 0.6933, 0.6973, 0.6672,
        0.6443, 0.6318, 0.6619, 0.6564, 0.6699, 0.6863, 0.6708, 0.7073, 0.7052,
        0.6644, 0.6460, 0.5800, 0.5715, 0.5876, 0.2475, 0.2162, 0.3263, 0.3094,
        0.3281, 0.2946, 0.2038, 0.2621, 0.2890, 0.2215, 0.2945, 0.1751, 0.2455,
        0.4265, 0.3007, 0.3243, 0.2014, 0.1611, 0.1775, 0.2563, 0.3147, 0.3846,
        0.3242, 0.2026, 0.2141, 0.1734, 0.3235, 0.3256, 0.3917, 0.4938, 0.5179,
        0.5630, 0.5638, 0.5686, 0.5732, 0.5836, 0.5861, 0.5872, 0.5648, 0.5703,
        0.5697, 0.5740, 0.5681, 0.5618, 0.5244, 0.5527, 0.5486, 0.5592, 0.4850,
        0.4676, 0.4845, 0.4974, 0.5016, 

In [102]:
duration_rounded[0]

tensor([11., 17., 13., 14., 16., 16., 12., 12., 21., 17., 16., 17., 18., 19.,
        21., 20., 25., 19., 16., 17., 15., 23., 20., 21., 16., 17., 23., 22.,
        19., 18., 16., 20., 18., 16., 21., 16., 16., 17., 21., 18., 17., 15.,
        16., 16., 16., 17., 15., 16., 17., 19., 15., 24., 10.],
       device='cuda:0', grad_fn=<SelectBackward0>)

18