In [1]:
import io
import torch
from fastapi import FastAPI, WebSocket
from text import text_to_sequence
from models import SynthesizerTrn
from text.symbols import symbols
from scipy.io.wavfile import write
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from text.mlb_fr_symbols import symbols as fr_symbols
from text.mlb_fr import text_to_sequence as fr_text_to_sequence

from text.vctk_symbols import symbols as vctk_symbols
from text.vctk import text_to_sequence as vctk_text_to_sequence

from text.rw_symbols import symbols as rw_symbols
from text.rw import text_to_sequence as rw_text_to_sequence


from scipy.io.wavfile import write
import os
import commons

In [2]:
# Locations of the VCTK hyper-parameter file and model checkpoint.
# Both paths are relative, so they resolve against the kernel's working directory.
VCTK_CONFIG = "./configs/vctk_base.json"
VCTK_MODEL = "./models/vctk.pth"

In [3]:
# Parse the VCTK config file; later cells read the .data, .train and .model
# sections of the returned object.
vctk_hps = utils.get_hparams_from_file(VCTK_CONFIG)


In [4]:

def _load_vctk_synthesizer(device):
    """Build a VCTK ``SynthesizerTrn`` on ``device`` and load its checkpoint.

    The network is sized from ``vctk_hps`` and ``vctk_symbols``, moved to
    ``device``, switched to eval mode, and then populated from the checkpoint
    at ``VCTK_MODEL``.

    Args:
        device: Anything accepted by ``torch.nn.Module.to`` ("cuda", "cpu", ...).

    Returns:
        The constructed module, in eval mode, with checkpoint weights loaded.
    """
    net = SynthesizerTrn(
        len(vctk_symbols),
        vctk_hps.data.filter_length // 2 + 1,
        vctk_hps.train.segment_size // vctk_hps.data.hop_length,
        n_speakers=vctk_hps.data.n_speakers,
        **vctk_hps.model).to(device)
    net.eval()
    # Use the shared VCTK_MODEL constant rather than repeating the path literal.
    # The third argument is passed as None exactly as before
    # (presumably the optimizer slot -- confirm against utils.load_checkpoint).
    utils.load_checkpoint(VCTK_MODEL, net, None)
    return net


# One copy of the model per device so GPU and CPU inference can both be run.
vctk_gpu_model = _load_vctk_synthesizer("cuda")
vctk_cpu_model = _load_vctk_synthesizer("cpu")



INFO:root:Loaded checkpoint './models/vctk.pth' (iteration 0)
INFO:root:Loaded checkpoint './models/vctk.pth' (iteration 0)


In [5]:
def get_text_vctk(text, hps):
    """Turn ``text`` into a ``torch.LongTensor`` of VCTK symbol ids.

    Args:
        text: Input string handed to ``vctk_text_to_sequence``.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` is forwarded to
            the text front-end and ``hps.data.add_blank`` controls whether the
            id sequence is interspersed with 0 via ``commons.intersperse``.

    Returns:
        A ``torch.LongTensor`` built from the (optionally interspersed) ids.
    """
    token_ids = vctk_text_to_sequence(text, hps.data.text_cleaners)
    if not hps.data.add_blank:
        return torch.LongTensor(token_ids)
    return torch.LongTensor(commons.intersperse(token_ids, 0))

In [7]:
# Smoke test: synthesize one sentence on the GPU model with speaker id 4.
stn_tst = get_text_vctk("VITS is Awesome!", vctk_hps)
with torch.no_grad():
    # Add a leading batch dimension and move the inputs to the GPU.
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([4]).cuda()
    audio = (
        vctk_gpu_model.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1,
        )[0][0, 0]
        .data.cpu()
        .float()
        .numpy()
    )


b'RIFF24\x01\x00WAVEfmt \x12\x00\x00\x00\x03\x00\x01\x00"V\x00\x00\x88X\x01\x00\x04\x00 \x00\x00\x00fact\x04\x00\x00\x00\x00M\x00\x00data\x004\x01\x00\xfb\x1e^;R\x19V;\x88\xf1P;\xbb>A;\xa3\xe8E;\xd1\x89J;\x86\xfeE;\xd3II;\xd6,G;?\xd8J;\xf7pQ;OzW;\x8ere;\xff9g;\xf3\xa7m;\x85Hr;.Fr;\xbcFl;\xf1]l;\x01Iv;9 z;\xc6\xd7v;\xe9\x91w;\xc3\xef{;R\xebx;>\xd9\x80;\x02|\x7f;\xb8\xa2~;\xe7(\x83;vT\x83;1=\x85;~\xc8\x81;\x9f3\x83;c\xb7\x83;N\xe9\x82;-Y\x8c;\x999\x8b;\x014\x8f;^\x19\x90;\xfc\xdd\x8a;\xaf\xb6\x8b;\x03\xd4\x8b;\x98\xa4\x88;\xf1\xb3\x89;\x87\x10\x8c;{\x99\x8a;\xc4|\x90;\xb1\xcc\x8f;\x07g\x93;\x96\x1e\x93;\xfd\xd8\x94;`\xeb\x92;\xc3\xf0\x96;\xa4\xa7\x98;\xc4\x05\x98;\xb2\x9e\x90;\tk\x91;5\x95\x92;r\x94\x91;\xfcF\x97;:\x0b\x98;\xdc\x1e\x9a;\xa1\xc2\x92;2Q\x98;\xe41\x93;\x06\xa2\x9a;9\xb7\x99;\x16\t\x98;;\xd9\x9c;\x86\xdb\x9c;\xe3\x90\x9b;U\xbb\x9c;\xe4D\x9e;\xfb\xdc\x9b;\xf4V\x9e;P\xfc\x9b;\xca\xa0\x9f;\xcc\xb9\xa1;\x98g\xa2;\x1eB\xa0;b\x1c\x9f;R\xab\x9b;\x1f|\xa1;a;\x9f;x\n\x9d;\xef+\x9e;`\

In [11]:
def vctk_gpu(stn_tst, model, hps, speaker_id=4, noise_scale=.667, noise_scale_w=0.8, length_scale=1):
    """Run ``model.infer`` on the GPU and return the result as WAV bytes.

    Args:
        stn_tst: Tensor of symbol ids (e.g. the output of ``get_text_vctk``);
            ``stn_tst.size(0)`` is used as the sequence length.
        model: Object exposing ``infer(x, x_lengths, sid=..., noise_scale=...,
            noise_scale_w=..., length_scale=...)``; element ``[0][0, 0]`` of its
            return value is taken as the waveform. Expected to live on the GPU.
        hps: Hyper-parameter object; ``hps.data.sampling_rate`` becomes the WAV
            sample rate.
        speaker_id: Integer speaker index wrapped in a LongTensor and passed as
            ``sid``. Defaults to 4, the value that used to be hard-coded.
        noise_scale: Forwarded unchanged to ``model.infer``.
        noise_scale_w: Forwarded unchanged to ``model.infer``.
        length_scale: Forwarded unchanged to ``model.infer``.

    Returns:
        ``bytes`` holding a complete WAV file written by ``scipy.io.wavfile.write``.
    """
    with torch.no_grad():
        # Batch of one: add a leading dimension and move every input to CUDA.
        x_tst = stn_tst.cuda().unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
        sid = torch.LongTensor([speaker_id]).cuda()
        audio = model.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0].data.cpu().float().numpy()

    # Encode in memory; getvalue() returns the whole buffer regardless of the
    # current stream position.
    audio_file = io.BytesIO()
    write(audio_file, hps.data.sampling_rate, audio)
    return audio_file.getvalue()

def vctk_cpu(stn_tst, model, hps, speaker_id=4, noise_scale=.667, noise_scale_w=0.8, length_scale=1):
    """Run ``model.infer`` on the CPU and return the result as WAV bytes.

    Args:
        stn_tst: Tensor of symbol ids (e.g. the output of ``get_text_vctk``);
            ``stn_tst.size(0)`` is used as the sequence length.
        model: Object exposing ``infer(x, x_lengths, sid=..., noise_scale=...,
            noise_scale_w=..., length_scale=...)``; element ``[0][0, 0]`` of its
            return value is taken as the waveform. Expected to live on the CPU.
        hps: Hyper-parameter object; ``hps.data.sampling_rate`` becomes the WAV
            sample rate.
        speaker_id: Integer speaker index wrapped in a LongTensor and passed as
            ``sid``. Defaults to 4, the value that used to be hard-coded.
        noise_scale: Forwarded unchanged to ``model.infer``.
        noise_scale_w: Forwarded unchanged to ``model.infer``.
        length_scale: Forwarded unchanged to ``model.infer``.

    Returns:
        ``bytes`` holding a complete WAV file written by ``scipy.io.wavfile.write``.
    """
    with torch.no_grad():
        x_tst = stn_tst.cpu().unsqueeze(0)
        # torch.LongTensor(...) is created on the CPU, so no device move is needed.
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        # Keep the speaker id on the CPU with the other inputs. It was previously
        # sent to CUDA, which mismatches a CPU model and fails without a GPU.
        sid = torch.LongTensor([speaker_id])
        audio = model.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0].data.cpu().float().numpy()

    # Encode in memory; getvalue() returns the whole buffer regardless of the
    # current stream position.
    audio_file = io.BytesIO()
    write(audio_file, hps.data.sampling_rate, audio)
    return audio_file.getvalue()

In [20]:
def get_audio_gpu_vctk(text, vctk_gpu_model, vctk_hps):
    """Synthesize ``text`` with the given model via ``vctk_gpu``.

    Args:
        text: String to convert with ``get_text_vctk``.
        vctk_gpu_model: Model object forwarded to ``vctk_gpu``.
        vctk_hps: Hyper-parameter object used for both text conversion and
            synthesis.

    Returns:
        Whatever ``vctk_gpu`` returns for the converted text.
    """
    token_ids = get_text_vctk(text, vctk_hps)
    wav_bytes = vctk_gpu(token_ids, vctk_gpu_model, vctk_hps)
    return wav_bytes

In [21]:
# End-to-end check: text in, WAV bytes out (the cell displays the returned value).
get_audio_gpu_vctk(
    text="VITS is Awesome!",
    vctk_gpu_model=vctk_gpu_model,
    vctk_hps=vctk_hps,
)

b'RIFF20\x01\x00WAVEfmt \x12\x00\x00\x00\x03\x00\x01\x00"V\x00\x00\x88X\x01\x00\x04\x00 \x00\x00\x00fact\x04\x00\x00\x00\x00L\x00\x00data\x000\x01\x00 \xed\x8f;\xa7\xd6\x89;\xb1\xc4\x89;4\xb1\x83;<\xe1\x84;\x87\xc0\x88;\xf3\xd2\x89;\xe7M\x8a;\x04\xed\x89;\x81\xee\x8c;\x90\x8b\x8e;P\x80\x8f;\xe0l\x96;\x17\x04\x9a;I?\x9e;\xc2\x86\x9f;\xc1>\x9e;\xcc!\x9b;\x89A\x9d;+g\xa4;\xab\xa6\xa8;\x96\xb4\xa5;\x95+\xa8;\x08\\\xac;\x97\x02\xac;\xe2\xfe\xad;QR\xae;&]\xb2;\xd4\xd8\xb7;I6\xbb;\xae}\xbd;\x85\xd8\xba;!\x06\xbb;\xe2\xaa\xb8;8\x95\xb5;\xba\xdc\xbc;\xe0K\xbf;/b\xc3;f&\xc0;\x06{\xba;\x86\xd1\xba;\xe7\xb8\xb6;\xa6\xad\xb0; y\xb6;\xc4\r\xbd;\xfa\xcb\xbb;\xee\xeb\xc8;Gs\xca;\xc3\xb9\xcb;\x02\xb9\xcb;\x99\xad\xca;\x16\x98\xc9;!\x9e\xcc;a \xcd;9\xc6\xce;X\xbb\xc7;\xe5\xf5\xc3;S\xd7\xc3;VW\xc7;[/\xcd;\xeew\xce;\xccc\xd2;\x85\xb7\xc9;5\x8f\xcc;uB\xc7;L\xa2\xd1;\x1d:\xd3;\x1e\x05\xd2;A\x88\xd8;\xc3\x86\xda;\xe8\xd3\xdb;\x98\x81\xd8;\xd1\xc1\xdc;*4\xdf;6*\xe1;\xe5\x19\xe4;\xbd4\xe7;\xa4\x93\xe4;\x06\xab

In [None]:
# Display the data section of the VCTK hyper-parameters; earlier cells read
# fields such as sampling_rate, n_speakers, text_cleaners and add_blank from it.
vctk_hps.data