In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

def get_text(text, hps):
    """Turn a raw text string into a tensor of symbol ids.

    Args:
        text: The string to convert.
        hps: Hyper-parameter object; `hps.data.text_cleaners` is forwarded to
            `text_to_sequence` and `hps.data.add_blank` toggles the extra
            `commons.intersperse(..., 0)` pass.

    Returns:
        A 1-D `torch.LongTensor` holding the resulting id sequence.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Item 0 is handed to intersperse as the filler value.
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)

# Hyper-parameters (data / train / model sections) are read from the JSON config.
hps = utils.get_hparams_from_file("./configs/genshin_base_ms.json")

# Build the multi-speaker synthesizer, move it to the GPU, and put it in eval mode.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
net_g = net_g.cuda()
_ = net_g.eval()

# Load generator weights; the commented lines are alternative checkpoints.
_ = utils.load_checkpoint('./checkpoints/G_2036.pth', net_g, None)
# _ = utils.load_checkpoint('../VITS_model/pretrain/G_809000.pth', net_g, None)
# _ = utils.load_checkpoint('../VITS_model/proper/G_1803.pth', net_g, None)

# Display names; the list position is used as the integer speaker id.
speaker = [
    'Paimon', 'Miko', 'Kazuha', 'Nahida', 'Hutao',
    'Ayaka', 'Yoimiya', 'Ganyu', 'Mona', 'Ei',
]

import soundfile as sf
import random
# Read every line of the validation filelist. The context manager closes the
# handle even if reading raises, which the manual open()/close() pair did not.
with open('./filelists/val_filelist.txt', 'r', encoding='utf8') as f:
    lines = f.readlines()

In [22]:

import os

# Pick one random validation entry. Each filelist line is tab-separated: the
# speaker id is read from column 1 and the transcript from the last column.
randomIndex = random.randrange(len(lines))
fields = lines[randomIndex].strip().split('\t')
text = fields[-1]
thisSpeaker = speaker[int(fields[1])]
speed = 1. #@param {type:"slider", min:0.1, max:3, step:0.05}
# text = ''  # uncomment and fill in to synthesize custom text instead
print(f'{thisSpeaker}: {text}')

# Create the output directory up front so writing the wav files cannot fail
# just because the directory is missing.
os.makedirs('./output', exist_ok=True)

with torch.no_grad():
    # The text tensors do not depend on the speaker, so build them once
    # instead of once per loop iteration.
    stn_tst = get_text(text, hps)
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()

    # Synthesize the same sentence with every speaker id in turn.
    for i in range(len(speaker)):
        sid = torch.LongTensor([i]).cuda()
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667,
            noise_scale_w=0.8, length_scale=speed)[0][0,0].data.cpu().float().numpy()
        # Index the name list with the plain int rather than the CUDA tensor.
        print(f'{speaker[i]}:')
        ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate))
        audio_path = f'./output/{speaker[i]}.wav'
        sf.write(audio_path, audio, samplerate=hps.data.sampling_rate)

Mona: 怎么样，要真是正经做生意的人，没什么不敢的吧？
Paimon:


Miko:


Kazuha:


Nahida:


Hutao:


Ayaka:


Yoimiya:


Ganyu:


Mona:


Ei:


In [4]:

from scipy.io.wavfile import read
import numpy as np

# Module-level handle to the synthesizer's `enc_q` submodule
# (presumably the posterior encoder, going by the variable name — confirm in models.py).
post_enc = net_g.enc_q
# print(post_enc) # 513, 192
def load_wav_to_torch(full_path):
    """Read a wav file and return its samples as a float tensor.

    Args:
        full_path: Path of the wav file, passed to `scipy.io.wavfile.read`.

    Returns:
        A `(waveform, sampling_rate)` tuple. `waveform` is a `torch.FloatTensor`
        with the sample values cast to float32 (no rescaling is applied);
        `sampling_rate` is the rate reported by the file.
    """
    sampling_rate, samples = read(full_path)
    waveform = torch.FloatTensor(samples.astype(np.float32))
    return waveform, sampling_rate

def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear magnitude spectrogram with a Hann-windowed STFT.

    Args:
        y: Real-valued waveform tensor. It is unsqueezed at dim 1 for reflect
            padding and squeezed back, so a (batch, samples) layout is expected.
        n_fft: FFT size; the result has `n_fft // 2 + 1` frequency bins.
        sampling_rate: Unused; kept so existing callers keep working.
        hop_size: Hop length between frames, in samples.
        win_size: Length of the Hann window, in samples.
        center: Forwarded to `torch.stft`.

    Returns:
        Tensor of `sqrt(re**2 + im**2 + 1e-6)` per time-frequency bin, with the
        frequency axis before the frame axis.
    """
    # Window is created per call to match the input's dtype and device.
    window = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    # Reflect-pad both ends by the same amount before framing.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # Recent PyTorch requires `return_complex` to be given for real inputs;
    # request the complex result and view it as (..., 2) real/imag pairs so the
    # magnitude computation below is unchanged.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=window,
                      center=center, pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)

    # The small epsilon keeps the square root away from exactly zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec

def get_audio(filename, max_wav_value=32768.0, filter_length=1024, hop_length=256, win_length=1024):
    """Load a wav file and return its spectrogram as a batch of one on the GPU.

    Args:
        filename: Path of the wav file to load.
        max_wav_value: Divisor applied to the raw sample values.
        filter_length: FFT size passed to `spectrogram_torch`.
        hop_length: Hop length passed to `spectrogram_torch`.
        win_length: Window length passed to `spectrogram_torch`.

    Returns:
        `(spec, spec_lengths)`, both moved to CUDA: `spec` is a float tensor of
        shape (1, bins, frames) and `spec_lengths` is a LongTensor holding the
        frame count.
    """
    audio, sampling_rate = load_wav_to_torch(filename)
    audio_norm = (audio / max_wav_value).unsqueeze(0)  # add the batch dimension
    spec = spectrogram_torch(audio_norm, filter_length,
        sampling_rate, hop_length, win_length,
        center=False)
    spec = torch.squeeze(spec, 0)

    n_bins, n_frames = spec.size(0), spec.size(1)
    spec_lengths = torch.LongTensor([n_frames])

    # Size the buffer from the spectrogram itself instead of a hard-coded 513,
    # which was only correct for filter_length=1024.
    spec_padded = torch.zeros(1, n_bins, n_frames, dtype=torch.float32)
    spec_padded[0, :, :n_frames] = spec

    return spec_padded.cuda(), spec_lengths.cuda()


In [5]:
import pyaudio
import wave
CHUNK = 1024  # frames per buffer
FORMAT = pyaudio.paInt16  # sample format (16-bit integer)
CHANNELS = 1  # mono
RATE = 22050  # sampling rate
def record_audio(wave_out_path, record_second):
    """ 录音功能 """
    p = pyaudio.PyAudio()  # 实例化对象
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)  # 打开流，传入响应参数
    wf = wave.open(wave_out_path, 'wb')  # 打开 wav 文件。
    wf.setnchannels(CHANNELS)  # 声道设置
    wf.setsampwidth(p.get_sample_size(FORMAT))  # 采样位数设置
    wf.setframerate(RATE)  # 采样频率设置

    for _ in range(0, int(RATE * record_second / CHUNK)):
        data = stream.read(CHUNK)
        wf.writeframes(data)  # 写入数据
    stream.stop_stream()  # 关闭流
    stream.close()
    p.terminate()
    wf.close()
    !python -m spleeter separate ./src/myrecord.wav -p spleeter:2stems -o ./src/

# Record 5 seconds to ./src/myrecord.wav; record_audio then runs spleeter on the recording.
record_audio('./src/myrecord.wav', 5)

INFO:spleeter:File src\myrecord/vocals.wav written succesfully
INFO:spleeter:File src\myrecord/accompaniment.wav written succesfully


In [9]:
import inference
audio_path = './src/audio/5.05.wav'

# Play the reference clip as-is.
print('original')
ipd.display(ipd.Audio(audio_path, rate=hps.data.sampling_rate))

# Model calls run with gradient tracking disabled.
with torch.no_grad():
    # Text-to-speech for the given sentence with the 'Hutao' speaker.
    print('TTS')
    inference.gen_speech('刀剑抱业，名工怀宝？', 'Hutao', net_g, hps)

    # Speech-to-speech on the reference clip; the two names are presumably the
    # source and target speakers — confirm against inference.gen_speech_sts.
    print('converted')
    inference.gen_speech_sts(audio_path, 'Ayaka', 'Hutao', net_g, hps)

original


TTS


converted
