In [38]:
import torch
import torch.utils.data as utils
from librosa.core import istft,load,stft
from librosa.output import write_wav

from IPython.display import display,Audio
from tqdm.notebook import tqdm
import glob
import numpy as np
import os
import random
import re

import model as mm
import utils as ut
from parameter import Parameter

In [39]:
# Every tunable value used by this notebook is read from the shared Parameter object.
p = Parameter()

# Locations on disk
datasets_save_dir = p.datasets_path
model_dir_path = p.model_path
clean_speech_dir = p.target_path
noise_dir = p.noise_path

# Dataset / model settings
split = p.datasets_split  # test/val/train
batch_size = p.batch_size
num_layer = p.num_layer

# Audio settings: sampling rate and the number of samples handled per chunk
sample_rate = p.sample_rate
audio_len = p.audio_len

# STFT settings passed to librosa's stft/istft further down
fft_size = p.fft_size
hop_length = p.hop_length

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("CUDA is available:", device.type == "cuda")

CUDA is available: True


In [40]:
def length_fitting(data,audio_len):
    """Return ``data`` trimmed or cyclically repeated to ``audio_len`` items.

    Args:
        data: Sequence of samples (indexed along axis 0).
        audio_len: Required number of items along axis 0.

    Returns:
        The first ``audio_len`` items when ``data`` is longer, ``data`` itself
        when it already has the requested length, and otherwise ``data``
        repeated end-to-end and cut to ``audio_len`` items.

    Raises:
        ValueError: If ``data`` is empty but ``audio_len`` is positive; an
            empty signal can never be repeated up to the requested length
            (the previous doubling loop would spin forever in that case).
    """
    current_len = len(data)
    if current_len > audio_len:
        return data[:audio_len]
    if current_len == audio_len:
        return data
    if current_len == 0:
        raise ValueError(
            "cannot extend empty data to audio_len=%d" % audio_len
        )

    # Ceiling division: the smallest number of copies that covers audio_len.
    repeats = -(-audio_len // current_len)
    return np.concatenate([data] * repeats, 0)[:audio_len]

# Gather the clean-speech and noise entries (presumably lists of file paths
# returned by ut.take_path -- they are later handed to librosa's load()).
c_files, n_files = ut.take_path(clean_speech_dir), ut.take_path(noise_dir)

# Shuffle both collections in place so the files picked later vary per run.
random.shuffle(c_files)
random.shuffle(n_files)

# Keep the sizes of both collections around.
num_c_files, num_n_files = len(c_files), len(n_files)

In [41]:
# Checkpoint file name; it is appended to model_dir_path when the weights are
# loaded, and the inference cell below also reads the layer count from the
# digits embedded in this name.
model_path = 'model_layer5_20201219_163036_Epoch20.pt'

In [44]:
# Inference demo: mix clean speech with noise, estimate a mask with the trained
# network chunk by chunk, and play back the noisy and the masked audio.
num_examples = 2  # number of (speech, noise) pairs to audition

# The layer count is encoded in the checkpoint name as "layer<N>". Parse the
# whole number (not only its first digit) and keep the configured value when
# the name does not follow that pattern.
layer_match = re.search(r"layer(\d+)", model_path)
if layer_match:
    num_layer = int(layer_match.group(1))

# The spectrogram shape depends only on audio_len and the STFT settings, so a
# silent probe signal is enough to size the network.
f, t = stft(
    np.zeros(audio_len, dtype=np.float32), n_fft=fft_size, hop_length=hop_length
).shape

# Build and load the model once instead of once per file. map_location keeps a
# checkpoint that was saved on a GPU loadable on a CPU-only machine; inference
# runs on the CPU, as it did before.
model = mm.Net(t, f, num_layer)
model.load_state_dict(
    torch.load(os.path.join(model_dir_path, model_path), map_location="cpu")
)
model.eval()

# zip() over slices stops early instead of raising IndexError when fewer than
# num_examples files are available.
for c_path, n_path in zip(c_files[:num_examples], n_files[:num_examples]):

    # librosa resamples on load only when the native rate differs.
    c_data, _ = load(c_path, sr=sample_rate)
    n_data, _ = load(n_path, sr=sample_rate)

    n_data = length_fitting(n_data, audio_len)

    if len(c_data) < audio_len:
        print("音声データが短すぎます。")
        continue

    # The same noise excerpt is added to every chunk, so transform it once.
    n_p_stft = stft(n_data[:audio_len], n_fft=fft_size, hop_length=hop_length)

    # Ceiling division: no extra all-noise chunk when the speech length is an
    # exact multiple of audio_len.
    num_chunks = -(-len(c_data) // audio_len)

    enhanced_chunks = []
    noisy_chunks = []

    for chunk_idx in tqdm(range(num_chunks), leave=False, desc='[AUDIO Process..]'):

        c_p = c_data[chunk_idx * audio_len : (chunk_idx + 1) * audio_len]
        if len(c_p) < audio_len:
            # Zero-pad the final, shorter chunk up to the fixed chunk length.
            c_p = np.pad(c_p, (0, audio_len - len(c_p)))

        c_p_stft = stft(c_p, n_fft=fft_size, hop_length=hop_length)
        addnoise_stft = c_p_stft + n_p_stft

        # Run the magnitude spectrogram through the model. Previously an
        # all-zero tensor was fed here, so the mask ignored the audio.
        addnoise_tensor = torch.from_numpy(np.abs(addnoise_stft).astype(np.float32))

        # Keep the (batch_size, f, t) input shape the model was called with
        # before by repeating the spectrogram in every batch slot.
        # NOTE(review): assumes the model output is (batch, f, t) -- confirm.
        batch = addnoise_tensor.unsqueeze(0).repeat(batch_size, 1, 1)

        with torch.no_grad():
            mask = model(batch)[batch_size // 3, :, :]
        mask = mask.cpu().numpy()

        enhanced_chunks.append(istft(addnoise_stft * mask, hop_length=hop_length))
        noisy_chunks.append(istft(addnoise_stft, hop_length=hop_length))

    # Join the chunks once at the end instead of re-copying on every step.
    output = np.concatenate(enhanced_chunks)
    _output = np.concatenate(noisy_chunks)

    # Noisy mixture first, then the masked result.
    display(Audio(_output, rate=sample_rate))
    display(Audio(output, rate=sample_rate))

HBox(children=(HTML(value='[AUDIO Process..]'), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='[AUDIO Process..]'), FloatProgress(value=0.0, max=2.0), HTML(value='')))