In [1]:
import librosa

from IPython.display import display, Audio
import glob
from tqdm.notebook import tqdm
import random
import math

# !pip install pyroomacoustics
import pyroomacoustics as pa
import pandas as pd

# from scipy.signal import stft,istft,convolve
import scipy as sp
import numpy as np

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


# Datasets
音声　CMU Arctic Corpus

```
# CMU Arctic Corpusのダウンロード(初回のみ)
pa.datasets.CMUArcticCorpus(basedir="./CMU_ARCTIC" ,download=True)

c_files = "/disk107/Datasets/CMU_ARCTIC/*/*wav/*.wav"
```
雑音 Ernie, UrbanSound8K
```
# n_files1 = glob.glob(noise1+"/disk107/Datasets/noise/ernieNoise/trim_wavs/*wav")#ernie
# n_files2 = glob.glob(noise2+"/disk107/Datasets/UrbanSound8K/audio/fold*/*.wav")#urban
```

# データセットの作成

In [2]:
c_files = glob.glob("/disk107/Datasets/CMU_ARCTIC/*/*wav/*.wav")
n_files = glob.glob("/disk107/Datasets/UrbanSound8K/audio/fold*/*.wav")

print("clear voice is",len(c_files),"files")
print("Urban Noise is",len(n_files),"files")

audio_len = 2**15
usedata_num = 15600
sample_rate = 16000

#stft
hann_window = 512
DFT_size = 512
time_shift = 256

clear voice is 15603 files
Urban Noise is 8732 files


In [3]:
def stft(x, win, step):
    l = len(x) # 入力信号の長さ
    N = len(win) # 窓幅、つまり切り出す幅
    M = int(np.ceil(float(l - N + step) / step)) # スペクトログラムの時間フレーム数
    
    new_x = np.zeros(N + ((M - 1) * step),dtype="float64")
    new_x[: l] = x # 信号をいい感じの長さにする
    
    X = np.zeros([M, N], dtype = "complex64") # スペクトログラムの初期化(複素数型)
    for m in range(M):
        start = step * m
        X[m, :] = sp.fft.fft(new_x[start : start + N] * win)
    return X

def istft(X, win, step):
    M, N = X.shape
    assert (len(win) == N), "FFT length and window length are different."

    l = (M - 1) * step + N
    x = np.zeros(l,dtype="float64")
    wsum = np.zeros(l,dtype="float64")
    for m in range(M):
        start = step * m
        ### 滑らかな接続
        x[start : start + N] = x[start : start + N] + sp.fft.ifft(X[m, :]).real * win
        wsum[start : start + N] += win ** 2 
    pos = (wsum != 0)
    x_pre = x.copy()
    ### 窓分のスケール合わせ
    x[pos] /= wsum[pos]
    return x

In [4]:
def makePSM(addnoise,clean):
    X, S = addnoise, clean #観測信号、所望信号stft型
    A = (np.abs(S) / np.abs(X)) * np.cos((np.angle(S)-np.angle(X)))
    B = np.maximum(A,0)
    G = np.minimum(B,1)
    return G

In [5]:
def addnoise(c_data,n_data,SNR = 0.1):
    """
    c_data:音声データ
    n_data:ノイズデータ
    
    クリーンな音声データに任意の雑音をつけます。
    
    音声データの長さがノイズデータの長さより長い時→ノイズデータをリピートして音声データの長さに合わせる
    音声データの長さがノイズデータの長さより短い時→ノイズデータを音声な音声データの長さに合わせて切り捨て
    """

    c_data_s = c_data * SNR #ernieノイズの音量が小さいので調整してます。

    if len(c_data) == len(n_data):
        noise_data = c_data_s + n_data

    elif len(c_data) > len(n_data):

        q, mod = divmod(len(c_data), len(n_data))

        if q == 1:
            new = np.append(n_data,n_data[:mod])

        else:
            new = np.append(n_data,n_data)
            for i in range(q-2):
                new = np.append(new,n_data)
            new = np.append(new,n_data[:mod])

        noise_data = c_data_s + new

    else:
        noise_data = c_data_s + n_data[:len(c_data)]
        
    return noise_data

In [None]:
# data = np.array([])
# label = np.array([])

# fftLen = 512 # とりあえず
# win = np.hamming(fftLen) # ハミング窓
# step = int(fftLen / 2)

####################
t = usedata_num

import time

start = time.time()

data = np.zeros([t,127,512])
label = np.zeros([t,127,512])

elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

fftLen = 512 # とりあえず
win = np.hamming(fftLen) # ハミング窓
step = int(fftLen / 2)


for i in tqdm(range(t)):
    # audioデータを作る
    c_data,_ = librosa.load(str(c_files[i]),sr=sample_rate)
    if len(c_data)<audio_len:
        _c_data = np.zeros([audio_len])
        _c_data[:len(c_data)] = c_data
        c_data = _c_data

    else:
        c_data = c_data[:audio_len]

    #check
#     if len(c_data) != audio_len:
#         print("[CHECK] LENGTH ERROR")

    """
    雑音データが足りないので、雑音はランダムに選択し、SNRも0.7-1.2の間からランダムに選択してつけてる
    """
    n_data,_ = librosa.load(n_files[random.randint(0,len(n_files)-1)],sr=sample_rate)
    c_n_data = addnoise(c_data,n_data,SNR = random.uniform(0.7, 1.2))

    # stft
    stft_data = stft(c_n_data,win,step)
    stft_label= stft(c_data,win,step)
    stft_label_psm = makePSM(stft_data,stft_label)

    # append
    data[i] = stft_data
    label[i] = stft_label_psm

    _, _check = divmod(i, 500)

    if _check == 0:#500回ごとに保存
        print("save")
        start = time.time()
        np.savez('test', data, label)
        print ("save_time:{0}".format(elapsed_time) + "[sec]")

elapsed_time:0.0003070831298828125[sec]


HBox(children=(FloatProgress(value=0.0, max=15600.0), HTML(value='')))



save
save_time:0.0003070831298828125[sec]


  This is separate from the ipykernel package so we can avoid doing imports until


save


In [None]:
np.savez('datasets', data, label)

In [75]:
print(stft_data.shape,stft_label.shape,stft_label_psm.shape)

(array([-3.9672852e-04,  9.1552734e-05,  0.0000000e+00, ...,
         2.1362305e-04,  4.8828125e-04,  1.0070801e-03], dtype=float32),
 16000)

In [None]:
print(stft_data.shape)
print(stft_label_psm.shape)

In [31]:
c_data = np.load("dev104/yamamoto/clean_dataset.npy",allow_pickle=True)
n_data = np.load("dev104/yamamoto/noise_dataset.npy",allow_pickle=True)
c_data ,n_data = c_data[:15600], n_data[:15600]
#

In [51]:
display(Audio(c_data[25][:2**15] ,rate=16000))
display(Audio(n_data[25][:2**15] ,rate=16000))

In [57]:
def data_adjuster(data):
    ret = np.zeros([15600,2**15])
    z = np.zeros(2**15)
    for i in tqdm(range(15600)):
        
        ret[i,:] = data[i][:2**15]
    return ret

c_data_array = data_adjuster(c_data)
n_data_array = data_adjuster(n_data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15600.0), HTML(value='')))




ValueError: could not broadcast input array from shape (32319) into shape (32768)

In [42]:
transform = transforms.Compose([
    transforms.ToTensor(),# numpy -> tensor
    transforms.Lambda(lambda x: x.view(-1))]) #一次元配列にキャスト

class MakeDataset(torch.utils.data.Dataset):

    def __init__(self, data, label, transform=None):
        self.transform = transform
        self.data = data
        self.data_num = len(data)
        self.label = label

    def __len__(self):
        return self.data_num

    def __getitem__(self, idx):
        if self.transform:
          out_data = self.transform(self.data)[0][idx]
          out_label = self.label[idx]
        else:
          out_data = self.data[idx]
          out_label =  self.label[idx]

        return out_data, out_label

In [23]:
dataset = MakeDataset(data, label, transform)

In [25]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=False)

In [None]:
# clean：CMU Arctic Copus内のwavファイル
c_files = glob.glob("CMU_ARCTIC/*/wav/*.wav")
random.shuffle(c_files)

# noise
n_files1 = glob.glob(noise1+"/disk107/Datasets/noise/ernieNoise/trim_wavs/*wav")#ernie
n_files2 = glob.glob(noise2+"/disk107/Datasets/UrbanSound8K/audio/fold*/*.wav")#urban
n_files = np.append(n_files1,n_files2)#合体
random.shuffle(n_files)

clean = []
noise = []

for n in tqdm(len(c_files)):
    # noise
    c_data, fs = librosa.load(c_files[n],sr=16000)
    n_data, fs = librosa.load(n_files[n],sr=16000)
    
    noise_data = addnoise(c_data,n_data,SNR = 0.7)
    
    clean.append(c_data)
    noise.append(noise_data)

In [None]:
np.save('dataset', clean)
np.save('dataset', noise)

In [None]:
import glob
from tqdm import tqdm
# from tqdm import tqdm_notebook as tqdm
import librosa
import soundfile as sf
import numpy as np
import os
import random

# Input data
wave_dir = "cleardata"
noise_dir= "UrbanSound8K/audio"
# Output data
outdir_cleardata = "label"
outdir_addnoisedata = "data"

if(os.path.isdir(outdir_cleardata)==False):
    os.mkdir(outdir_cleardata)
    
if(os.path.isdir(outdir_addnoisedata)==False):
    os.mkdir(outdir_addnoisedata)


def addnoise(c_data,n_data,SNR = 0.1):
    """
    c_data:音声データ
    n_data:ノイズデータ
    
    クリーンな音声データに任意の雑音をつけます。
    
    音声データの長さがノイズデータの長さより長い時→ノイズデータをリピートして音声データの長さに合わせる
    音声データの長さがノイズデータの長さより短い時→ノイズデータを音声な音声データの長さに合わせて切り捨て
    """

    c_data_s = c_data * SNR #ernieノイズの音量が小さいので調整してます。

    if len(c_data) == len(n_data):
        noise_data = c_data_s + n_data

    elif len(c_data) > len(n_data):

        q, mod = divmod(len(c_data), len(n_data))

        if q == 1:
            new = np.append(n_data,n_data[:mod])

        else:
            new = np.append(n_data,n_data)
            for i in range(q-2):
                new = np.append(new,n_data)
            new = np.append(new,n_data[:mod])

        noise_data = c_data_s + new

    else:
        noise_data = c_data_s + n_data[:len(c_data)]
        
    return noise_data

def main():

    # Lord data
    
    print("Loading Cleandata")
    clean_wave_files = glob.glob(wave_dir+"/*.wav")
    n_sources=len(clean_wave_files)
    clean_data=[]
    n_samples=0
    
    for wave_file in tqdm(clean_wave_files):
        data, rate = librosa.load(wave_file,sr=None)
        if n_samples<len(data):
            n_samples=len(data)
        clean_data.append(data)
        
    print("Loading Noisedata")
    noise_files = glob.glob(noise_dir+"/*/*.wav")
    noise_data=[]
    for f in tqdm(noise_files):
        data, rate = librosa.load(f,sr=rate)
        noise_data.append(data)
        
    print(n_sources," wave files")
    print(len(noise_files)," noise files")
    
    # zero pudding
    
    output = np.zeros([n_sources,n_samples])
    for i in tqdm(range(n_sources)):
        output[i,:len(clean_data[i])] = clean_data[i]
    
    # write wave file
    """
    雑音データが足りないので、雑音はランダムに選択し、SNRも0.7-1.2の間からランダムに選択してつけてる
    """
    for i in tqdm(range(n_sources)):
        sf.write("label/"+str(i)+"clean.wav", output[i], rate, subtype="PCM_16")
        
        noise_num = random.randint(0,len(noise_files)-1)
        noise_select = noise_data[noise_num]
        addnoisedata = addnoise(output[i],noise_select,SNR = random.uniform(0.7, 1.2))
        
        sf.write("data/"+str(i)+"noise.wav", addnoisedata, rate, subtype="PCM_16")
        
        
main()

# Pytorchデータセット

Pytorchで使える形のデータセットに変換する

In [3]:
!pip install torchvision

Defaulting to user installation because normal site-packages is not writeable
Collecting torchvision
  Downloading torchvision-0.8.1-cp37-cp37m-manylinux1_x86_64.whl (12.7 MB)
[K     |████████████████████████████████| 12.7 MB 1.9 MB/s eta 0:00:01
Collecting torch==1.7.0
  Downloading torch-1.7.0-cp37-cp37m-manylinux1_x86_64.whl (776.7 MB)
[K     |████████████████████████████████| 776.7 MB 6.1 kB/s  eta 0:00:013    |████████                        | 196.6 MB 46.6 MB/s eta 0:00:13     |████████▎                       | 200.0 MB 46.6 MB/s eta 0:00:13     |████████▉                       | 213.6 MB 46.6 MB/s eta 0:00:13     |████████████▌                   | 303.7 MB 100.5 MB/s eta 0:00:05     |██████████████▍                 | 349.3 MB 48.9 MB/s eta 0:00:09     |████████████████▍               | 398.8 MB 83.0 MB/s eta 0:00:05
[?25hCollecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Installing collected packages: dataclasses, torch, torchvision
  Attempting uni

In [20]:
import torch
from torchvision import transforms
from torch.utils.data import DataLoader, random_split
import numpy as np

In [18]:
data = np.array([np.random.rand(4, 4) ,np.random.rand(4, 4) ,np.random.rand(4, 4) ,np.random.rand(4, 4)] )
label = np.array([np.random.rand() ,np.random.rand() ,np.random.rand() ,np.random.rand() ] )

print(data)
print(label)

[[[8.24178103e-01 8.96925028e-01 8.53524229e-01 3.15020677e-01]
  [6.57282795e-01 9.29953348e-01 2.74329969e-01 8.49399759e-02]
  [5.85645874e-01 3.62046550e-01 6.85960314e-02 5.80835625e-02]
  [5.60042848e-01 7.83973694e-01 1.71624019e-03 3.10017381e-01]]

 [[6.00260107e-01 3.28815850e-01 8.03307454e-01 7.95665949e-01]
  [6.26823327e-01 9.46720606e-01 7.33376166e-01 2.56167377e-01]
  [2.51235270e-04 4.43188307e-01 8.63620285e-01 1.84617557e-01]
  [2.78681082e-01 7.22243071e-01 8.77811449e-01 4.03489815e-01]]

 [[5.57133119e-01 9.08576045e-01 6.01379093e-01 1.13659158e-01]
  [3.96816932e-01 7.72364848e-01 1.27760879e-01 8.81234321e-01]
  [9.92872184e-01 1.73998834e-02 4.51251414e-01 9.61393823e-01]
  [4.05457965e-01 6.02573089e-01 7.28263335e-01 9.96481768e-02]]

 [[7.10517238e-01 1.35029510e-01 3.63578760e-01 6.29193044e-01]
  [6.43714010e-01 4.93121733e-01 1.96270146e-01 2.06259677e-01]
  [4.69415857e-01 1.03702126e-01 5.80036433e-01 2.79308845e-01]
  [7.63783645e-01 2.25412215e-01 2

2