In [1]:
import librosa

from IPython.display import display, Audio
import glob
from tqdm.notebook import tqdm
import random
import math

# !pip install pyroomacoustics
import pyroomacoustics as pa
import pandas as pd

# from scipy.signal import stft,istft,convolve
import scipy as sp
import numpy as np

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


# Datasets
音声　CMU Arctic Corpus

```
# CMU Arctic Corpusのダウンロード(初回のみ)
pa.datasets.CMUArcticCorpus(basedir="./CMU_ARCTIC" ,download=True)

c_files = "/disk107/Datasets/CMU_ARCTIC/*/*wav/*.wav"
```
雑音 Ernie, UrbanSound8K
```
# n_files1 = glob.glob(noise1+"/disk107/Datasets/noise/ernieNoise/trim_wavs/*wav")#ernie
# n_files2 = glob.glob(noise2+"/disk107/Datasets/UrbanSound8K/audio/fold*/*.wav")#urban
```

# データセットの作成

In [2]:
c_files = glob.glob("/disk107/Datasets/CMU_ARCTIC/*/*wav/*.wav")
n_files = glob.glob("/disk107/Datasets/UrbanSound8K/audio/fold*/*.wav")

print("clear voice is",len(c_files),"files")
print("Urban Noise is",len(n_files),"files")

audio_len = 2**15
usedata_num = 15600
sample_rate = 16000

#stft
hann_window = 512
DFT_size = 512
time_shift = 256

clear voice is 15603 files
Urban Noise is 8732 files


In [3]:
def stft(x, win, step):
    l = len(x) # 入力信号の長さ
    N = len(win) # 窓幅、つまり切り出す幅
    M = int(np.ceil(float(l - N + step) / step)) # スペクトログラムの時間フレーム数
    
    new_x = np.zeros(N + ((M - 1) * step),dtype="float64")
    new_x[: l] = x # 信号をいい感じの長さにする
    
    X = np.zeros([M, N], dtype = "complex64") # スペクトログラムの初期化(複素数型)
    for m in range(M):
        start = step * m
        X[m, :] = sp.fft.fft(new_x[start : start + N] * win)
    return X

def istft(X, win, step):
    M, N = X.shape
    assert (len(win) == N), "FFT length and window length are different."

    l = (M - 1) * step + N
    x = np.zeros(l,dtype="float64")
    wsum = np.zeros(l,dtype="float64")
    for m in range(M):
        start = step * m
        ### 滑らかな接続
        x[start : start + N] = x[start : start + N] + sp.fft.ifft(X[m, :]).real * win
        wsum[start : start + N] += win ** 2 
    pos = (wsum != 0)
    x_pre = x.copy()
    ### 窓分のスケール合わせ
    x[pos] /= wsum[pos]
    return x

In [4]:
def makePSM(addnoise,clean):
    X, S = addnoise, clean #観測信号、所望信号stft型
    A = (np.abs(S) / np.abs(X)) * np.cos((np.angle(S)-np.angle(X)))
    B = np.maximum(A,0)
    G = np.minimum(B,1)
    return G

In [5]:
def addnoise(c_data,n_data,SNR = 0.1):
    """
    c_data:音声データ
    n_data:ノイズデータ
    
    クリーンな音声データに任意の雑音をつけます。
    
    音声データの長さがノイズデータの長さより長い時→ノイズデータをリピートして音声データの長さに合わせる
    音声データの長さがノイズデータの長さより短い時→ノイズデータを音声な音声データの長さに合わせて切り捨て
    """

    c_data_s = c_data * SNR #ernieノイズの音量が小さいので調整してます。

    if len(c_data) == len(n_data):
        noise_data = c_data_s + n_data

    elif len(c_data) > len(n_data):

        q, mod = divmod(len(c_data), len(n_data))

        if q == 1:
            new = np.append(n_data,n_data[:mod])

        else:
            new = np.append(n_data,n_data)
            for i in range(q-2):
                new = np.append(new,n_data)
            new = np.append(new,n_data[:mod])

        noise_data = c_data_s + new

    else:
        noise_data = c_data_s + n_data[:len(c_data)]
        
    return noise_data

In [None]:
t = usedata_num

import time

start = time.time()

data = np.zeros([t,127,512])
label = np.zeros([t,127,512])

elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

fftLen = 512 # とりあえず
win = np.hamming(fftLen) # ハミング窓
step = int(fftLen / 2)


for i in tqdm(range(t)):
    # audioデータを作る
    c_data,_ = librosa.load(str(c_files[i]),sr=sample_rate)
    if len(c_data)<audio_len:
        _c_data = np.zeros([audio_len])
        _c_data[:len(c_data)] = c_data
        c_data = _c_data

    else:
        c_data = c_data[:audio_len]

    #check
#     if len(c_data) != audio_len:
#         print("[CHECK] LENGTH ERROR")

    """
    雑音データが足りないので、雑音はランダムに選択し、SNRも0.7-1.2の間からランダムに選択してつけてる
    """
    n_data,_ = librosa.load(n_files[random.randint(0,len(n_files)-1)],sr=sample_rate)
    c_n_data = addnoise(c_data,n_data,SNR = random.uniform(0.7, 1.2))

    # stft
    stft_data = stft(c_n_data,win,step)
    stft_label= stft(c_data,win,step)
    stft_label_psm = makePSM(stft_data,stft_label)

    # append
    data[i] = stft_data
    label[i] = stft_label_psm

    _, _check = divmod(i, 500)

    if _check == 0:#500回ごとに保存
        print("save")
        start = time.time()
        np.savez('test', data, label)
        print ("save_time:{0}".format(elapsed_time) + "[sec]")
        
np.savez('datasets', data, label)

elapsed_time:0.0003070831298828125[sec]


HBox(children=(FloatProgress(value=0.0, max=15600.0), HTML(value='')))



save
save_time:0.0003070831298828125[sec]


  This is separate from the ipykernel package so we can avoid doing imports until


save


In [75]:
print(stft_data.shape,stft_label.shape,stft_label_psm.shape)

(array([-3.9672852e-04,  9.1552734e-05,  0.0000000e+00, ...,
         2.1362305e-04,  4.8828125e-04,  1.0070801e-03], dtype=float32),
 16000)