In [45]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv1D, Conv2D, Flatten, Dropout, MaxPooling2D, BatchNormalization

import librosa
import math

%matplotlib inline

### Helper functions

In [46]:
# Transform a spectra to image like data form with dim 20x513
def imageize_spec(X):
    raw = []
    for t in range(len(X)-19):
        raw.append(X[t: t+20])
    X_abs_2d = np.array(raw).reshape(-1, 20, 513, 1)
    return X_abs_2d
    

In [47]:
# Calculate SNR 
def calculateSNR(st, st_h):
    st_sum = np.sum(np.abs(st))
    diff_sum = np.sum(np.abs(st-st_h))
#     print(st_sum, diff_sum)
    return 10*math.log(st_sum**2/diff_sum**2)

In [48]:
# Load training data 
s, sr_s=librosa.load('data/train_clean_male.wav', sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr_x=librosa.load('data/train_dirty_male.wav', sr=None)
X=librosa.stft(sn, n_fft=1024, hop_length=512)
print(S.shape)
print(X.shape)
S_abs = np.abs(S)
X_abs = np.abs(X)

X_train = X_abs.T[:].reshape(-1, 513, 1)
y_train = S_abs.T[:]


(513, 2459)
(513, 2459)


In [98]:
# https://towardsdatascience.com/building-a-convolutional-neural-network-cnn-in-keras-329fbbadc5f5
model_path = './keras_1.h5'

#create model
model1 = Sequential()

try:
    model1 = load_model(model_path)
except:
    #add model layers
    model1.add(Conv1D(128, kernel_size=3, activation='relu', input_shape=(513,1)))
    model1.add(Conv1D(64, kernel_size=3, activation='relu'))
    model1.add(Conv1D(32, kernel_size=3, activation='relu'))
    model1.add(Flatten())
    model1.add(BatchNormalization())
    model1.add(Dense(512, activation='relu'))
    model1.add(BatchNormalization())
    model1.add(Dropout(0.2))    
    model1.add(Dense(512, activation='relu'))
    model1.add(Dropout(0.2))
    model1.add(Dense(513, activation='relu'))

    #compile model using accuracy to measure model performance
    model1.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
    
    


In [106]:
model1.fit(X_train, y_train, epochs=10)
model1.save(model_path)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [100]:
def denoise_sound_conv1(input_file_name, output_file_name):
    sn, sr=librosa.load(input_file_name, sr=None)
    testX=librosa.stft(sn, n_fft=1024, hop_length=512)
    testX_abs = np.abs(testX)
    S_test_abs = model1.predict(testX_abs.T.reshape(-1, 513, 1)).T.reshape(513, -1)
    ratio = (testX / testX_abs)
    Sh = np.multiply(ratio, S_test_abs)
    librosa.output.write_wav(output_file_name, librosa.istft(Sh, hop_length=512), sr)
    

In [108]:
denoise_sound_conv1('data/test_x_01.wav', 'recover_01_d1.wav')
denoise_sound_conv1('data/test_x_02.wav', 'recover_02_d1.wav')

In [107]:
# Calculate SNR by training data
S_test_abs = model1.predict(X_train).T.reshape(513, -1)
ratio = (X / X_abs)
s_t = np.multiply(ratio, S_test_abs)
print(calculateSNR(S, s_t))

8.221322455695503


## 2D CNN

In [68]:
def padding_frames(X):
    testX_pad = X
    # padding
    for i in range(19):
        testX_pad = np.hstack((testX_pad, np.zeros((513, 1))))
    return testX_pad


In [69]:
# "imageize" data for 2D CNN
S_abs_T = S_abs.T
X_abs_T = np.abs(padding_frames(X)).T
S_abs_2d = S_abs_T
X_abs_2d = imageize_spec(X_abs_T)

print(X_abs_2d.shape)
print(S_abs_T.shape)


(2459, 20, 513, 1)
(2459, 513)


In [109]:
model_path = './keras_2.h5'

#create model
model2 = Sequential()

try:
    model2 = load_model(model_path)
except:
    #add model layers
    model2.add(Conv2D(16, (3, 3), activation='relu', input_shape=(20, 513,1)))
    model2.add(Conv2D(32, (3, 3), activation='relu'))
    model2.add(MaxPooling2D(pool_size=(2, 2)))
    model2.add(Conv2D(32, (3, 3), activation='relu'))
    model2.add(Conv2D(64, (3, 3), activation='relu'))
    model2.add(Flatten())
    model2.add(BatchNormalization())
    model2.add(Dense(512, activation='relu'))
    model2.add(BatchNormalization())
    model2.add(Dense(512, activation='relu'))
    model2.add(Dropout(0.2))
    model2.add(Dense(513, activation='relu'))

    #compile model using accuracy to measure model performance
    model2.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
    

In [110]:
# Train 2D CNN model
model2.fit(X_abs_2d, S_abs_T, epochs=20, batch_size=32)
model2.save(model_path)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [72]:
def denoise_sound_conv2(input_file_name, output_file_name):
    sn, sr=librosa.load(input_file_name, sr=None)
    testX=librosa.stft(sn, n_fft=1024, hop_length=512)
    testX_abs = np.abs(testX)
    testX_abs_pad = np.abs(padding_frames(testX))
    X_abs_2d = imageize_spec(testX_abs_pad.T)
    S_test_abs = model2.predict(X_abs_2d).T.reshape(513, -1)
    ratio = (testX / testX_abs)
    Sh = np.multiply(ratio, S_test_abs)
    librosa.output.write_wav(output_file_name, librosa.istft(Sh, hop_length=512), sr)
    

In [113]:
denoise_sound_conv2('data/test_x_01.wav', 'recover_01_d2.wav')
denoise_sound_conv2('data/test_x_02.wav', 'recover_02_d2.wav')

In [111]:
# Calculate SNR by training data
testX_abs = X_abs
testX_pad = X_abs
# padding
for i in range(19):
    testX_pad = np.hstack((testX_pad, np.zeros((513, 1))))
testX_abs_pad = np.abs(testX_pad)

X_abs_2d = imageize_spec(testX_abs_pad.T)
S_test_abs = model2.predict(X_abs_2d).T.reshape(513, -1)

ratio = (X / X_abs)
s_t = np.multiply(ratio, S_test_abs)



In [112]:
# Calculate SNR
print(calculateSNR(S, s_t))

6.147324825076792


## Result

I found the training time of 1D CNN is faster than 2D CNN, since I can fit all the data into one epoch to train the model. 

In addition, even the SNR of 2D CNN can be as good as the 1D CNN, I still feel the quality of 1D CNN is much better than 2D CNN based on my subjective listening test.