## Speech Denoising Using Deep Learning 

### A fully connected network with 2 hidden layers and 1000 hidden units each is used for speech denoising.
### For the two hidden layers I have used tanh activation functions. (Referred from:- http://paris.cs.illinois.edu/pubs/liu-interspeech2014.pdf)
### Apart from this, I have used Xavier initialization for weights, batch normalization and Adam Optimizer.
### For the last layer since we are required to output non negative, I have opted for relu activation function.
### Applying 20% dropouts during training helped improve the recovered test signal.

In [1]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import librosa
%matplotlib notebook

In [2]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="6"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.33

In [3]:
tf.set_random_seed(123)

In [4]:
# Import train input and output data
s, sr=librosa.load('train_clean_male.wav', sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load('train_dirty_male.wav', sr=None)
X=librosa.stft(sn, n_fft=1024, hop_length=512)

In [5]:
# Transposeed the training data to get the data samples in rows and features in columns
# Taking the absolute values of the STFT data
abs_S = np.abs(S.T)
abs_X = np.abs(X.T)

In [6]:
# Import test data
s1, sr=librosa.load('test_x_01.wav', sr=None)
S1 =librosa.stft(s1, n_fft=1024, hop_length=512)
s2, sr=librosa.load('test_x_02.wav', sr=None)
S2 =librosa.stft(s2, n_fft=1024, hop_length=512)
print(S1.shape, S2.shape)

(513, 142) (513, 380)


In [7]:
test1 = np.abs(S1.T)
test2 = np.abs(S2.T)

In [8]:
test2.shape

(380, 513)

In [9]:
# Define fully connected network structure
num_hidden_1 = 1000 # hidden layer 1
num_hidden_2 = 1000 # hidden layer 2
num_features = 513 # input features
num_output = 513 # number of outputs

In [11]:
x = tf.placeholder('float', shape=(None,num_features))
y = tf.placeholder('float',shape=(None,num_output))
keep_probability = tf.placeholder("float")  # probability for dropouts

In [12]:
def fully_connected_net(data, keep_probability):
    # Weights and bias initialization
    weights = {
        'w1': tf.Variable(tf.random_normal(shape=[num_features, num_hidden_1],stddev= tf.sqrt(2/(num_features+num_hidden_1)))),
        'w2': tf.Variable(tf.random_normal(shape=[num_hidden_1, num_hidden_2],stddev= tf.sqrt(2/(num_hidden_1+num_hidden_2)))),
        'wout': tf.Variable(tf.random_normal(shape=[num_hidden_2, num_output],stddev = tf.sqrt(2/(num_hidden_2 + num_output))))
    }
    biases = {
        'bout': tf.Variable(tf.random_normal([num_output]))
    }
    # Layer 1 with batch normalization
    z1 = data@weights['w1']
    batch_mean1, batch_var1 = tf.nn.moments(z1, [0])
    z1hat = (z1 - batch_mean1) / tf.sqrt(batch_var1+epsilon)
    # Create two new parameters, scale and beta (shift)
    scale1 = tf.Variable(tf.ones([num_hidden_1]))
    beta1 = tf.Variable(tf.zeros([num_hidden_1]))
    z1_hat = tf.nn.batch_normalization(z1, batch_mean1, batch_var1,beta1, scale1,epsilon)
    l1 = tf.nn.dropout(tf.nn.tanh(z1_hat), keep_probability)
    
    # Layer 2 with batch normalization
    z2 = l1@weights['w2']
    batch_mean2, batch_var2 = tf.nn.moments(z2,[0])
    scale2 = tf.Variable(tf.ones([num_hidden_2]))
    beta2 = tf.Variable(tf.zeros([num_hidden_2]))
    z2_hat = tf.nn.batch_normalization(z2,batch_mean2,batch_var2,beta2,scale2,epsilon)
    l2 = tf.nn.dropout(tf.nn.tanh(z2_hat), keep_probability)
    
    # Output Layer
    output = tf.nn.relu(tf.add(tf.matmul(l2, weights['wout']) , biases['bout']))
    return output


In [13]:
batch_size=250
# Small epsilon value for the batch normalization
epsilon = 1e-3

In [14]:
def train_network(x):
    prediction = fully_connected_net(x, keep_probability)
    cost = tf.losses.mean_squared_error(y,prediction, weights=1.0)
    train_step = tf.train.AdamOptimizer(learning_rate= 0.001).minimize(cost)
    saver = tf.train.Saver()
    #feedforward and backpropogagtion
    epochs = 2500
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            epoch_loss = 0
            start_index = 0
            #X_shuffled, S_shuffled = shuffle(abs_X,abs_S)
            for _ in range(int(abs_X.shape[0]/batch_size)):
                end_index = start_index +batch_size
                if end_index > abs_X.shape[0]:
                    end_index = abs_X.shape[0]
                batch_x = abs_X[start_index:end_index]
                batch_y = abs_S[start_index: end_index]
                start_index = end_index + 1
                _, err = sess.run([train_step, cost], feed_dict={x: batch_x, y: batch_y, keep_probability: 0.8})
                epoch_loss += err
            for i in range(epoch % 20 == 0):
                print('Epoch ',epoch, ' completed out of ',epochs, 'loss: ', epoch_loss)
        print('Epoch ',epoch, ' completed out of ',epochs, 'loss: ', epoch_loss)
        saver.save(sess, 'model2/')
       
        test1_pred = sess.run(prediction, feed_dict = {x: test1, keep_probability: 1})
        test2_pred = sess.run(prediction, feed_dict = {x: test2, keep_probability: 1})
        
        return test1_pred, test2_pred

In [15]:
test1_pred, test2_pred = train_network(x)

Epoch  0  completed out of  2500 loss:  5.590878009796143
Epoch  20  completed out of  2500 loss:  0.20284949243068695
Epoch  40  completed out of  2500 loss:  0.15414383076131344
Epoch  60  completed out of  2500 loss:  0.1339481808245182
Epoch  80  completed out of  2500 loss:  0.11833239905536175
Epoch  100  completed out of  2500 loss:  0.11120777484029531
Epoch  120  completed out of  2500 loss:  0.10340986307710409
Epoch  140  completed out of  2500 loss:  0.10120680648833513
Epoch  160  completed out of  2500 loss:  0.09567071683704853
Epoch  180  completed out of  2500 loss:  0.09085197700187564
Epoch  200  completed out of  2500 loss:  0.08633249159902334
Epoch  220  completed out of  2500 loss:  0.08616419322788715
Epoch  240  completed out of  2500 loss:  0.08484551962465048
Epoch  260  completed out of  2500 loss:  0.08111290307715535
Epoch  280  completed out of  2500 loss:  0.08034995244815946
Epoch  300  completed out of  2500 loss:  0.07909437408670783
Epoch  320  compl

In [16]:
out1 = (S1/test1.T)* test1_pred.T
out2 = (S2/test2.T)* test2_pred.T

In [17]:
test1_recons = librosa.istft(out1, win_length= 1024, hop_length=512)
test2_recons = librosa.istft(out2, win_length= 1024, hop_length=512)

In [18]:
librosa.output.write_wav('test_s_01_recons.wav', test1_recons, sr)
librosa.output.write_wav('test_s_02_recons.wav', test2_recons, sr)