In [3]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import math
import librosa

%matplotlib inline

In [4]:
# Calculate SNR 
def calculateSNR(st, st_h):
    st_sum = np.sum(np.abs(st))
    diff_sum = np.sum(np.abs(st-st_h))
    return 10*math.log(st_sum**2/diff_sum**2)

In [70]:
# Calculate IBM by given source signal and noise
def get_M(signal, noise):
    M = np.zeros_like(signal, dtype=np.int8)
    (ht, wd) = M.shape
    for i in range(ht):
        for j in range(wd):
            if signal[i][j] > noise[i][j]:
                M[i][j] = 1
    return M

In [71]:
# Training Parameters
learning_rate = 0.001
batch_size = 8

# Hyper Parameter
time_steps = 8   #
num_input = 513   # total data input (513 channel)
num_classes = 513 # total classes (513 channel)
num_hidden = 128  # hidden layer num of features
keep_prob = 0.8


is_training = False

In [136]:
tf.reset_default_graph()


In [137]:

with tf.variable_scope("rnn_model", reuse=tf.AUTO_REUSE):
    X = tf.placeholder("float", [None, time_steps, num_input])
    Y = tf.placeholder("float", [None, num_classes])
    weight = tf.Variable(tf.random_normal([num_hidden, num_classes]))
    bias = tf.Variable(tf.random_normal([num_classes]))


In [138]:
def RNN(x):    
    # Define a lstm cell with tensorflow
    cell1 = tf.contrib.rnn.DropoutWrapper(tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(num_hidden), output_keep_prob=keep_prob)
    cell2 = tf.contrib.rnn.DropoutWrapper(tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(num_hidden), output_keep_prob=keep_prob)
    cell3 = tf.contrib.rnn.DropoutWrapper(tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(num_hidden), output_keep_prob=keep_prob)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell1, cell2, cell3])        
    
    outputs, states = tf.nn.dynamic_rnn(cell, x, dtype=tf.float32)    
    ret = tf.matmul(tf.reshape(outputs, (-1, num_hidden)), weight) + bias
    print(ret.shape)
    return ret


In [139]:
# Calculate loss, train_op
with tf.variable_scope("rnn_model", reuse=tf.AUTO_REUSE):
    Y_pred = RNN(X)
    loss = tf.losses.mean_squared_error(labels=Y, predictions=Y_pred)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

(?, 513)


In [150]:
# Add Padding, to 
def fit_RNN_input_dim(data):
    (row, _) = data.shape
    count = 0
    while row % time_steps != 0:
        data = np.vstack((data, np.zeros(num_input)))
        row += 1
        count += 1
    
    return data, count

In [191]:
def process_file_data(i):
    s, _=librosa.load('timit-homework/tr/trs{:04d}.wav'.format(i), sr=None)
    signal = librosa.stft(s, n_fft=1024, hop_length=512)
    s, _=librosa.load('timit-homework/tr/trn{:04d}.wav'.format(i), sr=None)
    noise = librosa.stft(s, n_fft=1024, hop_length=512)
    s, _=librosa.load('timit-homework/tr/trx{:04d}.wav'.format(i), sr=None)
    mixture = librosa.stft(s, n_fft=1024, hop_length=512)
    
    signal_abs = np.abs(signal)
    noise_abs = np.abs(noise)
    mixture_abs = np.abs(mixture)

    signal_T = signal.T
    noise_T = noise.T
    mixture_T = mixture.T
    
    signal_abs_T = signal_abs.T
    noise_abs_T = noise_abs.T
    mixture_abs_T = mixture_abs.T
    
    M = get_M(signal_abs_T, noise_abs_T)
    
    return [mixture_abs_T, M.reshape(-1, num_classes)]

In [194]:

data_to_use = 1200

# Create data stream for training
dataset = tf.data.Dataset.range(data_to_use).map(
    lambda idx: tuple(tf.py_func(process_file_data, [idx], [tf.float32, tf.int8]))).repeat()

batch_data = dataset
iterator = batch_data.make_one_shot_iterator()
next_element = iterator.get_next()


# Initial session
sess=tf.Session()
tf.global_variables_initializer().run(session=sess)


In [195]:
# Training
training_steps = 10000
for i in range(training_steps+1):
    (batch_x, batch_y) = sess.run(next_element)
    (batch_x_fit, _) = fit_RNN_input_dim(batch_x)
    (batch_y_fit, _) = fit_RNN_input_dim(batch_y)
    batch_x_fit = batch_x_fit.reshape(-1, time_steps, num_input)
    
    _, loss_value = sess.run((train_op, loss), feed_dict={X: batch_x_fit, Y: batch_y_fit})
    if i % 1000 == 0:        
        print('Training Step:' + str(i) + '  Train Loss =  ' + str(loss_value))
#         print('Training Step:' + str(i) + '  Validation Loss =  ' + 
#               str(sess.run(loss, feed_dict={X: v_mixture_abs_T_fit.reshape(-1, time_steps, num_input), Y: v_M_fit})) + 
#               '  Train Loss =  ' + str(loss_value))

Training Step:0  Train Loss =  1.1613156
Training Step:1000  Train Loss =  0.20274538
Training Step:2000  Train Loss =  0.21462098
Training Step:3000  Train Loss =  0.14499989
Training Step:4000  Train Loss =  0.14095008
Training Step:5000  Train Loss =  0.1398262
Training Step:6000  Train Loss =  0.103418596
Training Step:7000  Train Loss =  0.09724436
Training Step:8000  Train Loss =  0.14713061
Training Step:9000  Train Loss =  0.111143224
Training Step:10000  Train Loss =  0.12785242


In [166]:
# Get M for traiing data
def recover_train(input_mixture, mixture_abs_T):
    (mixture_fit, diff) = fit_RNN_input_dim(mixture_abs_T)
    input_x = mixture_fit.reshape(-1, time_steps, num_input)
    train_M = sess.run(Y_pred, feed_dict={X: input_x})
    if diff > 0:
        return train_M[:-diff] * input_mixture
    return train_M * input_mixture

In [44]:
recover_signal = recover_train(mixture_T, mixture_abs_T)
# librosa.output.write_wav("./train_recover.wav", librosa.istft(recover_signal.T, hop_length=512), sr)

(118552, 513)


In [68]:
# Recover Validaion track!
v_recover_signal = recover_train(v_mixture_T, v_mixture_abs_T)
# librosa.output.write_wav("./v_recover_rnn.wav", librosa.istft(v_recover_signal.T, hop_length=512), sr)

(118552, 513)


In [72]:
# Calculate SNR for training data and validation data
print(calculateSNR(signal.T, recover_signal))
print(calculateSNR(v_signal.T, v_recover_signal))

12.795495739503428
12.075764216524007


In [69]:
## Recover test data
def recover_test_data(path, target_path, files_to_load):
    for i in range(files_to_load):
        s, _=librosa.load('{}/tex{:04d}.wav'.format(path, i), sr=None)
        S=librosa.stft(s, n_fft=1024, hop_length=512)
        S_abs = np.abs(S).T
        recover_S = recover_train(S.T, S_abs)
        librosa.output.write_wav('{}/tex{:04d}.wav'.format(target_path, i), librosa.istft(recover_S.T, hop_length=512), sr)
        


In [73]:
recover_test_data('timit-homework/te', './result_te', 400)

(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(176, 513)
(176, 513)
(176, 513)
(176, 513)
(176, 513)
(176, 513)
(176, 513)
(176, 513)
(176, 513)
(176, 513)
(168, 513)
(168, 513)
(168, 513)
(168, 513)
(168, 513)
(168, 513)
(168, 513)
(168, 513)
(168, 513)
(168, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(112, 513)
(120, 513)
(120, 513)
(120, 513)
(120, 513)
(120, 513)
(120, 513)
(120, 513)
(120, 513)
(120, 513)
(120, 513)
(152, 513)
(152, 513)
(152, 513)
(152, 513)
(152, 513)
(152, 513)
(152, 513)
(152, 513)
(152, 513)
(152, 513)
(144, 513)
(144, 513)
(144, 513)
(144, 513)
(144, 513)
(144, 513)
(144, 513)
(144, 513)
(144, 513)
(144, 513)
(104, 513)
(104, 513)
(104, 513)
(104, 513)
(104, 513)
(104, 513)
(104, 513)
(104, 513)
(104, 513)
(104, 513)
(104, 513)