In [1]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import math
import librosa

%matplotlib inline


In [2]:
# Calculate SNR 
def calculateSNR(st, st_h):
    st_sum = np.sum(np.abs(st))
    diff_sum = np.sum(np.abs(st-st_h))
    return 10*math.log(st_sum**2/diff_sum**2)

In [37]:
# Read data

def read_data(path, prefix, files_to_load):
    signal = np.zeros((513, 0))
    noise = np.zeros((513, 0))
    mixture = np.zeros((513, 0))
    for i in range(files_to_load):
        s, _=librosa.load('{}/{}s{:04d}.wav'.format(path, prefix, i), sr=None)
        S=librosa.stft(s, n_fft=1024, hop_length=512)
        signal = np.hstack((signal, S))
        s, _=librosa.load('{}/{}n{:04d}.wav'.format(path, prefix, i), sr=None)
        S=librosa.stft(s, n_fft=1024, hop_length=512)
        noise = np.hstack((noise, S))
        s, _=librosa.load('{}/{}x{:04d}.wav'.format(path, prefix, i), sr=None)
        S=librosa.stft(s, n_fft=1024, hop_length=512)
        mixture = np.hstack((mixture , S))

    return signal, noise, mixture



In [40]:
# TRAIN_DATA_LEN = 1199
TRAIN_DATA_LEN = 50
signal, noise, mixture = read_data('timit-homework/tr', 'tr', TRAIN_DATA_LEN)
v_signal, v_noise, v_mixture = read_data('timit-homework/v', 'v', 50)

In [210]:
print(signal.shape)
print(v_signal.shape)
# Try to output~
sr = 16000
# librosa.output.write_wav("./test_concat.wav", librosa.istft(signal, hop_length=512), sr)
# librosa.output.write_wav("./v_test_concat.wav", librosa.istft(v_mixture, hop_length=512), sr)

(513, 4310)
(513, 4310)


In [178]:
# Preprocess data
signal_abs = np.abs(signal)
noise_abs = np.abs(noise)
mixture_abs = np.abs(mixture)

signal_T = signal.T
noise_T = noise.T
mixture_T = mixture.T
signal_abs_T = signal_abs.T
noise_abs_T = noise_abs.T
mixture_abs_T = mixture_abs.T

In [179]:
# For validation data 
# Preprocess Data
v_signal_abs = np.abs(v_signal)
v_noise_abs = np.abs(v_noise)
v_mixture_abs = np.abs(v_mixture)

v_signal_T = v_signal.T
v_noise_T = v_noise.T
v_mixture_T = v_mixture.T
v_signal_abs_T = v_signal_abs.T
v_noise_abs_T = v_noise_abs.T
v_mixture_abs_T = v_mixture_abs.T

In [180]:
# Calculate IBM 
def get_M(signal, noise):
    M = np.zeros_like(signal)
    (ht, wd) = M.shape
    for i in range(ht):
        for j in range(wd):
            if signal_abs_T[i][j] > noise_abs_T[i][j]:
                M[i][j] = 1
    return M

In [181]:
train_M = get_M(signal_abs_T, signal_abs_T)

In [183]:
v_M = get_M(v_signal_abs_T, v_signal_abs_T)

In [185]:
# Sanity check the shape of X(mixture) and M
print(mixture_abs_T.shape, train_M.shape)
print(v_mixture_abs_T.shape, v_M.shape)

(4310, 513) (4310, 513)
(4310, 513) (4310, 513)


In [211]:
# Testing M 
v_S_recover = v_M * mixture_T
# print(v_S_recover.shape)
sr = 16000
# librosa.output.write_wav("./v_before_recover.wav", librosa.istft(mixture, hop_length=512), sr)
# librosa.output.write_wav("./v_recover.wav", librosa.istft(v_S_recover.T, hop_length=512), sr)


In [83]:

# Training Parameters
learning_rate = 0.001
training_steps = 10000
batch_size = 128

# Network Parameters
num_input = 513   # total data input (513 channel)
time_steps = 8   #
num_hidden = 128  # hidden layer num of features
num_classes = 513 # total classes (513 channel)

is_training = False

In [84]:
with tf.variable_scope("rnn_model", reuse=tf.AUTO_REUSE):
    X = tf.placeholder("float", [None, time_steps, num_input])
    Y = tf.placeholder("float", [None, num_classes])

# batch_size*128*513
weights = {
    'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))
}

biases = {
    'out': tf.Variable(tf.random_normal([num_classes]))
}


In [167]:
# tf.reset_default_graph()

def RNN(x, weights, biases):    
    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, timesteps, n_input)
    # Required shape: 'timesteps' tensors list of shape (batch_size, n_input)
    # Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)
    x = tf.unstack(x, axis=1)

    # Define a lstm cell with tensorflow
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden)
    # Get lstm cell output
    outputs, states = tf.nn.static_rnn(lstm_cell, x, dtype=tf.float32)
    outputs = [ tf.matmul(output, weights['out']) + biases['out'] for output in outputs]
    stack_outputs = tf.stack(outputs)

    return tf.reshape(stack_outputs, (-1, num_classes))


In [168]:
# Calculate loss, train_op
with tf.variable_scope("rnn_model", reuse=tf.AUTO_REUSE):
    Y_pred = RNN(X, weights, biases)
    loss = tf.losses.mean_squared_error(labels=Y, predictions=Y_pred)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [170]:
print(Y_pred.shape, Y.shape)

(?, 513) (?, 513)


In [171]:
# Create batch data for training
train_data = tf.data.Dataset.from_tensor_slices(tf.constant(mixture_abs_T))
label_data = tf.data.Dataset.from_tensor_slices(tf.constant(M.reshape(-1, num_classes)))
batch_data = tf.data.Dataset.zip((train_data, label_data)).repeat().batch(batch_size)

iterator = batch_data.make_one_shot_iterator()
next_batch = iterator.get_next()

In [200]:
# Add Padding, 
def fit_RNN_input_dim(data, label):
    (row, _) = data.shape
    print(row)
    while row % time_steps != 0:
        data = np.vstack((data, np.zeros(num_input)))
        label = np.vstack((label, np.zeros(num_input)))
        row += 1
    
    return data, label
    
v_mixture_abs_T_fit, v_M_fit = fit_RNN_input_dim(v_mixture_abs_T, v_M)

print(v_mixture_abs_T_fit.shape, v_M_fit.shape)

4310
(4312, 513) (4312, 513)


In [172]:
# Initial session
sess=tf.Session()
tf.global_variables_initializer().run(session=sess)

In [173]:
# Create checkpoint
saver = tf.train.Saver()
CHECK_POINT_FILE_NAME = "./hw3_2.ckpt"

try:
    saver.restore(sess, CHECK_POINT_FILE_NAME)
except:
    pass

In [202]:
# Training
for i in range(training_steps+1):
    (batch_x, batch_y) = sess.run(next_batch)
    batch_x = batch_x.reshape(-1, time_steps, num_input)
#     print(batch_x.shape, batch_y.shape)
    _, loss_value = sess.run((train_op, loss), feed_dict={X: batch_x, Y: batch_y})
    if i % 1000 == 0:
        is_training = False
        print('Training Step:' + str(i) + '  Accuracy =  ' + 
              str(sess.run(loss, feed_dict={X: v_mixture_abs_T_fit.reshape(-1, time_steps, num_input), Y: v_M_fit})) + 
              '  Loss =  ' + str(loss_value))
        is_training = True

        

Training Step:0  Accuracy =  0.23734966  Loss =  0.18094409
Training Step:500  Accuracy =  0.23546435  Loss =  0.23945925
Training Step:1000  Accuracy =  0.23612012  Loss =  0.27440012
Training Step:1500  Accuracy =  0.23785299  Loss =  0.26867792
Training Step:2000  Accuracy =  0.2387508  Loss =  0.23705155
Training Step:2500  Accuracy =  0.23886047  Loss =  0.18743317
Training Step:3000  Accuracy =  0.23716904  Loss =  0.18975642
Training Step:3500  Accuracy =  0.2359404  Loss =  0.2661727
Training Step:4000  Accuracy =  0.23538412  Loss =  0.23644198
Training Step:4500  Accuracy =  0.23568231  Loss =  0.19328924
Training Step:5000  Accuracy =  0.23790792  Loss =  0.1918992
Training Step:5500  Accuracy =  0.23748426  Loss =  0.17660941
Training Step:6000  Accuracy =  0.23965931  Loss =  0.1956235
Training Step:6500  Accuracy =  0.24040137  Loss =  0.18700877
Training Step:7000  Accuracy =  0.23723778  Loss =  0.2601164
Training Step:7500  Accuracy =  0.23662151  Loss =  0.24319452
Tr