# Simple two layer autoencoder on STFT data

Taken from https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/autoencoder.py



In [1]:
from __future__ import division, print_function, absolute_import
%pylab inline
import wget
from scipy.io import wavfile
import librosa
import tensorflow as tf
import numpy as np
import os

Populating the interactive namespace from numpy and matplotlib


In [2]:
fname = 'Stockhausen_Kontakte.wav'

if not os.path.exists(fname):
    url = 'https://www.dropbox.com/s/f717af2vhiagcbp/'+fname+'?dl=1'
    filename = wget.download(url)

In [3]:
# Read wav file to floating values
fs,x = wavfile.read(fname)
t1 = 10*fs
t2 = 80*fs
x = x[t1:t2]
x = np.float64(x)/2**15
# Peak Normalization

x/=abs(x).max()

# Extract overlapping frames and create a spectrogram in each one

L = x.size

frame_size = 2**15
step_size = int(frame_size/2)

# Number of frames
n_frames = int( (L-frame_size)/step_size+1 )

block_size = 2**10
hop = int(block_size/2)
NFFT = block_size

level_threshold = 1e-5

n_blocks = int( (frame_size-block_size)/hop)+1+ int(block_size/hop)

# S = np.zeros((n_frames,block_size/2+1,n_blocks), dtype = np.complex128)
S = []

for i in range(n_frames):
    frame = x[i*step_size:i*step_size+frame_size]
    S_ = librosa.stft(frame,n_fft=NFFT,hop_length=hop,win_length=block_size )/2/NFFT
    if abs(S_).mean()>level_threshold:
        S_ = 20*log10(abs(S_)).clip(-60,0)
        S_ = (S_ - S_.min()) /(S_.max() - S_.min())
        S.append( S_ )

print("Dimensions: frames, freq bins, time bins")
S = np.array(S)
print(n_frames)
batch_length, freq_bins,time_bins = S.shape
print(batch_length, freq_bins,time_bins)


Dimensions: frames, freq bins, time bins
187
168 513 65


In [4]:
class Data():
    
    def __init__(self,data):
        import random
        
        self.data = data
        self.batch_ix = 0
        self.length = self.data.shape[0]
        self.ixs = list(range(self.length))
        random.shuffle(self.ixs)
        
    def next_batch(self,batch_size):
        self.batch_ix+= batch_size
        self.batch_ix = (self.batch_ix+batch_size) % self.length
        return self.data[self.ixs[self.batch_ix:self.batch_ix+batch_size]]
    
data = Data( S.reshape((S.shape[0],S.shape[1]*S.shape[2])) )
n_stfts = data.length


In [5]:
# Parameters
learning_rate = 0.05
training_epochs = 200
batch_size = 128

display_step = 5

# Network Parameters
dimensions= [512,128,64,32]

n_input = data.data.shape[1]

In [None]:
# tf Graph input (only pictures)
X = tf.placeholder("float", [None, n_input])

weights = {}
biases = {}
layer_input = n_input

for layer_i, layer_output in enumerate(dimensions[:]):
    weights['encoder_h'+str(layer_i)] = tf.Variable(tf.random_normal([layer_input, layer_output]))
    biases['encoder_b'+str(layer_i)] = tf.Variable(tf.random_normal([layer_output]))
    layer_input = layer_output

for layer_i, layer_output in enumerate(dimensions[:-1][::-1]+[n_input]):
    weights['decoder_h'+str(layer_i)] = tf.Variable(tf.random_normal([layer_input, layer_output]))
    biases['decoder_b'+str(layer_i)] = tf.Variable(tf.random_normal([layer_output]))
    layer_input = layer_output
    
shape = tf.nn.sigmoid

# Building the encoder
layers = {}
def encoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_input = x
    
    for layer_i, layer_output in enumerate(dimensions[:]):
        layers[layer_i] = shape(tf.add(tf.matmul(layer_input, weights['encoder_h'+str(layer_i)]), 
                                               biases['encoder_b'+str(layer_i)]))
        layer_input = layers[layer_i]

    return layer_input


# Building the decoder
def decoder(x):

    layer_input = x
    for layer_i, layer_output in enumerate(dimensions[:]):
        layers[layer_i] = shape(tf.add(tf.matmul(layer_input, weights['decoder_h'+str(layer_i)]), 
                                               biases['decoder_b'+str(layer_i)]))
        layer_input = layers[layer_i]

    return layer_input

# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# Prediction
y_pred = decoder_op
# Targets (Labels) are the input data.
y_true = X

# Define loss and optimizer, minimize the squared error
cost = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.initialize_all_variables()

In [None]:
# Launch the graph
config = tf.ConfigProto( device_count = {'GPU': 0} )
sess = tf.Session(config=config)
sess.run(init)
total_batch = int(data.length/batch_size)
# Training cycle
for epoch in range(training_epochs):
    # Loop over all batches
    for i in range(total_batch):
        batch_xs = data.next_batch(batch_size)
        # Run optimization op (backprop) and cost op (to get loss value)
        opt,cost_value = sess.run([optimizer, cost], feed_dict={X: batch_xs})

    # Display logs per epoch step
    if epoch % display_step == 0:
        print("Epoch:", '%04d' % (epoch+1),"cost=", "{:.9f}".format(cost_value))

print("Optimization Finished!")

Epoch: 0001 cost= 0.244794711
Epoch: 0006 cost= 0.218649656
Epoch: 0011 cost= 0.210039690
Epoch: 0016 cost= 0.203050673
Epoch: 0021 cost= 0.199733287
Epoch: 0026 cost= 0.194319278
Epoch: 0031 cost= 0.192361236
Epoch: 0036 cost= 0.188801423


In [None]:
compare = [3,21,46]

out = sess.run(y_pred, feed_dict={X: data.data[compare]})
original = data.data[compare]
figure(figsize=(14,10))
for i,c in enumerate(compare):
    
    subplot(len(compare),2,i*2+1)
    imshow(out[i].reshape((freq_bins,time_bins)),aspect='auto',origin='bottom')
    subplot(len(compare),2,i*2+2)
    imshow(original[i].reshape((freq_bins,time_bins)),aspect='auto',origin='bottom')

subplot(len(compare),2,1)
title("Autoencoder")
subplot(len(compare),2,2)
title("Original")

# savefig("results_dim_"+str(dimensions)+"_cost_"+"{:.4f}.png".format(cost_value))

In [None]:
out = sess.run(encoder_op, feed_dict={X: data.data[3:4]})
matshow(out.reshape((8,dimensions[-1]/8)))
colorbar()

In [None]:
Y = np.zeros((1,dimensions[-1]),dtype=np.float32)
figure(figsize=(14,10))
for i in range(dimensions[-1]):
    Y[0,i] = 1
    out = sess.run(decoder(Y), feed_dict={X: data.data[3:4]})
    subplot(dimensions[-1]/8,8,i+1)
    imshow(out.reshape((freq_bins,time_bins)),aspect='auto')
    colorbar()

In [None]:
I = imread("results_dim_[256, 64]_cost_0.0082.png")
figure(figsize=(14,10))
imshow(I)
plt.axis('off');