# Convolutional autoencoder on STFT data

Taken from https://github.com/pkmital/tensorflow_tutorials/blob/master/python/08_denoising_autoencoder.py

In [None]:
from __future__ import division, print_function, absolute_import
%pylab notebook

import tensorflow as tf
import numpy as np
import os
import librosa
import librosa.display
from  librosa.util import frame
from scipy.signal import resample

%run ../utils.py
%run nnutils.py

In [None]:
# --- Input excerpt -----------------------------------------------------------
path = "../wavs/"
fname = 'Grisey_partiels.wav'
filename = path + fname
name = fname[:-4]  # file name without its last four characters (".wav")

# wav2audio_segment is loaded from ../utils.py by the %run magic above.
# NOTE(review): 100 and 160 presumably delimit the excerpt (start/end in
# seconds) -- confirm against utils.py.
fs, track = wav2audio_segment(filename, 100, 160)

# Keep half as many samples and halve the nominal sample rate to match.
track = resample(track, track.size // 2)
fs = fs / 2.0

# --- Power spectrogram -------------------------------------------------------
NFFT = 2 ** 11
HOP = NFFT // 4  # successive analysis frames start a quarter of NFFT apart

STFT = librosa.stft(track, n_fft=NFFT, hop_length=HOP, center=True)
X = np.abs(STFT) ** 2  # squared magnitude, hence already non-negative

# --- Overlapping blocks of consecutive STFT frames ----------------------------
block_length = 16
# idx lists, for every block, the spectrogram column numbers it contains;
# consecutive blocks start block_length // 4 columns apart.
idx = frame(np.arange(X.shape[1]),
            frame_length=block_length,
            hop_length=block_length // 4)
# Fancy-index the columns, then move the last axis to the front; a later cell
# unpacks the result as (data length, freq bins, time bins).
X_blocks = np.rollaxis(X[:, idx], 2)

In [None]:
# Overview plot: the power spectrogram X on a log-amplitude scale, referenced
# to its own maximum (ref_power=np.max).
figure(figsize=(14, 6))
librosa.display.specshow(
    librosa.logamplitude(X, ref_power=np.max),
    sr=fs,
    hop_length=HOP,
    x_axis='time',
    y_axis='mel',
    fmax=8000
)
colorbar(label='Intensity (dB)')

In [None]:
# Report the geometry of the blocked spectrogram.  X_blocks is unpacked as
# (number of blocks, frequency bins, frames per block); freq_bins and
# time_bins are reused by the later cells.
print("Dimensions: data length, freq bins, time bins")
total_length, freq_bins,time_bins = X_blocks.shape
print(total_length, freq_bins,time_bins)

# Hop between consecutive blocks, in STFT frames.  It is read back from the
# framing index (idx[0, j] is the first frame of block j) so that this report
# cannot drift away from the hop that was actually used to build the blocks.
# With a single block there is no neighbour, hence no overlap.
block_hop = int(idx[0, 1] - idx[0, 0]) if idx.shape[1] > 1 else block_length
overlap_frames = block_length - block_hop


def _frames_to_seconds(n_frames):
    """Return the time spanned by n_frames consecutive STFT frames.

    Frames start HOP samples apart and each one is NFFT samples long, so the
    span is (n_frames - 1) * HOP + NFFT samples, not n_frames * NFFT.
    """
    if n_frames <= 0:
        return 0.0
    return ((n_frames - 1) * HOP + NFFT) / fs


print("Number of frames per block",block_length)
print(_frames_to_seconds(block_length),'seconds')
print("Overlap of frames per block:",overlap_frames)
print(_frames_to_seconds(overlap_frames),'seconds')

## Create the dataset (`Data` class) from the standardized spectrogram blocks

In [None]:
# Standardise each (frequency bin, time bin) position using statistics taken
# across all blocks (axis 0).
meanX = X_blocks.mean(axis=0)
stdX = X_blocks.std(axis=0) + 0.001  # small offset keeps the divisor non-zero
X_blocks_ = (X_blocks - meanX) / stdX

# Flatten every block into one row of freq_bins*time_bins values and wrap the
# matrix in Data (defined outside this notebook cell).
data = Data(X_blocks_.reshape((len(X_blocks), freq_bins * time_bins)))

# Range of the raw (un-normalised) block values.
print(X_blocks.min(), X_blocks.max())

## Convolutional Autoencoder parameters

In [None]:
# Optimisation settings.
# NOTE(review): display_step and l2scale are not referenced by any other
# visible cell -- confirm whether they are still needed.
n_epochs = 300
batch_size = 400
learning_rate = 0.002
l2scale = 0.01
display_step = 10

# Input layout: rows of flattened blocks, with the batch dimension left open.
n_features = int(data.data.shape[1])
input_shape = [None, n_features]

# Convolution settings, four entries each (presumably one per layer -- confirm
# against CAE in nnutils.py).
n_filters = [16] * 4
filter_sizes = [4, 4, 4, 2]
strides = [
    [1, 2, 2, 1],
    [1, 2, 2, 1],
    [1, 2, 2, 1],
    [1, 2, 2, 1],
]

In [None]:
# Build the autoencoder from the settings above (CAE is loaded by %run from
# nnutils.py) and open its session.
cae = CAE(
    input_shape, freq_bins, time_bins,
    n_filters, filter_sizes, strides,
    learning_rate=learning_rate
)
cae.init_session()

# Whatever train() returns is kept as costlist and plotted in a later cell.
costlist = cae.train(data, batch_size, n_epochs)

# Handles reused by the later cells to run the trained graph.
sess = cae.get_session()
variables_dict = cae.get_variables_dict()

In [None]:
# Save the trained model under the path "checkpoints/CAE1".
# NOTE(review): save() is defined outside this file; presumably the
# "checkpoints" directory must already exist -- confirm.
cae.save("checkpoints/CAE1")

In [None]:
# Training curve: the values returned by cae.train, drawn in green.
figure(figsize=(8, 4))
plot(costlist, 'g')
xlabel('Batchs Steps')
ylabel('Cost');

In [None]:
duration = track.size/fs
compare = range(0,total_length,1)  # every block, in order

# Standardised, flattened blocks and what the network produces for them.
original = data.data[compare]
recon = sess.run(variables_dict['Y'], feed_dict={ variables_dict['X']: original})


def _denormalise_block(flat_block):
    """Reshape one flattened block and undo the dataset standardisation.

    The dataset was built as (X_blocks - meanX) / stdX, so the inverse is
    block * stdX + meanX.  It must be applied to the reconstruction as well as
    to the input, otherwise the two plots are not on the same scale.
    """
    return flat_block.reshape(freq_bins,time_bins)*stdX + meanX


# NOTE(review): both specshow calls scale hop_length by block_length/2, while
# the blocks were framed with a hop of int(block_length/4) -- confirm that the
# time axis of these stacked-block plots is the intended one.

# --- Input blocks, laid side by side ------------------------------------------
M = [_denormalise_block(r) for r in original]
M2 = np.hstack(M)
figure(figsize=(12,4))

librosa.display.specshow(librosa.logamplitude(M2,ref_power=np.max),hop_length = HOP*block_length/2.0,sr=fs*block_length,y_axis='mel', fmax=8000, x_axis='time')
title('Input')
colorbar(label='Intensity (dB)')

# --- Reconstructed blocks, same layout and same de-normalisation --------------
M = [_denormalise_block(r) for r in recon]
M2 = np.hstack(M)
figure(figsize=(12,4))

librosa.display.specshow(librosa.logamplitude(M2,ref_power=np.max),hop_length = HOP*block_length/2.0,sr=fs*block_length,y_axis='mel', fmax=8000, x_axis='time')

title('Autoencoded');
colorbar(label='Intensity (dB)')


In [None]:
# Close-up of one input block (index 50): de-normalised, then shown on a
# log10 scale with the image stretched to fill the axes.
figure()
imshow(np.log10(original[50].reshape(freq_bins, time_bins) * stdX + meanX),
       aspect='auto')