# NN autoencoder on STFT data

Taken from https://github.com/pkmital/tensorflow_tutorials/blob/master/python/08_denoising_autoencoder.py

In [None]:
from __future__ import division, print_function, absolute_import
%pylab inline

import tensorflow as tf
import numpy as np
import os
import librosa
import librosa.display
from  librosa.util import frame
from scipy.signal import resample

# Execute the helper scripts inside the notebook namespace so that the names
# they define are available to the cells below.
# NOTE(review): rc_default, fragment_from_wav, Data and NAE are not defined in
# this notebook; presumably they come from these two files -- confirm.
%run ../utils.py
%run nnutils.py

# NOTE(review): presumably applies the project's default matplotlib settings
# -- confirm against the helper scripts.
rc_default()

# When True, the reconstruction figures further down are also written to figs/.
SAVEFIG = False

In [None]:
# Input recording.
path = "../wavs/"
fname = 'Grisey_partiels.wav'

# File name without its 4-character ".wav" extension.
name = fname[:-4]

filename = path+fname

# NOTE(review): fragment_from_wav comes from the helper scripts; the two
# numeric arguments presumably select the excerpt (start/end) -- confirm.
fs,track = fragment_from_wav(filename,0,140)

# Halve the sample rate: keep every other sample's worth of signal and update
# fs accordingly (fs becomes a float).
track = resample(track,int(track.size/2.0))
fs = fs/2.0


# Analysis window of 2048 samples with 75% overlap.
NFFT = 2**11
HOP = int(NFFT/4)

# librosa returns (freq, time); the notebook works with (time, freq), hence .T
STFT = librosa.stft(track,n_fft=NFFT, hop_length=HOP,center=True).T

# melspectrogram expects S as (freq, time), so STFT must be transposed back
# before it is passed in; otherwise the mel filterbank is applied along the
# time axis. sr must also be the resampled rate rather than librosa's default.
# The result is transposed to (time, n_mels) like every other matrix here.
mel = librosa.feature.melspectrogram(S=abs(STFT.T)**2, sr=fs).T

# Complex spectrum scaled down by 2*NFFT; this is the data the model is fed.
X = STFT/float(NFFT)/2

In [None]:
# Overview of the scaled STFT in dB, referenced to its largest value.
# X holds linearly spaced STFT bins, so the frequency axis has to be 'linear':
# labelling it 'mel' assigns wrong frequencies to the rows. This also matches
# the 'Input' and 'Autoencoded' plots later in the notebook.
figure(figsize=(14,4))
librosa.display.specshow(librosa.logamplitude(X.T,ref_power=np.max),hop_length = HOP, sr=fs,y_axis='linear', fmax=8000, x_axis='time')
colorbar(label='Intensity (dB)');

In [None]:
# Keep only the first NFFT//2 frequency columns (the last STFT bin is dropped),
# then report the resulting matrix size.
X = X[:, :NFFT // 2]
time_bins = X.shape[0]
freq_bins = X.shape[1]
print("Dimensions: freq bins, time bins")
print(freq_bins, time_bins)

## Create the dataset (Data class) from the real and imaginary parts of the spectrum

In [None]:
# Alternative feature (disabled): log power spectrum
# X_ = log10(abs(X)**2)

# Each row is one time frame: all real parts followed by all imaginary parts.
X_ = np.concatenate((X.real, X.imag), axis=1)

# Per-column standardisation; the small offset keeps the division finite for
# columns with zero variance.
meanX = X_.mean(axis=0)
stdX = X_.std(axis=0) + 0.001
Xnorm = (X_ - meanX) / stdX

data = Data(Xnorm)

# From here on freq_bins counts the concatenated real+imag columns.
time_bins = data.data.shape[0]
freq_bins = data.data.shape[1]
print(freq_bins, time_bins)

In [None]:
figure(figsize=(14,4))

# tanh(500*x) squashes the values into (-1, 1) so that small coefficients stay
# visible on the diverging colour map.
# origin must be 'upper' or 'lower'; 'bottom' is not an accepted value in
# current matplotlib, and 'lower' puts row 0 at the bottom of the image.
imshow(tanh(X_.T*500),aspect='auto',cmap=cm.seismic,origin='lower')
colorbar();
title('Spectrogram with real and imaginary part');

## Neural Autoencoder parameters

In [None]:
# --- training schedule ---
n_epochs = 300
batch_size = 2000
display_step = 10

# --- optimisation ---
learning_rate = 1.5e-3
l2scale = 1e-11

# --- network shape: layer widths halve from 1024 down to 16 ---
dimensions = [2 ** exponent for exponent in range(10, 3, -1)]
activation = tf.nn.tanh

In [None]:
# Collect the keyword options first so the constructor call stays readable.
nae_options = dict(
    activation=activation,
    bias=True,
    l2scale=l2scale,
    learning_rate=learning_rate,
    stddev=0.1,
    meaninit=.00,
)

# Input width is the number of concatenated real+imag columns per frame.
nae = NAE(freq_bins, dimensions, **nae_options)
nae.init_session()

In [None]:
# train() returns two sequences, which are plotted as cost curves in the
# following cell.
training_history = nae.train(data, batch_size, n_epochs)
costlist, costnoreglist = training_history

In [None]:
# Optional step, disabled by default. Uncomment to call nae.save
# (presumably writes the trained model under checkpoints/ -- confirm against
# the NAE implementation).
# nae.save("checkpoints/NAE")

In [None]:
# Both cost curves on a logarithmic y axis: costlist in green,
# costnoreglist in blue.
figure(figsize=(8, 4))
for cost_series, line_colour in ((costlist, 'g'), (costnoreglist, 'b')):
    semilogy(cost_series, line_colour)
xlabel('Batchs Steps')
ylabel('Cost');

In [None]:
def _complex_spectrum(features):
    """Rebuild a complex one-sided spectrum from normalized [real | imag] rows.

    `features` is a (frames, 2*(NFFT//2)) matrix in the normalized space of the
    dataset. The normalization is undone with stdX/meanX, then the first half
    of the columns becomes the real part and the second half the imaginary
    part. The result has one extra column (NFFT//2 + 1 in total) that is left
    at zero, because that bin was dropped when the dataset was built.
    """
    flat = features*stdX+meanX
    spectrum = zeros((flat.shape[0],flat.shape[1]//2+1),dtype=complex64)
    spectrum.real[:,:NFFT//2] = flat[:,:NFFT//2]
    spectrum.imag[:,:NFFT//2] = flat[:,NFFT//2:]
    return spectrum


def _plot_db_spectrogram(spectrum, plot_title, fig_path):
    """Draw 10*log10(|spectrum|) in a new figure; save it when SAVEFIG is set.

    The zero-filled last column of `spectrum` maps to -inf dB.
    """
    figure(figsize=(14,4))
    librosa.display.specshow(10*log10(abs(spectrum)).T,hop_length = HOP,sr=fs,y_axis='linear', fmax=8000, x_axis='time',cmap = cm.viridis)
    title(plot_title)
    colorbar(label='Intensity (dB)')
    if SAVEFIG: plt.savefig(fig_path)


sess = nae.get_session()
variable_dict = nae.get_variables_dict()

# Length of the excerpt in seconds (informational; not used by the plots).
duration = track.size/fs

# Every frame of the dataset, in order.
compare = range(0,time_bins,1)
orig = data.data[compare]

# Fetch the reconstruction and the bottleneck activations in one evaluation so
# both come from the same forward pass.
# NOTE(review): assumes sess is a TensorFlow session, whose run() accepts a
# list of fetches -- confirm against the NAE implementation.
recon, Z = sess.run([variable_dict['Y'], variable_dict['z']],
                    feed_dict={ variable_dict['X']: orig})

_plot_db_spectrogram(_complex_spectrum(orig), 'Input', 'figs/original')
_plot_db_spectrogram(_complex_spectrum(recon), 'Autoencoded', 'figs/ae')

# Activity of each bottleneck unit over time.
figure(figsize=(14,4))
librosa.display.specshow(Z.T,hop_length = HOP,sr=fs, x_axis='time')
yticks(range(0,Z.shape[1],2))
ylabel('Neurons')
colorbar(label='Activity')
if SAVEFIG: plt.savefig('figs/Z')

In [None]:
from IPython.display import Audio, display

# Undo the normalization of the network output.
M = recon * stdX + meanX

# Re-assemble the complex spectrum: the first NFFT//2 columns of M are the real
# parts, the remaining columns the imaginary parts. The extra last column of M_
# stays at zero.
n_frames, n_features = M.shape
half = NFFT // 2
M_ = zeros((n_frames, n_features // 2 + 1), dtype=complex64)
M_.real[:, :half] = M[:, :half]
M_.imag[:, :half] = M[:, half:]

# Back to the time domain; istft expects (freq, time), hence the transpose.
out = librosa.istft(M_.T, hop_length=HOP, win_length=NFFT, center=True)

# Players for the reconstruction first, then the original signal.
for signal in (out, track):
    display(Audio(data=signal, rate=fs))
display(Audio(data=track,rate=fs))