# __Recognizing with VAE features!__

## _Importing the libraries_

In [1]:
import os
import keras
import pickle
import random

import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from keras.layers import Dense, Input, Dropout
from keras.layers import Conv2D, Flatten, Lambda
from keras.layers import Reshape, Conv2DTranspose
from keras.models import Model, Sequential
from keras.losses import mse, binary_crossentropy
from keras.utils import plot_model, to_categorical
from keras.layers.advanced_activations import LeakyReLU
from keras import backend as K
from scipy import signal
from scipy.io import wavfile

Using TensorFlow backend.


In [2]:
# Set CUDA_VISIBLE_DEVICES to "0" for this process (selects device index 0).
os.environ.update(CUDA_VISIBLE_DEVICES="0")

## _Loading data_

In [3]:
# Root folder that is walked for audio files; the directory holding each file
# supplies that file's class label.
data_path = '/home/ds/DataScience/Datasets/GoogleSpeechCommands/_commands_'
# Share of each label's files reserved for the test split (0.2 = 20%).
percent_test = 0.2

In [4]:
# Map label -> list of file paths. The label of a file is the name of the
# directory that directly contains it.
filenames_dict = {}
for folder, _subdirs, files in os.walk(data_path):
    label = os.path.basename(folder)
    for name in files:
        filenames_dict.setdefault(label, []).append(os.path.join(folder, name))

In [5]:
# Number of files collected for every label.
label_counts = {label: len(paths) for label, paths in filenames_dict.items()}

In [6]:
# Stratified split: every label contributes `percent_test` of its files to the
# test list and the remainder to the training list.
train_filenames = []
test_filenames = []
for label, count in label_counts.items():
    n_test = int(count * percent_test)
    labelled = [(path, label) for path in filenames_dict[label]]
    random.shuffle(labelled)
    # Split on an explicit index. Negative slicing breaks when n_test == 0:
    # `[:-0]` is empty and `[-0:]` is the whole list, which would move every
    # file of a small class into the test set.
    split_at = len(labelled) - n_test
    train_filenames += labelled[:split_at]
    test_filenames += labelled[split_at:]
# Mix the labels so neither list is grouped by class.
random.shuffle(train_filenames)
random.shuffle(test_filenames)

In [7]:
# Integer ids follow the iteration order of label_counts.
label_names = list(label_counts)
num_classes = len(label_names)
texts2labels = dict(zip(label_names, range(num_classes)))  # label text -> id
labels2texts = dict(enumerate(label_names))                # id -> label text

In [8]:
# Show the first three (path, label) pairs of the shuffled training list.
train_filenames[:3]

[('/home/ds/DataScience/Datasets/GoogleSpeechCommands/_commands_/wow/b6091c84_nohash_0.wav',
  'wow'),
 ('/home/ds/DataScience/Datasets/GoogleSpeechCommands/_commands_/nine/7f74626f_nohash_0.wav',
  'nine'),
 ('/home/ds/DataScience/Datasets/GoogleSpeechCommands/_commands_/nine/c120e80e_nohash_5.wav',
  'nine')]

## _Defining the architecture for the VAE_

In [9]:
input_shape = (129, 48, 1)  # shape passed to the encoder's Input layer
intermediate_dim = 512      # units of the dense layer before z_mean / z_log_var
latent_dim = 40             # size of the latent vector z
batch_size = 16             # batch size used for encoder.predict in compute_features
kernel_size = 6             # kernel size of every (transposed) convolution
stride_size = 3             # configured stride for the strided convolutions
filters = 16                # starting filter count; doubled per encoder conv layer
epochs = 100                # not referenced in the visible cells (weights are loaded from disk)


def sampling(args):
    """Reparameterization trick: draw z from the Gaussian described by args.

    # Arguments
        args (tensor): pair (mean, log of variance) of Q(z|X)

    # Returns
        z (tensor): sampled latent vector, mean + exp(0.5 * log_var) * noise
    """
    mean, log_var = args
    # The batch size is only known at run time; the latent size is static.
    n_rows = K.shape(mean)[0]
    n_cols = K.int_shape(mean)[1]
    # random_normal defaults to mean=0 and std=1.0
    noise = K.random_normal(shape=(n_rows, n_cols))
    sigma = K.exp(0.5 * log_var)
    return mean + sigma * noise

# VAE model = encoder + decoder
# build encoder model: two strided 'valid' convolutions, doubling the filter
# count before each one
inputs = Input(shape=input_shape, name='encoder_input')
x = inputs
for i in range(2):
    filters *= 2
    x = Conv2D(filters=filters,
               kernel_size=kernel_size,
               activation='tanh',
               strides=stride_size,  # was a literal 3; now follows the config value
               padding='valid')(x)

# shape info needed to build decoder model
shape = K.int_shape(x)

# generate latent vector Q(z|X)
x = Flatten()(x)
x = Dense(intermediate_dim, activation='tanh')(x)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)

# use reparameterization trick to push the sampling out as input
# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# instantiate encoder model; it returns [z_mean, z_log_var, z]
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')

# build decoder model: dense layer back to the encoder's last feature-map
# shape, then mirror the encoder with transposed convolutions
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(shape[1] * shape[2] * shape[3], activation='tanh')(latent_inputs)
x = Reshape((shape[1], shape[2], shape[3]))(x)

for i in range(2):
    x = Conv2DTranspose(filters=filters,
                        kernel_size=kernel_size,
                        activation='tanh',
                        strides=stride_size,  # must match the encoder stride
                        padding='valid')(x)
    # halve after use, so `filters` ends at the value it had before the encoder loop
    filters //= 2

# single-channel output layer with sigmoid activation
outputs = Conv2DTranspose(filters=1,
                          kernel_size=kernel_size,
                          activation='sigmoid',
                          padding='same',
                          name='decoder_output')(x)

# instantiate decoder model
decoder = Model(latent_inputs, outputs, name='decoder')

# instantiate VAE model: the decoder is fed the sampled z (encoder output index 2)
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae')

# reconstruction term: MSE over the flattened tensors, scaled by the number of
# values in one input (input_shape[0] * input_shape[1])
reconstruction_loss = mse(K.flatten(inputs), K.flatten(outputs))

reconstruction_loss *= input_shape[0] * input_shape[1]
# KL term, summed over the latent axis
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
# NOTE(review): the textbook factor is -0.5; -5e-4 weights the KL term far
# lower - presumably intentional, confirm before retraining.
kl_loss *= -5e-4
vae_loss = K.mean(reconstruction_loss + kl_loss)
# the loss is attached to the model, so compile() takes no `loss` argument
vae.add_loss(vae_loss)
vae.compile(optimizer='rmsprop')

In [10]:
# Load saved weights into the VAE instead of training it in this notebook.
# The architecture built above must match the one the weights were saved from.
vae.load_weights('/home/ds/DataScience/Models/audio_vae/40_6_3/4.h5')

## Calculating features

In [11]:
# Normalisation statistics applied to the spectrogram windows in compute_features.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
# Context managers make sure both file handles are closed after loading.
with open('../data/x_mean.pkl', 'rb') as mean_file:
    x_mean = pickle.load(mean_file)
with open('../data/x_std.pkl', 'rb') as std_file:
    x_std = pickle.load(std_file)

In [12]:
def compute_features(filename, frame_duration=0.01, stride=0.1, window_size=0.3):
    """Encode one wav file into a flat vector of VAE latent samples.

    The file's spectrogram is zero-padded on both sides of the time axis, cut
    into fixed-width windows, normalised with the module-level `x_mean` /
    `x_std`, and passed through the module-level `encoder`.

    # Arguments
        filename (str): path of the wav file to read
        frame_duration (float): spectrogram frame length in seconds
        stride (float): window start positions are int(frame_index / stride);
            the padding width is int(frame_size * stride) columns per side
        window_size (float): the window width in spectrogram columns is
            int(frame_size * window_size)

    # Returns
        1-D array holding the third encoder output (the sampled z) of every
        full-width window, concatenated; an empty array when the file is too
        short to yield a single full window.
    """
    sample_rate, samples = wavfile.read(filename)
    frame_size = int(round(frame_duration * float(sample_rate)))
    overlap_size = frame_size - int(round((frame_duration - 0.005) * float(sample_rate)))
    # smallest power of two that is >= frame_size
    n_fft_points = 2
    while n_fft_points < frame_size:
        n_fft_points *= 2
    frequencies, times, spectrogram = signal.spectrogram(
        samples, fs=sample_rate, window='hamming', nperseg=frame_size, noverlap=overlap_size, nfft=n_fft_points,
        scaling='spectrum', mode='psd'
    )
    n_bins = frequencies.shape[0]
    n_frames_window = int(frame_size * window_size)
    n_frames_pad = int(frame_size * stride)

    # zero columns before and after the signal along the time axis
    padding = np.zeros((n_bins, n_frames_pad))
    padded_spectrogram = np.hstack((padding, spectrogram, padding))

    windows = []
    for frame_index in range(len(times)):
        start = int(frame_index / stride)
        chunk = padded_spectrogram[:, start:start + n_frames_window]
        # windows that run past the end of the padded spectrogram come out
        # narrower than requested and are dropped
        if chunk.shape == (n_bins, n_frames_window):
            windows.append(chunk)

    if not windows:
        # np.stack raises on an empty list; return an empty vector instead so
        # callers can filter very short clips by shape
        return np.empty((0,), dtype=np.float32)

    x_file = (np.stack(windows) - x_mean) / x_std
    # NOTE(review): index 2 is the *sampled* z, so repeated calls give
    # different features; index 0 (z_mean) would be deterministic - confirm intent.
    return encoder.predict(x_file.reshape(x_file.shape + (1,)),
                           batch_size=batch_size)[2].flatten()

In [13]:
# Directory where the extracted feature / label arrays are written and later read back.
features_path = '/home/ds/DataScience/Datasets/GoogleSpeechCommands/commands_vae_features/'

In [14]:
%%time
def _extract_split(labelled_filenames, expected_dim=760):
    """Compute VAE features and one-hot targets for (path, label) pairs.

    # Arguments
        labelled_filenames (list): (filename, label) tuples
        expected_dim (int): files whose feature vector does not have exactly
            this length are skipped, so all kept vectors can be stacked

    # Returns
        (features, targets): two lists of equal length
    """
    features, targets = [], []
    for filename, label in labelled_filenames:
        x_file = compute_features(filename)
        if x_file.shape == (expected_dim,):
            features.append(x_file)
            targets.append(to_categorical([texts2labels[label]], num_classes=num_classes)[0])
    return features, targets


# Create the output directory up front so the dumps cannot fail on a missing
# folder after the long extraction has finished.
os.makedirs(features_path, exist_ok=True)

x_train, y_train = _extract_split(train_filenames)
np.stack(x_train).dump(os.path.join(features_path, 'x_train'))
np.stack(y_train).dump(os.path.join(features_path, 'y_train'))

x_test, y_test = _extract_split(test_filenames)
np.stack(x_test).dump(os.path.join(features_path, 'x_test'))
np.stack(y_test).dump(os.path.join(features_path, 'y_test'))

CPU times: user 3h 13min 17s, sys: 20.1 s, total: 3h 13min 37s
Wall time: 38min 4s


In [15]:
# Length of one feature vector; used as the input size of the classifier below.
input_shape = np.shape(x_train)[1]
input_shape

760

## _Defining the architecture for the MLP_

In [18]:
# The arrays were written with ndarray.dump, i.e. as pickles. np.load rejects
# pickled files unless allow_pickle=True (the default since NumPy 1.16.3).
# Only load files produced by this notebook - pickles can execute code.
x_train = np.load(os.path.join(features_path, 'x_train'), allow_pickle=True)
y_train = np.load(os.path.join(features_path, 'y_train'), allow_pickle=True)

x_test = np.load(os.path.join(features_path, 'x_test'), allow_pickle=True)
y_test = np.load(os.path.join(features_path, 'y_test'), allow_pickle=True)

In [19]:
# Sanity check: (number of training samples, number of classes).
y_train.shape

(46630, 30)

In [20]:
# Classifier on top of the VAE features: two hidden ReLU layers of 100 units,
# each followed by dropout, and a softmax over the classes.
asr = Sequential([
    Dense(100, activation='relu', input_shape=(input_shape,)),
    Dropout(0.2),
    Dense(100, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

asr.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
%%time
# Train the classifier; the test arrays are passed as validation data, so they
# are evaluated after every epoch.
asr.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=30,
    validation_data=(x_test, y_test),
)

Train on 46630 samples, validate on 11630 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
CPU times: user 1min 4s, sys: 8.31 s, total: 1min 12s
Wall time: 42.1 s


<keras.callbacks.History at 0x7f4e5060a208>