In [1]:
%matplotlib inline

In [2]:
import os
import keras
import pickle

import numpy as np
import matplotlib.pyplot as plt

from keras.layers import Dense, Input
from keras.layers import Conv2D, Flatten, Lambda
from keras.layers import Reshape, Conv2DTranspose
from keras.models import Model
from keras.losses import mse, binary_crossentropy
from keras.utils import plot_model
from keras.layers.advanced_activations import LeakyReLU
from keras import backend as K

Using TensorFlow backend.


In [3]:
# Restrict CUDA to the device with index 1 for this process.
# NOTE(review): this runs after keras was imported above; presumably it still
# takes effect because the backend has not touched the GPU yet - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
# Model / training configuration shared by the cells below.
input_shape = (129, 48, 1)  # shape of one encoder input; the first two dims also scale the reconstruction loss
intermediate_dim = 512      # width of the dense layer between the conv stack and the latent heads
latent_dim = 40             # size of z_mean / z_log_var / z
batch_size = 16             # frames per train_on_batch call in the training cells
kernel_size = 6             # kernel size of every Conv2D / Conv2DTranspose layer
stride_size = 3             # stride of the two encoder convs and the two matching transposed convs
filters = 16                # base filter count; doubled before each encoder conv, halved after each decoder layer
epochs = 100                # number of passes made by each training loop

In [5]:
def sampling(args):
    """Draw a latent vector with the reparameterization trick.

    Noise is sampled from an isotropic unit Gaussian and then scaled and
    shifted by the encoder outputs, which keeps the sampling step
    differentiable with respect to those outputs.

    # Arguments
        args (tensor): pair (z_mean, z_log_var), the mean and the log of
            the variance of Q(z|X)
    # Returns
        z (tensor): sampled latent vector
    """
    mu, log_variance = args
    n_rows = K.shape(mu)[0]
    n_cols = K.int_shape(mu)[1]
    # random_normal defaults to mean=0 and std=1.0
    noise = K.random_normal(shape=(n_rows, n_cols))
    sigma = K.exp(0.5 * log_variance)
    return mu + sigma * noise

# VAE model = encoder + decoder
# build encoder model
inputs = Input(shape=input_shape, name='encoder_input')
x = inputs
# two strided convolutions; the module-level `filters` is doubled before each
# layer here and halved again in the decoder loop below, so statement order matters
for i in range(2):
    filters *= 2
    x = Conv2D(filters=filters,
               kernel_size=kernel_size,
               activation='tanh',
               strides=stride_size,
               padding='valid')(x)

# shape info needed to build decoder model
# (static shape of the last conv output; index 0 is the batch axis and is not used below)
shape = K.int_shape(x)

# generate latent vector Q(z|X)
x = Flatten()(x)
x = Dense(intermediate_dim, activation='tanh')(x)
# two linear heads on the same hidden layer: mean and log-variance of Q(z|X)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)

# use reparameterization trick to push the sampling out as input
# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# instantiate encoder model
# outputs are [z_mean, z_log_var, z]; index 2 (the sampled z) feeds the decoder below
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
#encoder.summary()
# writes a diagram of the encoder to ../data (path is relative to the working directory)
plot_model(encoder, to_file='../data/vae_cnn_encoder.png', show_shapes=True)

# build decoder model
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
# project z back to the size of the last encoder feature map, then restore its 3-D layout
x = Dense(shape[1] * shape[2] * shape[3], activation='tanh')(latent_inputs)
x = Reshape((shape[1], shape[2], shape[3]))(x)

# mirror of the encoder loop: same kernel/stride, `filters` halved after each layer
for i in range(2):
    x = Conv2DTranspose(filters=filters,
                        kernel_size=kernel_size,
                        activation='tanh',
                        strides=stride_size,
                        padding='valid')(x)
    filters //= 2

# final single-channel layer (default stride, 'same' padding) with sigmoid activation
# NOTE(review): the training cells feed (data - x_mean)/x_std as input/target;
# confirm that a sigmoid output range is what is wanted for standardised data.
outputs = Conv2DTranspose(filters=1,
                          kernel_size=kernel_size,
                          activation='sigmoid',
                          padding='same',
                          name='decoder_output')(x)

# instantiate decoder model
decoder = Model(latent_inputs, outputs, name='decoder')
#decoder.summary()
plot_model(decoder, to_file='../data/vae_cnn_decoder.png', show_shapes=True)

# instantiate VAE model
# `outputs` is rebound here: it is now the decoder applied to the encoder's sampled z
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae')

# reconstruction term: MSE between the flattened input and the flattened reconstruction
# NOTE(review): K.flatten presumably collapses the batch axis as well, which would
# make this one mean over the whole batch rather than a per-sample value - confirm.
reconstruction_loss = mse(K.flatten(inputs), K.flatten(outputs))

# rescale the mean by the first two input dimensions (input_shape[0] * input_shape[1])
reconstruction_loss *= input_shape[0] * input_shape[1]
# KL term built from z_mean / z_log_var and summed over the latent axis
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
# NOTE(review): the usual closed-form KL factor is -0.5; -5e-4 weights the KL term
# far lower - presumably deliberate, confirm.
kl_loss *= -5e-4
vae_loss = K.mean(reconstruction_loss + kl_loss)
# the loss is attached to the model itself, so compile() gets no `loss` argument
# and the training cells call train_on_batch with y=None
vae.add_loss(vae_loss)
vae.compile(optimizer='rmsprop')
plot_model(vae, to_file='../data/vae_cnn.png', show_shapes=True)
#vae.summary()
1

1

%%time
# Finding approximate mean and std of data.
#
# One randomly chosen frame is taken from every training file and the mean /
# std of those frames are used as the normalisation statistics. The total
# number of frames in the train and test sets is counted along the way.
#
# Files are opened with mmap_mode='r' so that only the header (for the shape)
# and the single sampled frame are read from disk, not the whole array.
train_features_path = '/home/ds/DataScience/Datasets/LibriSpeech/VAELibriSpeech/train-clean-wav/'
# list the directory once and reuse the listing for both the count and the loop
train_filenames = sorted(os.listdir(train_features_path))
n_files = len(train_filenames)
n_train = 0
x_train = []
for filename in train_filenames:
    full_filename = os.path.join(train_features_path, filename)
    print(full_filename)
    data = np.load(full_filename, mmap_mode='r')
    n_train += data.shape[0]
    if data.shape[0] == 0:
        # nothing to sample from; np.random.randint(0, ...) would raise
        continue
    # size=1 keeps the leading axis so the samples can be stacked below
    frame_index = np.random.randint(data.shape[0], size=1)
    x_train.append(np.asarray(data[frame_index]))
x_train = np.vstack(x_train)
x_mean = np.mean(x_train)
x_std = np.std(x_train)

test_features_path = '/home/ds/DataScience/Datasets/LibriSpeech/VAELibriSpeech/test-clean-wav/'
test_filenames = sorted(os.listdir(test_features_path))
n_files = len(test_filenames)
n_test = 0
for filename in test_filenames:
    full_filename = os.path.join(test_features_path, filename)
    print(full_filename)
    # only the shape is needed here, so the array data itself is never read
    data = np.load(full_filename, mmap_mode='r')
    n_test += data.shape[0]

(n_train, n_test) (1047736, 30548)

# Persist the normalisation statistics so later sessions can skip the scan above.
# Context managers make sure each file is flushed and closed right after the dump.
with open('../data/x_mean.pkl', 'wb') as pkl_file:
    pickle.dump(x_mean, pkl_file)
with open('../data/x_std.pkl', 'wb') as pkl_file:
    pickle.dump(x_std, pkl_file)

In [6]:
# Load the cached normalisation statistics used by the training cells.
# The files are opened in context managers so the handles are closed immediately.
# NOTE: pickle.load executes arbitrary code from the file - only load pickles
# produced by this notebook.
with open('../data/x_mean.pkl', 'rb') as pkl_file:
    x_mean = pickle.load(pkl_file)
with open('../data/x_std.pkl', 'rb') as pkl_file:
    x_std = pickle.load(pkl_file)

In [None]:
%%time
# Full-pass training: in every epoch each training file is standardised with
# x_mean / x_std and fed to the VAE in consecutive batches of `batch_size`
# frames. Weights are saved after every epoch.
train_features_path = '/home/ds/DataScience/Datasets/LibriSpeech/VAELibriSpeech/train-clean-wav/'
weights_path = '/home/ds/DataScience/Models/audio_vae/40_6_3/{}.h5'

# Create the checkpoint directory up front, so a missing folder cannot make
# save_weights fail only after a whole epoch has been trained.
weights_dir = os.path.dirname(weights_path)
if not os.path.isdir(weights_dir):
    os.makedirs(weights_dir)

# The listing is read once and reused for every epoch (the directory is
# assumed not to change while training runs).
train_filenames = sorted(os.listdir(train_features_path))
n_files = len(train_filenames)
for epoch in range(epochs):
    print(epoch)
    for filename in train_filenames:
        print(filename)
        full_filename = os.path.join(train_features_path, filename)
        data = np.load(full_filename)
        x_train = (data - x_mean)/x_std
        # add the trailing channel axis expected by the encoder input
        x_train = x_train.reshape(x_train.shape + (1,))
        # only complete batches are trained on; trailing frames that do not
        # fill a whole batch are dropped
        n_batches = data.shape[0] // batch_size
        # reset per file so a file with no complete batch neither raises a
        # NameError nor reports the previous file's loss
        batch_loss = None
        for start in range(0, n_batches * batch_size, batch_size):
            batch = x_train[start:start + batch_size]
            batch_loss = vae.train_on_batch(batch, y=None)
        # report the loss of the last batch of this file
        if batch_loss is not None:
            print(batch_loss)
    vae.save_weights(weights_path.format(epoch))

%%time
# Random-batch training: in every epoch one batch of `batch_size` randomly
# chosen frames (drawn with replacement) is taken from each training file,
# standardised with x_mean / x_std and used for a single training step.
# Weights are saved after every epoch.
train_features_path = '/home/ds/DataScience/Datasets/LibriSpeech/VAELibriSpeech/train-clean-wav/'
weights_path = '/home/ds/DataScience/Models/audio_vae/10/vae_cnn_audio_{}.h5'

# Create the checkpoint directory up front, so a missing folder cannot make
# save_weights fail only after a whole epoch has been trained.
weights_dir = os.path.dirname(weights_path)
if not os.path.isdir(weights_dir):
    os.makedirs(weights_dir)

# The listing is read once and reused for every epoch (the directory is
# assumed not to change while training runs).
train_filenames = sorted(os.listdir(train_features_path))
n_files = len(train_filenames)
for epoch in range(epochs):
    print(epoch)
    for filename in train_filenames:
        print(filename)
        full_filename = os.path.join(train_features_path, filename)
        # memory-map the file: only the sampled frames are read from disk
        data = np.load(full_filename, mmap_mode='r')
        if data.shape[0] == 0:
            # nothing to sample from; np.random.randint(0, ...) would raise
            continue
        frame_indices = np.random.randint(data.shape[0], size=batch_size)
        data = np.asarray(data[frame_indices, :, :])
        x_train = (data - x_mean)/x_std
        # add the trailing channel axis expected by the encoder input
        x_train = x_train.reshape(x_train.shape + (1,))
        print(vae.train_on_batch(x_train, y=None))
    vae.save_weights(weights_path.format(epoch))