In [1]:
import os
import numpy as np
from tensorflow.keras.datasets import mnist
from vae import VAE
%config Completer.use_jedi = False

In [2]:
# global constants: training hyperparameters passed to train_mnist / train_fsdd
LEARNING_RATE = 0.0005
BATCH_SIZE = 64    #32
NUM_EPOCHS = 150   #100

# Directory that load_fsdd() walks for saved spectrogram arrays.
# Set the FSDD_SPECTROGRAMS_PATH environment variable to point the notebook
# at a different location without editing this cell; when it is unset, the
# original hard-coded location is used.
SPECTROGRAMS_PATH = os.environ.get(
    "FSDD_SPECTROGRAMS_PATH",
    "C:\\Users\\pbeata\\Desktop\\Data_Science\\Audio\\sound-generation\\datasets\\fsdd\\spectrograms\\",
)


# choose dataset: each flag gates the matching training cell(s) further down
run_mnist = False
run_fsdd = True


def load_mnist():
    """Load MNIST through Keras and prepare the images for the VAE.

    Both image arrays are cast to float32, divided by 255 and given a
    trailing channel axis of size 1. The label arrays are returned exactly
    as `mnist.load_data()` provides them.

    Returns:
        Tuple `(x_train, y_train, x_test, y_test)`.
    """
    (train_images, y_train), (test_images, y_test) = mnist.load_data()

    def _prepare(images):
        # normalize, then append the channel dimension
        scaled = images.astype("float32") / 255
        return scaled.reshape(scaled.shape + (1,))

    return _prepare(train_images), y_train, _prepare(test_images), y_test


def load_fsdd(spectrograms_path):
    """Load every saved spectrogram under `spectrograms_path` into one array.

    The directory tree is walked recursively. Only files with a `.npy`
    extension are loaded, so stray files in the tree (OS metadata, notes,
    etc.) no longer make `np.load` fail. Directories and file names are
    visited in sorted order so the sample order is the same on every run
    and platform.

    Args:
        spectrograms_path: Root directory containing the `.npy` files. Each
            file is expected to hold a 2D array (n_bins, n_frames), and all
            files must share the same shape.

    Returns:
        Array of shape (n_files, n_bins, n_frames, 1); the trailing axis is
        the channel dimension (e.g. (3000, 256, 64, 1) per the original
        author's note).

    Raises:
        FileNotFoundError: If no `.npy` file is found. `os.walk` silently
            yields nothing for a missing directory, so a mistyped path would
            otherwise produce an empty array and fail later during training.
        ValueError: If the loaded arrays do not all have the same shape
            (raised by `np.stack`).
    """
    spectrograms = []
    for root, dir_names, file_names in os.walk(spectrograms_path):
        # Sorting in place fixes the order in which os.walk descends.
        dir_names.sort()
        for file_name in sorted(file_names):
            if not file_name.lower().endswith(".npy"):
                continue
            file_path = os.path.join(root, file_name)
            spectrograms.append(np.load(file_path))  # (n_bins, n_frames), 2D only

    if not spectrograms:
        raise FileNotFoundError(
            f"No .npy spectrogram files found under {spectrograms_path!r}"
        )

    x_train = np.stack(spectrograms)
    # add the channel dimension expected by the convolutional model
    return x_train[..., np.newaxis]


def train_mnist(x_train, learning_rate, batch_size, num_epochs):
    """Build, compile and train a VAE configured for (28, 28, 1) inputs.

    Args:
        x_train: Training images passed straight to `VAE.train`.
        learning_rate: Value forwarded to `VAE.compile`.
        batch_size: Batch size forwarded to `VAE.train`.
        num_epochs: Number of epochs forwarded to `VAE.train`.

    Returns:
        The VAE instance after `train` has returned.
    """
    # Four conv layers and a 2-dimensional latent space.
    architecture = dict(
        input_shape=(28, 28, 1),
        conv_filters=(32, 64, 64, 64),
        conv_kernels=(3, 3, 3, 3),
        conv_strides=(1, 2, 2, 1),
        latent_space_dim=2,
    )
    vae = VAE(**architecture)
    vae.summary()
    vae.compile(learning_rate)
    vae.train(x_train, batch_size, num_epochs)
    return vae


def train_fsdd(x_train, learning_rate, batch_size, num_epochs):
    """Build, compile and train a VAE configured for (256, 64, 1) inputs.

    Args:
        x_train: Spectrogram array passed straight to `VAE.train`.
        learning_rate: Value forwarded to `VAE.compile`.
        batch_size: Batch size forwarded to `VAE.train`.
        num_epochs: Number of epochs forwarded to `VAE.train`.

    Returns:
        The VAE instance after `train` has returned.
    """
    # Five conv layers and a 128-dimensional latent space; the last layer
    # uses a (2, 1) stride while the others use a stride of 2.
    architecture = dict(
        input_shape=(256, 64, 1),
        conv_filters=(512, 256, 128, 64, 32),
        conv_kernels=(3, 3, 3, 3, 3),
        conv_strides=(2, 2, 2, 2, (2, 1)),
        latent_space_dim=128,
    )
    vae = VAE(**architecture)
    vae.summary()
    vae.compile(learning_rate)
    vae.train(x_train, batch_size, num_epochs)
    return vae

### MNIST

In [3]:
if run_mnist:

    # Fit the VAE on only the first `num_samples` MNIST training images;
    # the labels and the test split are loaded but not used here.
    num_samples = 10_000
    x_train, _, _, _ = load_mnist()
    autoencoder = train_mnist(
        x_train=x_train[:num_samples],
        learning_rate=LEARNING_RATE,
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
    )

In [4]:
if run_mnist:

    # save and re-load the VAE model
    # Round-trip check: write the `autoencoder` produced by the MNIST training
    # cell to `folder_path`, load it back into a second object via VAE.load,
    # and call summary() on the re-loaded copy.
    # NOTE: this cell needs `autoencoder` from the MNIST training cell, so that
    # cell must have been run first in the same kernel session.
    folder_path = "../trained_models/vae_model/"
    autoencoder.save(folder_path)
    autoencoder2 = VAE.load(folder_path)
    autoencoder2.summary()

### FSDD

In [6]:
if run_fsdd:

    # Fit the VAE on the FSDD spectrograms (audio data) found under
    # SPECTROGRAMS_PATH.
    x_train = load_fsdd(spectrograms_path=SPECTROGRAMS_PATH)
    autoencoder = train_fsdd(
        x_train=x_train,
        learning_rate=LEARNING_RATE,
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
    )

    # Persist the trained model once training has finished.
    save_path = "../trained_models/vae_model_fsdd/"
    autoencoder.save(save_path)

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, 256, 64, 1)] 0                                            
__________________________________________________________________________________________________
encoder_conv_layer_1 (Conv2D)   (None, 128, 32, 512) 5120        encoder_input[0][0]              
__________________________________________________________________________________________________
encoder_relu_1 (ReLU)           (None, 128, 32, 512) 0           encoder_conv_layer_1[0][0]       
__________________________________________________________________________________________________
encoder_bn_1 (BatchNormalizatio (None, 128, 32, 512) 2048        encoder_relu_1[0][0]             
____________________________________________________________________________________________

 448/3000 [===>..........................] - ETA: 14:18 - loss: 39149.2003 - _calculate_reconstruction_loss: 0.0385 - _calculate_KL_loss: 645.8477

KeyboardInterrupt: 