In [1]:
import os
from os import path
from urllib.request import urlretrieve
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import fashion_mnist

## I - Load and preprocess data

In [2]:
# Fetch the Fashion-MNIST train/test splits through the Keras dataset helper.
(X_tr, y_tr), (X_te, y_te) = fashion_mnist.load_data()

# Human-readable class names: position i holds the name of integer label i.
labels = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

In [3]:
# Show five randomly drawn training images side by side, each titled with its class name.
n_cols = 5
plt.figure(figsize=(10, 2))
for i in range(n_cols):
    index = np.random.randint(0, X_tr.shape[0])  # fresh random pick for every panel
    sample, target = X_tr[index], y_tr[index]
    plt.subplot(1, n_cols, i + 1)
    plt.imshow(sample, cmap="gray")
    plt.title(labels[target])
    plt.axis("off")
plt.show()

In [4]:
def min_max_scaling(X):
    """Rescale every feature of ``X`` to the [0, 1] range.

    The minimum and maximum are taken along axis 0 (across samples), so each
    remaining position (e.g. each pixel) is scaled independently.

    A feature that is constant across all samples has ``max == min``; the plain
    formula would then compute 0 / 0 and produce NaN. Such features are mapped
    to 0 instead.

    Parameters
    ----------
    X : array-like
        Data whose first axis indexes the samples.

    Returns
    -------
    numpy.ndarray
        Scaled array with the same shape as ``X``.
    """
    X = np.asarray(X)
    minx, maxx = np.min(X, axis=0), np.max(X, axis=0)
    span = maxx - minx
    # Where the range is zero the numerator is zero as well, so dividing by 1
    # yields 0 rather than NaN.
    safe_span = np.where(span == 0, 1, span)
    return (X - minx) / safe_span

In [5]:
# Bring both splits into [0, 1]; each split is scaled with its own per-position min/max.
X_tr = min_max_scaling(X_tr)
X_te = min_max_scaling(X_te)

## II - Simple Autoencoder

We are going to start in this part by defining a simple autoencoder:
- the encoder has two Dense layers projecting into a latent dimension of fixed size (here we are going to choose latent_dim=2 for simplicity)
- the decoder also has two Dense layers and tries to reconstruct the original images

<img src="../images/autoencoder.png" width="500px" />

In [6]:
from keras.layers import Input, Dense
from keras import Model

class Autoencoder():
    """Exercise skeleton of a fully connected autoencoder for 28x28 images.

    The ``design_and_compile_*`` methods contain ``TODO`` placeholders. While
    those placeholders are ``None`` no Keras model is created, ``self.model``
    stays ``None`` and the training / plotting helpers only print a message.
    """

    def __init__(self, X_tr, X_te, y_tr, y_te, labels, latent_dim=2, intermediate_dim=256, flatten=True):
        """Store the data and the hyperparameters.

        Parameters
        ----------
        X_tr, X_te : numpy.ndarray
            Train / test images, first axis indexing the samples.
        y_tr, y_te : array-like
            Integer class labels matching ``X_tr`` / ``X_te``.
        labels : list of str
            Class names used for the colorbar of the latent-space plot.
        latent_dim : int
            Size of the latent representation.
        intermediate_dim : int
            Size of the hidden Dense layers.
        flatten : bool
            If True, every image is reshaped into a 1-D vector.
        """
        # Data
        if flatten:
            self.X_tr = self.flatten_image(X_tr)
            self.X_te = self.flatten_image(X_te)
        else:
            self.X_tr = X_tr
            self.X_te = X_te
        self.y_tr = y_tr
        self.y_te = y_te
        self.labels = labels
        # Hyperparameters
        # Size of axis 1 of the stored data, i.e. the vector length when flatten=True.
        self.original_dim = self.X_tr.shape[1]
        self.intermediate_dim = intermediate_dim
        self.latent_dim = latent_dim
        # Models
        self.encoder = None
        self.decoder = None
        self.model = None

    @staticmethod
    def _all_defined(*tensors):
        """Return True when none of the given placeholders is still None.

        An identity check is used on purpose: asking a symbolic Keras tensor
        for its truth value (as ``all([...])`` does) raises a TypeError.
        """
        return all(tensor is not None for tensor in tensors)

    def flatten_image(self, X):
        """Reshape ``X`` from (n_samples, ...) to (n_samples, n_features)."""
        return X.reshape((X.shape[0], np.prod(X.shape[1:])))

    def design_and_compile_encoder(self):
        """Build the encoder.

        Two Dense layers with relu activation.
        They respectively project into intermediate_dim and latent_dim dimensions.

        Returns the encoder Model, or None while the placeholders are unset.
        """
        # TODO:
        x = None
        hidden = None
        latent = None
        if self._all_defined(x, latent):
            return Model(
                inputs=x,
                outputs=latent,
                name="mlp_encoder"
            )
        return None

    def design_and_compile_decoder(self):
        """Build the decoder.

        Two Dense layers, the first has a relu activation and the last a sigmoid activation.
        Sigmoid activation can indeed be used in this case because our images have been
        min-max scaled previously.
        They respectively project into intermediate_dim and original_dim dimensions.

        Returns the decoder Model, or None while the placeholders are unset.
        """
        # TODO:
        latent = None
        hidden = None
        x = None
        # Check the two tensors that are actually handed to Model.
        if self._all_defined(latent, x):
            return Model(
                inputs=latent,
                outputs=x,
                name="mlp_decoder"
            )
        return None

    def design_and_compile_full_model(self):
        """Assemble and compile the end-to-end autoencoder.

        Define and compile the encoder/decoder models by calling the previous methods.
        Store them into self.encoder and self.decoder.
        You can now define your final model in self.model.
        """
        # TODO:
        self.encoder = None
        self.decoder = None

        x = None
        z = None
        x_decoded = None
        if self._all_defined(x, x_decoded):
            self.model = Model(x, x_decoded)
            self.model.compile(optimizer='adam', loss='mse')

    def model_summary(self):
        """Print the Keras summary of the full model, if it has been built."""
        if self.model is not None:
            self.model.summary()

    def train(self, epochs=5, batch_size=100):
        """Fit the model to reconstruct its own input, validating on the test set."""
        if self.model is None:
            print("The model has not been designed yet!")
            return None
        # The fit call must live outside the guard above, otherwise it sits
        # after the return statement and training never happens.
        self.model.fit(
            self.X_tr, self.X_tr,
            epochs=epochs, batch_size=batch_size,
            validation_data=(self.X_te, self.X_te)
        )

    def plot_x_test_decoded_i(self, index):
        """Show test image ``index`` next to its reconstruction."""
        if self.model is None:
            print("The model has not been trained yet!")
            return None
        plt.subplot(1, 2, 1)
        plt.imshow(self.X_te[index].reshape(28, 28), cmap=plt.cm.gray)
        plt.title("Real image")
        # The models expect a batch axis, hence the expand_dims.
        z = self.encoder.predict(np.expand_dims(self.X_te[index], axis=0))
        decoded_image = self.decoder.predict(z)
        plt.subplot(1, 2, 2)
        plt.imshow(decoded_image.reshape(28, 28), cmap=plt.cm.gray)
        plt.title("Reconstructed image")
        plt.axis('off');

    def plot_latent_space(self):
        """Scatter the first two latent coordinates of the test set, coloured by class."""
        if self.model is None:
            print("The model has not been trained yet!")
            return None
        Z_te = self.encoder.predict(self.X_te, batch_size=100)
        # Encoders with several outputs return a list; keep the first output.
        if isinstance(Z_te, list):
            Z_te = Z_te[0]
        plt.figure(figsize=(7, 6))
        plt.scatter(Z_te[:, 0], Z_te[:, 1], c=self.y_te,
                    cmap=plt.cm.tab10)
        cb = plt.colorbar()
        cb.set_ticks(list(range(len(self.labels))))
        cb.set_ticklabels(self.labels)
        cb.update_ticks()
        plt.show()

In [7]:
# Build, summarise and train (2 epochs) the dense autoencoder; images are flattened by default.
ae = Autoencoder(X_tr, X_te, y_tr, y_te, labels)
ae.design_and_compile_full_model()
ae.model_summary()
ae.train(epochs=2)

In [8]:
# Pick a random test image and display it next to its reconstruction.
index = np.random.randint(0, ae.X_te.shape[0])

ae.plot_x_test_decoded_i(index)

In [9]:
# Scatter the encoded test set in the latent space, coloured by class label.
ae.plot_latent_space()

## III - Fully Connected VAE

In this part we are going to improve our simple autoencoder model by adding Gaussian noise to the latent representation

The objective is to estimate both $q_{\phi}(z | x)$ and $p_{\theta}(x | z)$

<img src="../images/vae_horiz.png" width="700px" />

We can use the reparametrization trick to ensure that our samples are deterministically dependent on the parameters of the distribution

Instead of sampling $z$ as below:
$$ z \sim \mathcal{N}(\mu(x), \sigma(x)) $$

We can do the following:
$$ z = \mu(x) + \sigma(x) \cdot \epsilon$$
with:
$$ \epsilon \sim \mathcal{N}(0, 1) $$

In practice the encoder actually parametrizes $\log(\sigma^2(x))$ and not $\sigma(x)$ directly<br/>
We take the exponential of $\frac{1}{2}\log(\sigma^2(x))$ afterwards in the sampler layer, which ensures the positivity of the final $\sigma$ that we are trying to learn

In [10]:
from keras.layers import Lambda
from keras import backend as K
from keras import metrics

class VariationalAutoencoder(Autoencoder):
    """Exercise skeleton of a fully connected variational autoencoder.

    Adds a sampler model between encoder and decoder and trains with the
    negative ELBO. As long as the ``TODO`` placeholders are ``None`` no model
    is built and ``self.model`` stays ``None``.
    """

    def __init__(self, X_tr, X_te, y_tr, y_te, labels, latent_dim=2, intermediate_dim=256, flatten=True):
        """Forward every argument to ``Autoencoder`` and add the sampler slot."""
        # Autoencoder class initialization
        super(VariationalAutoencoder, self).__init__(
            X_tr, X_te, y_tr, y_te, labels, latent_dim, intermediate_dim, flatten
        )
        # Models
        self.sampler = None

    def design_and_compile_encoder(self):
        """Build the encoder.

        The first Dense layer is identical to the standard autoencoder.
        However, it is now mandatory to have two parallel Dense layers projecting into
        self.latent_dim. They are respectively estimating z_mean and z_log_var.

        Returns the encoder Model, or None while the placeholders are unset.
        """
        # TODO:
        x = None
        hidden = None
        z_mean = None
        z_log_var = None
        # Identity checks: the truth value of a symbolic Keras tensor cannot be evaluated.
        if x is not None and z_mean is not None and z_log_var is not None:
            return Model(
                inputs=x,
                outputs=[z_mean, z_log_var],
                name="mlp_encoder"
            )
        return None

    def design_and_compile_sampler(self):
        """Build the sampler.

        Now we can use the estimated z_mean and z_log_var to sample z using a gaussian
        distribution. Define your inputs, wrap the sampling function in a Lambda layer,
        and return a Model Keras Object that outputs the stochastic latent variable z.

        Returns the sampler Model, or None while the placeholders are unset.
        """
        def _sampling(inputs):
            # Reparametrization trick: z = mean + sigma * epsilon, with
            # sigma = exp(log_var / 2) and epsilon drawn from N(0, 1).
            z_mean, z_log_var = inputs
            batch_size = K.shape(z_mean)[0]
            epsilon = K.random_normal(shape=(batch_size, self.latent_dim),
                                      mean=0., stddev=1.)
            return z_mean + K.exp(z_log_var / 2) * epsilon

        # TODO:
        z_mean = None
        z_log_var = None
        z = None
        if z_mean is not None and z_log_var is not None and z is not None:
            return Model(
                inputs=[z_mean, z_log_var],
                outputs=z,
                name="mlp_sampler"
            )
        return None

    def design_and_compile_decoder(self):
        """Build the decoder.

        The decoder is actually the same as the previous part in the standard
        autoencoder model.

        Returns the decoder Model, or None while the placeholders are unset.
        """
        # TODO:
        latent = None
        hidden = None
        x = None
        if latent is not None and x is not None:
            return Model(
                inputs=latent,
                outputs=x,
                name="mlp_decoder"
            )
        return None

    def design_and_compile_full_model(self):
        """Assemble and compile the end-to-end VAE.

        Now you can use the three models that you have defined and compiled,
        store them into self.encoder, self.sampler and self.decoder.
        Your final model will be defined in self.model afterward.
        The used loss function is the negative ELBO.
        """
        # TODO:
        self.encoder = None
        self.sampler = None
        self.decoder = None

        x = None
        z_mean, z_log_var = None, None
        z = None
        x_decoded_mean = None

        if x is not None and x_decoded_mean is not None:
            self.model = Model(x, x_decoded_mean)

            # Both tensors are flattened across the whole batch, so the
            # cross-entropy is a mean over every element. Multiplying by the
            # number of elements per sample turns it into a per-sample sum.
            # That count equals self.original_dim for flattened inputs, but
            # self.original_dim is only the size of axis 1 when the images are
            # kept unflattened, hence the explicit product.
            n_elements = int(np.prod(self.X_tr.shape[1:]))
            xent_loss = n_elements * metrics.binary_crossentropy(
                K.flatten(x), K.flatten(x_decoded_mean)
            )
            kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
            vae_loss = K.mean(xent_loss + kl_loss)

            self.model.add_loss(vae_loss)
            self.model.compile(optimizer='adam')

    def train(self, epochs=5, batch_size=100):
        """Fit the VAE; no targets are passed because the loss is attached via add_loss."""
        if self.model is None:
            print("The model has not been designed yet!")
            return None
        self.model.fit(
            self.X_tr,
            epochs=epochs, batch_size=batch_size,
            validation_data=(self.X_te, None)
        )

    def plot_x_test_decoded_i(self, index):
        """Show test image ``index`` next to one sampled reconstruction."""
        if self.model is None:
            print("The model has not been trained yet!")
            return None
        plt.subplot(1, 2, 1)
        plt.imshow(self.X_te[index].reshape(28, 28), cmap=plt.cm.gray)
        plt.title("Real image")
        # Encode, draw one latent sample, then decode it.
        z_mean, z_log_var = self.encoder.predict(np.expand_dims(self.X_te[index], axis=0))
        z = self.sampler.predict([z_mean, z_log_var])
        decoded_image = self.decoder.predict(z)
        plt.subplot(1, 2, 2)
        plt.imshow(decoded_image.reshape(28, 28), cmap=plt.cm.gray)
        plt.title("Reconstructed image")
        plt.axis('off');

    def plot_sampled_prediction(self):
        """Decode one latent vector drawn from a standard normal and display it."""
        if self.model is None:
            print("The model has not been trained yet!")
            return None
        random_z_from_prior = np.random.normal(size=(1, self.latent_dim))
        generated = self.decoder.predict(random_z_from_prior)
        plt.imshow(generated.reshape(28, 28), cmap=plt.cm.gray)
        plt.axis('off');

In [11]:
# Build, summarise and train (2 epochs) the fully connected VAE on the flattened images.
vae = VariationalAutoencoder(X_tr, X_te, y_tr, y_te, labels)
vae.design_and_compile_full_model()
vae.model_summary()
vae.train(epochs=2)

In [12]:
# Decode a latent vector drawn from a standard normal and show the generated image.
vae.plot_sampled_prediction()

In [13]:
# Pick a random test image and display it next to its VAE reconstruction.
index = np.random.randint(0, vae.X_te.shape[0])

vae.plot_x_test_decoded_i(index)

In [14]:
# Scatter the encoder's first output for the test set, coloured by class label.
vae.plot_latent_space()

## IV - Convolutional VAE

In this final part, we are going to enrich the previous encoder and decoder with convolutional layers

This allows us not to destroy the precious spatial representation in our images and extract local patterns that can be encoded

In [15]:
from keras.layers import Flatten, Conv2D, BatchNormalization, Reshape, Conv2DTranspose

class ConvolutionalVariationalAutoencoder(VariationalAutoencoder):
    """Exercise skeleton of a VAE whose encoder and decoder are convolutional.

    Only the encoder and decoder builders are overridden; everything else is
    inherited from ``VariationalAutoencoder``. While the placeholders are
    ``None`` the builders return ``None``.
    """

    def __init__(self, X_tr, X_te, y_tr, y_te, labels,
                 latent_dim=2, intermediate_dim=128,
                 filters=32, kernel_size=3, spatial_size=7,
                 flatten=False):
        """Append a trailing channel axis to the images and store the conv hyperparameters.

        Parameters
        ----------
        filters : int
            Number of convolution filters.
        kernel_size : int
            Size of the convolution kernels.
        spatial_size : int
            Side length of the feature map the decoder starts from.

        The remaining arguments are forwarded unchanged to the parent class.
        """
        X_tr = np.expand_dims(X_tr, axis=-1)
        X_te = np.expand_dims(X_te, axis=-1)
        # VAE class initialization
        super(ConvolutionalVariationalAutoencoder, self).__init__(
            X_tr, X_te, y_tr, y_te, labels, latent_dim, intermediate_dim, flatten
        )
        # Hyperparameters
        self.filters = filters
        self.kernel_size = kernel_size
        self.spatial_size = spatial_size
        # Models
        self.sampler = None

    def design_and_compile_encoder(self):
        """Build the convolutional encoder.

        Use series of Conv2D layers with strides and batch normalization, and then
        continue with a Flatten layer.
        Add Dense layers afterwards to estimate z_mean and z_log_var.

        Returns the encoder Model, or None while the placeholders are unset.
        """
        # TODO:
        x = None
        x_conv = None
        x_flat = None
        hidden = None
        z_mean = None
        z_log_var = None
        # Identity checks: the truth value of a symbolic Keras tensor cannot be evaluated.
        if x is not None and z_mean is not None and z_log_var is not None:
            return Model(
                inputs=x,
                outputs=[z_mean, z_log_var],
                name="convolutional_encoder"
            )
        return None

    def design_and_compile_decoder(self):
        """Build the convolutional decoder.

        Here we need a transformation going in the opposite direction of a normal
        Conv2D layer: Conv2DTranspose.
        It starts from the latent space to upsample to the image original dimension.
        Start with two Dense layers into a hidden dimension of
        (self.filters * self.spatial_size * self.spatial_size), then use a Reshape layer
        to convert it to a 3D tensor: it will be your starting point for deconvolution layers.
        You can now use series of Conv2DTranspose layers with strides and batch normalization.
        Conclude your model with a standard Conv2D layer with sigmoid activation, and make
        sure you end up with the same dimensions as your original image.

        Returns the decoder Model, or None while the placeholders are unset.
        """
        # TODO:
        decoder_input = None
        x = None
        if decoder_input is not None and x is not None:
            return Model(decoder_input, x, name='convolutional_decoder')
        return None

In [16]:
# Same pipeline for the convolutional VAE; its constructor keeps the images unflattened
# and appends a trailing channel axis.
cvae = ConvolutionalVariationalAutoencoder(X_tr, X_te, y_tr, y_te, labels)
cvae.design_and_compile_full_model()
cvae.model_summary()
cvae.train(epochs=2)

In [17]:
# Decode a latent vector drawn from a standard normal and show the generated image.
cvae.plot_sampled_prediction()

In [18]:
# Pick a random test image and display it next to its reconstruction.
index = np.random.randint(0, cvae.X_te.shape[0])

cvae.plot_x_test_decoded_i(index)

In [19]:
# Scatter the encoder's first output for the test set, coloured by class label.
cvae.plot_latent_space()