# Dataset Preparation

In [None]:
import numpy as np
import pandas as pd
from keras.datasets import cifar10
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Reshape, Dense, Flatten
from keras.models import Model
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.utils import model_to_dot
from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
import umap

%load_ext tensorboard

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

x_train = np.reshape(x_train, (len(x_train), 32, 32, 3))
x_test = np.reshape(x_test, (len(x_test), 32, 32, 3))

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

x_train = x_train / 255.0
x_test = x_test / 255.0

# Building the Autoencoder

In [None]:
# Convolutional autoencoder for 32x32 RGB images with a 10-dimensional code.
input_img = Input(shape=(32, 32, 3), name='input')

# Encoder: two conv + 2x2 max-pool stages (32x32 -> 16x16 -> 8x8), one more
# conv, then flatten (8 * 8 * 64 = 4096 values) and project linearly to the
# 10-d bottleneck (`dense_1` has no activation).
encoded = Conv2D(32, (3, 3), activation='relu', padding='same', name='conv2d_1')(input_img)
encoded = MaxPooling2D((2, 2), name='max_pooling2d_1')(encoded)
encoded = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv2d_2')(encoded)
encoded = MaxPooling2D((2, 2), name='max_pooling2d_2')(encoded)
encoded = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv2d_3')(encoded)
encoded = Flatten(name='flatten')(encoded)
encoded = Dense(10, name='dense_1')(encoded)

# Decoder: expand the code back to 4096 values, reshape to 8x8x64 and mirror
# the encoder with conv + 2x upsampling stages (8x8 -> 16x16 -> 32x32). The
# final sigmoid keeps the 3 output channels in [0, 1].
decoded = Dense(4096, activation='relu', name='dense_2')(encoded)
decoded = Reshape((8, 8, 64), name='reshape')(decoded)
decoded = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv2d_4')(decoded)
decoded = UpSampling2D((2, 2), name='up_sampling2d_1')(decoded)
decoded = Conv2D(32, (3, 3), activation='relu', padding='same', name='conv2d_5')(decoded)
decoded = UpSampling2D((2, 2), name='up_sampling2d_2')(decoded)
decoded = Conv2D(3, (3, 3), activation='sigmoid', padding='same', name='conv2d_6')(decoded)

model = Model(input_img, decoded)
# No 'accuracy' metric: the targets are continuous pixel intensities, so a
# classification accuracy says nothing about reconstruction quality. The MSE
# loss (and val_loss) is the number to monitor.
# NOTE(review): the string 'adadelta' uses the library's default learning
# rate -- confirm it is large enough to converge with the installed Keras.
model.compile(optimizer='adadelta', loss='mse')

# Encoder-only view of the same graph; it reuses the layers of `model`, so
# training `model` also trains the encoder.
encoder = Model(input_img, encoded)

plot_model(model, to_file='model.png', show_shapes=True)

# Training the Autoencoder

In [None]:
!rm -rf ./logs/

# model.load_weights(checkpoint_filepath)

tensorboard_callback = TensorBoard(log_dir='logs/fit/', histogram_freq=1)
model_checkpoint_callback = ModelCheckpoint("model.hdf5",
                                            monitor='val_loss',
                                            save_best_only=True,
                                            mode='auto')

model.fit(x_train, x_train,
          batch_size=200,
          epochs=100,
          verbose=1,
          callbacks=[tensorboard_callback, model_checkpoint_callback],
          validation_data=(x_test, x_test),
          shuffle=True)

encoded_imgs = encoder.predict(x_test)
decoded_imgs = model.predict(x_test)

%tensorboard --logdir=logs/fit

# Sanity Check

Visualizing Reconstruction Results

In [None]:
# Top row: the first n test images; bottom row: their reconstructions.
n = 10
fig, axes = plt.subplots(2, n, figsize=(20, 4))
for col in range(n):
  panels = (
      (axes[0, col], x_test[col]),
      (axes[1, col], decoded_imgs[col]),
  )
  for ax, image in panels:
    ax.imshow(image.reshape(32, 32, 3))
    # Hide ticks and axis labels; only the pictures matter here.
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
plt.show()

Distribution Analysis

In [None]:
# Pairwise scatter plots of the latent dimensions for a random subset of 50
# encoded test images. The fixed random_state keeps the subset, and therefore
# the figure, identical on every run of the notebook.
df = pd.DataFrame(data=encoded_imgs).sample(n=50, random_state=42)
sns.pairplot(df)

UMAP

In [None]:
# `umap.plot` is a separate submodule that a plain `import umap` does not
# load, so it must be imported explicitly before `umap.plot.points` exists.
# It is imported in this cell so that a missing plotting dependency only
# affects this cell.
import umap.plot

# Flatten the label array to 1-D so it can be used as one label per point.
y_test = np.reshape(y_test, len(y_test))

# Fit a UMAP embedding of the latent codes and colour the points by class.
# The fixed seed makes the embedding reproducible between runs.
mapper = umap.UMAP(random_state=42).fit(encoded_imgs)
umap.plot.points(mapper, labels=y_test)

# Data Querying

Euclidean Distance

In [None]:
# Retrieve the test images whose latent codes are closest to the query's
# code under the Euclidean metric.
n = 10  # number of neighbours to fetch and of columns to draw

query = x_test[10]
query_encoded_img = encoder.predict(np.expand_dims(query, axis=0))

# The index is built on the encodings of the whole test set. The query is
# itself one of those images, so it is expected to show up as the closest
# hit.
nbrs = NearestNeighbors(n_neighbors=n, metric='euclidean')
nbrs.fit(encoded_imgs)
distances, indices = nbrs.kneighbors(query_encoded_img)
# kneighbors returns one row per query; row 0 belongs to our single query.
closest_image = x_test[indices[0]]

# Top row repeats the query; bottom row shows the hits with their distances.
fig, (query_row, match_row) = plt.subplots(2, n, figsize=(20, 4))
for rank in range(n):
  query_row[rank].imshow(query)
  match_row[rank].imshow(closest_image[rank])
  match_row[rank].set_title("Distance: %.3f" % distances[0][rank])
  for ax in (query_row[rank], match_row[rank]):
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
plt.show()

Manhattan Distance

In [None]:
# Retrieve the test images whose latent codes are closest to the query's
# code under the Manhattan (L1) metric.
n = 10  # number of neighbours to fetch and of columns to draw

query = x_test[10]
query_encoded_img = encoder.predict(np.expand_dims(query, axis=0))

# The index is built on the encodings of the whole test set. The query is
# itself one of those images, so it is expected to show up as the closest
# hit.
nbrs = NearestNeighbors(n_neighbors=n, metric='manhattan')
nbrs.fit(encoded_imgs)
distances, indices = nbrs.kneighbors(query_encoded_img)
# kneighbors returns one row per query; row 0 belongs to our single query.
closest_image = x_test[indices[0]]

# Top row repeats the query; bottom row shows the hits with their distances.
fig, (query_row, match_row) = plt.subplots(2, n, figsize=(20, 4))
for rank in range(n):
  query_row[rank].imshow(query)
  match_row[rank].imshow(closest_image[rank])
  match_row[rank].set_title("Distance: %.3f" % distances[0][rank])
  for ax in (query_row[rank], match_row[rank]):
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
plt.show()

Chebyshev Distance

In [None]:
# Retrieve the test images whose latent codes are closest to the query's
# code under the Chebyshev (maximum coordinate difference) metric.
n = 10  # number of neighbours to fetch and of columns to draw

query = x_test[10]
query_encoded_img = encoder.predict(np.expand_dims(query, axis=0))

# The index is built on the encodings of the whole test set. The query is
# itself one of those images, so it is expected to show up as the closest
# hit.
nbrs = NearestNeighbors(n_neighbors=n, metric='chebyshev')
nbrs.fit(encoded_imgs)
distances, indices = nbrs.kneighbors(query_encoded_img)
# kneighbors returns one row per query; row 0 belongs to our single query.
closest_image = x_test[indices[0]]

# Top row repeats the query; bottom row shows the hits with their distances.
fig, (query_row, match_row) = plt.subplots(2, n, figsize=(20, 4))
for rank in range(n):
  query_row[rank].imshow(query)
  match_row[rank].imshow(closest_image[rank])
  match_row[rank].set_title("Distance: %.3f" % distances[0][rank])
  for ax in (query_row[rank], match_row[rank]):
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
plt.show()