In [1]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
import gensim
import matplotlib.pyplot as plt

%matplotlib inline

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the MNIST training split. The CSV is read without a header row, so
# pandas labels the columns with integers.
training_data = pd.read_csv(
    "../datasets/mnist/training_data.csv",
    header=None,
)
training_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Load the MNIST test split. The CSV is read without a header row, so
# pandas labels the columns with integers.
test_data = pd.read_csv(
    "../datasets/mnist/test_data.csv",
    header=None,
)
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Defining the autoencoder

In [4]:
# Symmetric dense autoencoder: 784 -> 512 -> 128 -> 10 -> 128 -> 512 -> 784.
# The 10-unit linear layer is named "bottleneck" so it can be fetched by name.
autoencoder = tf.keras.models.Sequential()
autoencoder.add(tf.keras.layers.Dense(512, activation='elu', input_shape=(784,)))
autoencoder.add(tf.keras.layers.Dense(128, activation='elu'))
autoencoder.add(tf.keras.layers.Dense(10, activation='linear', name="bottleneck"))
autoencoder.add(tf.keras.layers.Dense(128, activation='elu'))
autoencoder.add(tf.keras.layers.Dense(512, activation='elu'))
autoencoder.add(tf.keras.layers.Dense(784, activation='sigmoid'))

# Optimise a mean-squared-error loss with Adam at its default settings.
autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


## Preparing the dataset

In [5]:
# Take every column from label 1 onward as features (column 0 is left out;
# it appears to hold the digit label) and scale the values by 1/255.
x_train, x_test = (
    frame.loc[:, 1:].values / 255.0
    for frame in (training_data, test_data)
)

In [6]:
# Fit the autoencoder to reconstruct its own input: the same array is passed
# as both input and target, and the test split is used as validation data.
# NOTE(review): Keras documents `fit` as returning a training-history object
# rather than a model, so the name `trained_model` may mislead -- confirm.
trained_model = autoencoder.fit(
    x=x_train,
    y=x_train,
    batch_size=32,
    epochs=32,
    verbose=1,
    validation_data=(x_test, x_test),
)

Train on 60000 samples, validate on 10000 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


In [7]:
# Model that maps the autoencoder's input to the output of the layer named
# 'bottleneck', i.e. the 10-value encoding of each image.
encoder = tf.keras.Model(
    inputs=autoencoder.input,
    outputs=autoencoder.get_layer('bottleneck').output,
)

## Generating the gensim index

### Preparing data for gensim indices

In [8]:
# Encode both splits with the bottleneck model, training split first.
training_encodings, test_encodings = map(encoder.predict, (x_train, x_test))

In [9]:
# Convert every encoding into a list of (feature_id, value) pairs, which is
# the document form passed to the Gensim index below. Training encodings come
# first and test encodings second, so a document's position in `indices`
# matches its row in the concatenated training + test data.
indices = [
    list(enumerate(encoding))
    for encodings in (training_encodings, test_encodings)
    for encoding in encodings
]

### Generating index

In [10]:
# Build the Gensim similarity index over all encodings. "index" is the
# output prefix Gensim uses for the index's on-disk files. The feature count
# is taken from the encodings themselves so it always matches the width of
# the bottleneck layer instead of repeating the literal 10.
index = gensim.similarities.Similarity(
    output_prefix="index",
    corpus=indices,
    num_features=training_encodings.shape[1],
)

## Testing

In [11]:
# Query the index with the encoding of one training image and display the
# 25 best matches as (position in index, similarity) pairs.
predict_item = 7
index.num_best = 25
# Slicing keeps a leading batch axis of length 1 for `predict`; `ravel`
# flattens the single returned encoding back to 1-D for the index lookup.
index[encoder.predict(x_train[predict_item:predict_item + 1]).ravel()]

[(7, 1.0),
 (5097, 0.9856460690498352),
 (10483, 0.9771161079406738),
 (43563, 0.97706139087677),
 (5001, 0.9730933904647827),
 (12483, 0.9673655033111572),
 (42073, 0.9663132429122925),
 (66704, 0.9656130075454712),
 (361, 0.964120090007782),
 (53943, 0.9617356657981873),
 (65212, 0.9614316821098328),
 (66184, 0.9603543877601624),
 (69409, 0.9592058062553406),
 (44280, 0.957703173160553),
 (68970, 0.9569904804229736),
 (17253, 0.9562186002731323),
 (12545, 0.9562076926231384),
 (55579, 0.9554868936538696),
 (44349, 0.9546490907669067),
 (10493, 0.95005863904953),
 (49193, 0.950040876865387),
 (38245, 0.9493242502212524),
 (17341, 0.9491550922393799),
 (55509, 0.9490797519683838),
 (65168, 0.9488236308097839)]

In [12]:
# Stack the training and test frames in the same order their encodings were
# added to the index, renumbering the rows 0..n-1 so that a position returned
# by the index can be used directly as a row label.
df = pd.concat([training_data, test_data], ignore_index=True)

In [13]:
# Print column 0 (which appears to hold the digit label) for every match
# returned for the query image; each match is an (index position, similarity) pair.
for neighbour_id, _similarity in index[encoder.predict(x_train[predict_item].reshape(1,-1)).ravel()]:
    print(df.loc[neighbour_id, 0])

3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
