In [33]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
import gensim
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# header=None: no row is treated as column names, so the columns get integer
# labels (the later cells rely on selecting columns by those integers).
training_data = pd.read_csv("training_data.csv", header=None)
training_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Loaded the same way as the training split: header=None gives integer column
# labels, which the later cells use to select columns.
test_data = pd.read_csv("test_data.csv", header=None)
test_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Defining the auto-encoder

In [12]:
# Symmetric dense auto-encoder: 784 -> 512 -> 128 -> 10 -> 128 -> 512 -> 784.
autoencoder = tf.keras.models.Sequential()

# Encoder half. The last layer here is linear and explicitly named
# "bottleneck" so it can be looked up by name after training.
autoencoder.add(tf.keras.layers.Dense(512, activation='elu', input_shape=(784,)))
autoencoder.add(tf.keras.layers.Dense(128, activation='elu'))
autoencoder.add(tf.keras.layers.Dense(10, activation='linear', name="bottleneck"))

# Decoder half. The sigmoid output keeps every reconstructed value in (0, 1).
autoencoder.add(tf.keras.layers.Dense(128, activation='elu'))
autoencoder.add(tf.keras.layers.Dense(512, activation='elu'))
autoencoder.add(tf.keras.layers.Dense(784, activation='sigmoid'))

# Reconstruction error is mean squared error; Adam runs with its defaults.
autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')

## Preparing the dataset

In [21]:
def _scaled_features(frame):
    """Return every column from label 1 onwards as an array divided by 255.0.

    Column 0 is left out; later cells print it as the label of each row.
    NOTE(review): dividing by 255.0 presumably maps 8-bit pixel values into
    [0, 1] - confirm against the source data.
    """
    return frame.loc[:, 1:].values / 255.0  # scaling


x_train = _scaled_features(training_data)
x_test = _scaled_features(test_data)

In [23]:
# Train the network to reproduce its own input (the target equals the input).
# The test split is supplied as validation data, so its loss is reported
# after every epoch.
# Note: `fit` returns a Keras History object, so despite its name
# `trained_model` holds the training history; the fitted network is
# `autoencoder` itself.
trained_model = autoencoder.fit(
    x=x_train,
    y=x_train,
    validation_data=(x_test, x_test),
    epochs=32,
    batch_size=32,
    verbose=1,
)

Train on 60000 samples, validate on 10000 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


In [25]:
# Sub-model sharing the auto-encoder's layers from the input up to the layer
# named "bottleneck"; its predictions are the 10-value codes for each input.
bottleneck_output = autoencoder.get_layer('bottleneck').output
encoder = tf.keras.Model(inputs=autoencoder.input, outputs=bottleneck_output)

## Generating the gensim index

### Preparing data for gensim indices

In [34]:
# Run both splits through the bottleneck sub-model to get one code per row.
training_encodings = encoder.predict(x=x_train)
test_encodings = encoder.predict(x=x_test)

In [36]:
# Convert every encoding into a list of (feature_id, value) pairs, the
# sparse-vector form handed to gensim below. Training rows come first and
# test rows second, so an entry's position in `indices` is its row position
# in the combined training + test data.
indices = [
    list(enumerate(encoding))
    for encodings in (training_encodings, test_encodings)
    for encoding in encodings
]

### Generating index

In [42]:
# Build the similarity index over the encoded rows. num_features matches the
# 10 values in each encoding; "./" is used as the prefix for the index's
# shard files.
index = gensim.similarities.Similarity(
    output_prefix="./",
    corpus=indices,
    num_features=10,
)

## Testing

In [78]:
# Limit each query to the 25 most similar entries.
index.num_best = 25

# Row of the training split used as the query.
predict_item = 3141

# Encode that single row (reshaped to a batch of one), flatten the result to
# a 1-D vector and look it up; the result is a list of (id, similarity) pairs.
query_batch = x_train[predict_item].reshape(1, -1)
index[encoder.predict(query_batch).ravel()]

[(3141, 0.9999999403953552),
 (67988, 0.9761031866073608),
 (2651, 0.9722726941108704),
 (10509, 0.966676652431488),
 (11237, 0.9661428928375244),
 (5403, 0.9660782814025879),
 (44055, 0.965561032295227),
 (14979, 0.9634627103805542),
 (40733, 0.9623900651931763),
 (63619, 0.9616278409957886),
 (42431, 0.9606949090957642),
 (45263, 0.9595298171043396),
 (21155, 0.9591671824455261),
 (49689, 0.9584767818450928),
 (50939, 0.9584395289421082),
 (1485, 0.9576925039291382),
 (1413, 0.9574302434921265),
 (335, 0.9545435309410095),
 (68180, 0.9544398188591003),
 (2655, 0.9533775448799133),
 (28845, 0.9523711204528809),
 (5603, 0.9517109394073486),
 (19083, 0.9507284164428711),
 (259, 0.9505099058151245),
 (3669, 0.9503395557403564)]

In [79]:
# Stack the training and test frames in the same order used to build the
# similarity index. ignore_index=True relabels the rows 0..len-1 in one step,
# so row label i lines up with id i returned by the index.
df = pd.concat([training_data, test_data], ignore_index=True)

In [80]:
# Repeat the query for `predict_item` and print column 0 (the label) of every
# row the index returns, to check that the neighbours share the query's label.
query_vector = encoder.predict(x_train[predict_item].reshape(1, -1)).ravel()
for neighbour_id, _similarity in index[query_vector]:
    print(df.loc[neighbour_id, 0])


5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
