In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Open the training split; the handle is read in full by the next cell.
# NOTE(review): no encoding is passed, so the platform default is used.
train_file = open("../../data/iyer/train.txt")

In [3]:
# Read every line of the training file into memory, then release the file
# handle opened in the previous cell (it was previously left open for the
# whole kernel session).
# Re-running this cell on its own now fails loudly on the closed handle
# instead of silently producing an empty list from an exhausted file.
train_file_contents = train_file.readlines()
train_file.close()

In [4]:
# Number of raw lines read from the training file.
len(train_file_contents)

52997

In [5]:
# Keep only well-formed records (exactly five tab-separated fields) and collect
# the third field of each as the training text.
# Each line is split once and the result reused; the original split every
# kept line a second time just to index it.
train = []
for line in train_file_contents:
    items = line.split('\t')
    if len(items) == 5:
        train.append(items[2])

In [6]:
# Number of lines that had exactly five tab-separated fields and were kept.
len(train)

52795

In [7]:
# Character count of the first extracted training text.
len(train[0])

31

In [8]:
# Character count of the second extracted training text.
len(train[1])

23

In [9]:
import sys
# Make the project's src directory importable. The path is relative, so it is
# resolved against the notebook's current working directory.
sys.path.append("../../src")
# NOTE(review): wildcard import. The helpers used later (tokenize_texts,
# tokenized_texts_to_tensor, tensor_to_tokenized_texts) presumably come from
# this module; importing them by name would make that explicit.
from text_data_utils import *
# NOTE(review): `path` does not appear to be used in the visible cells.
from os import path
import gensim

In [10]:
# Tokenize every training text with the project helper.
# NOTE(review): the decoded samples printed later contain '<s>' / '</s>'
# markers, which are presumably added by this step; confirm in text_data_utils.
train_tok = tokenize_texts(train)

In [11]:
# Train a Word2Vec model on the tokenized training texts and keep only its
# word vectors (.wv). Every hyper-parameter is left at gensim's default.
# The same `wv` is reused later for the validation and test splits.
wv = gensim.models.Word2Vec(train_tok).wv

In [12]:
# Longest tokenized training text, measured in tokens.
max_len = max(map(len, train_tok))

In [13]:
# Convert the tokenized training texts into a single array using the word
# vectors and max_len.
# Presumably laid out as (samples, max_len, embedding size); see the shape
# displayed a few cells below.
train_tensor = tokenized_texts_to_tensor(train_tok, wv, max_len)

In [14]:
# Inspect the vector stored at position 0 of sample 0 (presumably the
# embedding of that sample's first token).
train_tensor[0, 0]

array([-1.73879492,  0.55274993, -0.35729647, -0.52002335,  0.12782197,
        0.11938381,  0.82348132, -0.48800877,  0.32269821,  1.74547958,
        0.73873854,  0.140288  , -0.392452  ,  0.30661142, -0.55629641,
       -0.16242556,  0.47705811, -1.7649827 , -1.06810749,  0.08043519,
       -0.70535427, -0.01237084,  0.68049383,  0.18081552,  0.86641955,
        0.84606814,  0.01800916, -0.29523399, -0.00870313, -1.0394522 ,
        0.2131716 ,  0.96355474,  0.01078537,  0.90590084,  1.18876171,
        0.46406162,  0.42419285, -0.53306198,  0.73095047,  0.67578936,
       -0.67718387, -0.19723158,  0.59144628, -0.40570411,  1.3554765 ,
        0.90622997, -0.66121393,  0.62829822,  0.83926123,  0.17268981,
        0.74130988,  0.64965093,  0.17344132, -0.24491772, -0.76218379,
        0.29207313, -0.70666963,  0.19077501,  0.70406359,  0.28889212,
       -0.45597586, -0.40071785,  0.1031318 ,  0.20170197, -0.08252058,
       -0.57074344, -0.10505958,  0.96575826,  0.81126976,  0.53

In [15]:
# Flatten the last two axes so every sample becomes one feature vector:
# (samples, a, b) -> (samples, a * b).
train_tensor_fl = train_tensor.reshape(
    train_tensor.shape[0],
    train_tensor.shape[1] * train_tensor.shape[2],
)

In [16]:
# Shape of the 3-D training tensor before flattening.
train_tensor.shape

(52795, 39, 100)

In [17]:
# Shape after flattening: one row per sample.
train_tensor_fl.shape

(52795, 3900)

In [18]:
# Load the validation split the same way as the training data: keep lines with
# exactly five tab-separated fields and take the third field as the text.
# `with` closes the file as soon as the lines are read; the original
# `open(...).readlines()` never closed the handle.
with open('../../data/iyer/valid.txt') as val_file:
    val_file_contents = val_file.readlines()
val = []
for line in val_file_contents:
    # Split once and reuse the result instead of splitting the line twice.
    items = line.split('\t')
    if len(items) == 5:
        val.append(items[2])
# Tokenize, then embed with the word vectors and max_len obtained from the
# training data so the result is compatible with the training tensor.
val_tok = tokenize_texts(val)
val_tensor = tokenized_texts_to_tensor(val_tok, wv, max_len)
# Flatten the last two axes into one feature vector per sample.
val_tensor_fl = np.reshape(val_tensor, (val_tensor.shape[0], val_tensor.shape[1] * val_tensor.shape[2]))

In [19]:
from mlp_vae import MLPVariationalAutoEncoder

In [20]:
# Latent size handed to MLPVariationalAutoEncoder as its second argument.
latent_dim = 128

In [21]:
# Build the autoencoder. The first argument is the width of one flattened
# sample and the second is the latent size.
# NOTE(review): [1024, 512] is presumably the list of hidden-layer sizes;
# confirm against mlp_vae.
model = MLPVariationalAutoEncoder(
    train_tensor_fl.shape[1],
    latent_dim,
    [1024, 512],
    final_activation='linear',
)
model.compile(loss='mean_squared_error', optimizer='adam')
# Autoencoder training: every sample serves as both the input and the target,
# for the training data and the validation data alike.
history = model.fit(
    train_tensor_fl,
    train_tensor_fl,
    validation_data=(val_tensor_fl, val_tensor_fl),
    shuffle=True,
    verbose=1,
    epochs=20,
    batch_size=256,
)

Train on 52795 samples, validate on 6599 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
import random

In [24]:
# Re-seed without an argument (no fixed seed), then pick one training sample
# index uniformly at random.
# NOTE(review): the sample shown in the saved output is therefore not
# reproducible across runs.
random.seed()
random_idx = random.randrange(len(train_tensor))

In [25]:
# Decode the chosen training sample back into tokens. The sample is wrapped in
# a one-element array before being handed to the helper, and [0] picks the
# single decoded text out of the result.
print("(Training Set) Input: ", tensor_to_tokenized_texts(np.array([train_tensor[random_idx]]), wv)[0])

(Training Set) Input:  ['<s>', 'How', 'can', 'I', 'display', 'multiple', 'images', 'in', 'a', 'loop', 'in', 'a', 'WP7', 'app', '?', '</s>']


In [26]:
# Run the flattened sample through the model as a batch of one, then reshape
# the prediction to (1, a, b) where a and b are the last two dimensions of
# train_tensor, undoing the flattening applied before training.
rec = np.reshape(model.predict(np.array([train_tensor_fl[random_idx]])), (1, train_tensor.shape[1], train_tensor.shape[2]))

In [27]:
# Decode the reconstruction into tokens so it can be compared with the input
# printed above.
print("(Training Set) Reconstructed: ", tensor_to_tokenized_texts(rec, wv)[0])

(Training Set) Reconstructed:  ['<s>', 'How', 'can', 'I', 'display', 'multiple', 'pdf', 'in', 'a', 'loop', 'in', 'a', 'timer', 'app', '?', '</s>']


In [28]:
# List the vocabulary entries closest to the reconstructed vector at token
# position 4.
# NOTE(review): the index 4 is hard-coded. It lines up with 'display' only for
# the sample shown in the saved output; since random_idx changes per run, a
# different sample may be shorter or have an unrelated token at this position.
wv.similar_by_vector(rec[0, 4])

[('display', 0.9453029632568359),
 ('add', 0.8929032683372498),
 ('show', 0.8895425796508789),
 ('change', 0.8889898657798767),
 ('retrieve', 0.8829615116119385),
 ('delete', 0.8801358342170715),
 ('print', 0.8584847450256348),
 ('move', 0.851596474647522),
 ('bind', 0.8342567682266235),
 ('hide', 0.8299027681350708)]

In [29]:
# Load the test split the same way as the training data: keep lines with
# exactly five tab-separated fields and take the third field as the text.
# `with` closes the file as soon as the lines are read; the original
# `open(...).readlines()` never closed the handle.
with open('../../data/iyer/test.txt') as test_file:
    test_file_contents = test_file.readlines()
test = []
for line in test_file_contents:
    # Split once and reuse the result instead of splitting the line twice.
    items = line.split('\t')
    if len(items) == 5:
        test.append(items[2])
# Tokenize, then embed with the word vectors and max_len obtained from the
# training data so the result is compatible with the trained model's input.
test_tok = tokenize_texts(test)
test_tensor = tokenized_texts_to_tensor(test_tok, wv, max_len)
# Flatten the last two axes into one feature vector per sample.
test_tensor_fl = np.reshape(test_tensor, (test_tensor.shape[0], test_tensor.shape[1] * test_tensor.shape[2]))

In [30]:
# Score the model on the test split, using the inputs as their own targets.
# The single number displayed is presumably the mean-squared-error loss set
# in compile().
model.evaluate(test_tensor_fl, test_tensor_fl, verbose=0)

0.047301981562573835

In [31]:
# Re-seed without an argument (no fixed seed), then pick one test sample index
# uniformly at random. This overwrites the index chosen earlier for the
# training set.
random.seed()
random_idx = random.randrange(len(test_tensor))

In [32]:
# Decode the chosen test sample back into tokens. The sample is wrapped in a
# one-element array before being handed to the helper, and [0] picks the
# single decoded text out of the result.
print("(Test Set) Input: ", tensor_to_tokenized_texts(np.array([test_tensor[random_idx]]), wv)[0])

(Test Set) Input:  ['<s>', 'How', 'to', 'call', 'a', 'JavaScript', 'function', 'multiple', 'times', 'in', 'a', 'loop', 'on', 'page', 'reload', 'with', 'ASP', '.', 'NET', '</s>']


In [33]:
# Reconstruct the chosen test sample and print it as tokens:
#   1. feed the flattened sample to the model as a batch of one,
#   2. reshape the prediction to (1, a, b) using test_tensor's last two sizes,
#   3. decode with the word vectors and take the single resulting text.
print(
    "(Test Set) Reconstructed: ",
    tensor_to_tokenized_texts(
        np.reshape(
            model.predict(np.array([test_tensor_fl[random_idx]])),
            (1, test_tensor.shape[1], test_tensor.shape[2]),
        ),
        wv,
    )[0],
)

(Test Set) Reconstructed:  ['<s>', 'How', 'to', 'call', 'a', 'Direct', 'function', 'multiple', 'Criteria', 'in', 'a', 'dictionary', 'on', 'repeater', 'SqlDataSource', 'in', '.', 'Asp', 'net', '</s>']
