In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Open the training split for reading (text mode, platform-default encoding).
train_file = open("../../data/iyer/train.txt")

In [3]:
# Read every line (newline terminators kept), then close the handle that was
# opened earlier so the file descriptor is not held for the kernel's lifetime.
with train_file:
    train_file_contents = train_file.readlines()

In [4]:
# Number of raw lines read from the training file.
len(train_file_contents)

52997

In [5]:
# Keep only well-formed rows (exactly five tab-separated fields) and use the
# third field as the text (presumably the post title — confirm against the
# dataset), tagged with an end-of-sequence marker token.
train = []
for line in train_file_contents:
    items = line.split('\t')
    if len(items) == 5:
        # Reuse the fields split above instead of splitting the line a second time.
        train.append(items[2] + " ENDOFEXAMPLE")

In [6]:
# Number of rows that passed the five-field filter.
len(train)

52795

In [7]:
# Character length of the first training string (end marker included).
len(train[0])

44

In [8]:
# Character length of the second training string (end marker included).
len(train[1])

36

In [9]:
from os import path
from nltk.tokenize import word_tokenize
import gensim

In [10]:
# Tokenise every training string; the result is one token list per example.
train_tok = list(map(word_tokenize, train))

In [11]:
# Train Word2Vec on the tokenised training texts with the library's default
# settings and keep only its `.wv` word-vector lookup.
# NOTE(review): no seed is passed, so the embeddings presumably differ between
# runs — confirm whether reproducibility matters here.
wv = gensim.models.Word2Vec(train_tok).wv

In [12]:
def texts_to_tensors(wv, texts, max_len):
    """Embed tokenised texts into a zero-padded array.

    Args:
        wv: Word-vector lookup supporting ``token in wv``, ``wv[token]`` and
            ``wv.vector_size``.
        texts: Sized sequence of token sequences.
        max_len: Number of token slots per text. Shorter texts are zero-padded
            on the right; longer texts are truncated to their first
            ``max_len`` tokens.

    Returns:
        Array of shape ``(len(texts), max_len, wv.vector_size)``. Known tokens
        get their vector, out-of-vocabulary tokens get an all-ones vector, and
        unused slots stay all-zero.
    """
    text_tensors = np.zeros((len(texts), max_len, wv.vector_size))
    # Built once and copied into every out-of-vocabulary slot.
    unk_vector = np.ones((wv.vector_size,))
    for i, tokens in enumerate(texts):
        # Truncate: a text longer than max_len (e.g. a held-out sample longer
        # than anything used to compute max_len) would otherwise index past
        # the second axis and raise IndexError.
        for j, token in enumerate(tokens[:max_len]):
            text_tensors[i, j] = wv[token] if token in wv else unk_vector
    return text_tensors


def tensors_to_texts(wv, tensors):
    """Decode embedded texts back into space-separated strings.

    Each slot of each tensor is interpreted as follows:
      * all zeros  -> padding; decoding of that text stops;
      * all ones   -> out-of-vocabulary marker, rendered as ``<UNK>``;
      * otherwise  -> the top result of ``wv.similar_by_vector``; decoding of
        that text stops (without emitting anything) if that result is the
        ``ENDOFEXAMPLE`` marker.

    Args:
        wv: Word-vector lookup providing ``similar_by_vector(vector)``, which
            returns ``(word, score)`` pairs with the best match first.
        tensors: Iterable of 2-D arrays (slots x embedding dimension).

    Returns:
        One string per input tensor; every emitted word is followed by a
        single space (so non-empty results end with a trailing space).
    """
    texts = []
    for tensor in tensors:
        words = []
        for vector in tensor:
            # Test the elements themselves, not their sum: a genuine embedding
            # whose components happen to add up to 0 or to the vector size
            # must not be mistaken for padding or for the <UNK> marker.
            if not np.any(vector):
                break
            if np.equal(vector, 1).all():
                words.append("<UNK>")
                continue
            nearest_word = wv.similar_by_vector(vector)[0][0]
            if nearest_word == "ENDOFEXAMPLE":
                break
            words.append(nearest_word)
        # Join once per text instead of growing a string inside the loop.
        texts.append("".join(word + " " for word in words))
    return texts

In [13]:
# Longest training example, measured in tokens; used as the padded sequence length.
max_len = max(map(len, train_tok))

In [14]:
# Embed the tokenised training texts into a (examples, max_len, vector_size) array.
train_tensors = texts_to_tensors(wv, train_tok, max_len)

In [15]:
# Show the first raw training string, including its appended end marker.
train[0]

'C# getters, setters declaration ENDOFEXAMPLE'

In [16]:
# Flatten each example's (tokens, embedding) grid into one row vector for the MLP.
n_train, n_slots, n_dims = train_tensors.shape
train_tensor = train_tensors.reshape((n_train, n_slots * n_dims))

In [17]:
# (examples, max_len * vector_size) after flattening.
train_tensor.shape

(52795, 3600)

In [18]:
# Load the validation split: read the file (closing it afterwards), keep rows
# with exactly five tab-separated fields, and take the third field as the text.
with open('../../data/iyer/valid.txt') as val_file:
    val_file_contents = val_file.readlines()
val = []
for line in val_file_contents:
    items = line.split('\t')
    if len(items) == 5:
        # NOTE(review): unlike the training texts, no " ENDOFEXAMPLE" marker is
        # appended here — confirm whether that difference is intentional.
        val.append(items[2])
# Tokenise, embed with the training-derived vectors and length, then flatten
# each example to a single row.
val_tok = [word_tokenize(sample) for sample in val]
val_tensors = texts_to_tensors(wv, val_tok, max_len)
val_tensor = np.reshape(val_tensors, (val_tensors.shape[0], val_tensors.shape[1] * val_tensors.shape[2]))

In [19]:
from mlp_vae import MLPVariationalAutoEncoder

In [20]:
# Passed as the second constructor argument of the autoencoder
# (presumably the size of its latent code — confirm in mlp_vae).
latent_dim = 128

In [21]:
# Build the autoencoder for the flattened input width and the chosen latent size.
# The third argument, [1024, 512], is presumably the hidden-layer widths —
# confirm in mlp_vae.
model = MLPVariationalAutoEncoder(train_tensor.shape[1], latent_dim, [1024, 512])
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model to reproduce its own input (inputs double as targets),
# validating on the held-out split after each epoch.
history = model.fit(train_tensor, train_tensor, batch_size=256, epochs=12, verbose=1, shuffle=True,
                    validation_data=(val_tensor, val_tensor))

Train on 52795 samples, validate on 6599 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [22]:
# Decode the first training example's embeddings back to text, as a check of
# the embed/decode round trip before looking at model output.
print("(Training) Input: ", tensors_to_texts(wv, [train_tensors[0]]))

(Training) Input:  ['C # <UNK> , setters declaration ']


In [23]:
# Reconstruct the first training example and fold the flat prediction back into
# (1, tokens, embedding) so it can be decoded.
rec = np.reshape(model.predict(train_tensor[:1]), (1,) + train_tensors.shape[1:])

In [24]:
# Decode the model's reconstruction of the first training example.
print("(Training) Reconstructed: ", tensors_to_texts(wv, rec))

(Training) Reconstructed:  ['C # Setter just Literal domain ']


In [25]:
# Nearest vocabulary entries to slot 6 of the reconstruction — the slot right
# after the six decoded input tokens, where the input held the end marker.
wv.similar_by_vector(rec[0,6])

[('ENDOFEXAMPLE', 0.7773289680480957),
 ('passing', 0.4660573899745941),
 ('Passing', 0.44540154933929443),
 ('Pass', 0.435727059841156),
 ('assigning', 0.4347077012062073),
 ('retaining', 0.43356168270111084),
 ('calling', 0.4285605549812317),
 ('built', 0.4266049265861511),
 ('specifying', 0.42437824606895447),
 ('javascript', 0.4207734167575836)]

In [26]:
# Load the test split: read the file (closing it afterwards), keep rows with
# exactly five tab-separated fields, and take the third field as the text.
with open('../../data/iyer/test.txt') as test_file:
    test_file_contents = test_file.readlines()
test = []
for line in test_file_contents:
    items = line.split('\t')
    if len(items) == 5:
        # NOTE(review): unlike the training texts, no " ENDOFEXAMPLE" marker is
        # appended here — confirm whether that difference is intentional.
        test.append(items[2])
# Tokenise, embed with the training-derived vectors and length, then flatten
# each example to a single row.
test_tok = [word_tokenize(sample) for sample in test]
test_tensors = texts_to_tensors(wv, test_tok, max_len)
test_tensor = np.reshape(test_tensors, (test_tensors.shape[0], test_tensors.shape[1] * test_tensors.shape[2]))

In [27]:
# Loss on the test split, with each input serving as its own target.
model.evaluate(test_tensor, test_tensor, verbose=0)

0.10897430324124502

In [28]:
import random

In [29]:
# Pick one flattened test example uniformly at random.
# NOTE(review): `random` is not seeded in the visible cells, so the chosen
# example changes from run to run.
random_test_ex = test_tensor[random.randrange(test_tensor.shape[0])]

In [30]:
# Un-flatten the sampled test example to (1, tokens, embedding) and decode it
# to show the input text.
print("(Test Set) Input: ", tensors_to_texts(wv, np.reshape(random_test_ex, (1, test_tensors.shape[1], test_tensors.shape[2]))))

(Test Set) Input:  ['Use App Pool Credentials for WebClient Request ']


In [31]:
# Run the sampled test example through the model, un-flatten the prediction
# and decode it to text.
print("(Test Set) Reconstructed: ", tensors_to_texts(wv, np.reshape(model.predict(np.array([random_test_ex])), (1, test_tensors.shape[1], test_tensors.shape[2]))))

(Test Set) Reconstructed:  ['Delphi Code toolkit Nhibernate for VS2010 Setter controlling Assigning execution customized ui onclick crystal setter calling main keydown cmdlet i.e ']


In [32]:
# Call fit again on the same model object for 12 more epochs with identical
# settings (presumably continuing from the current weights — confirm).
history2 = model.fit(train_tensor, train_tensor, batch_size=256, epochs=12, verbose=1, shuffle=True,
                     validation_data=(val_tensor, val_tensor))

Train on 52795 samples, validate on 6599 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [33]:
# A third fit call on the same model object, again 12 epochs with identical
# settings (presumably continuing from the current weights — confirm).
history3 = model.fit(train_tensor, train_tensor, batch_size=256, epochs=12, verbose=1, shuffle=True,
                     validation_data=(val_tensor, val_tensor))

Train on 52795 samples, validate on 6599 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [34]:
# Decode the same sampled test example again, for side-by-side comparison with
# the reconstruction produced after the additional training.
print("(Test Set) Input: ", tensors_to_texts(wv, np.reshape(random_test_ex, (1, test_tensors.shape[1], test_tensors.shape[2]))))

(Test Set) Input:  ['Use App Pool Credentials for WebClient Request ']


In [35]:
# Reconstruct the same sampled test example with the further-trained model and
# decode the prediction to text.
print("(Test Set) Reconstructed: ", tensors_to_texts(wv, np.reshape(model.predict(np.array([random_test_ex])), (1, test_tensors.shape[1], test_tensors.shape[2]))))

(Test Set) Reconstructed:  ['Details Silverlight Place Datasource for beep VS2010 ']
