In [1]:
import random
from typing import Union

import gensim
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from mlp import *
from text_data_utils import *
from bvae import *

In [2]:
# Word2Vec hyperparameters (previously inline magic numbers).
SUMMARY_EMBEDDING_DIM = 110  # vector size of each summary-token embedding
CODE_EMBEDDING_DIM = 125     # vector size of each code-token embedding
MIN_TOKEN_COUNT = 2          # gensim ignores tokens whose total frequency is below this


def _flatten_token_embeddings(tensor):
    """Merge axes 1 and 2 of `tensor` into a single axis.

    Args:
        tensor: array-like exposing `.shape` with at least three entries;
            only shape[0], shape[1] and shape[2] are read.

    Returns:
        The result of `np.reshape` to (shape[0], shape[1] * shape[2]), i.e.
        one flat row per entry along axis 0.
    """
    return np.reshape(tensor, (tensor.shape[0], tensor.shape[1] * tensor.shape[2]))


# --- Training split --------------------------------------------------------
# The maximum sequence lengths and both Word2Vec models are derived from the
# training split only and are reused unchanged for validation and test below.
train_summaries, train_codes = load_iyer_file("../data/iyer/train.txt")
train_summaries_tokens = tokenize_texts(train_summaries)
max_summary_len = max(len(text) for text in train_summaries_tokens)
train_codes_tokens = tokenize_texts(train_codes)
max_code_len = max(len(text) for text in train_codes_tokens)

# NOTE: `size=` is the gensim < 4.0 spelling; gensim >= 4.0 renamed it to
# `vector_size`. Kept as-is to match the environment this notebook ran in.
summaries_wv = gensim.models.Word2Vec(train_summaries_tokens,
                                      size=SUMMARY_EMBEDDING_DIM,
                                      min_count=MIN_TOKEN_COUNT).wv
codes_wv = gensim.models.Word2Vec(train_codes_tokens,
                                  size=CODE_EMBEDDING_DIM,
                                  min_count=MIN_TOKEN_COUNT).wv

train_summaries_tensor = tokenized_texts_to_tensor(train_summaries_tokens, summaries_wv, max_summary_len)
train_codes_tensor = tokenized_texts_to_tensor(train_codes_tokens, codes_wv, max_code_len)
# The `_fl` variants are the flattened forms that are fed to the model.
train_summaries_tensor_fl = _flatten_token_embeddings(train_summaries_tensor)
train_codes_tensor_fl = _flatten_token_embeddings(train_codes_tensor)

# --- Validation split ------------------------------------------------------
val_summaries, val_codes = load_iyer_file("../data/iyer/valid.txt")
val_summaries_tokens = tokenize_texts(val_summaries)
val_codes_tokens = tokenize_texts(val_codes)
val_summaries_tensor = tokenized_texts_to_tensor(val_summaries_tokens, summaries_wv, max_summary_len)
val_codes_tensor = tokenized_texts_to_tensor(val_codes_tokens, codes_wv, max_code_len)
val_summaries_tensor_fl = _flatten_token_embeddings(val_summaries_tensor)
val_codes_tensor_fl = _flatten_token_embeddings(val_codes_tensor)

# --- Test split ------------------------------------------------------------
test_summaries, test_codes = load_iyer_file("../data/iyer/test.txt")
test_summaries_tokens = tokenize_texts(test_summaries)
test_codes_tokens = tokenize_texts(test_codes)
test_summaries_tensor = tokenized_texts_to_tensor(test_summaries_tokens, summaries_wv, max_summary_len)
test_codes_tensor = tokenized_texts_to_tensor(test_codes_tokens, codes_wv, max_code_len)
test_summaries_tensor_fl = _flatten_token_embeddings(test_summaries_tensor)
test_codes_tensor_fl = _flatten_token_embeddings(test_codes_tensor)

# Passed as the third argument to BimodalVariationalAutoEncoder in the next cell.
latent_dim = 512

In [3]:
# Input widths are the per-example lengths of the flattened training tensors.
summary_input_dim = train_summaries_tensor_fl.shape[1]
code_input_dim = train_codes_tensor_fl.shape[1]
model = BimodalVariationalAutoEncoder(summary_input_dim, code_input_dim, latent_dim)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.train(
    train_summaries_tensor_fl,
    train_codes_tensor_fl,
    val_summaries_tensor_fl,
    val_codes_tensor_fl,
    35,   # number of epochs (the training log reports "Epoch N of 35")
    128,  # presumably the mini-batch size -- TODO confirm against BimodalVariationalAutoEncoder.train
    optimizer,
)

Epoch 1 of 35 completed, training loss = 77.84404754638672, validation loss = 52.45042419433594
Epoch 2 of 35 completed, training loss = 43.399166107177734, validation loss = 37.74650573730469
Epoch 3 of 35 completed, training loss = 37.22574234008789, validation loss = 35.740394592285156
Epoch 4 of 35 completed, training loss = 31.8889217376709, validation loss = 30.034029006958008
Epoch 5 of 35 completed, training loss = 26.851348876953125, validation loss = 25.520795822143555
Epoch 6 of 35 completed, training loss = 26.0675048828125, validation loss = 26.096670150756836
Epoch 7 of 35 completed, training loss = 23.098560333251953, validation loss = 22.63214111328125
Epoch 8 of 35 completed, training loss = 19.619937896728516, validation loss = 18.929258346557617
Epoch 9 of 35 completed, training loss = 16.88948631286621, validation loss = 16.407745361328125
Epoch 10 of 35 completed, training loss = 14.844555854797363, validation loss = 15.380115509033203
Epoch 11 of 35 completed, tra

### Example from train set

In [4]:
# Re-seed from system entropy/time, so a different example is chosen on every run.
random.seed()
n_train_examples = train_summaries_tensor.shape[0]
random_idx = random.randrange(n_train_examples)

# Wrap each selected example in a one-element list so the result keeps a
# leading axis of length 1 (a batch containing just this example).
random_train_summary, random_train_code = (
    np.array([split_tensor[random_idx]])
    for split_tensor in (train_summaries_tensor, train_codes_tensor)
)

In [5]:
# Map the selected summary's embeddings back to tokens; the batch holds one
# example, so display the first (only) decoded text.
decoded_train_summaries = tensor_to_tokenized_texts(random_train_summary, summaries_wv)
decoded_train_summaries[0]

['<s>', 'How', 'to', 'put', 'Unicode', 'in', 'browser', 'title', '</s>']

In [6]:
# Map the selected code snippet's embeddings back to tokens; the batch holds
# one example, so display the first (only) decoded text.
decoded_train_codes = tensor_to_tokenized_texts(random_train_code, codes_wv)
decoded_train_codes[0]

['<s>',
 'Page',
 '.',
 'Title',
 '=',
 '"',
 '/',
 '/',
 '<UNK>',
 '/',
 '/',
 '"',
 '</s>']

In [13]:
# Summary -> latent -> summary: pass the selected summary through the language
# encoder/decoder pair and decode the result back to tokens.

# 1. Flatten the single example to one row, the layout the encoder is called with.
summary_flat = np.reshape(
    random_train_summary,
    (1, random_train_summary.shape[1] * random_train_summary.shape[2]),
)
# 2. Encode, then draw one sample from the object the encoder returns.
summary_latent_sample = model.language_encoder(summary_flat).sample()
# 3. Decode the latent sample with the language decoder.
summary_reconstruction_flat = model.language_decoder(summary_latent_sample)
# 4. Restore the per-example layout of the summary tensors (batch of one).
summary_reconstruction = np.reshape(
    summary_reconstruction_flat,
    (1, train_summaries_tensor.shape[1], train_summaries_tensor.shape[2]),
)
# 5. Turn embeddings back into tokens and show the only example.
tensor_to_tokenized_texts(summary_reconstruction, summaries_wv)[0]

['a', 'value', 'using', '<UNK>', 'Removing', 'in', 'inside', 'Check', '</s>']

In [16]:
# Code -> latent -> summary: encode the selected code snippet with the source
# code encoder, then decode the latent sample with the *language* decoder.

# 1. Flatten the single code example to one row, the layout the encoder is called with.
code_flat = np.reshape(
    random_train_code,
    (1, random_train_code.shape[1] * random_train_code.shape[2]),
)
# 2. Encode, then draw one sample from the object the encoder returns.
code_latent_sample = model.source_code_encoder(code_flat).sample()
# 3. Decode the latent sample with the language decoder.
generated_summary_flat = model.language_decoder(code_latent_sample)
# 4. Reshape to the per-example layout of the summary tensors (batch of one).
generated_summary = np.reshape(
    generated_summary_flat,
    (1, train_summaries_tensor.shape[1], train_summaries_tensor.shape[2]),
)
# 5. Turn embeddings back into summary tokens and show the only example.
tensor_to_tokenized_texts(generated_summary, summaries_wv)[0]

['only',
 'debug',
 'integer',
 'stream',
 'Mouse',
 'Multiple',
 'BackgroundWorker',
 'struct',
 'Request',
 'Unable',
 'enum',
 '<UNK>',
 'ZIndex',
 '<UNK>',
 '<UNK>',
 '<UNK>',
 '<UNK>']