In [1]:
import random
from typing import Union

import gensim
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from mlp import *
from text_data_utils import *
from bvae import *

In [2]:
def _load_tokenized_split(path):
    """Load one Iyer split and tokenize both sides of every pair.

    Args:
        path: Location of the split file, passed straight to ``load_iyer_file``.

    Returns:
        ``(summaries, codes, summaries_tokens, codes_tokens)``: the two values
        returned by ``load_iyer_file`` followed by the result of running
        ``tokenize_texts`` on each of them.
    """
    summaries, codes = load_iyer_file(path)
    return summaries, codes, tokenize_texts(summaries), tokenize_texts(codes)


def _flatten_samples(tensor):
    """Merge axes 1 and 2 of a 3-axis array so that each sample is one flat row.

    Args:
        tensor: Array-like exposing a ``shape`` with (at least) three entries.

    Returns:
        ``tensor`` reshaped to ``(shape[0], shape[1] * shape[2])``.
    """
    return np.reshape(tensor, (tensor.shape[0], tensor.shape[1] * tensor.shape[2]))


# --- Training split ----------------------------------------------------------
# The training split also determines the maximum lengths and the word vectors
# that the validation and test splits are embedded with below.
(train_summaries, train_codes,
 train_summaries_tokens, train_codes_tokens) = _load_tokenized_split("../data/iyer/train.txt")
max_summary_len = max(len(text) for text in train_summaries_tokens)
max_code_len = max(len(text) for text in train_codes_tokens)

# Separate Word2Vec models per modality (110-d for summaries, 125-d for code);
# only the resulting keyed vectors (.wv) are kept.
# NOTE(review): gensim 4 renamed the `size` argument to `vector_size`; `size` is
# kept on the assumption that the project runs gensim 3.x - confirm.
summaries_wv = gensim.models.Word2Vec(train_summaries_tokens, size=110, min_count=2).wv
codes_wv = gensim.models.Word2Vec(train_codes_tokens, size=125, min_count=2).wv

train_summaries_tensor = tokenized_texts_to_tensor(train_summaries_tokens, summaries_wv, max_summary_len)
train_codes_tensor = tokenized_texts_to_tensor(train_codes_tokens, codes_wv, max_code_len)
train_summaries_tensor_fl = _flatten_samples(train_summaries_tensor)
train_codes_tensor_fl = _flatten_samples(train_codes_tensor)

# --- Validation split --------------------------------------------------------
# Embedded with the training-derived vectors and maximum lengths.
# NOTE(review): how tokenized_texts_to_tensor handles texts longer than the
# training maximum, or tokens absent from the vectors, is not visible here.
(val_summaries, val_codes,
 val_summaries_tokens, val_codes_tokens) = _load_tokenized_split("../data/iyer/valid.txt")
val_summaries_tensor = tokenized_texts_to_tensor(val_summaries_tokens, summaries_wv, max_summary_len)
val_codes_tensor = tokenized_texts_to_tensor(val_codes_tokens, codes_wv, max_code_len)
val_summaries_tensor_fl = _flatten_samples(val_summaries_tensor)
val_codes_tensor_fl = _flatten_samples(val_codes_tensor)

# --- Test split --------------------------------------------------------------
# Same treatment as the validation split.
(test_summaries, test_codes,
 test_summaries_tokens, test_codes_tokens) = _load_tokenized_split("../data/iyer/test.txt")
test_summaries_tensor = tokenized_texts_to_tensor(test_summaries_tokens, summaries_wv, max_summary_len)
test_codes_tensor = tokenized_texts_to_tensor(test_codes_tokens, codes_wv, max_code_len)
test_summaries_tensor_fl = _flatten_samples(test_summaries_tensor)
test_codes_tensor_fl = _flatten_samples(test_codes_tensor)

# Latent dimensionality handed to the model constructor.
latent_dim = 512

# Each modality's input width is the length of one flattened training sample.
summary_input_width = train_summaries_tensor_fl.shape[1]
code_input_width = train_codes_tensor_fl.shape[1]
model = BimodalVariationalAutoEncoder(summary_input_width, code_input_width, latent_dim)

adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# Arguments stay positional because the parameter names of `train` are not
# visible from this notebook. The saved training log reads "Epoch N of 35", so
# 35 is the epoch count; 128 is presumably the batch size - confirm in bvae.
model.train(
    train_summaries_tensor_fl, train_codes_tensor_fl,
    val_summaries_tensor_fl, val_codes_tensor_fl,
    35, 128,
    adam_optimizer,
)


Epoch 1 of 35 completed, training loss = 65.73186492919922, validation loss = 39.141151428222656
Epoch 2 of 35 completed, training loss = 30.48387336730957, validation loss = 25.600452423095703
Epoch 3 of 35 completed, training loss = 20.50125503540039, validation loss = 16.97852897644043
Epoch 4 of 35 completed, training loss = 13.028983116149902, validation loss = 10.940509796142578
Epoch 5 of 35 completed, training loss = 8.43012809753418, validation loss = 7.396726608276367
Epoch 6 of 35 completed, training loss = 5.720603942871094, validation loss = 5.21226167678833
Epoch 7 of 35 completed, training loss = 4.085941791534424, validation loss = 3.874422550201416
Epoch 8 of 35 completed, training loss = 3.063230276107788, validation loss = 2.978896141052246
Epoch 9 of 35 completed, training loss = 2.3727877140045166, validation loss = 2.284188747406006
Epoch 10 of 35 completed, training loss = 1.798722505569458, validation loss = 1.7957215309143066
Epoch 11 of 35 completed, training 

### Example from the training set

In [11]:
# Called without an argument, so the generator is not pinned to a fixed seed
# and a different training pair may be selected on every run of this cell.
random.seed()
random_idx = random.randrange(len(train_summaries_tensor))

# Wrap the chosen summary/code pair in a one-element list so each array keeps
# a leading axis of size 1.
random_train_summary = np.array([train_summaries_tensor[random_idx]])
random_train_code = np.array([train_codes_tensor[random_idx]])

In [12]:
# Map the selected summary back to tokens via the summary word vectors; [0]
# picks the only entry of the one-element result.
tensor_to_tokenized_texts(random_train_summary, summaries_wv)[0]

['<s>', 'Remove', 'front', 'characters', 'from', 'a', 'string', '?', '</s>']

In [13]:
# Map the selected code snippet back to tokens via the code word vectors; [0]
# picks the only entry of the one-element result.
tensor_to_tokenized_texts(random_train_code, codes_wv)[0]

['<s>', 'sName', '=', 'sName', '.', 'Substring', '(', '5', ')', '</s>']

In [22]:
# Flatten the single summary to the row width the model was built with, then
# run it through the language encoder.
summary_row_width = train_summaries_tensor_fl.shape[1]
random_train_summary_fl = random_train_summary.reshape((1, summary_row_width))
encoded_summary_dists = model.language_encoder(random_train_summary_fl)

# Display the mean and standard deviation of the encoder's output distribution.
(encoded_summary_dists.mean(), encoded_summary_dists.stddev())

(<tf.Tensor: id=396267, shape=(1, 512), dtype=float32, numpy=
 array([[ 0.02736173, -0.23848228,  0.00174127,  0.13009836,  0.12017126,
          0.04194421,  0.41902667, -0.22565551, -0.00998425, -0.06575833,
         -0.14337958, -0.0948907 ,  0.05036798,  0.18774797,  0.1708781 ,
         -0.13355917,  0.06186533, -0.07228084, -0.32512334, -0.02781991,
         -0.1610523 , -0.35389405,  0.2844652 , -0.1398453 , -0.20393343,
          0.21563421,  0.2808211 , -0.23831762,  0.07356949, -0.10338534,
         -0.36049134, -0.07092552,  0.10859936, -0.03177325, -0.04328687,
         -0.04337417,  0.18152525,  0.14371029, -0.13935263,  0.07285238,
         -0.3039223 ,  0.02283813,  0.06288165,  0.27431384,  0.24891287,
         -0.2568365 ,  0.18736905, -0.31642538,  0.13009341, -0.19948609,
          0.3450756 , -0.3931423 , -0.20468187, -0.07761283,  0.15310813,
          0.19759761,  0.0343466 , -0.3808818 ,  0.1250449 , -0.0752419 ,
         -0.01844649,  0.23818175, -0.06121144,  0

In [23]:
# Flatten the single code snippet to the row width the model was built with,
# then run it through the source-code encoder.
code_row_width = train_codes_tensor_fl.shape[1]
random_train_code_fl = random_train_code.reshape((1, code_row_width))
encoded_code_dists = model.source_code_encoder(random_train_code_fl)

# Display the mean and standard deviation of the encoder's output distribution.
(encoded_code_dists.mean(), encoded_code_dists.stddev())

(<tf.Tensor: id=396298, shape=(1, 512), dtype=float32, numpy=
 array([[ 0.142884  , -0.1749379 , -0.00716674,  0.11260547,  0.04572623,
         -0.04657702,  0.2900985 , -0.12297083,  0.02039827,  0.10840655,
         -0.238659  , -0.20726947,  0.1192681 ,  0.30249968, -0.01022793,
          0.01373317,  0.14899513, -0.1583602 , -0.2894834 , -0.00375577,
         -0.15400855, -0.40125722,  0.28726932, -0.00953858, -0.12610207,
          0.08212825,  0.17770405, -0.30165017,  0.10041889, -0.09341551,
         -0.21463856, -0.00410952,  0.2135833 , -0.00984163, -0.0318987 ,
         -0.12042491,  0.18691246,  0.02224913, -0.10153142, -0.04504642,
         -0.34193236,  0.119611  ,  0.16765684,  0.33118334,  0.19648036,
         -0.19117583, -0.02723853, -0.15904337,  0.17486084, -0.24699223,
          0.37254718, -0.30304587, -0.19494113,  0.032115  ,  0.14552267,
          0.13857053,  0.07663962, -0.40373597,  0.1725961 , -0.19399327,
         -0.0236371 ,  0.08489828,  0.09932552,  0

In [26]:
# Moments of the two encoder output distributions.
summary_mean = encoded_summary_dists.mean()
summary_stddev = encoded_summary_dists.stddev()
code_mean = encoded_code_dists.mean()
code_stddev = encoded_code_dists.stddev()

# Element-wise average of the two encodings' means and standard deviations;
# this defines the Normal that both encodings are compared against.
mean_mean = (summary_mean + code_mean) / 2
mean_stddev = (summary_stddev + code_stddev) / 2
averaged_normal = tfp.distributions.Normal(mean_mean, mean_stddev)

# KL divergence from each modality's Normal to the averaged Normal.
# NOTE(review): the saved output of this notebook shows NaN in many entries of
# language_kl_divergence. The Normal/Normal KL involves the log of the scales,
# so non-positive stddev values coming out of the encoders are a likely cause -
# confirm against how the encoders in bvae parameterize their scale.
language_kl_divergence = tfp.distributions.kl_divergence(
    tfp.distributions.Normal(summary_mean, summary_stddev),
    averaged_normal,
)
source_code_kl_divergence = tfp.distributions.kl_divergence(
    tfp.distributions.Normal(code_mean, code_stddev),
    averaged_normal,
)

In [27]:
# Display the KL divergence tensor computed for the summary encoding.
language_kl_divergence

<tf.Tensor: id=396419, shape=(1, 512), dtype=float32, numpy=
array([[           nan,            nan,            nan,            nan,
                   nan, 3.37352020e+02,            nan,            nan,
                   nan,            nan, 3.12429321e+02,            nan,
                   nan, 1.63147241e+03,            nan,            nan,
                   nan, 3.64576683e+01, 6.71665382e+00,            nan,
                   nan,            nan,            nan,            nan,
                   nan, 7.71433868e+01,            nan, 2.09473633e+02,
                   nan,            nan, 4.86042786e+02,            nan,
                   nan, 8.41574860e+00, 1.31730408e+02,            nan,
        1.97203457e-01,            nan,            nan,            nan,
                   nan,            nan, 1.09577961e+01,            nan,
                   nan,            nan,            nan,            nan,
                   nan, 1.32902889e+01, 1.46204891e+01,            nan,
   

In [34]:
# Draw one latent sample from the summary encoding and decode it.
latent_sample = encoded_summary_dists.sample()
decoded_summary_flat = model.language_decoder(latent_sample)

# Restore the per-token layout of the unflattened training tensor (axes 1 and 2)
# so the decoder output can be mapped back to tokens.
decoded_summary = np.reshape(
    decoded_summary_flat,
    (1, train_summaries_tensor.shape[1], train_summaries_tensor.shape[2]),
)
tensor_to_tokenized_texts(decoded_summary, summaries_wv)

[['<s>', 'Get', 'NET', '<s>', 'to', '</s>']]