In [1]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
from six.moves import zip_longest

import numpy as np
import tensorflow as tf
from tensorflow import distributions as tfd
from tensorflow.python import debug as tf_debug
import _pickle as cPickle
import random

from data_structure import load_data
from components import tf_log, encode_latents, sample_latents, compute_kl_loss, dynamic_rnn, dynamic_bi_rnn, DiagonalGaussian

In [2]:
PAD = '<pad>' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNK = '<unk>' # This has a vocab id, which is used to represent out-of-vocabulary words
BOS = '<p>' # This has a vocab id, which is used at the beginning of every decoder input sequence
EOS = '</p>' # This has a vocab id, which is used at the end of untruncated target sequences

# load data & set config

In [3]:
def del_all_flags(FLAGS):
    flags_dict = FLAGS._flags()    
    keys_list = [keys for keys in flags_dict]    
    for keys in keys_list:
        FLAGS.__delattr__(keys)

del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

flags.DEFINE_string('gpu', '2', 'visible gpu')

flags.DEFINE_string('mode', 'train', 'set train or eval')

flags.DEFINE_string('data_path', 'data/apnews/instances.pkl', 'path of data')
flags.DEFINE_string('modeldir', 'model', 'directory of model')
flags.DEFINE_string('modelname', 'sports', 'name of model')

flags.DEFINE_integer('epochs', 10, 'epochs')
flags.DEFINE_integer('batch_size', 8, 'batch size')
flags.DEFINE_integer('log_period', 100, 'valid period')

flags.DEFINE_string('opt', 'Adagrad', 'optimizer')
flags.DEFINE_float('lr', 0.01, 'lr')
flags.DEFINE_float('grad_clip', 5., 'grad_clip')

flags.DEFINE_float('keep_prob', 1.0, 'dropout rate')
flags.DEFINE_float('word_keep_prob', 0.75, 'word dropout rate')

flags.DEFINE_integer('warmup', 10, 'warmup period for KL')

flags.DEFINE_integer('beam_width', 10, 'beam_width')
flags.DEFINE_float('length_penalty_weight', 0.0, 'length_penalty_weight')

flags.DEFINE_integer('n_topic', 10, 'number of topic')
flags.DEFINE_integer('dim_topic', 256, 'dim of latent topic')
flags.DEFINE_integer('dim_emb', 256, 'dim_latent')
flags.DEFINE_integer('dim_hidden', 512, 'dim_output')
flags.DEFINE_integer('dim_latent', 32, 'dim_latent')

# for evaluation
flags.DEFINE_string('refdir', 'ref', 'refdir')
flags.DEFINE_string('outdir', 'out', 'outdir')

flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu

In [5]:
num_train_batches, train_batches, dev_batches, test_batches, word_to_idx, idx_to_word, bow_idxs = load_data(config)

Number of train examples: 39553


In [6]:
flags.DEFINE_integer('PAD_IDX', word_to_idx[PAD], 'PAD_IDX')
flags.DEFINE_integer('UNK_IDX', word_to_idx[UNK], 'UNK_IDX')
flags.DEFINE_integer('BOS_IDX', word_to_idx[BOS], 'BOS_IDX')
flags.DEFINE_integer('EOS_IDX', word_to_idx[EOS], 'EOS_IDX')

flags.DEFINE_integer('n_vocab', len(word_to_idx), 'n_vocab')
flags.DEFINE_integer('dim_bow', len(bow_idxs), 'dim_bow')

In [7]:
maximum_iterations = max([max([instance.max_sent_l for instance in batch]) for ct, batch in dev_batches])
flags.DEFINE_integer('maximum_iterations', maximum_iterations, 'maximum_iterations')    

# build language model 

## feed dict

In [8]:
def get_feed_dict(batch, mode='train'):
    bow = np.array([instance.bow for instance in batch]).astype(np.float32)
    
    batch_size = len(batch)
    
    doc_l_matrix = np.array([instance.doc_l for instance in batch]).astype(np.int32)

    max_doc_l = np.max(doc_l_matrix)
    max_sent_l = max([instance.max_sent_l for instance in batch])

    token_idxs_matrix = np.zeros([batch_size, max_doc_l, max_sent_l], np.int32)
    dec_input_idxs_matrix = np.zeros([batch_size, max_doc_l, max_sent_l+1], np.int32)
    dec_target_idxs_matrix = np.zeros([batch_size, max_doc_l, max_sent_l+1], np.int32)
    sent_l_matrix = np.zeros([batch_size, max_doc_l], np.int32)
    dec_sent_l_matrix = np.zeros([batch_size, max_doc_l], np.int32)

    for i, instance in enumerate(batch):
        for j, sent_idxs in enumerate(instance.token_idxs):
            token_idxs_matrix[i, j, :len(sent_idxs)] = np.asarray(sent_idxs)
            
            sent_idxs_dropout = np.asarray(sent_idxs)
            sent_idxs_dropout[np.random.rand(len(sent_idxs)) > config.word_keep_prob] = config.UNK_IDX
            dec_input_idxs_matrix[i, j, :len(sent_idxs)+1] = np.concatenate([[config.BOS_IDX], sent_idxs_dropout])
            
            dec_target_idxs_matrix[i, j, :len(sent_idxs)+1] = np.asarray(sent_idxs + [config.EOS_IDX])
            sent_l_matrix[i, j] = len(sent_idxs)
            dec_sent_l_matrix[i, j] = len(sent_idxs)+1

    keep_prob = config.keep_prob if mode == 'train' else 1.0

    feed_dict = {
                t_variables['bow']: bow, t_variables['token_idxs']: token_idxs_matrix,
                t_variables['dec_input_idxs']: dec_input_idxs_matrix, t_variables['dec_target_idxs']: dec_target_idxs_matrix, 
                t_variables['batch_l']: batch_size, t_variables['doc_l']: doc_l_matrix, t_variables['sent_l']: sent_l_matrix, t_variables['dec_sent_l']: dec_sent_l_matrix,
                t_variables['max_doc_l']: max_doc_l, t_variables['max_sent_l']: max_sent_l, 
                t_variables['keep_prob']: keep_prob}
    return  feed_dict

In [9]:
def debug_shape(variables):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        sample_batch = test_batches[0][1]
        feed_dict = get_feed_dict(sample_batch)
        _variables = sess.run(variables, feed_dict=feed_dict)
        for _variable, variable in zip(_variables, variables):
            print(variable.name, ':', _variable.shape)
            
        sess.close()
        
def debug_value(variables):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        sample_batch = test_batches[0][1]
        feed_dict = get_feed_dict(sample_batch)
        _variables = sess.run(variables, feed_dict=feed_dict)
        for _variable, variable in zip(_variables, variables):
            print(variable.name, ':', _variable)
            
        sess.close()

## fed variables

In [10]:
tf.reset_default_graph()

t_variables = {}
t_variables['bow'] = tf.placeholder(tf.float32, [None, config.dim_bow])
t_variables['token_idxs'] = tf.placeholder(tf.int32, [None, None, None])
t_variables['dec_input_idxs'] = tf.placeholder(tf.int32, [None, None, None])
t_variables['dec_target_idxs'] = tf.placeholder(tf.int32, [None, None, None])
t_variables['batch_l'] = tf.placeholder(tf.int32, [])
t_variables['doc_l'] = tf.placeholder(tf.int32, [None])
t_variables['sent_l'] = tf.placeholder(tf.int32, [None, None])
t_variables['dec_sent_l'] = tf.placeholder(tf.int32, [None, None])
t_variables['max_doc_l'] = tf.placeholder(tf.int32, [])
t_variables['max_sent_l'] = tf.placeholder(tf.int32, [])
t_variables['keep_prob'] = tf.placeholder(tf.float32)

## trained variables

## topic model

In [11]:
bow = t_variables['bow']
with tf.variable_scope('topic/enc', reuse=False):
    means_bow, logvars_bow = encode_latents(bow, dim=config.dim_topic, name='latent') # encode to parameters of gaussian distribution
    latents_bow = sample_latents(means_bow, logvars_bow) # sample latent vectors
    hidden_bow = tf.layers.Dense(units=config.dim_topic, activation=tf.nn.relu, name='hidden')(latents_bow)
    prob_topic = tf.layers.Dense(units=config.n_topic, activation=tf.nn.softmax, name='prob')(hidden_bow) # inference of topic probabilities

with tf.variable_scope('shared', reuse=False):
    embeddings = tf.get_variable('emb', [config.n_vocab, config.dim_emb], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # embeddings of vocab

bow_embeddings = tf.nn.embedding_lookup(embeddings, bow_idxs) # embeddings of each bow features
    
with tf.variable_scope('topic/dec', reuse=False):
    topic_embeddings = tf.get_variable('topic_emb', [config.n_topic, config.dim_emb], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # embeddings of topics
    
    topic_bow = tf.nn.softmax(tf.matmul(topic_embeddings, bow_embeddings, transpose_b=True), 1) # bow vectors for each topic
    prob_bow = tf_log(tf.matmul(prob_topic, topic_bow)) # predicted bow distribution
    
topic_loss_recon = -tf.reduce_mean(tf.reduce_sum(tf.multiply(bow, prob_bow), 1)) # negative log likelihood of each words
topic_loss_kl = compute_kl_loss(means_bow, logvars_bow) # KL divergence b/w latent dist & gaussian std

### test

In [12]:
debug_shape([bow, latents_bow, hidden_bow, prob_topic, embeddings, bow_embeddings, topic_embeddings, topic_bow, prob_bow, topic_loss_recon])

Placeholder:0 : (8, 4022)
topic/enc/add:0 : (8, 256)
topic/enc/hidden/Relu:0 : (8, 256)
topic/enc/prob/Softmax:0 : (8, 10)
shared/emb:0 : (29743, 256)
embedding_lookup:0 : (4022, 256)
topic/dec/topic_emb:0 : (10, 256)
topic/dec/Softmax:0 : (10, 4022)
topic/dec/Log:0 : (8, 4022)
Neg:0 : ()


In [13]:
debug_value([tf.reduce_sum(prob_topic, -1), tf.reduce_sum(topic_bow, -1)])

Sum_2:0 : [1.0000001  1.         1.         0.9999999  0.99999994 1.
 1.         1.        ]
Sum_3:0 : [0.9999999  0.99999994 1.         0.9999999  1.         1.
 0.99999994 1.         1.         1.        ]


In [14]:
sigma_bow = tf.exp(0.5 * logvars_bow)
dist_bow = tfd.Normal(means_bow, sigma_bow)
dist_std = tfd.Normal(0., 1.)
topic_loss_kl_tmp = tf.reduce_mean(tf.reduce_sum(tfd.kl_divergence(dist_bow, dist_std), 1))

In [15]:
debug_value([topic_loss_recon, topic_loss_kl, topic_loss_kl_tmp])

Neg:0 : 433.66687
Mean_1:0 : 3.8835375
Mean_2:0 : 3.883537


## encoder

In [16]:
# input
batch_l = t_variables['batch_l']
max_doc_l = t_variables['max_doc_l']
max_sent_l = t_variables['max_sent_l']
token_idxs = t_variables['token_idxs'][:, :max_doc_l, :max_sent_l]

# get word embedding
with tf.variable_scope('sent/enc', reuse=False):
    enc_input = tf.nn.embedding_lookup(embeddings, token_idxs)
    enc_input_do = tf.reshape(enc_input, [batch_l * max_doc_l, max_sent_l, config.dim_emb])

    # get sentence embedding
    sent_l = t_variables['sent_l']
    sent_l_do = tf.reshape(sent_l, [batch_l * max_doc_l])

    _, enc_state_do = dynamic_rnn(enc_input_do, sent_l_do, config.dim_hidden, t_variables['keep_prob'])
    enc_state = tf.reshape(enc_state_do, [batch_l, max_doc_l, config.dim_hidden])

    # TODO House Holder flow
    prob_topic_sent = tf.layers.Dense(units=config.n_topic, activation=tf.nn.softmax)(enc_state)
    
    # inference of each gaussian dist. parameter
    enc_state_topic = tf.tile(tf.expand_dims(enc_state, 2), [1, 1, config.n_topic, 1]) # tile over topics
    means_sent_topic, logvars_sent_topic = encode_latents(enc_state_topic, dim=config.dim_latent, name='latent') 
    
    # latent vectors from each gaussian dist.
    latents_sent_topic = sample_latents(means_sent_topic, logvars_sent_topic) 
    # latent vector from gaussian mixture    
    latents_sent = tf.squeeze(tf.matmul(tf.expand_dims(prob_topic_sent, -1), latents_sent_topic, transpose_a=True), 2) 
    latents_sent_do = tf.reshape(latents_sent, [batch_l * max_doc_l, config.dim_latent])

### test

In [17]:
debug_shape([token_idxs, enc_input, enc_state])

strided_slice:0 : (8, 6, 48)
sent/enc/embedding_lookup:0 : (8, 6, 48, 256)
sent/enc/Reshape_2:0 : (8, 6, 512)


In [18]:
debug_shape([prob_topic_sent, enc_state_topic, latents_sent_topic, latents_sent, latents_sent_do])

sent/enc/dense/Reshape_1:0 : (8, 6, 10)
sent/enc/Tile:0 : (8, 6, 10, 512)
sent/enc/add:0 : (8, 6, 10, 32)
sent/enc/Squeeze:0 : (8, 6, 32)
sent/enc/Reshape_3:0 : (48, 32)


In [19]:
debug_value([tf.reduce_sum(prob_topic_sent, -1)])

Sum_5:0 : [[0.99999994 0.99999994 1.         0.99999994 1.         1.        ]
 [1.0000001  1.         0.99999994 0.9999999  0.99999994 1.        ]
 [0.99999994 0.99999994 1.         1.         1.         1.        ]
 [0.99999994 1.         1.         1.         1.         0.9999999 ]
 [1.0000001  0.99999994 0.99999994 1.         1.         1.        ]
 [1.         0.99999994 0.99999994 1.         0.99999994 1.        ]
 [1.0000001  1.         0.99999994 1.         1.         0.99999994]
 [1.         1.         1.         1.         1.0000001  1.0000001 ]]


## decoder

In [20]:
# prepare for decoding
dec_input_idxs = t_variables['dec_input_idxs']
dec_input_idxs_do = tf.reshape(dec_input_idxs, [batch_l * max_doc_l, max_sent_l+1])
dec_input_do = tf.nn.embedding_lookup(embeddings, dec_input_idxs_do)

dec_latent_input_do = tf.tile(tf.expand_dims(latents_sent_do, 1), [1, tf.shape(dec_input_do)[1], 1])
dec_concat_input_do = tf.concat([dec_input_do, dec_latent_input_do], 2)

# decode for training
dec_sent_l = t_variables['dec_sent_l']
dec_sent_l_do = tf.reshape(dec_sent_l, [batch_l * max_doc_l])

with tf.variable_scope('sent/dec/rnn', initializer=tf.contrib.layers.xavier_initializer(), dtype = tf.float32, reuse=tf.AUTO_REUSE):
    dec_cell = tf.contrib.rnn.GRUCell(config.dim_hidden)
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob = t_variables['keep_prob'])

    dec_initial_state = tf.layers.Dense(units=config.dim_hidden, activation=tf.nn.relu)(latents_sent_do)
    
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_concat_input_do, sequence_length=dec_sent_l_do)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=dec_cell,
        helper=helper,
        initial_state=dec_initial_state)

    dec_outputs, _, output_sent_l = tf.contrib.seq2seq.dynamic_decode(train_decoder)
    
    output_layer = tf.layers.Dense(config.n_vocab, use_bias=False, name='out')
    output_logits = output_layer(dec_outputs.rnn_output)
    
    output_token_idxs_do = tf.argmax(output_logits, 2)
    output_token_idxs = tf.reshape(output_token_idxs_do, [batch_l, max_doc_l, tf.shape(output_token_idxs_do)[1]])

In [21]:
debug_shape([dec_input_idxs, dec_concat_input_do, dec_initial_state, output_logits, output_token_idxs])

Placeholder_2:0 : (8, 6, 49)
concat:0 : (48, 49, 288)
sent/dec/rnn/dense/Relu:0 : (48, 512)
sent/dec/rnn/out/Tensordot:0 : (48, 49, 29743)
sent/dec/rnn/Reshape:0 : (8, 6, 49)


## language modeling cost

In [22]:
# target and mask
dec_target_idxs = t_variables['dec_target_idxs']
dec_target_idxs_do = tf.reshape(dec_target_idxs, [batch_l * max_doc_l, max_sent_l+1])                
dec_mask_tokens_do = tf.sequence_mask(dec_sent_l_do, maxlen=max_sent_l+1, dtype=tf.float32)

# nll for each token (averaged over batch & sentence)
sent_loss_recon = tf.contrib.seq2seq.sequence_loss(output_logits, dec_target_idxs_do, dec_mask_tokens_do) 

In [23]:
# inferred mixture probabilities (computed for each sentence)
categ_topic_sent = tfd.Categorical(probs=prob_topic_sent)

# prior of mixture probabilities (computed for each document)
categ_topic = tfd.Categorical(probs=tf.expand_dims(prob_topic, 1))

sent_loss_kl_categ = tf.reduce_mean(tfd.kl_divergence(categ_topic_sent, categ_topic))

# prior of each gaussian gaussribution (computed for each topic)
with tf.variable_scope('topic/enc', reuse=False):
    means_topic_bow, logvars_topic_bow = encode_latents(topic_bow, dim=config.dim_latent, name='prior')
sigma_topic_bow = tf.exp(0.5 * logvars_topic_bow)
gauss_topic_bow = tfd.Normal(loc=means_topic_bow, scale=sigma_topic_bow)

# inference of each gaussian gaussribution (computed for each sentence)
sigma_sent_topic = tf.exp(0.5 * logvars_sent_topic)
gauss_sent_topic = tfd.Normal(loc=means_sent_topic, scale=sigma_sent_topic)

sent_loss_kl_gauss = tf.reduce_sum(tfd.kl_divergence(gauss_sent_topic, gauss_topic_bow), -1)
sent_loss_kl_gmm = tf.reduce_mean(tf.reduce_sum(tf.multiply(prob_topic_sent, sent_loss_kl_gauss), -1))

sent_loss_kl = sent_loss_kl_categ + sent_loss_kl_gmm

### test

In [24]:
# kl loss for categorical distribution
sent_loss_kl_categ_tmp = tf.reduce_mean(tf.reduce_sum(tf.multiply(prob_topic_sent, tf_log(prob_topic_sent/tf.expand_dims(prob_topic, 1))), -1)) 
debug_value([sent_loss_kl_categ, sent_loss_kl_categ_tmp])

Mean_3:0 : 0.6479989
Mean_5:0 : 0.647999


In [25]:
debug_shape([prob_topic_sent, sent_loss_kl_gauss, sent_loss_kl_gmm])

sent/enc/dense/Reshape_1:0 : (8, 6, 10)
Sum_6:0 : (8, 6, 10)
Mean_4:0 : ()


In [26]:
debug_value([sent_loss_recon, sent_loss_kl])

sequence_loss/truediv:0 : 10.3004675
add_4:0 : 0.26921743


## optimizer

In [27]:
if config.warmup > 0:
    beta = tf.Variable(0.1, name='beta', trainable=False)

topic_loss = topic_loss_recon + beta * topic_loss_kl
sent_loss = sent_loss_recon + beta * sent_loss_kl
loss = topic_loss + sent_loss

# define optimizer
if (config.opt == 'Adam'):
    optimizer = tf.train.AdamOptimizer(config.lr)
elif (config.opt == 'Adagrad'):
    optimizer = tf.train.AdagradOptimizer(config.lr)
    
grad_vars = optimizer.compute_gradients(loss)
clipped_grad_vars = [(tf.clip_by_value(grad, -config.grad_clip, config.grad_clip), var) for grad, var in grad_vars]

opt = optimizer.apply_gradients(clipped_grad_vars)

# run model 

In [28]:
def idxs_to_sents(token_idxs, config, idx_to_word):
    sents = []
    for line_idxs in token_idxs:
        tokens = []
        for idx in line_idxs:
            if idx == config.EOS_IDX: break
            tokens.append(idx_to_word[idx])
        sent = ' '.join(tokens)
        sents.append(sent)
    return sents

In [29]:
def get_loss(sess, batches):
    losses = []
    for ct, batch in batches:
        feed_dict = get_feed_dict(batch, mode='test')
        loss_batch = sess.run(loss, feed_dict = feed_dict)
        losses += [loss_batch]        
    loss_mean = np.mean(losses)
    return loss_mean

def get_all_losses(sess, batches):
    losses = []
    for ct, batch in batches:
        feed_dict = get_feed_dict(batch, mode='test')
        loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch = \
        sess.run([loss, topic_loss_recon, topic_loss_kl, sent_loss_recon, sent_loss_kl], feed_dict = feed_dict)
        losses += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch]]
    print('LOSS %.2f | TM NLL: %.2f, KL: %.4f | LM NLL: %.2f, KL: %.4f' %  np.mean(losses, 0))

In [30]:
def print_sample(sample_batch):
    feed_dict = get_feed_dict(sample_batch)
    pred_token_idxs_batch = sess.run(output_token_idxs, feed_dict = feed_dict)
    true_token_idxs_batch = [instance.token_idxs for instance in sample_batch]
    
    assert len(pred_token_idxs_batch) == len(true_token_idxs_batch)
    
    for true_token_idxs, pred_token_idxs in zip(true_token_idxs_batch, pred_token_idxs_batch):
        true_sents = idxs_to_sents(true_token_idxs, config, idx_to_word)
        pred_sents = idxs_to_sents(pred_token_idxs, config, idx_to_word)
        assert len(true_sents) == len(pred_sents)
        
        for true_sent, pred_sent in zip(true_sents, pred_sents):
            print('True: %s' % true_sent)
            print('Pred: %s' % pred_sent)

In [31]:
if 'sess' in globals(): sess.close()
sess = tf.Session()

sess.run(tf.global_variables_initializer())

logs = []
losses_train = []
loss_min = np.inf
beta_eval = 0.
epoch = 0

In [34]:
tf.train.SummaryWriter('../log', sess.graph)

AttributeError: module 'tensorflow.tools.api.generator.api.train' has no attribute 'SummaryWriter'

In [None]:
for ct, batch in train_batches:
    feed_dict = get_feed_dict(batch)
    if config.warmup > 0: sess.run(beta.assign(np.minimum(1., ct/(config.warmup*num_train_batches))))

    _, loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch = \
    sess.run([opt, loss, topic_loss_recon, topic_loss_kl, sent_loss_recon, sent_loss_kl], feed_dict = feed_dict)
    
    losses_train += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch]]

    if ct%config.log_period==0:
        loss_train, topic_loss_recon_train, topic_loss_kl_train, sent_loss_recon_train, sent_loss_kl_train = np.mean(losses_train, 0)
        if config.warmup > 0: beta_eval = beta.eval(session=sess)
        loss_dev = get_loss(sess, dev_batches)

#             if loss_dev <= loss_min:
#                 loss_min = loss_dev
#                 loss_test = get_loss(sess, test_batches)

        clear_output()

        logs += [(epoch, ct, loss_train, loss_dev, topic_loss_recon_train, topic_loss_kl_train, sent_loss_recon_train, sent_loss_kl_train, beta_eval)]
        for log in logs:
            print('Epoch: %i, Step: %i | LOSS TRAIN: %.2f, DEV: %.2f | TM NLL: %.2f, KL: %.4f | LM NLL: %.2f, KL: %.4f | BETA: %.6f' %  log)

        print_sample(batch)

Epoch: 0, Step: 0 | LOSS TRAIN: 449.13, DEV: 699.12 | TM NLL: 438.83, KL: 3.3189 | LM NLL: 10.30, KL: 0.3819 | BETA: 0.000000
Epoch: 0, Step: 100 | LOSS TRAIN: 464.78, DEV: 676.30 | TM NLL: 454.48, KL: 4.1655 | LM NLL: 10.30, KL: 2.7532 | BETA: 0.000253
Epoch: 0, Step: 200 | LOSS TRAIN: 451.42, DEV: 662.42 | TM NLL: 441.12, KL: 4.2104 | LM NLL: 10.30, KL: 4.5798 | BETA: 0.000506
Epoch: 0, Step: 300 | LOSS TRAIN: 453.84, DEV: 660.69 | TM NLL: 443.55, KL: 4.3519 | LM NLL: 10.29, KL: 5.3766 | BETA: 0.000758
Epoch: 0, Step: 400 | LOSS TRAIN: 456.45, DEV: 657.62 | TM NLL: 446.30, KL: 4.4431 | LM NLL: 10.14, KL: 6.3616 | BETA: 0.001011
Epoch: 0, Step: 500 | LOSS TRAIN: 456.73, DEV: 656.70 | TM NLL: 447.00, KL: 4.5060 | LM NLL: 9.72, KL: 14.9999 | BETA: 0.001264
Epoch: 0, Step: 600 | LOSS TRAIN: 457.90, DEV: 656.25 | TM NLL: 448.51, KL: 4.5352 | LM NLL: 9.36, KL: 22.2632 | BETA: 0.001517
Epoch: 0, Step: 700 | LOSS TRAIN: 455.48, DEV: 656.38 | TM NLL: 446.37, KL: 4.5211 | LM NLL: 9.07, KL: 26.

In [45]:
get_all_losses(sess, dev_batches)

TypeError: only size-1 arrays can be converted to Python scalars

# confirm variables

In [110]:
_logvars, _means, _kl_losses, _latents, _output_logits = sess.run([logvars, means, kl_losses, latents, output_logits], feed_dict=feed_dict)


In [111]:
_logvars.shape, _means.shape, _kl_losses.shape, _latents.shape

((32, 32), (32, 32), (32,), (32, 32))

In [112]:
_output_logits

array([[[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       ...,

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]], dtype=float32)

In [109]:
_output_logits, _dec_target_idxs_do, _dec_mask_tokens_do, _recon_loss, _kl_losses, _ = sess.run([output_logits, dec_target_idxs_do, dec_mask_tokens_do, recon_loss, kl_losses, opt], feed_dict=feed_dict)


NameError: name 'dec_target_idxs_do' is not defined

In [44]:
tf.reduce_max(output_logits, 2).eval(session=sess, feed_dict=feed_dict).shape

(120, 46)

In [31]:
_output_logits.shape, _dec_target_idxs_do.shape, _dec_mask_tokens_do.shape

((120, 46, 20000), (120, 46), (120, 46))

In [32]:
_logits = np.exp(_output_logits) / np.sum(np.exp(_output_logits), 2)[:, :, None]

In [33]:
_idxs = _dec_target_idxs_do

In [35]:
_losses = np.array([[-np.log(_logits[i, j, _idxs[i, j]]) for j in range(_idxs.shape[1])] for i in range(_idxs.shape[0])]) * _dec_mask_tokens_do

In [36]:
np.sum(_losses)/np.sum(_dec_mask_tokens_do)

9.903732

In [37]:
_recon_loss

9.903732

In [38]:
_kl_losses.shape

(120,)