In [11]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
import time
import datetime
import math
import random
import _pickle as cPickle
from collections import defaultdict

from six.moves import zip_longest
import numpy as np

import tensorflow as tf
from tensorflow import distributions as tfd
from tensorflow.python import debug as tf_debug

from data_structure import get_batches
from components import tf_log, sample_latents, compute_kl_loss, dynamic_rnn, dynamic_bi_rnn
from topic_model import TopicModel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
PAD = '<pad>' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNK = '<unk>' # This has a vocab id, which is used to represent out-of-vocabulary words
BOS = '<p>' # This has a vocab id, which is used at the beginning of every decoder input sequence
EOS = '</p>' # This has a vocab id, which is used at the end of untruncated target sequences

# load data & set config

In [4]:
def del_all_flags(FLAGS):
    flags_dict = FLAGS._flags()    
    keys_list = [keys for keys in flags_dict]    
    for keys in keys_list:
        FLAGS.__delattr__(keys)

del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

flags.DEFINE_string('gpu', '2', 'visible gpu')

flags.DEFINE_string('mode', 'train', 'set train or eval')

flags.DEFINE_string('data_path', 'data/apnews/instances.pkl', 'path of data')
flags.DEFINE_string('modeldir', 'model', 'directory of model')
flags.DEFINE_string('modelname', 'sports', 'name of model')

flags.DEFINE_integer('epochs', 1000, 'epochs')
flags.DEFINE_integer('batch_size', 128, 'number of sentences in each batch')
flags.DEFINE_integer('log_period', 500, 'valid period')

flags.DEFINE_string('opt', 'Adagrad', 'optimizer')
flags.DEFINE_float('lr', 0.1, 'lr')
flags.DEFINE_float('reg', 0.1, 'regularization term')
flags.DEFINE_float('beta', 0.0, 'initial value of beta')
flags.DEFINE_float('grad_clip', 5., 'grad_clip')

flags.DEFINE_float('keep_prob', 0.6, 'dropout rate')
flags.DEFINE_float('word_keep_prob', 0.75, 'word dropout rate')

flags.DEFINE_integer('warmup', 10, 'warmup period for KL')
flags.DEFINE_integer('warmup_topic', 0, 'warmup period for KL of topic')

flags.DEFINE_integer('beam_width', 2, 'beam_width')
flags.DEFINE_float('length_penalty_weight', 0.0, 'length_penalty_weight')

flags.DEFINE_integer('n_topic', 10, 'number of topic')
flags.DEFINE_integer('dim_hidden_bow', 256, 'dim of hidden bow')
flags.DEFINE_integer('dim_latent_topic', 32, 'dim of latent topic')
flags.DEFINE_integer('dim_emb', 256, 'dim_emb')
flags.DEFINE_integer('dim_hidden', 512, 'dim_hidden')
flags.DEFINE_integer('dim_latent', 32, 'dim_latent')


# for evaluation
flags.DEFINE_string('refdir', 'ref', 'refdir')
flags.DEFINE_string('outdir', 'out', 'outdir')

flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

In [5]:
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu

In [13]:
instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(open(config.data_path,'rb'))

In [12]:
train_batches = get_batches(instances_train, config.batch_size)
dev_batches = get_batches(instances_dev, config.batch_size)
test_batches = get_batches(instances_test, config.batch_size)

In [16]:
flags.DEFINE_integer('PAD_IDX', word_to_idx[PAD], 'PAD_IDX')
flags.DEFINE_integer('UNK_IDX', word_to_idx[UNK], 'UNK_IDX')
flags.DEFINE_integer('BOS_IDX', word_to_idx[BOS], 'BOS_IDX')
flags.DEFINE_integer('EOS_IDX', word_to_idx[EOS], 'EOS_IDX')

flags.DEFINE_integer('n_vocab', len(word_to_idx), 'n_vocab')
flags.DEFINE_integer('dim_bow', len(bow_idxs), 'dim_bow')

maximum_iterations = max([max([instance.max_sent_l for instance in batch]) for ct, batch in dev_batches])
flags.DEFINE_integer('maximum_iterations', maximum_iterations, 'maximum_iterations')    

# build language model 

## feed dict

In [17]:
def get_feed_dict(batch, mode='train'):
    bow = np.array([instance.bow for instance in batch]).astype(np.float32)
    
    batch_size = len(batch)
    
    doc_l_matrix = np.array([instance.doc_l for instance in batch]).astype(np.int32)

    max_doc_l = np.max(doc_l_matrix)
    max_sent_l = max([instance.max_sent_l for instance in batch])

    token_idxs_matrix = np.zeros([batch_size, max_doc_l, max_sent_l], np.int32)
    dec_input_idxs_matrix = np.zeros([batch_size, max_doc_l, max_sent_l+1], np.int32)
    dec_target_idxs_matrix = np.zeros([batch_size, max_doc_l, max_sent_l+1], np.int32)
    sent_l_matrix = np.zeros([batch_size, max_doc_l], np.int32)
    dec_sent_l_matrix = np.zeros([batch_size, max_doc_l], np.int32)

    for i, instance in enumerate(batch):
        for j, sent_idxs in enumerate(instance.token_idxs):
            token_idxs_matrix[i, j, :len(sent_idxs)] = np.asarray(sent_idxs)
            
            sent_idxs_dropout = np.asarray(sent_idxs)
            sent_idxs_dropout[np.random.rand(len(sent_idxs)) > config.word_keep_prob] = config.UNK_IDX
            dec_input_idxs_matrix[i, j, :len(sent_idxs)+1] = np.concatenate([[config.BOS_IDX], sent_idxs_dropout])
            
            dec_target_idxs_matrix[i, j, :len(sent_idxs)+1] = np.asarray(sent_idxs + [config.EOS_IDX])
            sent_l_matrix[i, j] = len(sent_idxs)
            dec_sent_l_matrix[i, j] = len(sent_idxs)+1

    keep_prob = config.keep_prob if mode == 'train' else 1.0

    feed_dict = {
                t_variables['bow']: bow, t_variables['token_idxs']: token_idxs_matrix,
                t_variables['dec_input_idxs']: dec_input_idxs_matrix, t_variables['dec_target_idxs']: dec_target_idxs_matrix, 
                t_variables['batch_l']: batch_size, t_variables['doc_l']: doc_l_matrix, t_variables['sent_l']: sent_l_matrix, t_variables['dec_sent_l']: dec_sent_l_matrix,
                t_variables['max_doc_l']: max_doc_l, t_variables['max_sent_l']: max_sent_l, 
                t_variables['keep_prob']: keep_prob}
    return  feed_dict

In [18]:
def debug_shape(variables, sess_init=None):
    if sess_init is None:
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
    else:
        sess = sess_init
        
    sample_batch = test_batches[0][1]
    feed_dict = get_feed_dict(sample_batch)
    _variables = sess.run(variables, feed_dict=feed_dict)
    for _variable, variable in zip(_variables, variables):
        if hasattr(variable, 'name'):
            print(variable.name, ':', _variable.shape)
        else:
            print(_variable.shape)
            
    if sess_init is None: sess.close()

def debug_value(variables, return_value=False, sess_init=None):
    if sess_init is None:
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
    else:
        sess = sess_init

    sample_batch = test_batches[0][1]
    feed_dict = get_feed_dict(sample_batch)
    _variables = sess.run(variables, feed_dict=feed_dict)

    if return_value: 
        return _variables
    else:
        for _variable, variable in zip(_variables, variables):
            if hasattr(variable, 'name'):
                print(variable.name, ':', _variable)
            else:
                print(_variable)
                
    if sess_init is None: sess.close()

## fed variables

In [19]:
tf.reset_default_graph()

t_variables = {}
t_variables['bow'] = tf.placeholder(tf.float32, [None, config.dim_bow])
t_variables['token_idxs'] = tf.placeholder(tf.int32, [None, None, None])
t_variables['dec_input_idxs'] = tf.placeholder(tf.int32, [None, None, None])
t_variables['dec_target_idxs'] = tf.placeholder(tf.int32, [None, None, None])
t_variables['batch_l'] = tf.placeholder(tf.int32, [])
t_variables['doc_l'] = tf.placeholder(tf.int32, [None])
t_variables['sent_l'] = tf.placeholder(tf.int32, [None, None])
t_variables['dec_sent_l'] = tf.placeholder(tf.int32, [None, None])
t_variables['max_doc_l'] = tf.placeholder(tf.int32, [])
t_variables['max_sent_l'] = tf.placeholder(tf.int32, [])
t_variables['keep_prob'] = tf.placeholder(tf.float32)

## topic model

In [20]:
# encode bow
with tf.variable_scope('topic/enc', reuse=False):
    hidden_bow_ = tf.keras.layers.Dense(units=config.dim_hidden_bow, activation=tf.nn.relu, name='hidden')(t_variables['bow'])
    hidden_bow = tf.keras.layers.Dropout(t_variables['keep_prob'])(hidden_bow_)
    means_bow = tf.keras.layers.Dense(units=config.dim_latent_topic)(hidden_bow)
    logvars_bow = tf.keras.layers.Dense(units=config.dim_latent_topic, kernel_initializer=tf.constant_initializer(0), bias_initializer=tf.constant_initializer(0))(hidden_bow)
    latents_bow = sample_latents(means_bow, logvars_bow) # sample latent vectors

    prob_topic = tf.layers.Dense(units=config.n_topic, activation=tf.nn.softmax, name='prob')(latents_bow) # inference of topic probabilities

# decode bow
with tf.variable_scope('shared', reuse=False):
    embeddings = tf.get_variable('emb', [config.n_vocab, config.dim_emb], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # embeddings of vocab

bow_embeddings = tf.nn.embedding_lookup(embeddings, bow_idxs) # embeddings of each bow features

with tf.variable_scope('topic/dec', reuse=False):
    topic_embeddings = tf.get_variable('topic_emb', [config.n_topic, config.dim_emb], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # embeddings of topics

    topic_bow = tf.nn.softmax(tf.matmul(topic_embeddings, bow_embeddings, transpose_b=True), 1) # bow vectors for each topic
    prob_bow = tf_log(tf.matmul(prob_topic, topic_bow)) # predicted bow distribution

# define losses
topic_losses_recon = -tf.reduce_sum(tf.multiply(t_variables['bow'], prob_bow), 1)
topic_loss_recon = tf.reduce_mean(topic_losses_recon) # negative log likelihood of each words

topic_loss_kl = compute_kl_loss(means_bow, logvars_bow) # KL divergence b/w latent dist & gaussian std

topic_embeddings_norm = topic_embeddings / tf.norm(topic_embeddings, axis=1, keepdims=True)
topic_angles = tf.matmul(topic_embeddings_norm, tf.transpose(topic_embeddings_norm))
topic_angles_mean = tf.reduce_mean(topic_angles, keepdims=True)
topic_angles_vars = tf.reduce_mean(tf.square(topic_angles - topic_angles_mean))
topic_loss_reg = topic_angles_vars - tf.squeeze(topic_angles_mean)

# monitor
n_bow = tf.reduce_sum(t_variables['bow'], 1)
topic_ppls = tf.divide(topic_losses_recon, n_bow)
topics_freq_bow_indices = tf.nn.top_k(topic_bow, 10, name='topic_freq_bow').indices

## encoder

In [21]:
# input
batch_l = t_variables['batch_l']
max_doc_l = t_variables['max_doc_l']
max_sent_l = t_variables['max_sent_l']
token_idxs = t_variables['token_idxs'][:, :max_doc_l, :max_sent_l]

# get word embedding
with tf.variable_scope('sent/enc', reuse=False):
    enc_input = tf.nn.embedding_lookup(embeddings, token_idxs)
    enc_input_do = tf.reshape(enc_input, [batch_l * max_doc_l, max_sent_l, config.dim_emb])

    # get sentence embedding
    sent_l = t_variables['sent_l']
    sent_l_do = tf.reshape(sent_l, [batch_l * max_doc_l])

    _, enc_state_do = dynamic_rnn(enc_input_do, sent_l_do, config.dim_hidden, t_variables['keep_prob'])
    enc_state = tf.reshape(enc_state_do, [batch_l, max_doc_l, config.dim_hidden])

    # TODO House Holder flow
    prob_topic_sent = tf.layers.Dense(units=config.n_topic, activation=tf.nn.softmax)(enc_state)
    
    # inference of each gaussian dist. parameter
    enc_state_topic = tf.tile(tf.expand_dims(enc_state, 2), [1, 1, config.n_topic, 1]) # tile over topics
    means_sent_topic = tf.keras.layers.Dense(units=config.dim_latent)(enc_state_topic)
    logvars_sent_topic = tf.keras.layers.Dense(units=config.dim_latent, kernel_initializer=tf.constant_initializer(0), bias_initializer=tf.constant_initializer(0))(enc_state_topic)
    
    # latent vectors from each gaussian dist.
    latents_sent_topic = sample_latents(means_sent_topic, logvars_sent_topic) 
    # latent vector from gaussian mixture    
    latents_sent = tf.squeeze(tf.matmul(tf.expand_dims(prob_topic_sent, -1), latents_sent_topic, transpose_a=True), 2) 
    latents_sent_do = tf.reshape(latents_sent, [batch_l * max_doc_l, config.dim_latent])

In [38]:
debug_shape([token_idxs, enc_state, prob_topic_sent, latents_sent_topic, prob_topic_sent])

strided_slice:0 : (20, 9, 38)
sent/enc/Reshape_2:0 : (20, 9, 512)
sent/enc/dense/Reshape_1:0 : (20, 9, 10)
sent/enc/add:0 : (20, 9, 10, 32)
sent/enc/dense/Reshape_1:0 : (20, 9, 10)


## decoder

In [22]:
# prepare for decoding
dec_input_idxs = t_variables['dec_input_idxs']
dec_input_idxs_do = tf.reshape(dec_input_idxs, [batch_l * max_doc_l, max_sent_l+1])
dec_input_do = tf.nn.embedding_lookup(embeddings, dec_input_idxs_do)

dec_latent_input_do = tf.tile(tf.expand_dims(latents_sent_do, 1), [1, tf.shape(dec_input_do)[1], 1])
dec_concat_input_do = tf.concat([dec_input_do, dec_latent_input_do], 2)

# decode for training
dec_sent_l = t_variables['dec_sent_l']
dec_sent_l_do = tf.reshape(dec_sent_l, [batch_l * max_doc_l])

with tf.variable_scope('sent/dec/rnn', initializer=tf.contrib.layers.xavier_initializer(), dtype = tf.float32, reuse=tf.AUTO_REUSE):
    dec_cell = tf.contrib.rnn.GRUCell(config.dim_hidden)
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob = t_variables['keep_prob'])

    dec_initial_state = tf.layers.Dense(units=config.dim_hidden, activation=tf.nn.relu)(latents_sent_do)
    
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_concat_input_do, sequence_length=dec_sent_l_do)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=dec_cell,
        helper=helper,
        initial_state=dec_initial_state)

    dec_outputs, _, output_sent_l = tf.contrib.seq2seq.dynamic_decode(train_decoder)
    
    output_layer = tf.layers.Dense(config.n_vocab, use_bias=False, name='out')
    output_logits = output_layer(dec_outputs.rnn_output)
    
    output_token_idxs_do = tf.argmax(output_logits, 2)
    output_token_idxs = tf.reshape(output_token_idxs_do, [batch_l, max_doc_l, tf.shape(output_token_idxs_do)[1]])

## language modeling cost

In [23]:
# target and mask
dec_target_idxs = t_variables['dec_target_idxs']
dec_target_idxs_do = tf.reshape(dec_target_idxs, [batch_l * max_doc_l, max_sent_l+1])                
dec_mask_tokens_do = tf.sequence_mask(dec_sent_l_do, maxlen=max_sent_l+1, dtype=tf.float32)

# nll for each token (averaged over batch & sentence)
sent_loss_recon = tf.contrib.seq2seq.sequence_loss(output_logits, dec_target_idxs_do, dec_mask_tokens_do) 

In [24]:
# inferred mixture probabilities (computed for each sentence)
categ_topic_sent = tfd.Categorical(probs=prob_topic_sent)

# prior of mixture probabilities (computed for each document)
categ_topic = tfd.Categorical(probs=tf.expand_dims(prob_topic, 1))

sent_loss_kl_categ = tf.reduce_mean(tfd.kl_divergence(categ_topic_sent, categ_topic))

# prior of each gaussian gaussribution (computed for each topic)
with tf.variable_scope('topic/enc/infer', reuse=False):
    means_topic_bow = tf.keras.layers.Dense(units=config.dim_latent)(topic_bow)
    logvars_topic_bow = tf.keras.layers.Dense(units=config.dim_latent, kernel_initializer=tf.constant_initializer(0), bias_initializer=tf.constant_initializer(0))(topic_bow)
sigma_topic_bow = tf.exp(0.5 * logvars_topic_bow)
gauss_topic_bow = tfd.Normal(loc=means_topic_bow, scale=sigma_topic_bow)

# inference of each gaussian gaussribution (computed for each sentence)
sigma_sent_topic = tf.exp(0.5 * logvars_sent_topic)
gauss_sent_topic = tfd.Normal(loc=means_sent_topic, scale=sigma_sent_topic)

sent_loss_kl_gauss = tf.reduce_sum(tfd.kl_divergence(gauss_sent_topic, gauss_topic_bow), -1)
sent_loss_kl_gmm = tf.reduce_mean(tf.reduce_sum(tf.multiply(prob_topic_sent, sent_loss_kl_gauss), -1))

sent_loss_kl = sent_loss_kl_categ + sent_loss_kl_gmm

In [40]:
debug_shape([prob_topic_sent, prob_topic])

sent/enc/dense/Reshape_1:0 : (20, 9, 10)
topic/enc/prob/Softmax:0 : (20, 10)


In [41]:
debug_shape([means_topic_bow, means_sent_topic])

topic/enc/infer/dense/BiasAdd:0 : (10, 32)
sent/enc/dense_1/BiasAdd:0 : (20, 9, 10, 32)


## optimizer

In [26]:
beta = tf.Variable(config.beta, name='beta', trainable=False) if config.warmup > 0 else tf.constant(1., name='beta')
update_beta = tf.assign_add(beta, 1./(config.warmup*len(train_batches)))
sent_loss = sent_loss_recon + beta * sent_loss_kl

topic_loss = topic_loss_recon + topic_loss_kl + config.reg * topic_loss_reg
loss = topic_loss + sent_loss

# define optimizer
if config.opt == 'Adam':
    optimizer = tf.train.AdamOptimizer(config.lr)
elif config.opt == 'Adagrad':
    optimizer = tf.train.AdagradOptimizer(config.lr)
    
grad_vars = optimizer.compute_gradients(loss)
clipped_grad_vars = [(tf.clip_by_value(grad, -config.grad_clip, config.grad_clip), var) for grad, var in grad_vars]

opt = optimizer.apply_gradients(clipped_grad_vars)

# run model 

In [27]:
def idxs_to_sents(token_idxs, config, idx_to_word):
    sents = []
    for line_idxs in token_idxs:
        tokens = []
        for idx in line_idxs:
            if idx == config.EOS_IDX: break
            tokens.append(idx_to_word[idx])
        sent = ' '.join(tokens)
        sents.append(sent)
    return sents

In [28]:
def get_loss(sess, batches):
    losses = []
    ppl_list = []
    for ct, batch in batches:
        feed_dict = get_feed_dict(batch, mode='test')
        loss_batch, topic_loss_batch, sent_loss_batch, ppls_batch = sess.run([loss, topic_loss, sent_loss, topic_ppls], feed_dict = feed_dict)
        losses += [[loss_batch, topic_loss_batch, sent_loss_batch]]
        ppl_list += list(ppls_batch)
    loss_mean, topic_loss_mean, sent_loss_mean = np.mean(losses, 0)
    ppl_mean = np.exp(np.mean(ppl_list))
    return loss_mean, topic_loss_mean, sent_loss_mean, ppl_mean

def get_all_losses(sess, batches):
    losses = []
    for ct, batch in batches:
        feed_dict = get_feed_dict(batch, mode='test')
        loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch = \
        sess.run([loss, topic_loss_recon, topic_loss_kl, sent_loss_recon, sent_loss_kl], feed_dict = feed_dict)
        losses += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch]]
    print('LOSS %.2f | TM NLL: %.2f, KL: %.4f | LM NLL: %.2f, KL: %.4f' %  np.mean(losses, 0))

In [29]:
def print_sample(sample_batch):
    feed_dict = get_feed_dict(sample_batch)
    pred_token_idxs_batch = sess.run(output_token_idxs, feed_dict = feed_dict)
    true_token_idxs_batch = [instance.token_idxs for instance in sample_batch]
    
    assert len(pred_token_idxs_batch) == len(true_token_idxs_batch)
    
    for true_token_idxs, pred_token_idxs in zip(true_token_idxs_batch, pred_token_idxs_batch):
        true_sents = idxs_to_sents(true_token_idxs, config, idx_to_word)
        pred_sents = idxs_to_sents(pred_token_idxs, config, idx_to_word)
        assert len(true_sents) == len(pred_sents)
        
        for true_sent, pred_sent in zip(true_sents, pred_sents):
            print('True: %s' % true_sent)
            print('Pred: %s' % pred_sent)

In [30]:
if 'sess' in globals(): sess.close()
sess = tf.Session()

sess.run(tf.global_variables_initializer())

logs = []
losses_train = []
ppls_train = []
loss_min = np.inf
beta_eval = 1.
epoch = 0

In [31]:
time_start = time.time()
for ct, batch in train_batches:
    feed_dict = get_feed_dict(batch)
    if config.warmup > 0 and beta_eval < 1.0: sess.run(update_beta)

    _, loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch, ppls_batch = \
    sess.run([opt, loss, topic_loss_recon, topic_loss_kl, sent_loss_recon, sent_loss_kl, topic_ppls], feed_dict = feed_dict)
    
    losses_train += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch]]
    ppls_train += list(ppls_batch)

    if ct%config.log_period==0:
        time_dev = time.time()
        loss_train, topic_loss_recon_train, topic_loss_kl_train, sent_loss_recon_train, sent_loss_kl_train = np.mean(losses_train, 0)
        ppl_train = np.exp(np.mean(ppls_train))
        loss_dev, topic_loss_dev, sent_loss_dev, ppl_dev = get_loss(sess, dev_batches)
        
        if config.warmup > 0: beta_eval = beta.eval(session=sess)
        
        clear_output()
        time_finish = time.time()
        time_log = int(time_finish - time_start)
        time_log_dev = int(time_finish - time_dev)
        logs += [(time_log, time_log_dev, epoch, ct, loss_train, ppl_train, topic_loss_recon_train, topic_loss_kl_train, sent_loss_recon_train, sent_loss_kl_train, loss_dev, ppl_dev, topic_loss_dev, sent_loss_dev, beta_eval)]
        for log in logs:
            print('%03d[s], %02d[s], Ep: %02d, Ct: %05d|TR LOSS: %.0f, PPL: %.0f|TM NLL: %.0f, KL: %.2f | LM NLL: %.2f, KL: %.2f|DE LOSS: %.0f, PPL: %.0f, TM: %.0f, LM: %.2f|BETA: %.6f' %  log)

        print_sample(batch)
        
        time_start = time.time()

008[s], 06[s], Ep: 00, Ct: 00000|TR LOSS: 363, PPL: 2498|TM NLL: 352, KL: 0.60 | LM NLL: 9.95, KL: 0.81|DE LOSS: 340, PPL: 2494, TM: 330, LM: 9.95|BETA: 0.000000
True: the alaska board of fisheries and game
Pred: backbone main crossed decide caylee cheney manley canoe pekin manley supermarket textbook cheney denise barrels maher tools cheney major larger since pavement swords lap fulghum canoe undermine shattered pointe cheney address veterinary foundry furnace lockwood cheney cheney marek pointe canoe
True: a joint board
Pred: mia sponsorships mnsure listeriosis drought nordic bus ruby shelters submitting rowan cells garbage taylor cage productions corroon telling productions farina treats productions ihs z epilepsy steen productions dependent firefighters fact embrace pumpkins productions z jurisdiction sucked potential jacket ongoing dependent
True: has decided to forward two names to the governor for the position of department of fish and game commissioner
Pred: abbott tighten play

AssertionError: 

In [32]:
%debug

> [0;32m<ipython-input-29-6fa3dcb112b6>[0m(11)[0;36mprint_sample[0;34m()[0m
[0;32m      9 [0;31m        [0mtrue_sents[0m [0;34m=[0m [0midxs_to_sents[0m[0;34m([0m[0mtrue_token_idxs[0m[0;34m,[0m [0mconfig[0m[0;34m,[0m [0midx_to_word[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     10 [0;31m        [0mpred_sents[0m [0;34m=[0m [0midxs_to_sents[0m[0;34m([0m[0mpred_token_idxs[0m[0;34m,[0m [0mconfig[0m[0;34m,[0m [0midx_to_word[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 11 [0;31m        [0;32massert[0m [0mlen[0m[0;34m([0m[0mtrue_sents[0m[0;34m)[0m [0;34m==[0m [0mlen[0m[0;34m([0m[0mpred_sents[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     12 [0;31m[0;34m[0m[0m
[0m[0;32m     13 [0;31m        [0;32mfor[0m [0mtrue_sent[0m[0;34m,[0m [0mpred_sent[0m [0;32min[0m [0mzip[0m[0;34m([0m[0mtrue_sents[0m[0;34m,[0m [0mpred_sents[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ip

--KeyboardInterrupt--
ipdb> exit


# confirm variables

In [110]:
_logvars, _means, _kl_losses, _latents, _output_logits = sess.run([logvars, means, kl_losses, latents, output_logits], feed_dict=feed_dict)


In [111]:
_logvars.shape, _means.shape, _kl_losses.shape, _latents.shape

((32, 32), (32, 32), (32,), (32, 32))

In [112]:
_output_logits

array([[[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       ...,

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]], dtype=float32)

In [109]:
_output_logits, _dec_target_idxs_do, _dec_mask_tokens_do, _recon_loss, _kl_losses, _ = sess.run([output_logits, dec_target_idxs_do, dec_mask_tokens_do, recon_loss, kl_losses, opt], feed_dict=feed_dict)


NameError: name 'dec_target_idxs_do' is not defined

In [44]:
tf.reduce_max(output_logits, 2).eval(session=sess, feed_dict=feed_dict).shape

(120, 46)

In [31]:
_output_logits.shape, _dec_target_idxs_do.shape, _dec_mask_tokens_do.shape

((120, 46, 20000), (120, 46), (120, 46))

In [32]:
_logits = np.exp(_output_logits) / np.sum(np.exp(_output_logits), 2)[:, :, None]

In [33]:
_idxs = _dec_target_idxs_do

In [35]:
_losses = np.array([[-np.log(_logits[i, j, _idxs[i, j]]) for j in range(_idxs.shape[1])] for i in range(_idxs.shape[0])]) * _dec_mask_tokens_do

In [36]:
np.sum(_losses)/np.sum(_dec_mask_tokens_do)

9.903732

In [37]:
_recon_loss

9.903732

In [38]:
_kl_losses.shape

(120,)