In [1]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import sys
import pdb
import time
import datetime
import math
import random
import _pickle as cPickle
from collections import defaultdict

from six.moves import zip_longest
import numpy as np

import tensorflow as tf
from tensorflow import distributions as tfd
from tensorflow.keras.preprocessing.sequence import pad_sequences

from data_structure import get_batches
from components import tf_log, sample_latents, compute_kl_loss, dynamic_rnn, dynamic_bi_rnn
from topic_model import TopicModel

# from absl import logging
# logging.warning('Worrying Stuff')

In [2]:
# Special vocabulary tokens. Each one is looked up in `word_to_idx` further down
# to obtain the integer id stored in the PAD_IDX / UNK_IDX / BOS_IDX / EOS_IDX flags.
PAD = '<pad>' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNK = '<unk>' # This has a vocab id, which is used to represent out-of-vocabulary words
BOS = '<p>' # This has a vocab id, which is used at the beginning of every decoder input sequence
EOS = '</p>' # This has a vocab id, which is used at the end of untruncated target sequences

# load data & set config

In [3]:
def del_all_flags(FLAGS):
    """Remove every flag currently registered on ``FLAGS``.

    The flag names are snapshotted into a list first, so the registry returned
    by ``FLAGS._flags()`` is not mutated while it is being iterated.
    """
    registered_names = list(FLAGS._flags())
    for flag_name in registered_names:
        delattr(FLAGS, flag_name)

# Wipe all previously registered flags before (re)defining them; presumably this is
# what lets the cell be re-run in the same kernel without duplicate-flag errors.
del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

# id(s) of the GPU(s) exported through CUDA_VISIBLE_DEVICES
flags.DEFINE_string('gpu', '2', 'visible gpu')

flags.DEFINE_string('mode', 'train', 'set train or eval')

# data / checkpoint locations
flags.DEFINE_string('data_path', 'data/apnews/instances.pkl', 'path of data')
flags.DEFINE_string('modeldir', 'model', 'directory of model')
flags.DEFINE_string('modelname', 'sports', 'name of model')

# training schedule; `log_period` is the number of batches between dev evaluations
flags.DEFINE_integer('epochs', 1000, 'epochs')
flags.DEFINE_integer('batch_size', 128, 'number of sentences in each batch')
flags.DEFINE_integer('log_period', 500, 'valid period')

# optimisation: `opt` selects between 'Adam' and 'Adagrad' in the optimizer cell,
# `reg` weights the topic-embedding regulariser, `beta` is the initial KL weight
flags.DEFINE_string('opt', 'Adagrad', 'optimizer')
flags.DEFINE_float('lr', 0.1, 'lr')
flags.DEFINE_float('reg', 0.1, 'regularization term')
flags.DEFINE_float('beta', 0.0, 'initial value of beta')
flags.DEFINE_float('grad_clip', 5., 'grad_clip')

# NOTE(review): the help strings say "dropout rate", but both values are used as
# *keep* probabilities (fed to the `keep_prob` placeholder / compared against
# np.random.rand in get_feed_dict).
flags.DEFINE_float('keep_prob', 0.8, 'dropout rate')
flags.DEFINE_float('word_keep_prob', 0.75, 'word dropout rate')

# number of epochs over which the KL weight `beta` is annealed
flags.DEFINE_integer('warmup', 10, 'warmup period for KL')
flags.DEFINE_integer('warmup_topic', 0, 'warmup period for KL of topic')

flags.DEFINE_integer('beam_width', 2, 'beam_width')
flags.DEFINE_float('length_penalty_weight', 0.0, 'length_penalty_weight')

# model dimensions
flags.DEFINE_integer('n_topic', 10, 'number of topic')
flags.DEFINE_integer('dim_hidden_bow', 256, 'dim of hidden bow')
flags.DEFINE_integer('dim_latent_topic', 32, 'dim of latent topic')
flags.DEFINE_integer('dim_emb', 256, 'dim_emb')
flags.DEFINE_integer('dim_hidden', 512, 'dim_hidden')
flags.DEFINE_integer('dim_latent', 32, 'dim_latent')


# for evaluation
flags.DEFINE_string('refdir', 'ref', 'refdir')
flags.DEFINE_string('outdir', 'out', 'outdir')

# Placeholder flags; presumably they absorb the arguments the Jupyter kernel / absl
# logging put on the command line (e.g. `-f <connection file>`) — TODO confirm.
flags.DEFINE_string('f', '', 'kernel')
flags.DEFINE_bool('logtostderr', True, 'kernel')
flags.DEFINE_bool('showprefixforinfo', False, '')
flags.DEFINE_bool('verbosity', False, '')
# flags.DEFINE_integer('stderrthreshold', 20, 'kernel')

# `config` is the global flag container read by every later cell
config = flags.FLAGS

In [4]:
# Expose only the GPU(s) named by the `gpu` flag to CUDA; set before any tf.Session is created.
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu

In [5]:
# Load the preprocessed corpus: train/dev/test instances, both vocabulary mappings
# and the vocabulary ids used as bag-of-words features.
# The file is opened in a `with` block so the handle is closed right after loading.
# NOTE(security): unpickling can execute arbitrary code; only load files from a trusted source.
with open(config.data_path, 'rb') as data_file:
    instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(data_file)

In [6]:
# Group every split into batches of `batch_size`. Later cells iterate the result
# as `(ct, batch)` pairs, where `batch` is a list of instances.
train_batches = get_batches(instances_train, config.batch_size)
dev_batches = get_batches(instances_dev, config.batch_size)
test_batches = get_batches(instances_test, config.batch_size)

In [7]:
# Register data-dependent settings as flags so they are reachable through `config`.
# vocabulary ids of the special tokens
flags.DEFINE_integer('PAD_IDX', word_to_idx[PAD], 'PAD_IDX')
flags.DEFINE_integer('UNK_IDX', word_to_idx[UNK], 'UNK_IDX')
flags.DEFINE_integer('BOS_IDX', word_to_idx[BOS], 'BOS_IDX')
flags.DEFINE_integer('EOS_IDX', word_to_idx[EOS], 'EOS_IDX')

# vocabulary size and number of bag-of-words features
flags.DEFINE_integer('n_vocab', len(word_to_idx), 'n_vocab')
flags.DEFINE_integer('dim_bow', len(bow_idxs), 'dim_bow')

# largest `max_sent_l` over all dev instances (only the dev split is scanned)
maximum_iterations = max([max([instance.max_sent_l for instance in batch]) for ct, batch in dev_batches])
flags.DEFINE_integer('maximum_iterations', maximum_iterations, 'maximum_iterations')    

# build language model 

## feed dict

In [42]:
# Start from an empty default graph so re-running the model cells does not pile up ops.
tf.reset_default_graph()

# Placeholders fed by get_feed_dict(); sentences of all documents in a batch are
# flattened into a single leading "sentence" axis.
t_variables = {}
t_variables['bow'] = tf.placeholder(tf.float32, [None, config.dim_bow])  # [n_docs, dim_bow] bag-of-words vectors
t_variables['input_token_idxs'] = tf.placeholder(tf.int32, [None, None])  # [n_sents, max_sent_l] encoder token ids
t_variables['dec_input_idxs'] = tf.placeholder(tf.int32, [None, None])  # [n_sents, max_sent_l + 1], BOS-prefixed
t_variables['dec_target_idxs'] = tf.placeholder(tf.int32, [None, None])  # [n_sents, max_sent_l + 1], EOS-suffixed
t_variables['doc_l'] = tf.placeholder(tf.int32, [None])  # number of sentences per document
t_variables['sent_l'] = tf.placeholder(tf.int32, [None])  # number of tokens per sentence
t_variables['keep_prob'] = tf.placeholder(tf.float32)  # scalar keep probability for dropout

In [43]:
def get_feed_dict(batch, mode='train', assertion=False):
    """Build the feed dict for one batch of documents.

    Sentences of all instances are flattened (document order, then sentence
    order) and padded with ``config.PAD_IDX`` to the longest sentence.

    Args:
        batch: iterable of instances exposing ``bow`` and ``token_idxs``
            (a list of token-id lists, one per sentence).
        mode: ``'train'`` enables dropout (``config.keep_prob``) and word
            dropout on the decoder inputs; any other value disables both so
            that evaluation is deterministic.
        assertion: when True, run consistency checks on the flattened and
            padded arrays.

    Returns:
        dict mapping the placeholders in ``t_variables`` to numpy values.
    """
    def token_dropout(sent_idxs):
        # np.array always copies, so the caller's sequence is never modified in place
        sent_idxs_dropout = np.array(sent_idxs)
        sent_idxs_dropout[np.random.rand(len(sent_idxs)) > config.word_keep_prob] = config.UNK_IDX
        return list(sent_idxs_dropout)

    is_train = mode == 'train'

    bow = np.array([instance.bow for instance in batch]).astype(np.float32)
    
    doc_l = np.array([len(instance.token_idxs) for instance in batch])
    
    feed_input_token_idxs_list = [sent_idxs for instance in batch for sent_idxs in instance.token_idxs]
    # word dropout is a training-time regulariser only: outside training the decoder
    # sees the unmodified tokens (mirrors how keep_prob is handled below)
    feed_dec_input_idxs_list = [[config.BOS_IDX] + (token_dropout(sent_idxs) if is_train else list(sent_idxs)) for sent_idxs in feed_input_token_idxs_list]
    feed_dec_target_idxs_list = [sent_idxs + [config.EOS_IDX]  for sent_idxs in feed_input_token_idxs_list]
        
    sent_l = np.array([len(sent_idxs) for sent_idxs in feed_input_token_idxs_list], np.int32)
    
    feed_input_token_idxs = pad_sequences(feed_input_token_idxs_list, padding='post', value=config.PAD_IDX, dtype=np.int32)
    feed_dec_input_idxs = pad_sequences(feed_dec_input_idxs_list, padding='post', value=config.PAD_IDX, dtype=np.int32)
    feed_dec_target_idxs = pad_sequences(feed_dec_target_idxs_list, padding='post', value=config.PAD_IDX, dtype=np.int32)
    
    if assertion:
        # the flattened list must follow document order, then sentence order
        index = 0
        for instance in batch:
            for line_idxs in instance.token_idxs:
                assert feed_input_token_idxs_list[index] == line_idxs
                index += 1
        # decoder sequences are one step longer (BOS prefix / EOS suffix)
        assert feed_input_token_idxs.shape[1] == np.max(sent_l)
        assert feed_dec_input_idxs.shape[1] == np.max(sent_l) + 1
        assert feed_dec_target_idxs.shape[1] == np.max(sent_l) + 1
    
    keep_prob = config.keep_prob if is_train else 1.0

    feed_dict = {
                t_variables['bow']: bow, 
                t_variables['doc_l']: doc_l, t_variables['sent_l']: sent_l, 
                t_variables['input_token_idxs']: feed_input_token_idxs, t_variables['dec_input_idxs']: feed_dec_input_idxs, t_variables['dec_target_idxs']: feed_dec_target_idxs, 
                t_variables['keep_prob']: keep_prob
    }
    return  feed_dict

In [44]:
# Smoke test: build a feed dict for the first test batch (trailing `;` suppresses the output).
batch = test_batches[0][1]
get_feed_dict(batch);

In [45]:
def debug_shape(variables, sess_init=None):
    """Evaluate ``variables`` on the first test batch and print each result's shape.

    Args:
        variables: list of tensors/variables to fetch.
        sess_init: session to evaluate in. When None, a temporary session is
            created, all global variables are initialised in it, and it is
            closed again before returning (also when evaluation fails).
    """
    owns_session = sess_init is None
    sess = tf.Session() if owns_session else sess_init

    try:
        if owns_session:
            sess.run(tf.global_variables_initializer())

        sample_batch = test_batches[0][1]
        feed_dict = get_feed_dict(sample_batch)
        _variables = sess.run(variables, feed_dict=feed_dict)
        for _variable, variable in zip(_variables, variables):
            if hasattr(variable, 'name'):
                print(variable.name, ':', _variable.shape)
            else:
                print(_variable.shape)
    finally:
        # never close a session handed in by the caller
        if owns_session: sess.close()

def debug_value(variables, return_value=False, sess_init=None):
    """Evaluate ``variables`` on the first test batch and print or return the values.

    Args:
        variables: list of tensors/variables to fetch.
        return_value: when True, return the fetched values instead of printing.
        sess_init: session to evaluate in. When None, a temporary session is
            created, all global variables are initialised in it, and it is
            closed again before returning — including on the ``return_value``
            path and when evaluation fails.

    Returns:
        The list of fetched values if ``return_value`` is True, otherwise None.
    """
    owns_session = sess_init is None
    sess = tf.Session() if owns_session else sess_init

    try:
        if owns_session:
            sess.run(tf.global_variables_initializer())

        sample_batch = test_batches[0][1]
        feed_dict = get_feed_dict(sample_batch)
        _variables = sess.run(variables, feed_dict=feed_dict)

        if return_value:
            return _variables

        for _variable, variable in zip(_variables, variables):
            if hasattr(variable, 'name'):
                print(variable.name, ':', _variable)
            else:
                print(_variable)
    finally:
        # never close a session handed in by the caller
        if owns_session: sess.close()

## topic model

In [46]:
# encode bow: bag-of-words -> hidden layer -> Gaussian latent -> topic proportions
with tf.variable_scope('topic/enc', reuse=False):
    hidden_bow_ = tf.keras.layers.Dense(units=config.dim_hidden_bow, activation=tf.nn.relu, name='hidden')(t_variables['bow'])
    # The placeholder holds a *keep* probability, whereas tf.keras.layers.Dropout expects
    # the fraction to *drop*; tf.nn.dropout takes keep_prob directly, so it is used instead.
    hidden_bow = tf.nn.dropout(hidden_bow_, keep_prob=t_variables['keep_prob'])
    means_bow = tf.keras.layers.Dense(units=config.dim_latent_topic)(hidden_bow)
    # zero-initialised so the log-variance starts at 0 (unit variance)
    logvars_bow = tf.keras.layers.Dense(units=config.dim_latent_topic, kernel_initializer=tf.constant_initializer(0), bias_initializer=tf.constant_initializer(0))(hidden_bow)
    latents_bow = sample_latents(means_bow, logvars_bow) # sample latent vectors

    prob_topic = tf.layers.Dense(units=config.n_topic, activation=tf.nn.softmax, name='prob')(latents_bow) # inference of topic probabilities

# decode bow
with tf.variable_scope('shared', reuse=False):
    # word embeddings; also used by the sentence encoder/decoder cells
    embeddings = tf.get_variable('emb', [config.n_vocab, config.dim_emb], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # embeddings of vocab

bow_embeddings = tf.nn.embedding_lookup(embeddings, bow_idxs) # embeddings of each bow features

with tf.variable_scope('topic/dec', reuse=False):
    topic_embeddings = tf.get_variable('topic_emb', [config.n_topic, config.dim_emb], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # embeddings of topics

    # [n_topic, dim_bow]: softmax over bow features of the topic/word embedding similarity
    topic_bow = tf.nn.softmax(tf.matmul(topic_embeddings, bow_embeddings, transpose_b=True), 1) # bow vectors for each topic
    # log of the topic-mixture word distribution (these are log-probabilities despite the name)
    logits_bow = tf_log(tf.matmul(prob_topic, topic_bow)) # predicted bow distribution

# prior of each gaussian distribution (computed for each topic)
with tf.variable_scope('topic/enc/infer', reuse=False):
    means_topic = tf.keras.layers.Dense(units=config.dim_latent)(topic_bow)
    logvars_topic = tf.keras.layers.Dense(units=config.dim_latent, kernel_initializer=tf.constant_initializer(0), bias_initializer=tf.constant_initializer(0))(topic_bow)
sigma_topic = tf.exp(0.5 * logvars_topic)
gauss_topic = tfd.Normal(loc=means_topic, scale=sigma_topic)    
    
# define losses
topic_losses_recon = -tf.reduce_sum(tf.multiply(t_variables['bow'], logits_bow), 1)
topic_loss_recon = tf.reduce_mean(topic_losses_recon) # negative log likelihood of the bow, averaged over documents

topic_loss_kl = compute_kl_loss(means_bow, logvars_bow) # KL divergence b/w latent dist & gaussian std

# regulariser on pairwise cosine similarities between topic embeddings:
# variance of the similarities minus their mean
topic_embeddings_norm = topic_embeddings / tf.norm(topic_embeddings, axis=1, keepdims=True)
topic_angles = tf.matmul(topic_embeddings_norm, tf.transpose(topic_embeddings_norm))
topic_angles_mean = tf.reduce_mean(topic_angles, keepdims=True)
topic_angles_vars = tf.reduce_mean(tf.square(topic_angles - topic_angles_mean))
topic_loss_reg = topic_angles_vars - tf.squeeze(topic_angles_mean)

# monitor
n_bow = tf.reduce_sum(t_variables['bow'], 1)
# per-document NLL divided by the bow total; exponentiated into a perplexity by the callers
topic_ppls = tf.divide(topic_losses_recon, n_bow)
# indices (into bow_idxs) of the 10 highest-probability bow features of each topic
topics_freq_bow_indices = tf.nn.top_k(topic_bow, 10, name='topic_freq_bow').indices

In [50]:
# Shape check of the topic encoder outputs on the first test batch.
debug_shape([hidden_bow, latents_bow, prob_topic])

topic/enc/dropout/cond/Merge:0 : (21, 256)
topic/enc/add:0 : (21, 32)
topic/enc/prob/Softmax:0 : (21, 13)


In [55]:
# Shape check of the topic decoder, the per-topic prior means and the per-document reconstruction loss.
debug_shape([topic_embeddings, topic_bow, logits_bow, means_topic, topic_losses_recon])

topic/dec/topic_emb:0 : (13, 256)
topic/dec/Softmax:0 : (13, 2661)
topic/dec/Log:0 : (21, 2661)
topic/enc/infer/dense_2/BiasAdd:0 : (13, 32)
Neg:0 : (21,)


In [63]:
# Sanity check: prob_topic is a softmax output, so each row should sum to ~1.
debug_value([tf.reduce_sum(prob_topic, 1)])

Sum_10:0 : [1.0000001  0.9999999  0.9999999  1.         1.         1.0000001
 1.         1.         1.         1.         0.99999994 1.
 1.         1.         1.         1.         1.         1.
 0.99999994 1.         0.99999994]


## encoder

In [57]:
# input
input_token_idxs = t_variables['input_token_idxs']
# dynamic batch size (number of sentences) and padded sentence length
batch_l, max_sent_l = tf.shape(input_token_idxs)[0], tf.shape(input_token_idxs)[1]
sent_l = t_variables['sent_l']

with tf.variable_scope('sent/enc', reuse=False):
    # get word embedding
    enc_input = tf.nn.embedding_lookup(embeddings, input_token_idxs)

    # get sentence embedding
    # (dynamic_rnn is a project helper; the debug cell below shows enc_state as [n_sents, dim_hidden])
    _, enc_state = dynamic_rnn(enc_input, sent_l, config.dim_hidden, t_variables['keep_prob'])

    # TODO House Holder flow
    # per-sentence mixture weights over topics, [n_sents, n_topic]
    prob_topic_infer = tf.layers.Dense(units=config.n_topic, activation=tf.nn.softmax)(enc_state)
    
    # inference of each gaussian dist. parameter
    enc_state_infer = tf.tile(tf.expand_dims(enc_state, 1), [1, config.n_topic, 1]) # tile over topics
    # NOTE(review): the tiled input is identical along the topic axis and Dense acts on the
    # last axis only, so means/logvars appear to be the same for every topic of a sentence
    # (only the sampled noise would differ) — confirm this is intended.
    means_topic_infer = tf.keras.layers.Dense(units=config.dim_latent)(enc_state_infer)
    logvars_topic_infer = tf.keras.layers.Dense(units=config.dim_latent, kernel_initializer=tf.constant_initializer(0), bias_initializer=tf.constant_initializer(0))(enc_state_infer)
    sigma_topic_infer = tf.exp(0.5 * logvars_topic_infer)
    gauss_topic_infer = tfd.Normal(loc=means_topic_infer, scale=sigma_topic_infer)

    # latent vectors from each gaussian dist.
    latents_topic_infer = sample_latents(means_topic_infer, logvars_topic_infer) 
    # latent vector from gaussian mixture    
    # ([n_sents, 1, n_topic] x [n_sents, n_topic, dim_latent] -> [n_sents, 1, dim_latent]:
    #  topic-probability-weighted sum of the per-topic latents)
    latents_input = tf.matmul(tf.expand_dims(prob_topic_infer, -1), latents_topic_infer, transpose_a=True)

In [58]:
# Shape check of the sentence encoder tensors on the first test batch.
debug_shape([input_token_idxs, enc_state, enc_state_infer, latents_topic_infer, prob_topic_infer, latents_input])

Placeholder_1:0 : (132, 40)
sent/enc/rnn/while/Exit_3:0 : (132, 512)
sent/enc/Tile:0 : (132, 13, 512)
sent/enc/add:0 : (132, 13, 32)
sent/enc/dense/Softmax:0 : (132, 13)
sent/enc/MatMul:0 : (132, 1, 32)


In [62]:
# Sanity check: prob_topic_infer is a softmax output, so each row should sum to ~1.
debug_value([tf.reduce_sum(prob_topic_infer, 1)])

Sum_9:0 : [1.0000001  1.         0.99999994 1.         1.         0.9999999
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.99999994 1.         0.99999994
 1.         1.         1.         1.         1.         0.99999994
 1.         1.         0.99999994 1.         1.         0.9999999
 1.         1.         0.99999994 1.         0.99999994 1.
 1.         1.         1.0000001  1.         1.         1.
 1.         1.         1.         1.         1.         0.9999999
 1.         0.99999994 0.99999994 1.         1.         1.
 1.         1.         1.         0.9999999  1.         1.
 1.         1.         1.         1.         1.         0.99999994
 0.99999994 1.         1.         1.         1.         0.9999999
 1.         0.99999994 1.         1.         1.         1.
 1.         1.         1.         1.         1.         0.99999994
 1.         1.         1.         1.         1.         1.
 1.         0.99999994 1.         1.         

## decoder

In [65]:
# prepare for decoding
# decoder sequences are one step longer than the encoder input (BOS prefix / EOS suffix)
dec_sent_l = tf.add(sent_l, 1)
dec_input_idxs = t_variables['dec_input_idxs']
dec_input = tf.nn.embedding_lookup(embeddings, dec_input_idxs)

# repeat the sentence latent at every time step and concatenate it to the word embedding
dec_latents_input = tf.tile(latents_input, [1, tf.shape(dec_input)[1], 1])
dec_concat_input = tf.concat([dec_input, dec_latents_input], -1)

# decode for training
with tf.variable_scope('sent/dec/rnn', initializer=tf.contrib.layers.xavier_initializer(), dtype = tf.float32, reuse=tf.AUTO_REUSE):
    dec_cell = tf.contrib.rnn.GRUCell(config.dim_hidden)
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob = t_variables['keep_prob'])

    # initial GRU state is a ReLU projection of the latent ([n_sents, 1, dim_latent] -> [n_sents, dim_latent] -> [n_sents, dim_hidden])
    dec_initial_state = tf.layers.Dense(units=config.dim_hidden, activation=tf.nn.relu)(tf.squeeze(latents_input, 1))
    
    # teacher forcing: the decoder inputs come from the feed dict at every step
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_concat_input, sequence_length=dec_sent_l)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=dec_cell,
        helper=helper,
        initial_state=dec_initial_state)

    dec_outputs, _, output_sent_l = tf.contrib.seq2seq.dynamic_decode(train_decoder)
    
    # the vocabulary projection is applied after decoding, to all time steps at once
    output_layer = tf.layers.Dense(config.n_vocab, use_bias=False, name='out')
    output_logits = output_layer(dec_outputs.rnn_output)
    
    # greedy token prediction at each step, used by print_sample
    output_token_idxs = tf.argmax(output_logits, 2)

In [69]:
# Shape check of the decoder inputs, initial state, logits and greedy predictions.
debug_shape([dec_concat_input, dec_initial_state, output_logits, output_token_idxs])

concat:0 : (132, 41, 288)
sent/dec/rnn/dense/Relu:0 : (132, 512)
sent/dec/rnn/out/Tensordot:0 : (132, 41, 21867)
sent/dec/rnn/ArgMax:0 : (132, 41)


## language modeling cost

In [71]:
# target and mask
dec_target_idxs = t_variables['dec_target_idxs']
# 1.0 for real target positions (tokens + EOS), 0.0 for padding
dec_mask_tokens = tf.sequence_mask(dec_sent_l, maxlen=max_sent_l+1, dtype=tf.float32)

# nll for each token (averaged over batch & sentence)
sent_loss_recon = tf.contrib.seq2seq.sequence_loss(output_logits, dec_target_idxs, dec_mask_tokens)

In [72]:
# Expand the per-document topic probabilities to one row per sentence, in the same
# (document, then sentence) order in which get_feed_dict flattens the sentences.
doc_l = t_variables['doc_l']
mask_sents = tf.sequence_mask(doc_l)  # [n_docs, max_doc_l], True for real sentences
mask_sents_flatten = tf.reshape(mask_sents, [tf.shape(mask_sents)[0]*tf.shape(mask_sents)[1]])

# repeat each document's probabilities max_doc_l times, flatten, then drop the padded slots
prob_topic_tiled = tf.tile(tf.expand_dims(prob_topic, 1), [1, tf.shape(mask_sents)[1], 1])
prob_topic_flatten = tf.reshape(prob_topic_tiled, [tf.shape(mask_sents)[0]*tf.shape(mask_sents)[1], config.n_topic])
prob_topic_sents = tf.boolean_mask(prob_topic_flatten, mask_sents_flatten)  # [n_sents, n_topic]

In [73]:
# inferred mixture probabilities (computed for each sentence)
categ_topic_infer = tfd.Categorical(probs=prob_topic_infer)

# prior of mixture probabilities (computed for each document, tiled for each sentence)
categ_topic = tfd.Categorical(probs=prob_topic_sents)

# KL(inferred || prior) over the topic assignment, averaged over sentences
sent_loss_kl_categ = tf.reduce_mean(tfd.kl_divergence(categ_topic_infer, categ_topic))

# KL between each inferred Gaussian (per sentence and topic) and that topic's prior Gaussian;
# the [n_topic, dim_latent] prior broadcasts over the sentence axis, and the latent dims are summed

sent_loss_kl_gauss = tf.reduce_sum(tfd.kl_divergence(gauss_topic_infer, gauss_topic), -1)
# expectation of the per-topic KL under the inferred topic probabilities, averaged over sentences
sent_loss_kl_gmm = tf.reduce_mean(tf.reduce_sum(tf.multiply(prob_topic_infer, sent_loss_kl_gauss), -1))

sent_loss_kl = sent_loss_kl_categ + sent_loss_kl_gmm

In [77]:
# Shape check of the document lengths and the sentence masks.
debug_shape([doc_l, mask_sents, mask_sents_flatten])

Placeholder_4:0 : (21,)
SequenceMask_1/Less:0 : (21, 9)
Reshape:0 : (189,)


In [81]:
# prob_topic_sents must end up with the same shape as prob_topic_infer (one row per sentence).
debug_shape([prob_topic_tiled, prob_topic_flatten, prob_topic_sents, prob_topic_infer])

Tile_1:0 : (21, 9, 13)
Reshape_1:0 : (189, 13)
boolean_mask/GatherV2:0 : (132, 13)
sent/enc/dense/Softmax:0 : (132, 13)


In [76]:
# Inspect the actual mask values.
debug_value([mask_sents, mask_sents_flatten])

SequenceMask_1/Less:0 : [[ True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True False]
 [ True  True  True  True  True  True  True  True False]
 [ True  True  True  True  True  True  True  True False]
 [ True  True  True  True  True  True  True  True False]
 [ True  True  True  True  True  True  True False False]
 [ True  True  True  True  True  True  True False False]
 [ True  True  True  True  True  True  True False False]
 [ True  True  True  True  True  True  True False False]
 [ True  True  True  True  True  True False False False]
 [ True  True  True  True  True  True False False False]
 [ True  True  True  True  True  True False False False]
 [ True  True  True  True  True  True False False False]
 [ True  True  True  True  True  True False False False]
 [ True  True  True  True  True  True False False False]
 [ True  True  True  True  True False False False False]
 [ True  True  True  True  True False False False False]
 [ True

In [85]:
# Compare the shapes of the inferred (per sentence and topic) and prior (per topic) means.
debug_shape([means_topic_infer, means_topic])

sent/enc/dense_5/BiasAdd:0 : (132, 13, 32)
topic/enc/infer/dense_2/BiasAdd:0 : (13, 32)


In [92]:
# Sanity check of the broadcasting in sent_loss_kl_gauss: tile the topic prior
# explicitly over the sentence axis and recompute the KL; both printed arrays should match.
means_topic_tmp = tf.tile(tf.expand_dims(means_topic, 0), [batch_l, 1, 1])
sigma_topic_tmp = tf.tile(tf.expand_dims(sigma_topic, 0), [batch_l, 1, 1])

gauss_topic_tmp = tfd.Normal(loc=means_topic_tmp, scale=sigma_topic_tmp)
sent_loss_kl_gauss_tmp = tf.reduce_sum(tfd.kl_divergence(gauss_topic_infer, gauss_topic_tmp), -1)

debug_shape([means_topic_tmp])
debug_value([sent_loss_kl_gauss, sent_loss_kl_gauss_tmp])

Tile_2:0 : (132, 13, 32)
Sum_11:0 : [[0.00031778 0.00031761 0.00031753 ... 0.00031782 0.00031723 0.00031761]
 [0.00026743 0.00026744 0.00026729 ... 0.0002673  0.00026752 0.00026738]
 [0.00021236 0.00021253 0.00021226 ... 0.00021252 0.00021227 0.00021219]
 ...
 [0.00011392 0.00011369 0.00011354 ... 0.00011363 0.00011366 0.00011365]
 [0.00024045 0.00024016 0.00024023 ... 0.00024029 0.00024018 0.00024052]
 [0.00025667 0.00025701 0.00025676 ... 0.00025693 0.0002569  0.00025666]]
Sum_18:0 : [[0.00031778 0.00031761 0.00031753 ... 0.00031782 0.00031723 0.00031761]
 [0.00026743 0.00026744 0.00026729 ... 0.0002673  0.00026752 0.00026738]
 [0.00021236 0.00021253 0.00021226 ... 0.00021252 0.00021227 0.00021219]
 ...
 [0.00011392 0.00011369 0.00011354 ... 0.00011363 0.00011366 0.00011365]
 [0.00024045 0.00024016 0.00024023 ... 0.00024029 0.00024018 0.00024052]
 [0.00025667 0.00025701 0.00025676 ... 0.00025693 0.0002569  0.00025666]]


## optimizer

In [20]:
# KL weight `beta`: annealed linearly from config.beta, reaching +1 after `warmup`
# epochs' worth of updates, or fixed to 1 when warmup is disabled.
if config.warmup > 0:
    beta = tf.Variable(config.beta, name='beta', trainable=False)
    update_beta = tf.assign_add(beta, 1./(config.warmup*len(train_batches)))
else:
    # no annealing: a constant cannot be assigned to, and the step size above would
    # divide by zero, so the update is a no-op
    beta = tf.constant(1., name='beta')
    update_beta = tf.no_op(name='update_beta')
sent_loss = sent_loss_recon + beta * sent_loss_kl

topic_loss = topic_loss_recon + topic_loss_kl + config.reg * topic_loss_reg
loss = topic_loss + sent_loss

# define optimizer
if config.opt == 'Adam':
    optimizer = tf.train.AdamOptimizer(config.lr)
elif config.opt == 'Adagrad':
    optimizer = tf.train.AdagradOptimizer(config.lr)
else:
    # fail here with a clear message instead of a NameError on `optimizer` below
    raise ValueError('unsupported optimizer: %s' % config.opt)
    
grad_vars = optimizer.compute_gradients(loss)
# element-wise clipping; variables the loss does not depend on come back with a None
# gradient and are skipped (tf.clip_by_value cannot take None)
clipped_grad_vars = [(tf.clip_by_value(grad, -config.grad_clip, config.grad_clip), var) for grad, var in grad_vars if grad is not None]

opt = optimizer.apply_gradients(clipped_grad_vars)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


# run model 

In [21]:
def idxs_to_sents(token_idxs, config, idx_to_word):
    """Convert sequences of token ids into space-joined sentences.

    Each sequence is read up to, but not including, the first occurrence of
    ``config.EOS_IDX``; every remaining id is mapped through ``idx_to_word``.

    Returns:
        list of str, one sentence per input sequence.
    """
    def words_until_eos(sent_idxs):
        # yield words lazily and stop at the end-of-sentence marker
        for token_idx in sent_idxs:
            if token_idx == config.EOS_IDX:
                return
            yield idx_to_word[token_idx]

    return [' '.join(words_until_eos(sent_idxs)) for sent_idxs in token_idxs]

In [22]:
def get_loss(sess, batches):
    """Evaluate total / topic / sentence loss and topic perplexity over ``batches``.

    Each batch is fed in 'test' mode. Losses are averaged over batches
    (unweighted); the perplexity is the exponential of the mean per-document
    value of ``topic_ppls``.

    Returns:
        (loss_mean, topic_loss_mean, sent_loss_mean, ppl_mean)
    """
    batch_losses = []
    doc_ppls = []
    fetches = [loss, topic_loss, sent_loss, topic_ppls]
    for _, batch in batches:
        feed = get_feed_dict(batch, mode='test')
        fetched = sess.run(fetches, feed_dict=feed)
        batch_losses.append(fetched[:3])  # the three scalar losses
        doc_ppls.extend(fetched[3])       # one value per document
    loss_mean, topic_loss_mean, sent_loss_mean = np.mean(batch_losses, 0)
    ppl_mean = np.exp(np.mean(doc_ppls))
    return loss_mean, topic_loss_mean, sent_loss_mean, ppl_mean

def get_all_losses(sess, batches):
    """Print the batch-averaged total loss and its four components over ``batches``.

    Each batch is fed in 'test' mode. The printed components are the topic model
    NLL and KL and the language model NLL and KL.
    """
    losses = []
    for ct, batch in batches:
        feed_dict = get_feed_dict(batch, mode='test')
        loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch = \
        sess.run([loss, topic_loss_recon, topic_loss_kl, sent_loss_recon, sent_loss_kl], feed_dict = feed_dict)
        losses += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch]]
    # %-formatting with several specifiers needs a tuple; a bare ndarray is treated
    # as a single argument and raises TypeError
    print('LOSS %.2f | TM NLL: %.2f, KL: %.4f | LM NLL: %.2f, KL: %.4f' %  tuple(np.mean(losses, 0)))

In [23]:
def print_sample(sample_batch):
    """Print each reference sentence of ``sample_batch`` next to the decoder's greedy prediction.

    Uses the global session ``sess`` and the default (train-mode) feed dict.
    """
    predicted_idxs = sess.run(output_token_idxs, feed_dict=get_feed_dict(sample_batch))

    # flatten the references in the same document-then-sentence order as the feed dict
    reference_idxs = []
    for instance in sample_batch:
        reference_idxs.extend(instance.token_idxs)

    assert len(predicted_idxs) == len(reference_idxs)

    predicted_sents = idxs_to_sents(predicted_idxs, config, idx_to_word)
    reference_sents = idxs_to_sents(reference_idxs, config, idx_to_word)

    for reference_sent, predicted_sent in zip(reference_sents, predicted_sents):
        print('True: %s' % reference_sent)
        print('Pred: %s' % predicted_sent)

In [24]:
# Close the session left over from a previous run of this cell before opening a new one.
if 'sess' in globals(): sess.close()
sess = tf.Session()

sess.run(tf.global_variables_initializer())

# Training-state accumulators; re-running this cell resets them together with the model weights.
logs = []  # one tuple per logging step, re-printed in full at every log
losses_train = []  # per-batch training losses
ppls_train = []  # per-document training values of topic_ppls
loss_min = np.inf
beta_eval = 1.  # last value of beta read back from the session
epoch = 0

In [25]:
# Main training loop: one optimisation step per batch, with a dev evaluation and a
# sample print-out whenever the batch counter `ct` is a multiple of log_period.
time_start = time.time()
for epoch in range(config.epochs):
    for ct, batch in train_batches:
        feed_dict = get_feed_dict(batch)
        # KL annealing step.
        # NOTE(review): beta_eval is refreshed only inside the logging branch below, so
        # beta can keep growing past 1.0 until the next log — confirm this is acceptable.
        if config.warmup > 0 and beta_eval < 1.0: sess.run(update_beta)

        _, loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch, ppls_batch = \
        sess.run([opt, loss, topic_loss_recon, topic_loss_kl, sent_loss_recon, sent_loss_kl, topic_ppls], feed_dict = feed_dict)

        # these lists are never cleared in this loop, so the training metrics reported
        # below are running averages over everything accumulated so far
        losses_train += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, sent_loss_recon_batch, sent_loss_kl_batch]]
        ppls_train += list(ppls_batch)

        if ct%config.log_period==0:
            time_dev = time.time()
            loss_train, topic_loss_recon_train, topic_loss_kl_train, sent_loss_recon_train, sent_loss_kl_train = np.mean(losses_train, 0)
            ppl_train = np.exp(np.mean(ppls_train))
            loss_dev, topic_loss_dev, sent_loss_dev, ppl_dev = get_loss(sess, dev_batches)

            # read the current KL weight back from the session
            if config.warmup > 0: beta_eval = beta.eval(session=sess)

            # redraw the cell output: the full log history followed by a fresh sample
            clear_output()
            time_finish = time.time()
            time_log = int(time_finish - time_start)  # seconds since the previous log
            time_log_dev = int(time_finish - time_dev)  # seconds spent on the dev evaluation
            logs += [(time_log, time_log_dev, epoch, ct, loss_train, ppl_train, topic_loss_recon_train, topic_loss_kl_train, sent_loss_recon_train, sent_loss_kl_train, loss_dev, ppl_dev, topic_loss_dev, sent_loss_dev, beta_eval)]
            for log in logs:
                print('%03d[s], %02d[s], Ep: %02d, Ct: %05d|TR LOSS: %.0f, PPL: %.0f|TM NLL: %.0f, KL: %.2f | LM NLL: %.2f, KL: %.2f|DE LOSS: %.0f, PPL: %.0f, TM: %.0f, LM: %.2f|BETA: %.6f' %  log)

            # show predictions for the training batch that was just used
            print_sample(batch)

            time_start = time.time()

007[s], 05[s], Ep: 00, Ct: 00000|TR LOSS: 351, PPL: 2662|TM NLL: 341, KL: 0.65 | LM NLL: 9.99, KL: 0.68|DE LOSS: 346, PPL: 2658, TM: 336, LM: 9.99|BETA: 0.000000
100[s], 05[s], Ep: 00, Ct: 00500|TR LOSS: 326, PPL: 1560|TM NLL: 317, KL: 1.30 | LM NLL: 7.23, KL: 4.81|DE LOSS: 317, PPL: 1374, TM: 310, LM: 6.82|BETA: 0.034674
100[s], 05[s], Ep: 00, Ct: 01000|TR LOSS: 323, PPL: 1450|TM NLL: 314, KL: 1.66 | LM NLL: 7.00, KL: 3.19|DE LOSS: 314, PPL: 1294, TM: 307, LM: 6.75|BETA: 0.069347
090[s], 05[s], Ep: 01, Ct: 00000|TR LOSS: 321, PPL: 1387|TM NLL: 312, KL: 1.90 | LM NLL: 6.91, KL: 2.53|DE LOSS: 312, PPL: 1216, TM: 305, LM: 6.75|BETA: 0.100000
101[s], 05[s], Ep: 01, Ct: 00500|TR LOSS: 319, PPL: 1328|TM NLL: 310, KL: 2.15 | LM NLL: 6.85, KL: 2.13|DE LOSS: 310, PPL: 1147, TM: 303, LM: 6.71|BETA: 0.134675
101[s], 05[s], Ep: 01, Ct: 01000|TR LOSS: 318, PPL: 1289|TM NLL: 309, KL: 2.31 | LM NLL: 6.81, KL: 1.86|DE LOSS: 309, PPL: 1124, TM: 302, LM: 6.69|BETA: 0.169350
090[s], 05[s], Ep: 02, Ct: 0

KeyboardInterrupt: 

In [27]:
# visualize topic
topics_freq_bow_idxs = bow_idxs[sess.run(topics_freq_bow_indices)]
for topic, topic_freq_bow_idxs in enumerate(topics_freq_bow_idxs):
    print(topic, ':', ' '.join([idx_to_word[idx] for idx in topic_freq_bow_idxs]))

0 : republican senate democratic house election party campaign governor republicans gov
1 : court judge federal u.s. attorney lawsuit filed case district law
2 : million company http water project reports federal department u.s. plant
3 : vehicle car crash driver truck died driving killed hit struck
4 : department http people reports health office law officers week security
5 : percent million company cents share revenue billion rate shares average
6 : national saturday event museum died day years u.s. center president
7 : bill tax budget million health lawmakers house measure gov approved
8 : school university students board education program president college schools district
9 : fire firefighters area reported authorities home people sunday miles blaze
10 : found home officers authorities shot woman sheriff shooting arrested body
11 : charged guilty prosecutors charges court years prison pleaded attorney murder
12 : service weather power national storm snow tuesday expected rain cus

# confirm variables

In [110]:
# NOTE(review): `logvars`, `means`, `kl_losses` and `latents` are not defined anywhere in
# the visible notebook (only suffixed variants such as `logvars_bow` exist); this looks
# like a stale cell from an earlier model version and would fail on a fresh kernel.
_logvars, _means, _kl_losses, _latents, _output_logits = sess.run([logvars, means, kl_losses, latents, output_logits], feed_dict=feed_dict)


In [111]:
# Shapes of the values fetched by the previous cell.
_logvars.shape, _means.shape, _kl_losses.shape, _latents.shape

((32, 32), (32, 32), (32,), (32, 32))

In [112]:
# Display the fetched logits (the recorded output below is entirely NaN).
_output_logits

array([[[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       ...,

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]], dtype=float32)

In [109]:
# NOTE(review): `dec_target_idxs_do`, `dec_mask_tokens_do`, `recon_loss` and `kl_losses`
# are not defined in the visible notebook (the recorded output is a NameError).
# Fetching `opt` here would also perform a training step as a side effect.
_output_logits, _dec_target_idxs_do, _dec_mask_tokens_do, _recon_loss, _kl_losses, _ = sess.run([output_logits, dec_target_idxs_do, dec_mask_tokens_do, recon_loss, kl_losses, opt], feed_dict=feed_dict)


NameError: name 'dec_target_idxs_do' is not defined

In [44]:
# Shape of the maximum logit per token position; note that each run of this cell
# adds a new reduce_max op to the default graph.
tf.reduce_max(output_logits, 2).eval(session=sess, feed_dict=feed_dict).shape

(120, 46)

In [31]:
# Shapes of the fetched logits, targets and token mask.
_output_logits.shape, _dec_target_idxs_do.shape, _dec_mask_tokens_do.shape

((120, 46, 20000), (120, 46), (120, 46))

In [32]:
# Softmax over the vocabulary axis, computed by hand to cross-check the TF loss.
# The per-position maximum is subtracted first so np.exp cannot overflow on large
# logits; the softmax value itself is unchanged by this shift.
# (despite its name, `_logits` holds probabilities)
_shifted_logits = _output_logits - np.max(_output_logits, axis=2, keepdims=True)
_exp_logits = np.exp(_shifted_logits)
_logits = _exp_logits / np.sum(_exp_logits, axis=2, keepdims=True)

In [33]:
# Target token ids used to pick the probability of the correct word below.
_idxs = _dec_target_idxs_do

In [35]:
# Per-token negative log-probability of the target word, zeroed at masked (padding) positions.
_losses = np.array([[-np.log(_logits[i, j, _idxs[i, j]]) for j in range(_idxs.shape[1])] for i in range(_idxs.shape[0])]) * _dec_mask_tokens_do

In [36]:
# Mean NLL over unmasked tokens; should agree with the TF value shown in the next cell.
np.sum(_losses)/np.sum(_dec_mask_tokens_do)

9.903732

In [37]:
# Reconstruction loss as computed by TensorFlow, for comparison with the manual value above.
_recon_loss

9.903732

In [38]:
# One KL value per sentence in the fetched batch.
_kl_losses.shape

(120,)