In [1]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
from six.moves import zip_longest

import numpy as np
import tensorflow as tf
from tensorflow import distributions as tfd
from tensorflow.python import debug as tf_debug
import _pickle as cPickle
import random

from data_structure import load_data
from components import tf_log, encode_latents, sample_latents, compute_kl_loss, dynamic_rnn, dynamic_bi_rnn, DiagonalGaussian

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
PAD = '<pad>' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNK = '<unk>' # This has a vocab id, which is used to represent out-of-vocabulary words
BOS = '<p>' # This has a vocab id, which is used at the beginning of every decoder input sequence
EOS = '</p>' # This has a vocab id, which is used at the end of untruncated target sequences

# load data & set config

In [3]:
def del_all_flags(FLAGS):
    flags_dict = FLAGS._flags()    
    keys_list = [keys for keys in flags_dict]    
    for keys in keys_list:
        FLAGS.__delattr__(keys)

del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

flags.DEFINE_string('gpu', '3', 'visible gpu')

flags.DEFINE_string('mode', 'train', 'set train or eval')

flags.DEFINE_string('data_path', 'data/apnews/instances.pkl', 'path of data')
flags.DEFINE_string('modeldir', 'model', 'directory of model')
flags.DEFINE_string('modelname', 'sports', 'name of model')

flags.DEFINE_integer('epochs', 1000, 'epochs')
flags.DEFINE_integer('batch_size', 32, 'batch size')
flags.DEFINE_integer('log_period', 5000, 'valid period')

flags.DEFINE_string('opt', 'Adagrad', 'optimizer')
flags.DEFINE_float('lr', 0.1, 'lr')
flags.DEFINE_float('reg', 0.1, 'regularization term')
flags.DEFINE_float('beta', 0.1, 'initial value of beta')
flags.DEFINE_float('grad_clip', 5., 'grad_clip')

flags.DEFINE_float('keep_prob', 0.6, 'dropout rate')
flags.DEFINE_float('word_keep_prob', 0.75, 'word dropout rate')

flags.DEFINE_integer('warmup', 0, 'warmup period for KL')
flags.DEFINE_integer('warmup_topic', 0, 'warmup period for KL of topic')

flags.DEFINE_integer('beam_width', 10, 'beam_width')
flags.DEFINE_float('length_penalty_weight', 0.0, 'length_penalty_weight')

flags.DEFINE_integer('n_topic', 50, 'number of topic')
flags.DEFINE_integer('dim_hidden_bow', 256, 'dim of hidden bow')
flags.DEFINE_integer('dim_latent_topic', 32, 'dim of latent topic')
flags.DEFINE_integer('dim_emb', 256, 'dim_emb')
flags.DEFINE_integer('dim_hidden', 512, 'dim_hidden')
flags.DEFINE_integer('dim_latent', 32, 'dim_latent')


# for evaluation
flags.DEFINE_string('refdir', 'ref', 'refdir')
flags.DEFINE_string('outdir', 'out', 'outdir')

flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu

In [5]:
num_train_batches, train_batches, dev_batches, test_batches, word_to_idx, idx_to_word, bow_idxs = load_data(config)

Number of train examples: 39553


In [6]:
flags.DEFINE_integer('PAD_IDX', word_to_idx[PAD], 'PAD_IDX')
flags.DEFINE_integer('UNK_IDX', word_to_idx[UNK], 'UNK_IDX')
flags.DEFINE_integer('BOS_IDX', word_to_idx[BOS], 'BOS_IDX')
flags.DEFINE_integer('EOS_IDX', word_to_idx[EOS], 'EOS_IDX')

flags.DEFINE_integer('n_vocab', len(word_to_idx), 'n_vocab')
flags.DEFINE_integer('dim_bow', len(bow_idxs), 'dim_bow')

maximum_iterations = max([max([instance.max_sent_l for instance in batch]) for ct, batch in dev_batches])
flags.DEFINE_integer('maximum_iterations', maximum_iterations, 'maximum_iterations')    

# build language model 

In [7]:
def debug_shape(variables, sess_init=None):
    if sess_init is None:
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
    else:
        sess = sess_init
        
    sample_batch = test_batches[0][1]
    feed_dict = get_feed_dict(sample_batch)
    _variables = sess.run(variables, feed_dict=feed_dict)
    for _variable, variable in zip(_variables, variables):
        if hasattr(variable, 'name'):
            print(variable.name, ':', _variable.shape)
        else:
            print(_variable.shape)
            
    if sess_init is None: sess.close()

def debug_value(variables, return_value=False, sess_init=None):
    if sess_init is None:
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
    else:
        sess = sess_init

    sample_batch = test_batches[0][1]
    feed_dict = get_feed_dict(sample_batch)
    _variables = sess.run(variables, feed_dict=feed_dict)

    if return_value: 
        return _variables
    else:
        for _variable, variable in zip(_variables, variables):
            if hasattr(variable, 'name'):
                print(variable.name, ':', _variable)
            else:
                print(_variable)
                
    if sess_init is None: sess.close()

## define model

In [11]:
class TopicModel():
    def __init__(self, config):
        self.config = config
        
        t_variables = {}
        t_variables['bow'] = tf.placeholder(tf.float32, [None, self.config.dim_bow])
        t_variables['keep_prob'] = tf.placeholder(tf.float32)
        self.t_variables = t_variables
        
    def build(self):
        # encode bow
        with tf.variable_scope('topic/enc', reuse=False):
            hidden_bow_ = tf.keras.layers.Dense(units=self.config.dim_hidden_bow, activation=tf.nn.relu, name='hidden')(self.t_variables['bow'])
            hidden_bow = tf.keras.layers.Dropout(self.t_variables['keep_prob'])(hidden_bow_)
            means_bow = tf.keras.layers.Dense(units=self.config.dim_latent_topic)(hidden_bow)
            logvars_bow = tf.keras.layers.Dense(units=self.config.dim_latent_topic, kernel_initializer=tf.constant_initializer(0), bias_initializer=tf.constant_initializer(0))(hidden_bow)
            latents_bow = sample_latents(means_bow, logvars_bow) # sample latent vectors

            prob_topic = tf.layers.Dense(units=self.config.n_topic, activation=tf.nn.softmax, name='prob')(latents_bow) # inference of topic probabilities

        # decode bow
        with tf.variable_scope('shared', reuse=False):
            embeddings = tf.get_variable('emb', [self.config.n_vocab, self.config.dim_emb], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # embeddings of vocab

        bow_embeddings = tf.nn.embedding_lookup(embeddings, bow_idxs) # embeddings of each bow features

        with tf.variable_scope('topic/dec', reuse=False):
            topic_embeddings = tf.get_variable('topic_emb', [self.config.n_topic, self.config.dim_emb], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) # embeddings of topics

            topic_bow = tf.nn.softmax(tf.matmul(topic_embeddings, bow_embeddings, transpose_b=True), 1) # bow vectors for each topic
            self.topic_bow = topic_bow
            prob_bow = tf_log(tf.matmul(prob_topic, topic_bow)) # predicted bow distribution

        # define lisses
        topic_losses_recon = -tf.reduce_sum(tf.multiply(self.t_variables['bow'], prob_bow), 1)
        topic_loss_recon = tf.reduce_mean(topic_losses_recon) # negative log likelihood of each words
        self.topic_losses_recon, self.topic_loss_recon = topic_losses_recon, topic_loss_recon

        topic_loss_kl = compute_kl_loss(means_bow, logvars_bow) # KL divergence b/w latent dist & gaussian std
        self.topic_loss_kl = topic_loss_kl

        topic_embeddings_norm = topic_embeddings / tf.norm(topic_embeddings, axis=1, keepdims=True)
        topic_angles = tf.matmul(topic_embeddings_norm, tf.transpose(topic_embeddings_norm))
        topic_angles_mean = tf.reduce_mean(topic_angles, keepdims=True)
        topic_angles_vars = tf.reduce_mean(tf.square(topic_angles - topic_angles_mean))
        topic_loss_reg = topic_angles_vars - tf.squeeze(topic_angles_mean)
        self.topic_loss_reg = topic_loss_reg

    def build_opt(self):        
        if self.config.warmup_topic > 0:
            beta = tf.Variable(self.config.beta, name='beta', trainable=False)
            update_beta = tf.assign_add(beta, 1./(self.config.warmup_topic*num_train_batches))
            self.beta, self.update_beta = beta, update_beta
            loss = self.topic_loss_recon + self.beta * self.topic_loss_kl + self.config.reg * self.topic_loss_reg
        else:
            loss = self.topic_loss_recon + self.topic_loss_kl + self.config.reg * self.topic_loss_reg
        self.loss = loss
        
        # define optimizer
        if self.config.opt == 'Adam':
            optimizer = tf.train.AdamOptimizer(self.config.lr)
        elif self.config.opt == 'Adagrad':
            optimizer = tf.train.AdagradOptimizer(self.config.lr)

        grad_vars = optimizer.compute_gradients(loss)
        clipped_grad_vars = [(tf.clip_by_value(grad, -self.config.grad_clip, self.config.grad_clip), var) for grad, var in grad_vars]
        opt = optimizer.apply_gradients(clipped_grad_vars)
        self.opt = opt

        n_bow = tf.reduce_sum(self.t_variables['bow'], 1)
        ppls = tf.divide(self.topic_losses_recon, n_bow)
        self.ppls = ppls
        
        topics_freq_bow_indices = tf.nn.top_k(self.topic_bow, 10, name='topic_freq_bow').indices
        self.topics_freq_bow_indices = topics_freq_bow_indices
        
    def get_feed_dict(self, batch, mode='train'):
        bow = np.array([instance.bow for instance in batch]).astype(np.float32)
        keep_prob = self.config.keep_prob if mode == 'train' else 1.0
        feed_dict = {
                    self.t_variables['bow']: bow, 
                    self.t_variables['keep_prob']: keep_prob
        }
        return  feed_dict

# run model 

In [9]:
def get_loss(sess, batches, model):
    losses = []
    ppl_list = []
    for ct, batch in batches:
        feed_dict = model.get_feed_dict(batch, mode='test')
        loss_batch, ppls_batch = sess.run([model.loss, model.ppls], feed_dict = feed_dict)
        losses += [loss_batch]
        ppl_list += list(ppls_batch)
    loss_mean = np.mean(losses)
    ppl_mean = np.exp(np.mean(ppl_list))
    return loss_mean, ppl_mean

In [12]:
tf.reset_default_graph()
model = TopicModel(config)
model.build()
model.build_opt()

if 'sess' in globals(): sess.close()
sess = tf.Session()
sess.run(tf.global_variables_initializer())

logs = []
losses_train = []
ppls_train = []
loss_min = np.inf
beta_eval = 0.
epoch = 0

In [None]:
for ct, batch in train_batches:
    feed_dict = model.get_feed_dict(batch)
    if config.warmup_topic > 0 and beta_eval < 1.: 
        _, _, beta_eval, ppls_batch, loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_reg_batch = \
        sess.run([model.opt, model.update_beta, model.beta, model.ppls, model.loss, model.topic_loss_recon, model.topic_loss_kl, model.topic_loss_reg], feed_dict = feed_dict)
    else:
        _, ppls_batch, loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_reg_batch = \
        sess.run([model.opt, model.ppls, model.loss, model.topic_loss_recon, model.topic_loss_kl, model.topic_loss_reg], feed_dict = feed_dict)
    
    losses_train += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_reg_batch]]
    ppls_train += list(ppls_batch)

    if ct%config.log_period==0:
        loss_train, topic_loss_recon_train, topic_loss_kl_train, topic_loss_reg_train = np.mean(losses_train, 0)
        ppl_train = np.exp(np.mean(ppls_train))
        loss_dev, ppl_dev = get_loss(sess, dev_batches, model)

        clear_output()

        logs += [(epoch, ct, loss_train, loss_dev, ppl_train, ppl_dev, topic_loss_recon_train, topic_loss_kl_train, topic_loss_reg_train, beta_eval)]
        for log in logs:
            print('Epoch: %i, Step: %i | LOSS TRAIN: %.2f VALID: %.2f | PPL TRAIN: %.2f VALID: %.2f | TM NLL: %.2f, KL: %.2f, REG: %.2f| BETA: %.6f' %  log)
        
        # visualize topic
        topics_freq_bow_idxs = bow_idxs[sess.run(model.topics_freq_bow_indices)]
        for topic, topic_freq_bow_idxs in enumerate(topics_freq_bow_idxs):
            print(topic, ':', ' '.join([idx_to_word[idx] for idx in topic_freq_bow_idxs]))

Epoch: 0, Step: 0 | LOSS TRAIN: 390.80 VALID: 832.72 | PPL TRAIN: 4022.17 VALID: 4020.95 | TM NLL: 390.34, KL: 0.46, REG: 0.00| BETA: 0.000000
Epoch: 0, Step: 5000 | LOSS TRAIN: 438.68 VALID: 751.33 | PPL TRAIN: 1662.27 VALID: 1488.77 | TM NLL: 435.05, KL: 3.68, REG: -0.54| BETA: 0.000000
Epoch: 0, Step: 10000 | LOSS TRAIN: 432.97 VALID: 745.31 | PPL TRAIN: 1483.38 VALID: 1369.56 | TM NLL: 428.57, KL: 4.44, REG: -0.45| BETA: 0.000000
Epoch: 0, Step: 15000 | LOSS TRAIN: 430.04 VALID: 743.03 | PPL TRAIN: 1399.07 VALID: 1317.57 | TM NLL: 425.23, KL: 4.85, REG: -0.39| BETA: 0.000000
Epoch: 0, Step: 20000 | LOSS TRAIN: 428.11 VALID: 741.80 | PPL TRAIN: 1345.32 VALID: 1289.90 | TM NLL: 423.01, KL: 5.14, REG: -0.35| BETA: 0.000000
Epoch: 0, Step: 25000 | LOSS TRAIN: 426.68 VALID: 741.22 | PPL TRAIN: 1306.97 VALID: 1269.23 | TM NLL: 421.35, KL: 5.35, REG: -0.31| BETA: 0.000000
Epoch: 0, Step: 30000 | LOSS TRAIN: 425.61 VALID: 740.12 | PPL TRAIN: 1278.00 VALID: 1255.05 | TM NLL: 420.11, KL: 5.5

# confirm variables

In [None]:
debug_value([topic_losses_recon, n_bow])

In [None]:
debug_value([tf.exp(-tf.divide(topic_losses_recon, n_bow))])

### test

In [12]:
debug_shape([bow, hidden_bow, latents_bow, prob_topic, bow_embeddings, topic_embeddings, topic_bow, prob_bow])

Placeholder:0 : (32, 4022)
topic/enc/dropout/cond/Merge:0 : (32, 256)
topic/enc/add:0 : (32, 32)
topic/enc/prob/Softmax:0 : (32, 50)
embedding_lookup:0 : (4022, 256)
topic/dec/topic_emb:0 : (50, 256)
topic/dec/Softmax:0 : (50, 4022)
topic/dec/Log:0 : (32, 4022)


In [27]:
debug_shape([topic_losses_recon, topic_loss_recon, n_bow, ppls, topic_embeddings_norm, tf.expand_dims(topic_angles_mean, -1), topic_angles_vars])

Sum:0 : (32,)
Neg:0 : ()
Sum_2:0 : (32,)
Neg_1:0 : (32,)
truediv_1:0 : (50, 256)
ExpandDims_1:0 : (1,)
Mean_3:0 : ()


In [14]:
debug_value([tf.reduce_sum(tf.square(topic_embeddings_norm), 1)], return_value=True)[0]

array([1.        , 0.99999994, 0.9999999 , 1.        , 1.        ,
       1.0000001 , 0.9999999 , 1.0000001 , 1.0000001 , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.9999999 , 0.9999999 , 0.99999994, 1.        ,
       1.        , 0.9999999 , 1.0000001 , 1.        , 1.        ,
       1.        , 0.99999994, 1.        , 0.99999994, 0.99999994,
       1.0000001 , 0.9999999 , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.9999999 , 1.        ,
       0.9999999 , 1.        , 1.        , 1.        , 0.99999994,
       1.0000001 , 1.        , 1.        , 0.99999994, 1.        ],
      dtype=float32)

In [15]:
debug_value([tf.reduce_sum(prob_topic, -1), tf.reduce_sum(topic_bow, -1), tf.reduce_sum(tf.exp(prob_bow), 1)])

Sum_4:0 : [1.0000001  1.         1.0000001  0.99999994 1.         1.
 1.         0.99999994 0.99999994 1.         1.         1.
 0.99999994 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         0.9999999
 1.         0.99999994 1.         1.0000001  0.99999994 1.
 1.         1.        ]
Sum_5:0 : [1.         0.99999994 1.         0.99999994 0.99999994 1.
 1.         0.9999998  1.         1.         1.         1.
 1.         1.         1.         1.0000001  0.99999994 1.
 1.         0.99999994 0.9999999  0.99999994 1.0000001  1.
 1.         1.         1.         0.99999994 0.9999999  1.
 1.         1.         0.99999994 1.         0.99999994 1.
 1.         0.9999999  1.         1.         1.         1.
 0.99999994 1.         0.99999994 0.99999994 0.99999994 0.99999994
 1.         1.        ]
Sum_6:0 : [1.         1.         1.0000001  0.99999994 1.         1.
 0.9999999  0.99999994 0.9999999  1.         1.0000001  1.
 0.9999999  1.        

In [16]:
sigma_bow = tf.exp(0.5 * logvars_bow)
dist_bow = tfd.Normal(means_bow, sigma_bow)
dist_std = tfd.Normal(0., 1.)
topic_loss_kl_tmp = tf.reduce_mean(tf.reduce_sum(tfd.kl_divergence(dist_bow, dist_std), 1))

In [17]:
debug_value([topic_loss_recon, topic_loss_kl, topic_loss_kl_tmp])

Neg:0 : 405.38312
Mean_1:0 : 0.32056683
Mean_4:0 : 0.32056683


In [110]:
_logvars, _means, _kl_losses, _latents, _output_logits = sess.run([logvars, means, kl_losses, latents, output_logits], feed_dict=feed_dict)


In [111]:
_logvars.shape, _means.shape, _kl_losses.shape, _latents.shape

((32, 32), (32, 32), (32,), (32, 32))

In [112]:
_output_logits

array([[[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       ...,

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]], dtype=float32)

In [109]:
_output_logits, _dec_target_idxs_do, _dec_mask_tokens_do, _recon_loss, _kl_losses, _ = sess.run([output_logits, dec_target_idxs_do, dec_mask_tokens_do, recon_loss, kl_losses, opt], feed_dict=feed_dict)


NameError: name 'dec_target_idxs_do' is not defined

In [44]:
tf.reduce_max(output_logits, 2).eval(session=sess, feed_dict=feed_dict).shape

(120, 46)

In [31]:
_output_logits.shape, _dec_target_idxs_do.shape, _dec_mask_tokens_do.shape

((120, 46, 20000), (120, 46), (120, 46))

In [32]:
_logits = np.exp(_output_logits) / np.sum(np.exp(_output_logits), 2)[:, :, None]

In [33]:
_idxs = _dec_target_idxs_do

In [35]:
_losses = np.array([[-np.log(_logits[i, j, _idxs[i, j]]) for j in range(_idxs.shape[1])] for i in range(_idxs.shape[0])]) * _dec_mask_tokens_do

In [36]:
np.sum(_losses)/np.sum(_dec_mask_tokens_do)

9.903732

In [37]:
_recon_loss

9.903732

In [38]:
_kl_losses.shape

(120,)