In [1]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
from six.moves import zip_longest

import numpy as np
import tensorflow as tf
from tensorflow.python import debug as tf_debug
import _pickle as cPickle
import random

from data_structure import load_data
from components import dynamic_rnn, dynamic_bi_rnn, DiagonalGaussian

In [2]:
PAD = '<pad>' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNK = '<unk>' # This has a vocab id, which is used to represent out-of-vocabulary words
BOS = '<p>' # This has a vocab id, which is used at the beginning of every decoder input sequence
EOS = '</p>' # This has a vocab id, which is used at the end of untruncated target sequences

# load data & set config

In [37]:
def del_all_flags(FLAGS):
    flags_dict = FLAGS._flags()    
    keys_list = [keys for keys in flags_dict]    
    for keys in keys_list:
        FLAGS.__delattr__(keys)

del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

flags.DEFINE_string('gpu', '1', 'visible gpu')

flags.DEFINE_string('mode', 'train', 'set train or eval')

flags.DEFINE_string('datadir', 'data', 'directory of input data')
flags.DEFINE_string('dataname', 'sports_sents.pkl', 'name of data')
flags.DEFINE_string('modeldir', 'NAS/model', 'directory of model')
flags.DEFINE_string('modelname', 'sports', 'name of model')

flags.DEFINE_integer('epochs', 10, 'epochs')
flags.DEFINE_integer('batch_size', 32, 'batch size')
flags.DEFINE_integer('log_period', 1000, 'valid period')

flags.DEFINE_string('opt', 'Adagrad', 'optimizer')
flags.DEFINE_float('lr', 0.1, 'lr')
flags.DEFINE_float('grad_clip', 5., 'grad_clip')

flags.DEFINE_float('keep_prob', 1.0, 'dropout rate')
flags.DEFINE_float('word_keep_prob', 0.75, 'word dropout rate')

flags.DEFINE_integer('warmup', 10, 'warmup period for KL')

flags.DEFINE_integer('beam_width', 10, 'beam_width')
flags.DEFINE_float('length_penalty_weight', 0.0, 'length_penalty_weight')

flags.DEFINE_integer('dim_emb', 256, 'dim_latent')
flags.DEFINE_integer('dim_hidden', 256, 'dim_output')
flags.DEFINE_integer('dim_latent', 16, 'dim_latent')

# for evaluation
flags.DEFINE_string('refdir', 'ref', 'refdir')
flags.DEFINE_string('outdir', 'out', 'outdir')

flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

In [38]:
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu

In [39]:
data_path = os.path.join(config.datadir, config.dataname)
data_train, data_dev, data_test, word_to_idx, idx_to_word = cPickle.load(open(data_path,'rb'))

In [40]:
def grouper(iterable, n, fillvalue=None, shorten=True, num_groups=None, shuffle=False):
    args = [iter(iterable)] * n
    out = zip_longest(*args, fillvalue=fillvalue)
    out = list(out)
    if shuffle: random.shuffle(out)
    if num_groups is not None:
        default = (fillvalue,) * n
        assert isinstance(num_groups, int)
        out = list(each for each, _ in zip_longest(out, range(num_groups), fillvalue=default))
    if shorten:
        assert fillvalue is None
        out = (tuple(e for e in each if e is not None) for each in out)
    return out

In [41]:
train_batches = grouper(data_train, config.batch_size, shuffle=True)
dev_batches = list(grouper(data_dev, config.batch_size))
test_batches = list(grouper(data_test, config.batch_size))
num_train_batches = len(list(grouper(data_train, config.batch_size, shuffle=True)))

In [42]:
flags.DEFINE_integer('PAD_IDX', word_to_idx[PAD], 'PAD_IDX')
flags.DEFINE_integer('UNK_IDX', word_to_idx[UNK], 'UNK_IDX')
flags.DEFINE_integer('BOS_IDX', word_to_idx[BOS], 'BOS_IDX')
flags.DEFINE_integer('EOS_IDX', word_to_idx[EOS], 'EOS_IDX')

flags.DEFINE_integer('n_vocab', len(word_to_idx), 'n_vocab')

maximum_iterations = max([max([len(sent_idx) for sent_idx in batch]) for batch in dev_batches])
flags.DEFINE_integer('maximum_iterations', maximum_iterations, 'maximum_iterations')    

# build model 

## feed dict

In [43]:
def get_feed_dict(batch, mode='train'):
    batch_size = len(batch)

    sent_l = [len(sent_idxs) for sent_idxs in batch]
    dec_sent_l = [len(sent_idxs)+1 for sent_idxs in batch]
    max_sent_l = max(sent_l)

    token_idxs_matrix = np.zeros([batch_size, max_sent_l], np.int32)
    dec_input_idxs_matrix = np.zeros([batch_size, max_sent_l+1], np.int32)
    dec_target_idxs_matrix = np.zeros([batch_size, max_sent_l+1], np.int32)
    
    for i, sent_idxs in enumerate(batch):
        token_idxs_matrix[i, :len(sent_idxs)] = np.asarray(sent_idxs)

        sent_idxs_dropout = np.asarray(sent_idxs)
        sent_idxs_dropout[np.random.rand(len(sent_idxs)) > config.word_keep_prob] = config.UNK_IDX
        dec_input_idxs_matrix[i, :len(sent_idxs)+1] = np.concatenate([[config.BOS_IDX], sent_idxs_dropout])

        dec_target_idxs_matrix[i, :len(sent_idxs)+1] = np.asarray(sent_idxs + [config.EOS_IDX])

    keep_prob = config.keep_prob if mode == 'train' else 1.0

    feed_dict = {
                t_variables['token_idxs']: token_idxs_matrix,
                t_variables['dec_input_idxs']: dec_input_idxs_matrix, t_variables['dec_target_idxs']: dec_target_idxs_matrix, 
                t_variables['batch_l']: batch_size, t_variables['sent_l']: sent_l, t_variables['dec_sent_l']: dec_sent_l,
                t_variables['keep_prob']: keep_prob}
    return  feed_dict

In [44]:
def debug_shape(variables):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        sample_batch = test_batches[0]
        feed_dict = get_feed_dict(sample_batch)
        _variables = sess.run(variables, feed_dict=feed_dict)
        for _variable, variable in zip(_variables, variables):
            print(variable.name, ':', _variable.shape)
            
        sess.close()

## fed variables

In [45]:
tf.reset_default_graph()

t_variables = {}
t_variables['keep_prob'] = tf.placeholder(tf.float32)
t_variables['batch_l'] = tf.placeholder(tf.int32, [])
t_variables['token_idxs'] = tf.placeholder(tf.int32, [None, None])
t_variables['dec_input_idxs'] = tf.placeholder(tf.int32, [None, None])
t_variables['dec_target_idxs'] = tf.placeholder(tf.int32, [None, None])
t_variables['sent_l'] = tf.placeholder(tf.int32, [None])
t_variables['dec_sent_l'] = tf.placeholder(tf.int32, [None])

## trained variables

In [46]:
dtype = tf.float32

dim_hidden = config.dim_hidden
dim_latent = config.dim_latent

# with tf.variable_scope('emb', reuse=tf.AUTO_REUSE):
#     embeddings = tf.get_variable('emb', [config.n_vocab, config.dim_emb], dtype=dtype, initializer=tf.random_uniform_initializer(-1., 1.))
    
# with tf.variable_scope('enc', reuse=tf.AUTO_REUSE):
#     w_enc = tf.get_variable('w', [dim_hidden, 2 * dim_latent], dtype=dtype, initializer=tf.random_uniform_initializer(-1., 1.))
#     b_enc = tf.get_variable('b', [2 * dim_latent], dtype=dtype, initializer=tf.constant_initializer())
    
# with tf.variable_scope('dec', reuse=tf.AUTO_REUSE):
#     w_dec = tf.get_variable('w', [dim_latent, dim_hidden], dtype=dtype, initializer=tf.random_uniform_initializer(-1., 1.))
#     b_dec = tf.get_variable('b', [dim_hidden], dtype=dtype, initializer=tf.constant_initializer())

if config.warmup > 0:
    beta = tf.Variable(0.1, name='beta', trainable=False)    
    
with tf.variable_scope('emb'):
    embeddings = tf.get_variable('emb', [config.n_vocab, config.dim_emb], dtype=dtype, initializer=tf.contrib.layers.xavier_initializer())
    
with tf.variable_scope('enc'):
    w_enc = tf.get_variable('w_enc', [dim_hidden, 2 * dim_latent], dtype=dtype)
    b_enc = tf.get_variable('b_enc', [2 * dim_latent], dtype=dtype)
    
with tf.variable_scope('dec'):
    w_dec = tf.get_variable('w_dec', [dim_latent, dim_hidden], dtype=dtype)
    b_dec = tf.get_variable('b_dec', [dim_hidden], dtype=dtype)



## encode sentences

In [47]:
# input
batch_l = t_variables['batch_l']
sent_l = t_variables['sent_l']
max_sent_l = tf.reduce_max(sent_l)
token_idxs = t_variables['token_idxs']

# get sentence embedding
enc_input = tf.nn.embedding_lookup(embeddings, token_idxs)
_, enc_state = dynamic_rnn(enc_input, sent_l, dim_hidden, t_variables['keep_prob'], cell_name='Model/sent')

# encode to parameter 
means_logvars = tf.nn.relu(tf.matmul(enc_state, w_enc) + b_enc)
means, logvars = tf.split(means_logvars, 2, 1)

# reparameterize
noises = tf.random_normal(tf.shape(means))
latents = means + tf.exp(0.5 * logvars) * noises

In [48]:
debug_shape([token_idxs, enc_input, enc_state, means, logvars, latents])

Placeholder_2:0 : (32, 3)
embedding_lookup:0 : (32, 3, 256)
Model/sent/rnn/while/Exit_3:0 : (32, 256)
split:0 : (32, 16)
split:1 : (32, 16)
add_1:0 : (32, 16)


## decode sentences

In [49]:
# prepare for decoding
dec_input_idxs = t_variables['dec_input_idxs']
dec_input = tf.nn.embedding_lookup(embeddings, dec_input_idxs)

dec_latent_input = tf.tile(tf.expand_dims(latents, 1), [1, tf.shape(dec_input_idxs)[1], 1])
dec_concat_input = tf.concat([dec_input, dec_latent_input], 2)

# decode for training
dec_sent_l = t_variables['dec_sent_l']

with tf.variable_scope('Model/sent/dec', initializer=tf.contrib.layers.xavier_initializer(), dtype = tf.float32, reuse=tf.AUTO_REUSE):
    dec_cell = tf.contrib.rnn.GRUCell(dim_hidden)
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob = t_variables['keep_prob'])

    dec_initial_state = tf.nn.relu(tf.matmul(latents, w_dec) + b_dec)
    
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_concat_input, sequence_length=dec_sent_l)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=dec_cell,
        helper=helper,
        initial_state=dec_initial_state)

    dec_outputs, _, output_sent_l = tf.contrib.seq2seq.dynamic_decode(train_decoder)
    
    output_layer = tf.layers.Dense(config.n_vocab, use_bias=False, name="output_projection")
    output_logits = output_layer(dec_outputs.rnn_output)
    
    output_token_idxs = tf.argmax(output_logits, 2)

In [50]:
debug_shape([dec_concat_input, output_logits, output_token_idxs])

concat:0 : (32, 4, 272)
Model/sent/dec/output_projection/Tensordot:0 : (32, 4, 20000)
Model/sent/dec/ArgMax:0 : (32, 4)


## define cost & optimizer

In [51]:
# target and mask
dec_target_idxs = t_variables['dec_target_idxs']
max_dec_sent_l = tf.reduce_max(dec_sent_l)
dec_mask_tokens = tf.sequence_mask(dec_sent_l, maxlen=max_dec_sent_l, dtype=tf.float32)

recon_loss = tf.contrib.seq2seq.sequence_loss(output_logits, dec_target_idxs, dec_mask_tokens) # nll for each token (averaged over batch & sentence)

# define loss
kl_losses = tf.reduce_sum(-0.5 * (logvars - tf.square(means) - tf.exp(logvars) + 1.0), 1) # sum over latent dimentsion    
kl_loss = tf.reduce_mean(kl_losses, [0]) #mean of kl_losses over batches

loss = recon_loss + beta * kl_loss

# define optimizer
if (config.opt == 'Adam'):
    optimizer = tf.train.AdamOptimizer(config.lr)
elif (config.opt == 'Adagrad'):
    optimizer = tf.train.AdagradOptimizer(config.lr)
    
grad_vars = optimizer.compute_gradients(loss)
clipped_grad_vars = [(tf.clip_by_value(grad, -config.grad_clip, config.grad_clip), var) for grad, var in grad_vars]

opt = optimizer.apply_gradients(clipped_grad_vars)

In [52]:
debug_shape([kl_losses])

Sum:0 : (32,)


# run model 

In [53]:
def idxs_to_sent(line_idxs, config, idx_to_word):
    tokens = []
    for idx in line_idxs:
        if idx == config.EOS_IDX: break
        tokens.append(idx_to_word[idx])
    sent = ' '.join(tokens)
    return sent

In [54]:
def get_loss(sess, batches):
    losses = []
    for batch in batches:
        feed_dict = get_feed_dict(batch, mode='test')
        loss_batch = sess.run(loss, feed_dict = feed_dict)
        losses += [loss_batch]        
    loss_mean = np.mean(losses)
    return loss_mean

In [55]:
def print_sample(sample_batch):
    feed_dict = get_feed_dict(sample_batch)
    pred_token_idxs = sess.run(output_token_idxs, feed_dict = feed_dict)
    true_token_idxs = sample_batch
    
    assert len(pred_token_idxs) == len(true_token_idxs)
    
    for true_sent_idxs, pred_sent_idxs in zip(true_token_idxs, pred_token_idxs):
        true_sent = idxs_to_sent(true_sent_idxs, config, idx_to_word)
        pred_sent = idxs_to_sent(pred_sent_idxs, config, idx_to_word)

        print('True: %s' % true_sent)
        print('Pred: %s' % pred_sent)

In [56]:
if 'sess' in globals(): sess.close()
sess = tf.Session()

sess.run(tf.global_variables_initializer())

logs = []
losses_train = []
loss_min = np.inf
beta_eval = 0.01

In [61]:
for epoch in range(config.epochs):
    train_batches = grouper(data_train, config.batch_size, shuffle=True)
    for ct, batch in enumerate(train_batches):
        feed_dict = get_feed_dict(batch)
        if config.warmup > 0: sess.run(beta.assign(np.minimum(1., beta_eval+ 1./(config.warmup*num_train_batches))))

        _, loss_batch, kl_loss_batch, recon_loss_batch = sess.run([opt, loss, kl_loss, recon_loss], feed_dict = feed_dict)
        losses_train += [[loss_batch, kl_loss_batch, recon_loss_batch]]

        if ct%config.log_period==0:
            loss_train, kl_loss_train, recon_loss_train = np.mean(losses_train, 0)
            ppl_train = np.exp(recon_loss_train)
            if config.warmup > 0: beta_eval = beta.eval(session=sess)
            loss_dev = get_loss(sess, dev_batches)

            if loss_dev <= loss_min:
                loss_min = loss_dev
                loss_test = get_loss(sess, test_batches)

            clear_output()

            logs += [(ct, loss_train, loss_dev, loss_test, ppl_train, kl_loss_train, beta_eval)]
            for log in logs:
                print('Step: %i | LOSS TRAIN: %.2f, DEV: %.2f, TEST: %.2f | NLL: %.2f, KL: %.4f| BETA: %.6f' %  log)

            print_sample(batch)

Step: 0 | LOSS TRAIN: 9.91, DEV: 9.91, TEST: 9.91 | NLL: 19989.35, KL: 0.2202| BETA: 0.010008
Step: 1000 | LOSS TRAIN: 6.59, DEV: 6.28, TEST: 6.26 | NLL: 705.00, KL: 3.5135| BETA: 0.010016
Step: 2000 | LOSS TRAIN: 6.43, DEV: 6.18, TEST: 6.16 | NLL: 602.32, KL: 2.7705| BETA: 0.010024
Step: 3000 | LOSS TRAIN: 6.34, DEV: 6.11, TEST: 6.09 | NLL: 551.30, KL: 2.6194| BETA: 0.010032
Step: 4000 | LOSS TRAIN: 6.26, DEV: 5.94, TEST: 5.92 | NLL: 512.02, KL: 2.5567| BETA: 0.010040
Step: 5000 | LOSS TRAIN: 6.19, DEV: 5.80, TEST: 5.78 | NLL: 475.38, KL: 2.5735| BETA: 0.010047
Step: 6000 | LOSS TRAIN: 6.12, DEV: 5.73, TEST: 5.71 | NLL: 442.81, KL: 2.6369| BETA: 0.010055
Step: 7000 | LOSS TRAIN: 6.06, DEV: 5.66, TEST: 5.66 | NLL: 415.69, KL: 2.7330| BETA: 0.010063
Step: 8000 | LOSS TRAIN: 6.00, DEV: 5.61, TEST: 5.59 | NLL: 393.43, KL: 2.8561| BETA: 0.010071
Step: 9000 | LOSS TRAIN: 5.96, DEV: 5.57, TEST: 5.55 | NLL: 374.77, KL: 3.0316| BETA: 0.010079
Step: 10000 | LOSS TRAIN: 5.91, DEV: 5.53, TEST: 5.

KeyboardInterrupt: 

In [59]:
for log in logs:
            print('Step: %i | LOSS TRAIN: %.2f, DEV: %.2f, TEST: %.2f | NLL: %.2f, KL: %.4f| BETA: %.6f' %  log)

Step: 0 | LOSS TRAIN: 9.91, DEV: 9.91, TEST: 9.91 | NLL: 19989.35, KL: 0.2202| BETA: 0.010008
Step: 1000 | LOSS TRAIN: 6.59, DEV: 6.28, TEST: 6.26 | NLL: 705.00, KL: 3.5135| BETA: 0.010016
Step: 2000 | LOSS TRAIN: 6.43, DEV: 6.18, TEST: 6.16 | NLL: 602.32, KL: 2.7705| BETA: 0.010024
Step: 3000 | LOSS TRAIN: 6.34, DEV: 6.11, TEST: 6.09 | NLL: 551.30, KL: 2.6194| BETA: 0.010032
Step: 4000 | LOSS TRAIN: 6.26, DEV: 5.94, TEST: 5.92 | NLL: 512.02, KL: 2.5567| BETA: 0.010040
Step: 5000 | LOSS TRAIN: 6.19, DEV: 5.80, TEST: 5.78 | NLL: 475.38, KL: 2.5735| BETA: 0.010047
Step: 6000 | LOSS TRAIN: 6.12, DEV: 5.73, TEST: 5.71 | NLL: 442.81, KL: 2.6369| BETA: 0.010055
Step: 7000 | LOSS TRAIN: 6.06, DEV: 5.66, TEST: 5.66 | NLL: 415.69, KL: 2.7330| BETA: 0.010063
Step: 8000 | LOSS TRAIN: 6.00, DEV: 5.61, TEST: 5.59 | NLL: 393.43, KL: 2.8561| BETA: 0.010071
Step: 9000 | LOSS TRAIN: 5.96, DEV: 5.57, TEST: 5.55 | NLL: 374.77, KL: 3.0316| BETA: 0.010079
Step: 10000 | LOSS TRAIN: 5.91, DEV: 5.53, TEST: 5.

# confirm variables

In [110]:
_logvars, _means, _kl_losses, _latents, _output_logits = sess.run([logvars, means, kl_losses, latents, output_logits], feed_dict=feed_dict)


In [111]:
_logvars.shape, _means.shape, _kl_losses.shape, _latents.shape

((32, 32), (32, 32), (32,), (32, 32))

In [112]:
_output_logits

array([[[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       ...,

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]], dtype=float32)

In [109]:
_output_logits, _dec_target_idxs_do, _dec_mask_tokens_do, _recon_loss, _kl_losses, _ = sess.run([output_logits, dec_target_idxs_do, dec_mask_tokens_do, recon_loss, kl_losses, opt], feed_dict=feed_dict)


NameError: name 'dec_target_idxs_do' is not defined

In [44]:
tf.reduce_max(output_logits, 2).eval(session=sess, feed_dict=feed_dict).shape

(120, 46)

In [31]:
_output_logits.shape, _dec_target_idxs_do.shape, _dec_mask_tokens_do.shape

((120, 46, 20000), (120, 46), (120, 46))

In [32]:
_logits = np.exp(_output_logits) / np.sum(np.exp(_output_logits), 2)[:, :, None]

In [33]:
_idxs = _dec_target_idxs_do

In [35]:
_losses = np.array([[-np.log(_logits[i, j, _idxs[i, j]]) for j in range(_idxs.shape[1])] for i in range(_idxs.shape[0])]) * _dec_mask_tokens_do

In [36]:
np.sum(_losses)/np.sum(_dec_mask_tokens_do)

9.903732

In [37]:
_recon_loss

9.903732

In [38]:
_kl_losses.shape

(120,)