In [1]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
import h5py
from six.moves import zip_longest

import numpy as np
import tensorflow as tf
from tensorflow.python import debug as tf_debug
import _pickle as cPickle
import random

from data_structure import load_data
from preprocess_text import Indexer
from components import dynamic_rnn, dynamic_bi_rnn, DiagonalGaussian
from data import Dataset

In [2]:
PAD = '<pad>' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNK = '<unk>' # This has a vocab id, which is used to represent out-of-vocabulary words
BOS = '<p>' # This has a vocab id, which is used at the beginning of every decoder input sequence
EOS = '</p>' # This has a vocab id, which is used at the end of untruncated target sequences

# load data & set config

In [3]:
def del_all_flags(FLAGS):
    flags_dict = FLAGS._flags()    
    keys_list = [keys for keys in flags_dict]    
    for keys in keys_list:
        FLAGS.__delattr__(keys)

del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

flags.DEFINE_string('gpu', '1', 'visible gpu')

flags.DEFINE_string('mode', 'train', 'set train or eval')

flags.DEFINE_string('datadir', 'data', 'directory of input data')
flags.DEFINE_string('dataname', 'sports_sents.pkl', 'name of data')
flags.DEFINE_string('modeldir', 'NAS/model', 'directory of model')
flags.DEFINE_string('modelname', 'sports', 'name of model')

flags.DEFINE_string('train_file', 'data/yahoo/yahoo32-train.hdf5', 'name of model')
flags.DEFINE_string('val_file', 'data/yahoo/yahoo32-val.hdf5', 'name of model')
flags.DEFINE_string('test_file', 'data/yahoo/yahoo32-test.hdf5', 'name of model')
flags.DEFINE_string('vocab_file', 'data/yahoo/yahoo32.dict', 'name of model')

flags.DEFINE_integer('epochs', 10, 'epochs')
flags.DEFINE_integer('batch_size', 32, 'batch size')
flags.DEFINE_integer('log_period', 100, 'valid period')

flags.DEFINE_string('opt', 'Adagrad', 'optimizer')
flags.DEFINE_float('lr', 0.1, 'lr')
flags.DEFINE_float('grad_clip', 5., 'grad_clip')

flags.DEFINE_float('keep_prob', 1.0, 'dropout rate')
flags.DEFINE_float('word_keep_prob', 0.75, 'word dropout rate')

flags.DEFINE_integer('warmup', 10, 'warmup period for KL')

flags.DEFINE_integer('beam_width', 10, 'beam_width')
flags.DEFINE_float('length_penalty_weight', 0.0, 'length_penalty_weight')

flags.DEFINE_integer('dim_emb', 256, 'dim_latent')
flags.DEFINE_integer('dim_hidden', 256, 'dim_output')
flags.DEFINE_integer('dim_latent', 16, 'dim_latent')

# for evaluation
flags.DEFINE_string('refdir', 'ref', 'refdir')
flags.DEFINE_string('outdir', 'out', 'outdir')

flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu

In [5]:
train_data = Dataset(config.train_file)
val_data = Dataset(config.val_file)
test_data = Dataset(config.test_file)

print('Train data: %d batches' % len(train_data))
print('Val data: %d batches' % len(val_data))
print('Test data: %d batches' % len(test_data))

batch_size = train_data.batch_size[0]
n_vocab = int(train_data.vocab_size)

Train data: 3211 batches
Val data: 398 batches
Test data: 392 batches


In [6]:
indexer = Indexer()
indexer.load_vocab(config.vocab_file)
word_to_idx = indexer.d
idx_to_word = indexer.idx2word

assert len(word_to_idx) == n_vocab

In [7]:
train_batches = [train_data[i][0].data.numpy()[:, 1:-1] for i in range(len(train_data))]
val_batches = [val_data[i][0].data.numpy()[:, 1:-1] for i in range(len(val_data))]
test_batches = [test_data[i][0].data.numpy()[:, 1:-1] for i in range(len(test_data))]
num_train_batches = len(train_batches)

In [8]:
flags.DEFINE_integer('PAD_IDX', word_to_idx[PAD], 'PAD_IDX')
flags.DEFINE_integer('UNK_IDX', word_to_idx[UNK], 'UNK_IDX')
flags.DEFINE_integer('BOS_IDX', word_to_idx[BOS], 'BOS_IDX')
flags.DEFINE_integer('EOS_IDX', word_to_idx[EOS], 'EOS_IDX')

flags.DEFINE_integer('n_vocab', len(word_to_idx), 'n_vocab')

maximum_iterations = val_data.sents.numpy().shape[1]
flags.DEFINE_integer('maximum_iterations', maximum_iterations, 'maximum_iterations')

# build model 

## feed dict

In [9]:
def get_feed_dict(batch, mode='train'):
    batch_size = len(batch)

    sent_l = [len(sent_idxs) for sent_idxs in batch]
    dec_sent_l = [len(sent_idxs)+1 for sent_idxs in batch]
    max_sent_l = max(sent_l)

    token_idxs_matrix = np.zeros([batch_size, max_sent_l], np.int32)
    dec_input_idxs_matrix = np.zeros([batch_size, max_sent_l+1], np.int32)
    dec_target_idxs_matrix = np.zeros([batch_size, max_sent_l+1], np.int32)
    
    for i, sent_idxs in enumerate(batch):
        token_idxs_matrix[i, :len(sent_idxs)] = np.asarray(sent_idxs)

        sent_idxs_dropout = np.asarray(sent_idxs)
        sent_idxs_dropout[np.random.rand(len(sent_idxs)) > config.word_keep_prob] = config.UNK_IDX
        dec_input_idxs_matrix[i, :len(sent_idxs)+1] = np.concatenate([[config.BOS_IDX], sent_idxs_dropout])

        dec_target_idxs_matrix[i, :len(sent_idxs)+1] = np.concatenate([np.asarray(sent_idxs), [config.EOS_IDX]])

    keep_prob = config.keep_prob if mode == 'train' else 1.0

    feed_dict = {
                t_variables['token_idxs']: token_idxs_matrix,
                t_variables['dec_input_idxs']: dec_input_idxs_matrix, t_variables['dec_target_idxs']: dec_target_idxs_matrix, 
                t_variables['batch_l']: batch_size, t_variables['sent_l']: sent_l, t_variables['dec_sent_l']: dec_sent_l,
                t_variables['keep_prob']: keep_prob}
    return  feed_dict

In [10]:
def debug_shape(variables):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        sample_batch = test_batches[0]
        feed_dict = get_feed_dict(sample_batch)
        _variables = sess.run(variables, feed_dict=feed_dict)
        for _variable, variable in zip(_variables, variables):
            print(variable.name, ':', _variable.shape)
            
        sess.close()

## fed variables

In [11]:
tf.reset_default_graph()

t_variables = {}
t_variables['keep_prob'] = tf.placeholder(tf.float32)
t_variables['batch_l'] = tf.placeholder(tf.int32, [])
t_variables['token_idxs'] = tf.placeholder(tf.int32, [None, None])
t_variables['dec_input_idxs'] = tf.placeholder(tf.int32, [None, None])
t_variables['dec_target_idxs'] = tf.placeholder(tf.int32, [None, None])
t_variables['sent_l'] = tf.placeholder(tf.int32, [None])
t_variables['dec_sent_l'] = tf.placeholder(tf.int32, [None])

## trained variables

In [12]:
dtype = tf.float32

dim_hidden = config.dim_hidden
dim_latent = config.dim_latent

# with tf.variable_scope('emb', reuse=tf.AUTO_REUSE):
#     embeddings = tf.get_variable('emb', [config.n_vocab, config.dim_emb], dtype=dtype, initializer=tf.random_uniform_initializer(-1., 1.))
    
# with tf.variable_scope('enc', reuse=tf.AUTO_REUSE):
#     w_enc = tf.get_variable('w', [dim_hidden, 2 * dim_latent], dtype=dtype, initializer=tf.random_uniform_initializer(-1., 1.))
#     b_enc = tf.get_variable('b', [2 * dim_latent], dtype=dtype, initializer=tf.constant_initializer())
    
# with tf.variable_scope('dec', reuse=tf.AUTO_REUSE):
#     w_dec = tf.get_variable('w', [dim_latent, dim_hidden], dtype=dtype, initializer=tf.random_uniform_initializer(-1., 1.))
#     b_dec = tf.get_variable('b', [dim_hidden], dtype=dtype, initializer=tf.constant_initializer())

if config.warmup > 0:
    beta = tf.Variable(0.1, name='beta', trainable=False)    
    
with tf.variable_scope('emb'):
    embeddings = tf.get_variable('emb', [config.n_vocab, config.dim_emb], dtype=dtype, initializer=tf.contrib.layers.xavier_initializer())
    
with tf.variable_scope('enc'):
    w_enc = tf.get_variable('w_enc', [dim_hidden, 2 * dim_latent], dtype=dtype)
    b_enc = tf.get_variable('b_enc', [2 * dim_latent], dtype=dtype)
    
with tf.variable_scope('dec'):
    w_dec = tf.get_variable('w_dec', [dim_latent, dim_hidden], dtype=dtype)
    b_dec = tf.get_variable('b_dec', [dim_hidden], dtype=dtype)



## encode sentences

In [13]:
# input
batch_l = t_variables['batch_l']
sent_l = t_variables['sent_l']
max_sent_l = tf.reduce_max(sent_l)
token_idxs = t_variables['token_idxs']

# get sentence embedding
enc_input = tf.nn.embedding_lookup(embeddings, token_idxs)
_, enc_state = dynamic_rnn(enc_input, sent_l, dim_hidden, t_variables['keep_prob'], cell_name='Model/sent')

# encode to parameter 
means_logvars = tf.nn.relu(tf.matmul(enc_state, w_enc) + b_enc)
means, logvars = tf.split(means_logvars, 2, 1)

# reparameterize
noises = tf.random_normal(tf.shape(means))
latents = means + tf.exp(0.5 * logvars) * noises

In [14]:
debug_shape([token_idxs, enc_input, enc_state, means, logvars, latents])

Placeholder_2:0 : (32, 20)
embedding_lookup:0 : (32, 20, 256)
Model/sent/rnn/while/Exit_3:0 : (32, 256)
split:0 : (32, 16)
split:1 : (32, 16)
add_1:0 : (32, 16)


## decode sentences

In [15]:
# prepare for decoding
dec_input_idxs = t_variables['dec_input_idxs']
dec_input = tf.nn.embedding_lookup(embeddings, dec_input_idxs)

dec_latent_input = tf.tile(tf.expand_dims(latents, 1), [1, tf.shape(dec_input_idxs)[1], 1])
dec_concat_input = tf.concat([dec_input, dec_latent_input], 2)

# decode for training
dec_sent_l = t_variables['dec_sent_l']

with tf.variable_scope('Model/sent/dec', initializer=tf.contrib.layers.xavier_initializer(), dtype = tf.float32, reuse=tf.AUTO_REUSE):
    dec_cell = tf.contrib.rnn.GRUCell(dim_hidden)
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob = t_variables['keep_prob'])

    dec_initial_state = tf.nn.relu(tf.matmul(latents, w_dec) + b_dec)
    
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_concat_input, sequence_length=dec_sent_l)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=dec_cell,
        helper=helper,
        initial_state=dec_initial_state)

    dec_outputs, _, output_sent_l = tf.contrib.seq2seq.dynamic_decode(train_decoder)
    
    output_layer = tf.layers.Dense(config.n_vocab, use_bias=False, name="output_projection")
    output_logits = output_layer(dec_outputs.rnn_output)
    
    output_token_idxs = tf.argmax(output_logits, 2)

In [16]:
debug_shape([dec_concat_input, output_logits, output_token_idxs])

concat:0 : (32, 21, 272)
Model/sent/dec/output_projection/Tensordot:0 : (32, 21, 20001)
Model/sent/dec/ArgMax:0 : (32, 21)


## define cost & optimizer

In [17]:
# target and mask
dec_target_idxs = t_variables['dec_target_idxs']
max_dec_sent_l = tf.reduce_max(dec_sent_l)
dec_mask_tokens = tf.sequence_mask(dec_sent_l, maxlen=max_dec_sent_l, dtype=tf.float32)

recon_loss = tf.contrib.seq2seq.sequence_loss(output_logits, dec_target_idxs, dec_mask_tokens) # nll for each token (averaged over batch & sentence)

# define loss
kl_losses = tf.reduce_sum(-0.5 * (logvars - tf.square(means) - tf.exp(logvars) + 1.0), 1) # sum over latent dimentsion    
kl_loss = tf.reduce_mean(kl_losses, [0]) #mean of kl_losses over batches

loss = recon_loss + beta * kl_loss

# define optimizer
if (config.opt == 'Adam'):
    optimizer = tf.train.AdamOptimizer(config.lr)
elif (config.opt == 'Adagrad'):
    optimizer = tf.train.AdagradOptimizer(config.lr)
    
grad_vars = optimizer.compute_gradients(loss)
clipped_grad_vars = [(tf.clip_by_value(grad, -config.grad_clip, config.grad_clip), var) for grad, var in grad_vars]

opt = optimizer.apply_gradients(clipped_grad_vars)

In [18]:
debug_shape([kl_losses])

Sum:0 : (32,)


# run model 

In [19]:
def idxs_to_sent(line_idxs, config, idx_to_word):
    tokens = []
    for idx in line_idxs:
        if idx == config.EOS_IDX: break
        tokens.append(idx_to_word[idx])
    sent = ' '.join(tokens)
    return sent

In [20]:
def get_loss(sess, batches):
    losses = []
    for batch in batches:
        feed_dict = get_feed_dict(batch, mode='test')
        loss_batch = sess.run(loss, feed_dict = feed_dict)
        losses += [loss_batch]        
    loss_mean = np.mean(losses)
    return loss_mean

In [21]:
def print_sample(sample_batch):
    feed_dict = get_feed_dict(sample_batch)
    pred_token_idxs = sess.run(output_token_idxs, feed_dict = feed_dict)
    true_token_idxs = sample_batch
    
    assert len(pred_token_idxs) == len(true_token_idxs)
    
    for true_sent_idxs, pred_sent_idxs in zip(true_token_idxs, pred_token_idxs):
        true_sent = idxs_to_sent(true_sent_idxs, config, idx_to_word)
        pred_sent = idxs_to_sent(pred_sent_idxs, config, idx_to_word)

        print('True: %s' % true_sent)
        print('Pred: %s' % pred_sent)

In [24]:
if 'sess' in globals(): sess.close()
sess = tf.Session()

vim 
logs = []
losses_train = []
loss_min = np.inf
beta_eval = 0.1

In [25]:
for epoch in range(config.epochs):
    for ct, batch in enumerate(train_batches):
        feed_dict = get_feed_dict(batch)
        if config.warmup > 0: sess.run(beta.assign(np.minimum(1., beta_eval+ 1./(config.warmup*num_train_batches))))

        _, loss_batch, kl_loss_batch, recon_loss_batch = sess.run([opt, loss, kl_loss, recon_loss], feed_dict = feed_dict)
        losses_train += [[loss_batch, kl_loss_batch, recon_loss_batch]]

        if ct%config.log_period==0:
            loss_train, kl_loss_train, recon_loss_train = np.mean(losses_train, 0)
            ppl_train = np.exp(recon_loss_train)
            if config.warmup > 0: beta_eval = beta.eval(session=sess)
            loss_val = get_loss(sess, val_batches)

            if loss_val <= loss_min:
                loss_min = loss_val
                loss_test = get_loss(sess, test_batches)

            clear_output()

            logs += [(epoch, ct, loss_train, loss_val, loss_test, ppl_train, kl_loss_train, beta_eval)]
            for log in logs:
                print('Epoch: %i, Step: %i | LOSS TRAIN: %.2f, val: %.2f, TEST: %.2f | NLL: %.2f, KL: %.4f| BETA: %.6f' %  log)

            print_sample(batch)

Epoch: 0, Step: 0 | LOSS TRAIN: 9.91, val: 9.90, TEST: 9.90 | NLL: 20179.49, KL: 0.1638| BETA: 0.010031
Epoch: 0, Step: 100 | LOSS TRAIN: 5.94, val: 3.95, TEST: 3.95 | NLL: 355.32, KL: 6.6171| BETA: 0.010062
Epoch: 0, Step: 200 | LOSS TRAIN: 5.74, val: 3.61, TEST: 3.61 | NLL: 292.68, KL: 5.9026| BETA: 0.010093
Epoch: 0, Step: 300 | LOSS TRAIN: 5.63, val: 2.87, TEST: 2.87 | NLL: 265.02, KL: 5.2205| BETA: 0.010125
Epoch: 0, Step: 400 | LOSS TRAIN: 5.57, val: 3.01, TEST: 2.87 | NLL: 249.75, KL: 4.7162| BETA: 0.010156
Epoch: 0, Step: 500 | LOSS TRAIN: 5.53, val: 2.69, TEST: 2.93 | NLL: 241.19, KL: 4.3370| BETA: 0.010187
Epoch: 0, Step: 600 | LOSS TRAIN: 5.50, val: 2.56, TEST: 2.74 | NLL: 234.59, KL: 4.0051| BETA: 0.010218
Epoch: 0, Step: 700 | LOSS TRAIN: 5.47, val: 2.02, TEST: 2.16 | NLL: 229.73, KL: 3.7316| BETA: 0.010249
Epoch: 0, Step: 800 | LOSS TRAIN: 5.46, val: 2.10, TEST: 2.16 | NLL: 226.41, KL: 3.4892| BETA: 0.010280
Epoch: 0, Step: 900 | LOSS TRAIN: 5.44, val: 2.20, TEST: 2.16 | 

KeyboardInterrupt: 

# confirm variables

In [110]:
_logvars, _means, _kl_losses, _latents, _output_logits = sess.run([logvars, means, kl_losses, latents, output_logits], feed_dict=feed_dict)


In [111]:
_logvars.shape, _means.shape, _kl_losses.shape, _latents.shape

((32, 32), (32, 32), (32,), (32, 32))

In [112]:
_output_logits

array([[[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       ...,

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]], dtype=float32)

In [109]:
_output_logits, _dec_target_idxs_do, _dec_mask_tokens_do, _recon_loss, _kl_losses, _ = sess.run([output_logits, dec_target_idxs_do, dec_mask_tokens_do, recon_loss, kl_losses, opt], feed_dict=feed_dict)


NameError: name 'dec_target_idxs_do' is not defined

In [44]:
tf.reduce_max(output_logits, 2).eval(session=sess, feed_dict=feed_dict).shape

(120, 46)

In [31]:
_output_logits.shape, _dec_target_idxs_do.shape, _dec_mask_tokens_do.shape

((120, 46, 20000), (120, 46), (120, 46))

In [32]:
_logits = np.exp(_output_logits) / np.sum(np.exp(_output_logits), 2)[:, :, None]

In [33]:
_idxs = _dec_target_idxs_do

In [35]:
_losses = np.array([[-np.log(_logits[i, j, _idxs[i, j]]) for j in range(_idxs.shape[1])] for i in range(_idxs.shape[0])]) * _dec_mask_tokens_do

In [36]:
np.sum(_losses)/np.sum(_dec_mask_tokens_do)

9.903732

In [37]:
_recon_loss

9.903732

In [38]:
_kl_losses.shape

(120,)