In [1]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
from six.moves import zip_longest

import numpy as np
import tensorflow as tf
import _pickle as cPickle

from data_structure import load_data
from components import dynamic_rnn, dynamic_bi_rnn, DiagonalGaussian

In [2]:
# Special vocabulary tokens. Each of them has its own vocab id:
#   PAD - used to pad the encoder input, decoder input and target sequence
#   UNK - used to represent out-of-vocabulary words
#   BOS - used at the beginning of every decoder input sequence
#   EOS - used at the end of untruncated target sequences
PAD, UNK, BOS, EOS = '<pad>', '<unk>', '<p>', '</p>'

# load data & set config

In [3]:
def del_all_flags(FLAGS):
    """Delete every flag currently registered on ``FLAGS``.

    The flag names are copied into a list first, so the mapping returned by
    ``FLAGS._flags()`` is never iterated while entries are being removed.

    Args:
        FLAGS: object exposing ``_flags()`` (a mapping keyed by flag name)
            and supporting attribute deletion by flag name.
    """
    registered_names = list(FLAGS._flags())
    for flag_name in registered_names:
        delattr(FLAGS, flag_name)

# Drop any flags left over from a previous execution of this cell before
# defining them again.
del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

flags.DEFINE_string('gpu', '1', 'visible gpu')

flags.DEFINE_string('mode', 'train', 'set train or eval')

# data / model locations
flags.DEFINE_string('datadir', 'data', 'directory of input data')
flags.DEFINE_string('dataname', 'sports_sents.pkl', 'name of data')
flags.DEFINE_string('modeldir', 'NAS/model', 'directory of model')
flags.DEFINE_string('modelname', 'sports', 'name of model')

# training schedule
flags.DEFINE_integer('epochs', 10, 'epochs')
flags.DEFINE_integer('batch_size', 8, 'batch size')
flags.DEFINE_integer('log_period', 100, 'number of training steps between evaluation logs')

# optimisation
flags.DEFINE_string('opt', 'Adagrad', 'optimizer')
flags.DEFINE_float('lr', 0.1, 'lr')
flags.DEFINE_float('norm', 1e-6, 'norm')
flags.DEFINE_float('grad_clip', 10.0, 'grad_clip')

# dropout
flags.DEFINE_float('keep_prob', 0.8, 'keep_prob')
flags.DEFINE_float('word_keep_prob', 0.75, 'probability of keeping a decoder input word (otherwise replaced by UNK)')

# latent variable / KL term
flags.DEFINE_bool('iaf', True, 'use inverse autoregressive flow')
flags.DEFINE_bool('anneal', True, 'anneal the KL rate')
flags.DEFINE_integer('kl_rate_rise_time', 5000, 'kl rate rise time')
flags.DEFINE_float('kl_rate_rise_factor', 1e-3, 'kl rate rise factor')

# beam search
flags.DEFINE_integer('beam_width', 10, 'beam_width')
flags.DEFINE_float('length_penalty_weight', 0.0, 'length_penalty_weight')

# model sizes
flags.DEFINE_integer('dim_emb', 256, 'dim_emb')
flags.DEFINE_integer('dim_hidden', 256, 'dim_hidden')
flags.DEFINE_integer('dim_latent', 16, 'dim_latent')

# for evaluation
flags.DEFINE_string('refdir', 'ref', 'refdir')
flags.DEFINE_string('outdir', 'out', 'outdir')

# NOTE(review): presumably absorbs the `-f <file>` argument the notebook
# kernel is started with, so that flag parsing does not reject it -- confirm.
flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

In [4]:
# Export the value of the `gpu` flag ('visible gpu') as CUDA_VISIBLE_DEVICES.
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu

In [5]:
# Load the pickled dataset: three data splits plus the two vocabulary mappings.
data_path = os.path.join(config.datadir, config.dataname)
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle once the data has been read.
with open(data_path, 'rb') as data_file:
    data_train, data_dev, data_test, word_to_idx, idx_to_word = cPickle.load(data_file)

In [6]:
def grouper(iterable, n, fillvalue=None, shorten=True, num_groups=None):
    """Split ``iterable`` into a list of consecutive tuples of ``n`` items.

    Args:
        iterable: the items to group.
        n: number of items per group.
        fillvalue: value used to pad the last group up to ``n`` items.
        shorten: when true, every ``None`` entry is removed from each group
            (only allowed with ``fillvalue=None``), so the padded last group
            shrinks to its real items.
        num_groups: when given, groups consisting only of ``fillvalue`` are
            appended until there are at least ``num_groups`` groups; existing
            groups are never dropped.

    Returns:
        A list of tuples.
    """
    source = iter(iterable)
    # n references to ONE iterator: each output tuple pulls n successive items.
    groups = list(zip_longest(*([source] * n), fillvalue=fillvalue))

    if num_groups is not None:
        filler = (fillvalue,) * n
        assert isinstance(num_groups, int)
        # a non-positive repeat count yields an empty list, i.e. no padding
        groups.extend([filler] * (num_groups - len(groups)))

    if shorten:
        assert fillvalue is None
        groups = [tuple(item for item in group if item is not None) for group in groups]

    return groups

In [7]:
# Cut each split into groups of `config.batch_size` instances.
train_batches, dev_batches, test_batches = [
    grouper(split, config.batch_size)
    for split in (data_train, data_dev, data_test)
]

In [8]:
# Register the vocab id of every special token as a flag so it can be read from `config`.
for _flag_name, _token in (('PAD_IDX', PAD), ('UNK_IDX', UNK), ('BOS_IDX', BOS), ('EOS_IDX', EOS)):
    flags.DEFINE_integer(_flag_name, word_to_idx[_token], _flag_name)

flags.DEFINE_integer('n_vocab', len(word_to_idx), 'n_vocab')

# Largest `len()` of any element found in the dev batches.
# NOTE(review): assumes the batch elements support len() -- confirm against the data structure.
maximum_iterations = max(
    max(len(sent_idx) for sent_idx in batch)
    for batch in dev_batches
)
flags.DEFINE_integer('maximum_iterations', maximum_iterations, 'maximum_iterations')

# build model 

## fed variables

In [9]:
# Start from an empty graph so that re-running the model cells does not stack ops.
tf.reset_default_graph()

# Placeholders fed for every batch; the keys must match the ones filled in get_feed_dict.
t_variables = {}
t_variables['keep_prob'] = tf.placeholder(tf.float32)
t_variables['batch_l'] = tf.placeholder(tf.int32, [])
# number of sentences of each document; was missing although get_feed_dict feeds it
t_variables['doc_l'] = tf.placeholder(tf.int32, [None])
# [batch, document, sentence] token ids
t_variables['token_idxs'] = tf.placeholder(tf.int32, [None, None, None])
t_variables['dec_input_idxs'] = tf.placeholder(tf.int32, [None, None, None])
t_variables['dec_target_idxs'] = tf.placeholder(tf.int32, [None, None, None])
# [batch, document] sentence lengths
t_variables['sent_l'] = tf.placeholder(tf.int32, [None, None])
t_variables['dec_sent_l'] = tf.placeholder(tf.int32, [None, None])
# scalar maxima of the batch; 'max_doc_l' was missing although the encoder
# cell reads it and get_feed_dict feeds it (KeyError otherwise)
t_variables['max_doc_l'] = tf.placeholder(tf.int32, [])
t_variables['max_sent_l'] = tf.placeholder(tf.int32, [])

## trained variables

In [10]:
# Trainable parameters shared by the encoder and decoder cells below.
dtype = tf.float32

dim_hidden = config.dim_hidden
dim_latent = config.dim_latent

# word embedding table: one row of size dim_emb per vocabulary entry
with tf.variable_scope('emb', reuse=tf.AUTO_REUSE):
    embeddings = tf.get_variable('emb', [config.n_vocab, config.dim_emb], dtype=dtype, initializer=tf.contrib.layers.xavier_initializer())

# projection dim_hidden -> 2 * dim_latent; its output is later split into means and log-variances
with tf.variable_scope('enc', reuse=tf.AUTO_REUSE):
    w_enc = tf.get_variable('w', [dim_hidden, 2 * dim_latent], dtype=dtype)
    b_enc = tf.get_variable('b', [2 * dim_latent], dtype=dtype)

# projection dim_latent -> dim_hidden, used to build the decoder initial state
with tf.variable_scope('dec', reuse=tf.AUTO_REUSE):
    w_dec = tf.get_variable('w', [dim_latent, dim_hidden], dtype=dtype)
    b_dec = tf.get_variable('b', [dim_hidden], dtype=dtype)

## encode sentences

In [16]:
# input
batch_l = t_variables['batch_l']
max_doc_l = t_variables['max_doc_l']
max_sent_l = t_variables['max_sent_l']
token_idxs = t_variables['token_idxs'][:, :max_doc_l, :max_sent_l]

# get word embedding
enc_input = tf.nn.embedding_lookup(embeddings, token_idxs)
# Flatten the document axis into the batch axis (one row per sentence) so the
# leading dimension agrees with sent_l_do below; reshaping to `batch_l` alone
# only works when every document has exactly one sentence.
enc_input_do = tf.reshape(enc_input, [batch_l * max_doc_l, max_sent_l, config.dim_emb])

# get sentence embedding
sent_l = t_variables['sent_l']
sent_l_do = tf.reshape(sent_l, [batch_l * max_doc_l])

# NOTE(review): dynamic_rnn comes from `components`; its second return value is
# assumed to be the final state of size dim_hidden per sentence -- confirm.
_, enc_state = dynamic_rnn(enc_input_do, sent_l_do, dim_hidden, t_variables['keep_prob'], cell_name='Model/sent')

# encode to parameter
# NOTE(review): the ReLU makes both the means and the log-variances non-negative;
# confirm this restriction of the posterior is intended.
means_logvars = tf.nn.relu(tf.matmul(enc_state, w_enc) + b_enc)
means, logvars = tf.split(means_logvars, 2, 1)

# reparameterize: latents = mean + std * standard normal noise
noises = tf.random_normal(tf.shape(means))
latents = means + tf.exp(0.5 * logvars) * noises

## get latent parameters

In [17]:


# if config.iaf:
#     with tf.variable_scope('iaf'):
#         prior = DiagonalGaussian(tf.zeros_like(means, dtype=dtype), tf.zeros_like(logvars, dtype=dtype))
#         posterior = DiagonalGaussian(means, logvars)
#         z = posterior.sample

#         logqs = posterior.logps(z)
#         L = tf.get_variable("inverse_cholesky", [dim_latent, dim_latent], dtype=dtype, initializer=tf.zeros_initializer)
#         diag_one = tf.ones([dim_latent], dtype=dtype)
#         L = tf.matrix_set_diag(L, diag_one)
#         mask = np.tril(np.ones([dim_latent,dim_latent]))
#         L = L * mask
#         latents = tf.matmul(z, L)
#         logps = prior.logps(latents)
#         kl_losses = logqs - logps
# else:
#     noises = tf.random_normal(tf.shape(means))
#     latents = means + tf.exp(0.5 * logvars) * noises
    
#     kl_losses = tf.reduce_sum(-0.5 * (logvars - tf.square(means) - tf.exp(logvars) + 1.0), 1) # sum over latent dimension
    
# kl_mean = tf.reduce_mean(kl_losses, [0]) #mean of kl_losses over batches
# kl_loss = tf.reduce_sum(kl_mean)

# if config.anneal:
#     kl_rate = tf.Variable(0.0, trainable=False, dtype=dtype)
#     kl_loss = kl_loss * kl_rate

## decode sentences

In [45]:
# prepare for decoding
dec_input_idxs = t_variables['dec_input_idxs']
# one row per sentence; decoder sequences are one token longer than the encoder ones
dec_input_idxs_do = tf.reshape(dec_input_idxs, [batch_l * max_doc_l, max_sent_l+1])
dec_input_do = tf.nn.embedding_lookup(embeddings, dec_input_idxs_do)

# repeat the latent vector along the time axis and append it to every decoder input step
dec_latent_input_do = tf.tile(tf.expand_dims(latents, 1), [1, tf.shape(dec_input_do)[1], 1])
dec_concat_input_do = tf.concat([dec_input_do, dec_latent_input_do], 2)

# decode for training
dec_sent_l = t_variables['dec_sent_l']
dec_sent_l_do = tf.reshape(dec_sent_l, [batch_l * max_doc_l])

with tf.variable_scope('Model/sent/dec', initializer=tf.contrib.layers.xavier_initializer(), dtype = tf.float32, reuse=tf.AUTO_REUSE):
    dec_cell = tf.contrib.rnn.GRUCell(dim_hidden)
    dec_cell = tf.contrib.rnn.DropoutWrapper(dec_cell, output_keep_prob = t_variables['keep_prob'])

    # the decoder starts from a projection of the latent vector
    dec_initial_state = tf.nn.relu(tf.matmul(latents, w_dec) + b_dec)

    # teacher forcing: the decoder reads the fed decoder input at every step
    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_concat_input_do, sequence_length=dec_sent_l_do)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=dec_cell,
        helper=helper,
        initial_state=dec_initial_state)

    dec_outputs, _, output_sent_l = tf.contrib.seq2seq.dynamic_decode(train_decoder)

    output_layer = tf.layers.Dense(config.n_vocab, use_bias=False, name="output_projection")
    output_logits = output_layer(dec_outputs.rnn_output)

    # Predicted token id = position of the largest logit over the vocabulary axis.
    # (reduce_max returned the logit VALUE, which is not a usable vocabulary index.)
    output_token_idxs_do = tf.argmax(output_logits, axis=2, output_type=tf.int32)
    output_token_idxs = tf.reshape(output_token_idxs_do, [batch_l, max_doc_l, tf.shape(output_token_idxs_do)[1]])

In [46]:
# # decode for inference
# start_tokens = tf.fill([batch_l * (max_doc_l)], config.BOS_IDX)
# end_token = config.EOS_IDX            
# with tf.variable_scope('Model/sent/dec', initializer=tf.contrib.layers.xavier_initializer(), dtype = tf.float32, reuse=tf.AUTO_REUSE):
#     tiled_dec_initial_state = tf.contrib.seq2seq.tile_batch(dec_initial_state, multiplier=config.beam_width)

#     beam_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
#         cell=dec_cell,
#         embedding=embeddings,
#         start_tokens=start_tokens,
#         end_token=end_token,
#         initial_state=tiled_dec_initial_state,
#         beam_width=config.beam_width, 
#         output_layer=output_layer,
#         length_penalty_weight=config.length_penalty_weight)

#     beam_dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(beam_decoder, maximum_iterations = config.maximum_iterations)

#     output_token_idxs_do = beam_dec_outputs.predicted_ids[:, :, 0]
#     output_token_idxs = tf.reshape(output_token_idxs_do, [batch_l, max_doc_l, tf.shape(output_token_idxs_do)[1]])

## define cost & optimizer

In [47]:
# target and mask
dec_target_idxs = t_variables['dec_target_idxs']
dec_target_idxs_do = tf.reshape(dec_target_idxs, [batch_l * max_doc_l, max_sent_l+1])
# 1.0 for real target tokens, 0.0 for padding positions
dec_mask_tokens_do = tf.sequence_mask(dec_sent_l_do, maxlen=max_sent_l+1, dtype=tf.float32)

recon_loss = tf.contrib.seq2seq.sequence_loss(output_logits, dec_target_idxs_do, dec_mask_tokens_do) # nll for each token (averaged over batch & sentence)

# define loss
kl_losses = tf.reduce_sum(-0.5 * (logvars - tf.square(means) - tf.exp(logvars) + 1.0), 1) # sum over latent dimension

# Weight of the KL term. The training loop reads and raises `kl_rate`, so it has
# to exist in the graph: it starts at 0 when annealing is on and stays at 1 otherwise.
kl_rate = tf.Variable(0.0 if config.anneal else 1.0, trainable=False, dtype=tf.float32, name='kl_rate')
kl_loss = tf.reduce_mean(kl_losses, [0]) * kl_rate # mean of kl_losses over batches, scaled by the KL rate

# NOTE(review): the training loop fetches `norm` and its log line reports a NORM
# column that is part of the total loss, but no definition is visible in the
# notebook. An L2 penalty over all trainable variables weighted by the `norm`
# flag is ASSUMED here -- confirm against the original experiment.
norm = config.norm * tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()])

loss = kl_loss + recon_loss + norm

# define optimizer
if (config.opt == 'Adam'):
    optimizer = tf.train.AdamOptimizer(config.lr)
elif (config.opt == 'Adagrad'):
    optimizer = tf.train.AdagradOptimizer(config.lr)
else:
    # fail loudly instead of hitting a NameError (or reusing a stale optimizer) below
    raise ValueError('unsupported optimizer: %s' % config.opt)
# NOTE: running this cell twice on the same graph raises
# "ValueError: Variable .../Adagrad/ already exists"; rebuild the graph from the
# tf.reset_default_graph() cell instead of re-executing this cell alone.
opt = optimizer.minimize(loss)

ValueError: Variable Model/sent/dec/decoder/gru_cell/gates/kernel/Adagrad/ already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "<ipython-input-20-5ccb40bc1c0b>", line 25, in <module>
    opt = optimizer.minimize(loss)
  File "/home/m-isonuma/.pyenv/versions/anaconda2-5.3.0/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/home/m-isonuma/.pyenv/versions/anaconda2-5.3.0/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3185, in run_ast_nodes
    if (yield from self.run_code(code, result)):


# run model 

In [48]:
def get_feed_dict(batch, mode='train'):
    """Build the feed dict for one batch of instances.

    Every instance is read through its ``doc_l``, ``max_sent_l`` and
    ``token_idxs`` (a sequence of sentences, each a sequence of token ids)
    attributes. All matrices are zero-filled up to the largest document and
    sentence of the batch.

    Args:
        batch: sequence of instances.
        mode: ``'train'`` enables dropout (``config.keep_prob``) and word
            dropout on the decoder input (``config.word_keep_prob``); any
            other value disables both so that evaluation is deterministic.

    Returns:
        dict mapping the placeholders in ``t_variables`` to numpy values.
        Requires ``t_variables`` to contain the ``'doc_l'`` and
        ``'max_doc_l'`` placeholders in addition to the other keys used here.
    """
    is_train = mode == 'train'
    batch_size = len(batch)
    doc_l_matrix = np.array([instance.doc_l for instance in batch]).astype(np.int32)

    max_doc_l = np.max(doc_l_matrix)
    max_sent_l = max([instance.max_sent_l for instance in batch])

    token_idxs_matrix = np.zeros([batch_size, max_doc_l, max_sent_l], np.int32)
    # decoder sequences hold one extra position: BOS in the input, EOS in the target
    dec_input_idxs_matrix = np.zeros([batch_size, max_doc_l, max_sent_l+1], np.int32)
    dec_target_idxs_matrix = np.zeros([batch_size, max_doc_l, max_sent_l+1], np.int32)
    sent_l_matrix = np.zeros([batch_size, max_doc_l], np.int32)
    dec_sent_l_matrix = np.zeros([batch_size, max_doc_l], np.int32)

    for i, instance in enumerate(batch):
        for j, sent_idxs in enumerate(instance.token_idxs):
            n_tokens = len(sent_idxs)
            token_idxs_matrix[i, j, :n_tokens] = np.asarray(sent_idxs)

            # np.array always copies, so the stored sentence is never modified
            # even when it is already an ndarray.
            dec_tokens = np.array(sent_idxs, dtype=np.int32)
            if is_train:
                # word dropout: replace a random subset of decoder input words by UNK
                dropped = np.random.rand(n_tokens) > config.word_keep_prob
                dec_tokens[dropped] = config.UNK_IDX
            dec_input_idxs_matrix[i, j, :n_tokens+1] = np.concatenate([[config.BOS_IDX], dec_tokens])

            dec_target_idxs_matrix[i, j, :n_tokens+1] = np.asarray(list(sent_idxs) + [config.EOS_IDX])
            sent_l_matrix[i, j] = n_tokens
            dec_sent_l_matrix[i, j] = n_tokens+1

    keep_prob = config.keep_prob if is_train else 1.0

    feed_dict = {
                t_variables['token_idxs']: token_idxs_matrix,
                t_variables['dec_input_idxs']: dec_input_idxs_matrix, t_variables['dec_target_idxs']: dec_target_idxs_matrix,
                t_variables['batch_l']: batch_size, t_variables['doc_l']: doc_l_matrix, t_variables['sent_l']: sent_l_matrix, t_variables['dec_sent_l']: dec_sent_l_matrix,
                t_variables['max_doc_l']: max_doc_l, t_variables['max_sent_l']: max_sent_l,
                t_variables['keep_prob']: keep_prob}
    return  feed_dict

In [49]:
def idxs_to_sents(token_idxs, config, idx_to_word):
    """Turn rows of token ids into space-joined sentences.

    Each row is read up to, and excluding, the first ``config.EOS_IDX``;
    every id before it is looked up in ``idx_to_word``.

    Args:
        token_idxs: iterable of rows of token ids.
        config: object exposing ``EOS_IDX``.
        idx_to_word: lookup from token id to word.

    Returns:
        list of strings, one per row.
    """
    def _decode(line_idxs):
        words = []
        for idx in line_idxs:
            if idx == config.EOS_IDX:
                break
            words.append(idx_to_word[idx])
        return ' '.join(words)

    return [_decode(line_idxs) for line_idxs in token_idxs]

In [50]:
def get_loss(sess, batches):
    """Return the mean of the per-batch ``loss`` over ``batches``.

    Args:
        sess: session used to evaluate the graph.
        batches: iterable of batches as produced by ``grouper`` (each item is
            one batch of instances, not a ``(step, batch)`` pair).

    Returns:
        Unweighted mean of the batch losses; every batch is fed in 'test' mode.
    """
    losses = []
    # iterate over the batches themselves: unpacking each batch as
    # `ct, batch` fails for any batch that does not hold exactly two instances
    for batch in batches:
        feed_dict = get_feed_dict(batch, mode='test')
        loss_batch = sess.run(loss, feed_dict = feed_dict)
        losses.append(loss_batch)
    loss_mean = np.mean(losses)
    return loss_mean

In [51]:
def print_sample(sample_batch):
    """Print the reference and the decoded sentence for every sentence of a batch.

    Uses the global ``sess`` to evaluate ``output_token_idxs``.

    Args:
        sample_batch: batch of instances exposing ``token_idxs``.
    """
    # feed in 'test' mode so that dropout does not perturb the printed predictions
    feed_dict = get_feed_dict(sample_batch, mode='test')
    pred_token_idxs_batch = sess.run(output_token_idxs, feed_dict = feed_dict)
    true_token_idxs_batch = [instance.token_idxs for instance in sample_batch]

    assert len(pred_token_idxs_batch) == len(true_token_idxs_batch)

    for true_token_idxs, pred_token_idxs in zip(true_token_idxs_batch, pred_token_idxs_batch):
        # Predictions are padded to the longest document of the batch; keep only
        # the rows that correspond to real sentences of this document.
        pred_token_idxs = pred_token_idxs[:len(true_token_idxs)]

        true_sents = idxs_to_sents(true_token_idxs, config, idx_to_word)
        pred_sents = idxs_to_sents(pred_token_idxs, config, idx_to_word)
        assert len(true_sents) == len(pred_sents)

        for true_sent, pred_sent in zip(true_sents, pred_sents):
            print('True: %s' % true_sent)
            print('Pred: %s' % pred_sent)

In [52]:
# Close the session left over from a previous execution of this cell, if any.
if 'sess' in globals(): sess.close()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# NOTE(review): `embedding_matrix` is not defined in any visible cell; it has to
# be created before this cell runs (its shape must match `embeddings`).
sess.run(embeddings.assign(embedding_matrix.astype(np.float32)));

logs = []
losses_train = []
loss_min = np.inf
# First test batch as produced by grouper. Indexing it again with [1] would
# select a single instance, which get_feed_dict / print_sample cannot consume.
sample_batch = test_batches[0]

In [19]:
# One pass over the training batches; `ct` is the step counter.
# NOTE(review): the `epochs` flag is not used by this loop.
# enumerate() supplies the counter: grouper returns plain batches, not (step, batch) pairs.
for ct, batch in enumerate(train_batches):
    feed_dict = get_feed_dict(batch)
    _, loss_batch, kl_loss_batch, recon_loss_batch, norm_batch = sess.run([opt, loss, kl_loss, recon_loss, norm], feed_dict = feed_dict)
    losses_train += [[loss_batch, kl_loss_batch, recon_loss_batch, norm_batch]]

    # KL annealing: after the rise time, raise the KL rate by a fixed amount per step up to 1
    kl_rate_eval = kl_rate.eval(session=sess)
    if config.anneal and (ct > config.kl_rate_rise_time) and kl_rate_eval < 1:
        new_kl_rate = min(1.0, kl_rate_eval + config.kl_rate_rise_factor)
        # load() writes the value without adding an op to the graph;
        # calling kl_rate.assign(...) here would add a new op at every step
        kl_rate.load(new_kl_rate, session=sess)

    if ct%config.log_period==0:
        # running means over all training steps so far
        loss_train, kl_loss_train, recon_loss_train, norm_train = np.mean(losses_train, 0)
        loss_dev = get_loss(sess, dev_batches)

        # the test loss is only refreshed when the dev loss reaches a new minimum
        if loss_dev <= loss_min:
            loss_min = loss_dev
            loss_test = get_loss(sess, test_batches)

        clear_output()

        logs += [(ct, loss_train, loss_dev, loss_test, kl_loss_train, recon_loss_train, norm_train, kl_rate_eval)]
        for log in logs:
            print('Step: %i | LOSS TRAIN: %.2f, DEV: %.2f, TEST: %.2f | KL: %.2f, RECON: %.2f, NORM: %.2f | KL rate: %.3f' %  log)

        print_sample(sample_batch)

Step: 0 | LOSS TRAIN: 11.49, DEV: 11.48, TEST: 11.48 | KL: 0.00, RECON: 10.82, NORM: 0.67 | KL rate: 0.000
Step: 100 | LOSS TRAIN: 9.16, DEV: 7.84, TEST: 7.80 | KL: 0.00, RECON: 8.50, NORM: 0.67 | KL rate: 0.000
Step: 200 | LOSS TRAIN: 8.36, DEV: 7.30, TEST: 7.30 | KL: 0.00, RECON: 7.69, NORM: 0.67 | KL rate: 0.000
Step: 300 | LOSS TRAIN: 8.00, DEV: 7.18, TEST: 7.18 | KL: 0.00, RECON: 7.34, NORM: 0.67 | KL rate: 0.000
Step: 400 | LOSS TRAIN: 7.80, DEV: 7.06, TEST: 7.07 | KL: 0.00, RECON: 7.14, NORM: 0.67 | KL rate: 0.000
Step: 500 | LOSS TRAIN: 7.67, DEV: 6.97, TEST: 6.98 | KL: 0.00, RECON: 7.00, NORM: 0.67 | KL rate: 0.000
Step: 600 | LOSS TRAIN: 7.57, DEV: 6.95, TEST: 6.97 | KL: 0.00, RECON: 6.90, NORM: 0.67 | KL rate: 0.000
Step: 700 | LOSS TRAIN: 7.49, DEV: 6.90, TEST: 6.92 | KL: 0.00, RECON: 6.82, NORM: 0.67 | KL rate: 0.000
Step: 800 | LOSS TRAIN: 7.43, DEV: 6.87, TEST: 6.87 | KL: 0.00, RECON: 6.76, NORM: 0.67 | KL rate: 0.000
Step: 900 | LOSS TRAIN: 7.37, DEV: 6.82, TEST: 6.83 |

True: this light will no doubt capture the attention of night time drivers
Pred: this is a great product
True: it has three functions for the led blinking strobe and solid
Pred: i do not recommend this product
True: the lasers project well and can be set to flash or remain solid
Pred: i do not recommend this product
True: awesome product
Pred: i like this product
True: hopefully it holds up
Pred: i would recommend this one of these
True: i love this light
Pred: this is a great product
True: i have two of these
Pred: i do not recommend this product
True: it is very bright
Pred: if you are going to get it
True: i like the different blinking modes that the light offers
Pred: i do not recommend this product
True: i recommend this to everyone who rides at night
Pred: i do not buy this knife
True: i worked out once back in <unk>
Pred: i do not have to use it
True: i picked up some 40 lb weights and was like 34 too easy 34
Pred: i do not recommend this product
True: so then i grabbed some res

KeyboardInterrupt: 

# confirm variables

In [26]:
# Feed dict for the sample batch, reused by the inspection cells below.
# get_feed_dict is called with its default mode ('train').
feed_dict = get_feed_dict(sample_batch)

In [27]:
# Same two statements as in the decoding cell: executing them again adds new
# tile/concat ops to the graph and rebinds both names to the new tensors.
dec_latent_input_do = tf.tile(tf.expand_dims(latents, 1), [1, tf.shape(dec_input_do)[1], 1])
dec_concat_input_do = tf.concat([dec_input_do, dec_latent_input_do], 2)

In [28]:
# Evaluate the latent-variable tensors and the decoder inputs on the sample batch
# (a single run, so all values come from the same noise sample).
_logvars, _means, _kl_losses, _latents, _dec_latent_input_do, _dec_input_do, _dec_concat_input_do = sess.run([logvars, means, kl_losses, latents, dec_latent_input_do, dec_input_do, dec_concat_input_do], feed_dict=feed_dict)


In [29]:
# Shapes of the arrays fetched above.
_logvars.shape, _means.shape, _kl_losses.shape, _latents.shape, _dec_input_do.shape, _dec_latent_input_do.shape, _dec_concat_input_do.shape

((120, 16),
 (120, 16),
 (120,),
 (120, 16),
 (120, 46, 300),
 (120, 46, 16),
 (120, 46, 316))

In [30]:
# Fetch logits, targets, mask and losses for the sample batch.
# NOTE: `opt` is among the fetches, so running this cell also applies one
# optimizer update computed on the sample batch.
_output_logits, _dec_target_idxs_do, _dec_mask_tokens_do, _recon_loss, _kl_losses, _ = sess.run([output_logits, dec_target_idxs_do, dec_mask_tokens_do, recon_loss, kl_losses, opt], feed_dict=feed_dict)


In [44]:
# Shape of the maximum logit taken over axis 2 of output_logits.
# Note that reduce_max yields the maximum VALUES, not their positions.
tf.reduce_max(output_logits, 2).eval(session=sess, feed_dict=feed_dict).shape

(120, 46)

In [31]:
# Shapes of the fetched logits, target ids and token mask.
_output_logits.shape, _dec_target_idxs_do.shape, _dec_mask_tokens_do.shape

((120, 46, 20000), (120, 46), (120, 46))

In [32]:
# Softmax over the last axis. Despite its name, `_logits` holds probabilities.
# The per-position maximum is subtracted first: the result is mathematically
# unchanged, but np.exp can no longer overflow on large logits.
_shifted_logits = _output_logits - np.max(_output_logits, axis=2, keepdims=True)
_exp_logits = np.exp(_shifted_logits)
_logits = _exp_logits / np.sum(_exp_logits, axis=2, keepdims=True)

In [33]:
# Short alias for the fetched target token ids (same array, no copy).
_idxs = _dec_target_idxs_do

In [35]:
# Negative log-probability of the target id at every (row, step) position,
# zeroed wherever the token mask is 0. The index grids select
# _logits[i, j, _idxs[i, j]] for all positions at once.
_row_idx, _step_idx = np.indices(_idxs.shape)
_losses = -np.log(_logits[_row_idx, _step_idx, _idxs]) * _dec_mask_tokens_do

In [36]:
# Sum of the masked per-token losses divided by the number of unmasked tokens;
# meant to be compared with the `_recon_loss` value fetched from the graph.
np.sum(_losses)/np.sum(_dec_mask_tokens_do)

9.903732

In [37]:
# Reconstruction loss returned by the graph for the same batch.
_recon_loss

9.903732

In [38]:
# Shape of the fetched per-row KL terms.
_kl_losses.shape

(120,)