In [1]:
from nltk.tokenize import word_tokenize
from collections import OrderedDict
import time
from collections import Counter
import numpy as np
import os

In [2]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
tf.enable_eager_execution()
from tensorflow.python.ops import lookup_ops

In [3]:
tf.set_random_seed(42)

In [4]:
nli_file = '../data/nli_data_tokenized.txt'
# nli_file = '../data/small_dataset.txt'
vocab_file = '../data/vocab.txt'
label_vocab_file = '../data/label_vocab.txt'

In [5]:
# all hyperparameters
vocab_size = 30000
embedding_size = 256
batch_size = 256
num_neurons = 512

In [6]:
logging = tf.logging
logging.set_verbosity(logging.INFO)

def log_msg(msg):
   logging.info(f'{time.ctime()}: {msg}')

In [7]:
def NLI_create_dataset(nli_file, vocab_table, label_vocab ,batch_size):
    dataset = tf.data.TextLineDataset(nli_file)
    dataset = dataset.map(lambda sentence: (tf.string_split([sentence],'\t').values[0], tf.string_split([sentence],'\t').values[1], tf.string_split([sentence],'\t').values[2]))
    dataset = dataset.map(lambda s1, s2, lab : (tf.string_split([s1]).values, tf.string_split([s2]).values, lab ))
    dataset = dataset.map(lambda s1, s2, lab: (s1, tf.size(s1), s2, tf.size(s2), lab  ))
    dataset = dataset.map(lambda s1, l1, s2, l2, lab: (vocab_table.lookup(s1), l1, vocab_table.lookup(s2), l2, label_vocab.lookup(lab)   ) )
    dataset = dataset.padded_batch(batch_size=batch_size ,padded_shapes=([None], [],[None],[], []))
    return dataset    

In [8]:
# # Embedding model
# class Embedding(tf.keras.Model):
#     def __init__(self, V, d):
#         super(Embedding, self).__init__()
#         self.W = tfe.Variable(tf.random_uniform(minval=-1.0, maxval=1.0, shape=[V, d]))
    
#     def call(self, word_indexes):
#         return tf.nn.embedding_lookup(self.W, word_indexes)

In [9]:
class Embedding(tf.keras.Model):
    def __init__(self, V, d, init):
        super(Embedding, self).__init__()
#         self.W = tfe.Variable(tf.random_uniform(minval=-1.0, maxval=1.0, shape=[V, d]))
        self.W = tfe.Variable(init)
    
    def call(self, word_indexes):
        return tf.nn.embedding_lookup(self.W, word_indexes)

In [10]:
class StaticRNN(tf.keras.Model):
    def __init__(self, h, cell):
        super(StaticRNN, self).__init__()
        if cell == 'lstm':
            self.cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=h)
        elif cell == 'gru':
            self.cell = tf.nn.rnn_cell.GRUCell(num_units=h)
        else:
            self.cell = tf.nn.rnn_cell.BasicRNNCell(num_units=h)
        
        
    def call(self, word_vectors, num_words, state, init_state):
        word_vectors_time = tf.unstack(word_vectors, axis=1)
        if state:
            outputs, final_state = tf.nn.static_rnn(cell=self.cell, initial_state = init_state,  sequence_length=num_words, inputs=word_vectors_time, dtype=tf.float32)
        else:
            outputs, final_state = tf.nn.static_rnn(cell=self.cell,  sequence_length=num_words, inputs=word_vectors_time, dtype=tf.float32)
        return outputs, final_state

In [11]:
class Encoder(tf.keras.Model):
    def __init__(self, V, d, h, cell):
        super(Encoder, self).__init__()
        init = tf.random_uniform(minval=-1.0, maxval=1.0, shape=[V, d])
        self.word_embedding = Embedding(V, d, init)
        self.rnn = StaticRNN(h, cell)

        
    def call(self, datum, lens, state, init_state):
        word_vectors = self.word_embedding(datum)        
        logits, final_state = self.rnn(word_vectors, lens, state, init_state)
        batch_outputs = []
        for i in range(int(tf.size(lens))):
            sen_len = int(lens[i])
            batch_outputs.append(logits[sen_len-1][i])

#         return logits[-1], final_state
        return tf.convert_to_tensor(batch_outputs), final_state

In [12]:
class NLI_Decoder(tf.keras.Model):
    def __init__(self, h):
        super(NLI_Decoder, self).__init__()
        self.mlp = tf.keras.layers.Dense(units=h)
        self.output_layer = tf.keras.layers.Dense(units=3) #3 output categories

    def call(self, datum, dropout): #datum is actually concatenated output from rnn
        hidden_out = self.mlp(datum)
        relu = tf.nn.relu(hidden_out)
        if dropout:
            relu = tf.nn.dropout(relu, keep_prob = 0.7)
        logits = self.output_layer(relu)
        return logits

In [13]:
# def get_batch_encoding(datum, outputs, len_index):
#     batch_outputs = []
#     cur_batch_size = datum[0].shape[0]
#     for i in range(cur_batch_size):
#         sen_len = int(datum[len_index][i])
#         batch_outputs.append(outputs[sen_len-1][i])
#     return tf.convert_to_tensor(batch_outputs)

In [14]:
def NLI_loss_fun(model1, model2, datum):
    u, s1 = model1(datum[0], datum[1], False, dummy_state)
    v, s2 = model1(datum[2],datum[3], False, dummy_state)
    mlp_input = tf.concat( [tf.concat([u,v],1), tf.abs(tf.subtract(u,v)), tf.multiply(u,v)], 1 )
    logits = model2(mlp_input, True)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=datum[4])
    return tf.reduce_sum(loss) / tf.cast(datum[0].shape[0], dtype = tf.float32)

In [15]:
def clip_gradients(grads_and_vars, clip_ratio):
  gradients, variables = zip(*grads_and_vars)
  clipped, _ = tf.clip_by_global_norm(gradients, clip_ratio)
  return zip(clipped, variables)

In [16]:
def compute_ppl(model1, model2 , dataset):
    total_loss = 0.
    total_words = 0
    for batch_num, datum in enumerate(dataset):
        avg_loss = NLI_loss_fun(model1, model2 , datum)
        total_loss = avg_loss * tf.cast(datum[0].shape[0], dtype = tf.float32)
        total_words += tf.cast(datum[0].shape[0], dtype = tf.float32)
        
#         if batch_num % 50 == 0:
#             print(f'ppl Done batch: {batch_num}')
    loss = total_loss / total_words
    return np.exp(loss)

In [17]:
vocab_table = lookup_ops.index_table_from_file(vocab_file, default_value=0)
NLI_label_vocab = lookup_ops.index_table_from_file(label_vocab_file, default_value=0)

In [18]:
dataset = NLI_create_dataset(nli_file, vocab_table, NLI_label_vocab, batch_size)
dev_file = '../data/nli_dev_tokenized.txt'
valid_dataset = NLI_create_dataset(dev_file, vocab_table, NLI_label_vocab, batch_size)

In [19]:
test_datum = next(iter(dataset))

In [20]:
opt = tf.train.AdamOptimizer(learning_rate=0.002)
loss_and_grads_fun = tfe.implicit_value_and_gradients(NLI_loss_fun)

In [21]:
dummy_state = tf.convert_to_tensor(np.zeros(num_neurons))

In [22]:
encoder = Encoder(vocab_size, embedding_size, num_neurons, 'gru')
decoder = NLI_Decoder(num_neurons)

In [23]:
# u1 = encoder(test_datum[0], test_datum[1])
# u2 = encoder(test_datum[2], test_datum[3])
# mlp_input1 = tf.concat( [tf.concat([u1,u2],1), tf.abs(tf.subtract(u1,u2)), tf.multiply(u1,u2)], 1 )
# print(u1)
# logits = decoder(mlp_input1, False)
# # print(logits)

In [24]:
checkpoint_dir = '../nli_encoder'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
root = tfe.Checkpoint(optimizer=opt, model=encoder, optimizer_step=tf.train.get_or_create_global_step())

In [25]:
NUM_EPOCHS = 1
STATS_STEPS = 50
EVAL_STEPS = 50

valid_ppl = compute_ppl(encoder, decoder, valid_dataset)
print(f'Start :Valid ppl: {valid_ppl}')

for epoch_num in range(NUM_EPOCHS):
    batch_loss = []
    dataset = dataset.shuffle(buffer_size = 10000)
    for step_num, datum in enumerate(dataset, start=1):
        loss_value, gradients = loss_and_grads_fun(encoder, decoder, datum)
        batch_loss.append(loss_value)
        
        if step_num % STATS_STEPS == 0:
            print(f'Epoch: {epoch_num} Step: {step_num} Avg Loss: {np.average(np.asarray(loss_value))}')
            batch_loss = []
        
        if step_num % EVAL_STEPS == 0:
            ppl = compute_ppl(encoder, decoder, valid_dataset)
            #Save model!
            if ppl < valid_ppl:
                log_msg(f'Epoch: {epoch_num} Step: {step_num} ppl improved: {ppl: 0.4f}')   
                save_path = root.save(checkpoint_prefix)
#                 print(f'Epoch: {epoch_num} Step: {step_num} ppl improved: {ppl} old: {valid_ppl} Model saved: {save_path}')
                valid_ppl = ppl
            else:
                print(f'Epoch: {epoch_num} Step: {step_num} ppl worse: {ppl} old: {valid_ppl}')
        
        
        
        opt.apply_gradients(clip_gradients(gradients, 5.0), global_step=tf.train.get_or_create_global_step())
    
    print(f'Epoch{epoch_num} Done!')

Start :Valid ppl: 1.2944151163101196
Epoch: 0 Step: 50 Avg Loss: 1.0651798248291016
INFO:tensorflow:Sun Oct 14 17:18:46 2018: Epoch: 0 Step: 50 ppl improved:  1.2729
Epoch: 0 Step: 100 Avg Loss: 0.9480184316635132
INFO:tensorflow:Sun Oct 14 17:22:29 2018: Epoch: 0 Step: 100 ppl improved:  1.2712
Epoch: 0 Step: 150 Avg Loss: 0.9491152763366699
INFO:tensorflow:Sun Oct 14 17:26:16 2018: Epoch: 0 Step: 150 ppl improved:  1.2525
Epoch: 0 Step: 200 Avg Loss: 0.8812267780303955
INFO:tensorflow:Sun Oct 14 17:30:16 2018: Epoch: 0 Step: 200 ppl improved:  1.2489
Epoch: 0 Step: 250 Avg Loss: 0.8993968367576599
INFO:tensorflow:Sun Oct 14 17:34:23 2018: Epoch: 0 Step: 250 ppl improved:  1.2482
Epoch: 0 Step: 300 Avg Loss: 0.8707708120346069
INFO:tensorflow:Sun Oct 14 17:38:29 2018: Epoch: 0 Step: 300 ppl improved:  1.2473
Epoch: 0 Step: 350 Avg Loss: 0.8958591818809509
INFO:tensorflow:Sun Oct 14 17:43:02 2018: Epoch: 0 Step: 350 ppl improved:  1.2438
Epoch: 0 Step: 400 Avg Loss: 0.8020581007003784
