In [1]:
import tensorflow as tf
import numpy as np
import math
import json

from data_utils import build_vocab, word_to_ids, count_trainable_parameters
from load_embedding import load_embedding

In [None]:
import os
from google.colab import drive

# Mount Google Drive and make the project folder the working directory so the
# relative data file names used below resolve.
drive.mount('/content/gdrive/')
# Use the absolute path under the mount point. The previous relative path
# ("./gdrive/...") only resolved while the working directory was still the
# parent of the mount point, so re-running this cell raised FileNotFoundError.
os.chdir("/content/gdrive/My Drive/NLU/project1/")

In [None]:
# from tensorboardcolab import *
# tbc = TensorBoardColab()

In [34]:
# Raw text corpora (one sentence per line).
FILE_TRAIN = "sentences.train"
FILE_EVAL = "sentences.eval"
FILE_TEST = "sentences_test.txt"

# Pretrained word vectors loaded into the input embedding for experiment B.
EMBEDDING_FILE = "wordembeddings-dim100.word2vec"

# Id-encoded versions of the corpora that the model's input pipeline reads.
INPUT_TRAIN = "train.ids"
INPUT_EVAL = "eval.ids"
INPUT_TEST = "test.ids"

In [35]:
# The vocabulary is read from a previously saved JSON file; the commented-out
# call below is the alternative that rebuilds it from scratch.
#word_to_id_dict = build_vocab()
# Use a context manager so the file handle is closed deterministically
# (json.load(open(...)) left it open until garbage collection).
with open("dictionary.txt") as vocab_file:
    word_to_id_dict = json.load(vocab_file)

# One-off preprocessing that converts the raw text files into the *.ids files.
#word_to_ids(FILE_TRAIN, INPUT_TRAIN, word_to_id_dict)
#word_to_ids(FILE_EVAL, INPUT_EVAL, word_to_id_dict)
#word_to_ids(FILE_TEST, INPUT_TEST, word_to_id_dict)

In [36]:
# to account for indices starting with 0
# The shift mutates the dictionary in place, so it must run at most once:
# only subtract while the smallest id is still positive. Re-running the cell
# is then a no-op instead of shifting every id by another -1.
# NOTE(review): assumes the ids stored in dictionary.txt start at 1 - confirm.
if word_to_id_dict and min(word_to_id_dict.values()) > 0:
    for word in word_to_id_dict:
        word_to_id_dict[word] -= 1

In [259]:
class Model():
    """LSTM language model (TensorFlow 1.x graph mode) with three variants.

    The ``experiment`` argument selects the variant:

    * ``"A"``: input embeddings are trainable.
    * ``"B"`` (or any value other than "A"/"C"): input embeddings are created
      with ``trainable=False``; the caller is expected to fill them (the
      notebook does so with ``load_embedding``).
    * ``"C"``: like "A", plus a linear projection from ``hidden_units`` to
      ``hidden_units / 2`` in front of the output layer.

    Constructing the object builds the whole graph: a re-initialisable input
    pipeline, the per-token loss and masked perplexity, the Adam update op and
    a greedy sentence-generation op.

    Public tensors / ops:
        file_name_train, file_name_validation, file_name_test: string
            placeholders fed when running the matching ``*_init_op``.
        training_init_op, validation_init_op, test_init_op: point the shared
            iterator at one dataset. Each dataset is a single pass over its
            file; run the init op again to start another epoch.
        input_batch, output_batch: token ids and the same ids shifted by one.
        loss: unmasked cross-entropy per token, flattened to 1-D.
        perplexity: exp of the mean loss over non-padding targets.
        nr_words: number of elements in ``input_batch`` (padding included).
        updates: one clipped Adam step on ``loss``.
        preds: for a batch holding one sentence, the prompt followed by the
            greedily generated continuation.
    """

    # Number of time steps the training graph is unrolled for. Every example
    # must therefore provide exactly this many input/target tokens.
    SEQ_LEN = 29
    # Upper bound on prompt + generated tokens produced by sentence generation.
    MAX_GEN_LEN = 20

    def __init__(self,
                 batch_size,
                 vocab_size,
                 embed_size,
                 hidden_units,
                 num_epochs,
                 experiment):
        """Build the graph.

        Args:
            batch_size: examples per batch for the training/validation data.
            vocab_size: number of rows of the embedding and output matrices.
            embed_size: width of the input embeddings.
            hidden_units: LSTM state size.
            num_epochs: stored on the instance for the training loop; epochs
                are driven by the caller re-running ``training_init_op``.
            experiment: "A", "B" or "C" (see class docstring).
        """
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_units = hidden_units
        self.num_epochs = num_epochs

        self.initializer = tf.contrib.layers.xavier_initializer()
        self.max_gradient_norm = 5
        self.global_step = tf.Variable(0, trainable=False)

        self.experiment = experiment

        self.pad_index = 2
        self.eos_index = 1

        with tf.name_scope('inputs'):
            self.file_name_train = tf.placeholder(tf.string)
            self.file_name_validation = tf.placeholder(tf.string)
            self.file_name_test = tf.placeholder(tf.string)

            # No .repeat() here: the training loops re-initialise the iterator
            # once per epoch, so repeating the data inside the dataset as well
            # made every "epoch" run num_epochs passes (and re-scored the
            # validation set num_epochs times).
            training_dataset = (tf.data.TextLineDataset(self.file_name_train)
                                .map(self._parse_line)
                                .batch(self.batch_size))
            validation_dataset = (tf.data.TextLineDataset(self.file_name_validation)
                                  .map(self._parse_line)
                                  .batch(self.batch_size))
            # The test set is scored / continued one sentence at a time.
            test_dataset = (tf.data.TextLineDataset(self.file_name_test)
                            .map(self._parse_line)
                            .batch(1))

            # One iterator shared by all three datasets; the init ops switch
            # which dataset feeds input_batch / output_batch.
            iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
                                                       training_dataset.output_shapes)
            self.input_batch, self.output_batch = iterator.get_next()

            self.training_init_op = iterator.make_initializer(training_dataset)
            self.validation_init_op = iterator.make_initializer(validation_dataset)
            self.test_init_op = iterator.make_initializer(test_dataset)

            self.nr_words = tf.reduce_prod(tf.shape(self.input_batch))

        with tf.name_scope('rnn'):
            self.LSTM = tf.nn.rnn_cell.LSTMCell(self.hidden_units, initializer=self.initializer, trainable=True)

        with tf.name_scope("embeddings"):
            # Only experiments A and C learn the input embeddings.
            train_input_embeddings = self.experiment == "A" or self.experiment == "C"
            self.input_embedding_mat = tf.get_variable('input_embedding_mat', shape=(vocab_size, embed_size),
                                                       dtype=tf.float32, initializer=self.initializer,
                                                       trainable=train_input_embeddings)

            if self.experiment == "C":
                projected_units = int(hidden_units / 2)
                self.projection_mat = tf.get_variable("projection_mat", shape=(projected_units, hidden_units),
                                                      dtype=tf.float32, initializer=self.initializer, trainable=True)
                output_units = projected_units
            else:
                output_units = hidden_units

            self.output_embedding_mat = tf.get_variable('output_embedding_mat', shape=(vocab_size, output_units),
                                                        dtype=tf.float32, initializer=self.initializer, trainable=True)

        with tf.name_scope('loss'):
            self.loss, self.perplexity = self._forward_pass(self.input_batch, self.output_batch)

        with tf.name_scope('optimization'):
            params = tf.trainable_variables()
            opt = tf.train.AdamOptimizer()
            gradients = tf.gradients(self.loss, params, colocate_gradients_with_ops=True)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
            self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

        with tf.name_scope('sentence_generation'):
            self.preds = self._generate_sentence(self.input_batch)

    @staticmethod
    def _parse_line(line):
        """Split a line of whitespace-separated ids into (input, target).

        The target sequence is the input sequence shifted left by one token.
        """
        ids = tf.string_to_number(tf.string_split([line]).values, out_type=tf.int32)
        return ids[:-1], ids[1:]

    @staticmethod
    def _first_index_of(values, target, default):
        """Index (int32 scalar) of the first `target` in 1-D `values`.

        Returns `default` (an int32 scalar tensor) when `target` is absent.
        The reduction is wrapped in tf.cond so it is never evaluated on an
        empty set of matches.
        """
        hits = tf.where(tf.equal(values, target))
        return tf.cond(tf.greater(tf.size(hits), 0),
                       true_fn=lambda: tf.cast(tf.reduce_min(hits), tf.int32),
                       false_fn=lambda: tf.identity(default))

    def _to_logits(self, rnn_output):
        """Map LSTM outputs of shape (N, hidden_units) to (N, vocab_size) logits.

        Experiment C first applies the down-projection.
        """
        if self.experiment == "C":
            rnn_output = tf.matmul(rnn_output, self.projection_mat, transpose_b=True)
        return tf.matmul(rnn_output, self.output_embedding_mat, transpose_b=True)

    def _forward_pass(self, input_batch, output_batch):
        """Unroll the LSTM over SEQ_LEN steps and score the targets.

        Args:
            input_batch: int ids, (batch, SEQ_LEN).
            output_batch: int ids, (batch, SEQ_LEN); targets for each step.

        Returns:
            loss: 1-D tensor with the cross-entropy of every (example, step)
                pair, padding included.
            perplexity: scalar, exp of the mean loss over non-padding targets.
        """
        input_embedded = tf.nn.embedding_lookup(self.input_embedding_mat, input_batch)

        batch_size = tf.shape(input_batch)[0]
        with tf.name_scope('rnn'):
            state = self.LSTM.zero_state(batch_size=batch_size, dtype=tf.float32)
            outputs = []
            for step in range(self.SEQ_LEN):
                step_input = tf.reshape(input_embedded[:, step, :], [batch_size, self.embed_size])
                output, state = self.LSTM(step_input, state=state)
                outputs.append(output)

            # (batch, SEQ_LEN, hidden_units) -> (batch * SEQ_LEN, hidden_units):
            # one matmul over all positions instead of a per-example map_fn.
            outputs = tf.reshape(tf.stack(outputs, axis=1), [-1, self.hidden_units])
            logits = self._to_logits(outputs)
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(output_batch, [-1]),
                                                                  logits=logits)

            # loss[t] scores the *target* at step t, so padding is masked on
            # the targets. Masking on the inputs counted the step that
            # predicts the first <pad> after <eos>.
            mask = 1.0 - tf.cast(tf.equal(output_batch, self.pad_index), dtype=tf.float32)
            negative_loglikelihood = tf.reduce_sum(tf.multiply(loss, tf.reshape(mask, [-1])))
            nr_words = tf.reduce_sum(mask)
            perplexity = tf.math.exp(negative_loglikelihood / nr_words)

        return loss, perplexity

    def _generate_sentence(self, input_batch):
        """Greedily continue a single sentence.

        The prompt is everything in `input_batch` before its first <eos>
        (the whole input when there is none). The LSTM is first fed the
        prompt and then its own arg-max predictions.

        Args:
            input_batch: int32 ids holding exactly one sentence.

        Returns:
            1-D int32 tensor: the prompt followed by the generated tokens.
            The continuation is cut after the first generated <eos> (which is
            kept) and the total length is capped at MAX_GEN_LEN; prompts that
            are already at least that long are returned unchanged.
        """
        tokens = tf.reshape(input_batch, [-1])
        prompt_len = self._first_index_of(tokens, self.eos_index, default=tf.size(tokens))
        prompt = tokens[:prompt_len]
        prompt_embedded = tf.nn.embedding_lookup(self.input_embedding_mat, tf.reshape(prompt, [1, -1]))

        with tf.name_scope('rnn'):
            state = self.LSTM.zero_state(batch_size=1, dtype=tf.float32)
            # Placeholder id so the false branch of the first tf.cond has an
            # input; it is only actually used when the prompt is empty.
            last_predicted = tf.constant([[1]], dtype=tf.int32)
            preds = []
            # Step i consumes the token at position i and predicts position
            # i + 1, so MAX_GEN_LEN - 1 steps fill positions 1..MAX_GEN_LEN-1.
            for step in range(self.MAX_GEN_LEN - 1):
                # The lambdas are invoked immediately by tf.cond, so they see
                # the current `step` / `last_predicted`.
                step_input = tf.cond(
                    tf.less(step, prompt_len),
                    true_fn=lambda: tf.reshape(prompt_embedded[:, step, :], [1, self.embed_size]),
                    false_fn=lambda: tf.reshape(
                        tf.nn.embedding_lookup(self.input_embedding_mat, last_predicted),
                        [1, self.embed_size]))
                output, state = self.LSTM(step_input, state=state)
                word_id = tf.argmax(self._to_logits(output), axis=1, output_type=tf.int32)[0]
                last_predicted = tf.reshape(word_id, [1, 1])
                preds.append(word_id)
            preds = tf.stack(preds)

            # preds[prompt_len - 1] is the first token after the prompt; the
            # slice is empty when the prompt already fills the budget.
            generated = preds[tf.maximum(prompt_len - 1, 0):]
            # Stopping at <eos> has to happen in the graph: a Python `if` on
            # the predicted id cannot see run-time values.
            first_eos = self._first_index_of(generated, self.eos_index, default=tf.size(generated))
            generated = generated[:first_eos + 1]

            return tf.concat([prompt, generated], axis=0)

In [256]:
# Experiment A: the model creates trainable input embeddings.
EXPERIMENT = "A"

# Model dimensions.
VOCAB_SIZE = 20000
EMBED_SIZE = 100
HIDDEN_UNITS = 512

# Training schedule.
BATCH_SIZE = 64
NUM_EPOCHS = 1

In [228]:
# Experiment B: the model creates non-trainable input embeddings, which the
# training cell fills from EMBEDDING_FILE via load_embedding.
EXPERIMENT = "B"

# Model dimensions.
VOCAB_SIZE = 20000
EMBED_SIZE = 100
HIDDEN_UNITS = 512

# Training schedule.
BATCH_SIZE = 64
NUM_EPOCHS = 1

In [230]:
# Experiment C: larger LSTM (1024 units) whose output the model projects down
# to half that width before the output layer.
EXPERIMENT = "C"

# Model dimensions.
VOCAB_SIZE = 20000
EMBED_SIZE = 100
HIDDEN_UNITS = 1024

# Training schedule.
BATCH_SIZE = 64
NUM_EPOCHS = 1

In [32]:
#masked perplexity version
# Trains the model, reports validation loss/perplexity after each epoch, saves
# a checkpoint and writes one perplexity per test sentence to
# results<EXPERIMENT>.txt, using the perplexity tensor computed in the graph.

# Debugging cap: once the global step exceeds this value, training stops after
# the current epoch's validation pass. Set to None to train for all of
# model.num_epochs.
MAX_TRAIN_BATCHES = 1000
# Print a progress line every this many optimizer steps.
LOG_EVERY_N_BATCHES = 10

tf.reset_default_graph()

model = Model(batch_size=BATCH_SIZE, vocab_size=VOCAB_SIZE, hidden_units=HIDDEN_UNITS,
              embed_size=EMBED_SIZE, num_epochs=NUM_EPOCHS, experiment=EXPERIMENT)

print("Number of trainable parameters: {}".format(count_trainable_parameters()))

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    if model.experiment == "B":
        load_embedding(sess, word_to_id_dict, model.input_embedding_mat, EMBEDDING_FILE, model.embed_size, model.vocab_size)

    # A for-loop keeps the epoch counter correct even when an epoch is cut
    # short (the previous while-loop only advanced it on OutOfRangeError and
    # needed an unconditional `break` to terminate).
    for epoch in range(model.num_epochs):

        #training step
        sess.run(model.training_init_op, {model.file_name_train: INPUT_TRAIN})
        cap_reached = False
        while True:
            try:
                _loss, _ppl, _global_step, _ = sess.run([model.loss, model.perplexity, model.global_step, model.updates])
            except tf.errors.OutOfRangeError:
                # The end of one epoch
                break

            if _global_step % LOG_EVERY_N_BATCHES == 0:
                print("Batch: {}. Epoch: {} Perplexity: {} Loss: {}. ".format(_global_step, epoch, _ppl, np.sum(_loss)))
            if MAX_TRAIN_BATCHES is not None and _global_step > MAX_TRAIN_BATCHES:
                cap_reached = True
                break

        #validation step
        sess.run(model.validation_init_op, {model.file_name_validation: INPUT_EVAL})
        eval_loss = []
        eval_ppl = []
        while True:
            try:
                _eval_loss, _eval_ppl = sess.run([model.loss, model.perplexity])
            except tf.errors.OutOfRangeError:
                break
            eval_loss.append(np.sum(_eval_loss))
            eval_ppl.append(_eval_ppl)
        print("Epoch: {} Avg eval loss per batch: {}. Avg eval ppl per batch: {} ".format(epoch, \
                                                                          np.mean(eval_loss), np.mean(eval_ppl)))

        if cap_reached:
            break

    #after training is done, save the model
    save_path = saver.save(sess, "model" + EXPERIMENT + ".ckpt")
    print("Model saved in path: %s" % save_path)

    #in the end calculate perplexity for each sentence in test set
    # (the test dataset is batched one sentence at a time)
    sess.run(model.test_init_op, {model.file_name_test: INPUT_TEST})
    with open ('results' + EXPERIMENT + '.txt', 'w') as file:
        while True:
            try:
                test_ppl = sess.run(model.perplexity)
            except tf.errors.OutOfRangeError:
                break
            file.write(str(test_ppl) + '\n')

Number of trainable parameters: 13495424


KeyboardInterrupt: 

In [260]:
# sentence generation
# Rebuilds the graph, restores the weights saved by the training cell and
# writes one generated sentence per test line to sentence_generation.txt.

# id -> word lookup for turning predicted ids back into text
inverse_vocab = {v:k for k,v in word_to_id_dict.items()}

tf.reset_default_graph()

model = Model(batch_size=BATCH_SIZE, vocab_size=VOCAB_SIZE, hidden_units=HIDDEN_UNITS,
              embed_size=EMBED_SIZE, num_epochs=NUM_EPOCHS, experiment=EXPERIMENT)

saver = tf.train.Saver()
# Same file the training cell writes with saver.save(); the "./" prefix makes
# the path explicitly relative to the working directory.
checkpoint_path = "./model" + EXPERIMENT + ".ckpt"

with tf.Session() as sess:
    # Restore the trained variables instead of initialising fresh ones:
    # generating from randomly initialised weights produces meaningless text.
    # The restore assigns every variable of the graph (including the input
    # embeddings), so no separate initializer / load_embedding call is needed.
    saver.restore(sess, checkpoint_path)

    sess.run(model.test_init_op, {model.file_name_test: INPUT_TEST})

    with open ('sentence_generation' + '.txt', 'w') as file:
        while True:
            try:
                sentence = sess.run(model.preds)
            except tf.errors.OutOfRangeError:
                break
            sentence = [inverse_vocab[i] for i in sentence][1:] #skip the <bos> tag in the output file
            file.write(' '.join(sentence) + '\n')

KeyboardInterrupt: 

In [None]:
#old version
# Earlier variant of the training cell: perplexity is computed on the Python
# side from the summed per-token loss divided by model.nr_words, i.e. over
# every position of the batch, padding included.

# Debugging cap: once the global step exceeds this value, training stops after
# the current epoch's validation pass. Set to None to train for all of
# model.num_epochs.
MAX_TRAIN_BATCHES = 1000
# Report (and reset) the running training perplexity every this many steps.
LOG_EVERY_N_BATCHES = 10

tf.reset_default_graph()

model = Model(batch_size=BATCH_SIZE, vocab_size=VOCAB_SIZE, hidden_units=HIDDEN_UNITS,
              embed_size=EMBED_SIZE, num_epochs=NUM_EPOCHS, experiment=EXPERIMENT)

print("Number of trainable parameters: {}".format(count_trainable_parameters()))

saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    if model.experiment == "B":
        load_embedding(sess, word_to_id_dict, model.input_embedding_mat, EMBEDDING_FILE, model.embed_size, model.vocab_size)

    # A for-loop keeps the epoch counter correct even when an epoch is cut
    # short (the previous while-loop only advanced it on OutOfRangeError and
    # needed an unconditional `break` to terminate).
    for epoch in range(model.num_epochs):

        #training step
        sess.run(model.training_init_op, {model.file_name_train: INPUT_TRAIN})
        train_loss = 0.0
        train_words = 0
        cap_reached = False
        while True:
            try:
                _loss, _words, _global_step, _ = sess.run([model.loss, model.nr_words, model.global_step, model.updates])
            except tf.errors.OutOfRangeError:
                # The end of one epoch
                break

            train_loss += np.sum(_loss)
            train_words += _words

            if _global_step % LOG_EVERY_N_BATCHES == 0:
                # perplexity over the batches seen since the last report
                train_ppl = math.exp(train_loss / train_words)
                print("Batch: {}. Epoch: {} Training perplexity: {}. ".format(_global_step, epoch, train_ppl))
                train_loss = 0.0
                train_words = 0
            if MAX_TRAIN_BATCHES is not None and _global_step > MAX_TRAIN_BATCHES:
                cap_reached = True
                break

        #validation step
        sess.run(model.validation_init_op, {model.file_name_validation: INPUT_EVAL})
        eval_loss = 0.0
        eval_words = 0
        while True:
            try:
                _eval_loss, _eval_words = sess.run([model.loss, model.nr_words])
            except tf.errors.OutOfRangeError:
                break
            eval_loss += np.sum(_eval_loss)
            eval_words += _eval_words

        # An empty validation file would otherwise divide by zero.
        eval_ppl = math.exp(eval_loss / eval_words) if eval_words else float('nan')
        print("Epoch: {} Validation perplexity: {}. ".format(epoch, eval_ppl))

        if cap_reached:
            break

    #after training is done, save the model
    save_path = saver.save(sess, "model" + EXPERIMENT + ".ckpt")
    print("Model saved in path: %s" % save_path)

    #in the end calculate perplexity for each sentence in test set
    sess.run(model.test_init_op, {model.file_name_test: INPUT_TEST})
    with open ('results' + EXPERIMENT + '.txt', 'w') as file:
        while True:
            try:
                test_loss = sess.run(model.loss)
            except tf.errors.OutOfRangeError:
                break
            # The test set is batched one sentence at a time, so the loss
            # vector holds one entry per time step; divide by its length
            # rather than a hard-coded sequence length.
            test_ppl = math.exp(np.sum(test_loss) / np.size(test_loss))
            file.write(str(test_ppl) + '\n')