In [19]:
import tensorflow as tf
import numpy as np
import math
from data_utils import build_vocab, word_to_ids

In [17]:
# Each split has a source file (first argument to word_to_ids) and an id file
# (second argument to word_to_ids, later fed to the model's file-name
# placeholders).

# Training split
FILE_TRAIN = 'sentences.train'
INPUT_TRAIN = 'train.ids'

# Validation split
FILE_EVAL = 'sentences.eval'
INPUT_EVAL = 'eval.ids'

# Test split
FILE_TEST = 'sentences_test.txt'
INPUT_TEST = 'test.ids'

In [None]:
# Vocabulary lookup returned by data_utils.build_vocab.
word_to_id_dict = build_vocab()

# Run word_to_ids once per split, in train / eval / test order.
# NOTE(review): presumably this writes the id-encoded sentences of the first
# path to the second path - confirm against data_utils.
for text_path, ids_path in (
    (FILE_TRAIN, INPUT_TRAIN),
    (FILE_EVAL, INPUT_EVAL),
    (FILE_TEST, INPUT_TEST),
):
    word_to_ids(text_path, ids_path)

In [14]:
class ModelA():
    """Word-level LSTM language model built as a TensorFlow 1.x graph.

    Constructing an instance adds the complete graph (input pipeline,
    embeddings, LSTM cell, loss and optimizer) to the current default graph.

    Attributes used by callers:
        file_name_train, file_name_validation, file_name_test: string
            placeholders holding the path of the id file for each split.
        training_init_op, validation_init_op, test_init_op: ops that point
            the shared iterator at the corresponding data set.
        input_batch, output_batch: int32 id tensors produced by the iterator.
        nr_words: number of elements in ``input_batch``.
        loss: per-token cross-entropy, flattened to one dimension.
        updates: one Adam step using gradients clipped by global norm.
        global_step: non-trainable counter incremented by ``updates``.
    """

    def __init__(self,
                 batch_size,
                 vocab_size,
                 embed_size,
                 hidden_units,
                 num_epochs,
                 num_steps=29):
        """Build the graph.

        Args:
            batch_size: number of lines per batch for training/validation
                (the test data set is always batched with size 1).
            vocab_size: number of rows of both embedding matrices.
            embed_size: width of the input embedding.
            hidden_units: number of LSTM units (and width of the output
                embedding).
            num_epochs: stored on the instance for the training loop.
            num_steps: number of time steps the LSTM is unrolled for; every
                input sequence must contain at least this many ids.
        """
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_units = hidden_units
        self.num_epochs = num_epochs
        self.num_steps = num_steps

        self.initializer = tf.contrib.layers.xavier_initializer()
        self.max_gradient_norm = 5
        self.global_step = tf.Variable(0, trainable=False)

        with tf.name_scope('inputs'):
            self.file_name_train = tf.placeholder(tf.string)
            self.file_name_validation = tf.placeholder(tf.string)
            self.file_name_test = tf.placeholder(tf.string)

            training_dataset = tf.data.TextLineDataset(self.file_name_train).map(self._parse_line).batch(batch_size)
            validation_dataset = tf.data.TextLineDataset(self.file_name_validation).map(self._parse_line).batch(batch_size)
            test_dataset = tf.data.TextLineDataset(self.file_name_test).map(self._parse_line).batch(1)

            # One re-initializable iterator shared by all three data sets, so
            # the same loss tensor can be evaluated on any split.
            iterator = tf.data.Iterator.from_structure(training_dataset.output_types, training_dataset.output_shapes)
            self.input_batch, self.output_batch = iterator.get_next()

            self.training_init_op = iterator.make_initializer(training_dataset)
            self.validation_init_op = iterator.make_initializer(validation_dataset)
            self.test_init_op = iterator.make_initializer(test_dataset)

            self.nr_words = tf.reduce_prod(tf.shape(self.input_batch))

        with tf.name_scope("embeddings"):
            self.input_embedding_mat = tf.get_variable('input_embedding_mat', shape=(vocab_size, embed_size),
                                                dtype=tf.float32, initializer=self.initializer, trainable=True)
            self.output_embedding_mat = tf.get_variable('output_embedding_mat', shape=(vocab_size, hidden_units),
                                                 dtype=tf.float32, initializer=self.initializer, trainable=True)

        with tf.name_scope('rnn'):
            self.LSTM = tf.nn.rnn_cell.LSTMCell(self.hidden_units, initializer=self.initializer, trainable=True)

        with tf.name_scope('loss'):
            self.loss = self._forward_pass(self.input_batch, self.output_batch)

        with tf.name_scope('optimization'):
            params = tf.trainable_variables()
            opt = tf.train.AdamOptimizer()
            gradients = tf.gradients(self.loss, params, colocate_gradients_with_ops=True)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
            self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

    @staticmethod
    def _parse_line(line):
        """Turn one text line of space-separated integer ids into (input, target).

        The input sequence is every id but the last; the target sequence is
        every id but the first, i.e. the input shifted left by one position.
        """
        ids = tf.string_to_number(tf.string_split([line]).values, out_type=tf.int32)
        return ids[:-1], ids[1:]

    def _forward_pass(self, input_batch, output_batch):
        """Unroll the LSTM over ``input_batch`` and return per-token losses.

        Args:
            input_batch: int32 ids, indexed as (batch, time).
            output_batch: int32 target ids with the same layout.

        Returns:
            1-D float tensor with one cross-entropy value per
            (sequence, time step) pair, in batch-major order.
        """
        # (batch, time, embed_size)
        input_embedded = tf.nn.embedding_lookup(self.input_embedding_mat, input_batch)

        # Use the run-time batch size: the last batch of a file and the test
        # batches (size 1) are smaller than self.batch_size.
        batch_size = tf.shape(input_batch)[0]
        with tf.name_scope('rnn'):
            state = self.LSTM.zero_state(batch_size=batch_size, dtype=tf.float32)
            step_outputs = []
            for step in range(self.num_steps):
                step_input = tf.reshape(input_embedded[:, step, :], [-1, self.embed_size])
                output, state = self.LSTM(step_input, state=state)
                step_outputs.append(output)

            # (batch, num_steps, hidden_units)
            preds = tf.stack(step_outputs, axis=1)

            # Project every hidden state onto the vocabulary with a single
            # matmul: (batch * num_steps, hidden_units) x (hidden_units, vocab_size).
            flat_preds = tf.reshape(preds, [-1, self.hidden_units])
            logits = tf.matmul(flat_preds, self.output_embedding_mat, transpose_b=True)
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(output_batch, [-1]), logits=logits)

        return loss

            
              

In [20]:
tf.reset_default_graph()

# Debug knob: stop every training pass after this many batches so the whole
# cell finishes quickly. Set to None to train on the complete training file.
MAX_TRAIN_BATCHES = 20
# Print the running training perplexity every this many optimizer steps.
LOG_EVERY_N_STEPS = 10

model = ModelA(batch_size=64, vocab_size=20000, hidden_units=512, embed_size=100, num_epochs=1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # The epoch counter advances once per pass, no matter how the training
    # loop ended (end of data or debug cap), so the loop always terminates
    # and training/validation messages report the same epoch number.
    for epoch in range(model.num_epochs):

        # training step
        sess.run(model.training_init_op, {model.file_name_train: INPUT_TRAIN})
        train_loss = 0.0
        train_words = 0
        batches_done = 0
        while MAX_TRAIN_BATCHES is None or batches_done < MAX_TRAIN_BATCHES:
            try:
                _loss, _words, _ = sess.run([model.loss, model.nr_words, model.updates])
            except tf.errors.OutOfRangeError:
                # The end of one epoch
                break
            batches_done += 1
            train_loss += np.sum(_loss)
            train_words += _words

            # Read the step counter in its own run call so the value is the
            # one after the update, not an arbitrary before/after mix.
            _global_step = sess.run(model.global_step)
            if _global_step % LOG_EVERY_N_STEPS == 0:
                train_ppl = math.exp(train_loss / train_words)
                print("Batch: {}. Epoch: {} Training perplexity: {}. ".format(_global_step, epoch, train_ppl))
                train_loss = 0.0
                train_words = 0

        # validation step
        sess.run(model.validation_init_op, {model.file_name_validation: INPUT_EVAL})
        eval_loss = 0.0
        eval_words = 0
        while True:
            try:
                _eval_loss, _eval_words = sess.run([model.loss, model.nr_words])
            except tf.errors.OutOfRangeError:
                break
            eval_loss += np.sum(_eval_loss)
            eval_words += _eval_words

        if eval_words > 0:
            eval_ppl = math.exp(eval_loss / eval_words)
            print("Epoch: {} Validation perplexity: {}. ".format(epoch, eval_ppl))
        else:
            # An empty validation file would otherwise divide by zero.
            print("Epoch: {} Validation set is empty, perplexity not computed. ".format(epoch))

    # in the end calculate perplexity for each sentence in test set
    sess.run(model.test_init_op, {model.file_name_test: INPUT_TEST})
    with open('results.txt', 'w') as results_file:
        while True:
            try:
                test_loss = sess.run(model.loss)
            except tf.errors.OutOfRangeError:
                break
            # The test set is batched with size 1, so test_loss holds the
            # per-token losses of exactly one sentence; average over them.
            test_ppl = math.exp(np.sum(test_loss) / test_loss.size)
            results_file.write(str(test_ppl) + '\n')

Batch: 2. Epoch: 0 Training perplexity: 19923.171593672752. 


KeyboardInterrupt: 