In [1]:
import numpy as np
import random
from random import shuffle
import os
import gensim
import re
import pickle
import tensorflow as tf
import operator
import math
import sys
from copy import deepcopy
from collections import Counter
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.python.framework import ops
from tensorflow.models.rnn.translate import seq2seq_model
import string

In [2]:
class BabiDataset:
    def __init__(self, data_dir, task_id, model_type, max_vocab_size=None):
        self.task_id = task_id
        self.data_dir = data_dir
        self.model_type = model_type
        self.max_vocab_size = max_vocab_size
        self.vocab = set()
        self.word_counter = dict()
        self.raw_train_data = None
        self.raw_test_data = None
        self.num_tokens = None
        
        self.__load_data()
        self.word2id_dict, self.id2word_dict = self.__create_word2id_dict()
        self.id2vector_dict = self.__create_id2vector_dict()
                
        (self.train_input_raw, self.train_input_tokens, 
         self.train_labels_raw, self.train_labels_tokens, self.train_sentence_counts)  = self.__tokenize_sentences(self.raw_train_data)
        
        (self.test_input_raw, self.test_input_tokens, 
         self.test_labels_raw, self.test_labels_tokens, self.test_sentence_counts)  = self.__tokenize_sentences(self.raw_test_data)
        
        self.max_context_len = None
        self.max_question_len = None
        self.max_answer_len = None
 
        self.pad_sequences()
        
        #self.train_C_vectors = self.__vectorize_sentences(list(zip(*self.train_input_tokens))[0])
        #self.train_Q_vectors = self.__vectorize_sentences(list(zip(*self.train_input_tokens))[1])
        #self.train_input_vectors = list(zip(self.train_C_vectors, self.train_Q_vectors))
        #self.train_labels_vectors = self.__vectorize_sentences(self.train_labels_tokens)
        
        #self.test_C_vectors = self.__vectorize_sentences(list(zip(*self.test_input_tokens))[0])
        #self.test_Q_vectors = self.__vectorize_sentences(list(zip(*self.test_input_tokens))[1])
        #self.test_input_vectors = list(zip(self.train_C_vectors, self.test_Q_vectors))
        #self.test_labels_vectors = self.__vectorize_sentences(self.test_labels_tokens)
        

    def __update_word_counter(self, sequence):
        """ Update word_counter with counts for words in a sentence
        
        Args:
            sequence (list<str>) : list of words in a sequence
        
        """
        for word in sequence.split():
            self.word_counter[word] = self.word_counter.get(word, 0) + 1
            
    def __create_vocab(self):
        """ Create set of most frequent unique words found in the training data """
        
        if self.max_vocab_size == None:
            self.vocab = set(self.word_counter.keys())
        else:
            self.vocab = set(sorted(self.word_counter, key=self.word_counter.get, reverse=True)[:self.max_vocab_size])
        
    def __clean_words(self, line):
        punctuation = [x for x in list(string.punctuation)]
        space_punct = [' {0}'.format(elem) for elem in punctuation]
        replace_punctuation = str.maketrans(dict(zip(punctuation, space_punct)))
        line = line.translate(replace_punctuation)
        line = line.strip()
        line = line.replace('.', ' . ')
        line = line.replace(',', ' , ')
        line = line[line.find(' ') + 1:]
        line = line.lower()
        return line
    
    def __parse_babi_file(self, txt_file):
        with open(txt_file) as babi_file:
            raw_data = []
            curr_sample = None
            for i, line in enumerate(open(txt_file)):
                id = int(line[0:line.find(' ')])
                if id == 1:
                    skip = False
                    curr_sample = {"C": [], "Q": "", "A": ""}

                line = self.__clean_words(line)
                
                self.__update_word_counter(line)
                if line.find('?') == -1:
                    curr_sample["C"].append(line)
                else:
                    idx = line.find('?')
                    tmp = line[idx + 1:].split('\t')
                    curr_sample["Q"] = line[:idx]
                    if self.task_id==19:
                        #curr_sample["A"] = tmp[1].strip().split(" , ")
                        dirs = tmp[1].strip().split("  , ")
                        directions = {'n': 'north', 'e': 'east', 's': 'south', 'w': 'west'}
                        newdirs = [directions[d] for d in dirs]
                        curr_sample["A"] = " ".join(newdirs)
                            
                    else:
                        curr_sample["A"] = tmp[1].strip()
                    raw_data.append(deepcopy(curr_sample))

            self.__create_vocab()
            print("Loaded {} data samples from {}".format(len(raw_data), txt_file.split(self.data_dir)[1]))
            return raw_data

    def __load_data(self):
        files = [os.path.join(self.data_dir, f) for f in os.listdir(self.data_dir)]
        s = 'qa{}_'.format(self.task_id)
        task_files = [f for f in files if s in f]
        train_file = [f for f in task_files if 'train' in f][0]
        test_file = [f for f in task_files if 'test' in f][0] 
        
        self.raw_train_data = self.__parse_babi_file(train_file)
        self.raw_test_data = self.__parse_babi_file(test_file)
        
#    def __create_unique_word_corpus(self, raw_data_dict):
#        all_words = []
#        for x in raw_data_dict:
#            for fact in x["C"]:
#                for word in fact.lower().split(' '):
#                    if len(word) > 0:
#                         all_words.append(word)

#             for word in x["Q"].lower().split(' '):
#                 if len(word) > 0:
#                     all_words.append(word)
#             for word in x["A"].lower().split(' '):
#                 if len(word) > 0:
#                     all_words.append(word)
#         word_corpus = set(all_words)
#         print "{} unique words found".format(len(word_corpus))
#         return word_corpus
    
    def __create_word2id_dict(self):
        word2id_dict = dict()
        #self.vocab = self.__create_unique_word_corpus(self.raw_train_data)
        
        if self.model_type == 'LSTM':
            word2id_dict['PAD'] = 0
            word2id_dict['UNK'] = 1
        elif self.model_type == 'seq2seq':
            word2id_dict['PAD'] = 0
            word2id_dict['UNK'] = 1
            word2id_dict['Q'] = 2
#             word2id_dict['EOS'] = 2
#             word2id_dict['UNK'] = 3
#             word2id_dict['Q'] = 4
        elif self.model_type == 'DMN':
            word2id_dict['PAD'] = 0
            word2id_dict['UNK'] = 1
        else:
            print( "Error: Model type {} invalid".format(self.model_type))
            
        for word in self.vocab:
            word2id_dict[word] = len(word2id_dict)
        
        self.vocab.add('PAD')
        self.vocab.add('UNK')
        if self.model_type =='seq2seq':
            self.vocab.add('Q')
        id2word_dict = dict(list(zip(word2id_dict.values(), word2id_dict.keys())))
        self.num_tokens = len(word2id_dict)
        return word2id_dict, id2word_dict
    
    def __convert_word2id(self, word):
        try:
            word_id = self.word2id_dict[word]
        except:
            word_id = self.word2id_dict['UNK']
        return word_id
    
    def __create_id2vector_dict(self):
        embeddings_file = '/media/ai2-rey/data_disk/data_sets/glove.6B/glove.6B.100d.txt'
        
        glove_embeddings = {}

        with open(embeddings_file) as read_file:
            for line in read_file:
                embedding = line.split('\n')[0].split(' ')
                embedding_key = embedding[0]
                embedding_vec = np.array([float(val) for val in embedding[1:]])
                glove_embeddings.update({embedding_key : embedding_vec})

        id2vector_dict = {}        
        for word, word_id in self.word2id_dict.items():
            if word in glove_embeddings:
                id2vector_dict[word_id] = glove_embeddings[word]
            else: 
                id2vector_dict[word_id] = np.random.uniform(0.0,1.0,100)        
        return id2vector_dict    

    def __vectorize_sentences(self, sequences):
        vectors = []
        
        for sequence in sequences:
            sentence_vectors = []
            for word_id in sequence:
                sentence_vectors.append(self.id2vector_dict[word_id])
            vectors.append(sentence_vectors)
        return vectors

    
    def __convert_to_one_hot(self, labels):
        one_hot = np.array([[0 for j in range(self.num_tokens)] for i in range(len(labels))])

        for i in range(len(one_hot)):
            one_hot[i][labels[i]] = 1

        return one_hot
    
    def __tokenize_sentences(self, raw_data):
        """ Tokenizes sentences.
        :param raw: dict returned from load_babi
        :param word_table: WordTable
        :return:
        """
        context = []
        context_ids = []
        questions = []
        question_ids = []
        answers = []
        answer_ids = []
        sentence_counts = []

        if self.model_type == 'LSTM':
            for sample in raw_data:
                story = []
                story_ids = []
                for sentence in sample["C"]:
                    seq = [w for w in sentence.lower().split(' ') if len(w) > 0]
                    seq_ids = [self.__convert_word2id(w) for w in sentence.lower().split(' ') if len(w) > 0]
                    story.append(seq)
                    story_ids.append(seq_ids)

                q = [w for w in sample["Q"].lower().split(' ') if len(w) > 0]
                q_ids = [self.__convert_word2id(w) for w in sample["Q"].lower().split(' ') if len(w) > 0]

                context.append([word for sentence in story for word in sentence])
                context_ids.append([word_id for sentence in story_ids for word_id in sentence])

                questions.append(q)
                question_ids.append(q_ids)

                answers.append([w for w in sample["A"].lower().split(' ') if len(w)>0])  # NOTE: here we assume the answer is one word!
                answer_ids.append([self.__convert_word2id(w) for w in sample["A"].lower().split(' ') if len(w)>0])
                
                sentence_counts.append(len(story))
                
            answer_ids = self.__convert_to_one_hot(answer_ids)
            context_ids, question_ids, answer_ids = np.array(context_ids), np.array(question_ids), np.array(answer_ids)
            packaged_data = list(zip(context, context_ids, questions, question_ids, answers, answer_ids, sentence_counts))
            random.shuffle(packaged_data)
            context, context_ids, questions, question_ids, answers, answer_ids, sentence_counts = list(zip(*packaged_data))
            return list(zip(context, questions)), list(zip(context_ids, question_ids)), answers, answer_ids, sentence_counts
            
        elif self.model_type == 'seq2seq':
            for sample in raw_data:
                story = []
                story_ids = []
                for sentence in sample["C"]:
                    seq = [w for w in sentence.lower().split(' ') if len(w) > 0]
                    seq_ids = [self.__convert_word2id(w) for w in sentence.lower().split(' ') if len(w) > 0]
                    story.append(seq)
                    story_ids.append(seq_ids)

                q = [w for w in sample["Q"].lower().split(' ') if len(w) > 0]
                q_ids = [self.__convert_word2id(w) for w in sample["Q"].lower().split(' ') if len(w) > 0]

                context.append([word for sentence in story for word in sentence] + ['Q'] + q)
                context_ids.append([word_id for sentence in story_ids for word_id in sentence] + [self.word2id_dict['Q']] + q_ids)

#                 answers.append(['GO'] + sample["A"].lower().split(' ') + ['EOS'])
                answers.append([w for w in sample["A"].lower().split(' ') if len(w)>0])  # NOTE: here we assume the answer is one word!
                answer_ids.append([self.__convert_word2id(w) for w in sample["A"].lower().split(' ') if len(w)>0])
                
                sentence_counts.append(len(story))    
            context_ids, answer_ids = np.array(context_ids), np.array(answer_ids)
            packaged_data = list(zip(context, context_ids, answers, answer_ids, sentence_counts))
            random.shuffle(packaged_data)
            context, context_ids, answers, answer_ids, sentence_counts = list(zip(*packaged_data))
            
            answer_ids = list(answer_ids)
            if self.task_id !=8:
                for i, a_id in enumerate(answer_ids):                   
                    answer_ids[i] = a_id.tolist()
            return context, context_ids, answers, answer_ids, sentence_counts
            
        elif model_type == 'DMN':
            for sample in raw_data:
                story = []
                story_ids = []
                for sentence in sample["C"]:
                    seq = [w for w in sentence.lower().split(' ') if len(w) > 0]
                    seq_ids = [self.__convert_word2id(w) for w in sentence.lower().split(' ') if len(w) > 0]
                    story.append(seq)
                    story_ids.append(seq_ids)

                q = [w for w in sample["Q"].lower().split(' ') if len(w) > 0]
                q_ids = [self.__convert_word2id(w) for w in sample["Q"].lower().split(' ') if len(w) > 0]

                context.append(story)
                context_ids.append(story_ids)

                questions.append(q)
                question_ids.append(q_ids)

                answers.append(sample["A"])  # NOTE: here we assume the answer is one word!
                answer_ids.append(self.__convert_word2id(sample["A"]))
                sentence_counts.append(len(story))
            answer_ids = self.__convert_to_one_hot(answer_ids)
            context_ids, question_ids, answer_ids = np.array(context_ids), np.array(question_ids), np.array(answer_ids)
            packaged_data = list(zip(context, context_ids, questions, question_ids, answers, answer_ids, sentence_counts))
            random.shuffle(packaged_data)
            context, context_ids, questions, question_ids, answers, answer_ids, sentence_counts = list(zip(*packaged_data))
            return list(zip(context, questions)), list(zip(context_ids, question_ids)), answers, answer_ids, sentence_counts

    def __get_max_sequence_length(self, sequences):
        max_len = 0
        min_len = 1000
        avg_len = 0
        for sequence in sequences:
            max_len = max(max_len, len(sequence))
            min_len = min(min_len, len(sequence))
            avg_len += len(sequence)
        avg_len = int(float(avg_len) / len(sequences))
        return max_len, min_len, avg_len
    
    def __apply_padding(self, sequences, length):
        padded_data = []
        for id_sequence in sequences:
            if len(id_sequence) < length:
                padded_sequence = id_sequence
                for i in range(length - len(id_sequence)):
                    padded_sequence.append(0)
                padded_data.append(np.array(padded_sequence)) 
            elif len(id_sequence) > length:
                clipped_sequence = id_sequence[:length]
                padded_data.append(np.array(clipped_sequence))
            else:
                padded_data.append(np.array(id_sequence))

        return np.array(padded_data)

    def pad_sequences(self, pad_lengths=None):
        
        if self.model_type == 'LSTM':
            train_context_data, train_question_data = list(zip(*self.train_input_tokens))
            test_context_data, test_question_data = list(zip(*self.test_input_tokens))
            
            if pad_lengths == None:
                self.max_context_len, min_context_len, avg_context_len = self.__get_max_sequence_length(train_context_data)
                self.max_question_len, min_question_len, avg_question_len = self.__get_max_sequence_length(train_question_data)
                print('Context Lengths: max = {}, min = {}, avg = {}'.format(self.max_context_len, min_context_len, avg_context_len))
                print('Question Lengths: max = {}, min = {}, avg = {}'.format(self.max_question_len, min_question_len, avg_question_len))
            elif len(pad_lengths) == 2:
                self.max_context_len = pad_lengths[0]
                self.max_question_len = pad_lengths[1]
            else:
                print("Error: pad_lengths needs form [context_len, question_len]")

            train_context_data = self.__apply_padding(train_context_data, self.max_context_len)
            train_question_data = self.__apply_padding(train_question_data, self.max_question_len)
            test_context_data = self.__apply_padding(test_context_data, self.max_context_len)
            test_question_data = self.__apply_padding(test_question_data, self.max_question_len)

            self.train_input_tokens = list(zip(train_context_data, train_question_data))
            self.test_input_tokens = list(zip(test_context_data, test_question_data)) 
            
            
        if self.model_type == 'seq2seq':
            train_context_data = self.train_input_tokens
            train_answer_data = self.train_labels_tokens
            
            test_context_data = self.test_input_tokens
            test_answer_data = self.test_labels_tokens
            
            if pad_lengths == None:
                self.max_context_len, min_context_len, avg_context_len = self.__get_max_sequence_length(train_context_data)
                print('Context Lengths: max = {}, min = {}, avg = {}'.format(self.max_context_len, min_context_len, avg_context_len))
                
                self.max_answer_len, min_answer_len, avg_answer_len = self.__get_max_sequence_length(train_answer_data)
                print('Answer Lengths: max = {}, min = {}, avg = {}'.format(self.max_answer_len, min_answer_len, avg_answer_len))
            elif len(pad_lengths) == 2:
                self.max_context_len = pad_lengths[0]
                self.max_question_len = pad_lengths[1]
            else:
                print("Error: pad_lengths needs form [context_len, question_len]")

            train_context_data = self.__apply_padding(train_context_data, self.max_context_len)
            train_answer_data = self.__apply_padding(train_answer_data, self.max_answer_len)
            
            test_context_data = self.__apply_padding(test_context_data, self.max_context_len)
            test_answer_data = self.__apply_padding(test_answer_data, self.max_answer_len)
            
            self.train_input_tokens = train_context_data
            self.train_labels_tokens = train_answer_data 
            
            self.test_input_tokens = test_context_data
            self.test_labels_tokens = test_answer_data
        return

In [3]:
class Seq2Seq:
 
    def __init__(self, vocab_size, xseq_len, yseq_len, num_layers, lr_rate=0.001, 
                 momentum = 0.9, n_hidden=256, word_dim=100, 
                 dropout_rate=1., gpu_device=0, model_dir=None):

        self.vocab_size = vocab_size
        self.xseq_len = xseq_len
        self.yseq_len = yseq_len
        self.num_layers = num_layers
        self.lr_rate = lr_rate
        self.momentum = momentum
        self.n_hidden = n_hidden
        self.word_dim = word_dim
        self.dropout_rate = dropout_rate
        self.gpu_device = gpu_device
        self.model_dir = self.__prepare_model_dir(model_dir)
        self.__keep_prob = None
        
        self.__graph = tf.Graph()
        
        self.__build_model()
    
    def __prepare_model_dir(self, model_dir):
        """ Checks model directory for a weights folder and creates one if none exists
        
        Args:
            model_dir (str) : defines directory location to save weights and training log file
            
        Returns:
            str : directory location with weights folder
        
        """
        if model_dir == None:
            model_dir = os.getcwd() + '/'
        else:
            if model_dir[-1] != '/':
                model_dir = model_dir + '/'
            else:
                model_dir = model_dir
        
        if not os.path.exists(model_dir + 'weights'):
            os.makedirs(model_dir + 'weights')
        return model_dir
                   
    def __build_model(self):
        """ Creates computation graph for dual encoder LSTM. Includes structure for training and deploying. """
        
        tf.reset_default_graph()
        gpu_device_name = '/gpu:{}'.format(self.gpu_device)
        
        with self.__graph.as_default():
            with tf.device(gpu_device_name):
                # define placeholder variables for model inputs
                self.enc_inp = [tf.placeholder(shape=[None,], 
                                               dtype=tf.int32, 
                                               name='ei_{}'.format(t)) for t in range(self.xseq_len)]
                self.labels = [tf.placeholder(shape=[None,], 
                                               dtype=tf.int32, 
                                               name='ei_{}'.format(t)) for t in range(self.yseq_len)]
                self.dec_inp = ([tf.zeros_like(self.enc_inp[0], dtype=np.int32, name="GO")]+self.labels[:-1]) 
                
                self.__keep_prob = tf.placeholder(tf.float32)


                # Build the RNN
                with tf.variable_scope("decoder"):
                    # We use an LSTM Cell
                    #cell = tf.nn.rnn_cell.LSTMCell(self.n_hidden, forget_bias=2.0, state_is_tuple=True)
                    cell = tf.nn.rnn_cell.DropoutWrapper(
                           tf.nn.rnn_cell.BasicLSTMCell(word_dim, state_is_tuple=True),
                           output_keep_prob=self.__keep_prob)
                    
                    stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell]*self.num_layers, state_is_tuple=True)
                    
                    self.dec_outputs, self.dec_states = tf.nn.seq2seq.embedding_rnn_seq2seq(
                                                        self.enc_inp, self.dec_inp, stacked_lstm, 
                                                        vocab_size, vocab_size, word_dim)
                    
                    tf.get_variable_scope().reuse_variables()

                    self.dec_outputs_test, self.dec_states_test = tf.nn.seq2seq.embedding_rnn_seq2seq(
                                                        self.enc_inp, self.dec_inp, stacked_lstm, 
                                                        vocab_size, vocab_size, word_dim,
                                                        feed_previous=True) 
                    
                loss_weights = [tf.ones_like(l, dtype=tf.float32) for l in self.labels]

                self.__loss = tf.nn.seq2seq.sequence_loss(self.dec_outputs, self.labels, loss_weights, vocab_size)

                self.__optimizer = tf.train.AdamOptimizer(learning_rate=self.lr_rate).minimize(self.__loss)

                self.__init = tf.global_variables_initializer()
    
    def get_feed(self,X,Y, keep_prob):
        feed_dict={self.enc_inp[t]: X[t] for t in range(self.xseq_len)}
        feed_dict.update({self.labels[t]: Y[t] for t in range(self.yseq_len)})
        feed_dict[self.__keep_prob]=keep_prob
        return feed_dict

    def train_batch(self,sess, data_iter):
        X,Y = data_iter.next_batch()
        feed_dict = self.get_feed(X,Y, self.dropout_rate)
        _, loss_v = sess.run([self.__optimizer, self.__loss], feed_dict)
        return loss_v
     
    def train(self, train_data_iter, test_data_iter, deploy_data_iter,
              deploy_interval = 1000, train_iters=10000, display_step=200, 
              save_weights_interval=5000, id2word_dict=None, weights_prefix=None):

        
        with tf.Session(graph=self.__graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            sess.run(self.__init)
            saver = tf.train.Saver(max_to_keep=100)
            # Keep training until reach max iterations
            for train_iter in range(train_iters):
                train_iter += 1
                
                loss = self.train_batch(sess, train_data_iter)
                if train_iter % display_step == 0:

                    train_loss_string = "Iter {}, Minibatch Loss = {:.6f}".format(train_iter, loss)
                    print(train_loss_string)
                
                if train_iter % save_weights_interval ==0:
                    if weights_prefix != None:
                        weights_dir = self.model_dir + "weights/{}_iter-{}.cpkt".format(weights_prefix,train_iter)
                    else:
                        weights_dir = self.model_dir + "weights/QA_seq2seq_weights_iter-{}.ckpt".format(train_iter)
                    save_path = saver.save(sess, weights_dir)
                    save_string = "Model saved in file: {}".format(save_path)
                    print(save_string)
                    test_loss, test_accuracy = self.test(sess, test_data_iter, 1)
                    print('Test loss @ iter {}: {} '.format(train_iter, test_loss))
                    print('Test Accuracy @ iter {}: {} '.format(train_iter, test_accuracy))
                
#                 if deploy_data_iter !=None and train_iter % deploy_interval ==0:
#                     d_X, d_Y = deploy_data_iter.next_batch()
#                     d_feed_dict = self.get_feed(d_X,d_Y, 1.)
                                        
                    
    def test_step(self, test_data_iter, sess):
        testX, testY = test_data_iter.next_batch()
        feed_dict = self.get_feed(testX, testY, keep_prob=1.)
        loss_v, dec_op_v = sess.run([self.__loss, self.dec_outputs_test], feed_dict)
        dec_op_v = np.array(dec_op_v).transpose([1,0,2])
        return loss_v, dec_op_v, testX, testY
    
    def test(self, sess, test_data_iter, num_batches):
        losses= []
        predict_loss = []
        for i in range(num_batches):
            loss_t, dec_op_t, batchX, batchY = self.test_step(test_data_iter, sess)
            losses.append(loss_t)
            
            for idx in range(len(dec_op_t)):
                real = batchY.T[idx]
                predict = np.argmax(dec_op_t, axis=2)[idx]
                predict_loss.append(all(real==predict))
        return np.mean(losses), np.mean(predict_loss)        
    
    def predict(self, ckpt_file, X):
        with tf.Session(graph=self.__graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            sess.run(self.__init)
            saver = tf.train.Saver()
            saver.restore(sess, self.model_dir + 'weights/' + ckpt_file)
            feed_dict = {self.enc_inp[t]: X[t] for t in range(self.xseq_len)}
            feed_dict[self.__keep_prob] = 1.
            dec_op_v = sess.run(self.dec_outputs_test, feed_dict)
            # dec_op_v is a list; also need to transpose 0,1 indices 
            #  (interchange batch_size and timesteps dimensions
            dec_op_v = np.array(dec_op_v).transpose([1,0,2])
            # return the index of item with highest probability
            return np.argmax(dec_op_v, axis=2) 
        
    def test_try(self, ckpt_file, test_data_iter):
        with tf.Session(graph=self.__graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            sess.run(self.__init)
            saver = tf.train.Saver()
            saver.restore(sess, self.model_dir + 'weights/' + ckpt_file)
            accuracy = []
            loss_t, out_t, X, Y = self.test_step(test_data_iter, sess)
            
            for idx in range(len(out_t)):
                real = Y.T[idx]
                prediction = np.argmax(out_t, axis=2)[idx]
                accuracy.append(all(real==prediction))
            return np.mean(accuracy)

In [4]:
class DataIterator:
    """Serve shuffled, fixed-size mini-batches from a list of (x, y) pairs.

    The data is reshuffled each time an epoch is exhausted. Only full batches
    are produced; a trailing chunk shorter than `batch_size` is discarded.
    """

    def __init__(self, data, batch_size):
        """
        Args:
            data: indexable collection of (x, y) pairs.
            batch_size: number of samples per batch.
        """
        self.data = data
        self.batch_size = batch_size
        self.data_iterator = self.make_random_iter()

    def next_batch(self):
        """Return the next batch as `(X, Y)`, each transposed.

        The samples of a batch are stacked into arrays and transposed, so for
        equal-length sequences the first axis indexes positions within a
        sample and the second axis indexes samples.

        Raises:
            ValueError: if the data holds fewer than `batch_size` samples, so
                that no full batch can ever be formed.
        """
        idxs = next(self.data_iterator, None)
        if idxs is None:
            # Epoch exhausted: reshuffle and start over.
            self.data_iterator = self.make_random_iter()
            idxs = next(self.data_iterator, None)
            if idxs is None:
                raise ValueError(
                    "cannot form a batch of size {} from {} samples".format(
                        self.batch_size, len(self.data)))
        X, Y = zip(*[self.data[i] for i in idxs])
        return np.array(X).T, np.array(Y).T

    def make_random_iter(self):
        """Return an iterator over index arrays for one shuffled epoch.

        Every yielded array has exactly `batch_size` indices. A full final
        batch is kept; only a partial trailing chunk is dropped.
        """
        order = np.random.permutation(len(self.data))
        num_full_batches = len(self.data) // self.batch_size
        batches = [order[k * self.batch_size:(k + 1) * self.batch_size]
                   for k in range(num_full_batches)]
        return iter(batches)

In [5]:
# Train one seq2seq model per bAbI task (1..20).
# Settings that do not change between tasks are assigned once, up front.
# NOTE(review): data_dir is an absolute, machine-specific path.
data_dir = '/media/ai2-rey/data_disk/data_sets/bAbI/tasks_1-20_v1-2/en-10k/'

num_layers = 3
word_dim = 100
dropout_rate = 0.5
gpu_device = 0
# NOTE(review): the four values below are not passed to Seq2Seq(...) in this
# cell; confirm whether the constructor is meant to receive them.
lr_rate = 0.001
momentum = 0.9
n_hidden = 256
model_dir = None

for task in range(1, 21):
    print("task ", task)
    LSTM_data = BabiDataset(data_dir, task, 'seq2seq')

    # Pair each tokenized input with its tokenized label so DataIterator can
    # unzip a batch into (X, Y).
    train_data = list(zip(LSTM_data.train_input_tokens, LSTM_data.train_labels_tokens))
    test_data = list(zip(LSTM_data.test_input_tokens, LSTM_data.test_labels_tokens))

    train_data_iter = DataIterator(train_data, 256)
    test_data_iter = DataIterator(test_data, 128)
    deploy_data_iter = DataIterator(test_data, 1)

    # Vocabulary size and sequence lengths depend on the task's dataset.
    vocab_size = len(LSTM_data.vocab)
    xseq_len = LSTM_data.max_context_len
    yseq_len = LSTM_data.max_answer_len

    # Pass the dropout_rate variable (previously a duplicated literal 0.5) so
    # that changing the setting above actually takes effect.
    model = Seq2Seq(xseq_len=xseq_len,
                    yseq_len=yseq_len,
                    vocab_size=vocab_size,
                    word_dim=word_dim,
                    num_layers=num_layers,
                    dropout_rate=dropout_rate,
                    gpu_device=gpu_device)

    sess = model.train(train_data_iter, test_data_iter, deploy_data_iter,
                       train_iters=6000, display_step=200,
                       save_weights_interval=600, id2word_dict=LSTM_data.id2word_dict,
                       weights_prefix='bAbI_QA_{}'.format(LSTM_data.task_id))
    
#     i, l = test_data_iter.next_batch()
#     o = model.predict('bAbI_QA_iter-5000.cpkt', i)
    
#     for ii, tl, oi in list(zip(i.T, l.T, o)):
#         inp_string = " ".join([LSTM_data.id2word_dict[q] for q in ii])
#         true_label = " ".join([LSTM_data.id2word_dict[t] for t in tl])
#         predict_string = " ".join([LSTM_data.id2word_dict[o] for o in oi])
#         print('q:{} \n prediction:{} \n true label:{}'.format(inp_string,predict_string, true_label))

task  1
Loaded 10000 data samples from qa1_single-supporting-fact_train.txt
Loaded 1000 data samples from qa1_single-supporting-fact_test.txt
Context Lengths: max = 72, min = 16, avg = 41
Answer Lengths: max = 1, min = 1, avg = 1
Iter 200, Minibatch Loss = 1.811123
Iter 400, Minibatch Loss = 1.659665
Iter 600, Minibatch Loss = 1.524446
Model saved in file: /home/ai2-rey/Documents/code/Nikhita/deep_QA/weights/bAbI_QA_1_iter-600.cpkt
Test loss @ iter 600: 1.6211472749710083 
Test Accuracy @ iter 600: 0.28125 
Iter 800, Minibatch Loss = 1.395644
Iter 1000, Minibatch Loss = 1.181001
Iter 1200, Minibatch Loss = 0.691330
Model saved in file: /home/ai2-rey/Documents/code/Nikhita/deep_QA/weights/bAbI_QA_1_iter-1200.cpkt
Test loss @ iter 1200: 0.6228667497634888 
Test Accuracy @ iter 1200: 0.734375 
Iter 1400, Minibatch Loss = 0.394765
Iter 1600, Minibatch Loss = 0.264978
Iter 1800, Minibatch Loss = 0.185811
Model saved in file: /home/ai2-rey/Documents/code/Nikhita/deep_QA/weights/bAbI_QA_1_ite

In [5]:
# Load bAbI task 19 on its own for the single-task experiment in the cells below.
# NOTE(review): data_dir is an absolute, machine-specific path.
data_dir = '/media/ai2-rey/data_disk/data_sets/bAbI/tasks_1-20_v1-2/en-10k/'
LSTM_data = BabiDataset(data_dir, 19, 'seq2seq')

Loaded 10000 data samples from qa19_path-finding_train.txt
Loaded 1000 data samples from qa19_path-finding_test.txt
Context Lengths: max = 51, min = 51, avg = 51
Answer Lengths: max = 2, min = 2, avg = 2


In [6]:
# Pair each tokenized input with its tokenized label; DataIterator.next_batch
# unzips these pairs into (X, Y).
train_data = list(zip(LSTM_data.train_input_tokens, LSTM_data.train_labels_tokens))
test_data = list(zip(LSTM_data.test_input_tokens, LSTM_data.test_labels_tokens))

In [7]:
# Batch iterators: 256 samples per training batch, 999 per test batch, and
# single-sample batches for deployment.
# NOTE(review): 999 is presumably chosen so that one test batch covers almost
# all of the 1000 test samples -- verify against how DataIterator handles the
# final chunk of an epoch.
train_data_iter = DataIterator(train_data, 256)
test_data_iter = DataIterator(test_data, 999)
deploy_data_iter = DataIterator(test_data, 1)

In [8]:
# Dataset-derived sizes for the model.
vocab_size = len(LSTM_data.vocab) 

xseq_len = LSTM_data.max_context_len
yseq_len = LSTM_data.max_answer_len
 
# Model / training settings.
# NOTE(review): lr_rate, momentum, n_hidden and model_dir are not passed to
# the Seq2Seq(...) constructor call in the next cell -- confirm whether they
# are meant to be.
num_layers = 3
lr_rate = 0.001
momentum = 0.9
n_hidden = 256 
word_dim = 100
dropout_rate = 0.5
gpu_device = 0
model_dir = None

In [9]:
# Build the seq2seq model for the currently loaded task.
# dropout_rate comes from the settings cell instead of a duplicated literal,
# so editing the setting actually changes the model.
model = Seq2Seq(xseq_len=xseq_len,
                yseq_len=yseq_len,
                vocab_size=vocab_size,
                word_dim=word_dim,
                num_layers=num_layers,
                dropout_rate=dropout_rate,
                gpu_device=gpu_device)

In [11]:
# Exact-match accuracy of the iteration-800 checkpoint on one test batch.
# NOTE: this cell was executed (In [11]) after the training cell that appears
# below it (In [10]); on a fresh kernel the checkpoint must already exist on
# disk, or the training cell must be run first.
accuracy = model.test_try('bAbI_QA_19_iter-800.cpkt', test_data_iter)
print(accuracy)

0.105105105105


In [10]:
# Train on the loaded task: 1000 iterations, logging the minibatch loss every
# 200 iterations and saving weights (with a test evaluation) every 200, as the
# recorded output shows. Checkpoints are named 'bAbI_QA_<task_id>_iter-<n>.cpkt'.
sess = model.train(train_data_iter, test_data_iter, deploy_data_iter,
                  train_iters=1000, display_step=200,
                  save_weights_interval = 200, id2word_dict=LSTM_data.id2word_dict, 
                  weights_prefix='bAbI_QA_{}'.format(LSTM_data.task_id))

Iter 200, Minibatch Loss = 1.347988
Model saved in file: /home/ai2-rey/Documents/code/Nikhita/deep_QA/weights/bAbI_QA_19_iter-200.cpkt
Test loss @ iter 200: 1.3055917024612427 
Test Accuracy @ iter 200: 0.07007007007007007 
Iter 400, Minibatch Loss = 1.270750
Model saved in file: /home/ai2-rey/Documents/code/Nikhita/deep_QA/weights/bAbI_QA_19_iter-400.cpkt
Test loss @ iter 400: 1.2573223114013672 
Test Accuracy @ iter 400: 0.0920920920920921 
Iter 600, Minibatch Loss = 1.264452
Model saved in file: /home/ai2-rey/Documents/code/Nikhita/deep_QA/weights/bAbI_QA_19_iter-600.cpkt
Test loss @ iter 600: 1.250370979309082 
Test Accuracy @ iter 600: 0.08408408408408409 
Iter 800, Minibatch Loss = 1.247475
Model saved in file: /home/ai2-rey/Documents/code/Nikhita/deep_QA/weights/bAbI_QA_19_iter-800.cpkt
Test loss @ iter 800: 1.2445751428604126 
Test Accuracy @ iter 800: 0.10510510510510511 
Iter 1000, Minibatch Loss = 1.241678
Model saved in file: /home/ai2-rey/Documents/code/Nikhita/deep_QA/wei

In [12]:
# Draw one test batch (i = inputs, l = labels) and decode it with the
# iteration-800 checkpoint; o holds the predicted token ids, one row per sample.
i, l = test_data_iter.next_batch()
o = model.predict('bAbI_QA_19_iter-800.cpkt', i)
print(o.shape)

(999, 2)


In [None]:
# Show the true answer of the first sample in the batch as words.
a = l.T[0]
" ".join([LSTM_data.id2word_dict[q] for q in a])

In [13]:
# Print every sample of the batch as words: the input text, the model's
# prediction and the true answer. i and l are transposed so that each row is
# one sample, matching the rows of o.
for ii, tl, oi in zip(i.T, l.T, o):
    inp_string = " ".join(LSTM_data.id2word_dict[token_id] for token_id in ii)
    true_label = " ".join(LSTM_data.id2word_dict[token_id] for token_id in tl)
    predict_string = " ".join(LSTM_data.id2word_dict[token_id] for token_id in oi)
    report = 'q:{} \n prediction:{} \n true label:{}'.format(
        inp_string, predict_string, true_label)
    print(report)

q:the hallway is east of the kitchen . the bathroom is north of the kitchen . the bedroom is north of the garden . the kitchen is east of the garden . the office is west of the garden . Q how do you go from the bathroom to the garden 
 prediction:west west 
 true label:south west
q:the office is south of the bathroom . the bedroom is west of the hallway . the garden is east of the office . the hallway is north of the kitchen . the kitchen is west of the office . Q how do you go from the office to the hallway 
 prediction:west west 
 true label:west north
q:the bedroom is south of the garden . the garden is south of the bathroom . the bathroom is west of the office . the bathroom is south of the kitchen . the kitchen is west of the hallway . Q how do you go from the garden to the kitchen 
 prediction:west west 
 true label:north north
q:the bedroom is east of the office . the bathroom is east of the garden . the hallway is west of the office . the office is south of the garden . the kit

In [None]:
# Inspect the tokenized answer of test sample 11.
LSTM_data.test_labels_tokens[11]

In [None]:
# Look up the word stored under id 11 in the vocabulary.
LSTM_data.id2word_dict[11]

In [None]:
# Draw another test batch and decode it.
# next_batch() returns an (inputs, labels) pair, and predict() takes a
# checkpoint file name (it opens its own session). So unpack the pair and
# pass the checkpoint name rather than a session object.
i, l = test_data_iter.next_batch()
o = model.predict('bAbI_QA_19_iter-800.cpkt', i)
print(o.shape)

In [None]:
# Inspect the raw answer of training sample 15.
LSTM_data.train_labels_raw[15]

In [None]:
# Inspect the tokenized answer of the same training sample (15).
LSTM_data.train_labels_tokens[15]

In [None]:
# Print the input of test sample 11 word by word, mapping each token id back
# to its word.
for a in LSTM_data.test_input_tokens[11]:
    print(LSTM_data.id2word_dict[a])

In [None]:
# Vocabulary size of the loaded task.
len(LSTM_data.vocab)

In [None]:
# Display the whole vocabulary set of the loaded task.
LSTM_data.vocab

In [None]:
# Start a fresh default graph and an interactive session for the standalone
# (class-free) seq2seq experiment built in the cells below.
# Note: this rebinds `sess`, which earlier held the value returned by model.train.
tf.reset_default_graph()
sess = tf.InteractiveSession()

In [None]:
# Settings for the standalone seq2seq graph.
# - embedding_dim is used both as the embedding size and as the LSTM cell size
#   in the graph-building cell below.
# - epochs is used by the training loop as the number of train_batch calls
#   (iterations), not as passes over the dataset.
# - NOTE(review): batch_size is not read by the cells shown here; the
#   DataIterator objects carry their own batch sizes.
xseq_length = LSTM_data.max_context_len
yseq_length = LSTM_data.max_answer_len
batch_size = 64
vocab_size = len(LSTM_data.vocab)
embedding_dim = 50
learning_rate = 0.0001
dropout_rate = 0.5
epochs = 10000

In [None]:
# One int32 placeholder of shape (None,) per encoder timestep.
encode_input = [tf.placeholder(tf.int32, 
                                shape=(None,),
                                name = "ei_%i" %i)
                                for i in range(xseq_length)]

# One int32 placeholder of shape (None,) per decoder timestep, holding the
# target token ids.
labels = [tf.placeholder(tf.int32,
                         shape=(None,),
                         name = "l_%i" %i)
          for i in range(yseq_length)]

# Decoder inputs: a zero-filled "GO" tensor shaped like the first encoder
# placeholder, followed by the labels shifted right by one step (the last
# label is dropped).
decode_input = [tf.zeros_like(encode_input[0], dtype=np.int32, name="GO")] + labels[:-1]

In [None]:
# Placeholder for the keep probability applied to the LSTM cell outputs; it is
# fed by train_batch and get_eval_batch_data.
keep_prob = tf.placeholder("float")

# Three LSTM cells of size embedding_dim, each wrapped so that dropout is
# applied to its outputs, stacked into one multi-layer cell.
cells = [rnn_cell.DropoutWrapper(
        rnn_cell.BasicLSTMCell(embedding_dim), output_keep_prob=keep_prob
    ) for i in range(3)]

stacked_lstm = rnn_cell.MultiRNNCell(cells)

# Build the seq2seq network twice inside one variable scope: the first call
# creates the variables, and after reuse_variables() the second call shares
# them. The second call sets feed_previous=True.
# NOTE(review): per the TF seq2seq API, feed_previous=True should make the
# decoder consume its own previous prediction instead of decode_input after
# the first step -- confirm for the TF version in use.
with tf.variable_scope("decoders") as scope:
    decode_outputs, decode_state = tf.nn.seq2seq.embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, vocab_size, vocab_size, embedding_dim)
    
    scope.reuse_variables()
    
    decode_outputs_test, decode_state_test = tf.nn.seq2seq.embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, vocab_size, vocab_size, embedding_dim,
    feed_previous=True)

In [None]:
# Weight every target position equally (all ones, one tensor per timestep).
loss_weights = [tf.ones_like(l, dtype=tf.float32) for l in labels]
# Sequence loss over the first seq2seq call's outputs (decode_outputs, built
# without feed_previous).
# NOTE(review): the meaning of the fourth positional argument depends on the
# TF version; in some releases that position is `average_across_timesteps`
# rather than a vocabulary size -- confirm against the installed version.
loss = tf.nn.seq2seq.sequence_loss(decode_outputs, labels, loss_weights, vocab_size)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.minimize(loss)

In [None]:
# Initialize all graph variables in the interactive session.
sess.run(tf.global_variables_initializer())

In [None]:
def get_feed(X, Y):
    """Map a time-major batch onto the encoder and label placeholders.

    Args:
        X: encoder inputs indexed by timestep; X[t] feeds encode_input[t].
        Y: target ids indexed by timestep; Y[t] feeds labels[t].

    Returns:
        dict from placeholder to the batch slice for that timestep.
    """
    feed_dict = {}
    for step in range(xseq_length):
        feed_dict[encode_input[step]] = X[step]
    for step in range(yseq_length):
        feed_dict[labels[step]] = Y[step]
    return feed_dict

def train_batch(data_iter):
    """Run one optimization step on the next batch from `data_iter`.

    Args:
        data_iter: object whose next_batch() returns a time-major (X, Y) pair.

    Returns:
        The loss value fetched for this batch.
    """
    X, Y = data_iter.next_batch()
    feed_dict = get_feed(X, Y)
    # keep_prob is the probability of KEEPING a unit, i.e. the complement of
    # the dropout rate. (The two coincide only when dropout_rate == 0.5.)
    feed_dict[keep_prob] = 1. - dropout_rate
    _, out = sess.run([train_op, loss], feed_dict)
    return out

def get_eval_batch_data(data_iter):
    """Evaluate one batch with dropout disabled.

    Fetches the loss together with the outputs of the second seq2seq call
    (decode_outputs_test) for the next batch from `data_iter`.

    Returns:
        (eval_loss, decode_output, X, Y), where decode_output is the list of
        per-timestep outputs stacked into an array with its first two axes
        swapped, and X, Y are the batch as returned by next_batch().
    """
    X, Y = data_iter.next_batch()
    feed_dict = get_feed(X, Y)
    feed_dict[keep_prob] = 1.
    fetched = sess.run([loss] + decode_outputs_test, feed_dict)
    eval_loss, step_outputs = fetched[0], fetched[1:]
    # Stack the per-timestep outputs, then swap the first two axes.
    decode_output = np.transpose(np.array(step_outputs), (1, 0, 2))
    return eval_loss, decode_output, X, Y

def eval_batch(data_iter, num_batches):
    """Evaluate `num_batches` batches and summarize loss and accuracy.

    Args:
        data_iter: batch source passed through to get_eval_batch_data.
        num_batches: how many batches to evaluate.

    Returns:
        (mean_loss, accuracy): the mean of the per-batch losses and the
        fraction of samples whose predicted sequence equals the target at
        every position.
    """
    losses = []
    predict_loss = []
    for _ in range(num_batches):
        eval_loss, output, X, Y = get_eval_batch_data(data_iter)
        losses.append(eval_loss)

        # The argmax is identical for every sample of the batch, so compute
        # it once here instead of once per sample inside the loop.
        predictions = np.argmax(output, axis=2)
        targets = Y.T
        for index in range(len(predictions)):
            # A sample is correct only if every timestep matches.
            predict_loss.append(all(targets[index] == predictions[index]))
    return np.mean(losses), np.mean(predict_loss)

In [None]:
# Training loop: one train_batch call per iteration. Every 1000 iterations,
# report loss and exact-match accuracy over 16 batches of the test iterator
# and 16 batches of the training iterator. Ctrl-C stops training cleanly.
for i in range(epochs):
    try:
        train_batch(train_data_iter)
        if i % 1000 == 0:
            val_loss, val_predict = eval_batch(test_data_iter, 16)
            train_loss, train_predict = eval_batch(train_data_iter, 16)
            print("val loss   : %f, val predict   = %.1f%%" %(val_loss, val_predict * 100))
            print("train loss : %f, train predict = %.1f%%" %(train_loss, train_predict * 100))
            # Blank separator line. A bare `print` is a no-op expression in
            # Python 3, so it has to be called.
            print()
            sys.stdout.flush()
    except KeyboardInterrupt:
        print( "interrupted by user")
        break

In [None]:
# Scratch cell: toy arrays for experimenting with manual batch slicing.
# Note: this rebinds batch_size to 1, replacing the value set in the
# settings cell above.
x= np.array([[1,2,3],[4,5,6]])
y=np.array([[1],[2]])
batch_size = 1
# for i in range(0, len(x), batch_size):
#     if (i+1)*batch_size < len(x):
#         x = x[i: (i+1)*batch_size].T
#         y = y[i:(i+1)*batch_size].T
x