In [1]:
import numpy as np
import random
from random import shuffle
import os
import gensim
import re
import pickle
import tensorflow as tf
import operator
import math
from copy import deepcopy
from collections import Counter
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.python.framework import ops
from tensorflow.models.rnn.translate import seq2seq_model

# Data Preprocessing

In [2]:
class BabiDataset:
    """Loads one bAbI QA task from disk and converts it to token-id sequences.

    The constructor reads the train and test files of ``task_id`` found in
    ``data_dir``, builds a vocabulary from the words of both files, and
    tokenizes every sample in the layout expected by ``model_type``:

    * ``'LSTM'``    - inputs are ``(context_ids, question_ids)`` pairs where the
      context is the story flattened into one word sequence; labels are
      one-hot rows over the vocabulary.
    * ``'DMN'``     - like ``'LSTM'`` but the context keeps one id list per
      sentence.
    * ``'seq2seq'`` - inputs are ``context + ['Q'] + question`` id lists; labels
      are ``['GO'] + answer + ['EOS']`` id lists.

    Samples of each split are shuffled once with the global ``random`` module.

    Args:
        data_dir (str): directory holding the ``qa<task>_..._{train,test}.txt`` files.
        task_id (int): bAbI task number used to select the files.
        model_type (str): one of ``'LSTM'``, ``'seq2seq'``, ``'DMN'``.
        max_vocab_size (int, optional): keep only this many most frequent
            words; ``None`` keeps every word.

    Raises:
        ValueError: if ``model_type`` is not supported or a data line is malformed.
        IOError: if the train or test file of the task cannot be found.
    """

    # Reserved tokens per model type, listed in id order (PAD is always id 0).
    _SPECIAL_TOKENS = {
        'LSTM': ('PAD', 'UNK'),
        'seq2seq': ('PAD', 'GO', 'EOS', 'UNK', 'Q'),
        'DMN': ('PAD', 'UNK'),
    }

    def __init__(self, data_dir, task_id, model_type, max_vocab_size=None):
        # Fail before touching the disk instead of printing and crashing later.
        if model_type not in self._SPECIAL_TOKENS:
            raise ValueError("Error: Model type {} invalid".format(model_type))

        self.task_id = task_id
        self.data_dir = data_dir
        self.model_type = model_type
        self.max_vocab_size = max_vocab_size
        self.vocab = set()
        self.word_counter = dict()
        self.raw_train_data = None
        self.raw_test_data = None
        self.num_tokens = None
        # Filled in by pad_sequences().
        self.max_context_len = None
        self.max_question_len = None

        self.__load_data()
        self.word2id_dict, self.id2word_dict = self.__create_word2id_dict()

        (self.train_input_raw, self.train_input_tokens,
         self.train_labels_raw, self.train_labels_tokens,
         self.train_sentence_counts) = self.__tokenize_sentences(self.raw_train_data)

        (self.test_input_raw, self.test_input_tokens,
         self.test_labels_raw, self.test_labels_tokens,
         self.test_sentence_counts) = self.__tokenize_sentences(self.raw_test_data)

    @staticmethod
    def _split_words(text):
        """Lower-case ``text`` and split it on single spaces, dropping empty tokens."""
        return [w for w in text.lower().split(' ') if len(w) > 0]

    def __update_word_counter(self, sequence):
        """ Update word_counter with counts for words in a sentence

        Args:
            sequence (list<str>) : list of words in a sequence

        """
        for word in sequence:
            self.word_counter[word] = self.word_counter.get(word, 0) + 1

    def __create_vocab(self):
        """ Create set of most frequent unique words found in the loaded data """
        if self.max_vocab_size is None:
            self.vocab = set(self.word_counter)
        else:
            # Break count ties alphabetically so the cut-off is reproducible.
            ranked = sorted(self.word_counter,
                            key=lambda w: (-self.word_counter[w], w))
            self.vocab = set(ranked[:self.max_vocab_size])

    def __parse_babi_file(self, txt_file):
        """Parse one bAbI text file into a list of question samples.

        Every question line yields one sample holding a snapshot of the story
        read so far: ``{"C": [sentence, ...], "Q": question, "A": answer}``.
        A line whose leading id is 1 starts a new story. The words of every
        sentence, question and answer are added to ``word_counter``.

        Args:
            txt_file (str): path of the file to read.

        Returns:
            list<dict>: the parsed samples in file order.

        Raises:
            ValueError: if a line has no id prefix or a question has no answer field.
        """
        raw_data = []
        curr_sample = None
        with open(txt_file) as babi_file:
            for line_no, line in enumerate(babi_file, 1):
                line = line.strip()
                if not line:
                    continue
                sep = line.find(' ')
                if sep == -1:
                    raise ValueError("{}:{}: expected '<id> <text>'".format(txt_file, line_no))
                if int(line[:sep]) == 1 or curr_sample is None:
                    curr_sample = {"C": [], "Q": "", "A": ""}

                # Detach the sentence-final period so it becomes its own token.
                text = line[sep + 1:].replace('.', ' . ')
                q_end = text.find('?')
                if q_end == -1:
                    curr_sample["C"].append(text)
                    self.__update_word_counter(self._split_words(text))
                else:
                    # Question lines look like "<question>? \t<answer>\t<supporting ids>".
                    fields = text[q_end + 1:].split('\t')
                    if len(fields) < 2:
                        raise ValueError("{}:{}: question without answer field".format(txt_file, line_no))
                    curr_sample["Q"] = text[:q_end]
                    curr_sample["A"] = fields[1].strip()
                    self.__update_word_counter(self._split_words(curr_sample["Q"]))
                    self.__update_word_counter(self._split_words(curr_sample["A"]))
                    # Copy: the story keeps growing after this question.
                    raw_data.append(deepcopy(curr_sample))

        print("Loaded {} data samples from {}".format(len(raw_data), os.path.basename(txt_file)))
        return raw_data

    def __load_data(self):
        """Locate and parse the train and test files of the task, then build the vocabulary."""
        prefix = 'qa{}_'.format(self.task_id)
        # Match on file names only so words in the directory path cannot interfere.
        task_files = sorted(name for name in os.listdir(self.data_dir) if prefix in name)

        def pick(split):
            matches = [name for name in task_files if split in name]
            if not matches:
                raise IOError("No {} file for task {} found in {}".format(
                    split, self.task_id, self.data_dir))
            return os.path.join(self.data_dir, matches[0])

        self.raw_train_data = self.__parse_babi_file(pick('train'))
        self.raw_test_data = self.__parse_babi_file(pick('test'))
        self.__create_vocab()

    def __create_word2id_dict(self):
        """Build the word->id and id->word mappings and set ``num_tokens``.

        Special tokens get the lowest ids; vocabulary words follow in sorted
        order so that ids are identical from run to run.

        Returns:
            (dict, dict): ``word2id_dict`` and ``id2word_dict``.
        """
        if self.model_type not in self._SPECIAL_TOKENS:
            raise ValueError("Error: Model type {} invalid".format(self.model_type))

        word2id_dict = dict()
        for token in self._SPECIAL_TOKENS[self.model_type]:
            word2id_dict[token] = len(word2id_dict)
        for word in sorted(self.vocab):
            # Never let a vocabulary word overwrite a reserved token id.
            if word not in word2id_dict:
                word2id_dict[word] = len(word2id_dict)

        id2word_dict = {word_id: word for word, word_id in word2id_dict.items()}
        self.num_tokens = len(word2id_dict)
        return word2id_dict, id2word_dict

    def __convert_word2id(self, word):
        """Return the id of ``word``, or the ``UNK`` id for out-of-vocabulary words."""
        return self.word2id_dict.get(word, self.word2id_dict['UNK'])

    def __convert_to_one_hot(self, labels):
        """Turn a list of token ids into an integer array of one-hot rows of width ``num_tokens``."""
        one_hot = np.zeros((len(labels), self.num_tokens), dtype=int)
        for row, label in enumerate(labels):
            one_hot[row][label] = 1
        return one_hot

    def __tokenize_sentences(self, raw_data):
        """Tokenize parsed samples into the layout of ``self.model_type`` and shuffle them.

        Args:
            raw_data (list<dict>): samples returned by the file parser.

        Returns:
            tuple: ``(inputs_raw, inputs_ids, answers, answer_ids, sentence_counts)``.
            For ``'LSTM'``/``'DMN'`` the two input entries are lists of
            ``(context, question)`` pairs and ``answer_ids`` holds one-hot rows;
            for ``'seq2seq'`` every entry is a tuple of per-sample sequences.
        """
        if not raw_data:
            return [], [], (), (), ()

        is_seq2seq = self.model_type == 'seq2seq'
        keep_sentences = self.model_type == 'DMN'
        inputs_raw, inputs_ids = [], []
        answers, answer_ids, sentence_counts = [], [], []

        for sample in raw_data:
            story = [self._split_words(sentence) for sentence in sample["C"]]
            story_ids = [[self.__convert_word2id(w) for w in sentence] for sentence in story]
            question = self._split_words(sample["Q"])
            question_ids = [self.__convert_word2id(w) for w in question]

            if keep_sentences:
                context, context_ids = story, story_ids
            else:
                context = [w for sentence in story for w in sentence]
                context_ids = [i for sentence in story_ids for i in sentence]

            if is_seq2seq:
                answer = self._split_words(sample["A"])
                inputs_raw.append(context + ['Q'] + question)
                inputs_ids.append(context_ids + [self.word2id_dict['Q']] + question_ids)
                answers.append(['GO'] + answer + ['EOS'])
                answer_ids.append([self.word2id_dict['GO']]
                                  + [self.__convert_word2id(w) for w in answer]
                                  + [self.word2id_dict['EOS']])
            else:
                inputs_raw.append((context, question))
                inputs_ids.append((context_ids, question_ids))
                answers.append(sample["A"])  # NOTE: here we assume the answer is one word!
                # The vocabulary is lower-cased, so look the answer up lower-cased too.
                answer_ids.append(self.__convert_word2id(sample["A"].lower()))
            sentence_counts.append(len(story))

        if not is_seq2seq:
            answer_ids = self.__convert_to_one_hot(answer_ids)

        # Shuffle all per-sample fields together so they stay aligned.
        packaged_data = list(zip(inputs_raw, inputs_ids, answers, answer_ids, sentence_counts))
        random.shuffle(packaged_data)
        inputs_raw, inputs_ids, answers, answer_ids, sentence_counts = zip(*packaged_data)

        if is_seq2seq:
            return inputs_raw, inputs_ids, answers, answer_ids, sentence_counts
        return list(inputs_raw), list(inputs_ids), answers, answer_ids, sentence_counts

    def __get_max_sequence_length(self, sequences):
        """Return ``(max, min, truncated mean)`` of the sequence lengths; all zeros if there are none."""
        lengths = [len(sequence) for sequence in sequences]
        if not lengths:
            return 0, 0, 0
        return max(lengths), min(lengths), int(float(sum(lengths)) / len(lengths))

    def __apply_padding(self, sequences, length):
        """Return copies of ``sequences`` right-padded with the PAD id, or clipped, to ``length``."""
        pad_id = self.word2id_dict['PAD']
        padded_data = []
        for id_sequence in sequences:
            # Build a new list; the caller's sequence must not be modified.
            clipped = list(id_sequence[:length])
            padded_data.append(clipped + [pad_id] * (length - len(clipped)))
        return padded_data

    @staticmethod
    def __split_pairs(pairs):
        """Split a list of ``(context, question)`` pairs into two parallel tuples."""
        if not pairs:
            return (), ()
        contexts, questions = zip(*pairs)
        return contexts, questions

    def pad_sequences(self, pad_lengths=None):
        """Pad or clip the tokenized train and test inputs to fixed lengths.

        Only implemented for ``model_type == 'LSTM'``; for other model types
        the call returns without changing anything. ``train_input_tokens`` and
        ``test_input_tokens`` are replaced by their padded versions and
        ``max_context_len`` / ``max_question_len`` are set.

        Args:
            pad_lengths (sequence<int>, optional): ``[context_len, question_len]``.
                When ``None`` the longest training context and question are used
                (longer test sequences are clipped).

        Raises:
            ValueError: if ``pad_lengths`` is given but does not have two entries.
        """
        if self.model_type != 'LSTM':
            return

        train_context_data, train_question_data = self.__split_pairs(self.train_input_tokens)
        test_context_data, test_question_data = self.__split_pairs(self.test_input_tokens)

        if pad_lengths is None:
            self.max_context_len, min_context_len, avg_context_len = \
                self.__get_max_sequence_length(train_context_data)
            self.max_question_len, min_question_len, avg_question_len = \
                self.__get_max_sequence_length(train_question_data)
            print('Context Lengths: max = {}, min = {}, avg = {}'.format(
                self.max_context_len, min_context_len, avg_context_len))
            print('Question Lengths: max = {}, min = {}, avg = {}'.format(
                self.max_question_len, min_question_len, avg_question_len))
        elif len(pad_lengths) == 2:
            self.max_context_len = pad_lengths[0]
            self.max_question_len = pad_lengths[1]
        else:
            raise ValueError("Error: pad_lengths needs form [context_len, question_len]")

        train_context_data = self.__apply_padding(train_context_data, self.max_context_len)
        train_question_data = self.__apply_padding(train_question_data, self.max_question_len)
        test_context_data = self.__apply_padding(test_context_data, self.max_context_len)
        test_question_data = self.__apply_padding(test_question_data, self.max_question_len)

        self.train_input_tokens = list(zip(train_context_data, train_question_data))
        self.test_input_tokens = list(zip(test_context_data, test_question_data))
        return

In [3]:
class DataIterator:
    """Serves shuffled mini-batches from a collection of ``(input, label)`` pairs.

    Each epoch visits the samples in a fresh random order, cut into batches of
    exactly ``batch_size`` samples; samples left over after the last full batch
    are skipped for that epoch. If the data holds fewer than ``batch_size``
    samples, every batch is the whole (shuffled) data set.

    Args:
        data: indexable collection (or any iterable) of ``(input, label)`` pairs.
        batch_size (int): number of samples per batch, at least 1.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 or ``data`` is empty.
    """

    def __init__(self, data, batch_size):
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        # Materialize one-shot iterables (e.g. zip objects) so they can be indexed.
        self.data = data if hasattr(data, '__len__') else list(data)
        self.batch_size = batch_size
        self.iter = self.make_random_iter()

    def next_batch(self):
        """Return the next batch as ``(inputs, labels)`` tuples, starting a new epoch when needed."""
        try:
            idxs = next(self.iter)
        except StopIteration:
            self.iter = self.make_random_iter()
            idxs = next(self.iter)
        X, y = zip(*[self.data[i] for i in idxs])
        return X, y

    def make_random_iter(self):
        """Return an iterator over index arrays describing one shuffled epoch."""
        num_samples = len(self.data)
        if num_samples == 0:
            raise ValueError("data must contain at least one sample")
        order = np.random.permutation(num_samples)
        num_full = num_samples // self.batch_size
        if num_full == 0:
            # Fewer samples than one batch: serve them all rather than nothing.
            batches = [order]
        else:
            # Keep full batches only so every batch has the same shape.
            batches = np.split(order[:num_full * self.batch_size], num_full)
        return iter(batches)

# Train Baseline LSTM

In [2]:
# Location of the bAbI en-10k task files. Set the BABI_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
data_dir = os.environ.get('BABI_DATA_DIR',
                          '/media/ai2-bb8/data_disk/data_sets/bAbI/tasks_1-20_v1-2/en-10k/')
# bAbI task to train on; change this value to evaluate another task.
task_id = 20
# Parses the train/test files of the task and tokenizes them for the baseline LSTM.
LSTM_data = BabiDataset(data_dir, task_id, 'LSTM')

In [41]:
# Pad every tokenized context and question to a fixed length. With no argument
# the lengths are taken from the training split; this also sets
# LSTM_data.max_context_len / max_question_len, which size the placeholders below.
LSTM_data.pad_sequences()

Context Lengths: max = 69, min = 4, avg = 29
Question Lengths: max = 7, min = 4, avg = 5


In [42]:
# Pair every (context_ids, question_ids) input with its label row so that
# DataIterator can hand out aligned (X, y) samples.
train_data = list(zip(LSTM_data.train_input_tokens,
                      LSTM_data.train_labels_tokens))
test_data = list(zip(LSTM_data.test_input_tokens,
                     LSTM_data.test_labels_tokens))

In [43]:
# Mini-batches of 128 training samples for the optimization steps.
train_data_iter = DataIterator(train_data, 128)
# Evaluation batches of 999 test samples.
# NOTE(review): presumably the test split holds 1000 samples, in which case one
# sample is left out of every evaluation batch — confirm.
test_data_iter = DataIterator(test_data, 999)
# Single test samples, used to print one worked example during training.
deploy_data_iter = DataIterator(test_data, 1)

In [44]:
# Clear the default graph so that re-running the graph-definition cell below
# starts from an empty graph instead of adding to the previous one.
tf.reset_default_graph()

In [45]:
# Build the baseline graph: two LSTM encoders (story and question) that share
# one word-embedding table; their final outputs are summed and fed to a dense
# layer that scores every vocabulary word as the answer.
# Every op created in this block is pinned to the second GPU.
with tf.device('/gpu:1'):
    # Parameters
    learning_rate = 0.1
    training_iters = 300000
    # NOTE(review): batch_size is not referenced in the visible cells; the
    # training iterator above was created with a literal 128.
    batch_size = 128
    display_step = 1000
    test_interval = 5000
    vocab_size = LSTM_data.num_tokens

    # Network Parameters
    word_dim = 64 # word vector dimensions
    n_steps_question = LSTM_data.max_question_len # timesteps
    n_steps_story = LSTM_data.max_context_len
    n_hidden = 128 # hidden layer num of features
    n_classes = vocab_size # total classes

    # Padded token ids of the story and the question, one row per sample.
    X_story = tf.placeholder(tf.int32, [None, n_steps_story])
    X_question = tf.placeholder(tf.int32, [None, n_steps_question])
    # One-hot answer rows over the vocabulary.
    y = tf.placeholder(tf.float32, [None, n_classes])

    # Define weights
    weights = {
        'word_embeddings': tf.Variable(tf.random_uniform([vocab_size, word_dim], -1.0, 1.0)),
        'output': tf.Variable(tf.random_normal([n_hidden, n_classes]))
    }
    biases = {
        'output': tf.Variable(tf.random_normal([n_classes]))
    }

    def model(X_story, X_question):
        """Return the unnormalized answer scores (logits) for a batch.

        Args:
            X_story: int32 tensor of padded story token ids, [batch, n_steps_story].
            X_question: int32 tensor of padded question token ids, [batch, n_steps_question].

        Returns:
            Tensor of shape [batch, n_classes] produced by the output layer.
        """
        
        ### story LSTM ###
        story_word_embeddings = tf.nn.embedding_lookup(weights['word_embeddings'], X_story)
        # NOTE(review): the second argument of tf.nn.dropout is keep_prob, so
        # only 30% of the embedding values are kept. There is no train/eval
        # switch: the same ops feed `pred`, so dropout is also active when
        # accuracy and the deploy prediction are computed — confirm intended.
        story_word_embeddings = tf.nn.dropout(story_word_embeddings, 0.3)
        # Prepare data shape to match `rnn` function requirements
        # Current data input shape: (batch_size, n_steps, n_input)
        # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)

        # Permuting batch_size and n_steps
        story_word_embeddings = tf.transpose(story_word_embeddings, [1, 0, 2])
        # Reshaping to (n_steps*batch_size, n_input)
        story_word_embeddings = tf.reshape(story_word_embeddings, [-1, word_dim])
        # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
        story_word_embeddings = tf.split(0, n_steps_story, story_word_embeddings)
        
        # Separate variable scopes give the two encoders their own LSTM weights.
        with tf.variable_scope('story_LSTM'):
            # Define a lstm cell with tensorflow
            story_lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)

            # Get lstm cell output
            story_outputs, story_states = rnn.rnn(story_lstm_cell, story_word_embeddings, dtype=tf.float32)
        
        ### question LSTM ###
        # Same embedding table and the same reshaping steps as for the story.
        question_word_embeddings = tf.nn.embedding_lookup(weights['word_embeddings'], X_question)
        question_word_embeddings = tf.nn.dropout(question_word_embeddings, 0.3)
        
        question_word_embeddings = tf.transpose(question_word_embeddings, [1, 0, 2])
        question_word_embeddings = tf.reshape(question_word_embeddings, [-1, word_dim])
        question_word_embeddings = tf.split(0, n_steps_question, question_word_embeddings)

        with tf.variable_scope('question_LSTM'):
            question_lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)

            question_outputs, question_states = rnn.rnn(question_lstm_cell, question_word_embeddings, dtype=tf.float32)
        
        # sum question and story vectors
        # NOTE(review): sequences are right-padded and no sequence_length is
        # passed to rnn.rnn, so the last output is taken after the trailing
        # PAD steps — confirm this is acceptable.
        combined_vector = tf.add(story_outputs[-1], question_outputs[-1])
        
        # return dense layer
        return tf.matmul(combined_vector, weights['output']) + biases['output']

    pred = model(X_story, X_question)
    # Class probabilities, used only to print an example prediction.
    softmax = tf.nn.softmax(pred)
    
    # Define loss and optimizer
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    # Evaluate model
    # A sample counts as correct when the highest-scoring word is the labelled answer.
    correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Initializing the variables
    init = tf.initialize_all_variables()

In [46]:
# Launch the graph: train on shuffled mini-batches, log the training loss and
# accuracy every `display_step` iterations, and every `test_interval`
# iterations report accuracy on a test batch and print one worked example.
with tf.Session() as sess:
    sess.run(init)

    # Keep training until reach max iterations (iterations are counted from 1)
    for train_iter in range(1, training_iters + 1):
        X_batch, y_batch = train_data_iter.next_batch()
        X_story_batch, X_question_batch = zip(*X_batch)
        feed = {X_story: X_story_batch, X_question: X_question_batch, y: y_batch}

        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict=feed)

        if train_iter % display_step == 0:
            # Fetch batch accuracy and loss in one run: a single forward pass,
            # and both numbers describe the same dropout mask.
            acc, loss = sess.run([accuracy, cost], feed_dict=feed)
            print("Iter " + str(train_iter) + ", Minibatch Loss= " +
                  "{:.6f}".format(loss) + ", Training Accuracy= " +
                  "{:.5f}".format(acc))

        if train_iter % test_interval == 0:
            # Calculate accuracy on one batch from the test iterator
            X_batch, y_batch = test_data_iter.next_batch()
            X_story_batch, X_question_batch = zip(*X_batch)
            feed = {X_story: X_story_batch, X_question: X_question_batch, y: y_batch}

            # Format the message explicitly; passing two arguments to print()
            # shows a tuple under the Python 2 print statement.
            print("Testing Accuracy: {}".format(sess.run(accuracy, feed_dict=feed)))

            # Run a single test sample through the model and show it in words
            X_batch, y_batch = deploy_data_iter.next_batch()
            X_story_batch, X_question_batch = zip(*X_batch)
            feed = {X_story: X_story_batch, X_question: X_question_batch, y: y_batch}
            deploy_outputs = sess.run(softmax, feed_dict=feed)
            # The deploy batch holds exactly one sample, so row 0 is the prediction.
            deploy_pred = LSTM_data.id2word_dict[int(np.argmax(deploy_outputs[0]))]

            report_lines = [
                ' ',
                'STORY',
                ' '.join([LSTM_data.id2word_dict[idx] for idx in X_story_batch[0]]),
                ' ',
                'QUESTION',
                ' '.join([LSTM_data.id2word_dict[idx] for idx in X_question_batch[0]]),
                ' ',
                'GROUND TRUTH',
                LSTM_data.id2word_dict[list(y_batch[0]).index(1)],
                ' ',
                'MODEL PREDICTION',
                deploy_pred,
                ' ',
            ]
            for report_line in report_lines:
                print(report_line)

Iter 1000, Minibatch Loss= 0.492414, Training Accuracy= 0.73438
Iter 2000, Minibatch Loss= 0.613970, Training Accuracy= 0.76562
Iter 3000, Minibatch Loss= 0.313448, Training Accuracy= 0.81250
Iter 4000, Minibatch Loss= 0.290781, Training Accuracy= 0.91406
Iter 5000, Minibatch Loss= 0.139194, Training Accuracy= 0.87500
('Testing Accuracy:', 0.8998999)
 
STORY
sumit is tired . jason is bored . sumit moved to the bedroom . sumit took the pajamas there . antoine is tired . antoine journeyed to the bedroom . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
 
QUESTION
why did antoine go to the bedroom
 
GROUND TRUTH
tired
 
MODEL PREDICTION
tired
 
Iter 6000, Minibatch Loss= 0.172792, Training Accuracy= 0.89844
Iter 7000, Minibatch Loss= 0.404511, Training Accuracy= 0.91406
Iter 8000, Minibatch Loss= 0.204393, Training Accuracy= 0.85938
Iter 9000, Minibatch Loss= 0.250061, Training Accu

KeyboardInterrupt: 

In [None]:
## LSTM 
# task 1 - 51.3%
# task 2 - 17.1%
# task 3 - 21.7%
# task 4 - 51.1%
# task 5 - 80.1%
# task 6 - 50.6%
# task 7 - 62.6%
# task 8 - 33.5%
# task 9 - 63.3%
# task 10 - 43.9%
# task 11 - 63.6%
# task 12 - 75.5%
# task 13 - 93.9%
# task 14 - 20.9%
# task 15 - 25.1%
# task 16 - 48.7%
# task 17 - 52.2%
# task 18 - 90.6%
# task 19 - 8.5%
# task 20 - 90.9%