In [1]:
import tensorflow as tf
import numpy as np
from copy import deepcopy
from attention_gru import AttentionGRUCell

In [2]:
epoch = 10
batch_size = 64
size_layer = 64
dropout_rate = 0.1
n_hops = 2

In [3]:
class BaseDataLoader():
    def __init__(self):
        self.data = {
            'size': None,
            'val':{
                'inputs': None,
                'questions': None,
                'answers': None,},
            'len':{
                'inputs_len': None,
                'inputs_sent_len': None,
                'questions_len': None,
                'answers_len': None}
        }
        self.vocab = {
            'size': None,
            'word2idx': None,
            'idx2word': None,
        }
        self.params = {
            'vocab_size': None,
            '<start>': None,
            '<end>': None,
            'max_input_len': None,
            'max_sent_len': None,
            'max_quest_len': None,
            'max_answer_len': None,
        }
        
class DataLoader(BaseDataLoader):
    def __init__(self, path, is_training, vocab=None, params=None):
        super().__init__()
        data, lens = self.load_data(path)
        if is_training:
            self.build_vocab(data)
        else:
            self.demo = data
            self.vocab = vocab
            self.params = deepcopy(params)
        self.is_training = is_training
        self.padding(data, lens)


    def load_data(self, path):
        data, lens = bAbI_data_load(path)
        self.data['size'] = len(data[0])
        return data, lens


    def build_vocab(self, data):
        signals = ['<pad>', '<unk>', '<start>', '<end>']
        inputs, questions, answers = data
        i_words = [w for facts in inputs for fact in facts for w in fact if w != '<end>']
        q_words = [w for question in questions for w in question]
        a_words = [w for answer in answers for w in answer if w != '<end>']
        words = list(set(i_words + q_words + a_words))
        self.params['vocab_size'] = len(words) + 4
        self.params['<start>'] = 2
        self.params['<end>'] = 3
        self.vocab['word2idx'] = {word: idx for idx, word in enumerate(signals + words)}
        self.vocab['idx2word'] = {idx: word for word, idx in self.vocab['word2idx'].items()}
        

    def padding(self, data, lens):
        inputs_len, inputs_sent_len, questions_len, answers_len = lens

        self.params['max_input_len'] = max(inputs_len)
        self.params['max_sent_len'] = max([fact_len for batch in inputs_sent_len for fact_len in batch])
        self.params['max_quest_len'] = max(questions_len)
        self.params['max_answer_len'] = max(answers_len)

        self.data['len']['inputs_len'] = np.array(inputs_len)
        for batch in inputs_sent_len:
            batch += [0] * (self.params['max_input_len'] - len(batch))
        self.data['len']['inputs_sent_len'] = np.array(inputs_sent_len)
        self.data['len']['questions_len'] = np.array(questions_len)
        self.data['len']['answers_len'] = np.array(answers_len)
        
        inputs, questions, answers = deepcopy(data)
        for facts in inputs:
            for sentence in facts:
                for i in range(len(sentence)):
                    sentence[i] = self.vocab['word2idx'].get(sentence[i], self.vocab['word2idx']['<unk>'])
                sentence += [0] * (self.params['max_sent_len'] - len(sentence))
            paddings = [0] * self.params['max_sent_len']
            facts += [paddings] * (self.params['max_input_len'] - len(facts))
        for question in questions:
            for i in range(len(question)):
                question[i] = self.vocab['word2idx'].get(question[i], self.vocab['word2idx']['<unk>'])
            question += [0] * (self.params['max_quest_len'] - len(question))
        for answer in answers:
            for i in range(len(answer)):
                answer[i] = self.vocab['word2idx'].get(answer[i], self.vocab['word2idx']['<unk>'])

        self.data['val']['inputs'] = np.array(inputs)
        self.data['val']['questions'] = np.array(questions)
        self.data['val']['answers'] = np.array(answers)
        
def bAbI_data_load(path, END=['<end>']):
    inputs = []
    questions = []
    answers = []

    inputs_len = []
    inputs_sent_len = []
    questions_len = []
    answers_len = []

    for d in open(path):
        index = d.split(' ')[0]
        if index == '1':
            fact = []
        if '?' in d:
            temp = d.split('\t')
            q = temp[0].strip().replace('?', '').split(' ')[1:] + ['?']
            a = temp[1].split() + END
            fact_copied = deepcopy(fact)
            inputs.append(fact_copied)
            questions.append(q)
            answers.append(a)

            inputs_len.append(len(fact_copied))
            inputs_sent_len.append([len(s) for s in fact_copied])
            questions_len.append(len(q))
            answers_len.append(len(a))
        else:
            tokens = d.replace('.', '').replace('\n', '').split(' ')[1:] + END
            fact.append(tokens)
    return [inputs, questions, answers], [inputs_len, inputs_sent_len, questions_len, answers_len]

In [4]:
train_data = DataLoader(path='qa5_three-arg-relations_train.txt',is_training=True)
test_data = DataLoader(path='qa5_three-arg-relations_test.txt',is_training=False,
                       vocab=train_data.vocab, params=train_data.params)

In [5]:
START = train_data.params['<start>']
END = train_data.params['<end>']

In [6]:
def shift_right(x):
    batch_size = tf.shape(x)[0]
    start = tf.to_int32(tf.fill([batch_size, 1], START))
    return tf.concat([start, x[:, :-1]], 1)

def GRU(name, rnn_size=None):
    return tf.nn.rnn_cell.GRUCell(
        rnn_size, kernel_initializer=tf.orthogonal_initializer(), name=name)

def position_encoding(sentence_size, embedding_size):
    encoding = np.ones((embedding_size, sentence_size), dtype=np.float32)
    ls = sentence_size + 1
    le = embedding_size + 1
    for i in range(1, le):
        for j in range(1, ls):
            encoding[i-1, j-1] = (i - (le-1)/2) * (j - (ls-1)/2)
    encoding = 1 + 4 * encoding / embedding_size / sentence_size
    return np.transpose(encoding)

def gen_episode(inputs_len, memory, q_vec, fact_vecs, proj_1, proj_2, attn_gru, is_training):
    def gen_attn(fact_vec):
        features = [fact_vec * q_vec,
                    fact_vec * memory,
                    tf.abs(fact_vec - q_vec),
                    tf.abs(fact_vec - memory)]
        feature_vec = tf.concat(features, 1)
        attention = proj_1(feature_vec)
        attention = proj_2(attention)
        return tf.squeeze(attention, 1)

    attns = tf.map_fn(gen_attn, tf.transpose(fact_vecs, [1,0,2]))
    attns = tf.transpose(attns)                                      
    attns = tf.nn.softmax(attns)                                     
    attns = tf.expand_dims(attns, -1)                                
    
    _, episode = tf.nn.dynamic_rnn(attn_gru,
                                   tf.concat([fact_vecs, attns], 2),
                                   inputs_len,
                                   dtype=np.float32)
    return episode                  

class QA:
    def __init__(self, vocab_size):
        self.questions = tf.placeholder(tf.int32,[None,None])
        self.inputs = tf.placeholder(tf.int32,[None,None,None])
        self.questions_len = tf.placeholder(tf.int32,[None])
        self.inputs_len = tf.placeholder(tf.int32,[None])
        self.answers_len = tf.placeholder(tf.int32,[None])
        self.answers = tf.placeholder(tf.int32,[None,None])
        self.training = tf.placeholder(tf.bool)
        max_sent_len = train_data.params['max_sent_len']
        max_quest_len = train_data.params['max_quest_len']
        max_answer_len = train_data.params['max_answer_len']
        
        lookup_table = tf.get_variable('lookup_table', [vocab_size, size_layer], tf.float32)
        lookup_table = tf.concat((tf.zeros([1, size_layer]), lookup_table[1:, :]), axis=0)
        
        cell_fw = GRU('cell_fw', size_layer // 2)
        cell_bw = GRU('cell_bw', size_layer // 2)
        inputs = tf.nn.embedding_lookup(lookup_table, self.inputs)
        position = position_encoding(max_sent_len, size_layer)
        inputs = tf.reduce_sum(inputs * position, 2)
        birnn_out, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                   cell_bw,
                                                   inputs,
                                                   self.inputs_len,
                                                   dtype=np.float32)
        fact_vecs = tf.concat(birnn_out, -1)
        fact_vecs = tf.layers.dropout(fact_vecs, dropout_rate, training=self.training)
        
        cell = GRU('question_rnn', size_layer)
        questions = tf.nn.embedding_lookup(lookup_table, self.questions)
        _, q_vec = tf.nn.dynamic_rnn(cell,
                                 questions,
                                 self.questions_len,
                                 dtype=np.float32)
        
        proj_1 = tf.layers.Dense(size_layer, tf.tanh, name='attn_proj_1')
        proj_2 = tf.layers.Dense(1, name='attn_proj_2')
        attn_gru = AttentionGRUCell(size_layer, name='attn_gru')
        memory_proj = tf.layers.Dense(size_layer, tf.nn.relu, name='memory_proj')
        
        memory = q_vec
        for i in range(n_hops):
            episode = gen_episode(self.inputs_len,
                              memory,
                              q_vec,
                              fact_vecs,
                              proj_1,
                              proj_2,
                              attn_gru,
                              self.training)
            memory = memory_proj(tf.concat([memory, episode, q_vec], 1))
        
        state_proj = tf.layers.Dense(size_layer, name='state_proj')
        vocab_proj = tf.layers.Dense(vocab_size, name='vocab_proj')
        
        memory = tf.layers.dropout(memory, dropout_rate, training=self.training)
        init_state = state_proj(tf.concat((memory, q_vec), -1))
        helper = tf.contrib.seq2seq.TrainingHelper(
            inputs = tf.nn.embedding_lookup(lookup_table, shift_right(self.answers)),
            sequence_length = self.answers_len)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell = cell,
            helper = helper,
            initial_state = init_state,
            output_layer = vocab_proj)
        decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder = decoder)
        self.training_id = decoder_output.sample_id
        self.training_logits = decoder_output.rnn_output
        
        helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            embedding = lookup_table,
            start_tokens = tf.tile(
                tf.constant([START], dtype=tf.int32), [tf.shape(init_state)[0]]),
            end_token = END)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell = cell,
            helper = helper,
            initial_state = init_state,
            output_layer = vocab_proj)
        decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder = decoder,
            maximum_iterations = max_answer_len)
        self.predict_id = decoder_output.sample_id
        
        self.cost = tf.reduce_mean(tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                        targets = self.answers,
                                                        weights = tf.ones_like(self.answers, tf.float32)))
        self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)
        
        correct_pred = tf.equal(self.training_id, self.answers)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [7]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = QA(train_data.params['vocab_size'])
sess.run(tf.global_variables_initializer())

In [8]:
for i in range(epoch):
    total_cost, total_acc = 0, 0
    for k in range(0, train_data.data['val']['inputs'].shape[0], batch_size):
        index = min(k + batch_size, train_data.data['val']['inputs'].shape[0])
        batch_questions = train_data.data['val']['questions'][k:index]
        batch_inputs = train_data.data['val']['inputs'][k:index]
        batch_inputs_len = train_data.data['len']['inputs_len'][k:index]
        batch_questions_len = train_data.data['len']['questions_len'][k:index]
        batch_answers_len = train_data.data['len']['answers_len'][k:index]
        batch_answers = train_data.data['val']['answers'][k:index]
        acc, cost, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                  feed_dict={model.questions:batch_questions,
                                            model.inputs:batch_inputs,
                                            model.inputs_len:batch_inputs_len,
                                            model.questions_len:batch_questions_len,
                                            model.answers_len:batch_answers_len,
                                            model.answers:batch_answers,
                                            model.training:True})
        total_cost += cost
        total_acc += acc
    total_cost /= (train_data.data['val']['inputs'].shape[0] / batch_size)
    total_acc /= (train_data.data['val']['inputs'].shape[0] / batch_size)
    print('epoch %d, avg cost %f, avg acc %f'%(i+1,total_cost,total_acc))

epoch 1, avg cost 1.193251, avg acc 0.595400
epoch 2, avg cost 0.605435, avg acc 0.702650
epoch 3, avg cost 0.309468, avg acc 0.866150
epoch 4, avg cost 0.183491, avg acc 0.916000
epoch 5, avg cost 0.163441, avg acc 0.917900
epoch 6, avg cost 0.141715, avg acc 0.933100
epoch 7, avg cost 0.103843, avg acc 0.963800
epoch 8, avg cost 0.066167, avg acc 0.986200
epoch 9, avg cost 0.051985, avg acc 0.993900
epoch 10, avg cost 0.037008, avg acc 0.998250


In [10]:
testing_size = 32
batch_questions = test_data.data['val']['questions'][:testing_size]
batch_inputs = test_data.data['val']['inputs'][:testing_size]
batch_inputs_len = test_data.data['len']['inputs_len'][:testing_size]
batch_questions_len = test_data.data['len']['questions_len'][:testing_size]
batch_answers_len = test_data.data['len']['answers_len'][:testing_size]
batch_answers = test_data.data['val']['answers'][:testing_size]
logits = sess.run(model.predict_id,
                        feed_dict={model.questions:batch_questions,
                                   model.inputs:batch_inputs,
                                   model.inputs_len:batch_inputs_len,
                                   model.questions_len:batch_questions_len,
                                   model.answers_len:batch_answers_len,
                                   model.training:False})

In [11]:
for i in range(testing_size):
    print('QUESTION:',' '.join([train_data.vocab['idx2word'][k] for k in batch_questions[i]]))
    print('REAL:',train_data.vocab['idx2word'][batch_answers[i,0]])
    print('PREDICT:',train_data.vocab['idx2word'][logits[i,0]],'\n')

QUESTION: What did Fred give to Jeff ? <pad>
REAL: football
PREDICT: football 

QUESTION: Who gave the football to Jeff ? <pad>
REAL: Fred
PREDICT: Fred 

QUESTION: What did Fred give to Jeff ? <pad>
REAL: football
PREDICT: football 

QUESTION: Who did Fred give the football to ?
REAL: Jeff
PREDICT: Jeff 

QUESTION: Who did Jeff give the football to ?
REAL: Fred
PREDICT: Fred 

QUESTION: Who gave the apple ? <pad> <pad> <pad>
REAL: Jeff
PREDICT: Jeff 

QUESTION: Who received the apple ? <pad> <pad> <pad>
REAL: Bill
PREDICT: Bill 

QUESTION: What did Bill give to Fred ? <pad>
REAL: apple
PREDICT: apple 

QUESTION: What did Bill give to Fred ? <pad>
REAL: apple
PREDICT: apple 

QUESTION: Who received the apple ? <pad> <pad> <pad>
REAL: Fred
PREDICT: Fred 

QUESTION: Who received the milk ? <pad> <pad> <pad>
REAL: Mary
PREDICT: Mary 

QUESTION: What did Mary give to Bill ? <pad>
REAL: milk
PREDICT: milk 

QUESTION: Who received the milk ? <pad> <pad> <pad>
REAL: Bill
PREDICT: Bill 

QUEST