Implementation of https://arxiv.org/abs/1503.08895

In [1]:
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
import os
import re
from functools import reduce

Using TensorFlow backend.


In [2]:
from keras.preprocessing.sequence import pad_sequences

In [3]:
data_dir = './data/tasks_1-20_v1-2/en-10k'
files = os.listdir(data_dir)
sample_file = open(os.path.join(data_dir, files[0]), 'r').read().splitlines()

In [4]:
sample_file

['1 Mary is in the school.',
 '2 Bill is in the kitchen.',
 '3 Is Bill in the bedroom? \tno\t2',
 '4 Bill journeyed to the bedroom.',
 '5 Fred travelled to the cinema.',
 '6 Is Bill in the bedroom? \tyes\t4',
 '7 Fred went back to the park.',
 '8 Bill is either in the school or the office.',
 '9 Is Bill in the park? \tno\t8',
 '10 Mary went to the cinema.',
 '11 Julie is either in the school or the office.',
 '12 Is Fred in the park? \tyes\t7',
 '13 Julie is either in the park or the school.',
 '14 Bill went back to the office.',
 '15 Is Bill in the office? \tyes\t14',
 '1 Fred journeyed to the office.',
 '2 Fred went back to the cinema.',
 '3 Is Fred in the office? \tno\t2',
 '4 Julie is either in the school or the office.',
 '5 Julie moved to the cinema.',
 '6 Is Julie in the cinema? \tyes\t5',
 '7 Julie is either in the kitchen or the bedroom.',
 '8 Julie journeyed to the school.',
 '9 Is Julie in the school? \tyes\t8',
 '10 Bill is either in the bedroom or the school.',
 '11 Bill w

In [5]:
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [6]:
import functools

def lazy_property(function):
    attribute = '_cache_' + function.__name__

    @property
    @functools.wraps(function)
    def decorator(self):
        if not hasattr(self, attribute):
            with tf.variable_scope(function.__name__):
                setattr(self, attribute, function(self))
        return getattr(self, attribute)

    return decorator

In [7]:
# Set of functions to parse raw babi data into lists and to vectorize them
def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]


def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format
    If only_supporting is true, only the sentences
    that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
                
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file,
    retrieve the stories,
    and then convert the sentences into a single story.
    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(story, q, answer) for story, q, answer in data if not max_length or len(flatten(story)) < max_length]
    return data


def vectorize_stories(data, word_idx, story_maxlen, max_stories, query_maxlen):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [[word_idx[w] for w in substory] for substory in story]

        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx) + 1)
        y[word_idx[answer]] = 1
        X.append(pad_sequences(x, maxlen=story_maxlen))
        Xq.append(xq)
        Y.append(y)

    return (pad_sequences(X, maxlen=max_stories),
            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))

flatten = lambda data: reduce(lambda x, y: x + y, data)

In [8]:
train_file = files[21]
test_file = files[20]
train_stories = get_stories(open(os.path.join(data_dir, train_file), 'r'))
test_stories = get_stories(open(os.path.join(data_dir, test_file), 'r'))

  return _compile(pattern, flags).split(string, maxsplit)


In [9]:
list(enumerate(files))

[(0, 'qa10_indefinite-knowledge_test.txt'),
 (1, 'qa10_indefinite-knowledge_train.txt'),
 (2, 'qa11_basic-coreference_test.txt'),
 (3, 'qa11_basic-coreference_train.txt'),
 (4, 'qa12_conjunction_test.txt'),
 (5, 'qa12_conjunction_train.txt'),
 (6, 'qa13_compound-coreference_test.txt'),
 (7, 'qa13_compound-coreference_train.txt'),
 (8, 'qa14_time-reasoning_test.txt'),
 (9, 'qa14_time-reasoning_train.txt'),
 (10, 'qa15_basic-deduction_test.txt'),
 (11, 'qa15_basic-deduction_train.txt'),
 (12, 'qa16_basic-induction_test.txt'),
 (13, 'qa16_basic-induction_train.txt'),
 (14, 'qa17_positional-reasoning_test.txt'),
 (15, 'qa17_positional-reasoning_train.txt'),
 (16, 'qa18_size-reasoning_test.txt'),
 (17, 'qa18_size-reasoning_train.txt'),
 (18, 'qa19_path-finding_test.txt'),
 (19, 'qa19_path-finding_train.txt'),
 (20, 'qa1_single-supporting-fact_test.txt'),
 (21, 'qa1_single-supporting-fact_train.txt'),
 (22, 'qa20_agents-motivations_test.txt'),
 (23, 'qa20_agents-motivations_train.txt'),
 (24

In [10]:
max(map(len, (x for x, _, _ in train_stories + test_stories)))

10

In [11]:
vocab = set()
for story, q, answer in train_stories + test_stories:
    vocab |= set(flatten(story) + q + [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
story_maxlen = max(map(len, (s for x, _, _ in train_stories + test_stories for s in x)))
max_stories = max(map(len, (x for x, _, _ in train_stories + test_stories)))
query_maxlen = max(map(len, (x for _, x, _ in train_stories + test_stories)))

print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training stories:', len(train_stories))
print('Number of test stories:', len(test_stories))
print('-')
print('Here\'s what a "story" tuple looks like (input, query, answer):')
print(train_stories[0])
print()

-
Vocab size: 22 unique words
Story max length: 7 words
Query max length: 4 words
Number of training stories: 10000
Number of test stories: 1000
-
Here's what a "story" tuple looks like (input, query, answer):
([['Mary', 'moved', 'to', 'the', 'bathroom', '.'], ['John', 'went', 'to', 'the', 'hallway', '.']], ['Where', 'is', 'Mary', '?'], 'bathroom')



In [12]:
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
inputs_train, queries_train, answers_train = vectorize_stories(train_stories,
                                                               word_idx,
                                                               story_maxlen,
                                                               max_stories,
                                                               query_maxlen)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories,
                                                            word_idx,
                                                            story_maxlen,
                                                            max_stories,
                                                            query_maxlen)

print('-')
print('inputs: integer tensor of shape (samples, max_length)')
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
print('answers: binary (1 or 0) tensor of shape (samples, vocab_size)')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)

-
inputs: integer tensor of shape (samples, max_length)
inputs_train shape: (10000, 10, 7)
inputs_test shape: (1000, 10, 7)
-
queries: integer tensor of shape (samples, max_length)
queries_train shape: (10000, 4)
queries_test shape: (1000, 4)
-
answers: binary (1 or 0) tensor of shape (samples, vocab_size)
answers_train shape: (10000, 22)
answers_test shape: (1000, 22)


In [13]:
tf.train.batch([inputs_train, queries_train, answers_train], batch_size=100, enqueue_many=True)

[<tf.Tensor 'batch:0' shape=(100, 10, 7) dtype=int32>,
 <tf.Tensor 'batch:1' shape=(100, 4) dtype=int32>,
 <tf.Tensor 'batch:2' shape=(100, 22) dtype=float64>]

In [14]:
def batch_vm(v, m):
    shape = tf.shape(v)
    rank = shape.get_shape()[0].value
    v = tf.expand_dims(v, rank)

    vm = tf.multiply(v, m)

    return tf.reduce_sum(vm, rank-1)

class BoWMemN2N:
    """
    Single layer memory network
    """ 
    def __init__(self, text, questions, answers, story_size, max_stories,query_size, emb_dim, **kwargs):
        '''
        Args:
            variable_dims: tuple with dimensions of 
            (dim_a, dim_b, dim_c)
        '''
        self.batch_size = kwargs.get("batch_size", 100)
        self.learn_rate = kwargs.get("learn_rate", 1e-3)
        self.vocab_size = kwargs.get("vocab_size", answers.shape[1])
        self.max_grad_norm = kwargs.get("max_grad_norm", 40.)
        self.emb_dim = emb_dim
        self.query_size = query_size
        self.story_size = story_size
        self.max_stories = max_stories
        
        self.text = text
        self.questions = questions
        self.answers = answers

        self.variables
        self.memory_model
        #self.prediction
        self.optimize
        #self.error
        
    def next_batch(self):
        
        self.n_batches = self.text.shape[0] // self.batch_size
        
        for idx in range(self.n_batches):
            start = self.batch_size * idx
            end = self.batch_size * (idx + 1)
            
            text_batch = self.text[start:end,:,:]
            quest_batch = self.questions[start:end,:]
            answer_batch = self.answers[start:end,:]
            
            yield text_batch, quest_batch, answer_batch
        
    @lazy_property
    def variables(self):
        self.global_step = tf.Variable(0, name="global_step")
        
        self.x = tf.placeholder(dtype=tf.int32, shape=[None, self.max_stories, self.story_size], name='x')
        self.q = tf.placeholder(dtype=tf.int32, shape=[None, self.query_size], name='q')
        self.a = tf.placeholder(dtype=tf.int32, shape=[None, self.answers.shape[1]], name='a')
        
        self.A = tf.Variable(initial_value=tf.random_normal(shape=[self.vocab_size, self.emb_dim], stddev=0.1), name='A')
        self.B = tf.Variable(initial_value=tf.random_normal(shape=[self.vocab_size, self.emb_dim], stddev=0.1), name='B')
        self.C = tf.Variable(initial_value=tf.random_normal(shape=[self.vocab_size, self.emb_dim], stddev=0.1), name='C')
        
        self.T_A = tf.Variable(initial_value=tf.random_normal(shape=[self.batch_size, self.max_stories, self.emb_dim], stddev=0.1), name='T_A')
        self.T_C = tf.Variable(initial_value=tf.random_normal(shape=[self.batch_size, self.max_stories, self.emb_dim], stddev=0.1), name='T_C')
        
        self.W = tf.Variable(initial_value=tf.truncated_normal(shape=[self.emb_dim, self.vocab_size], stddev=0.1))
        
    @lazy_property
    def memory_model(self):
        with tf.name_scope('memory'):
            self.m = tf.reduce_sum(tf.nn.embedding_lookup(self.A, self.x), axis=2, name='m') + self.T_A

            self.u = tf.reduce_sum(tf.nn.embedding_lookup(self.B, self.q), axis=1, name='u')
            self.c = tf.reduce_sum(tf.nn.embedding_lookup(self.C, self.x), axis=2, name='c') + self.T_C



            self.p = tf.nn.sigmoid(
                            tf.einsum('bse,be->bs', self.m, self.u), name='p')

            self.o = tf.einsum('ijk,ij->ik', self.c , self.p)

            self.a_hat = tf.nn.softmax(
                        tf.matmul(self.o, self.W), name='a_hat'
            )
            
    @lazy_property
    def optimize(self):
        self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.a_hat, 
                                                            labels=self.a)
        
        # change to variable to add descrese routine
        self.lr = tf.constant(value=self.learn_rate)
        
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
        
        trainables = [self.A, self.B, self.C, self.T_A, self.T_C, self.W]
        
        grads_and_vars = self.optimizer.compute_gradients(self.loss, trainables)
        clipped_grads_and_vars = [(tf.clip_by_norm(gv[0], self.max_grad_norm), gv[1]) \
                                   for gv in grads_and_vars]
        
        inc = self.global_step.assign_add(1)
        with tf.control_dependencies([inc]):
            self.optimizer = self.optimizer.apply_gradients(clipped_grads_and_vars)
     

    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))
    

In [15]:
epochs = 1000

In [16]:
!mkdir checkpoints

A subdirectory or file checkpoints already exists.


In [None]:
tf.reset_default_graph()
train_graph = tf.Graph()
with train_graph.as_default():
    mem_net = BoWMemN2N(inputs_train, queries_train, answers_train, emb_dim=64, batch_size=1000,
                        story_size=story_maxlen,max_stories=max_stories, query_size=4, vocab_size=vocab_size)


with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())
    
    for e in range(1, epochs+1):
        for text_batch, quest_batch, answer_batch in mem_net.next_batch():
            
            feed = {mem_net.x: text_batch,
                    mem_net.q:  quest_batch,
                    mem_net.a: answer_batch}
            
            train_loss, _ = sess.run([mem_net.loss, mem_net.optimizer], feed_dict=feed)
            loss += train_loss.sum()
            
            
            
        if e % 100 == 0:
            print(loss)
        loss = 0
        
        
    save_path = saver.save(sess, "checkpoints/model.ckpt")

28188.8383789
23816.3071289
21924.6506348
21831.1833496
21806.3981934
21799.1643066
21797.6044922
21796.2648926
21794.0668945


In [88]:
with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    feed = {mem_net.x: inputs_test,
            mem_net.q:  queries_test,
            mem_net.a: answers_test}
    pred, train_loss = sess.run([mem_net.a_hat, mem_net.loss], feed_dict=feed)

In [94]:
(np.argmax(pred, 1) == np.argmax(answers_test[:100,:], 1)).sum()

83

In [93]:
np.argmax(answers_test[:100,:], 1)

array([ 9, 12, 12, 17,  9,  9,  9,  9, 17, 12,  9, 10,  9,  9, 10, 10, 15,
       12, 12, 12,  9, 11, 15, 12, 17,  9, 10, 11, 12, 11, 12, 15,  9, 15,
       11, 15, 10, 10, 15, 15, 17, 17, 17, 15, 12, 10, 10, 15,  9, 15, 11,
       17, 17, 17, 17, 17, 12, 12, 17, 12, 12,  9, 10, 10, 15, 15,  9, 11,
       15,  9, 11, 17,  9, 12, 17, 11, 12, 15, 10, 10, 17,  9, 12,  9, 15,
       17, 17, 17, 12, 15,  9,  9, 15, 15, 12,  9, 10, 10, 11, 15], dtype=int64)

In [66]:
show_graph(tf.get_default_graph().as_graph_def())