# 10. 	Dynamic Memory Networks for Question Answering

I recommend you take a look at these material first.

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture16-DMN-QA.pdf
* https://arxiv.org/abs/1506.07285

In [1]:
import mindspore
from mindspore import nn, Tensor, ops, Parameter
import random
import numpy as np
from copy import deepcopy
import os
from mindnlp.modules import Accumulator
from mindspore.common.initializer import initializer
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

  from tqdm.autonotebook import tqdm


In [2]:
gpu = '0'
# 设置使用哪些显卡进行训练
os.environ["CUDA_VISIBLE_DEVICES"] = gpu

In [3]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch

    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [4]:
def pad_to_batch(batch, w_to_ix):  # for bAbI dataset
    fact, q, a = list(zip(*batch))
    max_fact = max([len(f) for f in fact])
    max_len = max([f.shape[1] for f in flatten(fact)])
    max_q = max([qq.shape[1] for qq in q])
    max_a = max([aa.shape[1] for aa in a])

    facts, fact_masks, q_p, a_p = [], [], [], []
    for i in range(len(batch)):
        fact_p_t = []

        for j in range(len(fact[i])):
            if fact[i][j].shape[1] < max_len:
                fact_p_t.append(ops.cat([fact[i][j], Parameter(Tensor([w_to_ix['<PAD>']] * (max_len - fact[i][j].shape[1]), mindspore.int64)).view(1, -1)], 1))
            else:
                fact_p_t.append(fact[i][j])

        while len(fact_p_t) < max_fact:
            fact_p_t.append(Tensor([w_to_ix['<PAD>']] * max_len, dtype=mindspore.int64).view(1, -1))

        fact_p_t = ops.cat(fact_p_t)
        facts.append(fact_p_t)
        fact_masks.append(ops.cat([Tensor(tuple(map(lambda s: s == 0, t.asnumpy())), dtype=mindspore.byte) for t in fact_p_t]).view(fact_p_t.shape[0], -1))

        if q[i].shape[1] < max_q:
            q_p.append(ops.cat([q[i], Parameter(Tensor([w_to_ix['<PAD>']] * (max_q - q[i].shape[1]), mindspore.int64)).view(1, -1)], 1))
        else:
            q_p.append(q[i])

        if a[i].shape[1] < max_a:
            a_p.append(ops.cat([a[i], Parameter(Tensor([w_to_ix['<PAD>']] * (max_a - a[i].shape[1]), mindspore.int64)).view(1, -1)], 1))
        else:
            a_p.append(a[i])

    questions = ops.cat(q_p)
    answers = ops.cat(a_p)
    question_masks = ops.cat([Tensor(tuple(map(lambda s: s == 0, t.asnumpy())), dtype=mindspore.byte) for t in questions]).view(questions.shape[0], -1)

    return facts, fact_masks, questions, question_masks, answers

In [5]:
def prepare_sequence(seq, word2index):
    idxs = [word2index.get(w, word2index["<UNK>"]) for w in seq]
    sequence = Tensor(idxs, dtype=mindspore.int64)
    return sequence

## Data load and Preprocessing 

### bAbI dataset(https://research.fb.com/downloads/babi/)

In [6]:
def bAbI_data_load(path):
    try:
        data = open(path).readlines()
    except FileNotFoundError:
        print("Such a file does not exist at {}".format(path))
        return None
    except PermissionError:
        print("Permission denied for file at {}".format(path))
        return None
    except IsADirectoryError:
        print("{} is a directory, not a file".format(path))
        return None

    data = [d[:-1] for d in data]
    data_p = []
    fact = []
    try:
        for d in data:
            index = d.split(' ')[0]
            if index == '1':
                fact = []
            if '?' in d:
                temp = d.split('\t')
                q = temp[0].strip().replace('?', '').split(' ')[1:] + ['?']
                a = temp[1].split() + ['</s>']
                stemp = deepcopy(fact)
                data_p.append([stemp, q, a])
            else:
                tokens = d.replace('.', '').split(' ')[1:] + ['</s>']
                fact.append(tokens)
    except Exception:
        print("Please check the data is right")
        return None
    return data_p

In [7]:
train_data = bAbI_data_load('../dataset/bAbI/en-10k/qa5_three-arg-relations_train.txt')

In [8]:
train_data[0]

[[['Bill', 'travelled', 'to', 'the', 'office', '</s>'],
  ['Bill', 'picked', 'up', 'the', 'football', 'there', '</s>'],
  ['Bill', 'went', 'to', 'the', 'bedroom', '</s>'],
  ['Bill', 'gave', 'the', 'football', 'to', 'Fred', '</s>']],
 ['What', 'did', 'Bill', 'give', 'to', 'Fred', '?'],
 ['football', '</s>']]

In [9]:
fact, q, a = list(zip(*train_data))

In [10]:
vocab = list(set(flatten(flatten(fact)) + flatten(q) + flatten(a)))

In [11]:
word2index = {'<PAD>': 0, '<UNK>': 1, '<s>': 2, '</s>': 3}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
index2word = {v: k for k, v in word2index.items()}

In [12]:
len(word2index)

44

In [13]:
for t in train_data:
    for i, fact in enumerate(t[0]):
        t[0][i] = prepare_sequence(fact, word2index).view(1, -1)

    t[1] = prepare_sequence(t[1], word2index).view(1, -1)
    t[2] = prepare_sequence(t[2], word2index).view(1, -1)

## Modeling 

<img src="../images/10.dmn-architecture.png">
<center>borrowed image from https://arxiv.org/pdf/1506.07285.pdf</center>

In [13]:
class DMN(nn.Cell):
    def __init__(self, input_size, hidden_size, output_size, dropout_p=0.1):
        super(DMN, self).__init__()

        self.hidden_size = hidden_size
        self.embed = nn.Embedding(input_size, hidden_size, padding_idx=0, embedding_table="XavierUniform")  # sparse=True)
        self.input_gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.question_gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

        self.gate = nn.SequentialCell(
            nn.Dense(hidden_size * 4, hidden_size, weight_init="XavierNormal"),
            nn.Tanh(),
            nn.Dense(hidden_size, 1, weight_init="XavierNormal"),
            nn.Sigmoid()
        )

        self.attention_grucell = nn.GRUCell(hidden_size, hidden_size)
        self.memory_grucell = nn.GRUCell(hidden_size, hidden_size)
        self.answer_grucell = nn.GRUCell(hidden_size * 2, hidden_size)
        self.answer_fc = nn.Dense(hidden_size, output_size, weight_init="XavierNormal", bias_init="Zero")

        self.dropout = nn.Dropout(p=dropout_p)

    def init_hidden(self, inputs):
        hidden = ops.zeros((1, inputs.shape[0], self.hidden_size))
        return hidden

    def init_weight(self, hidden_size):
        # Weight initialization of GRU
        self.input_gru.weight_ih = initializer('XavierNormal', [hidden_size, hidden_size])
        self.input_gru.weight_hh = initializer('XavierNormal', [hidden_size, hidden_size])

        self.question_gru.weight_ih = initializer('XavierNormal', [hidden_size, hidden_size])
        self.question_gru.weight_hh = initializer('XavierNormal', [hidden_size, hidden_size])

        # Weight initialization of GRUCell
        self.attention_grucell.weight_ih.set_data(initializer('XavierNormal', self.attention_grucell.weight_ih.shape))
        self.attention_grucell.weight_hh.set_data(initializer('XavierNormal', self.attention_grucell.weight_hh.shape))

        self.memory_grucell.weight_ih.set_data(initializer('XavierNormal', self.memory_grucell.weight_ih.shape))
        self.memory_grucell.weight_hh.set_data(initializer('XavierNormal', self.memory_grucell.weight_hh.shape))

        self.answer_grucell.weight_ih.set_data(initializer('XavierNormal', self.answer_grucell.weight_ih.shape))
        self.answer_grucell.weight_hh.set_data(initializer('XavierNormal', self.answer_grucell.weight_hh.shape))

    def construct(self, facts, fact_masks, questions, question_masks, num_decode, episodes=3, is_training=False):
        """
        facts : (B,T_C,T_I) / LongTensor in List # batch_size, num_of_facts, length_of_each_fact(padded)
        fact_masks : (B,T_C,T_I) / ByteTensor in List # batch_size, num_of_facts, length_of_each_fact(padded)
        questions : (B,T_Q) / LongTensor # batch_size, question_length
        question_masks : (B,T_Q) / ByteTensor # batch_size, question_length
        """
        # Input Module
        C = []  # encoded facts
        for fact, fact_mask in zip(facts, fact_masks):
            embeds = self.embed(fact)
            if is_training:
                embeds = self.dropout(embeds)
            hidden = self.init_hidden(fact)
            outputs, hidden = self.input_gru(embeds, hidden)
            real_hidden = []

            for i, o in enumerate(outputs):  # B,T,D
                real_length = fact_mask[i].asnumpy().item(0)
                real_hidden.append(o[real_length - 1])

            C.append(ops.cat(real_hidden).view(fact.shape[0], -1).unsqueeze(0))

        encoded_facts = ops.cat(C)  # B,T_C,D

        # Question Module
        embeds = self.embed(questions)
        if is_training:
            embeds = self.dropout(embeds)
        hidden = self.init_hidden(questions)
        outputs, hidden = self.question_gru(embeds, hidden)

        if is_training:
            real_question = []
            for i, o in enumerate(outputs):  # B,T,D
                real_length = question_masks[i].asnumpy().item(0)
                real_question.append(o[real_length - 1])
            encoded_question = ops.cat(real_question).view(questions.shape[0], -1)  # B,D
        else:  # for inference mode
            encoded_question = hidden.squeeze(0)  # B,D

        # Episodic Memory Module
        memory = encoded_question
        T_C = encoded_facts.shape[1]
        B = encoded_facts.shape[0]

        transposed_facts = ops.transpose(encoded_facts, (1, 0, 2))
        for i in range(episodes):
            hidden = self.init_hidden(transposed_facts[0]).squeeze(0)  # B,D
            for t in range(T_C):
                # TODO: fact masking
                # TODO: gate function => softmax
                z = ops.cat([
                    transposed_facts[t] * encoded_question,  # B,D , element-wise product
                    transposed_facts[t] * memory,  # B,D , element-wise product
                    ops.abs(transposed_facts[t] - encoded_question),  # B,D
                    ops.abs(transposed_facts[t] - memory)  # B,D
                ], 1)
                g_t = self.gate(z)  # B,1 scalar
                hidden = g_t * self.attention_grucell(transposed_facts[t], hidden) + (1 - g_t) * hidden

            e = hidden
            memory = self.memory_grucell(e, memory)

        # Answer Module
        answer_hidden = memory
        start_decode = ops.transpose(Tensor([[word2index['<s>']] * memory.shape[0]], dtype=mindspore.int64), (1, 0))
        y_t_1 = self.embed(start_decode).squeeze(1)  # B,D

        decodes = []
        for t in range(num_decode):
            answer_hidden = self.answer_grucell(ops.cat([y_t_1, encoded_question], 1), answer_hidden)
            decodes.append(ops.log_softmax(self.answer_fc(answer_hidden), 1))
        return ops.cat(decodes, 1).view(B * num_decode, -1)

## Train 

It takes for a while if you use just cpu.

In [14]:
HIDDEN_SIZE = 80
BATCH_SIZE = 64
LR = 0.001
EPOCH = 50
NUM_EPISODE = 3
EARLY_STOPPING = False

In [15]:
model = DMN(len(word2index), HIDDEN_SIZE, len(word2index))
model.init_weight(HIDDEN_SIZE)

loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = nn.Adam(model.trainable_params(), learning_rate=LR)

In [16]:
accumulate_step = 2
accumulator = Accumulator(optimizer, accumulate_step)


def forward_fn(facts, fact_masks, questions, question_masks, answers, is_training):
    """Forward function"""
    preds = model(facts, fact_masks, questions, question_masks, answers.shape[1], NUM_EPISODE, is_training)
    answers = answers.astype(mindspore.int32)
    loss = loss_function(preds, answers.view(-1))
    return loss / accumulate_step


# Get gradient function
grad_fn = mindspore.value_and_grad(forward_fn, None, model.trainable_params())


# Define function of one-step training
def train_step(facts, fact_masks, questions, question_masks, answers, is_training):
    """Training steps"""
    loss, grads = grad_fn(facts, fact_masks, questions, question_masks, answers, is_training)
    loss = ops.depend(loss, accumulator(grads))
    return loss

In [27]:
for epoch in range(EPOCH):
    losses = []
    if EARLY_STOPPING:
        break

    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        facts, fact_masks, questions, question_masks, answers = pad_to_batch(batch, word2index)

        loss = train_step(facts, fact_masks, questions, question_masks, answers, True)
        losses.append(loss.asnumpy().item(0) * accumulate_step)

        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" % (epoch, EPOCH, np.mean(losses)))

            if np.mean(losses) < 0.01:
                EARLY_STOPPING = True
                print("Early Stopping!")
                break
            losses = []

[0/50] mean_loss : 3.82
[0/50] mean_loss : 1.87
[1/50] mean_loss : 0.72
[1/50] mean_loss : 0.68
[2/50] mean_loss : 0.67
[2/50] mean_loss : 0.66


## Test 

In [17]:
def pad_to_fact(fact, x_to_ix):  # this is for inference
    max_x = max([s.shape[1] for s in fact])
    x_p = []
    for i in range(len(fact)):
        if fact[i].shape[1] < max_x:
            x_p.append(ops.cat([fact[i], Tensor([x_to_ix['<PAD>']] * (max_x - fact[i].shape[1]), dtype=mindspore.int64).view(1, -1)], 1))
        else:
            x_p.append(fact[i])

    fact = ops.cat(x_p)
    fact_mask = ops.cat([Parameter(Tensor(tuple(map(lambda s: s == 0, t.asnumpy())), dtype=mindspore.byte), requires_grad=False) for t in fact]).view(fact.shape[0], -1)
    return fact, fact_mask

### Prepare Test data 

In [18]:
test_data = bAbI_data_load('../dataset/bAbI/en-10k/qa5_three-arg-relations_test.txt')

In [20]:
for t in test_data:
    for i, fact in enumerate(t[0]):
        t[0][i] = prepare_sequence(fact, word2index).view(1, -1)

    t[1] = prepare_sequence(t[1], word2index).view(1, -1)
    t[2] = prepare_sequence(t[2], word2index).view(1, -1)

### Accuracy 

In [21]:
accuracy = 0

In [None]:
for t in test_data:
    fact, fact_mask = pad_to_fact(t[0], word2index)
    question = t[1]
    question_mask = Parameter(Tensor([0] * t[1].shape[1], dtype=mindspore.byte), requires_grad=False).unsqueeze(0)
    answer = t[2].squeeze(0)

    pred = model([fact], [fact_mask], question, question_mask, answer.shape[0], NUM_EPISODE)
    if (ops.max(pred, 1)[1].asnumpy() == answer.asnumpy()).all():
        accuracy += 1

print(accuracy / len(test_data) * 100)

### Sample test result 

In [34]:
t = random.choice(test_data)
fact, fact_mask = pad_to_fact(t[0], word2index)
question = t[1]
question_mask = Parameter(Tensor([0] * t[1].shape[1], dtype=mindspore.byte), requires_grad=False).unsqueeze(0)
answer = t[2].squeeze(0)

pred = model([fact], [fact_mask], question, question_mask, answer.shape[0], NUM_EPISODE)

print("Facts : ")
print('\n'.join([' '.join(list(map(lambda x: index2word[x], f))) for f in fact.asnumpy().tolist()]))
print("")
print("Question : ", ' '.join(list(map(lambda x: index2word[x], question.asnumpy().tolist()[0]))))
print("")
print("Answer : ", ' '.join(list(map(lambda x: index2word[x], answer.asnumpy().tolist()))))
print("Prediction : ", ' '.join(list(map(lambda x: index2word[x], ops.max(pred, 1)[1].asnumpy().tolist()))))

Facts : 
Bill went back to the bedroom </s>
Mary went to the office </s> <PAD>
Jeff journeyed to the kitchen </s> <PAD>
Fred journeyed to the kitchen </s> <PAD>
Fred got the milk there </s> <PAD>
Fred handed the milk to Jeff </s>
Jeff passed the milk to Fred </s>
Fred gave the milk to Jeff </s>

Question :  Who received the milk ?

Answer :  Jeff </s>
Prediction :  Jeff </s>


## Further topics 

* <a href="https://arxiv.org/pdf/1603.01417.pdf">Dynamic Memory Networks for Visual and Textual Question Answering(DMN+)</a>
* <a href="https://github.com/dandelin/Dynamic-memory-networks-plus-Pytorch">DMN+ Pytorch implementation</a>
* <a href="https://arxiv.org/pdf/1611.01604">Dynamic Coattention Networks For Question Answering</a>
* <a href="https://arxiv.org/pdf/1711.00106">DCN+: Mixed Objective and Deep Residual Coattention for Question Answering</a>