In [25]:
import os
import json
import math

import mindspore as ms
import mindspore.nn as nn
import mindspore.numpy as mnp
from mindspore import ops
from mindspore import Parameter, Tensor
from mindspore.dataset import text
from mindspore.common import dtype as mstype
from mindspore.common.initializer import Uniform, HeUniform, initializer

import mindnlp
from mindnlp._legacy.abc import Seq2vecModel
from mindnlp.modules import Glove, StaticLSTM
from mindnlp.transforms import BasicTokenizer

from tqdm import tqdm
from bidaf.evaluate import evaluate
from bidaf.squad_process import SQuAD1_Process

In [26]:
# load datasets
# mindnlp returns the 'squad1' train and dev splits as two dataset objects;
# the column names of the raw training split are printed as a quick sanity check.
squad_train, squad_dev = mindnlp.load_dataset('squad1')
print(squad_train.get_col_names())

['id', 'context', 'question', 'answers', 'answer_start']


In [27]:
# load vocab and embedding
# Character vocabulary: a token's id is its position in the list below,
# so "<unk>" is 0, "<pad>" is 1 and the last entry "[" is 52.
char_dic = {
    token: index
    for index, token in enumerate([
        "<unk>", "<pad>", "e", "t", "a", "i", "n", "o", "s", "r",
        "h", "l", "d", "c", "u", "m", "f", "p", "g", "w",
        "y", "b", ",", "v", ".", "k", "1", "0", "x", "2",
        "\"", "-", "j", "9", "'", ")", "(", "?", "z", "5",
        "8", "q", "3", "4", "7", "6", ";", ":", "\u2013", "%",
        "/", "]", "[",
    ])
}
char_vocab = text.Vocab.from_dict(char_dic)

# The GloVe vocabulary can be downloaded from
# "https://download.mindspore.cn/toolkits/mindnlp/vocab/Glove/glove.6B.100d.txt";
# this cell expects it on disk as "glove.6B.100d.vocab.txt".
word_vocab = text.Vocab.from_file(
    "glove.6B.100d.vocab.txt",
    special_tokens=["<unk>", "<pad>"],
    special_first=True,
)
# Pretrained GloVe '6B' vectors with 100 dimensions, used as the word embedding layer.
word_embeddings = Glove.from_pretrained('6B', 100)

In [28]:
# process dataset
# NOTE(review): the positional True is presumably the tokenizer's lower-casing
# switch -- confirm against the BasicTokenizer signature.
tokenizer = BasicTokenizer(True)

print("=============ready to process dataset===========")
# Train and dev go through exactly the same preprocessing configuration,
# train first, then dev. The names are rebound to the processed datasets.
squad_train, squad_dev = [
    SQuAD1_Process(split, char_vocab, word_vocab, tokenizer=tokenizer,
                   max_context_len=768, max_question_len=64, max_char_len=48,
                   batch_size=8, drop_remainder=False)
    for split in (squad_train, squad_dev)
]
print("===================process over=================")




In [29]:
# construct bidirectional attention flow model(BiDAF)
def arange(start, stop, step, dtype):
    """Return ``mnp.arange(start, stop, step)`` wrapped as a Tensor of ``dtype``."""
    values = mnp.arange(start, stop, step)
    return Tensor(values, dtype)

def sequence_mask(lengths, maxlen):
    """Build a 0/1 validity mask from per-sequence lengths.

    Args:
        lengths: 1-D tensor of sequence lengths, one entry per batch element
            (the 2-axis transpose below requires exactly one axis here).
        maxlen: number of time steps the mask should cover.

    Returns:
        Tensor of shape (maxlen, batch) with the dtype of ``lengths``;
        entry [t, b] is 1 when ``t < lengths[b]`` and 0 otherwise.
    """
    positions = arange(0, maxlen, 1, lengths.dtype)
    # (batch, 1) so the comparison broadcasts against (maxlen,) -> (batch, maxlen)
    lengths_column = lengths.view(lengths.shape + (1,))
    batch_major = positions < lengths_column
    time_major = batch_major.transpose((1, 0))
    return time_major.astype(lengths.dtype)

def select_by_mask(inputs, mask):
    """Zero out the entries of ``inputs`` that the mask marks as padding.

    Args:
        inputs: tensor to mask; the callers in this notebook pass
            (batch, seq_len, features) LSTM outputs.
        mask: (seq_len, batch) mask as produced by ``sequence_mask``.

    Returns:
        ``inputs`` multiplied element-wise by the mask expanded to its shape.
    """
    keep = mask.view(mask.shape + (1,))   # (seq_len, batch, 1)
    keep = keep.swapaxes(0, 1)            # (batch, seq_len, 1)
    keep = keep.expand_as(inputs).astype(mstype.bool_)
    return keep * inputs

def get_hidden(output, seq_length):
    """Pick, for every batch element, the output at its last valid time step.

    The gather indices are (seq_length[b] - 1, b) pairs, so ``output`` is
    indexed as [time, batch, ...]. This helper is not called by any of the
    cells visible in this notebook.
    """
    batch_index = arange(0, seq_length.shape[0], 1, seq_length.dtype)
    last_step = seq_length.view(-1, 1) - 1
    indices = ops.concat((last_step, batch_index.view(-1, 1)), 1)
    return ops.gather_nd(output, indices)

In [30]:
# build encoder
class Encoder(nn.Module):
    """
    Encoder for BiDAF model.

    Applies the first three BiDAF layers to both the context and the question:
    a character embedding (char CNN followed by a max over the convolution
    positions), a word embedding, a two-layer highway network over their
    concatenation, and one bidirectional LSTM that is shared between context
    and question.
    """
    def __init__(self, char_vocab_size, char_vocab, char_dim, char_channel_size, char_channel_width, word_vocab,
                  word_embeddings, hidden_size, dropout):
        """
        Args:
            char_vocab_size: number of rows of the character embedding table.
            char_vocab: character vocabulary; only stored on the instance here.
            char_dim: width of each character embedding vector.
            char_channel_size: number of output channels of the char CNN.
            char_channel_width: kernel width of the char CNN along the word's characters.
            word_vocab: word vocabulary; only stored on the instance here.
            word_embeddings: module mapping word ids to word vectors.
            hidden_size: LSTM hidden size; the highway layers operate on
                ``hidden_size * 2`` features, so the concatenated char and word
                features must have that width.
            dropout: dropout probability for the char embeddings, also passed to the LSTM.
        """
        super().__init__()
        self.char_vocab = char_vocab
        self.char_dim = char_dim
        self.char_channel_width = char_channel_width
        self.char_channel_size = char_channel_size
        self.word_vocab = word_vocab
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(p=dropout)
        # Trainable character embedding table, initialised with Uniform(0.001).
        self.init_embed = initializer(Uniform(0.001), [char_vocab_size, char_dim])
        self.embed = Parameter(self.init_embed, name='embed')

        # 1. Character Embedding Layer
        # The Glove module is reused as a plain lookup layer over the trainable table above.
        self.char_emb = Glove(init_embed=self.embed, dropout=0.0)
        # The kernel spans the full char_dim axis, so the convolution only slides along the characters.
        # NOTE(review): the `net` repr printed later in this notebook shows has_bias=False for
        # this Conv2d, so `bias_init` appears to have no effect here -- confirm.
        self.char_conv = nn.Sequential(
            nn.Conv2d(1, char_channel_size, (char_dim, char_channel_width), pad_mode="pad",
                      weight_init=HeUniform(math.sqrt(5)), bias_init=Uniform(1 / math.sqrt(1))),
            nn.ReLU()
            )

        # 2. Word Embedding Layer
        self.word_emb = word_embeddings

        # highway network: two layers, each with a ReLU transform and a sigmoid gate
        self.highway_linear0 = nn.Dense(hidden_size * 2, hidden_size * 2,
                                        weight_init=HeUniform(math.sqrt(5)),
                                        bias_init=Uniform(1 / math.sqrt(hidden_size * 2)),
                                        activation=nn.ReLU())
        self.highway_linear1 = nn.Dense(hidden_size * 2, hidden_size * 2,
                                        weight_init=HeUniform(math.sqrt(5)),
                                        bias_init=Uniform(1 / math.sqrt(hidden_size * 2)),
                                        activation=nn.ReLU())
        self.highway_gate0 = nn.Dense(hidden_size * 2, hidden_size * 2,
                                      weight_init=HeUniform(math.sqrt(5)),
                                      bias_init=Uniform(1 / math.sqrt(hidden_size * 2)),
                                      activation=nn.Sigmoid())
        self.highway_gate1 = nn.Dense(hidden_size * 2, hidden_size * 2,
                                      weight_init=HeUniform(math.sqrt(5)),
                                      bias_init=Uniform(1 / math.sqrt(hidden_size * 2)),
                                      activation=nn.Sigmoid())

        # 3. Contextual Embedding Layer
        self.context_LSTM = StaticLSTM(input_size=hidden_size * 2, hidden_size=hidden_size,
                                    bidirectional=True, batch_first=True, dropout=dropout)
    
    def construct(self, c_char, q_char, c_word, q_word, c_lens, q_lens):
        """
        Encode a batch of contexts and questions.

        param c_char / q_char: character ids, (batch, seq_len, word_len)
        param c_word / q_word: word ids of the context / question
        param c_lens / q_lens: 1-D tensors with the unpadded length of each context / question
        return: (c, q), the LSTM outputs for context and question, with
                positions at or beyond each sequence's length set to zero
        """
        # 1. Character Embedding Layer
        c_char = self.char_emb_layer(c_char)
        q_char = self.char_emb_layer(q_char)

        # 2. Word Embedding Layer
        c_word = self.word_emb(c_word)
        q_word = self.word_emb(q_word)

        # Highway network
        c = self.highway_network(c_char, c_word)
        q = self.highway_network(q_char, q_word)
        
        # 3. Contextual Embedding Layer
        # The LSTM runs over the full padded sequence; the mask then zeroes the padded positions.
        c, _ = self.context_LSTM(c)
        mask = sequence_mask(c_lens, c.shape[1])
        c = select_by_mask(c, mask)

        # The same LSTM (shared weights) encodes the question.
        q, _ = self.context_LSTM(q)
        mask = sequence_mask(q_lens, q.shape[1])
        q = select_by_mask(q, mask)

        return c, q

    def char_emb_layer(self, x):
        """
        Embed the characters of every word and reduce them to one vector per word.

        param x: (batch, seq_len, word_len)
        return: (batch, seq_len, char_channel_size)
        """
        batch_size = x.shape[0]
        # x: [batch, seq_len, word_len, char_dim]
        x = self.dropout(self.char_emb(x))
        # x: [batch, seq_len, char_dim, word_len]
        x = ops.transpose(x, (0, 1, 3, 2))
        # x: [batch * seq_len, 1, char_dim, word_len]
        x = x.view(-1, self.char_dim, x.shape[3]).expand_dims(1)
        # x: [batch * seq_len, char_channel_size, 1, conv_len] -> [batch * seq_len, char_channel_size, conv_len]
        x = self.char_conv(x).squeeze(2)
        # x: [batch * seq_len, char_channel_size]
        # Element [1] of the ops.max result is used as the maximum values here.
        # NOTE(review): the order of the (value, index) pair returned by ops.max has differed
        # between MindSpore releases -- verify against the installed version.
        x = ops.max(x, axis=2)[1]
        # x: [batch, seq_len, char_channel_size]
        x = x.view(batch_size, -1, self.char_channel_size)

        return x

    def highway_network(self, x1, x2):
        """
        Concatenate char and word features and pass them through two highway layers.

        Each layer computes x = g * h + (1 - g) * x, with h the ReLU transform
        and g the sigmoid gate of the current x.

        param x1: (batch, seq_len, char_channel_size)
        param x2: (batch, seq_len, word_dim)
        return: (batch, seq_len, hidden_size * 2)
        """
        # [batch, seq_len, char_channel_size + word_dim]
        x = ops.concat((x1, x2), axis=-1)
        h = self.highway_linear0(x)
        g = self.highway_gate0(x)
        x = g * h + (1 - g) * x
        h = self.highway_linear1(x)
        g = self.highway_gate1(x)
        x = g * h + (1 - g) * x

        # [batch, seq_len, hidden_size * 2]
        return x

In [31]:
# build head
class Head(nn.Module):
    """
    Head for BiDAF model.

    Takes the contextual embeddings of context and question from the encoder
    and applies the attention flow layer, the two-LSTM modeling layer and the
    output layer, producing one start score and one end score per context
    position.
    """
    def __init__(self, hidden_size, dropout):
        """
        Args:
            hidden_size: LSTM hidden size; the encoder outputs fed to this head
                have ``hidden_size * 2`` features.
            dropout: dropout probability passed to every LSTM of the head.
        """
        super().__init__()
        # 4. Attention Flow Layer
        # The similarity score is the sum of three linear terms: one on c, one on q and one on c * q.
        self.att_weight_c = nn.Dense(hidden_size * 2, 1,
                                     weight_init=HeUniform(math.sqrt(5)),
                                     bias_init=Uniform(1 / math.sqrt(hidden_size * 2)))
        self.att_weight_q = nn.Dense(hidden_size * 2, 1,
                                     weight_init=HeUniform(math.sqrt(5)),
                                     bias_init=Uniform(1 / math.sqrt(hidden_size * 2)))
        self.att_weight_cq = nn.Dense(hidden_size * 2, 1,
                                      weight_init=HeUniform(math.sqrt(5)),
                                      bias_init=Uniform(1 / math.sqrt(hidden_size * 2)))
        self.softmax = nn.Softmax(axis=-1)
        self.batch_matmul = ops.BatchMatMul()

        # 5. Modeling Layer
        self.modeling_LSTM1 = StaticLSTM(input_size=hidden_size * 8, hidden_size=hidden_size,
                                      bidirectional=True, batch_first=True, dropout=dropout)
        self.modeling_LSTM2 = StaticLSTM(input_size=hidden_size * 2, hidden_size=hidden_size,
                                      bidirectional=True, batch_first=True, dropout=dropout)
        
        # 6. Output Layer
        # Start (p1) and end (p2) scores are each the sum of a linear map of g and one of m / m2.
        self.p1_weight_g = nn.Dense(hidden_size * 8, 1,
                                    weight_init=HeUniform(math.sqrt(5)),
                                    bias_init=Uniform(1 / math.sqrt(hidden_size * 8)))
        self.p1_weight_m = nn.Dense(hidden_size * 2, 1,
                                    weight_init=HeUniform(math.sqrt(5)),
                                    bias_init=Uniform(1 / math.sqrt(hidden_size * 2)))
        self.p2_weight_g = nn.Dense(hidden_size * 8, 1,
                                    weight_init=HeUniform(math.sqrt(5)),
                                    bias_init=Uniform(1 / math.sqrt(hidden_size * 8)))
        self.p2_weight_m = nn.Dense(hidden_size * 2, 1,
                                    weight_init=HeUniform(math.sqrt(5)),
                                    bias_init=Uniform(1 / math.sqrt(hidden_size * 2)))

        self.output_LSTM = StaticLSTM(input_size=hidden_size * 2, hidden_size=hidden_size,
                                   bidirectional=True, batch_first=True, dropout=dropout)

    def construct(self, c, q, c_lens):
        """
        param c: (batch, c_len, hidden_size * 2) contextual embedding of the context
        param q: (batch, q_len, hidden_size * 2) contextual embedding of the question
        param c_lens: 1-D tensor with the unpadded length of each context
        return: p1: (batch, c_len) start scores, p2: (batch, c_len) end scores
        """
        # 4. Attention Flow Layer
        g = self.att_flow_layer(c, q)  # c, q are generated from Contextual Embedding Layer in Encoder
        
        # 5. Modeling Layer
        # After each LSTM the outputs at padded context positions are zeroed.
        m, _ = self.modeling_LSTM1(g)
        mask = sequence_mask(c_lens, g.shape[1])
        m = select_by_mask(m, mask)

        m, _ = self.modeling_LSTM2(m)
        mask = sequence_mask(c_lens, m.shape[1])
        m = select_by_mask(m, mask)

        # 6. Output Layer
        p1, p2 = self.output_layer(g, m, c_lens)

        # [batch, c_len], [batch, c_len]
        return p1, p2

    def att_flow_layer(self, c, q):
        """
        Compute the query-aware context representation G.

        param c: (batch, c_len, hidden_size * 2)
        param q: (batch, q_len, hidden_size * 2)
        return: (batch, c_len, hidden_size * 8)
        """
        c_len = c.shape[1]
        q_len = q.shape[1]

        # Term of the similarity matrix that depends on the element-wise product c * q,
        # built one question position at a time.
        cq = []
        for i in range(q_len):
            # qi: [batch, 1, hidden_size * 2]
            qi = q.gather(ms.Tensor(i), axis=1).expand_dims(1)
            # ci: [batch, c_len, 1] -> [batch, c_len]
            ci = self.att_weight_cq(c * qi).squeeze(2)
            cq.append(ci)
        # cq: [batch, c_len, q_len]
        cq = ops.stack(cq, -1)

        # s: [batch, c_len, q_len]
        s = self.att_weight_c(c).broadcast_to((-1, -1, q_len)) + \
            self.att_weight_q(q).transpose((0, 2, 1)).broadcast_to((-1, c_len, -1)) + cq

        # Context-to-query attention: softmax over the question axis.
        # a: [batch, c_len, q_len]
        a = self.softmax(s)
        # c2q_att: [batch, c_len, hidden_size * 2]
        c2q_att = self.batch_matmul(a, q)
        # Query-to-context attention: softmax over context positions of the per-position maximum.
        # Element [1] of the ops.max result is used as the maximum values here.
        # NOTE(review): the order of the (value, index) pair returned by ops.max has differed
        # between MindSpore releases -- verify against the installed version.
        # b: [batch, 1, c_len]
        b = self.softmax(ops.max(s, axis=2)[1]).expand_dims(1)
        # q2c_att: [batch, hidden_size * 2]
        q2c_att = self.batch_matmul(b, c).squeeze(1)
        # q2c_att: [batch, c_len, hidden_size * 2]
        q2c_att = q2c_att.expand_dims(1).broadcast_to((-1, c_len, -1))

        # x: [batch, c_len, hidden_size * 8]
        x = ops.concat([c, c2q_att, c * c2q_att, c * q2c_att], axis=-1)
        return x

    def output_layer(self, g, m, l):
        """
        Score every context position as answer start (p1) and answer end (p2).

        param g: (batch, c_len, hidden_size * 8)
        param m: (batch, c_len ,hidden_size * 2)
        param l: 1-D tensor with the unpadded length of each context
        return: p1: (batch, c_len), p2: (batch, c_len)
        """
        # p1: [batch, c_len]
        p1 = (self.p1_weight_g(g) + self.p1_weight_m(m)).squeeze(2)
        # m2: [batch, c_len, hidden_size * 2]
        m2, _ = self.output_LSTM(m)
        mask = sequence_mask(l, m.shape[1])
        m2 = select_by_mask(m2, mask)
        # p2: [batch, c_len]
        p2 = (self.p2_weight_g(g) + self.p2_weight_m(m2)).squeeze(2)

        return p1, p2

In [32]:
class BiDAF(Seq2vecModel):
    """Full BiDAF reader: an encoder followed by a span-prediction head."""

    def __init__(self, encoder, head):
        super().__init__(encoder, head)
        self.encoder = encoder
        self.head = head

    def construct(self, c_char, q_char, c_word, q_word, c_lens, q_lens):
        """Return the (start, end) score tensors for a batch of context/question pairs."""
        context_repr, question_repr = self.encoder(c_char, q_char, c_word, q_word, c_lens, q_lens)
        start_scores, end_scores = self.head(context_repr, question_repr, c_lens)
        return start_scores, end_scores

In [33]:
# define some parameters
# These names are read by the model-construction, optimizer and training cells below.
char_vocab_size = len(char_vocab.vocab())  # rows of the character embedding table
char_dim = 8              # width of each character embedding vector
char_channel_width = 5    # char CNN kernel width (characters covered per filter)
char_channel_size = 100   # char CNN output channels = per-word character feature size
# The highway layers take hidden_size * 2 inputs, which must equal
# char_channel_size + word embedding dim (the GloVe vectors loaded above are 100-d).
hidden_size = 100
dropout = 0.2             # dropout probability handed to Encoder and Head
lr = 0.5                  # learning rate handed to the Adadelta optimizer
epochs = 6                # number of passes of the train/eval loop at the end of the notebook

In [34]:
# net
# Assemble the model: encoder (embedding layers + contextual LSTM) and span-prediction head.
encoder = Encoder(
    char_vocab_size=char_vocab_size,
    char_vocab=char_vocab,
    char_dim=char_dim,
    char_channel_size=char_channel_size,
    char_channel_width=char_channel_width,
    word_vocab=word_vocab,
    word_embeddings=word_embeddings,
    hidden_size=hidden_size,
    dropout=dropout,
)
head = Head(hidden_size=hidden_size, dropout=dropout)
net = BiDAF(encoder, head)



In [35]:
# Display the module hierarchy of the assembled model.
net



BiDAF<
  (encoder): Encoder<
    (dropout): Dropout<keep_prob=0.8>
    (char_emb): Glove<
      (dropout_layer): Dropout<p=0.0>
      >
    (char_conv): Sequential<
      (0): Conv2d<input_channels=1, output_channels=100, kernel_size=(8, 5), stride=(1, 1), pad_mode=pad, padding=0, dilation=(1, 1), group=1, has_bias=False, weight_init=<mindspore.common.initializer.HeUniform object at 0x7f30d4acd690>, bias_init=<mindspore.common.initializer.Uniform object at 0x7f303f9cfe10>, format=NCHW>
      (1): ReLU<>
      >
    (word_emb): Glove<
      (dropout_layer): Dropout<p=0.0>
      >
    (highway_linear0): Dense<
      input_channels=200, output_channels=200, has_bias=True, activation=ReLU<>
      (activation): ReLU<>
      >
    (highway_linear1): Dense<
      input_channels=200, output_channels=200, has_bias=True, activation=ReLU<>
      (activation): ReLU<>
      >
    (highway_gate0): Dense<
      input_channels=200, output_channels=200, has_bias=True, activation=Sigmoid<>
      (activa

In [40]:
# define Loss & Optimizer
class Loss(nn.Module):
    """Span loss: cross entropy of the start scores plus cross entropy of the end scores."""

    def __init__(self):
        super().__init__()
        # Build the criterion once here instead of re-creating it on every forward call.
        self.cross_entropy = nn.CrossEntropyLoss()

    def construct(self, logit1, logit2, s_idx, e_idx):
        """
        param logit1: (batch, c_len) start scores
        param logit2: (batch, c_len) end scores
        param s_idx: target start position for each example
        param e_idx: target end position for each example
        return: scalar loss, the sum of the start and end cross-entropy losses
        """
        return self.cross_entropy(logit1, s_idx) + self.cross_entropy(logit2, e_idx)

# Instantiate the span loss and an Adadelta optimizer over all trainable parameters of `net`.
loss_fn = Loss()
optimizer = nn.Adadelta(net.trainable_params(), learning_rate=lr)

In [41]:
def forward_fn(c_char, q_char, c_word, q_word, c_lens, q_lens, s_idx, e_idx):
    """Run the global ``net`` on one batch and return ``(loss, *model_outputs)``.

    The loss is placed first so that it is the value differentiated by the
    gradient function built from this callable.
    """
    span_scores = net(c_char, q_char, c_word, q_word, c_lens, q_lens)
    batch_loss = loss_fn(*span_scores, s_idx, e_idx)
    return (batch_loss,) + span_scores

# Gradients are taken with respect to the optimizer's parameters only (no input
# positions); has_aux=True marks every output after the first as auxiliary.
grad_fn = ms.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)

def train_step(c_char, q_char, c_word, q_word, c_lens, q_lens, s_idx, e_idx):
    """Do one optimisation step on a batch and return its loss."""
    outputs, grads = grad_fn(c_char, q_char, c_word, q_word, c_lens, q_lens, s_idx, e_idx)
    optimizer(grads)
    # outputs is (loss, *model_outputs); only the loss is reported to the caller
    return outputs[0]

def train_one_epoch(model, train_dataset, epoch=0):
    """Train for one pass over ``train_dataset`` with a progress bar.

    Args:
        model: network to switch to training mode (the update itself is done
            by ``train_step``).
        train_dataset: dataset yielding
            (ids, c_word, q_word, c_char, q_char, c_lens, q_lens, s_idx, e_idx) tuples.
        epoch: epoch number shown in the progress-bar description.
    """
    model.set_train()
    num_batches = train_dataset.get_dataset_size()
    running_loss = 0
    with tqdm(total=num_batches) as progress:
        progress.set_description('Epoch %i' % epoch)
        batches = train_dataset.create_tuple_iterator()
        for step, batch in enumerate(batches, start=1):
            _, c_word, q_word, c_char, q_char, c_lens, q_lens, s_idx, e_idx = batch
            batch_loss = train_step(c_char, q_char, c_word, q_word, c_lens, q_lens, s_idx, e_idx)
            running_loss += batch_loss.asnumpy()
            # show the mean loss over the steps completed so far
            progress.set_postfix(loss=running_loss / step)
            progress.update(1)

In [42]:
def test_loop(model, dataset, vocab, loss_fn):
    """Evaluate ``model`` on ``dataset`` and print SQuAD EM / F1 and the mean batch loss.

    For every example the answer span (start <= end) with the highest sum of
    start and end log-probabilities is selected, turned back into text through
    ``vocab`` and scored against "dev-v1.1.json" with ``evaluate``.

    Args:
        model: network returning (start scores, end scores), each (batch, c_len).
        dataset: dataset yielding
            (ids, c_word, q_word, c_char, q_char, c_lens, q_lens, s_idx, e_idx) tuples.
        vocab: word vocabulary used to map predicted token ids back to tokens.
        loss_fn: callable ``loss_fn(p1, p2, s_idx, e_idx)`` returning a scalar tensor.
    """
    # NOTE(review): negative word ids are shifted by this constant before the vocabulary
    # lookup; its origin is not visible in this notebook -- confirm against preprocessing.
    negative_id_offset = 188744

    model.set_train(False)
    total_loss = 0.0
    num_batches = 0
    answers = {}
    log_softmax = nn.LogSoftmax(axis=1)

    for ids, c_word, q_word, c_char, q_char, c_lens, q_lens, s_idx, e_idx in dataset.create_tuple_iterator():
        p1, p2 = model(c_char, q_char, c_word, q_word, c_lens, q_lens)
        total_loss += loss_fn(p1, p2, s_idx, e_idx).asnumpy().item()
        num_batches += 1

        # [batch, c_len]
        batch_size, c_len = p1.shape
        # mask[i, j] is -inf where start i > end j and 0 elsewhere; the isnan pass
        # cleans up any nan entries left by tril on an all -inf matrix.
        mask = mnp.tril((ops.ones((c_len, c_len), dtype=ms.float32) * float('-inf')),
                         k=-1).expand_dims(0).broadcast_to((batch_size, -1, -1))
        mask = mnp.where(ops.isnan(mask), ops.zeros_like(mask), mask)
        # score[b, i, j] = log P(start = i) + log P(end = j), restricted to i <= j
        score = (log_softmax(p1).expand_dims(2) + log_softmax(p2).expand_dims(1)) + mask
        # ops.max is unpacked as (index, value), exactly as in the original notebook.
        # NOTE(review): the tuple order has differed between MindSpore releases -- verify.
        best_start_per_end, score = ops.max(score, axis=1)
        pred_end, score = ops.max(score, axis=1)
        pred_start = ops.gather_elements(best_start_per_end, 1, pred_end.view(-1, 1)).squeeze(axis=1)

        # Convert once per batch instead of once per example / token.
        batch_ids = ids.asnumpy()
        token_ids = c_word.asnumpy()
        starts = pred_start.asnumpy()
        ends = pred_end.asnumpy()

        for i in range(batch_size):
            span = token_ids[i][int(starts[i]):int(ends[i]) + 1]

            answer_tokens = []
            for idx in span:
                idx = int(idx)
                if idx < 0:
                    idx = idx + negative_id_offset
                answer_tokens.append(vocab.ids_to_tokens(idx))
            answers[batch_ids[i].item()] = ' '.join(answer_tokens)

    # you can download the squad dev dataset from "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
    with open("dev-v1.1.json", encoding="utf-8") as dataset_file:
        dataset_json = json.load(dataset_file)
        squad_data = dataset_json['data']
    exact_match, f1 = evaluate(squad_data, answers)

    # Report the mean over batches; an empty dataset yields nan instead of crashing.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Test: \n EM: {exact_match:.3f}, F1: {f1:.3f}, Avg loss: {avg_loss:>8f} \n")

In [43]:
# Alternate one training pass over the train split with one evaluation on the dev split.
for epoch in range(epochs):
    train_one_epoch(net, squad_train, epoch)
    test_loop(net, squad_dev, word_vocab, loss_fn)
print("Done!")

Epoch 0:  15%|█▌        | 1688/10950 [10:15<56:15,  2.74it/s, loss=9.61] 


KeyboardInterrupt: 

=======================