# 基于BERT的自动问答
本实验基于 MindSpore 2.0，在启智平台上运行，使用的数据集是 SQuAD v1.1 数据集。

### 1.实验目的
得益于BERT等深度学习模型的不断发展，人工智能在阅读理解问答任务上的表现越来越出色。本实验将使用BERT算法实现一个基于阅读理解任务的问答系统。

### 2.BERT介绍
BERT 全称为 Bidirectional Encoder Representations from Transformers（来自Transformers的双向编码表示），是谷歌在论文《BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding》中提出的一个面向自然语言处理任务的无监督预训练语言模型，是近年来自然语言处理领域公认的里程碑模型。
BERT的创新在于使用Transformer Encoder（包含Multi-Head Self-Attention）作为特征提取器，并使用与之配套的掩码训练方法（Masked Language Model）。虽然使用了双向编码使得BERT不具有文本生成能力，但BERT在对输入文本的编码过程中，利用了每个词的所有上下文信息，与只能使用前序信息提取语义的单向编码器相比，BERT的语义信息提取能力更强。

### 3.实验环境
本实验基于 MindSpore 2.0，在启智平台的 Ascend NPU 环境上运行。

### 4.实验过程
步骤1 下载数据和导入依赖库

In [1]:

from download import download

# Archive with the helper sources (presumably the `src` package imported by later cells).
url = "https://ascend-professional-construction-dataset.obs.cn-north-4.myhuaweicloud.com:443/NLP/src.zip"

# Fetch the zip and unpack it into the current directory; replace=True asks
# `download` to overwrite an existing copy instead of reusing it.
download(url, "./", kind="zip", replace=True)


Downloading data from https://ascend-professional-construction-dataset.obs.cn-north-4.myhuaweicloud.com:443/NLP/src.zip (223 kB)

file_sizes: 100%|█████████████████████████████| 229k/229k [00:00<00:00, 779kB/s]
Extracting zip file...
Successfully downloaded / unzipped to ./


'./'

In [2]:

from download import download

# Archive with the experiment data (about 1.08 GB according to the download log below).
url = "https://ascend-professional-construction-dataset.obs.cn-north-4.myhuaweicloud.com:443/NLP/data.zip"

# Fetch the zip and unpack it into the current directory; replace=True asks
# `download` to overwrite an existing copy, so re-running this cell downloads again.
download(url, "./", kind="zip", replace=True)


Downloading data from https://ascend-professional-construction-dataset.obs.cn-north-4.myhuaweicloud.com:443/NLP/data.zip (1.08 GB)

file_sizes: 100%|██████████████████████████| 1.16G/1.16G [00:17<00:00, 67.1MB/s]
Extracting zip file...
Successfully downloaded / unzipped to ./


'./'

In [3]:
import os
import collections
from easydict import EasyDict as edict

#import mindspore.common.dtype as mstype
from mindspore import dtype as mstype
#import mindspore.dataset.transforms.c_transforms as C
import mindspore.dataset.transforms as C


#import mindspore.common.dtype as mstype
import mindspore.dataset as ds
from mindspore import context
from mindspore import log as logger
#from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
from mindspore.nn import DynamicLossScaleUpdateCell
#from mindspore.nn.optim import AdamWeightDecay, Lamb, Momentum
from mindspore.nn import AdamWeightDecay, Lamb, Momentum
#from mindspore.common.tensor import Tensor
from mindspore import Tensor
#from mindspore.train.model import Model
#from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
from mindspore.train import Model, TimeMonitor, CheckpointConfig, ModelCheckpoint
#from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore import load_checkpoint, load_param_into_net

from src.bert_for_finetune import BertSquadCell, BertSquad
from src.finetune_eval_config import optimizer_cfg
from src.bert_model import BertConfig
from src.utils import make_directory, LossCallBack, LoadNewestCkpt, BertLearningRate



_cur_dir = os.getcwd()

步骤2	运行环境配置

In [4]:
# Run MindSpore in graph mode on an Ascend device; the device id is left at its default.
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")  #, device_id=0

步骤3 定义超参数

In [5]:
# Experiment settings shared by the training and evaluation cells.
# Boolean-like options are stored as the strings "true"/"false"; the shuffle
# flags are converted by the callers with `.lower() == "true"`.
args_opt = edict({
    # NOTE(review): device_target, do_train, do_eval and num_class are not read
    # by any of the visible cells.
    "device_target":"Ascend",
    "do_train":"true",
    "do_eval":"true",
    "epoch_num":3,
    "num_class":2,
    "train_data_shuffle":"false",
    "eval_data_shuffle":"false",
    "train_batch_size":32,
    "eval_batch_size":1,
    # Vocabulary used by the tokenizer when building evaluation features.
    "vocab_file_path":"./data/vocab_bert_large_en.txt",
    # Directory handed to do_train as save_checkpoint_path.
    "save_finetune_checkpoint_path":"./ckpt/",
    # Pretrained weights loaded before fine-tuning.
    "load_pretrain_checkpoint_path":"./data/pretrain_ckpt/bert_base.ckpt",
    # Fine-tuned weights loaded by do_eval; the file must exist before evaluation runs.
    "load_finetune_checkpoint_path":"./ckpt/squad-3_2745.ckpt",
    "train_data_file_path":"./data/train.tf_record",
    "eval_json_path":"./data/dev-v1.1.json",
    # Empty string means "no schema file" in create_squad_dataset.
    "schema_file_path":""
})

In [6]:
# Model configuration passed to BertSquad for both training and evaluation.
bert_net_cfg = BertConfig(
    # Also used as max_seq_length when evaluation features are built.
    seq_length=384,
    # NOTE(review): the tokenizer uses ./data/vocab_bert_large_en.txt and the
    # sample batch shows English token ids; vocab_size must cover every id the
    # vocabulary can produce and match the embedding table stored in the
    # pretrained checkpoint — confirm that 21128 is correct for this setup.
    vocab_size=21128,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    # Values are float32 for dtype and float16 for compute_type; presumably
    # parameter storage vs. computation precision — confirm in src.bert_model.
    dtype=mstype.float32,
    compute_type=mstype.float16,
)

步骤4	数据预览
    使用json模块导入训练数据train-v1.1.json：

In [7]:
import json

# Read the raw training file only to look at its structure. The context
# manager closes the handle as soon as parsing is done, and the explicit
# encoding keeps the result independent of the platform's default locale.
with open("./data/train-v1.1.json", encoding="utf-8") as train_json_file:
    data = json.load(train_json_file)
print(data["data"][0]["paragraphs"][0])

{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'qas': [{'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}], 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'id': '5733be284776f41900661182'}, {'answers': [{'answer_start': 188, 'text': 'a copper statue of Christ'}], 'question

从train-v1.1.json的数据内容来看，通过json模块载入后得到一个字典，数据集中在“data”键下。data["data"][0]["paragraphs"][0]中，第一个[0]表示取“data”列表中的第一个元素，“paragraphs”表示该元素的段落列表，第二个[0]表示取其中的第一个段落。上面的输出结果即为一个段落的内容。
我们可以看到，每个段落包含一段文本“context”，以及问答部分“qas”，其中每个问题都与对应的答案配对。

步骤5 定义数据预处理函数

In [8]:
def generator_squad(data_features):
    """Yield one evaluation sample per feature.

    Each yielded tuple is ``(input_ids, input_mask, segment_ids, unique_id)``,
    read from the attributes of the same names on the feature object.
    """
    for sample in data_features:
        record = (
            sample.input_ids,
            sample.input_mask,
            sample.segment_ids,
            sample.unique_id,
        )
        yield record
        
def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None,
                         schema_file_path=None, is_training=True, do_shuffle=True):
    """Build the batched SQuAD dataset for fine-tuning or evaluation.

    Args:
        batch_size: Samples per batch; an incomplete trailing batch is dropped.
        repeat_count: Accepted for compatibility only; no repeat operation is
            applied to the dataset.
        data_file_path: TFRecord file path when ``is_training`` is true,
            otherwise the iterable of features handed to ``generator_squad``.
        schema_file_path: Schema for the TFRecord reader; ``""`` means no schema.
        is_training: True reads the TFRecord file, False wraps the feature
            generator.
        do_shuffle: Forwarded as the ``shuffle`` argument of the dataset source.

    Returns:
        The batched dataset with the id, mask and position columns cast to int32.
    """
    to_int32 = C.TypeCast(mstype.int32)

    if is_training:
        schema = None if schema_file_path == "" else schema_file_path
        train_columns = ["input_ids", "input_mask",
                         "segment_ids", "start_positions",
                         "end_positions", "unique_ids", "is_impossible"]
        data_set = ds.TFRecordDataset([data_file_path],
                                      schema,
                                      columns_list=train_columns,
                                      shuffle=do_shuffle)
        # The answer span columns only exist in the training records.
        cast_columns = ("start_positions", "end_positions")
    else:
        eval_columns = ["input_ids", "input_mask", "segment_ids", "unique_ids"]
        data_set = ds.GeneratorDataset(generator_squad(data_file_path),
                                       shuffle=do_shuffle,
                                       column_names=eval_columns)
        cast_columns = ()

    # Columns shared by both modes, cast in the same order as before.
    cast_columns += ("segment_ids", "input_mask", "input_ids", "unique_ids")
    for column in cast_columns:
        data_set = data_set.map(operations=to_int32, input_columns=column)

    # No repeat() here: the number of passes is controlled by the caller.
    return data_set.batch(batch_size, drop_remainder=True)

In [9]:
# Build the training dataset and pull a single batch to inspect its columns.
shuffle_train = args_opt.train_data_shuffle.lower() == "true"
train_ds = create_squad_dataset(
    batch_size=args_opt.train_batch_size,
    repeat_count=1,
    data_file_path=args_opt.train_data_file_path,
    schema_file_path=args_opt.schema_file_path,
    do_shuffle=shuffle_train,
)
item = train_ds.create_dict_iterator()
next(item)

{'input_ids': Tensor(shape=[32, 384], dtype=Int32, value=
 [[ 101, 1999, 2054 ...    0,    0,    0],
  [ 101, 2040, 2020 ...    0,    0,    0],
  [ 101, 2054, 2003 ...    0,    0,    0],
  ...
  [ 101, 2043, 2001 ...    0,    0,    0],
  [ 101, 2054, 2173 ...    0,    0,    0],
  [ 101, 2054, 2846 ...    0,    0,    0]]),
 'input_mask': Tensor(shape=[32, 384], dtype=Int32, value=
 [[1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 0, 0, 0],
  ...
  [1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 0, 0, 0]]),
 'segment_ids': Tensor(shape=[32, 384], dtype=Int32, value=
 [[0, 0, 0 ... 0, 0, 0],
  [0, 0, 0 ... 0, 0, 0],
  [0, 0, 0 ... 0, 0, 0],
  ...
  [0, 0, 0 ... 0, 0, 0],
  [0, 0, 0 ... 0, 0, 0],
  [0, 0, 0 ... 0, 0, 0]]),
 'start_positions': Tensor(shape=[32, 1], dtype=Int32, value=
 [[22],
  [51],
  [48],
  ...
  [31],
  [20],
  [76]]),
 'end_positions': Tensor(shape=[32, 1], dtype=Int32, value=
 [[22],
  [58],
  [51],
  ...
  [31],
  [20],
  [79]]),
 'uniqu

模型的输入包括：input_ids, input_mask, segment_ids, start_positions, end_positions, unique_ids, is_impossible。分别代表输入token的id、标记有效token的掩码（padding位置为0）、token属于哪一句话、答案的开始位置、答案的结束位置、特征的唯一id，以及该问题是否无法回答。

步骤6 定义训练函数

In [10]:
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
    """Fine-tune ``network`` on ``dataset``, starting from a pretrained checkpoint.

    Checkpoints are written with the prefix ``"squad"`` into
    ``save_checkpoint_path`` once per epoch, so a run over N steps per epoch
    ends with ``squad-<epoch_num>_<N>.ckpt`` in that directory.

    Args:
        dataset: Batched training dataset; its size is used as the number of
            steps per epoch.
        network: Network with loss to be fine-tuned.
        load_checkpoint_path: Pretrained checkpoint loaded into ``network``
            before training. Must not be empty.
        save_checkpoint_path: Directory for the fine-tuned checkpoints. An
            empty string passes ``directory=None`` to ``ModelCheckpoint``.
        epoch_num: Number of training epochs.

    Raises:
        ValueError: If ``load_checkpoint_path`` is an empty string.
    """
    if load_checkpoint_path == "":
        raise ValueError("Pretrain model missed, finetune task must load pretrain model!")
    steps_per_epoch = dataset.get_dataset_size()
    total_steps = steps_per_epoch * epoch_num

    # Optimizer: AdamWeightDecay with warm-up over the first 10% of all steps,
    # followed by polynomial decay until the last step.
    adam_cfg = optimizer_cfg.AdamWeightDecay
    lr_schedule = BertLearningRate(learning_rate=adam_cfg.learning_rate,
                                   end_learning_rate=adam_cfg.end_learning_rate,
                                   warmup_steps=int(total_steps * 0.1),
                                   decay_steps=total_steps,
                                   power=adam_cfg.power)
    params = network.trainable_params()
    # Weight decay is applied only to the parameters accepted by decay_filter.
    decay_params = [p for p in params if adam_cfg.decay_filter(p)]
    other_params = [p for p in params if not adam_cfg.decay_filter(p)]
    group_params = [{'params': decay_params, 'weight_decay': adam_cfg.weight_decay},
                    {'params': other_params, 'weight_decay': 0.0}]
    optimizer = AdamWeightDecay(group_params, lr_schedule, eps=adam_cfg.eps)

    # Load the pretrained weights into the network.
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertSquadCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)

    # Save one checkpoint per epoch into the requested directory. The interval
    # is derived from the dataset size rather than a fixed step count, so it
    # stays correct when the batch size or the training data changes.
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="squad",
                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
                                 config=ckpt_config)

    callbacks = [ckpoint_cb, TimeMonitor(steps_per_epoch)]

    model.train(epoch_num, dataset, callbacks=callbacks)

步骤7 执行训练  
加载bert_base.ckpt时，由于模型中有5个参数无法从该文件加载，会出现警告；如果希望消除警告，可以先训练得到一个ckpt文件（如CKP_1-3_2745.ckpt），再用它替换bert_base.ckpt，这样所有参数都能加载成功，不再出现警告。

In [12]:
#!export GLOG_v=1

# Network to fine-tune, built from the shared BERT configuration.
netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)

# Recreate the training dataset so that training starts from a fresh pipeline.
shuffle_train = args_opt.train_data_shuffle.lower() == "true"
train_ds = create_squad_dataset(
    batch_size=args_opt.train_batch_size,
    repeat_count=1,
    data_file_path=args_opt.train_data_file_path,
    schema_file_path=args_opt.schema_file_path,
    do_shuffle=shuffle_train,
)

do_train(
    dataset=train_ds,
    network=netwithloss,
    load_checkpoint_path=args_opt.load_pretrain_checkpoint_path,
    save_checkpoint_path=args_opt.save_finetune_checkpoint_path,
    epoch_num=args_opt.epoch_num,
)

Train epoch time: 501446.709 ms, per step time: 182.676 ms
Train epoch time: 408718.344 ms, per step time: 148.896 ms
Train epoch time: 403629.564 ms, per step time: 147.042 ms


步骤8 定义评估函数

In [13]:
def do_eval(dataset=None, load_checkpoint_path="", eval_batch_size=1):
    """Run the fine-tuned model over ``dataset`` and collect the raw span logits.

    Args:
        dataset: Evaluation dataset providing the columns ``input_ids``,
            ``input_mask``, ``segment_ids`` and ``unique_ids``.
        load_checkpoint_path: Fine-tuned checkpoint to load. Must not be empty.
        eval_batch_size: Kept for backward compatibility. The number of rows is
            read from each predicted batch, so a value that differs from the
            dataset's batch size can no longer drop samples or index past the
            end of a batch.

    Returns:
        A list of ``RawResult(unique_id, start_logits, end_logits)`` tuples,
        one per evaluated sample, in iteration order.

    Raises:
        ValueError: If ``load_checkpoint_path`` is an empty string.
    """
    # Build the inference network and load the fine-tuned parameters.
    if load_checkpoint_path == "":
        raise ValueError("Finetune model missed, evaluation task must load finetune model!")
    net = BertSquad(bert_net_cfg, False, 2)
    net.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net, param_dict)
    model = Model(net)

    RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"])
    columns_list = ["input_ids", "input_mask", "segment_ids", "unique_ids"]

    # Dummy values for the label arguments of predict(); they are constant, so
    # they are created once instead of on every batch.
    start_positions = Tensor([1], mstype.float32)
    end_positions = Tensor([1], mstype.float32)
    is_impossible = Tensor([1], mstype.float32)

    output = []
    # Predict batch by batch over a single pass of the evaluation dataset.
    for data in dataset.create_dict_iterator(num_epochs=1):
        input_ids, input_mask, segment_ids, unique_ids = [data[name] for name in columns_list]
        logits = model.predict(input_ids, input_mask, segment_ids, start_positions,
                               end_positions, unique_ids, is_impossible)
        ids = logits[0].asnumpy()
        start = logits[1].asnumpy()
        end = logits[2].asnumpy()

        # Iterate over the rows that are actually present in this batch.
        for uid, start_row, end_row in zip(ids, start, end):
            output.append(RawResult(
                unique_id=int(uid),
                start_logits=[float(x) for x in start_row.flat],
                end_logits=[float(x) for x in end_row.flat]))
    return output

步骤9 评估模型效果
模型评估针对dev-v1.1.json中的数据，通过测试集评估训练效果。首先对测试集数据进行预处理：

In [14]:
from src import tokenization
from src.create_squad_data import read_squad_examples, convert_examples_to_features
from src.squad_get_predictions import write_predictions
from src.squad_postprocess import SQuad_postprocess
# Tokenizer built from the configured vocabulary file.
tokenizer = tokenization.FullTokenizer(vocab_file=args_opt.vocab_file_path, do_lower_case=True)

# Parse the dev set and convert every example into model input features.
eval_examples = read_squad_examples(args_opt.eval_json_path, False)
eval_features = convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=bert_net_cfg.seq_length,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=None,
    vocab_file=args_opt.vocab_file_path,
)

# Wrap the features in an evaluation dataset.
shuffle_eval = args_opt.eval_data_shuffle.lower() == "true"
eval_ds = create_squad_dataset(
    batch_size=args_opt.eval_batch_size,
    repeat_count=1,
    data_file_path=eval_features,
    schema_file_path=args_opt.schema_file_path,
    is_training=False,
    do_shuffle=shuffle_eval,
)

# Predict, decode the answer spans and score them against the dev set.
outputs = do_eval(
    dataset=eval_ds,
    load_checkpoint_path=args_opt.load_finetune_checkpoint_path,
    eval_batch_size=args_opt.eval_batch_size,
)
# NOTE(review): 20, 30, True are presumably n_best_size, max_answer_length and
# do_lower_case — confirm against src.squad_get_predictions.
all_predictions = write_predictions(eval_examples, eval_features, outputs, 20, 30, True)
SQuad_postprocess(args_opt.eval_json_path, all_predictions, output_metrics="output.json")

{"exact_match": 41.83538315988647, "f1": 54.42541185741247}


步骤10 单例评估

In [15]:
one_example_path = "./data/my_example.json"

# Show the raw example. The context manager closes the file right after
# parsing, and the explicit encoding avoids depending on the platform locale.
with open(one_example_path, encoding="utf-8") as example_file:
    data = json.load(example_file)
print(data["data"][0]["paragraphs"][0])

# Convert the single example into model input features.
one_examples = read_squad_examples(one_example_path, False)
one_features = convert_examples_to_features(
    examples=one_examples,
    tokenizer=tokenizer,
    max_seq_length=bert_net_cfg.seq_length,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=None,
    vocab_file=args_opt.vocab_file_path)

# The dataset and do_eval must agree on the batch size, so both use the same
# value instead of mixing a literal with args_opt.eval_batch_size.
one_batch_size = 1
one_ds = create_squad_dataset(batch_size=one_batch_size, repeat_count=1,
                          data_file_path=one_features,
                          schema_file_path=args_opt.schema_file_path, is_training=False,
                          do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
outputs = do_eval(one_ds, args_opt.load_finetune_checkpoint_path, one_batch_size)
all_predictions = write_predictions(one_examples, one_features, outputs, 2, 30, True)
print(all_predictions)

{'context': 'Dog is the best friend of human.', 'qas': [{'answers': [{'answer_start': 0, 'text': 'Dog'}], 'question': 'Who is the best friend of human?', 'id': '56be4db0acb8001400a502ec'}]}
OrderedDict([('56be4db0acb8001400a502ec', 'Dog')])
