# 利用BERT模型及台達閱讀理解資料集(DRCD) 實作中文問答系統

This notebook is inspired by the Keras documentation code example "BERT (from HuggingFace Transformers) for Text Extraction".

https://keras.io/examples/nlp/text_extraction_with_bert/

**Author:** [Vincent Wu](https://twitter.com/pleomax_wu)<br>
**Contact:** pleomax0730@gmail.com<br>
**Date created:** 2020/07/17<br>
**Last modified:** 2021/05/20<br>
**Description:** Fine-tune pretrained BERT on DRCD.

# Delta Reading Comprehension Dataset 
台達閱讀理解資料集 Delta Reading Comprehension Dataset (DRCD) 屬於通用領域繁體中文機器閱讀理解資料集。
本資料集期望成為適用於遷移學習之標準中文閱讀理解資料集。
本資料集從2,108篇維基條目中整理出10,014篇段落，並從段落中標註出30,000多個問題。

關於資料集之更詳細資訊請洽詢論文：
For more information, please refer to the paper: https://arxiv.org/abs/1806.00920

## Data format 資料格式

- version : <String> 資料集版本
- data : <Array>
  - title : <String> : 文章標題
  - id : <String> : 文章編號
  - paragraphs : <Array>
    - id : <String> : 文章編號_段落編號
    - context : <String> : 段落內容
    - qas : <Array>
      - question : <String> : 問題內容
      - id : <String> : 文章編號_段落編號_問題編號
      - answers : <Array>
        - answer_start : <int> : text在文中位置
        - id : <String> : "1"表示為人工標註的答案，"2"以上為人工答題的答案
        - text : <String> : 答案內容

**References:**

- [BERT](https://arxiv.org/pdf/1810.04805.pdf)
- [DRCD](https://arxiv.org/abs/1806.00920)

In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 2.1MB 8.2MB/s 
[K     |████████████████████████████████| 3.3MB 37.0MB/s 
[K     |████████████████████████████████| 901kB 54.0MB/s 
[?25h

In [None]:
import os
import re
import json
import string
import numpy as np
import logging
import tensorflow as tf
import time
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForQuestionAnswering
from pprint import pprint
from collections import Counter
from tqdm import tqdm

In [None]:
# Fixed length, in tokens, that every encoded example is padded to.
max_len = 384
# Used when the tokenized context plus question exceed max_len: the context is
# then cut into candidate windows of `stride - 2` tokens (see
# SquadExample.preprocess).
stride = 128
# Default BERT configuration (default parameters).
# NOTE(review): not referenced by any other cell visible in this notebook.
configuration = BertConfig()

In [None]:
# Only show errors; this silences the tokenizer's sequence-length warnings.
logging.basicConfig(level=logging.ERROR)

# Load the pretrained tokenizer and keep a local copy of its files.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
save_path = "bert_base_chinese/"
os.makedirs(save_path, exist_ok=True)
tokenizer.save_pretrained(save_path)

# Download the data.
# get_file reuses a cached file with the same name, so dataset-specific names
# are used here; a generic "train.json"/"eval.json" left in the cache by a
# different dataset would otherwise be loaded silently.
train_data_url = "https://raw.githubusercontent.com/DRCKnowledgeTeam/DRCD/master/DRCD_training.json"
train_path = keras.utils.get_file("DRCD_training.json", train_data_url)
eval_data_url = "https://github.com/DRCKnowledgeTeam/DRCD/blob/master/DRCD_dev.json?raw=true"
eval_path = keras.utils.get_file("DRCD_dev.json", eval_data_url)

with open(train_path, "r", encoding="UTF-8") as f:
    raw_train_data = json.load(f)

with open(eval_path, "r", encoding="UTF-8") as f:
    raw_eval_data = json.load(f)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268943.0, style=ProgressStyle(descripti…


Downloading data from https://raw.githubusercontent.com/DRCKnowledgeTeam/DRCD/master/DRCD_training.json
Downloading data from https://github.com/DRCKnowledgeTeam/DRCD/blob/master/DRCD_dev.json?raw=true


In [None]:
# modified from ref: https://github.com/google-research/bert/blob/master/run_squad.py

def check_is_max_context(spans, tokenized_ans):
    """Choose the span in which the answer has the most surrounding context.

    Every occurrence of ``tokenized_ans`` as a contiguous run inside a span is
    scored with ``min(tokens to the left, tokens to the right)`` plus
    ``0.01 * len(span)``; the highest score wins and the first candidate is
    kept on ties.

    Args:
        spans: list of candidate spans, each a list of token ids.  The spans
            are only read, never modified.
        tokenized_ans: list of token ids of the answer.

    Returns:
        A tuple ``(best_span, start, end)`` where ``best_span`` is one of the
        objects in ``spans`` and ``start``/``end`` are the inclusive indices
        of the answer inside it, or ``None`` when the answer is empty or does
        not occur in any span.
    """
    ans_len = len(tokenized_ans)
    if ans_len == 0:
        # An empty answer would "match" at every position; treat as not found.
        return None

    best_score = None
    best_match = None
    for span in spans:
        span_len = len(span)
        for start in range(span_len - ans_len + 1):
            if span[start:start + ans_len] != tokenized_ans:
                continue
            end = start + ans_len - 1
            num_left_context = start
            num_right_context = span_len - (end + 1)
            score = min(num_left_context, num_right_context) + 0.01 * span_len
            if best_score is None or score > best_score:
                best_score = score
                best_match = (span, start, end)

    if best_match is None:
        logging.debug("answer %s not found in any of %d spans",
                      tokenized_ans, len(spans))
        return None

    return best_match

In [None]:
class SquadExample:
    """One DRCD question/answer pair and the BERT inputs derived from it.

    ``preprocess`` encodes the example with the notebook-level ``tokenizer``
    and stores ``input_ids``, ``token_type_ids``, ``attention_mask``,
    ``start_token_idx`` and ``end_token_idx`` on the instance, with the three
    input lists padded to ``max_len``.  When the example cannot be encoded,
    ``skip`` is set to True and those attributes are left unset.
    """

    def __init__(self, question, context, start_char_idx, answer_text,
                 all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    @staticmethod
    def _find_answer_start(context_ids, answer_ids):
        """Return the start index of the last occurrence of ``answer_ids``
        strictly between the first and last token of ``context_ids``, or None.
        """
        ans_len = len(answer_ids)
        # Highest valid start keeps the match clear of the trailing [SEP].
        for idx in range(len(context_ids) - 1 - ans_len, 0, -1):
            if context_ids[idx:idx + ans_len] == answer_ids:
                return idx
        return None

    def _store_features(self, context_ids, question_ids, start_token_idx,
                        end_token_idx):
        """Pad and store the model inputs; mark the example skipped when the
        sequence does not fit into ``max_len``.
        """
        input_ids = context_ids + question_ids
        padding_length = max_len - len(input_ids)
        if padding_length < 0:
            # Question too long to fit next to the context window.
            self.skip = True
            return

        padding = [0] * padding_length
        self.input_ids = input_ids + padding
        # Segment 0 = context, segment 1 = question, padding uses 0.
        self.token_type_ids = ([0] * len(context_ids)
                               + [1] * len(question_ids) + padding)
        self.attention_mask = [1] * len(input_ids) + padding
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx

    def preprocess(self):
        """Tokenize the example and locate the answer span in token space.

        The input layout is ``[CLS] context [SEP] question [SEP]``.  If the
        whole context fits together with the question, the last occurrence of
        the answer tokens inside the context is used.  Otherwise the context
        is replaced by the window of ``stride - 2`` tokens selected by
        ``check_is_max_context``.

        Sets ``self.skip = True`` and returns early when the annotated answer
        runs past the end of the context, when the answer has no tokens, when
        the answer tokens cannot be found in the context, or when the encoded
        sequence would exceed ``max_len``.
        """
        context = self.context
        answer_text = self.answer_text

        # The annotated answer must lie inside the context.  An answer ending
        # exactly at the end of the context is valid, hence ``>`` not ``>=``.
        end_char_idx = self.start_char_idx + len(answer_text)
        if end_char_idx > len(context):
            self.skip = True
            return

        tokenized_context = tokenizer.encode(context)  # [CLS] ... [SEP]
        tokenized_ans = tokenizer.encode(answer_text, add_special_tokens=False)
        tokenized_question = tokenizer.encode(self.question)[1:]  # ... [SEP]

        if not tokenized_ans:
            # Nothing to locate (e.g. empty or whitespace-only answer).
            self.skip = True
            return

        if len(tokenized_context) + len(tokenized_question) <= max_len:
            context_ids = tokenized_context
            # Search the context segment only, so that a repetition of the
            # answer inside the question can never become the label.
            start_token_idx = self._find_answer_start(tokenized_context,
                                                      tokenized_ans)
            # In rare cases the answer tokens cannot be found, e.g. "90%" is
            # tokenized as ["90", "%"] while the annotated answer is "9".
            if start_token_idx is None:
                self.skip = True
                return
        else:
            context_no_special = tokenized_context[1:-1]
            window = stride - 2
            answer_token_set = set(tokenized_ans)
            spans = []
            for idx in range(len(context_no_special)):
                candidate = context_no_special[idx:idx + window]
                if answer_token_set.issubset(candidate):
                    spans.append(candidate)

            result = check_is_max_context(spans, tokenized_ans)
            # Same rare tokenization mismatch as above.
            if not result:
                self.skip = True
                return

            final_span, start, _ = result
            # Re-attach the special tokens produced by the tokenizer itself.
            context_ids = ([tokenized_context[0]] + final_span
                           + [tokenized_context[-1]])
            start_token_idx = start + 1  # "[CLS]" token offset

        end_token_idx = start_token_idx + len(tokenized_ans) - 1
        logging.debug("tokenized_ans: %s, token span: [%d, %d]",
                      tokenized_ans, start_token_idx, end_token_idx)

        self._store_features(context_ids, tokenized_question,
                             start_token_idx, end_token_idx)

In [None]:
def create_squad_examples(raw_data):
    """Build and preprocess one ``SquadExample`` per question in ``raw_data``.

    ``raw_data`` follows the DRCD layout: ``data`` -> ``paragraphs`` ->
    ``qas``.  The first entry of ``answers`` supplies the answer text and its
    start offset; the texts of all entries are kept as ``all_answers``.
    Every example is returned, including those that ``preprocess`` marked
    with ``skip``.
    """
    examples = []
    paragraphs = (
        paragraph
        for article in raw_data["data"]
        for paragraph in article["paragraphs"]
    )
    for paragraph in paragraphs:
        passage = paragraph["context"]
        for qa in paragraph["qas"]:
            question = qa["question"]
            answers = qa["answers"]
            primary_text = answers[0]["text"]
            answer_texts = [answer["text"] for answer in answers]
            primary_start = answers[0]["answer_start"]
            example = SquadExample(question, passage, primary_start,
                                   primary_text, answer_texts)
            example.preprocess()
            examples.append(example)
    return examples

In [None]:
def create_inputs_targets(squad_examples):
    """Stack the encoded features of all non-skipped examples into arrays.

    Args:
        squad_examples: iterable of preprocessed examples.  Examples whose
            ``skip`` flag is set are ignored.

    Returns:
        ``(x, y)`` where ``x`` is ``[input_ids, token_type_ids,
        attention_mask]`` and ``y`` is ``[start_token_idx, end_token_idx]``,
        each a NumPy array with one row/entry per kept example.
    """
    keys = (
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_token_idx",
        "end_token_idx",
    )
    usable = [example for example in squad_examples if not example.skip]
    arrays = {
        key: np.asarray([getattr(example, key) for example in usable])
        for key in keys
    }

    x = [
        arrays["input_ids"],
        arrays["token_type_ids"],
        arrays["attention_mask"],
    ]
    y = [arrays["start_token_idx"], arrays["end_token_idx"]]
    return x, y

In [None]:
# Build and preprocess the training examples, reporting the wall-clock time.
start_time = time.time()
train_squad_examples = create_squad_examples(raw_train_data)
end_time = time.time()
print(f"Took {end_time - start_time:.2f} seconds to preprocess raw train data.")

# Same for the evaluation examples.
start_time = time.time()
eval_squad_examples = create_squad_examples(raw_eval_data)
end_time = time.time()
print(f"Took {end_time - start_time:.2f} seconds to preprocess raw eval data.")

Token indices sequence length is longer than the specified maximum sequence length for this model (572 > 512). Running this sequence through the model will result in indexing errors


Took 184.96 seconds to preprocess raw train data.
Took 26.42 seconds to preprocess raw eval data.


In [None]:
# Convert the examples to arrays; skipped examples are left out, so the
# difference between the two counts is the number of dropped samples.
x_train, y_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} training points created.")
print(f"{len(x_train[0])} valid train samples with length {max_len}.")
print(f"Dropped {len(train_squad_examples) - len(x_train[0])} invalid train samples.")

# Same for the evaluation set.
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")
print(f"{len(x_eval[0])} valid samples with length {max_len}.")
print(f"Dropped {len(eval_squad_examples) - len(x_eval[0])} invalid samples.")

26936 training points created.
26890 valid train samples with length 384.
Dropped 46 invalid train samples.
3524 evaluation points created.
3520 valid samples with length 384.
Dropped 4 invalid samples.


In [None]:
def create_model():
    """Build and compile the extractive QA model on top of BERT.

    The pretrained ``bert-base-chinese`` encoder is followed by two bias-free
    ``Dense(1)`` heads that produce one start logit and one end logit per
    token.  The model takes ``[input_ids, token_type_ids, attention_mask]``
    (each of length ``max_len``) and outputs ``[start_logits, end_logits]``.

    Returns:
        The compiled ``keras.Model``.
    """
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-chinese")

    ## QA Model
    input_ids = layers.Input(shape=(max_len, ), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len, ), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len, ), dtype=tf.int32)
    embedding = encoder(
        input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask)[0]  # take sequence outputs (bsz, seq_len, hidden_size)

    start_logits = layers.Dense(1, use_bias=False)(embedding)  # (bsz, seq_len, 1)
    start_logits = layers.Flatten(name="start_logit")(start_logits)  # (bsz, seq_len)

    end_logits = layers.Dense(1, use_bias=False)(embedding)
    end_logits = layers.Flatten(name="end_logit")(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_logits, end_logits],
    )
    # The heads output raw logits (no softmax layer), hence from_logits=True.
    # The dict keys are the names of the two Flatten output layers.
    losses = {
        "start_logit": keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        "end_logit": keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    }
    # `learning_rate` is the supported argument name; the `lr` alias is
    # deprecated and rejected by current Keras releases.
    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(optimizer=optimizer, loss=losses)
    return model


# Build the compiled QA model and print its layer summary.
model = create_model()
model.summary()

#  EM&F1參考
中文
https://github.com/ymcui/Chinese-XLNet/blob/0dcda8c4fe99f39317bb7af51f30469f65f8e577/src/cmrc2018_evaluate_drcd.py

英文
https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py

In [None]:
# Characters removed in addition to ``string.punctuation``.  The ellipsis is
# listed as the single character '…': the filter inspects one character at a
# time, so a two-character '……' entry could never match.
_EXTRA_PUNCTUATION = frozenset([
    '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', '，', '。', '：',
    '？', '！', '“', '”', '；', '’', '《', '》', '…', '·', '、', '「', '」', '（',
    '）', '－', '～', '『', '』'
])


def remove_punctuation(in_str):
    """Return ``in_str`` lower-cased, stripped and without punctuation.

    The input is converted with ``str()`` first.  ASCII punctuation
    (``string.punctuation``) and the Chinese/full-width marks in
    ``_EXTRA_PUNCTUATION`` are removed; inner whitespace is kept.
    """
    text = str(in_str).lower().strip()
    return ''.join(
        char for char in text
        if char not in _EXTRA_PUNCTUATION and char not in string.punctuation
    )

In [None]:
import nltk
nltk.download('punkt')

# Punctuation that is emitted as its own token (or dropped when rm_punc is
# set).  The ellipsis is the single character '…' because characters are
# inspected one at a time; a two-character '……' entry could never match.
_SEGMENT_PUNCTUATION = frozenset([
    '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=',
    '，', '。', '：', '？', '！', '“', '”', '；', '’', '《', '》', '…', '·', '、',
    '「', '」', '（', '）', '－', '～', '『', '』'
])
# A single character in the CJK Unified Ideographs block.
_CJK_CHAR_RE = re.compile("[\u4e00-\u9fff]")


def mixed_segmentation(in_str, rm_punc=False):
    """Split mixed Chinese/English text into tokens.

    The input is converted with ``str()``, lower-cased and stripped.  Each
    Chinese character and each listed punctuation mark becomes its own token;
    every other run of characters is tokenized with ``nltk.word_tokenize``.

    Args:
        in_str: text to segment.
        rm_punc: when True, listed punctuation marks are dropped.  A dropped
            mark does not end the current non-Chinese run.

    Returns:
        The list of tokens in their original order.
    """
    text = str(in_str).lower().strip()
    segs_out = []
    pending = []  # characters of the current non-Chinese run

    def flush_pending():
        # Tokenize the buffered run, if any, and reset the buffer.
        if pending:
            segs_out.extend(nltk.word_tokenize("".join(pending)))
            pending.clear()

    for char in text:
        is_punc = char in _SEGMENT_PUNCTUATION
        if rm_punc and is_punc:
            continue
        if is_punc or _CJK_CHAR_RE.match(char):
            flush_pending()
            segs_out.append(char)
        else:
            pending.append(char)

    # Handle the trailing run.
    flush_pending()

    return segs_out

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Quick check: strip punctuation, then split into Chinese characters and English words.
mixed_segmentation(remove_punctuation("today 天氣 真是棒rrr!！"))

['today', '天', '氣', '真', '是', '棒', 'rrr']

In [None]:
def f1_score(prediction, ground_truth):
    """Token-level F1 between ``prediction`` and ``ground_truth``.

    Both strings are tokenized with ``mixed_segmentation``; the overlap is the
    multiset intersection of the two token lists.  Returns the integer ``0``
    when no token is shared, otherwise the harmonic mean of precision and
    recall.
    """
    predicted = mixed_segmentation(prediction)
    expected = mixed_segmentation(ground_truth)
    shared = Counter(predicted) & Counter(expected)
    overlap = sum(shared.values())
    if not overlap:
        return 0
    precision = overlap / len(predicted)
    recall = overlap / len(expected)
    return 2 * precision * recall / (precision + recall)

In [None]:
# Quick check of the token-overlap F1 on a mixed Chinese/English pair.
f1_score(prediction="today 天氣 真rrr", ground_truth="today 天氣 真是棒")

0.7272727272727272

In [None]:
class ExactMatch_F1(keras.callbacks.Callback):
    """Report exact-match and F1 scores on an evaluation set after each epoch.

    For every non-skipped ``SquadExample`` the predicted answer is decoded
    from the example's ``input_ids`` between the arg-max start and end
    positions.  Prediction and ground truths are normalized with
    ``remove_punctuation``.  A prediction counts as an exact match when it is
    non-empty and equal to one of the ground-truth answers; its F1 is the
    best ``f1_score`` over all ground-truth answers.

    Args:
        x: model inputs for the evaluation set, as returned by
            ``create_inputs_targets``.
        y: the corresponding targets (stored for reference).
        squad_examples: the examples ``x`` was built from, including skipped
            ones.
    """

    def __init__(self, x, y, squad_examples):
        super().__init__()
        self.x = x
        self.y = y
        self.squad_examples = squad_examples

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x)
        total = len(pred_start)
        if total == 0:
            # Nothing to score; avoid dividing by zero.
            return

        # Predictions line up with the non-skipped examples only, because
        # skipped examples never made it into ``self.x``.
        examples_no_skip = [eg for eg in self.squad_examples if not eg.skip]

        em = 0
        f1 = 0
        for squad_eg, start_logits, end_logits in zip(examples_no_skip,
                                                      pred_start, pred_end):
            start = np.argmax(start_logits)
            end = np.argmax(end_logits)
            if end < start:
                # Inconsistent span: treat as "no answer".
                pred_ans = ""
            else:
                pred_ans = "".join(
                    tokenizer.convert_ids_to_tokens(
                        squad_eg.input_ids[start:end + 1],
                        skip_special_tokens=True))

            pred_ans = remove_punctuation(pred_ans)
            true_answers = [remove_punctuation(ans)
                            for ans in squad_eg.all_answers]

            # Exact match means equality with a ground truth, not containment.
            if pred_ans and pred_ans in true_answers:
                em += 1

            f1 += max((f1_score(pred_ans, ans) for ans in true_answers),
                      default=0)

        em = em / total
        f1 = f1 / total
        print(f"\nepoch={epoch+1}, exact match score={em:.2f}, F1 score={f1:.2f}")

# 訓練模型


In [None]:
# Callback that scores the evaluation set at the end of every epoch.
em_f1_callback = ExactMatch_F1(x_eval, y_eval, eval_squad_examples)

In [None]:
# Fine-tune the Keras model and report the total wall-clock training time.
start_time = time.time()

model.fit(
    x_train,
    y_train,
    epochs=3,  # For demonstration, 3 epochs are recommended
    verbose=1,
    batch_size=12,
    callbacks=[em_f1_callback],  # prints EM/F1 on the eval set after each epoch
)

end_time = time.time()
print(f"Took {end_time - start_time} seconds to train our model.")

Epoch 1/3
















epoch=1, exact match score=0.82, F1 score=0.88
Epoch 2/3
epoch=2, exact match score=0.85, F1 score=0.90
Epoch 3/3
epoch=3, exact match score=0.86, F1 score=0.90
Took 13333.64521598816 seconds to train our model.


# Hugging Face

In [None]:
# Mount Google Drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Drive directory the QA checkpoint is loaded from in the next cell.
path = "/content/drive/MyDrive/qa_bert"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the question-answering model from the checkpoint directory `path`.
qa_bert = TFBertForQuestionAnswering.from_pretrained(path)

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at /content/drive/MyDrive/qa_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [None]:
# Features and labels keyed by the names the QA model is called with below.
train_inputs = {
    "input_ids": x_train[0],
    "token_type_ids": x_train[1],
    "attention_mask": x_train[2],
    "start_positions": y_train[0],
    "end_positions": y_train[1],
}

# Shuffle over the whole training set (reshuffled on every pass) so the
# custom loop does not see the article-ordered examples in the same order
# each epoch.
tf_train_dataset = (
    tf.data.Dataset.from_tensor_slices(train_inputs)
    .shuffle(len(x_train[0]))
    .batch(10)
)

In [None]:
# Evaluation features and labels, keyed like the training inputs.
eval_inputs = dict(
    input_ids=x_eval[0],
    token_type_ids=x_eval[1],
    attention_mask=x_eval[2],
    start_positions=y_eval[0],
    end_positions=y_eval[1],
)

# One example per batch, in the original order.
eval_slices = tf.data.Dataset.from_tensor_slices(eval_inputs)
tf_eval_dataset = eval_slices.batch(1)

In [None]:
# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and rejected by current Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

In [None]:
epochs = 3

# Manual fine-tuning loop: one optimizer step per batch.
for _ in tqdm(range(epochs)):
    for batch in tf_train_dataset:
        with tf.GradientTape() as tape:
            # training=True enables dropout; calling the model without it
            # runs the forward pass in inference mode.
            outputs = qa_bert(batch, training=True)
            # The batch carries start/end positions, so the model returns the loss.
            loss_value = outputs.loss

        grads = tape.gradient(loss_value, qa_bert.trainable_variables)
        optimizer.apply_gradients(zip(grads, qa_bert.trainable_variables))

100%|██████████| 3/3 [1:56:08<00:00, 2322.86s/it]


In [None]:
# Evaluation examples that were not skipped during preprocessing; these are
# the ones x_eval / y_eval were built from, in the same order.
examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]

In [None]:
# Running totals over the evaluation set.
em = 0
f1 = 0

for i, batch in enumerate(tf_eval_dataset):
    # tf_eval_dataset uses batch size 1 and was built from the non-skipped
    # examples, so batch i corresponds to examples_no_skip[i].
    squad_eg = examples_no_skip[i]

    outputs = qa_bert(batch)
    start = np.argmax(outputs.start_logits)
    end = np.argmax(outputs.end_logits)

    if end < start:
        # Inconsistent span: treat as "no answer".
        pred_ans = ""
    else:
        pred_ans = "".join(tokenizer.convert_ids_to_tokens(squad_eg.input_ids[start:end+1], skip_special_tokens=True))

    pred_ans = remove_punctuation(pred_ans)
    true_answers = [remove_punctuation(ans) for ans in squad_eg.all_answers]

    # Exact match means equality with one of the ground truths, not containment.
    if pred_ans and pred_ans in true_answers:
        em += 1

    # Score against the best-matching ground truth.
    f1 += max((f1_score(pred_ans, ans) for ans in true_answers), default=0)

em = em / len(tf_eval_dataset)
f1 = f1 / len(tf_eval_dataset)
print(f"exact match score={em:.3f}, F1 score={f1:.3f}")

exact match score=0.83, F1 score=0.86
