In [1]:
# Adjust these key parameters to match the model you use and your available GPU resources.
squad_v2 = False  # selects "squad_v2" vs "squad" in the load_dataset call
model_id = "distilbert-base-uncased"  # checkpoint name passed to the tokenizer and model loaders
batch_size = 16  # NOTE(review): reassigned to 8 in the TrainingArguments cell before it is ever used

In [2]:
from datasets import load_dataset

# Load "squad_v2" when the squad_v2 flag is set, otherwise "squad".
dataset=load_dataset("squad_v2" if squad_v2 else "squad")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [4]:
import random
import pandas as pd
import datasets
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of integer
            row positions, and exposing a ``features`` mapping of column name
            to feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in a single call; this replaces a
    # retry loop that re-rolled on collisions and scanned the pick list each time.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show human-readable class names instead of integer label ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [5]:
# Preview three random rows of the training split.
show_random_elements(dataset['train'],3)

Unnamed: 0,id,title,context,question,answers
0,57281515ff5b5019007d9cb2,Federalism,"Until recently, in the absence of prior agreement on a clear and precise definition, the concept was thought to mean (as a shorthand) 'a division of sovereignty between two levels of government'. New research, however, argues that this cannot be correct, as dividing sovereignty - when this concept is properly understood in its core meaning of the final and absolute source of political authority in a political community - is not possible. The descent of the United States into Civil War in the mid-nineteenth century, over disputes about unallocated competences concerning slavery and ultimately the right of secession, showed this. One or other level of government could be sovereign to decide such matters, but not both simultaneously. Therefore, it is now suggested that federalism is more appropriately conceived as 'a division of the powers flowing from sovereignty between two levels of government'. What differentiates the concept from other multi-level political forms is the characteristic of equality of standing between the two levels of government established. This clarified definition opens the way to identifying two distinct federal forms, where before only one was known, based upon whether sovereignty resides in the whole (in one people) or in the parts (in many peoples): the federal state (or federation) and the federal union of states (or federal union), respectively. Leading examples of the federal state include the United States, Germany, Canada, Switzerland, Australia and India. The leading example of the federal union of states is the European Union.",What historical event illustrated that dividing sovereignty was not possible?,"{'text': ['Civil War'], 'answer_start': [480]}"
1,56d385c459d6e4140014662b,American_Idol,"American Idol premiered in June 2002 and became the surprise summer hit show of 2002. The first show drew 9.9 million viewers, giving Fox the best viewing figure for the 8.30 pm spot in over a year. The audience steadily grew, and by finale night, the audience had averaged 23 million, with more than 40 million watching some part of that show. That episode was placed third amongst all age groups, but more importantly it led in the 18–49 demographic, the age group most valued by advertisers.",How many people on average watched the American Idol finale in its first season?,"{'text': ['23 million'], 'answer_start': [274]}"
2,57265ff85951b619008f70fb,Professional_wrestling,"A wrestler may voluntarily submit by verbally informing the referee (usually used in moves such as the Mexican Surfboard, where all four limbs are incapacitated, making tapping impossible). Also, since Ken Shamrock (a legitimate UFC competitor in its early days) popularized it in 1997, a wrestler can indicate a voluntary submission by ""tapping out"", that is, tapping a free hand against the mat or against an opponent. Occasionally, a wrestler will reach for a rope (see rope breaks below), only to put their hand back on the mat so they can crawl towards the rope some more; this is not a submission, and the referee decides what their intent is.",What can a wrestler do to show a willing submission?,"{'text': ['tapping out'], 'answer_start': [338]}"


In [6]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the checkpoint named by model_id.
tokenizer=AutoTokenizer.from_pretrained(model_id)

In [7]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
# Tokenize a single sentence to inspect the returned input_ids and attention_mask.
tokenizer("how are you,Are you ok?")

{'input_ids': [101, 2129, 2024, 2017, 1010, 2024, 2017, 7929, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## 使用tokenizer进行处理数据

#### 测试实验一下，看看原理

In [9]:
def test():
    """Tokenize a toy question/context pair with a tiny window to show overflow handling.

    Only the second sequence (the context) is truncated; with ``max_length=10``
    and ``stride=1`` the context is split into several overlapping features.
    Offsets and the overflow-to-sample mapping are returned alongside the ids.
    """
    question = "who are you?"
    context = "i am your father and really be strong!"
    return tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=10,
        stride=1,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

In [10]:
# Run the toy tokenization and display the resulting encoding.
a=test()
a

{'input_ids': [[101, 2040, 2024, 2017, 1029, 102, 1045, 2572, 2115, 102], [101, 2040, 2024, 2017, 1029, 102, 2115, 2269, 1998, 102], [101, 2040, 2024, 2017, 1029, 102, 1998, 2428, 2022, 102], [101, 2040, 2024, 2017, 1029, 102, 2022, 2844, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (0, 3), (4, 7), (8, 11), (11, 12), (0, 0), (0, 1), (2, 4), (5, 9), (0, 0)], [(0, 0), (0, 3), (4, 7), (8, 11), (11, 12), (0, 0), (5, 9), (10, 16), (17, 20), (0, 0)], [(0, 0), (0, 3), (4, 7), (8, 11), (11, 12), (0, 0), (17, 20), (21, 27), (28, 30), (0, 0)], [(0, 0), (0, 3), (4, 7), (8, 11), (11, 12), (0, 0), (28, 30), (31, 37), (37, 38), (0, 0)]], 'overflow_to_sample_mapping': [0, 0, 0, 0]}

In [11]:
# Decode every overflow feature back to text to see how the context was split.
for feature_ids in a['input_ids']:
    decoded_text = tokenizer.decode(feature_ids)
    print(decoded_text)

[CLS] who are you? [SEP] i am your [SEP]
[CLS] who are you? [SEP] your father and [SEP]
[CLS] who are you? [SEP] and really be [SEP]
[CLS] who are you? [SEP] be strong! [SEP]


In [12]:
a['offset_mapping']

[[(0, 0),
  (0, 3),
  (4, 7),
  (8, 11),
  (11, 12),
  (0, 0),
  (0, 1),
  (2, 4),
  (5, 9),
  (0, 0)],
 [(0, 0),
  (0, 3),
  (4, 7),
  (8, 11),
  (11, 12),
  (0, 0),
  (5, 9),
  (10, 16),
  (17, 20),
  (0, 0)],
 [(0, 0),
  (0, 3),
  (4, 7),
  (8, 11),
  (11, 12),
  (0, 0),
  (17, 20),
  (21, 27),
  (28, 30),
  (0, 0)],
 [(0, 0),
  (0, 3),
  (4, 7),
  (8, 11),
  (11, 12),
  (0, 0),
  (28, 30),
  (31, 37),
  (37, 38),
  (0, 0)]]

#### 这里不能截断问题，只能截断上下文（context）。截断有可能把答案所在的位置切掉，因此使用 stride 让相邻片段之间保留一段重叠，这样同一个问题和答案就会对应多个 context 片段；再借助 offset mapping，可以把每个 token 映射回原文中的字符位置。

In [13]:
# Maximum length of one feature (question plus context), passed to the tokenizer as max_length.
max_length = 384 
# Overlap allowed between two consecutive chunks of the context when it has to be split.
doc_stride = 128 

In [14]:
# Re-display the toy encoding produced by test() above (same value as the earlier output).
a

{'input_ids': [[101, 2040, 2024, 2017, 1029, 102, 1045, 2572, 2115, 102], [101, 2040, 2024, 2017, 1029, 102, 2115, 2269, 1998, 102], [101, 2040, 2024, 2017, 1029, 102, 1998, 2428, 2022, 102], [101, 2040, 2024, 2017, 1029, 102, 2022, 2844, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (0, 3), (4, 7), (8, 11), (11, 12), (0, 0), (0, 1), (2, 4), (5, 9), (0, 0)], [(0, 0), (0, 3), (4, 7), (8, 11), (11, 12), (0, 0), (5, 9), (10, 16), (17, 20), (0, 0)], [(0, 0), (0, 3), (4, 7), (8, 11), (11, 12), (0, 0), (17, 20), (21, 27), (28, 30), (0, 0)], [(0, 0), (0, 3), (4, 7), (8, 11), (11, 12), (0, 0), (28, 30), (31, 37), (37, 38), (0, 0)]], 'overflow_to_sample_mapping': [0, 0, 0, 0]}

In [15]:
# Sequence ids of the first feature: in the output below, None lines up with the
# special tokens, 0 with the question tokens and 1 with the context tokens.
a.sequence_ids()

[None, 0, 0, 0, 0, None, 1, 1, 1, None]

## 综上我们可以有以下操作

In [16]:
# Find the index of the first training example whose question+context encoding is
# longer than max_length, i.e. one that will be split into several features.
# Uses the shared max_length setting instead of repeating the literal 384.
for i,example in enumerate(dataset['train']):
    if len(tokenizer(example['question'],example['context'])['input_ids'])>max_length:
        num=i
        break
else:
    # Without this, `num` would be undefined (or stale from a previous run).
    raise ValueError(f"No training example is longer than max_length={max_length} tokens.")
num

249

In [17]:
# Fetch the over-length example located by the search above and display it.
example=dataset['train'][num]
example

{'id': '5733caf74776f4190066124c',
 'title': 'University_of_Notre_Dame',
 'context': "The men's basketball team has over 1,600 wins, one of only 12 schools who have reached that mark, and have appeared in 28 NCAA tournaments. Former player Austin Carr holds the record for most points scored in a single game of the tournament with 61. Although the team has never won the NCAA Tournament, they were named by the Helms Athletic Foundation as national champions twice. The team has orchestrated a number of upsets of number one ranked teams, the most notable of which was ending UCLA's record 88-game winning streak in 1974. The team has beaten an additional eight number-one teams, and those nine wins rank second, to UCLA's 10, all-time in wins against the top team. The team plays in newly renovated Purcell Pavilion (within the Edmund P. Joyce Center), which reopened for the beginning of the 2009–2010 season. The team is coached by Mike Brey, who, as of the 2014–15 season, his fifteenth at Notre

In [18]:
def test(example):
    """Tokenize one dataset example into fixed-length, overlapping features.

    Note: this rebinds the name ``test`` used by the earlier zero-argument demo.

    Args:
        example: Mapping with ``'question'`` and ``'context'`` strings.

    Returns:
        The tokenizer output, including overflow features and offset mappings.
        Only the context is truncated, consecutive chunks overlap by
        ``doc_stride``, and every feature is padded to ``max_length``.
    """
    encoding_options = dict(
        max_length=max_length,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        stride=doc_stride,
        padding="max_length",
    )
    return tokenizer(example['question'], example['context'], **encoding_options)
# Tokenize the long example and keep the sequence ids returned by sequence_ids()
# (called without an index; the answer-location cell pairs them with feature 0's offsets).
tokenized_example=test(example)
sequence_ids_example=tokenized_example.sequence_ids()

In [19]:
tokenized_example

{'input_ids': [[101, 2129, 2116, 5222, 2515, 1996, 10289, 8214, 2273, 1005, 1055, 3455, 2136, 2031, 1029, 102, 1996, 2273, 1005, 1055, 3455, 2136, 2038, 2058, 1015, 1010, 5174, 5222, 1010, 2028, 1997, 2069, 2260, 2816, 2040, 2031, 2584, 2008, 2928, 1010, 1998, 2031, 2596, 1999, 2654, 5803, 8504, 1012, 2280, 2447, 5899, 12385, 4324, 1996, 2501, 2005, 2087, 2685, 3195, 1999, 1037, 2309, 2208, 1997, 1996, 2977, 2007, 6079, 1012, 2348, 1996, 2136, 2038, 2196, 2180, 1996, 5803, 2977, 1010, 2027, 2020, 2315, 2011, 1996, 16254, 2015, 5188, 3192, 2004, 2120, 3966, 3807, 1012, 1996, 2136, 2038, 23339, 1037, 2193, 1997, 6314, 2015, 1997, 2193, 2028, 4396, 2780, 1010, 1996, 2087, 3862, 1997, 2029, 2001, 4566, 12389, 1005, 1055, 2501, 6070, 1011, 2208, 3045, 9039, 1999, 3326, 1012, 1996, 2136, 2038, 7854, 2019, 3176, 2809, 2193, 1011, 2028, 2780, 1010, 1998, 2216, 3157, 5222, 4635, 2117, 1010, 2000, 12389, 1005, 1055, 2184, 1010, 2035, 1011, 2051, 1999, 5222, 2114, 1996, 2327, 2136, 1012, 1996, 21

In [20]:
# Print the character length of the decoded text for the first two features.
for feature_idx in (0, 1):
    print(len(tokenizer.decode(tokenized_example['input_ids'][feature_idx])))


1778
2085


In [21]:
sequence_ids_example

[None,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 None,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1

In [22]:
# Locate the answer of `example` inside the first tokenized feature and convert its
# character span into token start/end positions.
answer=example['answers']
start_char=answer['answer_start'][0]
end_char=start_char+len(answer['text'][0])
# Character start and end of the answer within the context.
print(start_char,end_char)

# Advance to the first token whose sequence id is 1 (the context).
token_start_index=0
while sequence_ids_example[token_start_index]!=1:
    token_start_index+=1

# Walk forward to the last consecutive token whose sequence id is 1.
token_end_index=token_start_index
while sequence_ids_example[token_end_index]==1:
    token_end_index+=1
token_end_index-=1
# Token start and end of the context within the whole input sequence.
print(token_start_index,token_end_index)

offsets = tokenized_example["offset_mapping"][0]
# Only search for the answer tokens if the answer lies entirely inside this feature.
if (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
    # Move right past every token starting at or before start_char, then step back one.
    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
        token_start_index += 1
    start_position = token_start_index - 1
    # Move left past every token ending at or after end_char, then step forward one.
    while offsets[token_end_index][1] >= end_char:
        token_end_index -= 1
    end_position = token_end_index + 1
    # start_position / end_position are the token indices of the first and last answer tokens.
    print(start_position, end_position)
else:
    # NOTE: start_position / end_position are not assigned on this path.
    print("答案不在此特征中。")

30 40
16 382
23 26


In [23]:
# Decode the answer from the tokens located via the offset mapping.
print(tokenizer.decode(tokenized_example["input_ids"][0][start_position: end_position+1]))
# Print the reference answer stored in the dataset (answer["text"]) for comparison.
print(answer["text"][0])

over 1, 600
over 1,600


## 验证成功，没什么问题

## 综上所述，我们可以将上述代码封装成一个函数

In [24]:
# True when the tokenizer pads on the right; prepare_train_features uses this to decide
# whether the question or the context is passed first and which of the two is truncated.
pad_on_right = tokenizer.padding_side == "right"

In [25]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and attach answer start/end token labels.

    Long contexts are split into several overlapping features (overlap of
    ``doc_stride``, each padded to ``max_length``). For every feature the answer's
    character span is converted to token positions; features that do not fully
    contain the answer, or whose example has no answer, are labelled with the
    index of the CLS token.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and ``"answers"``
            columns. The ``"question"`` column is replaced in place by its
            left-stripped version.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed and ``start_positions`` / ``end_positions`` added.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # The context must be the sequence that gets truncated; which slot it occupies
    # depends on the padding side of the tokenizer.
    if pad_on_right:
        first_column, second_column = "question", "context"
        truncation_mode = "only_second"
        context_sequence_id = 1
    else:
        first_column, second_column = "context", "question"
        truncation_mode = "only_first"
        context_sequence_id = 0

    tokenized_examples = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # One example can yield several features; this maps each feature to its example.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for feature_idx, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][feature_idx]
        # The CLS token index is the label used for "no answer in this feature".
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # Sequence ids tell which tokens belong to the question and which to the context.
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)
        answers = examples["answers"][sample_mapping[feature_idx]]

        # Default to the CLS label; overwritten only when the answer is found.
        start_label = cls_index
        end_label = cls_index

        if len(answers["answer_start"]) != 0:
            # Character span of the (first) answer inside the context.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First context token of this feature.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_sequence_id:
                token_start_index += 1

            # Last context token of this feature.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_sequence_id:
                token_end_index -= 1

            answer_inside_feature = (
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            )
            if answer_inside_feature:
                # Move both indices to the two ends of the answer.
                # Edge case: if the answer is the last word, the start index may run
                # past the final offset, hence the bounds check.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_label = token_start_index - 1
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_label = token_end_index + 1

        start_positions.append(start_label)
        end_positions.append(end_label)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [26]:
# Apply the feature preparation to every split in batches. The original columns are
# removed because one example can expand into several features, so the row counts differ.
tokenized_datasets = dataset.map(prepare_train_features,
                                  batched=True,
                                  remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [27]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 88524
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10784
    })
})

## 数据处理完毕

## 加载模型，配置训练参数

In [28]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the checkpoint with a question-answering head; the warning printed below reports
# that the qa_outputs weights are newly initialized and need fine-tuning.
model=AutoModelForQuestionAnswering.from_pretrained(model_id)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
import os

# NOTE: this overrides the batch_size set in the configuration cell at the top of the
# notebook; the value assigned here is the one TrainingArguments receives.
batch_size=8

# Output directory for checkpoints and the final model. The default is the original
# Windows path; set FINETUNE_OUTPUT_ROOT to redirect it on a machine without that drive.
output_root = os.environ.get("FINETUNE_OUTPUT_ROOT")
if output_root:
    model_dir = os.path.join(output_root, f"{model_id}-finetuned-squad")
else:
    model_dir=f"E:\\model\\language\\fine-tuning\\{model_id}-finetuned-squad"

# One epoch of fine-tuning, evaluating at the end of each epoch.
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01
)

In [30]:
from transformers import default_data_collator

# Every feature was already padded to max_length during tokenization, so the default
# collator is used as-is.
data_collator = default_data_collator

In [31]:
# Assemble the Trainer from the model, training arguments, tokenized splits,
# collator and tokenizer prepared above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

## 开始训练

In [32]:
# Start fine-tuning with the settings from training_args.
trainer.train()

  0%|          | 0/11066 [00:00<?, ?it/s]

In [None]:
# Persist the fine-tuned model to model_dir. save_model() returns nothing, so its
# result is not bound to a variable (the previous `model_to_save` was always None).
trainer.save_model(model_dir)

In [None]:
import torch

# Put the model in inference mode so dropout is disabled for this sanity-check pass.
trainer.model.eval()
# Take only the first batch from the evaluation dataloader.
batch = next(iter(trainer.get_eval_dataloader()))
# Move every tensor in the batch to the device the trainer is using.
batch = {k: v.to(trainer.args.device) for k, v in batch.items()}
# No gradients are needed for a forward-only check.
with torch.no_grad():
    output = trainer.model(**batch)
output.keys()