情感分类任务

In [1]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 加载训练数据
dataset = load_dataset('glue', 'sst2')
# 此函数直接从网络下载基准数据集

Found cached dataset glue (/home/zouyuheng/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 648.74it/s]


In [3]:
# 加载分词器、预训练模型和评价方法
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
# 此处的参数可以选择不同的分词器，可以在hagging face 官网查阅
model = BertForSequenceClassification.from_pretrained('bert-base-cased', return_dict = True)
# 模型选择同理，还可以选择一些学者上传的、特别的预训练模型
metric = load_metric('/home/zouyuheng/tool/huggingface-datasets/glue.py', 'sst2')
# 基准数据集是有特定的评价指标的
# 注意，此处mertic的加载无法直接联网获取，需要将对应文件下载到本地，输入本地文件路径获取

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [4]:
# 对训练集分词
def tokenize(examples):
    return tokenizer(examples['sentence'], truncation = True, padding = 'max_length')
    # 未查询到这两个参数的API，猜测truncation指明对于过长序列是否阶段，padding则是指对于不到最大长度的序列进行pad操作
dataset = dataset.map(tokenize, batched = True)
# 通过tokenize对数据集进行批量处理
encoded_dataset = dataset.map(lambda example: {'label': example['label']}, batched=True)


 99%|█████████▊| 67/68 [00:14<00:00,  4.54ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
 50%|█████     | 1/2 [00:00<00:00,  2.68ba/s]
 99%|█████████▊| 67/68 [00:40<00:00,  1.66ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
 50%|█████     | 1/2 [00:01<00:01,  1.05s/ba]


In [5]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [6]:
# 数据格式转换
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'label']
encoded_dataset.set_format(type = 'torch',  columns = columns)
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [7]:
# 定义评价指标
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions = np.argmax(predictions, axis = 1), references = labels)
    

In [8]:
# 设置训练超参
args = TrainingArguments(
    "ft-ss2", # 输出路径
    evaluation_strategy = "epoch",# 每轮结束后进行评价
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,# 训练批次大小
    per_device_eval_batch_size = 32,# 验证批次大小
    num_train_epochs = 2 # 训练轮次
)

In [9]:
# 训练
trainer = Trainer(
    model, 
    args, 
    train_dataset = encoded_dataset["train"],
    eval_dataset = encoded_dataset["validation"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [10]:
# 开始训练
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 67349
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4210
  Number of trainable parameters = 108311810
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# 进行评估
trainer.evaluate()

句对文本分类任务

In [1]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 加载训练数据
dataset = load_dataset('glue', 'rte')
# 此函数直接从网络下载基准数据集
# 加载分词器、预训练模型和评价方法
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
# 此处的参数可以选择不同的分词器，可以在hagging face 官网查阅
model = BertForSequenceClassification.from_pretrained('bert-base-cased', return_dict = True)
# 模型选择同理，还可以选择一些学者上传的、特别的预训练模型
metric = load_metric('/home/zouyuheng/tool/huggingface-datasets/glue.py', 'rte')
# 基准数据集是有特定的评价指标的
# 注意，此处mertic的加载无法直接联网获取，需要将对应文件下载到本地，输入本地文件路径获取


Downloading and preparing dataset glue/rte to /home/zouyuheng/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data: 100%|██████████| 697k/697k [00:00<00:00, 786kB/s] 
                                                                                    

Dataset glue downloaded and prepared to /home/zouyuheng/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 694.04it/s]
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [6]:
# 对训练集分词
def tokenize(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation = True, padding = 'max_length')
    # 注意，由于dataset版本更新，此处键名应改为sentence1和sentence2
dataset = dataset.map(tokenize, batched = True)
# 通过tokenize对数据集进行批量处理
encoded_dataset = dataset.map(lambda example: {'label': example['label']}, batched=True)


 67%|██████▋   | 2/3 [00:00<00:00,  2.82ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
 67%|██████▋   | 2/3 [00:00<00:00,  2.79ba/s]
 67%|██████▋   | 2/3 [00:01<00:00,  1.04ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
 67%|██████▋   | 2/3 [00:01<00:00,  1.01ba/s]


In [7]:
# 数据格式转换
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'label']
encoded_dataset.set_format(type = 'torch',  columns = columns)

In [8]:
# 定义评价指标
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions = np.argmax(predictions, axis = 1), references = labels)

In [9]:
# 设置训练超参
args = TrainingArguments(
    "ft-rte", # 输出路径
    evaluation_strategy = "epoch",# 每轮结束后进行评价
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,# 训练批次大小
    per_device_eval_batch_size = 32,# 验证批次大小
    num_train_epochs = 2 # 训练轮次
)

# 训练
trainer = Trainer(
    model, 
    args, 
    train_dataset = encoded_dataset["train"],
    eval_dataset = encoded_dataset["validation"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)


In [None]:
trainer.train()

In [None]:
trainer.evaluate()

抽取式阅读理解任务

In [10]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer, default_data_collator


In [13]:
# 加载训练数据
dataset = load_dataset('squad')
# 此函数直接从网络下载基准数据集
# 加载分词器、预训练模型和评价方法
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
# 此处的参数可以选择不同的分词器，可以在hagging face 官网查阅
model = BertForSequenceClassification.from_pretrained('bert-base-cased', return_dict = True)
# 模型选择同理，还可以选择一些学者上传的、特别的预训练模型
metric = load_metric('/home/zouyuheng/tool/huggingface-datasets/squad.py')
# 基准数据集是有特定的评价指标的
# 注意，此处mertic的加载无法直接联网获取，需要将对应文件下载到本地，输入本地文件路径获取


Found cached dataset squad (/home/zouyuheng/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 528.75it/s]
loading file vocab.txt from cache at /home/zouyuheng/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading file tokenizer.json from cache at /home/zouyuheng/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/zouyuheng/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tokenizer_config.json
loading configuration file config.json from cache at /home/zouyuheng/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/co

In [16]:
# 准备训练数据
'''这一部分没有完全理解，该部分使用了较多专用API，由于本人并不专攻阅读理解任务，暂不深入思考'''
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples['question'], # 问题文本
        examples['context'], # 篇章文本
        truncation = 'only_second', # 截断只发生在第二部分，即只截断篇章文本
        max_length = 384, # 最大长度
        stride = 128, # 篇章切片步长
        return_overflowing_tokens = True, # 返回超出最大长度的标记，将篇章切成多片
        return_offsets_mapping = True, # 返回偏置信息，用于对齐答案位置
        padding = 'max_length' # 按最大长度补齐
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # 建立到example的映射关系
    offset_mapping = tokenized_examples.pop("offset_mapping")
    # 建立token到原文的字符级映射关系，用于确定答案的开始位置和结束位置

    tokenized_examples["start_position"] = []
    tokenized_examples["end_position"] = []
    # 开始位置和结束位置

    for i, offsets in enumerate(offset_mapping):
        # 遍历输入序列
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # 获取输入序列的input_ids以及[cls]标记的位置

        sequence_ids = tokenized_examples.sequence_ids(i)
        # 获取哪些部分是问题，哪些部分是篇章

        sample_index = sample_mapping[i]
        # 第i个序列
        answers = examples["answers"][sample_index]
        # 答案
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])
        # 答案的开始结束位置

        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1
        # 找到值为1的位置
        token_end_index = 0
        while sequence_ids[token_end_index] != 1:
            token_end_index += 1
        # 同上，应该是值为1标志着答案开始和结束

        if not(offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            # 答案是否超出当前切片的范围
            tokenized_examples["start_position"].append(cls_index)
            tokenized_examples["end_position"].append(cls_index)
            # 如果超出，开始和结束的位置均设置为cls的位置
        else:
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index +=1
            tokenized_examples["start_position"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_position"].append(token_end_index + 1)
            # 将开始和结束位置移至篇章中答案的两端

    return tokenized_examples

In [17]:
tokenize_datasets = dataset.map(prepare_train_features, batched=True, remove_columns=dataset["train"].column_names)


 99%|█████████▉| 87/88 [00:31<00:00,  2.73ba/s]
 91%|█████████ | 10/11 [00:04<00:00,  2.33ba/s]


In [18]:
# 设置训练超参
args = TrainingArguments(
    "ft-squad", # 输出路径
    evaluation_strategy = "epoch",# 每轮结束后进行评价
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,# 训练批次大小
    per_device_eval_batch_size = 32,# 验证批次大小
    num_train_epochs = 2 # 训练轮次
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [19]:
# 训练
trainer = Trainer(
    model, 
    args, 
    train_dataset = encoded_dataset["train"],
    eval_dataset = encoded_dataset["validation"],
    tokenizer = tokenizer,
    data_collator = default_data_collator
)

In [20]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2. If idx, sentence1, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2490
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 156
  Number of trainable parameters = 108311810


Epoch,Training Loss,Validation Loss
1,No log,0.665392
2,No log,0.659249


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2. If idx, sentence1, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 277
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2. If idx, sentence1, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 277
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=156, training_loss=0.6416334005502554, metrics={'train_runtime': 212.9792, 'train_samples_per_second': 23.383, 'train_steps_per_second': 0.732, 'total_flos': 1310293055692800.0, 'train_loss': 0.6416334005502554, 'epoch': 2.0})

命名实体识别任务

In [21]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification


In [23]:
dataset = load_dataset('conll2003')
# 此处如果发现数据下载不成功，可以待会儿再试
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading and preparing dataset conll2003/conll2003 to /home/zouyuheng/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data: 100%|██████████| 983k/983k [00:01<00:00, 959kB/s]  
                                                                                         

Dataset conll2003 downloaded and prepared to /home/zouyuheng/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 482.29it/s]
loading file vocab.txt from cache at /home/zouyuheng/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading file tokenizer.json from cache at /home/zouyuheng/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/zouyuheng/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tokenizer_config.json
loading configuration file config.json from cache at /home/zouyuheng/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,


In [24]:
def tokenize_and_align_labels(examples):
    # 将训练集转化为特征形式，即分词以及对齐标签
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words = True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # 此处应该是标签序列
        word_ids = tokenized_inputs.word_ids(batch_index = i)
        # 找到对应词的id，由于NER任务中，BERT会将token拆分，同一token拆分的子词共享同一标签，因此该标签对应了一个子词序列
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # 遍历每一个子词
            if word_idx is None:
                 label_ids.append(-100)
            # 如果词为空，说明是特殊符号，将其标签设置为-100，之后在计算损失函数中将忽略
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
            # 将标签设置到每个词的第一个token上，此处就是实现子词共享标签的操作
            '''没有特别理解为什么要判断一个elif'''
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    # 设置为标签
    return tokenized_inputs

In [25]:
tokenize_datasets = dataset.map(tokenize_and_align_labels, batched=True, load_from_cache_file=False)
'''此处为什么要将从缓存中加载设置为False'''

 93%|█████████▎| 14/15 [00:01<00:00,  7.14ba/s]
 75%|███████▌  | 3/4 [00:00<00:00,  7.08ba/s]
 75%|███████▌  | 3/4 [00:00<00:00,  7.48ba/s]


'此处为什么要将从缓存中加载设置为False'

In [26]:
label_list = dataset["train"].features["ner_tags"].feature.names
# 获取所有的标签列表
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels = len(label_list))
# 加载预训练模型

loading configuration file config.json from cache at /home/zouyuheng/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden

In [29]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("/home/zouyuheng/tool/huggingface-datasets/seqeval.py")
# 注意，此处不光需要下载文件到本地，而且需要pip安装seqeval库

In [30]:
def compute_metrics(p):
    # 定义评价指标
    predictions, labels = p
    predictions = np.argmax(predictions, axis = 2)
    # 取概率最大的作为预测结果
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
        # 取预测结果，去掉我们标注为-100的特殊符号
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
        # 取预测结果，去掉我们标注为-100的特殊符号
    ]

    results = metric.compute(predictions = true_predictions, references=true_labels)
    return {
        "precision" : results["overall_precision"],# 精确率
        "recall" : results["overall_recall"], # 召回率
        "f1" : results["overall_f1"], # f1值
        "accuracy" : results["overall_accuracy"] # 准确率
    }

In [33]:
# 设置训练超参
args = TrainingArguments(
    "ft-conll2003", # 输出路径
    evaluation_strategy = "epoch",# 每轮结束后进行评价
    learning_rate = 2e-5,
    per_device_train_batch_size = 32,# 训练批次大小
    per_device_eval_batch_size = 32,# 验证批次大小
    num_train_epochs = 3 # 训练轮次
)

# 训练
trainer = Trainer(
    model, 
    args, 
    train_dataset = tokenize_datasets["train"],
    eval_dataset = tokenize_datasets["validation"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
    data_collator = data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
trainer.train()
trainer.evaluate()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: chunk_tags, ner_tags, tokens, id, pos_tags. If chunk_tags, ner_tags, tokens, id, pos_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1317
  Number of trainable parameters = 107726601


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 