In [1]:
# 参考 https://zhuanlan.zhihu.com/p/390823624
# 参考 https://blog.csdn.net/weixin_45397053/article/details/120478054
# 参考 https://segmentfault.com/a/1190000041524159

In [5]:
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW

In [48]:
# 加载数据集
raw_datasets = load_dataset("glue", "mrpc")  
raw_datasets  # DatasetDict   [train/validation/test]
# 文本相似度   label = 1: 代表句子是同义词 - label = 0: 代表句子不是同义词

train_dataset = raw_datasets['train']  
# type(train_dataset)
# dir(train_dataset)
# train_dataset  # Dataset   features: ['sentence1', 'sentence2', 'label', 'idx'],
# train_dataset.features
# train_dataset.column_names
# train_dataset[0]

# train = pd.DataFrame(train_dataset)
# train

Found cached dataset glue (C:/Users/lizhong/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,sentence1,sentence2,label,idx
0,"Amrozi accused his brother , whom he called "" ...","Referring to him as only "" the witness "" , Amr...",1,0
1,Yucaipa owned Dominick 's before selling the c...,Yucaipa bought Dominick 's in 1995 for $ 693 m...,0,1
2,They had published an advertisement on the Int...,"On June 10 , the ship 's owners had published ...",1,2
3,"Around 0335 GMT , Tab shares were up 19 cents ...","Tab shares jumped 20 cents , or 4.6 % , to set...",0,3
4,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...,1,4
...,...,...,...,...
3663,""" At this point , Mr. Brando announced : ' Som...","Brando said that "" somebody ought to put a bul...",1,4071
3664,"Martin , 58 , will be freed today after servin...",Martin served two thirds of a five-year senten...,0,4072
3665,""" We have concluded that the outlook for price...","In a statement , the ECB said the outlook for ...",1,4073
3666,The notification was first reported Friday by ...,MSNBC.com first reported the CIA request on Fr...,1,4074


In [6]:
# 加载分词编码工具
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fase=True)

# 加载模型（model head：ForSequenceClassification）
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

loading configuration file config.json from cache at C:\Users\lizhong/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\lizhong/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\voc

In [None]:
# 例子
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.",
    "This is the third sentence.",
    "To access an actual element, you need to select a split first, then give an index."
]

# 分词编码
batch = tokenizer(sequences, 
                  truncation=True,
                  padding="max_length",  
                  max_length=20,
                  return_tensors="pt")
batch
# type(batch)  # transformers.tokenization_utils_base.BatchEncoding
# dir(batch)
# batch.keys()  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
# for k,v in batch.items():
#     print(k, ':', v)
    
# print(batch['input_ids'])
# print(tokenizer.decode(batch['input_ids'][0]))
# print(tokenizer.decode(batch['input_ids'][1]))
# print(tokenizer.decode(batch['input_ids'][2]))
# print(tokenizer.decode(batch['input_ids'][3]))

# print(batch['input_ids'].tolist())
# print(tokenizer.decode(batch['input_ids'].tolist()[0]))
# print(tokenizer.decode(batch['input_ids'].tolist()[1]))
# print(tokenizer.decode(batch['input_ids'].tolist()[2]))
# print(tokenizer.decode(batch['input_ids'].tolist()[3]))
# help(tokenizer.decode)

batch['labels'] = torch.tensor([1, 1, 0, 1])
optimizer = AdamW(model.parameters())
# 序列解包 list(range(*[3,6]))  
# 字典解包 {'x':1, **{'y':2, 'z':3}} 
outputs = model(**batch)  
outputs.logits.shape

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions

loss = outputs.loss
loss.backward()
optimizer.step()

# tokenized_text = tokenizer("This is the first sentence.", "This is the second one.","This is the third sentence.")  # 第三个句子不显示
# tokenized_text, tokenizer.decode(tokenized_text['input_ids'])

In [1]:
# 训练集预处理
# 将句子对列表传给tokenizer，对整个数据集进行分词处理

# 1.这种方式不能保证结果为dataset形式，且需要将整个dataset加载到RAM中
tokenized_dataset = tokenizer(
    raw_datasets['train']['sentence1'],
    raw_datasets['train']['sentence2'],
    truncation=True,
    padding=True,
    return_tensors='pt')

type(tokenized_dataset)  # transformers.tokenization_utils_base.BatchEncoding
dir(tokenized_dataset)
tokenized_dataset.keys() # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
tokenized_dataset["input_ids"].shape  # torch.Size([3668, 103])

del tokenized_dataset

NameError: name 'tokenizer' is not defined

In [26]:
%who

AdamW	 AutoModelForSequenceClassification	 AutoTokenizer	 DataCollatorWithPadding	 Dataset	 Trainer	 TrainingArguments	 batch	 checkpoint	 
collate_fn	 compute_metrics	 data_collator	 load_dataset	 loss	 model	 optimizer	 outputs	 pd	 
predictions	 raw_datasets	 sequences	 tokenize_function	 tokenizer	 torch	 train_args	 trainer	 training_args	 



In [43]:
# 2. map方法保持dataset格式，这里没有使用padding到模型最大长度(效率低)，在构造batch时再进行padding到batch最大长度
def tokenize_function(example: dict):
    return tokenizer(example["sentence1"], 
                     example["sentence2"], 
                     truncation=True, 
                     padding=True,                 
                     return_tensors="pt"
                    )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)  # batch=True可以同时批处理
tokenized_datasets  # DatasetDict [train/validation/test]

# train_dataset = tokenized_datasets['train']
# dir(train_dataset)
# type(train_dataset)  # datasets.arrow_dataset.Dataset
# train_dataset.features
# train_dataset.column_names

# for key, value in tokenized_datasets['train'][0].items():
#     print(key, ':', value)
# tokenizer.decode(tokenized_datasets['train'][0]['input_ids'])

# train = pd.DataFrame(train_dataset)
# train

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\lizhong\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-28520c057cc163de.arrow
Loading cached processed dataset at C:\Users\lizhong\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-95d8f6fd48c94828.arrow


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [78]:
# collate将一个batch数据进行聚合，将数据样本(list,tuples,dictionary)转化为pytorch tensor，进行拼接
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 例子
samples = tokenized_datasets["train"][:8]
type(samples)  # dict
samples = { k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"] }
pd.DataFrame(samples)
samples
# [len(x) for x in samples["input_ids"]]

# batch = data_collator(samples) 
# type(batch)  # transformers.tokenization_utils_base.BatchEncoding
# {k: v.shape for k, v in batch.items()}

{'label': [1, 0, 1, 0, 1, 1, 0, 1],
 'input_ids': [[101,
   2572,
   3217,
   5831,
   5496,
   2010,
   2567,
   1010,
   3183,
   2002,
   2170,
   1000,
   1996,
   7409,
   1000,
   1010,
   1997,
   9969,
   4487,
   23809,
   3436,
   2010,
   3350,
   1012,
   102,
   7727,
   2000,
   2032,
   2004,
   2069,
   1000,
   1996,
   7409,
   1000,
   1010,
   2572,
   3217,
   5831,
   5496,
   2010,
   2567,
   1997,
   9969,
   4487,
   23809,
   3436,
   2010,
   3350,
   1012,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [101,
   9805,
   3540,
   11514,
   2050,
   3079,
   11282,
   2243,
   1005,
   1055,
   2077,
   4855,
   1996,
   4677,
   2000,
   3647,
   4576,
   1999,
   2687,
   2005,
   1002,
   1016,
   1012,
   1019,
   4551,
   1012,
   102,
   9805,
   3540,
   115

In [93]:
%who
%lsmagic

AdamW	 AutoModelForSequenceClassification	 AutoTokenizer	 DataCollatorWithPadding	 Dataset	 batch	 checkpoint	 data_collator	 k	 
load_dataset	 loss	 model	 optimizer	 raw_datasets	 samples	 sequences	 tokenize_function	 tokenized_dataset	 
tokenized_datasets	 tokenizer	 torch	 train_dataset	 v	 


Available line magics:
%alias  %alias_magic  %autoawait  %autocall  %automagic  %autosave  %bookmark  %cd  %clear  %cls  %colors  %conda  %config  %connect_info  %copy  %ddir  %debug  %dhist  %dirs  %doctest_mode  %echo  %ed  %edit  %env  %gui  %hist  %history  %killbgscripts  %ldir  %less  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %ls  %lsmagic  %macro  %magic  %matplotlib  %mkdir  %more  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %pip  %popd  %pprint  %precision  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %ren  %rep  %rerun  %reset  %reset_selective  %rmdir  %run  %save  %sc  %set_env  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %who  %who_ls  %whos  %xdel  %xmode

Available cell magics:
%%!  %%HTML  %%SVG  %%bash  %%capture  %%cmd  %%debug  %%file  %%html  %%javascript  %%js  %%latex  %%markdown  %%perl  %%prun  %%pypy  %%python 

In [2]:
# 1.数据预处理
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

# 加载数据
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"

# 加载模型
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# 数据处理
def tokenize_function(example):
    return tokenizer(example["sentence1"], 
                     example["sentence2"], 
                     truncation=True)  # return_tensors='pt'

# 分词编码
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # 动态padding，转换tensor

# 设置训练的参数
training_args = TrainingArguments("checkpoint")  # 模型保存的路径

# 加载模型（model head）
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Trainer 进行精调
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,  # collator
    tokenizer=tokenizer,
)

    
# 训练
trainer.train()

Found cached dataset glue (C:/Users/lizhong/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\lizhong\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-0934c045dd1afe10.arrow
Loading cached processed dataset at C:\Users\lizhong\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-0ca0a34349921406.arrow
Loading cached processed dataset at C:\Users\lizhong\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-c3b222f285c62abb.arrow
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expec

Step,Training Loss
500,0.5455
1000,0.3473


Saving model checkpoint to checkpoint\checkpoint-500
Configuration saved in checkpoint\checkpoint-500\config.json
Model weights saved in checkpoint\checkpoint-500\pytorch_model.bin
tokenizer config file saved in checkpoint\checkpoint-500\tokenizer_config.json
Special tokens file saved in checkpoint\checkpoint-500\special_tokens_map.json
Saving model checkpoint to checkpoint\checkpoint-1000
Configuration saved in checkpoint\checkpoint-1000\config.json
Model weights saved in checkpoint\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in checkpoint\checkpoint-1000\tokenizer_config.json
Special tokens file saved in checkpoint\checkpoint-1000\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1377, training_loss=0.37798399083754597, metrics={'train_runtime': 216.3198, 'train_samples_per_second': 50.869, 'train_steps_per_second': 6.366, 'total_flos': 405470580750720.0, 'train_loss': 0.37798399083754597, 'epoch': 3.0})

In [3]:
# 验证
predictions = trainer.predict(tokenized_datasets["validation"])
predictions  # namedtuple

# predictions.predictions.shape  # logits 并没有经过softmax 
# predictions.label_ids.shape
# predictions.metrics

import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)
preds

# 构造评价指标
from datasets import load_metric

metric = load_metric('glue', 'mrpc')
metric.compute(predictions=preds,
              references=predictions.label_ids)

# # 新版本
# import evaluate
# accuracy = evaluate.load("accuracy")
# accuracy.compute(references=predictions.label_ids,
#                  predictions=preds)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, sentence2, idx. If sentence1, sentence2, idx are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 408
  Batch size = 8


  metric = load_metric('glue', 'mrpc')


{'accuracy': 0.8455882352941176, 'f1': 0.8934010152284263}

In [11]:
# 2.数据预处理
import torch
from datasets import load_dataset,load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

# 加载数据
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"

# 加载模型
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                          num_labels=2)

# 数据处理
def tokenize_function(example):
    return tokenizer(example["sentence1"], 
                     example["sentence2"], 
                     truncation=True)  # return_tensors='pt'

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # 动态padding，

# 包装成一个函数
def compute_metrics(eval_preds: tuple):
    metric = load_metric('glue', 'mrpc')
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds,
                         references=labels)

# from transformers.trainer_utils import EvalPrediction
# # 模拟测试输出
# eval_pred = EvalPrediction(predictions=np.array([[0, 1], [2, 3], [4, 5], [6, 7]]),
#                            label_ids=np.array([1, 1, 1, 1]))

# compute_metrics(eval_pred)


# 配置验证策略为每个epoch进行验证,并传入评价函数
train_args = TrainingArguments('test-trainer', 
#                                evaluation_strategy='epoch',
#                               per_device_train_batch_size=4,
                              )

# Trainer 进行精调
trainer = Trainer(model,
                 train_args,
                 train_dataset=tokenized_datasets['train'],
                 eval_dataset=tokenized_datasets['validation'],
                 data_collator=data_collator, #  将一批样本进行整理，转换为pyorch tensor/padding/连接
                 tokenizer=tokenizer,
                 compute_metrics=compute_metrics)

# 训练
trainer.train()

Found cached dataset glue (C:/Users/lizhong/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

loading configuration file config.json from cache at C:\Users\lizhong/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\lizhong/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\voc

Step,Training Loss
500,0.5599
1000,0.336


Saving model checkpoint to test-trainer\checkpoint-500
Configuration saved in test-trainer\checkpoint-500\config.json
Model weights saved in test-trainer\checkpoint-500\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-500\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-500\special_tokens_map.json
Saving model checkpoint to test-trainer\checkpoint-1000
Configuration saved in test-trainer\checkpoint-1000\config.json
Model weights saved in test-trainer\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-1000\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-1000\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1377, training_loss=0.3739865648633947, metrics={'train_runtime': 209.2047, 'train_samples_per_second': 52.599, 'train_steps_per_second': 6.582, 'total_flos': 405470580750720.0, 'train_loss': 0.3739865648633947, 'epoch': 3.0})

In [13]:
# 评估
import numpy as np
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2. If sentence1, idx, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 8


{'eval_loss': 0.6336934566497803,
 'eval_accuracy': 0.8578431372549019,
 'eval_f1': 0.9003436426116839,
 'eval_runtime': 8.8745,
 'eval_samples_per_second': 45.974,
 'eval_steps_per_second': 5.747,
 'epoch': 3.0}

In [14]:
# 保存模型
trainer.save_model(output_dir="output_dir")

Saving model checkpoint to output_dir
Configuration saved in output_dir\config.json
Model weights saved in output_dir\pytorch_model.bin
tokenizer config file saved in output_dir\tokenizer_config.json
Special tokens file saved in output_dir\special_tokens_map.json


In [15]:
tokenized_datasets['test']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1725
})

In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification,AutoTokenizer

# 加载数据
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"

# 加载分词编码工具
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

dataset_test = raw_datasets['test']
del raw_datasets
dataset_test
dataset_test[0]

Found cached dataset glue (C:/Users/lizhong/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

{'sentence1': "PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .",
 'sentence2': 'Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .',
 'label': 1,
 'idx': 0}

In [28]:
help(tokenizer.batch_encode_plus)

Help on method batch_encode_plus in module transformers.tokenization_utils_base:

batch_encode_plus(batch_text_or_text_pairs: Union[List[str], List[Tuple[str, str]], List[List[str]], List[Tuple[List[str], List[str]]], List[List[int]], List[Tuple[List[int], List[int]]]], add_special_tokens: bool = True, padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = False, truncation: Union[bool, str, transformers.tokenization_utils_base.TruncationStrategy] = None, max_length: Optional[int] = None, stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, return_tensors: Union[str, transformers.utils.generic.TensorType, NoneType] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, **kwargs) -> transformers.tokenization_utils_ba

In [2]:
# 批处理函数
def collate_fn(data):
#     print(len(data), data[0])
    labels = [i['label'] for i in data]
    sents = [(i['sentence1'], i['sentence2']) for i in data]
#     print(labels)
#     print(sents)
    
    
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       truncation=True,
                                       padding="max_length",
                                       max_length=50,
                                       return_tensors='pt')
    
    input_ids = data['input_ids']
    token_type_ids = data['attention_mask']
    attention_mask = data['token_type_ids']
    labels = torch.LongTensor(labels)


    return labels, input_ids, token_type_ids, attention_mask

# 数据加载器
loader_test = torch.utils.data.DataLoader(dataset=dataset_test,
                                          batch_size=4,
                                          collate_fn=collate_fn,
                                          shuffle=True,
                                          drop_last=True)
loader_test

<torch.utils.data.dataloader.DataLoader at 0x23947e18040>

In [3]:
def test():
    # 加载模型
    model_test = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                          num_labels=2)
    # 加载参数
    model_test.load_state_dict(torch.load('output_dir/pytorch_model.bin'))
    model_test.eval()
    correct = 0
    total = 0
    
    for i,(labels, input_ids, token_type_ids, attention_mask) in enumerate(loader_test):
        with torch.no_grad():
            out = model_test(input_ids=input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask)
            
        out = out['logits'].argmax(dim=-1)
        correct += (out==labels).sum().item()
        total += len(labels)
        print(f'第{i}批数据:', correct/total)

if __name__ == "__main__":
    print("start...")
    test()
    print("end...")

start...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

第0批数据: 0.5
第1批数据: 0.375
第2批数据: 0.4166666666666667
第3批数据: 0.375
第4批数据: 0.45
第5批数据: 0.375
第6批数据: 0.39285714285714285
第7批数据: 0.375
第8批数据: 0.3888888888888889
第9批数据: 0.425
第10批数据: 0.4090909090909091
第11批数据: 0.375
第12批数据: 0.36538461538461536
第13批数据: 0.35714285714285715
第14批数据: 0.36666666666666664
第15批数据: 0.375
第16批数据: 0.39705882352941174
第17批数据: 0.375
第18批数据: 0.3815789473684211
第19批数据: 0.3625
第20批数据: 0.35714285714285715
第21批数据: 0.3409090909090909
第22批数据: 0.33695652173913043
第23批数据: 0.3541666666666667
第24批数据: 0.36
第25批数据: 0.34615384615384615
第26批数据: 0.3425925925925926
第27批数据: 0.3482142857142857
第28批数据: 0.3448275862068966
第29批数据: 0.3416666666666667
第30批数据: 0.3548387096774194
第31批数据: 0.359375
第32批数据: 0.36363636363636365
第33批数据: 0.3602941176470588
第34批数据: 0.37142857142857144
第35批数据: 0.375
第36批数据: 0.3716216216216216
第37批数据: 0.3684210526315789
第38批数据: 0.3717948717948718
第39批数据: 0.36875
第40批数据: 0.3719512195121951
第41批数据: 0.36904761904761907
第42批数据: 0.36627906976744184
第43批数据: 0.36363636363636365
第4

第308批数据: 0.3454692556634304
第309批数据: 0.3443548387096774
第310批数据: 0.3432475884244373
第311批数据: 0.3421474358974359
第312批数据: 0.3426517571884984
第313批数据: 0.3431528662420382
第314批数据: 0.34365079365079365
第315批数据: 0.34414556962025317
第316批数据: 0.3438485804416404
第317批数据: 0.34355345911949686
第318批数据: 0.3440438871473354
第319批数据: 0.34375
第320批数据: 0.3442367601246106
第321批数据: 0.3447204968944099
第322批数据: 0.34365325077399383
第323批数据: 0.3449074074074074
第324批数据: 0.3453846153846154
第325批数据: 0.3458588957055215
第326批数据: 0.3463302752293578
第327批数据: 0.34603658536585363
第328批数据: 0.34498480243161095
第329批数据: 0.34393939393939393
第330批数据: 0.34365558912386707
第331批数据: 0.3433734939759036
第332批数据: 0.3430930930930931
第333批数据: 0.34281437125748504
第334批数据: 0.3417910447761194
第335批数据: 0.34226190476190477
第336批数据: 0.34124629080118696
第337批数据: 0.34097633136094674
第338批数据: 0.3407079646017699
第339批数据: 0.34044117647058825
第340批数据: 0.34017595307917886
第341批数据: 0.3399122807017544
第342批数据: 0.3403790087463557
第343批数据: 0.340843

In [22]:
%who

AutoModelForSequenceClassification	 AutoTokenizer	 checkpoint	 collate_fn	 dataset_test	 load_dataset	 loader_test	 model_test	 test	 
tokenize_function	 tokenizer	 torch	 


In [3]:
!pip install optuna
!pip install ray[tune]
!pip install sigopt

Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
     -------------------------------------- 365.3/365.3 kB 1.3 MB/s eta 0:00:00
Collecting alembic>=1.5.0
  Downloading alembic-1.9.2-py3-none-any.whl (210 kB)
     -------------------------------------- 210.6/210.6 kB 6.5 MB/s eta 0:00:00
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.7/78.7 kB 4.3 MB/s eta 0:00:00
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.9.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0
Collecting ray[tune]
  Downloading ray-2.2.0-cp39-cp39-win_amd64.whl (20.8 MB)
     --------------------------------------- 20.8/20.8 MB 28.4 MB/s eta 0:00:00
Collecting virtualenv>=20.0.24
  Downloading virtualenv-20.17.1-py3-none-

In [None]:
# 超参数搜索

# 3.数据预处理
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

# 加载数据
raw_datasets = load_dataset("glue", "mrpc")
model_checkpoint = "bert-base-uncased"

# 加载模型
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# 数据处理
def tokenize_function(example):
    return tokenizer(example["sentence1"], 
                     example["sentence2"], 
                     truncation=True)  # return_tensors='pt'

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # 动态padding，

# 包装成一个函数
def compute_metrics(eval_preds: tuple):
    metric = load_metric('glue', 'mrpc')
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions,
                         references=labels)

# 配置验证策略为每个epoch进行验证,并传入评价函数
train_args = TrainingArguments('test-trainer', evaluation_strategy='epoch')
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint,  num_labels=2)

# Trainer 进行精调
trainer = Trainer(model_init=model_init,
                 args=train_args,
                 train_dataset=tokenized_datasets['train'][:100],
                 eval_dataset=tokenized_datasets['validation'],
                 data_collator=data_collator, #  将一批样本进行整理，转换为pyorch tensor/padding/连接
                 tokenizer=tokenizer,
                 compute_metrics=compute_metrics)

# 搜索
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

In [11]:
# 设置Trainer 为搜索到地最好参数，进行训练
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()

Help on function load_metric in module datasets.load:

load_metric(path: str, config_name: Optional[str] = None, process_id: int = 0, num_process: int = 1, cache_dir: Optional[str] = None, experiment_id: Optional[str] = None, keep_in_memory: bool = False, download_config: Optional[datasets.download.download_config.DownloadConfig] = None, download_mode: Optional[datasets.download.download_manager.DownloadMode] = None, revision: Union[str, datasets.utils.version.Version, NoneType] = None, **metric_init_kwargs) -> datasets.metric.Metric
    Load a `datasets.Metric`.
    
    <Deprecated version="2.5.0">
    
    Use `evaluate.load` instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
    
    </Deprecated>
    
    Args:
    
        path (``str``):
            path to the metric processing script with the metric builder. Can be either:
                - a local path to processing script or the directory containing the script (if the script has the same name as t