# 版本1.0
- 模型：rbt3（3 层 BERT 结构，权重从本地 `./rbt3` 加载）；分词器使用 bert-base-chinese
- 数据集：蚂蚁金融语义相似度数据集 AFQMC。采用官方数据划分，训练集 / 验证集 / 测试集分别包含 34334 / 4316 / 3861 个句子对；标签 0 表示非同义句，1 表示同义句。
- 任务：二分类
- 指标：accuracy,f1
### 特点：
- 高度调用API
- 几乎没有定义model本身
- 定义了 Trainer 和 TrainingArguments
- 因此没有使用 DataLoader，而是直接用 dataset 的方式加载数据

In [28]:
import numpy as np
import torch
from transformers import AutoTokenizer,AutoModel


# 1读取数据部分
- 采用dataset直接读取json文件
- 加载train.json和dev.json文件

In [55]:
import datasets

# Load the training sentence pairs from the local AFQMC JSON file.
data = datasets.load_dataset(
    "json",
    data_files="./afqmc_public/train.json",
    split="train",
)


In [48]:
# Drop pairs in which either sentence is an empty string.
data = data.filter(
    lambda row: not (row["sentence1"] == "" or row["sentence2"] == "")
)


In [56]:
# Load the dev file and discard pairs with an empty sentence in one chain.
# NOTE(review): split="train" presumably names the loader's default split for
# a single data file, not the AFQMC training set -- confirm against the docs.
val_data = datasets.load_dataset(
    "json",
    data_files="./afqmc_public/dev.json",
    split="train",
).filter(
    lambda row: not (row["sentence1"] == "" or row["sentence2"] == "")
)


查看一条数据，以字典形式存在

In [32]:
# Peek at the first training record. Before tokenization each record is a
# dict with 'sentence1', 'sentence2' and a string-valued 'label'.
data[0]

{'sentence1': '蚂蚁借呗等额还款可以换成先息后本吗', 'sentence2': '借呗有先息到期还本吗', 'label': '0'}

# 2 数据预处理部分
- 进行分词
- 为每一条数据加上label
- 去除原特征
- tokenize之后，得到input_ids, attention_mask, token_type_ids, labels

In [58]:
# Checkpoint name used for the tokenizer only; the classification model
# itself is loaded from a different path further down in the notebook.
checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def process_function(example):
    """Tokenize a batch of sentence pairs and attach integer labels.

    Args:
        example: A batch mapping with list-valued "sentence1", "sentence2"
            and "label" entries. The "label" entry is overwritten in place
            with its integer-converted values.

    Returns:
        The tokenizer output for the sentence pairs (truncated to at most
        128 tokens) with an extra "labels" entry holding the integer labels.
    """
    # Labels are stored as strings in the source data; cast them to int.
    example["label"] = list(map(int, example["label"]))
    encoded = tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
        max_length=128,
    )
    encoded["labels"] = example["label"]
    return encoded

# Tokenize both splits in batches and drop every original column, leaving
# only what process_function returns.
# NOTE(review): this rebinds `data` / `val_data`, so the cell is not
# idempotent -- re-running it without reloading the raw datasets will fail
# because the sentence/label columns have already been removed.
data=data.map(process_function,batched=True,remove_columns=data.column_names)
val_data=val_data.map(process_function,batched=True,remove_columns=val_data.column_names)
    


Map: 100%|██████████| 34334/34334 [00:01<00:00, 26652.70 examples/s]
Map: 100%|██████████| 4316/4316 [00:00<00:00, 21627.30 examples/s]


In [59]:
# Inspect one tokenized validation example: input_ids, token_type_ids,
# attention_mask and the integer label.
val_data[1]


{'input_ids': [101,
  5709,
  1446,
  3118,
  2898,
  7770,
  7188,
  4873,
  3118,
  802,
  1408,
  102,
  711,
  784,
  720,
  1351,
  802,
  2140,
  679,
  3118,
  2898,
  5709,
  1446,
  802,
  3621,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': 0}

In [37]:
from transformers import AutoModelForSequenceClassification

# Binary classifier initialised from the local ./rbt3 checkpoint.
# NOTE(review): the tokenizer is loaded from "bert-base-chinese" while the
# weights come from ./rbt3 -- confirm the two share the same vocabulary.
model = AutoModelForSequenceClassification.from_pretrained(
    "./rbt3",
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 3 模型训练部分
- 加载训练器
- 定义评估指标
- 定义训练参数
- 训练模型

In [38]:
from transformers import Trainer,TrainingArguments,DataCollatorWithPadding
import evaluate

In [39]:
# Metric objects are cached here so evaluate.load() runs once per metric
# instead of on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def myeval(eval_pred):
    """Compute F1 and accuracy for one set of evaluation predictions.

    Args:
        eval_pred: A pair that unpacks to ``(logits, labels)``. The class
            scores are taken along the last axis of ``logits``.

    Returns:
        dict: The F1 metric result updated with the accuracy metric result.
    """
    logits, labels = eval_pred
    # `np` is provided by the notebook's import cell; the original notebook
    # never imported numpy, so this line raised NameError on a fresh kernel.
    predictions = np.argmax(logits, axis=-1)
    results = _get_metric("f1").compute(predictions=predictions, references=labels)
    results.update(
        _get_metric("accuracy").compute(predictions=predictions, references=labels)
    )
    return results


In [60]:
# Collator that pads the tokenized examples of each batch; the examples were
# tokenized with truncation only, so they have differing lengths.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    # eval_steps only takes effect when the evaluation strategy is "steps";
    # with the default strategy no evaluation runs during training and
    # compute_metrics is never called.
    eval_strategy="steps",
    eval_steps=100,  # evaluate every 100 steps
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=256,
    num_train_epochs=3,
    weight_decay=0.01,  # weight decay
    logging_dir="./logs",
    logging_steps=100,  # log every 100 steps
    # save_total_limit=1,  # keep only the most recent checkpoint
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    eval_dataset=val_data,
    # `tokenizer=` is deprecated on Trainer in the installed transformers
    # version (4.57.3 per model.config); `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=myeval,
)

  trainer=Trainer(


In [62]:
# Display the configuration of the loaded model (layer count, hidden size,
# vocabulary size, ...).
model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "transformers_version": "4.57.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [61]:
# Launch fine-tuning with the Trainer configured above.
trainer.train()

Step,Training Loss


KeyboardInterrupt: 