<a href="https://colab.research.google.com/github/nytce/NLP/blob/main/CLUEbenchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 数据预处理：

In [None]:
# !pip install --upgrade transformers

In [None]:
# !pip install datasets

In [None]:
from functools import partial
import numpy as np
import time
import os
import copy
import json
import random
from tqdm import tqdm
from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader, DistributedSampler, RandomSampler
import torch.nn.functional as F
import torch.nn as nn
# import paddlenlp as ppnlp
import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
# from transformers import LinearDecayWithWarmup

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR


In [None]:
def read_json(input_file):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        input_file: Path (``str`` or path-like) of a text file holding one
            JSON document per line.

    Returns:
        list: The decoded objects in file order. Blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Decode explicitly as UTF-8: the dataset contains Chinese text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(input_file, "r", encoding="utf-8") as f:
        # Stream line by line instead of materialising the file with readlines().
        for line in f:
            line = line.strip()
            if line:  # tolerate blank / trailing empty lines
                records.append(json.loads(line))
    return records

In [None]:
# Custom dataset class
class CLUE_Dataset(Dataset):
    """In-memory dataset over a list of examples.

    Args:
        input_data: Either a ``list`` of examples (stored as-is, not copied)
            or a ``str`` / ``Path`` pointing at a JSON Lines file, which is
            loaded with ``read_json``.

    Raises:
        ValueError: If ``input_data`` is neither a list, a str nor a Path.
    """

    def __init__(self, input_data):
        if isinstance(input_data, list):
            # Already-loaded examples: keep the list directly.
            self.data = input_data
        elif isinstance(input_data, (str, Path)):
            # A file path: read the examples from disk.
            self.data = read_json(input_data)
        else:
            raise ValueError("Unsupported input_data type. Expecting list, str, or Path.")

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the example stored at ``index``."""
        return self.data[index]

    def map(self, func):
        """Apply ``func`` to every example, replacing the stored data.

        Args:
            func: Callable taking one example and returning its replacement.

        Returns:
            CLUE_Dataset: ``self``, so that ``ds = ds.map(f)`` keeps working
            instead of rebinding ``ds`` to ``None``.
        """
        self.data = list(map(func, self.data))
        return self

In [None]:
# Import the Google Drive helper
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Folder that contains the dataset files
dataset_folder = '/content/drive/My Drive/CLUEdataset/afqmc_public'

# Paths of the train / dev / test files inside that folder
train_file_path, dev_file_path, test_file_path = (
    os.path.join(dataset_folder, split_file)
    for split_file in ('train.json', 'dev.json', 'test.json')
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Build the raw (untokenized) train and dev datasets from their JSON files
train_ds = CLUE_Dataset(train_file_path)
dev_ds = CLUE_Dataset(dev_file_path)

In [None]:
# Print the first 5 training examples.
# Index only the items that are shown instead of enumerating the whole dataset.
for idx in range(min(5, len(train_ds))):
    print(train_ds[idx])

{'sentence1': '蚂蚁借呗等额还款可以换成先息后本吗', 'sentence2': '借呗有先息到期还本吗', 'label': '0'}
{'sentence1': '蚂蚁花呗说我违约一次', 'sentence2': '蚂蚁花呗违约行为是什么', 'label': '0'}
{'sentence1': '帮我看一下本月花呗账单有没有结清', 'sentence2': '下月花呗账单', 'label': '0'}
{'sentence1': '蚂蚁借呗多长时间综合评估一次', 'sentence2': '借呗得评估多久', 'label': '0'}
{'sentence1': '我的花呗账单是***，还款怎么是***', 'sentence2': '我的花呗，月结出来说让我还***元，我自己算了一下详细名单我应该还***元', 'label': '1'}


In [None]:
class BERT_RNN(nn.Module):
    """BERT encoder followed by a single-layer RNN and a linear classifier.

    Args:
        bert_model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``.
        hidden_size: Hidden size of the RNN layer.
        output_size: Output size of the final linear layer.
    """

    def __init__(self, bert_model_name, hidden_size, output_size):
        super(BERT_RNN, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.rnn = nn.RNN(self.bert.config.hidden_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run BERT -> RNN -> linear and return the linear layer's output.

        Args:
            input_ids: Token id tensor passed to BERT.
            attention_mask: Optional mask passed to BERT.
            token_type_ids: Optional segment ids passed to BERT. Accepting
                them lets callers that pass ``token_type_ids=...`` work
                instead of raising ``TypeError``.

        Returns:
            Tensor produced by the linear layer from the RNN output at the
            last time step.
        """
        # Keyword arguments make it impossible to silently swap the mask and
        # the segment ids.
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )[0]
        rnn_output, _ = self.rnn(bert_output)
        # Use the RNN output at the last time step as the classifier input.
        # NOTE(review): with right-padded inputs this position is a padding
        # token — confirm this is intended.
        output = self.linear(rnn_output[:, -1, :])
        return output

# Use the Chinese checkpoint: the data is Chinese and is tokenized with the
# 'bert-base-chinese' vocabulary, which the English 'bert-base-uncased'
# embeddings do not correspond to.
my_model = BERT_RNN('bert-base-chinese', hidden_size=256, output_size=2)

In [None]:
# 使用 ERNIE-Gram 预训练模型
# pretrained_model = ppnlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')
# tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained('ernie-gram-zh')

# 使用 ERNIE 预训练模型
# ernie-1.0
#pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained('ernie-1.0')
#tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

# ernie-tiny
# pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained('ernie-tiny')
# tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained('ernie-tiny')


# 使用 BERT 预训练模型
# bert-base-chinese
# pretrained_model = ppnlp.transformers.BertModel.from_pretrained('bert-base-chinese')
# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese')

# bert-wwm-chinese
# pretrained_model = ppnlp.transformers.BertModel.from_pretrained('bert-wwm-chinese')
# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-chinese')

# bert-wwm-ext-chinese
# pretrained_model = ppnlp.transformers.BertModel.from_pretrained('bert-wwm-ext-chinese')
# tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-wwm-ext-chinese')


# 使用 RoBERTa 预训练模型
# roberta-wwm-ext
# pretrained_model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext')
# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext')

# roberta-wwm-ext-large
# pretrained_model = ppnlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext-large')
# tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext-large')

# Initialise the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Our own model
predicted_model = BERT_RNN('bert-base-chinese', hidden_size=256, output_size=2)

In [None]:
# Convert one raw example into id arrays
def convert_example(example, tokenizer, max_length=128):
    """Tokenize a sentence pair and pack it with its label as int64 arrays.

    Args:
        example: Mapping with the keys ``"sentence1"``, ``"sentence2"`` and
            ``"label"``.
        tokenizer: Callable invoked with ``text``, ``text_pair``,
            ``max_length``, ``padding`` and ``truncation``; its result must
            provide ``"input_ids"`` and ``"token_type_ids"``.
        max_length: Length every encoded pair is padded / truncated to.

    Returns:
        tuple: ``(input_ids, token_type_ids, label)`` as ``int64`` numpy
        arrays; ``label`` is a one-element array.
    """
    encoded_inputs = tokenizer(
        text=example["sentence1"],
        text_pair=example["sentence2"],
        max_length=max_length,
        padding='max_length',
        # Without truncation a pair longer than max_length keeps its full
        # length, yielding arrays of different sizes that cannot be stacked
        # into one batch.
        truncation=True,
    )
    fields = (
        encoded_inputs["input_ids"],
        encoded_inputs["token_type_ids"],
        [example["label"]],
    )
    return tuple(np.array(x, dtype="int64") for x in fields)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# 把训练集合转换成id
# train_ds = train_ds.map(partial(convert_example, tokenizer=tokenizer))
# # 把验证集合转换成id
# dev_ds = dev_ds.map(partial(convert_example, tokenizer=tokenizer))

In [None]:
# Convert the examples one at a time
def process(Dataset):
    """Run ``convert_example`` over every example and wrap the results.

    Each example is converted with the module-level ``tokenizer``. An example
    whose conversion raises is reported on stdout and left out of the result;
    the remaining examples are still processed.

    Args:
        Dataset: Iterable of raw examples.

    Returns:
        CLUE_Dataset: Dataset holding the successfully converted examples.
    """
    converted = []
    for sample in Dataset:
        try:
            item = convert_example(sample, tokenizer)
        except Exception as e:
            # Report the failure and move on to the next example.
            print(f"Error processing example: {e}")
        else:
            converted.append(item)
    return CLUE_Dataset(converted)

# Tokenize both splits and keep the results as datasets
train_tokenized_ds = process(train_ds)
dev_tokenized_ds = process(dev_ds)


In [None]:
print(type(train_tokenized_ds))
# Show the first 5 examples after tokenizer conversion.
# Index only the items that are shown instead of enumerating the whole dataset.
for idx in range(min(5, len(train_tokenized_ds))):
    print(train_tokenized_ds[idx])

<class '__main__.CLUE_Dataset'>
(array([ 101, 6010, 6009,  955, 1446, 5023, 7583, 6820, 3621, 1377,  809,
       2940, 2768, 1044, 2622, 1400, 3315, 1408,  102,  955, 1446, 3300,
       1044, 2622, 1168, 3309, 6820, 3315, 1408,  102,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 

In [None]:
# Build the DataLoaders for the training and validation sets
train_batch_size = 32
dev_batch_size = 32

# Training batches are reshuffled on every pass over the data.
train_data_loader = DataLoader(dataset=train_tokenized_ds, batch_size=train_batch_size, shuffle=True, num_workers=0)

# Validation does not need shuffling; a fixed order makes repeated
# evaluations iterate the data identically.
dev_data_loader = DataLoader(dataset=dev_tokenized_ds, batch_size=dev_batch_size, shuffle=False)


## 模型构建：



In [None]:
class BERT_RNN(nn.Module):
    """BERT encoder followed by a single-layer RNN and a linear classifier.

    Args:
        bert_model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``.
        hidden_size: Hidden size of the RNN layer.
        output_size: Output size of the final linear layer.
    """

    def __init__(self, bert_model_name, hidden_size, output_size):
        super(BERT_RNN, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.rnn = nn.RNN(self.bert.config.hidden_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run BERT -> RNN -> linear and return the linear layer's output.

        Args:
            input_ids: Token id tensor passed to BERT.
            attention_mask: Optional mask passed to BERT.
            token_type_ids: Optional segment ids passed to BERT. Accepting
                them lets callers that pass ``token_type_ids=...`` work
                instead of raising ``TypeError``.

        Returns:
            Tensor produced by the linear layer from the RNN output at the
            last time step.
        """
        # Keyword arguments make it impossible to silently swap the mask and
        # the segment ids.
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )[0]
        rnn_output, _ = self.rnn(bert_output)
        # Use the RNN output at the last time step as the classifier input.
        # NOTE(review): with right-padded inputs this position is a padding
        # token — confirm this is intended.
        output = self.linear(rnn_output[:, -1, :])
        # The per-call shape prints were debugging output and flooded the
        # log on every batch; they are intentionally gone.
        return output

In [None]:
# Use the Chinese checkpoint: the data is Chinese and is tokenized with the
# 'bert-base-chinese' vocabulary, which the English 'bert-base-uncased'
# embeddings do not correspond to.
model = BERT_RNN('bert-base-chinese', hidden_size=256, output_size=2)

## 训练配置：

In [None]:
epochs = 3

# Total number of optimizer steps over the whole run
num_training_steps = len(train_data_loader) * epochs
total_steps = num_training_steps

# Optimizer and peak learning rate
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Warm up over the first 10% of the steps. At least one step is used so the
# schedule never divides by zero when the loader is tiny or empty.
warmup_steps = max(1, int(0.1 * total_steps))


def _linear_warmup_decay(step):
    """Return the LR multiplier: linear warmup to 1.0, then linear decay to 0.0.

    The decay ends at ``total_steps``; previously ``total_steps`` was computed
    but unused, so the learning rate stayed at its peak after warmup.
    """
    if step < warmup_steps:
        return (step + 1) / warmup_steps
    return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))


# Learning-rate scheduler
lr_scheduler = LambdaLR(optimizer, lr_lambda=_linear_warmup_decay)

# Cross-entropy loss
criterion = nn.CrossEntropyLoss()

## 模型训练：

In [None]:
# 因为训练过程中同时要在验证集进行模型评估，因此我们先定义评估函数
# @torch.no_grad()
# def evaluate(model, criterion, metric, data_loader, phase="dev"):
#     model.eval()
#     metric.reset()
#     losses = []
#     true_labels = []
#     predicted_labels = []

#     for batch in data_loader:
#         input_ids, token_type_ids, labels = batch
#         outputs = model(input_ids, token_type_ids)
#         loss = criterion(outputs, labels)
#         losses.append(loss.item())

#         _, predicted = torch.max(outputs, 1)
#         true_labels.extend(labels.tolist())
#         predicted_labels.extend(predicted.tolist())

#     true_labels = np.array(true_labels)
#     predicted_labels = np.array(predicted_labels)

#     accuracy = (predicted_labels == true_labels).mean()

#     # 计算召回率、精确率和 F1 分数
#     tp = np.sum((predicted_labels == 1) & (true_labels == 1))
#     fp = np.sum((predicted_labels == 1) & (true_labels == 0))
#     fn = np.sum((predicted_labels == 0) & (true_labels == 1))

#     recall = tp / (tp + fn)
#     precision = tp / (tp + fp)
#     f1_score = 2 * (precision * recall) / (precision + recall)

#     print(f"eval {phase} loss: {np.mean(losses):.5f}, accu: {accuracy:.5f}, recall: {recall:.5f}, precision: {precision:.5f}, f1-score: {f1_score:.5f}")

#     model.train()
#     return np.mean(losses), accuracy

# from datasets import load_metric

# accuracy_metric = load_metric("accuracy")
# recall_metric = load_metric("recall")
# f1_metric = load_metric("f1")

# def evaluate(model, criterion, data_loader, phase="dev"):
#     model.eval()
#     losses = []
#     true_labels = []
#     predicted_labels = []

#     for batch in data_loader:
#         input_ids, token_type_ids, labels = batch
#         outputs = model(input_ids, token_type_ids)
#         loss = criterion(outputs, labels)
#         losses.append(loss.item())

#         _, predicted = torch.max(outputs, 1)
#         true_labels.extend(labels.tolist())
#         predicted_labels.extend(predicted.tolist())

#     accuracy = accuracy_metric.compute(references=true_labels, predictions=predicted_labels)
#     recall = recall_metric.compute(references=true_labels, predictions=predicted_labels)
#     f1 = f1_metric.compute(references=true_labels, predictions=predicted_labels)

#     print(f"eval {phase} loss: {np.mean(losses):.5f}, accuracy: {accuracy:.5f}, recall: {recall:.5f}, f1-score: {f1:.5f}")

#     model.train()
#     return np.mean(losses), accuracy, recall, f1
def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def evaluate(model, criterion, data_loader, phase="dev"):
    """Evaluate ``model`` on ``data_loader`` and print binary-classification metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class scores per example.
        criterion: Loss called as ``criterion(outputs, labels)``.
        data_loader: Iterable of ``(input_ids, token_type_ids, labels)``
            batches; ``labels`` may be shaped ``(batch,)`` or ``(batch, 1)``.
        phase: Tag included in the printed summary line.

    Returns:
        tuple: ``(mean_loss, accuracy)``. With no batches the loss is ``nan``
        and the accuracy ``0.0``.
    """
    # Padding positions carry id 0 in the tokenized examples; the attention
    # mask is rebuilt from that because the batches do not contain one.
    pad_token_id = 0

    model.eval()
    losses = []
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        # The second batch element is token_type_ids, not an attention mask,
        # so it must not be forwarded in the mask position.
        for input_ids, _token_type_ids, labels in data_loader:
            # Flatten (batch, 1) labels: the loss and the comparisons below
            # need one class index per example.
            labels = labels.view(-1)
            attention_mask = (input_ids != pad_token_id).long()

            outputs = model(input_ids, attention_mask)
            losses.append(criterion(outputs, labels).item())

            predicted_labels.extend(torch.argmax(outputs, dim=1).tolist())
            true_labels.extend(labels.tolist())

    pairs = list(zip(predicted_labels, true_labels))
    accuracy = _safe_div(sum(p == l for p, l in pairs), len(pairs))
    mean_loss = float(np.mean(losses)) if losses else float("nan")

    # Recall, precision and F1 for the positive class (label 1)
    tp = sum(p == 1 and l == 1 for p, l in pairs)
    fp = sum(p == 1 and l == 0 for p, l in pairs)
    fn = sum(p == 0 and l == 1 for p, l in pairs)

    # A model that never predicts (or never sees) the positive class makes
    # these denominators zero; report 0.0 instead of raising.
    recall = _safe_div(tp, tp + fn)
    precision = _safe_div(tp, tp + fp)
    f1_score = _safe_div(2 * precision * recall, precision + recall)

    print(f"eval {phase} loss: {mean_loss:.5f}, accu: {accuracy:.5f}, recall: {recall:.5f}, precision: {precision:.5f}, f1-score: {f1_score:.5f}")

    model.train()
    return mean_loss, accuracy

In [None]:
# Training loop. Training takes a long time; skip the cell that calls it if
# a training run is not needed.
def train(model, criterion, optimizer, scheduler, dev_data_loader, train_data_loader, epochs=3):
    """Train ``model`` and periodically evaluate it on the dev set.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class scores per example.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        dev_data_loader: Loader handed to ``evaluate`` every 500 steps.
        train_data_loader: Iterable of ``(input_ids, token_type_ids, labels)``
            batches; ``labels`` may be shaped ``(batch,)`` or ``(batch, 1)``.
        epochs: Number of passes over ``train_data_loader``.

    Returns:
        float: Best dev accuracy seen (0.0 if no evaluation ran).
    """
    # Padding positions carry id 0 in the tokenized examples; the attention
    # mask is rebuilt from that because the batches do not contain one.
    pad_token_id = 0
    log_every = 100   # steps between training-metric prints
    eval_every = 500  # steps between dev-set evaluations

    global_step = 0
    tic_train = time.time()
    best_accuracy = 0.0

    for epoch in range(1, epochs + 1):
        model.train()  # switch to training mode
        for step, batch in enumerate(train_data_loader, start=1):
            # Only input_ids and the mask are forwarded: that two-argument
            # call is what the model's forward requires.
            input_ids, _token_type_ids, labels = batch
            # Flatten (batch, 1) labels to one class index per example.
            labels = labels.view(-1)
            attention_mask = (input_ids != pad_token_id).long()

            optimizer.zero_grad()  # clear gradients
            probs = model(input_ids, attention_mask)
            loss = criterion(probs, labels)
            loss.backward()   # back-propagate
            optimizer.step()  # update parameters
            scheduler.step()  # update learning rate

            # Batch accuracy for logging
            correct = (torch.argmax(probs, dim=1) == labels).sum().item()
            total = labels.size(0)
            accuracy = correct / total

            global_step += 1

            # Print training metrics every `log_every` steps
            if global_step % log_every == 0:
                elapsed = time.time() - tic_train
                # The timer covers `log_every` steps, so that is the numerator.
                speed = log_every / elapsed if elapsed > 0 else float("inf")
                print("global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                      % (global_step, epoch, step, loss.item(), accuracy, speed))
                tic_train = time.time()

            # Evaluate on the dev set every `eval_every` steps
            if global_step % eval_every == 0:
                eval_loss, eval_acc = evaluate(model, criterion, dev_data_loader, "dev")
                if best_accuracy < eval_acc:
                    best_accuracy = eval_acc
                    # Checkpointing is currently disabled:
                    # save_param_path = os.path.join(save_dir, 'model_best.pth')
                    # torch.save(model.state_dict(), save_param_path)
                    # tokenizer.save_pretrained(save_dir)

    return best_accuracy


In [None]:
# evaluate(model, criterion, dev_data_loader, phase="dev")
# Start training with the model, loss, optimizer, scheduler and loaders defined above
train(model, criterion, optimizer, lr_scheduler, dev_data_loader, train_data_loader, epochs=3)

TypeError: BERT_RNN.forward() got an unexpected keyword argument 'token_type_ids'

## 模型预测：

In [None]:
# state_dict=paddle.load('checkpoint/model_best.pdparams')
# model.load_dict(state_dict)

# state_dict = torch.load('checkpoint/model_best.pth')
# model.load_state_dict(state_dict)

# No checkpoint was saved, so keep using the in-memory `model` from the
# training section. Re-instantiating BERT_RNN here would throw away the
# trained weights and predict with a freshly initialised RNN/linear head.
model.eval()

In [None]:
# Build the raw (untokenized) test dataset from its JSON file
test_ds = CLUE_Dataset(test_file_path)

In [None]:
def do_predict(model, example, tokenizer, max_length=128):
    """Predict the class index for a single sentence pair.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class scores per example.
        example: Mapping with the keys ``"sentence1"`` and ``"sentence2"``.
        tokenizer: Callable tokenizer whose result provides ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_length: Length the pair is padded / truncated to. The default
            matches the 128 used by ``convert_example``, so the model reads
            the same last time step at prediction time as in training (and
            runs far faster than with 512-token padding).

    Returns:
        int: Index of the highest-scoring class.
    """
    # Convert the text into tensors
    encoded_text = tokenizer(
        text=example["sentence1"],
        text_pair=example["sentence2"],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoded_text['input_ids']
    # The model's second argument is the attention mask; passing
    # token_type_ids there would mask out real tokens.
    attention_mask = encoded_text['attention_mask']

    # Inference mode: no dropout, no gradient tracking
    model.eval()
    with torch.no_grad():
        pooled_output = model(input_ids, attention_mask)

    # Index of the largest score
    return torch.argmax(pooled_output, dim=1).item()

# 预测测试集
# predict_label = []
# for example in tqdm(test_ds):
#     label_pred = do_predict(model, example, tokenizer)
#     predict_label.append(label_pred)

In [None]:
# 打印测试集预测结果
# print(predict_label[:100])


In [None]:
# Predict a label for every example of the validation set
dev_predict_label = []
for example in tqdm(dev_ds):
    dev_predict_label.append(do_predict(model, example, tokenizer))

100%|██████████| 4316/4316 [1:17:07<00:00,  1.07s/it]


In [None]:
# Show a slice of the validation predictions
print(dev_predict_label[3000:3100])

NameError: name 'dev_predict_label' is not defined

In [None]:
from sklearn.metrics import accuracy_score

# The ground truth is stored in each example's "label" field as a string
# ('0' / '1'). Cast it to int so it is comparable with the integer class
# indices returned by do_predict; comparing str with int scores every
# example as wrong.
true_labels = [int(example["label"]) for example in dev_ds]

# Compute accuracy
accuracy = accuracy_score(true_labels, dev_predict_label)
print("Accuracy:", accuracy)


Accuracy: 0.0


## 评价指标

In [None]:
# for batch in data_loader:
#     input_ids, attention_mask, labels = batch
#     outputs = model(input_ids, attention_mask)
#     predicted_labels = torch.argmax(outputs, dim=1)
#     acc = accuracy(predicted_labels, labels)

# def accuracy(predictions, labels):
#     _, predicted = torch.max(predictions, 1)
#     correct = (predicted == labels).sum().item()
#     total = labels.size(0)
#     accuracy = correct / total
#     return accuracy
# outputs = model(batch['input_ids'], batch['attention_mask'])
# predicted_labels = torch.argmax(outputs, dim=1)
# acc = accuracy(predicted_labels, labels)


In [None]:
criterion = torch.nn.CrossEntropyLoss()

# evaluate() takes (model, criterion, data_loader, phase) and computes its
# metrics itself, so no separate metric object or extra DataLoader is needed.
eval_loss, eval_acc = evaluate(model, criterion, dev_data_loader, phase="dev")

NameError: name 'Accuracy' is not defined