In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import pandas as pd

In [None]:
class TextClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer) that returns ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (the number of texts)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the sample at ``idx``.

        Returns:
            Tuple ``(input_ids, attention_mask, label)``; the two tensors have
            the tokenizer's leading batch axis removed, ``label`` is returned
            exactly as stored.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Pad/truncate to a fixed length so samples can be stacked into a batch.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch axis. A bare squeeze() would also remove
        # the sequence axis when max_length == 1, yielding a 0-d tensor.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return input_ids, attention_mask, label


In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of target classes (width of the output logits).
        pretrained_path: Model name or local directory handed to
            ``BertModel.from_pretrained``. Defaults to the local
            ``./bert-base-uncased`` checkpoint.
        dropout_prob: Dropout probability applied to the pooled output.
    """

    def __init__(self, num_classes, pretrained_path='./bert-base-uncased',
                 dropout_prob=0.1):
        super().__init__()
        # Pretrained encoder; its weights are fine-tuned together with the head.
        self.bert = BertModel.from_pretrained(pretrained_path)
        self.dropout = nn.Dropout(dropout_prob)
        # Head maps the encoder's hidden size to one logit per class.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, inputs, attention_mask):
        """Return classification logits for a batch.

        Args:
            inputs: Token id tensor fed to the encoder as its first argument.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Output of the linear head applied to the dropped-out pooled output.
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) is used here.
        _, pooled_output = self.bert(
            inputs, attention_mask=attention_mask, return_dict=False)
        output = self.dropout(pooled_output)
        logits = self.fc(output)
        return logits

In [None]:
# Hyperparameters and run configuration
max_length = 128  # sequence length every text is padded/truncated to
batch_size = 32  # samples per batch
num_epochs = 20  # number of passes over the training set
learning_rate = 1e-5  # optimizer learning rate
num_classes = 2  # number of target classes
seed = 42  # fixed seed so shuffling, dropout and head initialisation repeat across runs
torch.manual_seed(seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
task = 'race'  # task name; also used as the label column name when loading the data
print(task)

In [None]:
# Pretrained BERT tokenizer loaded from the local checkpoint directory.
tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased')

# Mapping from the string labels in the CSV files to integer class ids.
_LABEL_MAP = {'Positive': 1, 'Negative': 0}


def _load_split(csv_path, label_column, encoding=None):
    """Read one CSV split and convert its label column to integer class ids.

    Args:
        csv_path: Path of the CSV file to read.
        label_column: Name of the column holding 'Positive'/'Negative' labels.
        encoding: Text encoding passed to ``pd.read_csv`` (None = pandas default).

    Returns:
        The loaded DataFrame with ``label_column`` cast to int64.

    Raises:
        ValueError: If a label cannot be converted to an integer, e.g. a
            string other than 'Positive'/'Negative' or a missing value.
    """
    frame = pd.read_csv(csv_path, encoding=encoding)
    # The explicit cast fails here on unmapped labels instead of letting
    # strings reach the DataLoader and break inside the training loop.
    frame[label_column] = frame[label_column].replace(_LABEL_MAP).astype('int64')
    return frame


# NOTE(review): the training file is read with the pandas default encoding
# while the test file uses ISO-8859-1 — confirm this difference is intended.
train_dataframe = _load_split('race/train10000.txt', task)
print(len(train_dataframe))

test_dataframe = _load_split('race/test2000.txt', task, encoding='ISO-8859-1')
print(len(test_dataframe))

# Extract texts and labels as plain Python lists.
train_texts = train_dataframe['text'].tolist()
train_labels = train_dataframe[task].tolist()

test_texts = test_dataframe['text'].tolist()
test_labels = test_dataframe[task].tolist()

# Wrap each split in a dataset that tokenizes on access.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)

# Batch the datasets; only the training split is shuffled.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)



In [None]:

# Build the BertClassifier and move its parameters to the selected device.
model = BertClassifier(num_classes).to(device)

# Cross-entropy loss on the logits, optimised with AdamW over all parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
# Fine-tune for num_epochs epochs, evaluating on the test split after each one.
for epoch in range(num_epochs):
    # Switch back to training mode at the start of every epoch: the evaluation
    # pass below calls model.eval(), which would otherwise leave dropout
    # disabled for all epochs after the first.
    model.train()
    train_total_correct = 0
    train_total_samples = 0

    for input_ids, attention_mask, labels in train_dataloader:
        # Move the batch to the same device as the model.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass (invokes BertClassifier.forward) and loss.
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest logit; accumulate accuracy counts.
        predicted_labels = outputs.argmax(dim=1)
        train_total_correct += torch.sum(predicted_labels == labels).item()
        train_total_samples += labels.size(0)

    train_accuracy = train_total_correct / train_total_samples

    # Evaluation: disable dropout and gradient tracking.
    model.eval()

    with torch.no_grad():
        test_total_correct = 0
        test_total_samples = 0

        # The label batch gets its own name so the module-level `test_labels`
        # list built in the data-loading cell is not overwritten by a tensor.
        for test_input_ids, test_attention_mask, test_batch_labels in test_dataloader:
            test_input_ids = test_input_ids.to(device)
            test_attention_mask = test_attention_mask.to(device)
            test_batch_labels = test_batch_labels.to(device)

            test_outputs = model(test_input_ids, test_attention_mask)
            test_predicted_labels = test_outputs.argmax(dim=1)

            test_total_correct += torch.sum(test_predicted_labels ==
                                            test_batch_labels).item()
            test_total_samples += test_batch_labels.size(0)

    test_accuracy = test_total_correct / test_total_samples

    # The reported loss is that of the last training batch of the epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Training Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

In [None]:
# Persist only the model's parameters (its state_dict) to disk.
torch.save(obj=model.state_dict(), f='bert_classifier.pt')