In [1]:
import torch
from datasets import load_dataset, load_from_disk
BASE = os.getcwd()  #项目目录

#定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        self.label2id = {"科技":0,"股票":1,"教育":2,"财经":3,"娱乐":4} 
        #存放训练集 验证集和测试集的路径
        data_files={
            "train": [f"{BASE}/data/extract/{label}-train.csv" for label in  self.label2id],
            "dev": [f"{BASE}/data/extract/{label}-dev.csv" for label in  self.label2id],
            "test": [f"{BASE}/data/extract/{label}-test.csv" for label in  self.label2id],
        } 
        #读取数据，delimiter是每行的分隔符，column_names是文件数据的列名
        self.dataset = load_dataset('csv', data_files=data_files, delimiter='\t', column_names=[ "label","title", "content"], split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i): #迭代生成每条数据
        text = self.dataset[i]['title'] #一个i就是一条数据，将标题作为训练的文本，也可以增加content
        label = self.label2id[self.dataset[i]['label']]  #把文字的label转换成id

        return text, label


train_dataset = Dataset('train')
dev_dataset = Dataset('dev')
test_dataset = Dataset('test')

len(train_dataset),len(dev_dataset),len(test_dataset), train_dataset[0]

ModuleNotFoundError: No module named 'datasets'

In [None]:
from transformers import BertTokenizer

#加载字典和分词工具
tokenizer = BertTokenizer.from_pretrained(f'{BASE}/model/bert-base-chinese')

tokenizer

In [3]:
def collate_fn(data): #数据加载函数
    # print(data)
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    #编码，把中文变成id
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                    #当句子长度大于max_length时,截断
                                   truncation=True,
                                   padding='max_length', #一律补零到max_length长度
                                   max_length=50,
                                   return_tensors='pt', #可取值tf,pt,np,默认为返回list
                                   return_length=True)  #返回length 标识长度

    #input_ids:编码之后的数字
    input_ids = data['input_ids']
    #attention_mask:是补零的位置是0,其他位置是1
    attention_mask = data['attention_mask']
    #token_type_ids 第一个句子和特殊符号的位置是0,第二个句子的位置是1
    token_type_ids = data['token_type_ids']

    #标签
    labels = torch.LongTensor(labels)

    #print(data['length'], data['length'].max())
    return input_ids, attention_mask, token_type_ids, labels


#数据加载器
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                     batch_size=32,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)
dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                     batch_size=32,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                     batch_size=32,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(train_loader):
    break

print(len(train_loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

312


(torch.Size([32, 50]),
 torch.Size([32, 50]),
 torch.Size([32, 50]),
 tensor([0, 1, 0, 3, 0, 0, 4, 4, 3, 0, 2, 2, 0, 3, 2, 0, 2, 4, 1, 4, 2, 2, 3, 4,
         1, 3, 2, 1, 4, 4, 2, 2]))

In [4]:
from transformers import BertModel
import os

device = 'cpu' 
device = 'cuda:0' #如果有GPU可以换成'cuda:0'

#加载预训练模型Bert
pretrained = BertModel.from_pretrained(f'{BASE}/model/bert-base-chinese').to(device)

#不训练,不需要计算梯度
# for param in pretrained.parameters():
#     param.requires_grad_(False)

#模型试算
out = pretrained(input_ids=input_ids.to(device),
           attention_mask=attention_mask.to(device),
           token_type_ids=token_type_ids.to(device))

out.last_hidden_state.shape

torch.Size([32, 50, 768])

In [5]:
#定义下游任务模型
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,f1_score
import math
class Model(torch.nn.Module):
    def __init__(self,num_label,bert):
        super().__init__()
        #定义分类器
        self.fc = torch.nn.Linear(768, num_label)
        #定义bert
        self.bert = bert

    #指标计算函数
    def compute(self,labels, preds):
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
        if precision + recall < 0.01:
            precision = 0.01
        # f1 = 2 * precision * recall / (precision + recall)
        if math.isnan(f1):
            f1 = 0
            precision = 0.01
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }
    
    #前向传播过程
    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            #得到bert最后一层的每句话的输出向量，shape=(bs,maxlen,768)
            out = self.bert(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)

        out = self.fc(out.last_hidden_state[:, 0]) #取出[CLS]的首部信息作为分类器的输入

        out = out.softmax(dim=1) #softmax转换成概率

        return out


model = Model(5,pretrained).to(device)

In [8]:
from transformers import AdamW
import os
import warnings
warnings.filterwarnings("ignore")
#训练

#优化器定义
optimizer = AdamW(model.parameters(), lr=5e-5)  
#损失函数
criterion = torch.nn.CrossEntropyLoss().to(device)

#早停，如果10轮内指标不再上升就停止训练
early_stop=10

#模型保存路径
save_path=f"{BASE}/result/model.pt"
early_stop_flag = 0 

#记录最好结果的模型epoch
best_epoch = 0 
best_batch = 0
#最好的模型在验证集上的指标
best_val_f_macro = 0

#标识模型训练
model.train()

#训练epoch
num_epoch=5

for epoch in range(num_epoch):
    for batch_id, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(train_loader):
        input_ids=input_ids.to(device);attention_mask=attention_mask.to(device);token_type_ids=token_type_ids.to(device)
        labels=labels.to(device)
        # print(input_ids.is_cuda)
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
        #计算损失
        train_loss = criterion(out, labels)
        #反向传播
        train_loss.backward()
        #参数更新
        optimizer.step()
        #梯度清零
        optimizer.zero_grad()

        #每10batch计算验证集指标
        if batch_id % 50 == 0:
            #模型验证
            model.eval()
            #记录全部dev集的预测类别和标签类别
            Labels=torch.tensor([]);Pres=torch.tensor([])
            for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(dev_loader):
                input_ids=input_ids.to(device);attention_mask=attention_mask.to(device);token_type_ids=token_type_ids.to(device)
                out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
                loss = criterion(out.cpu(), labels)
                out = out.argmax(dim=1)
                out.detach_() 

                #拼接
                Labels=torch.concat([Labels,labels])
                Pres=torch.concat([Pres,out.cpu()])
            #指标计算
            zb = model.compute(Labels, Pres)
            # print(i,loss.item(), zb)
            print(f"*** epoch{epoch + 1},batch{batch_id+1} train loss {train_loss.item()}, dev loss {loss.item()}, dev f1 {zb['f1']}, best f1-macro now:{best_val_f_macro}")      

            #记录最优结果和保存模型
            if zb['f1'] > best_val_f_macro:
                best_val_f_macro = zb['f1']
                best_epoch = epoch
                best_batch = batch_id
                early_stop_flag = 0
                # 保存本轮训练结果
                torch.save({'net':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':i, 'batch':batch_id}, save_path) 
            else:
                early_stop_flag += 1
                #早停
                if early_stop_flag == early_stop:
                    print(f'\nThe model has not been improved for {early_stop} rounds. Stop early!')
                    break

*** epoch1,batch1 train loss 1.2083163261413574, dev loss 1.1821329593658447, dev f1 0.8621039983483978, best f1-macro now:0
*** epoch1,batch51 train loss 1.2121587991714478, dev loss 1.1329888105392456, dev f1 0.8632374712723792, best f1-macro now:0.8621039983483978
*** epoch1,batch101 train loss 1.1789382696151733, dev loss 1.1485633850097656, dev f1 0.8656054573271567, best f1-macro now:0.8632374712723792
*** epoch1,batch151 train loss 1.1174002885818481, dev loss 1.1126203536987305, dev f1 0.869636326572231, best f1-macro now:0.8656054573271567
*** epoch1,batch201 train loss 1.1982020139694214, dev loss 1.1299853324890137, dev f1 0.8679107751590293, best f1-macro now:0.869636326572231
*** epoch1,batch251 train loss 1.1362107992172241, dev loss 1.0824124813079834, dev f1 0.868089728314757, best f1-macro now:0.869636326572231
*** epoch1,batch301 train loss 1.0984047651290894, dev loss 1.1879661083221436, dev f1 0.8693344588765655, best f1-macro now:0.869636326572231
*** epoch2,batch1

In [11]:
#测试
def test(model,test_loader):
    model.eval()
    correct = 0
    total = 0
    test_out=torch.tensor([]);test_label = torch.tensor([])
    for epoch, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(test_loader):
        input_ids=input_ids.to(device);attention_mask=attention_mask.to(device);token_type_ids=token_type_ids.to(device)
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        test_label=torch.concat([test_label,labels])
        test_out=torch.concat([test_out,out.cpu()])

    zb = model.compute(test_label, test_out)
    print(f"*** test ***")     
    print(zb)    

#加载最优模型       
test_model = Model(5,pretrained).to(device)
test_model.load_state_dict(torch.load(save_path)['net'])
test(test_model,test_loader)

*** test ***
{'accuracy': 0.8890224358974359, 'f1': 0.8881352395536218, 'precision': 0.887867603173526, 'recall': 0.889002609200697}
