In [10]:
# 导入
import json
import logging
import os

import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, classification_report
from torch.utils.data import DataLoader, RandomSampler
from transformers import BertTokenizer

import dataset
import models
import preprocess
import utils
from config import args

In [11]:
# Logger named after the current module; used by Trainer and the cells below.
logger = logging.getLogger(__name__)

In [12]:
# Wraps model construction, training, evaluation and inference in one object.
class Trainer:
    """Train, evaluate and run a BERT-based multi-label classifier.

    The model is `models.BertForMultiLabelClassification(args)`. According to
    a recorded print of the model it is a `BertModel` followed by
    `Dropout(p=0.3)` and `Linear(768 -> 65)`, i.e. one logit per label.

    Args:
        args: Configuration object. This class reads `lr`, `train_epochs`
            and `output_dir` from it; `predict` additionally reads
            `output_dir` and `max_seq_len` from the `args` passed to it.
        train_loader: Iterable of training batches (may be None when the
            instance is only used for prediction).
        dev_loader: Iterable of validation batches (may be None).
        test_loader: Iterable of test batches (may be None).

    Each batch is expected to be a dict with the keys `token_ids`,
    `attention_masks`, `token_type_ids` and `labels`, all tensors.
    """

    # A label is predicted when its sigmoid probability exceeds this value.
    threshold = 0.6
    # Number of optimisation steps between two dev-set evaluations.
    eval_step = 100

    def __init__(self, args, train_loader, dev_loader, test_loader):
        self.args = args

        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        # Build the model and move it to the selected device.
        self.model = models.BertForMultiLabelClassification(args)
        self.model.to(self.device)

        self.optimizer = torch.optim.Adam(
            params=self.model.parameters(), lr=self.args.lr)

        # Multi-label objective: an independent sigmoid + BCE per label.
        self.criterion = nn.BCEWithLogitsLoss()

        self.train_loader = train_loader
        self.dev_loader = dev_loader
        self.test_loader = test_loader

    def load_ckp(self, model, optimizer, checkpoint_path):
        """Restore model and optimizer state from a checkpoint file.

        Args:
            model: Module whose `load_state_dict` receives `state_dict`.
            optimizer: Optimizer whose `load_state_dict` receives `optimizer`.
            checkpoint_path: Path of a file written by `save_ckp`.

        Returns:
            Tuple `(model, optimizer, epoch, loss)` where `epoch` and `loss`
            are the values stored in the checkpoint.
        """
        # map_location lets a checkpoint saved on GPU be loaded on a
        # CPU-only machine (and vice versa).
        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        return model, optimizer, checkpoint['epoch'], checkpoint['loss']

    def save_ckp(self, state, checkpoint_path):
        """Serialize `state` to `checkpoint_path` with `torch.save`."""
        torch.save(state, checkpoint_path)

    def _forward_batch(self, model, batch):
        """Move one batch to the device and return `(logits, labels)`."""
        token_ids = batch['token_ids'].to(self.device)
        attention_masks = batch['attention_masks'].to(self.device)
        token_type_ids = batch['token_type_ids'].to(self.device)
        labels = batch['labels'].to(self.device)
        logits = model(token_ids, attention_masks, token_type_ids)
        return logits, labels

    def _binarize(self, logits):
        """Turn logits into a 0/1 integer numpy array using `self.threshold`."""
        # Compare in float64 so borderline probabilities are decided against
        # the exact Python float threshold.
        probs = torch.sigmoid(logits).detach().cpu().numpy().astype(np.float64)
        return (probs > self.threshold).astype(int)

    def _evaluate(self, model, loader):
        """Run `model` over `loader` without gradients.

        Returns:
            Tuple `(total_loss, predictions, targets)`. `total_loss` is the
            SUM of the per-batch losses (not an average); `predictions` and
            `targets` are lists with one row per example.
        """
        model.eval()
        total_loss = 0.0
        predictions = []
        targets = []
        with torch.no_grad():
            for batch in loader:
                logits, labels = self._forward_batch(model, batch)
                total_loss += self.criterion(logits, labels).item()
                predictions.extend(self._binarize(logits).tolist())
                targets.extend(labels.detach().cpu().numpy().tolist())
        return total_loss, predictions, targets

    def train(self):
        """Train for `args.train_epochs` epochs with periodic dev evaluation.

        Every `eval_step` optimisation steps the dev set is evaluated, and
        whenever the dev macro-F1 improves the model and optimizer state are
        written to `<args.output_dir>/best.pt`.
        """
        total_step = len(self.train_loader) * self.args.train_epochs
        global_step = 0
        # Model selection is driven by macro-F1.
        best_dev_macro_f1 = 0.0

        # Read the epoch count from self.args so the trainer does not depend
        # on a module-level `args` variable.
        for epoch in range(self.args.train_epochs):
            for train_data in self.train_loader:
                # dev() switches the model to eval mode, so re-enable
                # training mode on every step.
                self.model.train()

                # A recorded print showed logits and labels of shape
                # [32, 65] here (batch of 32, 65 labels).
                train_outputs, labels = self._forward_batch(
                    self.model, train_data)
                loss = self.criterion(train_outputs, labels)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                logger.info(
                    "【train】 epoch：{} step:{}/{} loss：{:.6f}".format(epoch, global_step, total_step, loss.item()))
                global_step += 1

                if global_step % self.eval_step != 0:
                    continue

                dev_loss, dev_outputs, dev_targets = self.dev()
                accuracy, micro_f1, macro_f1 = self.get_metrics(
                    dev_outputs, dev_targets)
                logger.info(
                    "【dev】 loss：{:.6f} accuracy：{:.4f} micro_f1：{:.4f} macro_f1：{:.4f}".format(dev_loss, accuracy,
                                                                                               micro_f1, macro_f1))
                if macro_f1 > best_dev_macro_f1:
                    logger.info("====保存当前最好的模型====")
                    best_dev_macro_f1 = macro_f1
                    checkpoint = {
                        'epoch': epoch,
                        'loss': dev_loss,
                        'state_dict': self.model.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                    }
                    # Make sure the target directory exists so the first
                    # save does not fail after a long training run.
                    if self.args.output_dir:
                        os.makedirs(self.args.output_dir, exist_ok=True)
                    checkpoint_path = os.path.join(
                        self.args.output_dir, 'best.pt')
                    self.save_ckp(checkpoint, checkpoint_path)

    def dev(self):
        """Evaluate the current model on `self.dev_loader`.

        Returns:
            Tuple `(total_loss, dev_outputs, dev_targets)`; see `_evaluate`.
        """
        return self._evaluate(self.model, self.dev_loader)

    def test(self, checkpoint_path):
        """Load `checkpoint_path` and evaluate on `self.test_loader`.

        Returns:
            Tuple `(total_loss, test_outputs, test_targets)`; see `_evaluate`.
        """
        model, _, _, _ = self.load_ckp(
            self.model, self.optimizer, checkpoint_path)
        model.to(self.device)
        return self._evaluate(model, self.test_loader)

    def predict(self, tokenizer, text, id2label, args):
        """Predict the labels of a single text.

        The checkpoint `<args.output_dir>/best.pt` is loaded on every call.

        Args:
            tokenizer: Tokenizer exposing `encode_plus`.
            text: Input sentence.
            id2label: Mapping from label index to label name.
            args: Configuration providing `output_dir` and `max_seq_len`.

        Returns:
            A list of label names when at least one label passes the
            threshold, otherwise a fixed apology string.
        """
        checkpoint = os.path.join(args.output_dir, 'best.pt')
        model, _, _, _ = self.load_ckp(self.model, self.optimizer, checkpoint)
        model.eval()
        model.to(self.device)
        with torch.no_grad():
            inputs = tokenizer.encode_plus(text=text,
                                           add_special_tokens=True,
                                           max_length=args.max_seq_len,
                                           truncation='longest_first',
                                           padding="max_length",
                                           return_token_type_ids=True,
                                           return_attention_mask=True,
                                           return_tensors='pt')
            token_ids = inputs['input_ids'].to(self.device)
            attention_masks = inputs['attention_mask'].to(self.device)
            token_type_ids = inputs['token_type_ids'].to(self.device)
            logits = model(token_ids, attention_masks, token_type_ids)
            # Several labels may fire at once: one sentence can describe
            # more than one event.
            flags = self._binarize(logits)
            label_ids = np.where(flags[0] == 1)[0].tolist()
            if len(label_ids) != 0:
                return [id2label[i] for i in label_ids]
            return '不好意思，我没有识别出来'

    def get_metrics(self, outputs, targets):
        """Return `(accuracy, micro_f1, macro_f1)` for the predictions."""
        accuracy = accuracy_score(targets, outputs)
        micro_f1 = f1_score(targets, outputs, average='micro')
        macro_f1 = f1_score(targets, outputs, average='macro')
        return accuracy, micro_f1, macro_f1

    def get_classification_report(self, outputs, targets, labels):
        """Return sklearn's text classification report, one row per label."""
        return classification_report(targets, outputs, target_names=labels)

In [13]:
# Seed the random number generators and send log records to <log_dir>/main.log.
utils.utils.set_seed(args.seed)
utils.utils.set_logger(os.path.join(args.log_dir, 'main.log'))

processor = preprocess.Processor()

# labels.txt holds one event type per line; a label's id is its line index.
# A recorded print of label2id showed ids 0..64, i.e. 65 event types
# (for example '财经/交易-出售/收购' -> 0 and '组织行为-游行' -> 64).
with open('./data/final_data/labels.txt', 'r', encoding='utf-8') as fp:
    labels = fp.read().strip().split('\n')

label2id, id2label = {}, {}
for i, label in enumerate(labels):
    id2label[i] = label
    label2id[label] = i

# Training split: features -> dataset -> randomly sampled batches.
train_out = preprocess.get_out(
    processor, './data/raw_data/train.json', args, label2id, 'train')
train_features, train_callback_info = train_out
train_dataset = dataset.MLDataset(train_features)
train_sampler = RandomSampler(train_dataset)
train_loader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=args.train_batch_size,
    num_workers=2,
)

# Validation split: same pipeline, default (sequential) batch order.
dev_out = preprocess.get_out(
    processor, './data/raw_data/dev.json', args, label2id, 'dev')
dev_features, dev_callback_info = dev_out
dev_dataset = dataset.MLDataset(dev_features)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=args.eval_batch_size,
    num_workers=2,
)

# The dev loader is passed as the test loader too, so Trainer.test()
# reports metrics on the dev split.
trainer = Trainer(args, train_loader, dev_loader, dev_loader)

Convert 11958 examples to features
Build 11958 features
Convert 1498 examples to features
Build 1498 features
Some weights of the model checkpoint at C:/Users/ji/.cache/huggingface/hub/models--bert-base-chinese/snapshots/8d2a91f91cc38c96bb8b4556ba70c392f8d5ee55 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mo

In [14]:
# Run the training loop; the cell output below shows the per-step loss log.
trainer.train()

【train】 epoch：0 step:0/14960 loss：0.703281


In [None]:
# Evaluate the best checkpoint and log the overall and per-label metrics.
logger.info('========进行测试========')
# Use the path Trainer.train() writes to (<args.output_dir>/best.pt)
# instead of a hard-coded directory, so this cell always evaluates the
# checkpoint that training just produced.
checkpoint_path = os.path.join(args.output_dir, 'best.pt')
total_loss, test_outputs, test_targets = trainer.test(checkpoint_path)
accuracy, micro_f1, macro_f1 = trainer.get_metrics(
    test_outputs, test_targets)
logger.info("【test】 loss：{:.6f} accuracy：{:.4f} micro_f1：{:.4f} macro_f1：{:.4f}".format(
    total_loss, accuracy, micro_f1, macro_f1))
report = trainer.get_classification_report(
    test_outputs, test_targets, labels)
logger.info(report)

In [None]:
# Prediction 1: classify the first few examples of the raw test file.
# No data loaders are needed for inference, so None is passed for all three.
trainer = Trainer(args, None, None, None)
# Kept for reference only: Trainer.predict() resolves
# <args.output_dir>/best.pt itself.
checkpoint_path = os.path.join(args.output_dir, 'best.pt')
tokenizer = BertTokenizer.from_pretrained(args.bert_dir)
with open('./data/raw_data/test1.json', 'r', encoding='utf-8') as fp:
    lines = fp.read().strip().split('\n')[:10]  # only the first 10 records

for line in lines:
    # Each line is one JSON object. Parse it with json.loads rather than
    # eval(): eval executes arbitrary code from the file and rejects the
    # JSON literals true/false/null.
    text = json.loads(line)['text']
    print(text)
    result = trainer.predict(tokenizer, text, id2label, args)
    print(result)

In [None]:
# Prediction 2: classify a single sentence. Relies on `trainer`, `tokenizer`
# and `id2label` created by earlier cells.
text = '2019年，习近平主席热情称赞了国家图书馆工作人员的辛勤付出'
print(trainer.predict(tokenizer, text, id2label, args))