In [1]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertModel, BertTokenizer, BertConfig, get_cosine_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import AdamW
import torch.nn as nn
import torch
import time
from bs4 import BeautifulSoup

# Directory handed to every `from_pretrained` call in this notebook.
BERT_PATH = "bert_model/"    # this folder holds three files ('vocab.txt', 'pytorch_model.bin', 'config.json')
# DATA_PATH = "data/tags_data.txt"
# Text file read by load_data(): one "<tags>\t<label>" example per line.
DATA_PATH = "data/test_data.txt"
# Passed as `max_length` to tokenizer.encode_plus (with padding="max_length" and
# truncation=True), so every encoded example has exactly this many token ids.
MAX_LEN = 64
# Batch size shared by the train, validation and test DataLoaders.
BATCH_SIZE = 4
# Number of training epochs; also used to size the LR schedule.
EPOCHS = 10

In [2]:
class Bert_Model(nn.Module):
    """Pretrained BERT encoder topped with a single linear classification layer.

    Args:
        bert_path: Location given to `from_pretrained` for both the config
            and the encoder weights.
        classes: Number of output logits produced per example (default 2).
    """

    def __init__(self, bert_path, classes=2):
        super().__init__()
        # Model hyper-parameters; `hidden_size` sizes the classifier input.
        self.config = BertConfig.from_pretrained(bert_path)
        # Pretrained encoder weights.
        self.bert = BertModel.from_pretrained(bert_path)
        # Classification head applied directly to the encoder's pooled output.
        self.fc = nn.Linear(self.config.hidden_size, classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return the raw (un-normalised) class logits for a batch.

        The three arguments are forwarded unchanged to the encoder; element 1
        of the encoder's output (the pooled representation) feeds the linear
        layer.
        """
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled = encoder_out[1]
        return self.fc(pooled)

In [3]:
# Tokenizer loaded from the files under BERT_PATH; used by load_data() and by the
# single-example inference cell at the end of the notebook.
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

In [None]:
def load_data():
    """Read DATA_PATH, tokenize every example and split the result 8:1:1.

    Every non-blank line must be ``<tags><TAB><label>`` where ``label`` parses
    as an int. Blank lines are skipped. The line is split on its LAST tab, so
    the label is always the final field even if the tag text contains a tab.

    Returns:
        (train_data, valid_data, test_data). Each is a list of four parallel
        lists: [input_ids, input_masks, input_types, tag_labels].
        The split is positional (no shuffling): with ``unit = n // 10`` the
        first ``8 * unit`` examples are train, the next ``unit`` are
        validation and the remainder is test. With fewer than 10 examples
        ``unit`` is 0, so train and validation are empty.

    Raises:
        ValueError: if a non-blank line has no tab separator or its label is
            not an integer.
    """
    input_ids, input_masks, input_types, tag_labels = [], [], [], []

    with open(DATA_PATH, encoding="utf-8") as f:
        for line_no, line in enumerate(tqdm(f), start=1):
            line = line.strip()
            if not line:
                # e.g. an empty line in the middle or at the end of the file
                continue

            tags, sep, labels = line.rpartition("\t")
            if not sep:
                raise ValueError(
                    "{}:{}: expected '<tags>\\t<label>', got {!r}".format(
                        DATA_PATH, line_no, line))

            encode_dict = tokenizer.encode_plus(text=tags, max_length=MAX_LEN,
                                                padding="max_length", truncation=True)

            input_ids.append(encode_dict["input_ids"])
            input_types.append(encode_dict["token_type_ids"])
            input_masks.append(encode_dict["attention_mask"])
            tag_labels.append(int(labels))

    all_data = (input_ids, input_masks, input_types, tag_labels)
    unit = len(tag_labels) // 10
    train_end, valid_end = unit * 8, unit * 9
    train_data = [column[:train_end] for column in all_data]
    valid_data = [column[train_end:valid_end] for column in all_data]
    test_data = [column[valid_end:] for column in all_data]

    return train_data, valid_data, test_data
# Each split is a list of four parallel lists:
# [input_ids, input_masks, input_types, tag_labels].
train_data, valid_data, test_data = load_data()

In [8]:
def _to_loader(split):
    """Turn one split from load_data() into (dataset, sampler, loader).

    Every column of the split becomes a LongTensor; the loader draws batches
    of BATCH_SIZE through a RandomSampler.
    """
    dataset = TensorDataset(*(torch.LongTensor(column) for column in split))
    sampler = RandomSampler(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataset, sampler, loader


train_dataset, train_sampler, train_loader = _to_loader(train_data)
valid_dataset, valid_sampler, valid_loader = _to_loader(valid_data)
test_dataset, test_sampler, test_loader = _to_loader(test_data)

In [None]:
def get_parameter_number(model):
    """Describe how many parameters `model` has and how many are trainable.

    Args:
        model: Object exposing ``parameters()`` whose items have ``numel()``
            and ``requires_grad``.

    Returns:
        The string 'Total parameters: <n>, Trainable parameters: <m>'.
    """
    total_num = 0
    trainable_num = 0
    for param in model.parameters():
        size = param.numel()
        total_num += size
        if param.requires_grad:
            trainable_num += size
    template = 'Total parameters: {}, Trainable parameters: {}'
    return template.format(total_num, trainable_num)

# The commented-out line below is the CUDA auto-detect alternative; the notebook
# currently forces CPU.
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = torch.device("cpu")
model = Bert_Model(BERT_PATH).to(DEVICE)
print(get_parameter_number(model))
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4) # AdamW optimizer
# scheduler.step() is called once per batch inside train_and_eval, so the step
# counts here are in batches: warm-up spans one epoch's worth of batches and the
# whole schedule spans EPOCHS epochs.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=len(train_loader),
                                            num_training_steps=EPOCHS*len(train_loader))

In [32]:
def evaluate(model, data_loader, device):
    """Compute classification accuracy of `model` over `data_loader`.

    Args:
        model: Callable as ``model(ids, att, tpe)`` returning logits of shape
            (batch, classes); switched to eval mode here.
        data_loader: Iterable of ``(ids, att, tpe, y)`` tensor batches.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        The value of ``accuracy_score(true_labels, predicted_labels)``.
    """
    model.eval()
    val_true, val_pred = [], []
    with torch.no_grad():
        for ids, att, tpe, y in data_loader:
            logits = model(ids.to(device), att.to(device), tpe.to(device))
            val_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
            # reshape(-1) keeps a 1-D tensor even for a batch of size 1;
            # squeeze() would collapse it to a scalar, whose tolist() is an
            # int that list.extend() cannot consume.
            val_true.extend(y.reshape(-1).cpu().tolist())

    return accuracy_score(val_true, val_pred)


def predict(model, data_loader, device):
    """Run `model` over a labelled loader, print its metrics and return predictions.

    Prints the accuracy and a 4-digit classification report computed from the
    labels carried by `data_loader`.

    Args:
        model: Callable as ``model(ids, att, tpe)`` returning logits of shape
            (batch, classes); switched to eval mode here.
        data_loader: Iterable of ``(ids, att, tpe, y)`` tensor batches.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        List of predicted class indices, in the order the loader yielded them.
    """
    model.eval()
    val_true, val_pred = [], []
    with torch.no_grad():
        # Wrapping the loader itself (not enumerate(...)) lets tqdm see its length.
        for ids, att, tpe, y in tqdm(data_loader):
            logits = model(ids.to(device), att.to(device), tpe.to(device))
            val_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
            # reshape(-1) stays 1-D for a batch of size 1, where squeeze()
            # would yield a scalar and break list.extend().
            val_true.extend(y.reshape(-1).cpu().tolist())

    print("\n Test Accuracy = {} \n".format(accuracy_score(val_true, val_pred)))
    print(classification_report(val_true, val_pred, digits=4))
    return val_pred


def train_and_eval(model, train_loader, valid_loader,
                   optimizer, scheduler, device, epoch,
                   save_path="best_bert_model.pth"):
    """Fine-tune `model` and checkpoint the weights with the best validation accuracy.

    Args:
        model: Callable as ``model(ids, att, tpe)`` returning class logits.
        train_loader: Sized iterable of ``(ids, att, tpe, y)`` training batches.
        valid_loader: Loader handed to ``evaluate`` after every epoch.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, right after the optimizer.
        device: Device every batch is moved to.
        epoch: Number of epochs to run.
        save_path: Where ``model.state_dict()`` is written each time validation
            accuracy strictly improves.

    Returns:
        The best validation accuracy seen. It stays 0.0, and no checkpoint is
        written, if no epoch scores above 0.
    """
    best_acc = 0.0
    criterion = nn.CrossEntropyLoss()
    num_batches = len(train_loader)
    # Log every fifth of an epoch. max(1, ...) prevents a modulo-by-zero when
    # the loader has fewer than five batches.
    log_every = max(1, num_batches // 5)

    for i in range(epoch):
        # ---- training ----
        start = time.time()
        model.train()
        print("***** Running training epoch {} *****".format(i+1))
        train_loss_sum = 0.0
        for idx, (ids, att, tpe, y) in enumerate(train_loader):
            ids, att, tpe, y = ids.to(device), att.to(device), tpe.to(device), y.to(device)
            y_pred = model(ids, att, tpe)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()   # advance the learning-rate schedule

            train_loss_sum += loss.item()
            if (idx + 1) % log_every == 0:
                print("Epoch {:04d} | Step {:04d}/{:04d} | Loss {:.4f} | Time {:.4f}".format(
                          i+1, idx+1, num_batches, train_loss_sum/(idx+1), time.time() - start))

        # ---- validation ----
        model.eval()
        acc = evaluate(model, valid_loader, device)
        # Keep only the best-scoring weights on disk.
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), save_path)

        print("current acc is {:.4f}, best acc is {:.4f}".format(acc, best_acc))
        print("time costed = {}s \n".format(round(time.time() - start, 5)))

    return best_acc

In [None]:
# Fine-tune for EPOCHS epochs; whenever validation accuracy improves, train_and_eval
# writes the model weights to "best_bert_model.pth".
train_and_eval(model, train_loader, valid_loader, optimizer, scheduler, DEVICE, EPOCHS)


In [33]:
# Restore the best checkpoint written during training. map_location remaps the
# stored tensors onto DEVICE, so a checkpoint saved from a GPU run still loads
# on a CPU-only machine.
model.load_state_dict(torch.load("best_bert_model.pth", map_location=DEVICE))
# predict() prints the accuracy and the classification report for the test split.
pred_test = predict(model, test_loader, DEVICE)



 Test Accuracy = 0.85 

              precision    recall  f1-score   support

           0     0.9677    0.7317    0.8333        41
           1     0.7755    0.9744    0.8636        39

    accuracy                         0.8500        80
   macro avg     0.8716    0.8530    0.8485        80
weighted avg     0.8740    0.8500    0.8481        80



In [None]:
# One example: the space-separated sequence of HTML tag names of a page.
tags = "html head meta meta meta meta meta meta link meta link link link link link link title meta meta meta meta meta meta link link meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta script script link meta meta meta meta meta meta body div section div div div script span a div h1 span img a a nav ul li a ul li a li a li a li a li a li a li a li a li a li a li a li a li a li a ul li a li a li a li a li a li a li a ul li a li a li a li a li a li a li a li a li a li a li a li a ul li a li a li a li a li a li span section div div img img div div div div div div h1 span p input a p a section div div div div div div div div div div h2 p br br p a section div div div div div div div div div div h2 p p p a footer section div div div div ul li a li a li a li a span div ul li a span span a span span a span span section div div div h5 ul li a li a li a li a li a li a li a li a li a li a li a li a div h5 ul li a li a li a li a li a li a div h5 ul li a li a li a li a li a li a li a li a li a li a li a div h5 ul li a li a li a li a li a div a div div br br p b hr div div small span span a a a small a a a section a div div div h6 section div div div div div h3 p a script script script script script script script script noscript img script script script script"

# Same encoding settings as load_data(): pad/truncate to exactly MAX_LEN tokens.
encode_dict = tokenizer.encode_plus(text=tags, max_length=MAX_LEN,
                                    padding="max_length", truncation=True)

input_ids = encode_dict["input_ids"]
input_types = encode_dict["token_type_ids"]
input_masks = encode_dict["attention_mask"]

# Inference only: make eval mode explicit rather than relying on an earlier
# cell, and skip gradient tracking.
model.eval()
with torch.no_grad():
    # Keyword arguments pin each tensor to the right parameter of
    # Bert_Model.forward(input_ids, attention_mask, token_type_ids); passed
    # positionally, the mask and the segment ids are easy to swap.
    # The extra list level adds a batch dimension of 1.
    result = model(
        input_ids=torch.LongTensor([input_ids]).to(DEVICE),
        attention_mask=torch.LongTensor([input_masks]).to(DEVICE),
        token_type_ids=torch.LongTensor([input_types]).to(DEVICE),
    )
# Predicted class index for the single example (displayed as the cell output).
torch.argmax(result, dim=1)