In [2]:
from random import random
import os
import random
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup, BertPreTrainedModel
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
import numpy as np
import torch
from torch.utils.data import Dataset
import re
from transformers import AutoConfig
from transformers import AutoTokenizer, BertModel
from torch.utils.tensorboard import SummaryWriter
import time
import sklearn.metrics as metrics

In [13]:
# Tokenisation / optimisation hyper-parameters used by the cells below.
max_length = 512
batch_size = 4
learning_rate = 1e-5
epoch_num = 3
# Theme (aspect) label names.
# This must be an ORDERED sequence: it is passed as `target_names` to
# `classification_report`, where position i names column i of the multi-hot
# label vectors. A set iterates in arbitrary order and would attach per-class
# scores to the wrong names. The order matches `ChnSentiCorp.themes`.
themes = ["动力", "价格", "内饰", "配置", "安全性", "外观", "操控", "油耗", "空间", "舒适性"]


# Run on the GPU when one is visible to torch, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

# Pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


Using cuda device


In [14]:
def seed_everything(seed=1029):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, and torch (CPU plus all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


seed_everything(12)

def Ac(y_true, y_pred):
    """Return the accuracy of ``y_pred`` measured against ``y_true``.

    Delegates to ``accuracy_score`` from the module imported as ``metrics``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same layout as ``y_true``.

    Returns:
        The score computed by ``metrics.accuracy_score``.
    """
    score = metrics.accuracy_score(y_true, y_pred)
    return score

def predict(outputs, alpha=0.4, k=2):
    """Turn raw logits into a multi-hot prediction matrix.

    A label is predicted for a row when it is among that row's ``k`` highest
    sigmoid probabilities AND its probability is strictly greater than
    ``alpha``. A row can therefore receive between 0 and ``k`` labels.

    Args:
        outputs: 2-D tensor of logits, one row per sample, one column per label.
        alpha: Probability threshold a top-k label must exceed to be kept.
        k: Maximum number of labels per row. Clamped to the number of label
            columns, so inputs with fewer than ``k`` columns do not raise.

    Returns:
        A CPU tensor with the same shape and dtype as ``sigmoid(outputs)``,
        holding 1 for predicted labels and 0 elsewhere.
    """
    # detach(): predictions are only used for metrics, never for gradients.
    probs = torch.sigmoid(outputs.detach())

    # Never ask topk for more entries than there are label columns.
    top_k = max(0, min(k, probs.size(1)))
    top_vals, top_idx = torch.topk(probs, k=top_k, dim=1, largest=True)

    # Write 1 at each top-k position whose probability clears the threshold.
    # One vectorised scatter replaces the per-element Python loop, which forced
    # a device synchronisation on every comparison.
    picked = torch.zeros_like(probs)
    picked.scatter_(1, top_idx, (top_vals > alpha).to(probs.dtype))
    return picked.cpu()

In [15]:
class ChnSentiCorp(Dataset):
    """Map-style dataset of comments annotated with ``theme#polarity`` tags.

    Each non-blank line of the data file contains a comment plus one or more
    annotations of the form ``<theme>#<integer polarity>`` (e.g. ``价格#-1``).
    Every line becomes a sample dict with the keys:

    * ``'comment'``   – the line with all annotations and spaces removed;
    * ``'themes'``    – multi-hot list aligned with ``self.themes``;
    * ``'sentiment'`` – 2 / 0 / 1 when the summed polarity is positive /
      negative / zero.
    """

    # One ``<theme>#<polarity>`` annotation; compiled once for all instances.
    _THEME_SENTIMENT_PATTERN = re.compile(r'(\S+?)#(-?\d+)')

    def __init__(self, data_file):
        """Parse ``data_file`` eagerly and keep all samples in memory."""
        # Column order of the multi-hot ``'themes'`` vector.
        self.themes = ["动力", "价格", "内饰", "配置", "安全性", "外观", "操控", "油耗", "空间", "舒适性"]
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Read ``data_file`` (UTF-8) and return ``{index: sample_dict}``.

        Blank lines are skipped, so indices are contiguous from 0.
        """
        pattern = self._THEME_SENTIMENT_PATTERN
        Data = {}

        with open(data_file, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line carries no comment and no labels; keeping it
                    # would add an empty, all-zero sample to the dataset.
                    continue

                matches = pattern.findall(line)
                comment = pattern.sub("", line).strip()

                mentioned = {theme for theme, _ in matches}
                multi_hot_vector = [1 if theme in mentioned else 0 for theme in self.themes]

                # Collapse the per-theme polarities into one 3-way label.
                total_sentiment = sum(int(sentiment) for _, sentiment in matches)
                if total_sentiment > 0:
                    sentiment_label = 2
                elif total_sentiment < 0:
                    sentiment_label = 0
                else:
                    sentiment_label = 1

                # Key by the running sample count so that skipped lines do not
                # leave holes in the index range.
                Data[len(Data)] = {
                    'comment': comment.replace(" ", ""),
                    'themes': multi_hot_vector,
                    'sentiment': sentiment_label
                }
        return Data

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at ``idx``.

        Raises:
            IndexError: if ``idx`` is not a valid sample index. Raising
                ``IndexError`` (not ``KeyError``) is what lets plain
                iteration over the dataset stop at the end.
        """
        try:
            return self.data[idx]
        except KeyError:
            raise IndexError(f'sample index out of range: {idx!r}') from None


# Parse both splits up front; each dataset keeps its samples in memory.
train_data, test_data = ChnSentiCorp('data/train.txt'), ChnSentiCorp('data/test.txt')

# Sanity check: split sizes and the first parsed training sample.
print(f'train set size: {len(train_data)}')
print(f'test set size: {len(test_data)}')
first_train_sample = next(iter(train_data))
print(first_train_sample)

train set size: 8000
test set size: 2653
{'comment': '因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格。', 'themes': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], 'sentiment': 1}


In [16]:
def collate_fn(batch_samples):
    """Collate a list of dataset samples into one model-ready batch.

    Args:
        batch_samples: Sample dicts providing at least the ``'comment'`` and
            ``'themes'`` keys.

    Returns:
        A dict with
        * ``'batch_inputs'`` – the tokenizer output for all comments, padded
          to the longest comment in the batch and truncated to ``max_length``,
          returned as PyTorch tensors;
        * ``'theme_labels'`` – the per-sample multi-hot theme vectors, left
          as a plain list of lists.
    """
    comments = [sample['comment'] for sample in batch_samples]
    theme_targets = [sample['themes'] for sample in batch_samples]

    encoded = tokenizer(
        comments,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    return {
        'batch_inputs': encoded,
        'theme_labels': theme_targets,
    }

# Training batches are reshuffled every epoch; evaluation keeps file order.
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [17]:
class BertForMultiTaskLearning(BertPreTrainedModel):
    """BERT encoder with a linear head for multi-label theme classification.

    Only the theme head is implemented; there is no sentiment head in this
    class.
    """

    def __init__(self, config, num_themes):
        """
        Args:
            config: BERT configuration used to build the encoder.
            num_themes: Number of theme labels, i.e. the width of the head.
        """
        super().__init__(config)
        self.bert = BertModel(config)
        self.num_themes = num_themes

        # Multi-label theme head applied to BERT's pooled output.
        self.theme_classifier = nn.Linear(self.bert.config.hidden_size, num_themes)

        # Independent sigmoid + binary cross-entropy per theme.
        self.theme_loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, batch_inputs, theme_labels=None):
        """Encode a batch and score every theme.

        Args:
            batch_inputs: Mapping of tokenizer outputs, unpacked into
                ``self.bert``.
            theme_labels: Optional multi-hot targets of shape
                ``[batch_size, num_themes]``. When omitted (inference) no
                loss is computed.

        Returns:
            A dict with ``'theme_loss'`` (a scalar tensor, or ``None`` when
            ``theme_labels`` is ``None``) and ``'theme_logits'`` of shape
            ``[batch_size, num_themes]``.
        """
        encoder_outputs = self.bert(**batch_inputs)

        # Element 1 of BertModel's output is the pooled [CLS] representation.
        pooled_output = encoder_outputs[1]  # [batch_size, hidden_size]
        theme_logits = self.theme_classifier(pooled_output)  # [batch_size, num_themes]

        # The labels are optional per the signature, so the loss must be too.
        theme_loss = None
        if theme_labels is not None:
            theme_loss = self.theme_loss_fn(theme_logits, theme_labels.float())

        # Return both the loss and the logits.
        return {
            "theme_loss": theme_loss,
            "theme_logits": theme_logits,
        }

# Build the model from the pretrained checkpoint; the head has one output per
# theme. Then move it to the selected device.
config = AutoConfig.from_pretrained(checkpoint)
model = BertForMultiTaskLearning.from_pretrained(
    checkpoint,
    config=config,
    num_themes=len(themes),
)
model = model.to(device)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMultiTaskLearning: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultiTaskLearning from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMultiTaskLearning from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiTaskLearning were not initialized from the model checkpoint at bert-base-chinese a

In [26]:
# Running state shared across epochs by the training cell:
#   total_train_step - optimizer steps taken so far
#   total_train_loss - sum of per-step training losses so far
#   best_f1_score    - best validation score seen so far
#   total_test_loss  - module-level placeholder for the evaluation loss
total_train_step, total_test_loss = 0, 0
total_train_loss = best_f1_score = 0.

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_train_loss, total_train_step):
    """Train ``model`` for one pass over ``dataloader``.

    Every 100 global steps the current batch loss and the accuracy over all
    samples seen so far in this epoch are printed and written to the
    module-level TensorBoard ``writer``.

    Args:
        dataloader: Yields dicts with ``'batch_inputs'`` and ``'theme_labels'``.
        model: Called as ``model(**batch)``; must return ``'theme_loss'`` and
            ``'theme_logits'``.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Learning-rate scheduler stepped once per batch.
        epoch: Zero-based epoch index (used for the running-average loss).
        total_train_loss: Sum of batch losses accumulated before this call.
        total_train_step: Number of optimizer steps taken before this call.

    Returns:
        ``(total_train_loss, total_train_step)`` updated with this epoch.
    """
    num_batches = len(dataloader)
    steps_before_epoch = epoch * num_batches

    progress_bar = tqdm(range(num_batches), disable=True)
    progress_bar.set_description(f'loss: {0:>7f}')

    epoch_targets, epoch_preds = [], []
    model.train()

    for step, raw_batch in enumerate(dataloader, start=1):
        # Label lists become tensors; everything is moved to the target device.
        batch_data = {}
        for key, value in raw_batch.items():
            if isinstance(value, list):
                value = torch.tensor(value)
            batch_data[key] = value.to(device)

        outputs = model(**batch_data)
        loss, logits = outputs["theme_loss"], outputs["theme_logits"]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        step_loss = loss.item()
        total_train_loss += step_loss
        total_train_step += 1

        # Accumulate targets/predictions for the running epoch accuracy.
        epoch_targets.extend(batch_data["theme_labels"].cpu().numpy().tolist())
        epoch_preds.extend(predict(logits).cpu().numpy().tolist())

        # Average loss over every step taken since the first epoch.
        progress_bar.set_description(f'loss: {total_train_loss / (steps_before_epoch + step):>7f}')
        progress_bar.update(1)

        if total_train_step % 100 == 0:
            accuracy = Ac(epoch_targets, epoch_preds)
            print("训练次数:{}，loss:{}".format(total_train_step, step_loss))
            writer.add_scalar("train_loss", step_loss, total_train_step)
            writer.add_scalar("train_accuracy", accuracy, total_train_step)

    return total_train_loss, total_train_step


def test_loop(dataloader, model, epoch):
    """Evaluate ``model`` on ``dataloader`` and log the results.

    Prints the summed batch loss, accuracy and macro precision/recall/F1, and
    writes the loss and accuracy to the module-level TensorBoard ``writer``.

    Args:
        dataloader: Yields dicts with ``'batch_inputs'`` and ``'theme_labels'``.
        model: Called as ``model(**batch)``; must return ``'theme_loss'`` and
            ``'theme_logits'``.
        epoch: Epoch index used as the TensorBoard step.

    Returns:
        The ``classification_report`` result as a dict (``output_dict=True``).
    """
    true_labels, predictions = [], []
    total_test_loss = 0  # sum of per-batch losses, not an average

    model.eval()
    with torch.no_grad():
        for batch_data in dataloader:
            batch_data = {k: torch.tensor(v).to(device) if isinstance(v, list) else v.to(device) for k, v in batch_data.items()}
            theme_labels = batch_data["theme_labels"]
            theme_outputs = model(**batch_data)
            loss = theme_outputs["theme_loss"]
            logits = theme_outputs["theme_logits"]

            pred = predict(logits)

            true_labels += theme_labels.cpu().numpy().tolist()
            predictions += pred.cpu().numpy().tolist()

            total_test_loss += loss.item()

    # `target_names[i]` must name column i of the multi-hot vectors. Prefer the
    # dataset's own ordered label list; the global `themes` is only a fallback
    # because an unordered collection would mislabel the per-class rows.
    target_names = list(getattr(dataloader.dataset, "themes", themes))

    # Named `report` (not `metrics`) so the `sklearn.metrics` module alias is
    # not shadowed inside this function.
    report = classification_report(true_labels, predictions, target_names=target_names, output_dict=True)

    macro_precision = report["macro avg"]["precision"]
    macro_recall = report["macro avg"]["recall"]
    macro_f1 = report['macro avg']['f1-score']

    accuracy = Ac(true_labels, predictions)

    print("整体测试集上的Loss:{}".format(total_test_loss))
    writer.add_scalar("test_loss", total_test_loss, epoch)
    # NOTE: the tag is misspelled; kept as-is so existing logs stay comparable.
    writer.add_scalar("test_accuarcy", accuracy, epoch)

    print(f"Accuracy: {accuracy * 100:>0.2f}\n")
    print(f"Recall: {macro_recall * 100:>0.2f}\n")
    print(f"Precision: {macro_precision * 100:>0.2f}\n")
    print(f"Macro-F1: {macro_f1 * 100:>0.2f}\n")

    return report


In [27]:
# AdamW over all model parameters, with a learning rate that decays linearly
# to zero over the whole run (no warm-up steps).
optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epoch_num,
)


In [28]:
# One TensorBoard run directory per launch, named by the start time.
writer = SummaryWriter(log_dir='themes_classification_logs' + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))

try:
    for epoch in range(epoch_num):
        print(f"Epoch {epoch + 1}/{epoch_num}\n" + 30 * "-")
        total_train_loss, total_train_step= train_loop(train_dataloader, model, optimizer, lr_scheduler, epoch, total_train_loss, total_train_step)
        valid_scores = test_loop(test_dataloader, model, epoch)

        macro_f1 = valid_scores['macro avg']['f1-score']
        # Read the micro average that the variable and checkpoint name refer
        # to. Fall back to the weighted average (the value used previously)
        # only if the report has no 'micro avg' row.
        micro_f1 = valid_scores.get('micro avg', valid_scores['weighted avg'])['f1-score']

        # Model selection score: mean of macro- and micro-F1.
        f1_score = (macro_f1 + micro_f1) / 2
        if f1_score > best_f1_score:
            best_f1_score = f1_score
            print('saving new weights...\n')
            torch.save(
                model.state_dict(),
                f'epoch_{epoch + 1}_valid_macrof1_{(macro_f1 * 100):0.3f}_microf1_{(micro_f1 * 100):0.3f}_model_weights.bin'
            )
finally:
    # Flush and close the event file even if training is interrupted.
    writer.close()
print("Done!")

Epoch 1/3
------------------------------
训练次数:100，loss:0.14128893613815308
训练次数:200，loss:0.238071471452713
训练次数:300，loss:0.05874832347035408
训练次数:400，loss:0.14770178496837616
训练次数:500，loss:0.14221671223640442
训练次数:600，loss:0.10402069240808487
训练次数:700，loss:0.1474270224571228
训练次数:800，loss:0.09061691164970398
训练次数:900，loss:0.025390852242708206
训练次数:1000，loss:0.05570119246840477
训练次数:1100，loss:0.08061929792165756
训练次数:1200，loss:0.09871605783700943
训练次数:1300，loss:0.041693590581417084
训练次数:1400，loss:0.0485195517539978
训练次数:1500，loss:0.03534787520766258
训练次数:1600，loss:0.07674387842416763
训练次数:1700，loss:0.02461278811097145
训练次数:1800，loss:0.031014425680041313
训练次数:1900，loss:0.021289953961968422
训练次数:2000，loss:0.02409210614860058


  _warn_prf(average, modifier, msg_start, len(result))


整体测试集上的Loss:50.0447001317516
Accuracy: 82.81

Recall: 90.39

Precision: 86.14

Macro-F1: 88.01

saving new weights...

Epoch 2/3
------------------------------
训练次数:2100，loss:0.0386396162211895
训练次数:2200，loss:0.04403633996844292
训练次数:2300，loss:0.009903945960104465
训练次数:2400，loss:0.02262192964553833
训练次数:2500，loss:0.10436846315860748
训练次数:2600，loss:0.15481458604335785
训练次数:2700，loss:0.020712612196803093
训练次数:2800，loss:0.025577455759048462
训练次数:2900，loss:0.04280760884284973
训练次数:3000，loss:0.025423675775527954
训练次数:3100，loss:0.20209431648254395
训练次数:3200，loss:0.14644818007946014
训练次数:3300，loss:0.050151217728853226
训练次数:3400，loss:0.06498054414987564
训练次数:3500，loss:0.10896965116262436
训练次数:3600，loss:0.07368852198123932
训练次数:3700，loss:0.02990954928100109
训练次数:3800，loss:0.025648152455687523
训练次数:3900，loss:0.13362044095993042
训练次数:4000，loss:0.1389678567647934


  _warn_prf(average, modifier, msg_start, len(result))


整体测试集上的Loss:45.46855849120766
Accuracy: 83.38

Recall: 89.96

Precision: 86.88

Macro-F1: 88.33

saving new weights...

Epoch 3/3
------------------------------
训练次数:4100，loss:0.11974555253982544
训练次数:4200，loss:0.017473747953772545
训练次数:4300，loss:0.08850589394569397
训练次数:4400，loss:0.015337551943957806
训练次数:4500，loss:0.028903687372803688


Exception ignored in: <bound method tqdm.__del__ of <tqdm.notebook.tqdm_notebook object at 0x0000016EDD010240>>
Traceback (most recent call last):
  File "C:\Users\17576\anaconda3\envs\yl\lib\site-packages\tqdm\std.py", line 1162, in __del__
    self.close()
  File "C:\Users\17576\anaconda3\envs\yl\lib\site-packages\tqdm\notebook.py", line 288, in close
    self.disp(bar_style='danger', check_delay=False)
  File "C:\Users\17576\anaconda3\envs\yl\lib\site-packages\tqdm\notebook.py", line 160, in display
    d = self.format_dict
  File "C:\Users\17576\anaconda3\envs\yl\lib\site-packages\tqdm\std.py", line 1476, in format_dict
    'colour': self.colour}
  File "C:\Users\17576\anaconda3\envs\yl\lib\site-packages\tqdm\notebook.py", line 204, in colour
    return self.container.children[-2].style.bar_color
AttributeError: 'FloatProgress' object has no attribute 'style'


训练次数:4600，loss:0.015385778620839119
训练次数:4700，loss:0.044226642698049545
训练次数:4800，loss:0.04595111310482025
训练次数:4900，loss:0.02217872627079487
训练次数:5000，loss:0.14120233058929443
训练次数:5100，loss:0.09069563448429108
训练次数:5200，loss:0.012033330276608467
训练次数:5300，loss:0.010209862142801285
训练次数:5400，loss:0.13968037068843842
训练次数:5500，loss:0.07125606387853622
训练次数:5600，loss:0.011379818432033062
训练次数:5700，loss:0.08518587052822113
训练次数:5800，loss:0.09954748302698135
训练次数:5900，loss:0.055016547441482544
训练次数:6000，loss:0.0365290492773056


  _warn_prf(average, modifier, msg_start, len(result))


整体测试集上的Loss:45.55600940622389
Accuracy: 83.53

Recall: 90.51

Precision: 87.07

Macro-F1: 88.66

saving new weights...

Done!
