<a href="https://colab.research.google.com/github/netfloator/AI_experiments/blob/main/transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 传统迁移学习

以告警派单到局向的应用作为例子，演示如何通过迁移学习来利用预训练语言模型完成特定任务。这是一个典型的多分类任务，每个局向为一个类，需要根据告警信息把告警分到某个类里。

In [2]:
# Mount Google Drive at /content/drive (Colab only); later cells read the
# dataset from, and write checkpoints to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification,AutoTokenizer,set_seed, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
import torch
from torch import nn
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

## 加载告警派单数据

数据集从 `trouble_shooting_cleaned.xlsx` 读取（本次运行共 3897 条记录），建模用到两个字段：`Text` 字段是问题描述文本，`label` 字段是需要预测的类别。

In [4]:
# Read the labelled records from the mounted Drive, report the row count,
# and preview the first few rows.
trouble_shooting_df = pd.read_excel(
    "/content/drive/MyDrive/trouble_shooting_cleaned.xlsx"
)
print(trouble_shooting_df.shape[0])
trouble_shooting_df.head()

3897


Unnamed: 0,Text,label
0,046B25-3E843046B25AAB510，终端解版,用户终端信息查询及解绑
1,073187573029，千升万，调优后，万兆光猫光猫注册不上，麻烦帮忙看看，谢谢！,绑定设备信息查询
2,073198770568麻烦查下用户光猫是网口几在上网拨号,绑定设备信息查询
3,07345486398@VOD用新机顶盒激活码注册，提示激活码与机顶盒不匹配，是哪里要解绑吗时，,查激活码
4,07375238760@VOD帐参解绑，要换机顶盒,机顶盒信息查询及解绑


## 数据预处理

对 `label` 字段进行编码（用 `LabelEncoder` 把类别名称映射为整数 id）

In [5]:
# Map each distinct label string to an integer class id.
# Pass the 1-D Series (single brackets): handing LabelEncoder a one-column
# DataFrame triggers sklearn's DataConversionWarning (column_or_1d).
encoder = LabelEncoder()
Y = encoder.fit_transform(trouble_shooting_df["label"])

  y = column_or_1d(y, warn=True)


In [6]:
# Hold out 15% of the rows for validation. A fixed random_state keeps the
# split (and therefore every reported metric) reproducible across kernel restarts.
SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    trouble_shooting_df["Text"], Y, test_size = 0.15, random_state = SEED
)
len(X_train), len(X_test), len(Y_train), len(Y_test)

(3312, 585, 3312, 585)

In [7]:
# Checkpoint to fine-tune. Alternatives tried earlier are kept for reference:
# model_path = "/content/drive/MyDrive/bert-trouble-shooting-007"
# model_path = "nghuyong/ernie-3.0-base-zh"
model_path = "bert-base-chinese"

tokenizer = AutoTokenizer.from_pretrained(model_path)
# The classification head gets one output per class known to the label encoder.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=len(encoder.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# tokenizer.add_tokens(["IPTV", "OBD", "LOID"], special_tokens = True)

In [8]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of input texts (list, numpy array or pandas Series).
        labels: Sequence of integer class ids, aligned with ``texts`` by position.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        max_length: Length every sample is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialise as lists so that ``idx`` is always positional. A pandas
        # Series with a shuffled index (e.g. straight out of train_test_split)
        # would otherwise be looked up by index label and raise KeyError or
        # return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        # The tokenizer returns tensors with a leading batch dimension of 1;
        # flatten so the DataLoader can stack samples into a batch itself.
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(),
                # Class ids are stored as int64 explicitly.
                'label': torch.tensor(label, dtype=torch.long)}

In [1]:
import os

import torch

# Pick the compute device. Use the Colab TPU when one is attached (Colab sets
# COLAB_TPU_ADDR in that case); otherwise fall back to GPU, then CPU, instead
# of failing with a KeyError on non-TPU runtimes.
if os.environ.get('COLAB_TPU_ADDR'):
    # !pip install cloud-tpu-client==0.10 torch==2.0.0 torchvision==0.15.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.0-cp310-cp310-linux_x86_64.whl
    import torch_xla
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training hyperparameters.
batch_size = 16
# NOTE(review): 2e-4 is well above the 2e-5..5e-5 range usually used to
# fine-tune BERT-style models -- confirm this is intentional.
learning_rate = 2e-4
num_epochs = 15
# Number of batches whose gradients are accumulated before one optimizer step.
accumulation_steps = 25
# Per-batch training loss, appended to by train().
history = []

In [9]:
def train(model, data_loader, optimizer, scheduler, device, loss_to_stop = 1):
    """Run one training epoch with gradient accumulation.

    Gradients are accumulated over ``accumulation_steps`` batches (module-level
    setting) before each optimizer / scheduler step. Every batch loss is
    appended to the module-level ``history`` list.

    Args:
        model: Model whose forward pass accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``.loss`` attribute.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``label`` tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per full accumulation window.
        device: Device the batch tensors are moved to.
        loss_to_stop: The epoch ends early as soon as a single batch loss is
            less than or equal to this value.
    """
    model.train()
    # Start clean so gradients left behind by an interrupted earlier run
    # cannot leak into this epoch's first update.
    optimizer.zero_grad()
    pending = 0  # batches back-propagated since the last optimizer step
    for i, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device, dtype=torch.long)
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        # Read the scalar once instead of calling .item() repeatedly.
        loss_value = loss.item()
        history.append(loss_value)
        if loss_value <= loss_to_stop:
            print("Stop on loss: ", loss_value)
            break
        loss.backward()
        pending += 1
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            pending = 0
    # Flush the incomplete final window, then clear the gradients so they are
    # not added to the next epoch's first accumulation window.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader``.

    Returns:
        A tuple ``(accuracy, classification_report, confusion_matrix)`` computed
        from the predictions and true labels produced by ``test``.
    """
    y_pred, y_true = test(model, data_loader, device)
    acc = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)
    return acc, report, matrix

def predict(text, model, tokenizer, device, max_length=512):
    """Classify a single text and return its decoded label.

    Args:
        text: Input text to classify.
        model: Classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer used to encode ``text``.
        device: Device the encoded tensors are moved to.
        max_length: Length the input is padded / truncated to.

    Returns:
        The original label for the highest-scoring class, decoded with the
        module-level ``encoder``.
    """
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        # The model returns an output object; the class scores are in .logits
        # (torch.max on the output object itself raises a TypeError).
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        pred_id = int(torch.argmax(logits, dim=1).item())
    # inverse_transform expects a 1-D array-like, not a bare scalar.
    return encoder.inverse_transform([pred_id])[0]

def test(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return predictions, actual_labels

In [10]:
# Wrap the split data in datasets; the texts are passed as plain arrays
# (.values) so items are looked up by position.
train_dataset = TextClassificationDataset(X_train.values, Y_train, tokenizer)
val_dataset = TextClassificationDataset(X_test.values, Y_test, tokenizer)

# Only the training loader shuffles.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

# Move the model onto the selected device.
model = model.to(device)

xla:1


In [11]:
def save_model(save_path="/content/drive/MyDrive/bert-trouble-shooting-001"):
    """Save the module-level ``model`` and ``tokenizer`` to ``save_path``.

    Args:
        save_path: Target directory passed to both ``save_pretrained`` calls.
            Defaults to the Drive location this notebook has always used, so
            existing ``save_model()`` calls behave as before.
    """
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

In [12]:
# Optional: freeze the pretrained base model (the commented loop below turns
# off gradients for every base_model parameter).
#for param in model.base_model.parameters():
#    param.requires_grad = False

optimizer = AdamW(model.parameters(), lr=learning_rate)
# Alternative optimizer: only tune the last encoder layers, the pooler and the classifier
# optimizer = AdamW([
#    {'params': model.base_model.encoder.layer[-5:].parameters(), 'lr': 1e-3},  # last five encoder layers
#    {'params': model.base_model.pooler.parameters(), 'lr': 1e-3},
#    {'params': model.classifier.parameters(), 'lr': 1e-3}  # output layer
# ])
total_steps = len(train_dataloader) * num_epochs
# train() calls scheduler.step() once per *full* accumulation window, so the
# schedule length is that integer count per epoch times the number of epochs.
# (total_steps / accumulation_steps is a float and overstates the count, so
# the linear decay would never reach its end point.)
scheduler_steps = max(1, (len(train_dataloader) // accumulation_steps) * num_epochs)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=scheduler_steps)

In [13]:
# num_epochs = 1
# Train for num_epochs epochs; after each one, report the learning rate and
# validation accuracy, then checkpoint the model.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device, loss_to_stop = 0.1)
    # get_last_lr() returns the rate set by the most recent scheduler step;
    # get_lr() is not meant to be called outside of step().
    print("learning_rate: ", scheduler.get_last_lr())
    accuracy, report, cm = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    save_model()
    # print(report)

Epoch 1/15


RuntimeError: ignored

In [None]:
import matplotlib.pyplot as plt

# Plot the per-batch training loss collected in `history`.
#history = history[200:]
fig, ax = plt.subplots()
ax.plot(range(len(history)), history)
# Label the figure so it can be read on its own.
ax.set(xlabel="training batch", ylabel="loss", title="Per-batch training loss")
plt.show()

In [None]:
# Learning rate(s) set by the most recent scheduler step.
scheduler.get_last_lr()

In [None]:
import matplotlib.pyplot as plt

# Re-evaluate on the validation set and draw the confusion matrix.
accuracy, report, cm = evaluate(model, val_dataloader, device)
# Named cm_display rather than `display` so that IPython's display() helper
# is not shadowed for the rest of the session.
cm_display = ConfusionMatrixDisplay(cm)
fig, ax = plt.subplots(figsize=(30,30))
cm_display.plot(ax = ax)
plt.show()

In [None]:
# Collect every (predicted, actual) class pair that the confusion matrix shows
# being mixed up at least four times, printing each pair as it is found.
confused_labels = []
for actual_label, row in enumerate(cm):
    for pred, count in enumerate(row):
        # Skip the diagonal (correct predictions) and rare confusions.
        if pred == actual_label or count < 4:
            continue
        print(f"{pred} : {actual_label} - {cm[actual_label][pred]} - {encoder.classes_[pred]},{encoder.classes_[actual_label]}")
        confused_labels.append((pred, actual_label))

In [None]:
# For each frequently confused pair, list the validation texts that were
# predicted as `pred` while actually belonging to `actual_label`.
Y_pred = np.array(test(model, val_dataloader, device)[0])
for pred, actual_label in confused_labels:
    mask = (Y_pred == pred) & (Y_test == actual_label)
    print("predicted: ", encoder.classes_[pred], ":::::::::::: actual: ", encoder.classes_[actual_label])
    print(X_test[mask])

In [None]:
import re
# regex = r"解绑.+?光猫|光猫.+?解绑"
regex = r"(机顶盒)?.+?激活码"
# trouble_shooting_df[[True if re.search(regex, x) else False for x in trouble_shooting_df["Text"]]].groupby(by="label").count()

# Count, per label, the rows whose text mentions both keywords.
# Vectorised literal matching; na=False treats missing / non-string cells as
# "no match" instead of raising a TypeError the way `"..." in x` would.
text_col = trouble_shooting_df["Text"]
mentions_both = (
    text_col.str.contains("机顶盒", regex=False, na=False)
    & text_col.str.contains("激活码", regex=False, na=False)
)
trouble_shooting_df[mentions_both].groupby(by="label").count()
# trouble_shooting_df[[True if re.search(regex, x) else False for x in trouble_shooting_df["Text"]] & (trouble_shooting_df["label"] != "查激活码")]