### 加载数据集

In [None]:
import os
import torch
import pandas as pd
import matplotlib
import sklearn
import transformers

# Report the interpreter version and the versions of the libraries imported above,
# so that results can be tied to a concrete environment.
!python --version
print(f"os          : {os.name}")
print(f"torch       : {torch.__version__}")
print(f"pandas      : {pd.__version__}")
print(f"matplotlib  : {matplotlib.__version__}")
print(f"sklearn     : {sklearn.__version__}")
print(f"transformers: {transformers.__version__}")

In [None]:
# The IMDB unsupervised split can be used for contrastive learning or
# domain-specific fine-tuning (if that is allowed for the task).
#!git clone https://hf-mirror.com/datasets/stanfordnlp/imdb ./datasets/imdb
# !git clone https://hf-mirror.com/datasets/stanfordnlp/sst2 ./datasets/sst2

# Model: clone the full bert-base-uncased repository from the mirror.
!git clone https://hf-mirror.com/google-bert/bert-base-uncased ./models/bert-base-uncased

In [None]:
import os
import subprocess

# Directory that holds the model files.
local_dir = "./models/bert-base-uncased"
os.makedirs(local_dir, exist_ok=True)

# Only the files needed to load the model and tokenizer (lighter than a full clone).
base_url = "https://hf-mirror.com/google-bert/bert-base-uncased/resolve/main"
model_files = [
    "config.json",
    "pytorch_model.bin",
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.txt",
]

for file_name in model_files:
    # `-c` resumes a partial download and is a no-op for a complete file, so re-running
    # this cell does not create `config.json.1`-style duplicates.
    # `check=True` turns a failed download into an error instead of silently continuing.
    subprocess.run(
        ["wget", "-c", "-P", local_dir, f"{base_url}/{file_name}"],
        check=True,
    )

In [None]:
# Colab only: mount Google Drive and back up the end-to-end checkpoint to it.
# NOTE(review): the source checkpoint is written by the training section further down,
# so this cell presumably has to be run after training — confirm the intended order.
from google.colab import drive
import shutil
drive.mount("/content/drive/")
srcC = "./models/bert-base-uncased/bert-base-uncased-e2e.bin"
destC = "/content/drive/MyDrive/sdxxdl/bert-base-uncased-e2e.bin"
shutil.copy(srcC, destC)
# Reverse direction (restore the checkpoint from Drive):
#shutil.copy(destC, srcC)

In [None]:
# IMDB需要
#%pip install fastparquet

In [3]:
# Create the directory that TRAIN_PATH / TEST_PATH point into.
%mkdir datasets

In [None]:
import os
import re
import json
import torch
import string
import random
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, BertModel, BertForMaskedLM, DataCollatorForLanguageModeling
from torch.optim import AdamW   # transformers 里的 AdamW 不再被推荐

# --- IMDB parquet files -------------------------------------------------------
IMDB_PATH = "./data/imdb/plain_text/"
IMDB_TRAIN = f"{IMDB_PATH}train-00000-of-00001.parquet"
IMDB_TEST = f"{IMDB_PATH}test-00000-of-00001.parquet"
# Unlabeled split; usable as a corpus for language-model training.
IMDB_UNSUPERVISED = f"{IMDB_PATH}unsupervised-00000-of-00001.parquet"

# --- Task data (JSON Lines) ---------------------------------------------------
TRAIN_PATH = "datasets/train.jsonl"
TEST_PATH = "datasets/test.jsonl"

# --- Model checkpoints --------------------------------------------------------
MODEL_PATH = "./models/bert-base-uncased"
MODEL_E2E_NAME = f"{MODEL_PATH}/bert-base-uncased-e2e.bin"
MODEL_PRE_NAME = f"{MODEL_PATH}/bert-base-uncased-mlm.bin"
MODEL_PRE_PATH = f"{MODEL_PATH}/bert-base-uncased-mlm"

# Number of classification labels and their display names.
NUM_LABELS = 2
classes = {1: "Machine", 0: "Human"}

机器相关配置：队友们在运行前必须先根据机器情况配置此处

In [2]:
# Sequence length and batch size dominate GPU memory use.
# For reference: a Colab T4 can only handle 128 * 128.
MAX_LENGTH = 128   # normally no more than 512
BATCH_SIZE = 128
# Evaluation needs far less memory, so it uses a batch eight times larger.
BATCH_SIZE_TEST = BATCH_SIZE * 8

# DataLoader worker count (typically one feeding data while one is consumed).
NUM_WORKERS = 2

# Optimization settings.
LEARNING_RATE = 5e-5
EPOCHS = 20

In [None]:
# Show the evaluation batch size derived from BATCH_SIZE (a left shift by 3 is a multiply by 8).
print(BATCH_SIZE << 3)

In [None]:
!nvidia-smi
device = torch.device("cuda")
print(f"Using device: {torch.cuda.get_device_name(0)}")
if torch.cuda.get_device_capability(0)[0] >= 7:
    print("[INFO] 支持混合精度")
else:
    print("[WARNING] 不支持混合精度")

### 数据预处理

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from tqdm import tqdm
from bge import BGE  # 引入 BGE 类

# Text normalization helper
def clean_text(text, is_uncased=True):
    """Normalize one raw text string.

    Steps, in order: drop HTML tags, delete ASCII punctuation, optionally
    lowercase, then collapse whitespace runs to single spaces and trim.

    Args:
        text: The raw string.
        is_uncased: Lowercase the text when True.

    Returns:
        The cleaned string.
    """
    import re
    import string

    without_tags = re.sub(r'<.*?>', '', text)
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = without_tags.translate(punctuation_table)
    normalized = stripped.lower() if is_uncased else stripped
    return re.sub(r'\s+', ' ', normalized).strip()

# Dataset-level cleaning
def preprocess_dataset(df):
    """Clean the ``text`` column of ``df`` in place with ``clean_text`` and return ``df``."""
    cleaned_column = df["text"].apply(clean_text)
    df["text"] = cleaned_column
    return df

# Cluster the machine-generated texts using BGE embeddings
def cluster_large_model_text(df, bge_model, output_path, n_clusters=7,
                             vectors_path="bge_vectors.npy", random_state=42):
    """Split the label-1 samples into K-means clusters and write the relabelled data.

    Rows with ``label == 1`` are embedded with ``bge_model.embed_texts``, clustered,
    and relabelled in place to ``1..n_clusters``. Other rows keep their label.
    The embedding matrix is saved to ``vectors_path`` and the whole frame is written
    to ``output_path`` as JSON Lines.

    Args:
        df: DataFrame with ``text`` and ``label`` columns; modified in place.
        bge_model: Object exposing ``embed_texts(list_of_str)``.
        output_path: Destination of the relabelled JSON Lines file.
        n_clusters: Number of K-means clusters.
        vectors_path: Where the stacked embedding matrix is saved.
        random_state: Seed passed to ``KMeans``.

    Raises:
        ValueError: If ``df`` has no row with ``label == 1``.
    """
    # Compute the mask once so the rows that are embedded are exactly the rows relabelled.
    machine_mask = df["label"] == 1
    large_model_texts = df.loc[machine_mask, "text"].tolist()
    if not large_model_texts:
        raise ValueError("no samples with label == 1 to cluster")

    # Stack into a 2-D array once and reuse it for both saving and clustering.
    embeddings = np.vstack(bge_model.embed_texts(large_model_texts))
    np.save(vectors_path, embeddings)

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    cluster_labels = kmeans.fit_predict(embeddings)

    # K-means ids start at 0; shift by one so label 0 stays reserved for the other class.
    df.loc[machine_mask, "label"] = cluster_labels + 1

    df.to_json(output_path, orient="records", lines=True)
    print(f"聚类完成，数据已保存到 {output_path}")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

TRAIN_PATH = "datasets/train.jsonl"  # input data
OUTPUT_PATH = "datasets/clustered_train.jsonl"  # output file

# Local bge-base-en-v1.5 checkpoint. The default is one contributor's machine-specific
# path; set the BGE_MODEL_PATH environment variable to run elsewhere without editing this cell.
BGE_MODEL_PATH = os.environ.get(
    "BGE_MODEL_PATH",
    "D:/Study_Work/Electronic_data/CS/AAAUniversity/Machine_Learning/sdxxylysj/Lab3/src/datasets/models/bge-base-en-v1.5",
)

# Load the JSON Lines dataset and clean its text column.
data = pd.read_json(TRAIN_PATH, lines=True)
data = preprocess_dataset(data)

# Initialize the BGE embedding model.
bge_model = BGE(model_path=BGE_MODEL_PATH)

# Cluster the label-1 samples and save the result.
cluster_large_model_text(data, bge_model, OUTPUT_PATH)

生成嵌入中:   3%|▎         | 520/14907 [00:47<19:00, 12.62it/s]

In [3]:
def clean_text(text, is_uncased=True):
    """Normalize one raw text string.

    Removes HTML tags and ASCII punctuation, optionally lowercases (needed for
    uncased models only), then collapses whitespace runs to single spaces and
    trims the ends.
    """
    tag_free = re.sub(r'<.*?>', '', text)
    no_punct = tag_free.translate(str.maketrans('', '', string.punctuation))
    cased = no_punct.lower() if is_uncased else no_punct
    collapsed = re.sub(r'\s+', ' ', cased)
    return collapsed.strip()
def preprocess_dataset(df, ds_type="DETECT", is_MLM=False):
    """Clean the text column of ``df`` and convert it into a plain Python list.

    The cleaned text is stored in ``df["text"]`` (``df`` is modified in place).

    Args:
        df: Source DataFrame.
        ds_type: Dataset flavour; only ``"SST2"`` reads its text from ``sentence``,
            every other value reads from ``text``.
        is_MLM: When True return only the texts; otherwise return
            ``{"text", "label"}`` records and print the label distribution.

    Returns:
        A list of strings (``is_MLM``) or a list of ``{"text", "label"}`` dicts.
    """
    source_column = "text"
    if ds_type == "SST2":
        source_column = "sentence"
    df["text"] = df[source_column].apply(clean_text)

    if is_MLM:
        return df["text"].to_list()

    records = df[["text", "label"]].to_dict(orient="records")
    print(df["label"].value_counts(normalize=True))
    return records

In [None]:
def init_data(is_MLM=False):
    """Load, clean and split the task data.

    Reads TRAIN_PATH / TEST_PATH (JSON Lines), prints a few statistics, cleans the
    text and makes a stratified 80/20 train/validation split of the labelled data.
    With ``is_MLM`` the train/validation pair is replaced by an 80/20 split of the
    cleaned IMDB unsupervised texts; the test texts are returned either way.

    Returns:
        ``(train_data, valid_data, test_data)``. ``test_data`` is a list of strings.
        The other two are lists of ``{"text", "label"}`` dicts, or lists of strings
        when ``is_MLM`` is True.
    """
    def read_jsonl(path):
        # One JSON object per line.
        with open(path, "r", encoding="utf-8") as handle:
            return [json.loads(row) for row in handle]

    train_raw = read_jsonl(TRAIN_PATH)
    test_raw = read_jsonl(TEST_PATH)
    train_df, test_df = pd.DataFrame(train_raw), pd.DataFrame(test_raw)

    # Character lengths before cleaning.
    raw_train_len = train_df["text"].apply(len)
    raw_test_len = test_df["text"].apply(len)
    print(f"avg train text: {raw_train_len.mean():.2f}")
    print(f"avg test text: {raw_test_len.mean():.2f}")

    print(train_df.head())
    print(test_df.head())

    train_list_ = preprocess_dataset(train_df)
    # The test file is reduced to a list of bare texts.
    test_data = preprocess_dataset(test_df, is_MLM=True)

    # Character lengths after cleaning.
    clean_train_total = sum(len(item["text"]) for item in train_list_)
    clean_test_total = sum(len(text) for text in test_data)
    print(f"avg train text(final): {clean_train_total / len(train_list_):.2f}")
    print(f"avg test text(final): {clean_test_total / len(test_data):.2f}")

    # Train : validation = 8 : 2, stratified on the label.
    train_data, valid_data = train_test_split(
        train_list_,
        test_size=0.2,
        random_state=42,
        stratify=[item["label"] for item in train_list_],
    )

    # MLM pre-training replaces the labelled split above with an 80/20 split of the
    # IMDB unsupervised corpus; the task's own test texts are kept.
    if is_MLM:
        corpus = preprocess_dataset(pd.read_parquet(IMDB_UNSUPERVISED), ds_type="IMDB", is_MLM=True)
        train_data, valid_data = train_test_split(corpus, test_size=0.2, random_state=42)

    for split in (train_data, valid_data):
        random.shuffle(split)

    print(f"训练集长度: {len(train_data)}; 验证集长度: {len(valid_data)}; 测试集长度: {len(test_data)}")

    return train_data, valid_data, test_data
if __name__ == "__main__":
    init_data()   # label distribution of train.jsonl: 1:0.532393 | 0:0.467607

In [18]:
# data_list is the preprocessed list: {"text", "label"} records, or bare strings in MLM mode
class DetectDataset(Dataset):
    """Tokenizes one sample per access.

    In labelled mode each item is ``(features, label_tensor)``. In MLM mode
    (``is_MLM=True``) each element of ``data_list`` is a bare string and the item
    is just ``features``. ``features`` maps every tokenizer output key to a tensor
    with the leading batch dimension squeezed away.
    """

    def __init__(self, data_list, tokenizer, max_length=MAX_LENGTH, is_MLM=False):
        self.data, self.tokenizer = data_list, tokenizer
        self.max_length, self.is_MLM = max_length, is_MLM

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        # MLM samples are bare strings; labelled samples keep the text under "text".
        text = sample if self.is_MLM else sample["text"]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,     # add [CLS] / [SEP]
            max_length=self.max_length,  # maximum input sequence length
            padding="max_length",        # pad shorter inputs up to max_length
            truncation=True,             # cut longer inputs down to max_length
            return_tensors="pt"
        )
        features = {key: val.squeeze(0) for key, val in encoding.items()}

        if self.is_MLM:
            return features
        return features, torch.tensor(sample["label"])

# This runs quickly, so the result is not cached.
def load_data(tokenizer, is_MLM=False):
    """Build the train / validation / test DataLoaders.

    Args:
        tokenizer: Tokenizer used by every dataset (and by the MLM collator).
        is_MLM: When True the train/validation loaders yield masked-language-model
            batches produced by ``DataCollatorForLanguageModeling``; otherwise they
            yield ``(features, label)`` batches.

    Returns:
        ``(train_loader, val_loader, test_loader)``. The test loader always yields
        feature-only batches because the test data carries no labels here.
    """
    train_data, val_data, test_data = init_data(is_MLM)
    train_dataset = DetectDataset(train_data, tokenizer, is_MLM=is_MLM)
    val_dataset = DetectDataset(val_data, tokenizer, is_MLM=is_MLM)
    # init_data returns the test split as bare strings, hence the text-only item layout.
    test_dataset = DetectDataset(test_data, tokenizer, is_MLM=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST)

    # In MLM mode the collator masks tokens at random and emits the matching labels.
    collate_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True) if is_MLM else None

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=NUM_WORKERS,
        pin_memory=True,
    )
    # Use the configured worker count rather than a hard-coded 2, so the
    # machine-specific setting applies to both loaders.
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE_TEST,
        collate_fn=collate_fn,
        num_workers=NUM_WORKERS,
        pin_memory=True,
    )

    return train_loader, val_loader, test_loader

初始化分词器与数据加载器

In [None]:
# Build the tokenizer from the local checkpoint and the labelled train/validation
# loaders plus the feature-only test loader used by the end-to-end run below.
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
train_loader, valid_loader, test_loader = load_data(tokenizer)

### 训练器与测试器

在`colab`上训练时每个`batch`都至少输出一条信息，不然会因为长时间无响应而断开连接

In [13]:
class ModelTrainer:
    """Shared training/validation loop for classifier fine-tuning and MLM pre-training.

    Classification mode (``is_MLM=False``): each batch is an ``(inputs, labels)`` pair,
    the model returns logits and cross-entropy is applied here.
    MLM mode (``is_MLM=True``): each batch is a single mapping and the model computes
    its own loss from the ``labels`` it is given.

    Per-epoch classification metrics are accumulated in the ``train_*`` / ``val_*``
    lists; per-batch MLM losses go to ``mlm_loss_train`` / ``mlm_loss_valid``.
    ``optimizer``, ``scheduler`` and ``save_name`` may be overridden after construction.
    """

    def __init__(self, model, train_loader, val_loader, epochs=3, lr=2e-5, is_MLM=False):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.is_MLM = is_MLM
        self.epochs = epochs
        self.lr = lr

        # AdamW is Adam with decoupled weight decay.
        self.optimizer = AdamW(self.model.parameters(), lr=lr)
        # Learning rate ramps up from 0 to `lr`, then decays linearly.
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=2,
            num_training_steps=len(train_loader)*epochs
        )
        self.loss_func = nn.CrossEntropyLoss()
        # Mixed-precision training: scales the loss to avoid gradient underflow.
        self.scaler = torch.amp.GradScaler(device=self.device.type)

        # Training history.
        self.train_losses, self.val_losses = [], []
        self.train_accs, self.val_accs = [], []
        self.train_precisions, self.val_precisions = [], []
        self.train_recalls, self.val_recalls = [], []
        self.train_f1s, self.val_f1s = [], []
        self.best_accuracy = 0.0
        self.min_loss = float("inf")
        self.save_name = MODEL_E2E_NAME
        self.bear_cnt = 0   # consecutive epochs without improvement (early-stop counter)
        self.epc = 1        # 1-based epoch number shown in the progress lines
        self.mlm_loss_train = []
        self.mlm_loss_valid = []

    @staticmethod
    def _format_metric(value):
        """Render a metric with four decimals, or 'N/A' when it is not computed (MLM mode)."""
        return f"{value:.4f}" if value is not None else "N/A"

    @staticmethod
    def _epoch_metrics(all_labels, all_preds):
        """Return (accuracy, weighted precision, weighted recall, weighted F1)."""
        return (
            accuracy_score(all_labels, all_preds),
            precision_score(all_labels, all_preds, average="weighted"),
            recall_score(all_labels, all_preds, average="weighted"),
            f1_score(all_labels, all_preds, average="weighted"),
        )

    def _unpack_batch(self, batch):
        """Split a batch into (input_ids, attention_mask, labels) on the training device."""
        if not self.is_MLM:
            inputs, labels = batch
        else:
            inputs = batch
            # The MLM collator has already masked `input_ids`; the original tokens live
            # in `labels`, with -100 marking positions to ignore. Using the masked ids
            # as targets would train the model to output [MASK] itself, so prefer
            # `labels` and fall back to the ids only if a batch carries none.
            labels = inputs["labels"] if "labels" in inputs else inputs["input_ids"]
        return (
            inputs["input_ids"].to(self.device),
            inputs["attention_mask"].to(self.device),
            labels.to(self.device),
        )

    def _forward(self, input_ids, attention_mask, labels):
        """Run one mixed-precision forward pass and return (outputs, loss)."""
        with torch.amp.autocast(device_type=self.device.type):
            if not self.is_MLM:
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                loss = self.loss_func(outputs, labels)
            else:
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss  # the MLM model computes its own loss
        return outputs, loss

    def train(self):
        """Run up to ``epochs`` epochs with checkpointing and early stopping.

        In classification mode the model state is saved to ``save_name`` whenever the
        validation accuracy improves. Training stops once the validation loss has
        failed to improve for three consecutive checks.

        Returns:
            The ten history lists: train/val losses, accuracies, precisions,
            recalls and F1 scores (in that pairwise order).
        """
        for epoch in range(self.epochs):
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            train_loss, train_acc, train_precision, train_recall, train_f1 = self.__train_part()
            val_loss, val_acc, val_precision, val_recall, val_f1 = self._valid_part()
            self.epc+=1

            # Report before the early-stop check so the final epoch's numbers are not lost.
            fmt = self._format_metric
            print(f"Epoch {epoch + 1}/{self.epochs}")
            print(
                f"Train Loss: {train_loss:.4f}, "
                f"Acc: {fmt(train_acc)}, "
                f"Precision: {fmt(train_precision)}, "
                f"Recall: {fmt(train_recall)}, "
                f"F1: {fmt(train_f1)} | "
                f"Val Loss: {val_loss:.4f}, "
                f"Acc: {fmt(val_acc)}, "
                f"Precision: {fmt(val_precision)}, "
                f"Recall: {fmt(val_recall)}, "
                f"F1: {fmt(val_f1)}"
            )

            # Keep the checkpoint with the best validation accuracy.
            if not self.is_MLM and val_acc > self.best_accuracy:
                self.best_accuracy = val_acc
                self.bear_cnt=0
                torch.save(self.model.state_dict(), self.save_name)
            if val_loss < self.min_loss:
                self.min_loss = val_loss
                self.bear_cnt = 0
            else:
                self.bear_cnt += 1
                if self.bear_cnt >= 3:
                    print("[INFO] Early stop")
                    break

        return (
            self.train_losses,
            self.val_losses,
            self.train_accs,
            self.val_accs,
            self.train_precisions,
            self.val_precisions,
            self.train_recalls,
            self.val_recalls,
            self.train_f1s,
            self.val_f1s,
        )

    def __train_part(self):
        """Train for one epoch.

        Returns:
            ``(loss, acc, precision, recall, f1)``; the four metrics are ``None`` in MLM mode.
        """
        self.model.train()
        total_loss = 0.0
        all_labels = []
        all_preds = []
        for batch_no, batch in enumerate(self.train_loader, start=1):
            # Deliberately chatty: Colab disconnects sessions that stay silent too long.
            print(f"[DEBUG] batch/epoch: {batch_no}/{self.epc}")
            input_ids, attention_mask, labels = self._unpack_batch(batch)
            outputs, loss = self._forward(input_ids, attention_mask, labels)

            self.optimizer.zero_grad()
            self.scaler.scale(loss).backward()    # scale the loss so small gradients do not underflow
            self.scaler.unscale_(self.optimizer)  # clipping must see the true gradient magnitudes
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.scaler.step(self.optimizer)
            self.scaler.update()                  # adjust the scale factor for the next iteration
            self.scheduler.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            if self.is_MLM:
                self.mlm_loss_train.append(batch_loss)
            else:
                # argmax over logits equals argmax over their softmax.
                all_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        train_loss = total_loss / len(self.train_loader)

        if self.is_MLM:
            return train_loss, None, None, None, None

        train_acc, train_precision, train_recall, train_f1 = self._epoch_metrics(all_labels, all_preds)
        self.train_losses.append(train_loss)
        self.train_accs.append(train_acc)
        self.train_precisions.append(train_precision)
        self.train_recalls.append(train_recall)
        self.train_f1s.append(train_f1)
        return train_loss, train_acc, train_precision, train_recall, train_f1

    def _valid_part(self):
        """Evaluate on the validation loader without updating weights.

        Returns:
            ``(loss, acc, precision, recall, f1)``; the four metrics are ``None`` in MLM mode.
        """
        self.model.eval()
        total_loss = 0.0
        all_labels = []
        all_preds = []
        with torch.no_grad():
            for batch_no, batch in enumerate(self.val_loader, start=1):
                print(f"[DEBUG] batch/epoch: {batch_no}/{self.epc}")
                input_ids, attention_mask, labels = self._unpack_batch(batch)
                outputs, loss = self._forward(input_ids, attention_mask, labels)

                batch_loss = loss.item()
                total_loss += batch_loss
                if self.is_MLM:
                    self.mlm_loss_valid.append(batch_loss)
                else:
                    all_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

        val_loss = total_loss / len(self.val_loader)

        if self.is_MLM:
            return val_loss, None, None, None, None

        val_acc, val_precision, val_recall, val_f1 = self._epoch_metrics(all_labels, all_preds)
        self.val_losses.append(val_loss)
        self.val_accs.append(val_acc)
        self.val_precisions.append(val_precision)
        self.val_recalls.append(val_recall)
        self.val_f1s.append(val_f1)
        return val_loss, val_acc, val_precision, val_recall, val_f1

In [14]:
class ModelEvaluator:
    """Runs a trained classifier over a DataLoader.

    ``evaluate_model`` and ``collect_errors`` expect batches of ``(inputs, labels)``;
    ``detect`` expects feature-only batches. After ``evaluate_model`` the ROC curve
    of the last run is available as ``fpr`` / ``tpr`` / ``roc_auc``.
    """

    def __init__(
        self,
        test_loader=None,
        compare_lists=None,
    ):
        self.test_loader = test_loader
        self.compare_lists = compare_lists
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.fpr= None
        self.tpr= None
        self.roc_auc= None

    def _logits(self, model, inputs):
        """Move one feature batch to the device and return ``(inputs_on_device, logits)``."""
        inputs = {key: val.to(self.device) for key, val in inputs.items()}
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        return inputs, outputs

    def evaluate_model(self, model, analyse=True):
        '''
        Evaluate ``model`` on a labelled loader and print accuracy, precision, recall and F1.

        With ``analyse=True`` (the default) the confusion matrix is not drawn and the four
        metrics are returned as a tuple. With ``analyse=False`` the confusion matrix is
        drawn and nothing is returned.
        Important: this method requires the loader to yield labels.
        '''
        model.to(self.device)
        model.eval()

        all_labels = []
        all_preds = []
        all_scores = []
        with torch.no_grad():
            for batch_no, (inputs, labels) in enumerate(self.test_loader):
                print(f"[DEBUG] Testing batch:{batch_no}")
                _, outputs = self._logits(model, inputs)
                probs = torch.softmax(outputs, dim=1)

                all_labels.extend(labels.tolist())
                all_preds.extend(torch.argmax(probs, dim=1).tolist())
                # Keep the positive-class probability: ROC needs a continuous score,
                # hard 0/1 predictions collapse the curve to a single threshold.
                all_scores.extend(probs[:, 1].tolist())

        # Evaluation metrics
        test_acc = accuracy_score(all_labels, all_preds)
        test_precision = precision_score(all_labels, all_preds)
        test_recall = recall_score(all_labels, all_preds)
        test_f1 = f1_score(all_labels, all_preds)
        self.fpr, self.tpr, _ = roc_curve(all_labels, all_scores)
        self.roc_auc = auc(self.fpr, self.tpr)

        print(f"Test Accuracy: {test_acc:.4f}")
        print(f"Test Precision: {test_precision:.4f}")
        print(f"Test Recall: {test_recall:.4f}")
        print(f"Test F1 Score: {test_f1:.4f}")
        if analyse:
            return test_acc, test_precision, test_recall, test_f1

        # Draw the confusion matrix
        cm = confusion_matrix(all_labels, all_preds, labels=[i for i in range(NUM_LABELS)])
        disp = ConfusionMatrixDisplay(
            confusion_matrix=cm, display_labels=[classes[i] for i in range(NUM_LABELS)]
        )
        disp.plot(cmap=plt.cm.Blues)
        plt.title("Confusion Matrix")
        plt.show()

    def detect(self, model):
        """
        Predict labels for an unlabelled loader.

        Returns:
            The predicted class index of every sample, in loader order.
        """
        model.to(self.device)
        model.eval()

        all_preds = []
        with torch.no_grad():
            for batch_no, inputs in enumerate(self.test_loader):
                print(f"[DEBUG] Testing batch:{batch_no}")
                _, outputs = self._logits(model, inputs)
                probs = torch.softmax(outputs, dim=1)
                all_preds.extend(torch.argmax(probs, dim=1).tolist())

        print(f"Predicted Labels: {all_preds}")
        return all_preds

    def collect_errors(self, model, tokenizer):
        """
        Collect the misclassified samples of a labelled loader.

        Returns:
            A list of ``{"text", "label", "pred"}`` dicts, where ``text`` is decoded
            from the sample's input ids with special tokens skipped.
        """
        model.to(self.device)
        model.eval()

        errors = []  # misclassified samples
        with torch.no_grad():
            for batch_no, (inputs, labels) in enumerate(self.test_loader, start=1):
                print(f"[DEBUG] Testing batch:{batch_no}")
                inputs, outputs = self._logits(model, inputs)
                labels = labels.to(self.device)

                probs = torch.softmax(outputs, dim=1)
                preds = torch.argmax(probs, dim=1)

                for i in range(len(labels)):
                    if preds[i] != labels[i]:
                        text = tokenizer.decode(inputs["input_ids"][i], skip_special_tokens=True)
                        errors.append({
                            "text": text,
                            "label": labels[i].item(),
                            "pred": preds[i].item()
                        })

        return errors

### 模型结构

直接将 `bert-base-uncased` 作为基础编码器，接上分类层（全连接层+softmax/sigmoid激活）

这里只返回`logits`，具体的 softmax / argmax 以及交叉熵损失的计算放在训练器与评估器中完成。


In [None]:
# Demonstrates BERT's output dimension; running this cell is optional.
bert_out = BertModel.from_pretrained(MODEL_PATH).config.hidden_size
print(f"bert_out: {bert_out}")

In [15]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a single linear classification layer.

    ``forward`` returns raw logits of shape ``(batch, NUM_LABELS)``.

    Args:
        mlm: Load the encoder from MODEL_PRE_PATH (the MLM-adapted checkpoint)
            instead of MODEL_PATH.
    """

    def __init__(self, mlm=False):
        super().__init__()
        checkpoint_dir = MODEL_PRE_PATH if mlm else MODEL_PATH
        self.bert = BertModel.from_pretrained(checkpoint_dir)

        # Registered but currently not applied in forward().
        self.dropout = nn.Dropout(0.6)
        # Classification layer on top of the pooled representation.
        self.classifier = nn.Linear(self.bert.config.hidden_size, NUM_LABELS)

    def forward(self, input_ids, attention_mask):
        # attention_mask separates real tokens from padding.
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output: the [CLS] hidden state passed through BERT's extra dense + tanh layer.
        pooled = encoded.pooler_output
        return self.classifier(pooled)

In [None]:
# 理论上应该也可以直接调用这个，不过自己编的分类器更灵活一些
# model = BertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=NUM_LABELS)

In [16]:
def compare_eval(initial_metrics, final_metrics):
    """Draw a grouped bar chart comparing two metric tuples.

    Both arguments are indexable as ``(accuracy, precision, recall, f1)``; the first
    is drawn as "Before Fine-tuning", the second as "After Fine-tuning".
    """
    metric_names = ["Accuracy", "Precision", "Recall", "F1 Score"]
    before = [initial_metrics[i] for i in range(4)]
    after = [final_metrics[i] for i in range(4)]

    bar_width = 0.4
    positions = range(len(metric_names))

    plt.figure(figsize=(8, 6))
    plt.bar(positions, before, width=bar_width, label="Before Fine-tuning", align="center")
    # Shift the second series by one bar width so the pairs sit side by side.
    shifted = [p + 0.4 for p in positions]
    plt.bar(shifted, after, width=bar_width, label="After Fine-tuning", align="center")
    # Tick labels go midway between the two bars of each pair.
    plt.xticks([p + 0.2 for p in positions], metric_names)
    plt.ylabel("Score")
    plt.title("Performance Comparison Before and After Fine-tuning")
    plt.legend()
    plt.show()

def train_show(trainer):
    """Plot the trainer's per-epoch train/validation curves on a 2x3 grid.

    Panels, in order: accuracy, precision, loss, recall, F1 (the sixth slot stays empty).
    """
    # (subplot index, legend name, axis/title name, train series, validation series)
    panels = [
        (1, "Accuracy", "Accuracy", trainer.train_accs, trainer.val_accs),
        (2, "Precision", "Precision", trainer.train_precisions, trainer.val_precisions),
        (3, "Loss", "Loss", trainer.train_losses, trainer.val_losses),
        (4, "Recall", "Recall", trainer.train_recalls, trainer.val_recalls),
        (5, "F1", "F1 Score", trainer.train_f1s, trainer.val_f1s),
    ]

    plt.figure(figsize=(15, 10))
    for slot, legend_name, axis_name, train_series, val_series in panels:
        plt.subplot(2, 3, slot)
        plt.plot(train_series, label=f"Train {legend_name}")
        plt.plot(val_series, label=f"Validation {legend_name}")
        plt.xlabel("Epoch")
        plt.ylabel(axis_name)
        plt.title(f"Training and Validation {axis_name}")
        plt.legend()

    plt.tight_layout()
    plt.show()

#### 端到端训练

加载预训练权重，更新整个网络的参数


In [None]:
# Initialize the model and the trainer for end-to-end fine-tuning.
# Relies on `train_loader` / `valid_loader` created by the earlier load_data() cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertClassifier()
model.to(device)
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

trainer = ModelTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=valid_loader,
    epochs=EPOCHS,
    lr=LEARNING_RATE,
)

# Baseline: metrics of the untrained classification head on the labelled validation split.
evaluator = ModelEvaluator(test_loader=valid_loader)
print("端到端微调前的性能：")
initial_metrics = evaluator.evaluate_model(model=model)

In [None]:
# Free cached GPU memory, run end-to-end fine-tuning, then plot the training curves.
if torch.cuda.is_available():
  torch.cuda.empty_cache()
trainer.train()
train_show(trainer)

In [None]:
# Re-evaluate on the same loader as the baseline and compare.
if torch.cuda.is_available():
  torch.cuda.empty_cache()

print("端到端微调后的性能：")
final_metrics = evaluator.evaluate_model(model=model)

# Compare performance before and after end-to-end training.
compare_eval(initial_metrics, final_metrics)

In [None]:
# Predict labels for the unlabelled test set.
evaluator.test_loader= test_loader
predicted_labels = evaluator.detect(model=model)

# Write one predicted label per line to a text file.
output_file = "./result/submit.txt"
# Nothing earlier in the notebook creates ./result, so make sure it exists before opening.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as file:
    for label in predicted_labels:
        file.write(f"{label}\n")

print(f"预测结果已写入文件: {output_file}")

#### 先预训练后微调

##### 预训练

In [None]:
# Domain-adaptive MLM pre-training on the unlabelled corpus.
if torch.cuda.is_available():
  torch.cuda.empty_cache()
# Load the pre-trained model with its masked-language-model head.
is_MLM = True
model = BertForMaskedLM.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

# NOTE: this rebinds the global train_loader / test_loader to the MLM loaders.
train_loader, val_loader, test_loader = load_data(tokenizer, is_MLM)

mlm_trainer = ModelTrainer(model=model, train_loader=train_loader, val_loader=val_loader, epochs=2, lr=LEARNING_RATE, is_MLM=is_MLM)

# A loss below about 3.5 (typical for general MLM pre-training) is considered normal.
mlm_trainer.train()

# Save the adapted encoder and tokenizer so BertClassifier(mlm=True) can load them.
model.save_pretrained(MODEL_PRE_PATH, safe_serialization=False)
tokenizer.save_pretrained(MODEL_PRE_PATH)

In [None]:
# Plot the per-batch MLM losses, skipping the first 30 recorded values of each series.
# NOTE(review): the two series are per-batch and will generally differ in length, so the
# shared x axis is a batch index within each series, not a common timeline.
plt.figure(figsize=(8, 16))
plt.plot(mlm_trainer.mlm_loss_train[30:], label="Train Loss")
plt.plot(mlm_trainer.mlm_loss_valid[30:], label="Validation Loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.tight_layout()
plt.show()

if torch.cuda.is_available():
    torch.cuda.empty_cache()

##### 微调

In [None]:
def layerwise_lr_decay(model, lr=5e-5, decay=0.8):
    """Build optimizer parameter groups with layer-wise learning-rate decay.

    Group order: the classification head at ``lr``; the encoder layers from last to
    first, starting at ``lr * 0.5`` and multiplied by ``decay`` per layer; then the
    pooler (if present) at ``lr``; then the embeddings (if present) at the rate that
    follows the first encoder layer.

    Previously the pooler and the embeddings belonged to no group, so the optimizer
    never updated them, even though the classifier consumes the pooler's output.

    Args:
        model: Module exposing ``classifier`` and ``bert.encoder.layer``.
        lr: Learning rate of the classification head.
        decay: Multiplicative decay applied per encoder layer, top to bottom.

    Returns:
        A list of ``{"params": [...], "lr": float}`` dicts for a torch optimizer.
    """
    # Classification head (highest learning rate) comes first.
    groups = [{"params": list(model.classifier.parameters()), "lr": lr}]

    # Encoder layers, starting from the last one, with a decaying learning rate.
    bert_lr = lr * 0.5
    for layer in model.bert.encoder.layer[::-1]:
        groups.append({"params": list(layer.parameters()), "lr": bert_lr})
        bert_lr *= decay

    # The pooler sits between the encoder and the classifier, so it trains at the head rate.
    pooler = getattr(model.bert, "pooler", None)
    if pooler is not None:
        groups.append({"params": list(pooler.parameters()), "lr": lr})

    # Embeddings are the lowest layer and get the smallest rate of the decay chain.
    embeddings = getattr(model.bert, "embeddings", None)
    if embeddings is not None:
        groups.append({"params": list(embeddings.parameters()), "lr": bert_lr})

    return groups

In [None]:
# Initialize the classifier on top of the MLM-adapted encoder, and the labelled data.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertClassifier(mlm=True)
model.to(device)
tokenizer = BertTokenizer.from_pretrained(MODEL_PRE_PATH)
train_loader, val_loader, test_loader = load_data(tokenizer)

# Optimizer with layer-wise learning rates and a matching warmup/linear-decay schedule.
params = layerwise_lr_decay(model)
optimizer = AdamW(params, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=2, num_training_steps=len(train_loader) * EPOCHS
)

In [None]:
# Initialize the trainer.
trainer = ModelTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=EPOCHS,
    lr=5e-5,
)
trainer.optimizer = optimizer  # replace the default optimizer to apply layer-wise learning rates
trainer.scheduler = scheduler
trainer.save_name = MODEL_PRE_NAME

# evaluate_model() needs labels, and the test loader yields feature-only batches,
# so measure on the labelled validation split (as the end-to-end section does).
evaluator = ModelEvaluator(test_loader=val_loader)
print("微调前的性能：")
initial_metrics = evaluator.evaluate_model(model=model)

In [None]:
# Fine-tune while measuring wall-clock time and peak GPU memory (GPU runs only).
if torch.cuda.is_available():
    import time
    torch.cuda.empty_cache()
    start_time = time.time()
    # Reset the peak counter so the measurement covers this training run only.
    torch.cuda.reset_peak_memory_stats()
    initial_memory = torch.cuda.memory_allocated()

# Start training
trainer.train()

if torch.cuda.is_available():
    end_time = time.time()
    peak_memory = torch.cuda.max_memory_allocated()
    elapsed_time = end_time - start_time
    # Peak allocation above what was already allocated before training started.
    memory_used = peak_memory - initial_memory
    print(f"Elapsed Time: {elapsed_time:.2f} seconds")
    print(f"Memory Used: {memory_used / (1024 ** 2):.2f} MB\n")

train_show(trainer)

In [None]:
if torch.cuda.is_available():
  torch.cuda.empty_cache()

# Measure the model after fine-tuning, using the same evaluator as the baseline.
print("微调后的性能：")
final_metrics = evaluator.evaluate_model(model=model)

# Compare performance before and after fine-tuning.
compare_eval(initial_metrics, final_metrics)

### 对比

In [None]:
# Evaluate the saved end-to-end checkpoint.
if torch.cuda.is_available():
  torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertClassifier()
model.load_state_dict(torch.load(MODEL_E2E_NAME, map_location=device))
model.to(device)
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
# load_data() and evaluate_model() accept no `is_check` argument, and the test loader
# carries no labels; evaluate on the labelled validation split instead.
# NOTE: this split also selected the checkpoint, so the numbers are optimistic.
_, eval_loader, _ = load_data(tokenizer)
evaluator = ModelEvaluator(test_loader=eval_loader)
print("端到端的性能：")
e2e=evaluator.evaluate_model(model)
fpr_e2e, tpr_e2e, roc_auc_e2e = evaluator.fpr, evaluator.tpr, evaluator.roc_auc

In [None]:
# Evaluate the saved pre-train + fine-tune checkpoint.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertClassifier(mlm=True)
model.load_state_dict(torch.load(MODEL_PRE_NAME, map_location=device))
model.to(device)
tokenizer = BertTokenizer.from_pretrained(MODEL_PRE_PATH)
# load_data() and evaluate_model() accept no `is_check` argument, and the test loader
# carries no labels; evaluate on the labelled validation split instead.
_, eval_loader, _ = load_data(tokenizer)
evaluator = ModelEvaluator(test_loader=eval_loader)
print("预训练&微调的性能：")
mlm = evaluator.evaluate_model(model=model)
fpr_mlm, tpr_mlm, roc_auc_mlm = evaluator.fpr, evaluator.tpr, evaluator.roc_auc

In [None]:
# Bar chart: end-to-end metrics (first series) vs. pre-train + fine-tune metrics (second series).
# The legend produced by compare_eval() still reads "Before/After Fine-tuning".
compare_eval(e2e, mlm)

In [None]:
# ROC curves of both approaches against the random-guess diagonal.
plt.figure(figsize=(8, 6))
plt.plot(fpr_e2e, tpr_e2e, label=f"End-to-End (AUC = {roc_auc_e2e:.2f})")
plt.plot(fpr_mlm, tpr_mlm, label=f"Pre-trained & Fine-tuned (AUC = {roc_auc_mlm:.2f})")
plt.plot([0, 1], [0, 1], "k--", lw=2, label="Random Guessing")
# Figure settings (the x label is set once; an earlier duplicate call was always overridden).
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("FP Rate")
plt.ylabel("TP Rate")
plt.title("ROC-AUC Curve Comparison")
plt.legend(loc="lower right")
plt.grid()
plt.show()

In [None]:
# Inspect the samples misclassified by the end-to-end checkpoint.
if torch.cuda.is_available():
  torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertClassifier()
model.load_state_dict(torch.load(MODEL_E2E_NAME, map_location=device))
model.to(device)

tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
# load_data() accepts no `is_check` argument and collect_errors() needs labels,
# so use the labelled validation split.
_, eval_loader, _ = load_data(tokenizer)

evaluator = ModelEvaluator(test_loader=eval_loader)

errors = evaluator.collect_errors(model=model, tokenizer=tokenizer)
print(f"使用的测试集大小：{len(eval_loader.dataset)} | 错误分类样本数量: {len(errors)}")
for error in errors[:10]:
    print(f"Text: {error['text']}")
    print(f"True Label: {error['label']}, Pred: {error['pred']}")
    print("-" * 50)

In [None]:
# Inspect the samples misclassified by the pre-train + fine-tune checkpoint.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the classifier the same way the checkpoint was trained (MLM-adapted encoder).
model = BertClassifier(mlm=True)
model.load_state_dict(torch.load(MODEL_PRE_NAME, map_location=device))
model.to(device)

tokenizer = BertTokenizer.from_pretrained(MODEL_PRE_PATH)
# load_data() accepts no `is_check` argument and collect_errors() needs labels,
# so use the labelled validation split.
_, eval_loader, _ = load_data(tokenizer)

evaluator = ModelEvaluator(test_loader=eval_loader)

errors = evaluator.collect_errors(model=model, tokenizer=tokenizer)
print(f"使用的测试集大小：{len(eval_loader.dataset)} | 错误分类样本数量: {len(errors)}")
for error in errors[:10]:
    print(f"Text: {error['text']}")
    print(f"True Label: {error['label']}, Pred: {error['pred']}")
    print("-" * 50)