In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch

# ========== 1. Load the tokenizer and the previously saved model ==========
model_path = "./go_emotions_model"  # directory the model was saved to
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=5)  # classification head with 5 classes

# ========== 2. Load the GoEmotions dataset ==========
dataset = load_dataset("go_emotions")

# Five-class mapping: emotion name -> sentiment bucket id.
# The position of each tuple below is the bucket id it maps to:
#   0 = very negative, 1 = negative, 2 = neutral, 3 = positive, 4 = very positive
# NOTE(review): GoEmotions also appears to define a 'disapproval' label that is
# not listed here; map_emotion's `.get(emo, 2)` would send it to neutral.
# Confirm whether that fallback is intended.
five_class_mapping = {
    emotion: bucket
    for bucket, emotions in enumerate((
        # very negative
        ('anger', 'disgust', 'fear', 'grief', 'remorse', 'sadness'),
        # negative
        ('annoyance', 'disappointment', 'embarrassment', 'nervousness', 'confusion'),
        # neutral
        ('neutral', 'realization', 'curiosity'),
        # positive
        ('approval', 'joy', 'love', 'optimism', 'desire', 'amusement'),
        # very positive
        ('admiration', 'excitement', 'gratitude', 'pride', 'relief', 'caring', 'surprise'),
    ))
    for emotion in emotions
}

# 映射函数
# def map_emotion(example):
#     label_ids = example['labels']
#     emotions = [dataset['train'].features['labels'].feature.names[i] for i in label_ids]
#     for emo in emotions:
#         if emo in five_class_mapping:
#             return five_class_mapping[emo]
#     return 2  # 默认 neutral
from collections import Counter
def map_emotion(example):
    """Collapse an example's GoEmotions label ids into one five-class label.

    Each id in ``example['labels']`` is translated to its emotion name via the
    train split's label feature, then to a bucket through
    ``five_class_mapping`` (names missing from the mapping count as 2,
    neutral). The bucket with the most votes wins; on a tie the bucket that
    was seen first wins. An example without any label ids yields 2.
    """
    label_names = dataset['train'].features['labels'].feature.names
    votes = Counter(
        five_class_mapping.get(label_names[idx], 2) for idx in example['labels']
    )
    if not votes:
        return 2  # nothing to vote on -> neutral
    (winner, _count), = votes.most_common(1)
    return winner

# Apply the mapping: give every example a 'five_class_label' field computed by map_emotion
dataset = dataset.map(lambda x: {'five_class_label': map_emotion(x)})

# ========== 3. 数据预处理 ==========
def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; its
            ``'text'`` entry holds the raw strings to encode.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # Deliberately no padding here: the Trainer is given a
    # DataCollatorWithPadding, which pads each batch to its own longest
    # sequence. Padding every example to 512 tokens up front would make that
    # collator a no-op and force the model to process 512 positions per
    # example regardless of the actual text length.
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Tokenize the dataset in batches.
dataset = dataset.map(tokenize_function, batched=True)

# Build the training and validation sets: drop the original "labels" column,
# then rename "five_class_label" so it becomes the new "labels" column.
train_data = (
    dataset['train']
    .remove_columns(["labels"])
    .rename_column("five_class_label", "labels")
)
eval_data = (
    dataset['validation']
    .remove_columns(["labels"])
    .rename_column("five_class_label", "labels")
)

# Return torch tensors, exposing only the columns listed here.
train_data.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)
eval_data.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# ========== Use a class-weighted loss function

import torch
from torch.nn import CrossEntropyLoss

# Compute per-class weights from the label distribution.
# The counts are hard-coded; they match the training-set label distribution
# printed by the "check label distribution" cell of this notebook and must be
# updated by hand if the label mapping or the training data changes.
class_counts = [4033, 4791, 16826, 8940, 8820]  # number of training samples per class (index = class id)
total_samples = sum(class_counts)
# Inverse-frequency weighting: the rarer the class, the larger its weight.
class_weights = [total_samples / count for count in class_counts]
class_weights = torch.tensor(class_weights).to(model.device)

# 将权重传递给损失函数
# 自定义 Trainer 类以使用加权损失函数
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross entropy weighted by ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and return the weighted loss.

        ``"labels"`` is popped from ``inputs`` (so the model call itself does
        not receive labels) and scored against the returned logits. Extra
        keyword arguments are accepted and ignored.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just ``loss``.
        """
        device = model.device
        # Keep targets and class weights on the same device as the model.
        targets = inputs.pop("labels").to(device)
        outputs = model(**inputs)

        criterion = CrossEntropyLoss(weight=class_weights.to(device))
        loss = criterion(outputs.logits, targets)

        if return_outputs:
            return loss, outputs
        return loss

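# ========== Oversample the minority classes / undersample the majority class (disabled: the code below is wrapped in a string literal and is never executed)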
""" from datasets import concatenate_datasets

# 过滤出少数类样本
class_0 = train_data.filter(lambda x: x['labels'] == 0)
class_1 = train_data.filter(lambda x: x['labels'] == 1)

# 过采样少数类
oversampled_class_0 = concatenate_datasets([class_0] * 4)  # 复制 4 次
oversampled_class_1 = concatenate_datasets([class_1] * 3)  # 复制 3 次

# 合并回训练集
train_data = concatenate_datasets([train_data, oversampled_class_0, oversampled_class_1])

# ==========欠采样多数类

# 过滤出多数类样本
class_2 = train_data.filter(lambda x: x['labels'] == 2)

# 随机采样多数类
class_2_sampled = class_2.shuffle(seed=42).select(range(5000))  # 仅保留 5000 条样本

# 合并回训练集
train_data = concatenate_datasets([train_data.filter(lambda x: x['labels'] != 2), class_2_sampled])
 """

# ========== 4. 定义评估指标 ==========
def compute_metrics(pred):
    """Score predictions with accuracy and weighted F1.

    ``pred.predictions`` holds the logits and ``pred.label_ids`` the reference
    labels; the predicted class is the argmax of the logits along axis 1.

    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores = pred.predictions
    gold = pred.label_ids
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }

# ========== 5. Training arguments ==========
training_args = TrainingArguments(
    output_dir="./result",       # where checkpoints are written
    evaluation_strategy="epoch",         # run validation at the end of every epoch
    save_strategy="epoch",               # save a checkpoint at the end of every epoch
    logging_dir="./logs",                # log directory
    # logging_strategy="epoch",            # log once per epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",    # the "accuracy" key returned by compute_metrics
    save_total_limit=1,                  # maximum number of checkpoints to keep
    report_to="none",                    # do not report to wandb or similar
    fp16=True,                            # enable mixed-precision training
    # max_steps=10,                     # number of training steps
    learning_rate=3e-5,  # lowered learning rate
)

# ========== 6. Initialise the Trainer ==========
# Collator that pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the recorded cell output shows a warning pointing at this
# call; possibly a deprecation of one of the arguments (e.g. `tokenizer=`) --
# confirm against the installed transformers version.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 2.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# ========== 7. Start training ==========
trainer.train()

# ========== 8. Evaluate and print the validation accuracy ==========
eval_results = trainer.evaluate()
# 'eval_accuracy' is read from the evaluation results (compute_metrics returns an "accuracy" entry).
print(f"\n📊 Validation Accuracy: {eval_results['eval_accuracy']:.4f}")

  trainer = WeightedLossTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6221,1.695218,0.377626,0.320951
2,0.3874,1.866658,0.422964,0.397303


KeyboardInterrupt: 

In [None]:
# Check the label distribution of the training and validation sets.
from collections import Counter
print("训练集标签分布:", Counter(train_data['labels'].numpy()))
print("验证集标签分布:", Counter(eval_data['labels'].numpy()))

训练集标签分布: Counter({np.int64(2): 16826, np.int64(3): 8940, np.int64(4): 8820, np.int64(1): 4791, np.int64(0): 4033})
验证集标签分布: Counter({np.int64(2): 2111, np.int64(3): 1158, np.int64(4): 1117, np.int64(1): 551, np.int64(0): 489})


In [None]:
# Save the model and the tokenizer into the SAME directory so that a single
# `from_pretrained(path)` path can load both. The tokenizer path was
# previously misspelled ('./go_emotion_model_2'), which put the tokenizer
# files in a separate directory from the model weights.
fine_tuned_model_dir = './go_emotions_model_2'
model.save_pretrained(fine_tuned_model_dir)
tokenizer.save_pretrained(fine_tuned_model_dir)

('./go_emotion_model_2\\tokenizer_config.json',
 './go_emotion_model_2\\special_tokens_map.json',
 './go_emotion_model_2\\vocab.txt',
 './go_emotion_model_2\\added_tokens.json')

In [None]:
# A learning rate that is too high can prevent the model from converging; one that is too low can make training very slow.