In [7]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import torch
import numpy as np

# ========= 1. 定义情绪标签映射 =========
five_class_mapping = {
    # very negative
    'anger': 0, 'disgust': 0, 'fear': 0, 'grief': 0, 'remorse': 0, 'sadness': 0,
    # negative
    'annoyance': 1, 'disappointment': 1, 'embarrassment': 1, 'nervousness': 1, 'confusion': 1,
    # neutral
    'neutral': 2, 'realization': 2, 'curiosity': 2,
    # positive
    'approval': 3, 'joy': 3, 'love': 3, 'optimism': 3, 'desire': 3, 'amusement': 3,
    # very positive
    'admiration': 4, 'excitement': 4, 'gratitude': 4, 'pride': 4, 'relief': 4, 'caring': 4, 'surprise': 4
}

# ========= 2. 加载数据集 =========
dataset = load_dataset("go_emotions")

# 多数投票映射函数
def map_to_five_class(example):
    original_labels = example['labels']
    emotions = [dataset['train'].features['labels'].feature.names[i] for i in original_labels]
    mapped = [five_class_mapping.get(e, 2) for e in emotions if e in five_class_mapping]
    if not mapped:
        return {'five_class_label': 2}
    most_common = Counter(mapped).most_common(1)[0][0]
    return {'five_class_label': most_common}

dataset = dataset.map(map_to_five_class)

# ========= 3. 初始化 tokenizer 和模型 =========
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

config = BertConfig.from_pretrained("bert-base-chinese", num_labels=5)
model = BertForSequenceClassification.from_pretrained("bert-base-chinese", config=config)

# ========= 4. 分词 =========
def tokenize_function(example):
    return tokenizer(example['text'], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize_function, batched=True)

# ========= 5. 设置格式 =========
dataset = dataset.remove_columns(["labels"])  # 移除原多标签字段
dataset = dataset.rename_column("five_class_label", "labels")
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 划分训练集和验证集
train_data = dataset["train"]
eval_data = dataset["validation"]

# ========= 6. 加权损失 Trainer =========
# 自定义 Trainer 用于加权交叉熵
class_counts = [4033, 4791, 16826, 8940, 8820]
total = sum(class_counts)
weights = [total / c for c in class_counts]
weights = torch.tensor(weights)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # 获取输入和标签
        labels = inputs.pop("labels").to(model.device)
        outputs = model(**inputs)
        logits = outputs.logits

        # 使用加权 CrossEntropyLoss
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights.to(model.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# ========= 7. 评估指标 =========
def compute_metrics(pred):
    logits, labels = pred.predictions, pred.label_ids
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

# ========= 8. 训练参数 =========
training_args = TrainingArguments(
    output_dir="./result",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    fp16=True,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)

# ========= 9. 开始训练 =========
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

# ========= 10. 验证评估 =========
results = trainer.evaluate()
print(f"\n📊 Accuracy: {results['eval_accuracy']:.4f} | F1: {results['eval_f1']:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.6282,1.614935,0.203649,0.068912


KeyboardInterrupt: 

In [None]:
#检查标签分布：
from collections import Counter
print("训练集标签分布:", Counter(train_data['labels'].numpy()))
print("验证集标签分布:", Counter(eval_data['labels'].numpy()))

训练集标签分布: Counter({np.int64(2): 16826, np.int64(3): 8940, np.int64(4): 8820, np.int64(1): 4791, np.int64(0): 4033})
验证集标签分布: Counter({np.int64(2): 2111, np.int64(3): 1158, np.int64(4): 1117, np.int64(1): 551, np.int64(0): 489})


In [None]:
model.save_pretrained('./go_emotions_model_2')
tokenizer.save_pretrained('./go_emotion_model_2')

('./go_emotion_model_2\\tokenizer_config.json',
 './go_emotion_model_2\\special_tokens_map.json',
 './go_emotion_model_2\\vocab.txt',
 './go_emotion_model_2\\added_tokens.json')

In [None]:
#学习率过高可能导致模型无法收敛，过低可能导致训练过慢。