In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import numpy as np
import torch

# ========== 1. 加载 tokenizer 和已保存的模型 ==========
model_path = "./go_emotions_model"  # 你保存模型的路径
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# ========== 2. 加载 GoEmotions 数据集（这里作为例子）==========
# 加载 GoEmotions 数据集
dataset = load_dataset("go_emotions")
print(dataset['train'].features)

# 五分类映射
five_class_mapping = {
    # very negative
    'anger': 0, 'disgust': 0, 'fear': 0, 'grief': 0, 'remorse': 0, 'sadness': 0,
    
    # negative
    'annoyance': 1, 'disappointment': 1, 'embarrassment': 1, 'nervousness': 1, 'confusion': 1,
    
    # neutral
    'neutral': 2, 'realization': 2, 'curiosity': 2,
    
    # positive
    'approval': 3, 'joy': 3, 'love': 3, 'optimism': 3, 'desire': 3, 'amusement': 3,
    
    # very positive
    'admiration': 4, 'excitement': 4, 'gratitude': 4, 'pride': 4, 'relief': 4, 'caring': 4, 'surprise': 4
}
id2label = ['very negative', 'negative', 'neutral', 'positive', 'very positive']


{'text': Value(dtype='string', id=None), 'labels': Sequence(feature=ClassLabel(names=['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'], id=None), length=-1, id=None), 'id': Value(dtype='string', id=None)}


In [7]:
dataset['train'].features['labels'].feature.names

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [None]:
 

# 你已经定义了 five_class_mapping 和 map_emotion 函数
# def map_emotion(example):
#     label_ids = example['labels']
#     emotions = [dataset['train'].features['labels'].feature.names[i] for i in label_ids]
    
#     # 将情感类别映射为五分类标签
#     for emo in emotions:
#         if emo in five_class_mapping:
#             return {'five_class_label': five_class_mapping[emo]}
#     return {'five_class_label': 2}  # 默认为 neutral

# 定义映射函数：返回 int
def map_emotion(example):
    label_ids = example['labels']
    emotions = [dataset['train'].features['labels'].feature.names[i] for i in label_ids]
    for emo in emotions:
        if emo in five_class_mapping:
            return five_class_mapping[emo]
    return 2  # 默认 neutral

# 应用映射函数，将原始多标签映射为单个五分类标签
dataset = dataset.map(lambda x: {'five_class_label': map_emotion(x)})

# # 在训练集、验证集和测试集上应用映射函数
# dataset['train'] = dataset['train'].map(map_emotion)
# dataset['validation'] = dataset['validation'].map(map_emotion)
# dataset['test'] = dataset['test'].map(map_emotion)

# ========== 3. Tokenize ==========
# 将数据集转换为训练所需的格式
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

# 初始化 BERT tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# 对数据集进行分词
dataset = dataset.map(tokenize_function, batched=True)

# 创建训练集和验证集
train_data = dataset['train']
eval_data = dataset['validation']
test_data = dataset['test']

print(dataset)

# 设置格式
# train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
# eval_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 定义 EmotionDataset 类
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 提取训练数据和标签
train_encodings = tokenizer(train_data['text'], truncation=True, padding=True, max_length=512)
train_labels = train_data['five_class_label']

# 提取验证数据和标签
val_encodings = tokenizer(eval_data['text'], truncation=True, padding=True, max_length=512)
val_labels = eval_data['five_class_label']

# 创建训练数据集
train_data = EmotionDataset(train_encodings, train_labels)
val_data = EmotionDataset(val_encodings, val_labels)


{'text': Value(dtype='string', id=None), 'labels': Sequence(feature=ClassLabel(names=['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'], id=None), length=-1, id=None), 'id': Value(dtype='string', id=None)}


Map: 100%|██████████| 43410/43410 [00:14<00:00, 2929.73 examples/s]
Map: 100%|██████████| 5426/5426 [00:02<00:00, 2453.13 examples/s]
Map: 100%|██████████| 5427/5427 [00:01<00:00, 2969.21 examples/s]
Map: 100%|██████████| 43410/43410 [00:30<00:00, 1423.91 examples/s]
Map: 100%|██████████| 5426/5426 [00:03<00:00, 1575.00 examples/s]
Map: 100%|██████████| 5427/5427 [00:03<00:00, 1548.19 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id', 'five_class_label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id', 'five_class_label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id', 'five_class_label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5427
    })
})


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch

# ========== 1. 加载 tokenizer 和模型 ==========
model_path = "./emotion_model"  # 替换为你自己的路径
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# ========== 2. 加载 GoEmotions 数据集 ==========
dataset = load_dataset("go_emotions")
 
# 定义映射函数：返回 int
def map_emotion(example):
    label_ids = example['labels']
    emotions = [dataset['train'].features['labels'].feature.names[i] for i in label_ids]
    for emo in emotions:
        if emo in five_class_mapping:
            return five_class_mapping[emo]
    return 2  # 默认 neutral

# 应用映射函数，将原始多标签映射为单个五分类标签
dataset = dataset.map(lambda x: {'labels': map_emotion(x)})

# ========== 3. Tokenize ==========
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

dataset = dataset.map(tokenize_function, batched=True)

# 设置格式供 PyTorch Trainer 使用
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 拆分数据集
train_data = dataset['train']
eval_data = dataset['validation']
test_data = dataset['test']

# ========== 4. 定义评估指标 ==========
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}


In [None]:

# ========== 4. 计算验证准确率 ==========
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=1)
#     acc = accuracy_score(labels, predictions)
#     return {"accuracy": acc}

# ========== 5. 训练参数 ==========
training_args = TrainingArguments(
    output_dir="./emotion_model_1",       # 保存路径
    evaluation_strategy="epoch",            # 每个 epoch 做验证
    save_strategy="epoch",                  # 每个 epoch 保存模型
    logging_dir="./logs",                   # 日志路径
    logging_strategy="no",               # 每个 epoch 打印日志
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=1,                     # 最多保存几个模型
    report_to="none",                       # 不连接 wandb 等
    fp16=True,  # ✅ 启用混合精度训练
)

# ========== 6. 初始化 Trainer ==========
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# ========== 7. 开始继续训练 ==========
trainer.train()

# ========== 8. 验证并输出准确率 ==========
eval_results = trainer.evaluate()
print(f"\n📊 Validation Accuracy: {eval_results['eval_accuracy']:.4f}")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.937563,0.667711,0.638969
2,No log,1.041103,0.639329,0.633288
3,No log,1.067617,0.656653,0.642075



📊 Validation Accuracy: 0.6677


In [None]:
model.save_pretrained('./emotion_model_11')
tokenizer.save_pretrained('./emotion_model_11')

('./emotion_model_11\\tokenizer_config.json',
 './emotion_model_11\\special_tokens_map.json',
 './emotion_model_11\\vocab.txt',
 './emotion_model_11\\added_tokens.json')