In [None]:
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# ========== 1. 加载数据集 ==========
# 下一个使用mteb/amazon_reviews_multi训练
dataset = load_dataset("codyburker/yelp_review_sampled")
# ========== 2. 数据字段重命名 ==========
def rename_fields(examples):
    examples["label"] = examples["stars"]  # 将 'stars' 字段改名为 'label'
    del examples["stars"]  # 删除原来的 'stars' 字段
    return examples
train_dataset = dataset["train"].map(rename_fields)
test_dataset = dataset["test"].map(rename_fields)

# 处理label的起始数字
def adjust_labels(examples):
    # 将标签减去 1，使标签范围变为 [0, 1, 2, 3, 4]
    examples["label"] = [label - 1 for label in examples["label"]]  # 如果是列表情况
    return examples

# 重新应用标签调整函数，确保是逐个样本进行调整
# batched=True 允许我们一次处理多个样本，这对于大数据集尤其有效。
train_dataset = train_dataset.map(adjust_labels, batched=True)
test_dataset = test_dataset.map(adjust_labels, batched=True)

# 检查标签是否已调整到正确的范围
print(np.unique(train_dataset['label']))  # 输出调整后的标签范围



# ========== 2. 初始化 tokenizer 和模型 ==========
MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)

# ========== 3. 数据处理 ==========
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# ========== 4. 定义评估函数 ==========
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

# ========== 5. 设置训练参数 ==========
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

# ========== 6. 定义 Trainer ==========
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# ========== 7. 训练模型 ==========
train_results = trainer.train()

# ========== 8. 可视化训练过程 ==========
# 提取训练过程中的准确率、损失和情感强度
train_loss = train_results.metrics["train_loss"]
eval_loss = train_results.metrics["eval_loss"]
train_accuracy = train_results.metrics["eval_accuracy"]

# 绘制损失（Loss）曲线
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(train_results.history['eval_loss'], label='Validation Loss')
plt.plot(train_results.history['train_loss'], label='Training Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.title("Loss Curve")

# 绘制准确率曲线
plt.subplot(1, 2, 2)
plt.plot(train_results.history['eval_accuracy'], label='Validation Accuracy')
plt.plot(train_results.history['train_accuracy'], label='Training Accuracy')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.title("Accuracy Curve")

plt.tight_layout()
plt.show()

# ========== 9. 保存模型 ==========
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

# ========== 10. 推理与情感强度计算 ==========
# 测试模型预测
inputs = tokenizer("我今天心情非常好", return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1)
predicted_label = predicted_class.item()

# 输出预测情感标签和强度（可以通过softmax获得概率作为情感强度）
probabilities = torch.nn.functional.softmax(logits, dim=-1)
predicted_probabilities = probabilities.detach().cpu().numpy()

print(f"Predicted Emotion: {predicted_label} (0: Very Negative, 1: Negative, 2: Neutral, 3: Positive, 4: Very Positive)")
print(f"Emotion Intensity: {predicted_probabilities}")

# 绘制情感强度图
plt.bar(np.arange(5), predicted_probabilities[0])
plt.xticks(np.arange(5), ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive'])
plt.xlabel("Emotion Class")
plt.ylabel("Intensity")
plt.title("Emotion Intensity")
plt.show()


Map: 100%|██████████| 80000/80000 [00:00<00:00, 319411.18 examples/s]
Map: 100%|██████████| 20000/20000 [00:00<00:00, 297211.21 examples/s]


[0 1 2 3 4]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 80000/80000 [02:53<00:00, 459.78 examples/s]
Map: 100%|██████████| 20000/20000 [00:41<00:00, 476.64 examples/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [16]:
print(train_dataset['label'][:10])  # Check first 10 labels
print(np.unique(train_dataset['label']))  # Check unique label values


[4, 3, 3, 4, 3, 2, 4, 4, 4, 2]
[0 1 2 3 4]


In [None]:
#检查标签分布：
from collections import Counter
print("训练集标签分布:", Counter(train_data['labels'].numpy()))
print("验证集标签分布:", Counter(eval_data['labels'].numpy()))

训练集标签分布: Counter({np.int64(2): 16826, np.int64(3): 8940, np.int64(4): 8820, np.int64(1): 4791, np.int64(0): 4033})
验证集标签分布: Counter({np.int64(2): 2111, np.int64(3): 1158, np.int64(4): 1117, np.int64(1): 551, np.int64(0): 489})


In [None]:
model.save_pretrained('./go_emotions_model_2')
tokenizer.save_pretrained('./go_emotion_model_2')

('./go_emotion_model_2\\tokenizer_config.json',
 './go_emotion_model_2\\special_tokens_map.json',
 './go_emotion_model_2\\vocab.txt',
 './go_emotion_model_2\\added_tokens.json')

In [None]:
#学习率过高可能导致模型无法收敛，过低可能导致训练过慢。