In [1]:
from transformers import TrainingArguments
# Show which module defines the imported TrainingArguments class.
origin_module = TrainingArguments.__module__
print("TrainingArguments from:", origin_module)

TrainingArguments from: transformers.training_args


In [2]:
import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Print the defining module of TrainingArguments again, now that the main import cell has run.
training_arguments_module = TrainingArguments.__module__
print(training_arguments_module)

transformers.training_args


In [4]:

# ========== 1. Load the dataset ==========
# Next experiment: train on mteb/amazon_reviews_multi
DATASET_PATH, DATASET_CONFIG = "mteb/amazon_reviews_multi", "zh"
dataset = load_dataset(DATASET_PATH, DATASET_CONFIG)

# ========== 2. Initialise the tokenizer and the model ==========
MODEL_NAME = "google-bert/bert-base-chinese"
NUM_LABELS = 5  # number of output classes of the classification head
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)

# ========== Data analysis ==========
# Token-length distribution of the training texts.
#
# Tokenizing every training text one by one is slow, so the statistics are
# estimated on a seeded random sample of bounded size. Set
# LENGTH_STATS_SAMPLE_SIZE to None to measure the whole training split.
LENGTH_STATS_SAMPLE_SIZE = 20_000
LENGTH_STATS_SEED = 42

train_texts = dataset["train"]["text"]
if LENGTH_STATS_SAMPLE_SIZE is None or LENGTH_STATS_SAMPLE_SIZE >= len(train_texts):
    # Small dataset (or sampling disabled): use every text.
    sampled_texts = train_texts
else:
    # Draw distinct indices with a fixed seed so reruns give the same numbers.
    sample_rng = np.random.default_rng(LENGTH_STATS_SEED)
    sample_indices = sample_rng.choice(
        len(train_texts), size=LENGTH_STATS_SAMPLE_SIZE, replace=False
    )
    sampled_texts = [train_texts[int(i)] for i in sample_indices]

# Lengths are counted in tokens as produced by tokenizer.tokenize(text).
text_lengths = [len(tokenizer.tokenize(text)) for text in sampled_texts]
plt.hist(text_lengths, bins=50)
plt.xlabel("Token Count")
plt.ylabel("Frequency")
plt.title("Text Length Distribution")
plt.show()
print(f"""
长度统计:
- 平均值: {np.mean(text_lengths):.1f}
- 中位数: {np.median(text_lengths):.1f}
- 95%分位: {np.percentile(text_lengths, 95):.1f}
- 最大值: {np.max(text_lengths)}
""")

# Dynamic padding: every batch is padded to the length of its longest sequence.
# NOTE(review): with padding="longest" the max_length value does not appear to
# cap or truncate anything; truncation is presumably expected at tokenization
# time. Confirm against the tokenizer `pad` documentation.
collator_options = {
    "padding": "longest",     # pad to the longest sequence in the batch
    "max_length": 512,        # intended as a safety upper bound (see note above)
    "pad_to_multiple_of": 8,  # round the padded length up to a multiple of 8 (GPU alignment)
}
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, **collator_options)

# Pull the train / test / validation splits out of the loaded dataset.
train_dataset, test_dataset, val_dataset = (
    dataset[split_name] for split_name in ("train", "test", "validation")
)

# Show the distinct label values to check that they fall in the expected range.
unique_train_labels = np.unique(train_dataset['label'])
print(unique_train_labels)

KeyboardInterrupt: 

In [None]:
# ========== 3. Preprocessing ==========
def preprocess_function(examples, max_length=256):
    """Tokenize a batch of examples, truncating but not padding.

    Padding is deliberately left out here so that it can be applied per batch
    later on (dynamic padding).

    Args:
        examples: Mapping whose "text" entry holds the raw strings to tokenize.
        max_length: Truncation limit handed to the tokenizer. Defaults to 256,
            the value that used to be hard-coded, so existing
            ``.map(preprocess_function, batched=True)`` calls are unaffected.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the given texts.
    """
    return tokenizer(
        examples["text"],
        truncation=True,        # truncate only; no padding="max_length"
        max_length=max_length,  # upper bound on the sequence length
    )

# Tokenize the three splits (train, test, validation - in that order) in batched mode.
train_dataset, test_dataset, val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

In [None]:
# Look at how the collator pads a tiny sample batch.
# The rows of train_dataset still contain the non-numeric source columns
# (e.g. "text"); the collator would try to convert those to tensors as well.
# Keep only the tokenizer's model inputs plus the label before collating.
collator_keys = set(tokenizer.model_input_names) | {"label"}
sample_batch = [
    {key: value for key, value in train_dataset[i].items() if key in collator_keys}
    for i in range(2)
]
collated = data_collator(sample_batch)

# Per-sample lengths before padding vs. the padded batch shape.
print("原始样本长度:", [len(x["input_ids"]) for x in sample_batch])
print("填充后形状:", collated["input_ids"].shape)

In [None]:
# ========== 4. Evaluation metric ==========
def compute_metrics(p):
    """Compute the accuracy of one evaluation pass.

    Args:
        p: Two-item iterable unpacked as ``(predictions, labels)``. The
            predictions are reduced to class indices with argmax over axis 1.

    Returns:
        dict with a single "accuracy" entry, as computed by ``accuracy_score``.
    """
    scores, gold_labels = p
    predicted_classes = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold_labels, predicted_classes)}


# ========== 5. Training configuration ==========
# NOTE(review): a positive max_steps takes precedence over num_train_epochs, so
# with this cap training stops after MAX_STEPS optimizer steps. That is useful
# as a quick smoke test; set it to -1 to train for the configured epochs.
MAX_STEPS = 20

training_args = TrainingArguments(
    output_dir="./results",

    # --- monitoring ---
    report_to=["tensorboard"],  # send training logs to TensorBoard
    logging_dir="./logs",
    logging_steps=10,
    eval_accumulation_steps=20,

    # --- evaluation / checkpoint schedule ---
    eval_strategy="epoch",  # called `evaluation_strategy` in older transformers releases
    save_strategy="epoch",
    save_steps=500,         # only consulted when save_strategy="steps"
    save_total_limit=2,

    # --- optimisation ---
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    max_steps=MAX_STEPS,

    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when no GPU is present.
    fp16=torch.cuda.is_available(),
)

# Forward-only smoke test (no backward pass) on a dummy batch.
# The model and its inputs must be on the same device, so both are moved to
# `speed_test_device`; without a GPU the test still runs on the CPU.
speed_test_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(speed_test_device)
if speed_test_device.type == "cuda":
    # Start the peak-memory counter from the current state.
    torch.cuda.reset_peak_memory_stats()

with torch.no_grad():
    dummy_batch = tokenizer(
        ["样例文本"] * training_args.per_device_train_batch_size,
        return_tensors="pt",
        padding="max_length",  # pad so the batch really is max_length tokens long
        max_length=512,        # worst-case sequence length to test
        truncation=True,
    ).to(speed_test_device)
    outputs = model(**dummy_batch)

if speed_test_device.type == "cuda":
    print(f"显存占用: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
else:
    print("CUDA not available; GPU memory measurement skipped.")

# ========== 6. Build the Trainer ==========
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,         # evaluation runs on the validation split
    data_collator=data_collator,      # key addition: pads each batch dynamically
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# ========== 7. Train the model ==========
train_results = trainer.train()

# ========== 8. Visualise the training run ==========
# Final aggregate metrics of the run and of one evaluation pass.
train_loss = train_results.metrics.get("train_loss", None)  # .get() avoids a KeyError
eval_results = trainer.evaluate()
eval_loss = eval_results.get("eval_loss", None)
eval_accuracy = eval_results.get("eval_accuracy", None)

# A single scalar cannot be drawn as a line (a one-point plot without a marker
# is invisible), so the curves are rebuilt from the records the Trainer keeps
# in its log history. Each record is read as (step, value).
log_history = trainer.state.log_history


def _series(records, metric_key):
    """Return the (step, value) pairs of every record that contains metric_key."""
    return [
        (record["step"], record[metric_key])
        for record in records
        if metric_key in record and "step" in record
    ]


def _draw_series(ax, points, label):
    """Plot points as a marked line on ax; do nothing when points is empty."""
    if points:
        steps, values = zip(*points)
        ax.plot(steps, values, marker="o", label=label)


fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Loss curves
_draw_series(loss_ax, _series(log_history, "loss"), 'Training Loss')
_draw_series(loss_ax, _series(log_history, "eval_loss"), 'Validation Loss')
loss_ax.set_xlabel("Step")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Loss Curve")

# Accuracy curve
_draw_series(acc_ax, _series(log_history, "eval_accuracy"), 'Validation Accuracy')
acc_ax.set_xlabel("Step")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_title("Accuracy Curve")

# Only add a legend where something was actually drawn.
for ax in (loss_ax, acc_ax):
    if ax.get_legend_handles_labels()[0]:
        ax.legend()

fig.tight_layout()
plt.show()

# ========== 9. Save the model and the tokenizer ==========
# Both artefacts are written to the same directory, model first.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./sentiment_model_1")

# ========== 10. Inference and emotion intensity ==========
# Run a single prediction with the trained model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # model and inputs must be on the same device
model.eval()      # switch off dropout so the prediction is deterministic
inputs = tokenizer("我今天心情非常好", return_tensors="pt").to(device)
with torch.no_grad():  # inference only: no autograd graph needed
    outputs = model(**inputs)
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1)
predicted_label = predicted_class.item()

# The softmax probabilities are reported as the "intensity" of each class.
probabilities = torch.nn.functional.softmax(logits, dim=-1)
predicted_probabilities = probabilities.detach().cpu().numpy()

print(f"Predicted Emotion: {predicted_label} (0: Very Negative, 1: Negative, 2: Neutral, 3: Positive, 4: Very Positive)")
print(f"Emotion Intensity: {predicted_probabilities}")

# Bar chart of the per-class probabilities; the label list is defined once and
# also determines the number of bars.
EMOTION_LABELS = ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive']
class_positions = np.arange(len(EMOTION_LABELS))
plt.bar(class_positions, predicted_probabilities[0])
plt.xticks(class_positions, EMOTION_LABELS)
plt.xlabel("Emotion Class")
plt.ylabel("Intensity")
plt.title("Emotion Intensity")
plt.show()


In [None]:
import transformers
# Report the installed transformers version and the location it is loaded from.
for caption, value in (
    ("Transformers version:", transformers.__version__),
    ("Transformers path:", transformers.__file__),
):
    print(caption, value)

In [None]:
# ========== 11. Evaluate on the test split ==========

# Run the Trainer's evaluation on the test data.
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Full metrics dictionary.
print("Test Results:", test_results)

# Individual metrics, read from the same dictionary.
test_loss, test_accuracy = (
    test_results[metric_key] for metric_key in ("eval_loss", "eval_accuracy")
)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


In [None]:
# Sanity checks on the tokenized data and on the runtime device.
train_labels = train_dataset['label']
first_example = train_dataset[0]

print(train_labels[:10])        # first 10 labels
print(np.unique(train_labels))  # distinct label values
print(first_example["text"])    # text of the first sample
print(first_example["label"])   # label of the first sample
print("训练集标签范围:", set(train_labels))
print("验证集标签范围:", set(val_dataset["label"]))
print("训练数据第一个样本:", first_example)

cuda_available = torch.cuda.is_available()
print("是否使用 GPU:", cuda_available)
print("当前使用的设备:", torch.cuda.get_device_name(0) if cuda_available else "CPU")

# Optional full scan for malformed rows (disabled):
# for i, example in enumerate(train_dataset):
#     if not example["text"] or example["label"] not in range(5):
#         print(f"异常样本索引: {i}, 样本内容: {example}")
print(len(first_example['attention_mask']))

In [None]:
import torch
# Check the installed PyTorch version.
torch_version = torch.__version__
print(torch_version)

In [None]:
# Save the model and its tokenizer into the SAME directory. The two calls used
# to target differently spelled paths ("go_emotions..." vs "go_emotion..."),
# which left neither directory with both the weights and the tokenizer files.
FINAL_MODEL_DIR = './go_emotions_model_3'
model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

In [None]:
# A learning rate that is too high can prevent the model from converging; one that is too low can make training too slow.