In [None]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# 加载Yelp Review Full数据集
dataset = load_dataset("yelp_review_full")

# 加载已保存的BERT模型和Tokenizer
# model = BertForSequenceClassification.from_pretrained("./emotion_model_1", num_labels=5)
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model_path = "./emotion_model_1"  # 替换为你自己的路径
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path,num_labels=5)


In [5]:
# 查看数据集结构
print(dataset)

# 查看训练集和测试集样本
print(dataset['train'][0])


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})
{'label': 4, 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."}


In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch

In [None]:
# ========== 3. Tokenize ==========
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

dataset = dataset.map(tokenize_function, batched=True)

# 设置格式供 PyTorch Trainer 使用
# dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 650000/650000 [40:23<00:00, 268.22 examples/s]
Map: 100%|██████████| 50000/50000 [03:06<00:00, 267.97 examples/s]


ValueError: Columns ['labels'] not in the dataset. Current columns in the dataset: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask']

In [9]:
# 拆分数据集
train_data = dataset['train']
eval_data = dataset['test']  # Yelp Review Full 使用test作为评估集
test_data = dataset['test']

In [10]:
# 映射：将yelp_review_full的标签转换为目标标签
label2id = {
    "very negative": 0,  # LABEL_0
    "negative": 1,       # LABEL_1
    "neutral": 2,        # LABEL_2
    "positive": 3,       # LABEL_3
    "very positive": 4   # LABEL_4
}

id2label = {v: k for k, v in label2id.items()}

# 更新模型配置
model.config.label2id = label2id
model.config.id2label = id2label


In [11]:
# ========== 4. 定义评估指标 ==========
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

In [12]:
# 假设yelp_review_full数据集的标签是从0到4的整数
# 如果标签是整数，你可以直接映射为目标标签

# def label_mapping(example):
#     label_mapping_dict = {
#         0: "very negative",
#         1: "negative",
#         2: "neutral",
#         3: "positive",
#         4: "very positive"
#     }
#     example['label'] = label_mapping_dict[example['label']]
#     return example

# 对训练和验证数据集进行标签映射
# train_dataset = train_data.map(label_mapping)
# val_dataset = eval_data.map(label_mapping)

train_dataset = train_data
val_dataset = eval_data

In [15]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # 输出目录
    num_train_epochs=3,              # 训练轮数
    per_device_train_batch_size=8,   # 每设备的批次大小
    per_device_eval_batch_size=16,   # 每设备的评估批次大小
    evaluation_strategy="epoch",     # 每个epoch后评估
    logging_dir='./logs',            # 日志目录
    logging_steps=10,
    save_steps=100,
    load_best_model_at_end=True,     # 在训练结束时加载最佳模型
    metric_for_best_model="f1",     # 使用 F1 分数作为最佳模型的标准
    greater_is_better=True,          # F1越高越好
    save_strategy="epoch", 
    weight_decay=0.01,               # 权重衰减
    warmup_steps=500,                # 预热步数
    fp16=True,  # ✅ 启用混合精度训练
)

# 定义EarlyStoppingCallback
# early_stopping_callback = EarlyStoppingCallback(
#     early_stopping_patience=2,       # 早停耐心值
#     early_stopping_threshold=0.01    # 早停阈值
# )

# 使用Trainer类进行训练
trainer = Trainer(
    model=model,                       # 预训练模型
    args=training_args,                # 训练参数
    train_dataset=train_dataset,       # 训练数据集
    eval_dataset=val_dataset,          # 验证数据集
    tokenizer=tokenizer,               # tokenizer
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


  trainer = Trainer(


In [16]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.6419,1.613477,0.2,0.066667


KeyboardInterrupt: 

In [None]:
results = trainer.evaluate()
print(results)


In [None]:
model.save_pretrained("./emotion_model_1_finetuned")
tokenizer.save_pretrained("./emotion_model_1_finetuned")


In [None]:
import matplotlib.pyplot as plt

# 提取训练损失
train_losses = []
for log in trainer.state.log_history:
    if 'loss' in log:
        train_losses.append(log['loss'])

# 绘制训练损失图
plt.plot(train_losses, label='Training Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training Loss Curve')
plt.legend()
plt.show()


In [None]:
inputs = tokenizer("This is an example sentence!", return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)
print(predictions)  # 输出预测的情感类别
