In [2]:
import os
from datasets import Dataset, DatasetDict
from transformers  import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

In [3]:
import torch
#确认GPU是否可用，如果不可用则使用CPU
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device=torch.device('cpu')
print(torch.version.cuda)  # 例如 11.8

print(f"Using device: {device}")
print(f"Using device: {torch.cuda.is_available()}")



12.6
Using device: cuda
Using device: True


In [4]:
import os

train_dir = "../aclImdb/train"
print("Train Path Exists:", os.path.exists(train_dir))
print("Pos Path Exists:", os.path.exists(os.path.join(train_dir, "pos")))


Train Path Exists: True
Pos Path Exists: True


In [5]:

# 定义路径
train_dir = "../aclImdb/train"
test_dir = "../aclImdb/test"

# 读取数据
def read_reviews_from_dir(directory):
    reviews = []
    labels = []
    for label_dir, label in zip(["pos", "neg"], [1, 0]):
        label_dir_path = os.path.join(directory, label_dir)
        for filename in os.listdir(label_dir_path):
            with open(os.path.join(label_dir_path, filename), 'r', encoding='utf-8') as file:
                reviews.append(file.read())
                labels.append(label)
    return reviews, labels

# 加载训练和测试数据
train_reviews, train_labels = read_reviews_from_dir(train_dir)
test_reviews, test_labels = read_reviews_from_dir(test_dir)

# 将数据转换为 Hugging Face Dataset 格式
train_data = Dataset.from_dict({"text": train_reviews, "label": train_labels})
test_data = Dataset.from_dict({"text": test_reviews, "label": test_labels})

# 创建 DatasetDict
dataset = DatasetDict({
    'train': train_data,
    'test': test_data
})

# 加载 BERT 模型和分词器
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)

# 加载 BERT 模型
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5).to(device)  # 修改为5个标签

# 数据预处理函数
def preprocess_function(examples):
    # 标签从 0/1 -> 1/5 映射
    label_mapping = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}
    examples["label"] = [label_mapping[label] for label in examples["label"]]
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# 对训练和测试数据进行预处理
dataset = dataset.map(preprocess_function, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Map: 100%|██████████| 25000/25000 [02:47<00:00, 148.92 examples/s]
Map: 100%|██████████| 25000/25000 [02:35<00:00, 161.05 examples/s]


In [6]:
import accelerate
print(accelerate.__version__)  # 确保 >= 0.26.0


1.6.0


In [7]:
import os
from datasets import Dataset, DatasetDict
from transformers  import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from accelerate import Accelerator

print(f"Torch version: {torch.__version__}")
print(f"Accelerate version: {Accelerator().state}")

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

# 创建 Trainer
trainer = Trainer(
    model=model,                         # 模型
    args=training_args,                  # 训练参数
    train_dataset=dataset['train'],         # 训练数据
    eval_dataset=dataset['test'],           # 验证数据
)

# 开始训练
trainer.train()

# 训练结束后可以进行模型评估
trainer.evaluate()


Torch version: 2.6.0+cu126
Accelerate version: Distributed environment: DistributedType.NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: no



Epoch,Training Loss,Validation Loss
1,0.2931,0.296097
2,0.2345,0.345305
3,0.1135,0.34313


{'eval_loss': 0.3431304693222046,
 'eval_runtime': 507.8478,
 'eval_samples_per_second': 49.227,
 'eval_steps_per_second': 6.153,
 'epoch': 3.0}

In [8]:
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# =======================
# ✅ 保存模型
# =======================
model_save_path = "./saved_model"
trainer.save_model(model_save_path)  # 保存模型和配置文件
tokenizer.save_pretrained(model_save_path)

print("✅ 模型已保存至:", model_save_path)

# =======================
# ✅ 测试集推理
# =======================
# 加载保存的模型
model = BertForSequenceClassification.from_pretrained(model_save_path).to(device)
tokenizer = BertTokenizer.from_pretrained(model_save_path)

# 单个样本预测函数
def predict_sentiment(text):
    model.eval()  # 切换为评估模式
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=-1).item()
    return predicted_class

# 批量预测
def evaluate_on_test_set():
    correct = 0
    total = len(test_reviews)
    predictions = []

    for text, label in zip(test_reviews, test_labels):
        pred = predict_sentiment(text)
        predictions.append(pred)
        # 模型预测的label范围是1-5, 映射回0(负面)/1(正面)
        mapped_pred = 0 if pred <= 2 else 1  # 1,2 -> 0 (负面)；4,5 -> 1 (正面)
        if mapped_pred == label:
            correct += 1

    accuracy = correct / total
    print("✅ 测试集准确率: {:.2f}%".format(accuracy * 100))

# 在测试集上评估
evaluate_on_test_set()


✅ 模型已保存至: ./saved_model
✅ 测试集准确率: 50.00%
