数据集构建和预处理。

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the hotel-review sentiment corpus; the rest of the notebook relies on
# a `review` (text) column and a `label` (class id) column.
data_path = "./data/ChnSentiCorp_htl_all.csv"
data = pd.read_csv(data_path, encoding='utf-8')

# Validate the schema with an explicit exception: a bare `assert` is removed
# when Python runs with -O, which would let a malformed file through.
required_columns = {'label', 'review'}
if not required_columns.issubset(data.columns):
    raise ValueError("数据集字段缺失")

# Drop rows with a missing review or label. pandas parses empty cells as NaN
# (a float), which the tokenizer cannot encode later on.
data = data.dropna(subset=['review', 'label']).reset_index(drop=True)

# Split 80/20 into train/test, then hold out 10% of the training portion for
# validation (roughly 72% / 8% / 20% overall). Fixed seeds keep it reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

print(f"训练集大小: {len(train_data)}, 验证集大小: {len(val_data)}, 测试集大小: {len(test_data)}")

加载模型。

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the pretrained tokenizer and the base model with a 2-class
# sequence-classification head on top.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
model = AutoModelForSequenceClassification.from_pretrained("Qwen/Qwen1.5-0.5B", num_labels=2)

# Make sure a padding token exists; if one has to be added, grow the embedding
# matrix so the new token id has a row.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Keep the model config in sync with the tokenizer. Decoder-only
# sequence-classification heads in transformers use config.pad_token_id to
# locate the last non-padding token of each row; when it is unset, padded
# batches larger than 1 cannot be classified correctly.
model.config.pad_token_id = tokenizer.pad_token_id

# Move the model to the available device (GPU if present, otherwise CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

微调方法。

In [None]:
# Hyper-parameters describing the intended LoRA fine-tuning setup:
# adapter rank, scaling alpha, maximum sequence length, and the method name.
lora_config = dict(
    rank=8,
    alpha=16,
    max_length=512,
    method="LoRA",
)

Prompt 设置。

In [None]:
# Prompt template
def create_prompt(review):
    """Wrap a review in the sentiment-question prompt.

    Args:
        review: The review text to embed in the prompt.

    Returns:
        The prompt string: the review followed by a question asking whether
        it is positive or negative.
    """
    prompt = f"评论内容：{review} 这条评论是正面还是负面？"
    return prompt


# Show what the prompt looks like for a sample review.
example_review = "这家酒店非常干净，服务也很好。"
example_prompt = create_prompt(example_review)
print(example_prompt)

数据集准备。

In [None]:
from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Items are returned as CPU tensors. Device placement is left to the
    consumer (a training loop or ``Trainer``), so the dataset does not depend
    on a module-level ``device`` and stays compatible with pinned-memory and
    multi-worker data loading.

    Args:
        reviews: Sequence of review texts, index-aligned with ``labels``.
        labels: Sequence of integer class ids.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, padding='max_length', return_tensors="pt")``
            that returns a mapping with ``input_ids`` and ``attention_mask``
            tensors carrying a leading batch dimension of 1.
        max_length: Length every review is truncated/padded to.
    """

    def __init__(self, reviews, labels, tokenizer, max_length=512):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx`` and return model-ready tensors.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (batch dimension
            squeezed away) and ``labels`` (scalar ``torch.long`` tensor).
        """
        encoded = self.tokenizer(
            self.reviews[idx],
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" yields a (1, max_length) batch; drop the
            # batch axis so the DataLoader can stack items itself.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Wrap each split's review/label columns in a SentimentDataset
# (same tokenizer, default max_length).
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(frame['review'].tolist(), frame['label'].tolist(), tokenizer)
    for frame in (train_data, val_data, test_data)
)

# Batch the splits: only the training loader shuffles; validation and test
# keep their original order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader, test_loader = (
    DataLoader(split, batch_size=8) for split in (val_dataset, test_dataset)
)


模型微调和训练。

In [None]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters: 3 epochs, batch size 8, evaluation on the
# validation set at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy="epoch",
    logging_dir="./logs",
    use_cpu=True,  # force CPU training even when CUDA is available
)

# Build the trainer. Notes on the arguments:
# - `args` must be passed explicitly, otherwise the settings above are ignored.
# - The output directory is configured through `training_args`; Trainer itself
#   accepts no `output_dir`, `task_name` or `lora_config` keyword.
# - `train_dataset` must be the tokenized Dataset, not the raw DataFrame.
# NOTE(review): `lora_config` is not consumed here, so this performs full
# fine-tuning. Applying LoRA requires wrapping `model` with an adapter library
# before constructing the Trainer -- TODO confirm the intended setup.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

结果打印。

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Run inference on the held-out test split. Trainer.predict expects a Dataset
# (not raw strings) and returns a PredictionOutput whose `predictions` field
# holds the logits, so class ids are obtained with an argmax over the last axis.
y_true = test_data['label'].tolist()
prediction_output = trainer.predict(test_dataset)
logits = prediction_output.predictions
if isinstance(logits, tuple):
    # Some models return extra outputs alongside the logits; logits come first.
    logits = logits[0]
y_pred = logits.argmax(axis=-1).tolist()

accuracy = accuracy_score(y_true, y_pred)
print(f"模型在测试集上的 Accuracy: {accuracy}")

# Print the detailed per-class report (precision / recall / F1).
print(classification_report(y_true, y_pred))

import matplotlib.pyplot as plt

# trainer.state.log_history is a list of dicts, one per logging event.
# Training-loss events carry the key 'loss', evaluation events carry
# 'eval_loss', and each records the (possibly fractional) epoch it was
# logged at. Collect each series together with its epoch for the x-axis.
log_history = trainer.state.log_history
train_logs = [entry for entry in log_history if 'loss' in entry]
val_logs = [entry for entry in log_history if 'eval_loss' in entry]

train_loss = [entry['loss'] for entry in train_logs]
val_loss = [entry['eval_loss'] for entry in val_logs]

plt.plot([entry['epoch'] for entry in train_logs], train_loss, label='Train Loss')
plt.plot([entry['epoch'] for entry in val_logs], val_loss, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss Curve')
plt.show()
