# 进行微调和优化

### 1. 准备数据集

In [1]:
from transformers import Trainer


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch

# 加载 Stanford Sentiment Treebank 数据集（sst2 为二分类数据集）
dataset = load_dataset("glue", "sst2")

# 打印数据集的结构和内容
print(dataset)

# 加载预训练模型和分词器
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)

# 加载 BERT 模型
model = BertForSequenceClassification.from_pretrained(model_name)

# 将分类层调整为 2 个类别（POSITIVE, NEGATIVE）
model.classifier = torch.nn.Linear(model.config.hidden_size, 2)  # sst2 只有 2 个类别：POSITIVE 和 NEGATIVE

# 数据处理函数
def preprocess_function(examples):
    # 映射 SST2 标签为 1 和 5 星
    label_mapping = {0: 1, 1: 5}  # 假设 0 = very negative, 1 = very positive
    examples["label"] = [label_mapping[label] for label in examples["label"]]
    return tokenizer(examples['sentence'], padding='max_length', truncation=True)

# 对训练数据进行预处理
train_dataset = dataset['train'].map(preprocess_function, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# 对验证数据进行预处理
eval_dataset = dataset['validation'].map(preprocess_function, batched=True)
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# 打印预处理后的数据集
print(train_dataset[0])  # 打印第一个样本

# 设置训练参数
training_args = TrainingArguments(
    output_dir='./results',          # 输出目录
    evaluation_strategy="epoch",     # 每个epoch后评估
    per_device_train_batch_size=8,   # 训练批次大小
    per_device_eval_batch_size=8,    # 验证批次大小
    num_train_epochs=3,              # 训练轮数
    weight_decay=0.01,               # 权重衰减
    logging_dir='./logs',            # 日志目录
    logging_steps=10,                # 每10步记录日志
)

# 创建 Trainer
trainer = Trainer(
    model=model,                         # 模型
    args=training_args,                  # 训练参数
    train_dataset=train_dataset,         # 训练数据
    eval_dataset=eval_dataset,           # 验证数据
)

# 开始训练
trainer.train()


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]


ValueError: Class label 5 greater than configured num_classes 2

In [None]:
# train_data = [
#     {"text": "今天心情很好，感觉很开心", "label": 4},  # 4 stars -> POSITIVE
#     {"text": "今天很沮丧，压力很大", "label": 2},  # 2 stars -> NEGATIVE
#     {"text": "天气不错，但心情平静", "label": 3},  # 3 stars -> NEUTRAL
#     {"text": "和朋友一起很开心，今天真是太好了", "label": 5},  # 5 stars -> POSITIVE
#     {"text": "今天失业了，心情很糟糕", "label": 1}  # 1 star -> NEGATIVE
# ]
train_data = load_dataset("amazon_polarity")

NameError: name 'load_dataset' is not defined

### 2. 加载预训练模型

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

# 加载预训练模型和分词器
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 假设标签有 3 个：POSITIVE, NEGATIVE, NEUTRAL

### 3. 数据预处理

In [None]:
# 处理数据集
def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# 将训练数据转化为 Hugging Face 数据集格式
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
train_dataset = train_dataset.map(preprocess_function, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


### 4. 训练设置

In [None]:
# 设置训练参数
training_args = TrainingArguments(
    output_dir="./results",          # 输出目录
    evaluation_strategy="epoch",     # 每个 epoch 后进行评估
    learning_rate=2e-5,              # 学习率
    per_device_train_batch_size=8,   # 每个设备的训练批大小
    num_train_epochs=3,              # 训练 epoch 数量
    weight_decay=0.01,               # 权重衰减
    logging_dir='./logs',            # 日志目录
)

# 使用 Trainer 进行训练
trainer = Trainer(
    model=model,                     # 微调的模型
    args=training_args,              # 训练参数
    train_dataset=train_dataset,     # 训练数据集
    eval_dataset=train_dataset,      # 评估数据集（可选，通常你会有单独的验证集）
)

# 开始训练
trainer.train()


### 5. 评估和保存模型

In [None]:
# 评估模型
trainer.evaluate()

# 保存模型
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

### 6. 使用微调后的模型

In [None]:
# 加载微调后的模型
fine_tuned_model = BertForSequenceClassification.from_pretrained("./fine_tuned_model")
fine_tuned_tokenizer = BertTokenizer.from_pretrained("./fine_tuned_model")

# 使用微调后的模型进行预测
def predict(text):
    inputs = fine_tuned_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = fine_tuned_model(**inputs)
    prediction = outputs.logits.argmax(dim=-1).item()
    return prediction

# 测试预测
sample_text = "今天心情很好，感觉很开心"
print(f"预测标签: {predict(sample_text)}")