In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

使用设备: cpu


In [6]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# 1. 加载预训练模型和分词器
model_name = "gpt2"  # 使用较小的模型作为演示
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 2. 加载基础模型
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# 3. 配置LoRA
lora_config = LoraConfig(
    r=8,                      # LoRA的秩
    lora_alpha=32,            # LoRA的alpha参数
    target_modules=["c_attn", "c_proj"],  # 要应用LoRA的模块名称
    lora_dropout=0.1,         # Dropout概率
    bias="none",              # 是否对偏置进行微调
    task_type=TaskType.CAUSAL_LM  # 任务类型
)

# 4. 准备模型进行LoRA微调
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # 打印可训练参数比例

# 5. 加载数据集（这里使用tiny_shakespeare作为示例）
dataset = load_dataset("tiny_shakespeare", trust_remote_code=True)
# 获取完整文本
full_text = dataset["train"]["text"][0]  # 假设文本在"text"字段中

# 手动划分
split_point = int(len(full_text) * 0.9)  # 90%作为训练集
train_text = full_text[:split_point]
test_text = full_text[split_point:]

# 创建新的数据集
from datasets import Dataset
train_dataset = Dataset.from_dict({"text": [train_text]})
test_dataset = Dataset.from_dict({"text": [test_text]})

# 6. 数据预处理函数
def preprocess_function(examples):
    return tokenizer([text for text in examples["text"]], truncation=True, max_length=128)

# 7. 应用预处理
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=test_dataset.column_names,
)

# 8. 数据整理器
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [12]:
# 9. 训练参数
training_args = TrainingArguments(
    output_dir="./lora_finetuned_gpt2",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    gradient_accumulation_steps=8,
    fp16=True,
    report_to="none",
)

# 10. 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
)

# 11. 开始训练
trainer.train()

# 12. 保存模型
model.save_pretrained("./lora_finetuned_gpt2")
tokenizer.save_pretrained("./lora_finetuned_gpt2")

# 13. 测试微调后的模型
prompt = "To be or not to be,"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_length=50, num_return_sequences=1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


To be or not to be, we all know it.

We know it.

We know it.

We know it.

We know it.
We know it.
We know it.

We know
