# 基于 Llama-2-7B-Chat 微调


In [1]:
# Shell escape: print the nvidia-smi report (GPU, driver/CUDA version, memory usage)
!nvidia-smi

Thu Mar 21 19:01:06 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.05    Driver Version: 525.85.05    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:10:00.0 Off |                  N/A |
| 40%   40C    P8    25W / 320W |      0MiB / 10240MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# install requirements
%pip install torch datasets accelerate peft bitsandbytes transformers trl protobuf sentencepiece torch git+https://github.com/huggingface/transformers huggingface_hub

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-6dxf8669
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-6dxf8669
  Resolved https://github.com/huggingface/transformers to commit 5d1a58a6462a45a17380c2487ee733b2f6163c54
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


使用模型：https://huggingface.co/meta-llama/Llama-2-7b-chat （代码中实际加载的是该模型的 Hugging Face 格式版本 `NousResearch/Llama-2-7b-chat-hf`）

使用数据集：https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k


In [3]:
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
    pipeline,
)
from trl import SFTTrainer

# Hub identifiers: the base checkpoint to fine-tune and the instruction dataset
guanaco_dataset = "mlabonne/guanaco-llama2-1k"
base_model = "NousResearch/Llama-2-7b-chat-hf"

# Name under which the fine-tuned model is saved
new_model = "llama-2-7b-chat-guanaco"

# Load the "train" split of the dataset
dataset = load_dataset(path=guanaco_dataset, split="train")

  from .autonotebook import tqdm as notebook_tqdm


加载模型：

使用 bitsandbytes 库创建 4-bit 量化配置：量化类型为 “nf4”，计算数据类型为 “float16”。


In [4]:
# 4-bit quantization settings: NF4 quantization type, float16 compute dtype,
# double quantization switched off.
quant_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Load the base checkpoint with the quantization config; the device map
# {"": 0} places the whole model on device 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
)

# Config overrides applied before training
model.config.pretraining_tp = 1
model.config.use_cache = False

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.36s/it]


加载 tokenizer


In [5]:
# Tokenizer of the base checkpoint; pad on the right and reuse EOS as the pad token
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

PEFT 配置


In [6]:
# LoRA adapter configuration for causal language modelling
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,  # LoRA rank
    lora_alpha=16,  # LoRA scaling parameter
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    bias="none",  # no bias parameters are trained
)

训练参数配置


In [7]:
training_params = TrainingArguments(
    # --- output, checkpoints and logging ---
    output_dir="./results",
    report_to="tensorboard",  # where logs are reported
    logging_steps=25,  # log every 25 steps
    save_steps=25,  # save a checkpoint every 25 steps
    # --- training length ---
    num_train_epochs=1,  # number of training epochs
    max_steps=-1,  # -1 means num_train_epochs decides the length
    # --- batching ---
    per_device_train_batch_size=4,  # batch size per GPU
    gradient_accumulation_steps=1,  # steps accumulated before each update
    group_by_length=True,  # batch samples of similar length together
    # --- memory and precision ---
    gradient_checkpointing=True,  # trade compute for lower memory use
    fp16=False,  # fp16 mixed-precision training disabled
    bf16=False,  # bf16 mixed-precision training disabled
    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",  # paged 32-bit AdamW optimizer
    learning_rate=2e-4,  # initial learning rate
    weight_decay=0.001,  # weight decay coefficient
    max_grad_norm=0.3,  # gradient clipping threshold
    lr_scheduler_type="constant",  # learning rate is held constant
    # NOTE(review): the "constant" scheduler does not appear to apply warmup;
    # use "constant_with_warmup" if this ratio is meant to take effect — confirm.
    warmup_ratio=0.03,  # fraction of steps intended for warmup
)

为 SFT Trainer 提供模型、数据集、LoRA 配置、Tokenizer 和训练参数。


In [18]:
# Wire the quantized model, dataset, LoRA config, tokenizer and training
# arguments into the supervised fine-tuning trainer.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,
    train_dataset=dataset,
    dataset_text_field="text",  # dataset column that holds the training text
    max_seq_length=None,
    packing=False,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 9.77 GiB of which 2.69 MiB is free. Including non-PyTorch memory, this process has 9.77 GiB memory in use. Of the allocated memory 8.01 GiB is allocated by PyTorch, and 684.48 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

开始训练


In [12]:
# Train model: run the fine-tuning loop configured on `trainer`
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 9.77 GiB of which 18.69 MiB is free. Including non-PyTorch memory, this process has 9.75 GiB memory in use. Of the allocated memory 8.00 GiB is allocated by PyTorch, and 684.48 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

存储模型


In [None]:
# Save the trained model and the tokenizer into the `new_model` directory
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

# View log (optional). Kept as real comments rather than a bare string
# literal, which Jupyter would echo as this cell's output. Uncomment to run:
# from tensorboard import notebook
# log_dir = "results/runs"
# notebook.start("--logdir {} --port 4000".format(log_dir))

模型测试


In [None]:
# Raise the transformers logging threshold so only CRITICAL messages are shown
logging.set_verbosity(logging.CRITICAL)

# Text-generation pipeline built from the newly saved model directory
pipe = pipeline(
    task="text-generation", model=new_model, tokenizer=new_model, max_length=200
)

# Send each question wrapped in [INST] ... [/INST] tags and print the generated text
for prompt in ("Who is Leonardo Da Vinci?", "What is Datacamp Career track?"):
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    print(result[0]["generated_text"])

模型合并

将 Llama 2 基础模型的预训练权重（以 FP16 重新加载）与刚刚微调得到的 LoRA 权重合并。


In [None]:
# Reload the base model in FP16 and merge the LoRA weights into it.
# NOTE(review): FP16 weights of a 7B-parameter model need roughly 13 GiB, which
# likely exceeds the 10 GiB GPU reported by nvidia-smi in this notebook; if this
# load runs out of memory, consider placing the model on CPU for the merge — confirm.
load_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)

# Attach the adapter saved under `new_model`, then fold it into the base weights
model = PeftModel.from_pretrained(load_model, new_model)
model = model.merge_and_unload()

# Reload the tokenizer so it can be saved next to the merged weights.
# EOS is reused as the pad token, matching the tokenizer setup used for training.
# No separate "[PAD]" token is registered: it would enlarge the tokenizer's
# vocabulary without a matching embedding row in the merged model.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Save the merged model and the tokenizer locally
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

上传合并模型到 HF

在网络畅通的前提下，先通过 `huggingface-cli login` 登录 Hugging Face 账号，再运行下面的代码即可将模型上传到 HF。


In [None]:
# Upload the merged model and then the tokenizer to the Hugging Face Hub
# under the `new_model` repository name.
for artifact in (model, tokenizer):
    artifact.push_to_hub(new_model, use_temp_dir=False)

模型使用


In [None]:
from transformers import pipeline


# Location of the fine-tuned, merged model (local path or HF repo id)
new_model = "/root/aidaily/myllama/llama-2-7b-chat-guanaco"

# Text-generation pipeline for the merged model on device 0
pipe = pipeline(
    task="text-generation",
    device=0,
    max_length=200,
    model=new_model,
    tokenizer=new_model,
)

# Ask one question wrapped in [INST] ... [/INST] tags and print the generated text
prompt = "Who is Leonardo Da Vinci?"
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])