# 使用QLoRA 和 FlashAttention微调Qwen模型

首先，安装必要的环境。如果没有安装flash-attn的话需要按照官方文档的说明安装flash-attn

In [None]:
%pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple transformers=='4.45.2' peft=='0.13.1' accelerate=='1.0.0' tiktoken bitsandbytes datasets

导入需要使用的各个库

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"    # 这里设置使用哪块GPU
from torch.utils.data import Dataset        # 用于自定义数据集
from transformers import (
    AutoModelForCausalLM,                   # 用于加载预训练模型
    AutoTokenizer,                          # 用于加载分词器
    BitsAndBytesConfig,                     # 用于加载配置文件
    TrainingArguments,                      # 用于加载训练参数
    Trainer,                                # 用于训练
    DataCollatorForLanguageModeling,        # 用于处理数据
)
import torch
import time
import json
import random

加载数据集

In [2]:
import csv
import pprint
def load_dataset(filename):
    data_list = []
    # read csv
    with open(filename, "r", encoding="gb18030") as f:    # 这里使用gb18030编码，因为我们所选择的数据csv文件是这种编码
        reader = csv.DictReader(f)                        # 使用csv.DictReader读取csv文件
        for row in reader:
            data_list.append(
                {
                    'department': row['department'],
                    'input': row['ask'],
                    'output': row['answer']
                }
            )

    return data_list                  
                       
dataset = load_dataset("儿科5-14000.csv")                  # 读取数据
print('len(dataset):', len(dataset))
pprint.pprint(dataset[0])

len(dataset): 101602
{'department': '营养保健科',
 'input': '女宝宝，刚7岁，这一年，察觉到，我家孩子身上肉很多，而且，食量非常的大，平时都不喜欢吃去玩，请问：小儿肥胖超重该如何治疗。',
 'output': '孩子出现肥胖症的情况。家长要通过孩子运功和健康的饮食来缓解他的症状，可以先让他做一些有氧运动，比如慢跑，爬坡，游泳等，并且饮食上孩子多吃黄瓜，胡萝卜，菠菜等，禁止孩子吃一些油炸食品和干果类食物，这些都是干热量高脂肪的食物，而且不要让孩子总是吃完就躺在床上不动，家长在治疗小儿肥胖期间如果孩子情况严重就要及时去医院在医生的指导下给孩子治疗。'}


准备可以用来模型训练的数据集，包括医疗对话数据和自我认知数据

In [3]:
def prepare_message(data_list):
    new_list = []
    for i, data in enumerate(data_list):
        _id = f"identity_{i}"
        new_list.append(
        {
            "id": _id,
            "conversations": [       # 将所输入的对话转换成conversations的形式
                {
                    "from": "user",
                    "value": data["input"]
                },
                {
                    "from": "assistant",
                    "value": data["output"]
                }
            ]
        })
    return new_list   


def replace_name(s):
    s = s.replace('<NAME>', '智能医生客服机器人小D')
    s = s.replace('<AUTHOR>', 'Greedy AI')
    return s


def load_self_cong_data(filename):
    data_list = []
    id = 0
    for d in json.load(open(filename, "r", encoding="utf-8")):
        d["instruction"] = replace_name(d["instruction"])     # 将instruction中的<NAME>和<AUTHOR>替换成智能医生客服机器人小D和Greedy AI
        d["output"] = replace_name(d["output"])
        data_list.append({
            "id": id,
            "conversations": [
                {
                    "from": "user",
                    "value": d["instruction"]
                },
                {
                    "from": "assistant",
                    "value": d["output"]
                }
            ]
        })
        id += 1
    return data_list

self_cong_data = load_self_cong_data("self_cognition.json")       # 读取自我认知数据
format_data_list = prepare_message(dataset[:1000])                # 将数据转换成conversations的形式
format_data_list = self_cong_data + format_data_list
random.shuffle(format_data_list)                                  # 打乱数据
print(format_data_list[0])

{'id': 'identity_547', 'conversations': [{'from': 'user', 'value': '男孩，8岁，上小学了，刚开始，说话时觉得嗓子疼，察觉到，咳嗽好像也比较严重，而且，有点发烧也没精神，请问：孩子出现扁桃体炎哭闹饮食禁忌是什么。'}, {'from': 'assistant', 'value': '发现扁桃体炎务必要及时给孩子治疗，另外日常饮食可以多吃一些蔬菜和水果，主要是因为蔬菜和水果不但清淡，而且还富含人体所需的各种维生素和微量元素，显然是可以帮助孩子抵御疾病的，水果的话建议选择梨、苹果等，蔬菜的话像青菜、白菜、西红柿等都是不错的，可以根据孩子的喜好选择，日常饮食应选择易于消化的，比如粥、面条等，在清淡饮食的同时还要少吃发物，例如家禽蛋、驴肉、牛羊肉等，饮食对扁桃体炎的治疗也是很关键的，务必注意。'}]}


实现数据加载脚本，将数据通过tokenizer转换成模型可以接受的token id输入。

In [4]:
model_path = 'Qwen2.5-3B-Instruct'
# 加载模型和分词器
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True, padding_side="right")

from transformers.trainer_pt_utils import LabelSmoother

IGNORE_TOKEN_ID = LabelSmoother.ignore_index

# 定义预处理数据的逻辑
def preprocess(
    sources,
    tokenizer: AutoTokenizer,
    max_len: int,
    system_message: str = "You are a helpful assistant."
):
    
    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}

    im_start = tokenizer('<|im_start|>').input_ids[0]
    im_end = tokenizer('<|im_end|>').input_ids[0]
    nl_tokens = tokenizer('\n').input_ids
    _system = tokenizer('system').input_ids + nl_tokens
    _user = tokenizer('user').input_ids + nl_tokens
    _assistant = tokenizer('assistant').input_ids + nl_tokens

    # Apply prompt templates
    input_ids, targets = [], []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != roles["user"]:
            source = source[1:]

        input_id, target = [], []
        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
        input_id += system
        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
        assert len(input_id) == len(target)
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            _input_id = tokenizer(role).input_ids + nl_tokens + \
                tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
            input_id += _input_id
            if role == '<|im_start|>user':
                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
            elif role == '<|im_start|>assistant':
                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
                    _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
            else:
                raise NotImplementedError
            target += _target
        assert len(input_id) == len(target)
        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
        target += [IGNORE_TOKEN_ID] * (max_len - len(target))
        input_ids.append(input_id[:max_len])
        targets.append(target[:max_len])
    input_ids = torch.tensor(input_ids, dtype=torch.int)
    targets = torch.tensor(targets, dtype=torch.int)

    return dict(
        input_ids=input_ids,
        labels=targets,
        attention_mask=input_ids.ne(tokenizer.pad_token_id),
    )

class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self, raw_data, tokenizer, max_len: int):
        super(SupervisedDataset, self).__init__()

        print("Formatting inputs...")
        sources = [example["conversations"] for example in raw_data]
        data_dict = preprocess(sources, tokenizer, max_len)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]
        self.attention_mask = data_dict["attention_mask"]
        print("Formatting done...")

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        return dict(
            input_ids=self.input_ids[i],
            labels=self.labels[i],
            attention_mask=self.attention_mask[i],
        )

# 加载这些数据集
train_dataset = SupervisedDataset(format_data_list, tokenizer, max_len=1024)

print(train_dataset[0])

Formatting inputs...
Formatting done...
{'input_ids': tensor([151644,   8948,    198,  ..., 151643, 151643, 151643],
       dtype=torch.int32), 'labels': tensor([151644,   -100,   -100,  ...,   -100,   -100,   -100],
       dtype=torch.int32), 'attention_mask': tensor([ True,  True,  True,  ..., False, False, False])}


设置LoRA参数和量化参数

In [5]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

output_dir = 'checkpoints_self_cong/'
# 设置LoRA参数
config = LoraConfig(
    r=32,                                  # LoRA所使用的Rank
    lora_alpha=16,                         # LoRA的Alpha
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],    # LoRA所作用的模块
    bias="none",                           # LoRA的Bias，这里设置为none，表示不使用Bias
    lora_dropout=0.05,                     # LoRA的Dropout，这里设置为0.05
    task_type="CAUSAL_LM",                 
)
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                       # 加载4bit量化模型
    bnb_4bit_quant_type="nf4",               # 量化类型，这里使用nf4
    bnb_4bit_compute_dtype=compute_dtype,    # 计算精度，这里使用float16
    bnb_4bit_use_double_quant=True,          # 使用双量化
)

peft_training_args = TrainingArguments(
    output_dir = output_dir,                 # 输出目录
    warmup_steps=1,                          # warmup步数，一般在训练模型时，都会将学习率从0逐渐增加到一个较大的值，这个过程就是warmup
    per_device_train_batch_size=1,           # 每个设备的训练batch_size
    gradient_accumulation_steps=1,           # 梯度累积步数
    learning_rate=2e-4,                      # 学习率
    optim="paged_adamw_8bit",                # 优化器，这里使用paged_adamw_8bit
    logging_steps=100,                       # 多少步打印一次日志
    logging_dir="./logs",                    # 日志目录
    save_strategy="steps",                   # 保存策略，按照步数保存
    max_steps=1000,                          # 要训练多少步
    save_steps=500,                          # 多少步保存一次
    gradient_checkpointing=True,             # 是否使用gradient_checkpointing功能，这个功能可以节省显存
    report_to="none",                        # 不输出报告，这里可以设置成向tensorboard和wandb输出报告
    overwrite_output_dir = 'True',           # 是否覆盖输出目录
    group_by_length=True,                    # 是否根据长度分组，这个参数可以加速训练，其原理是将长度相近的数据放在一起
)

加载模型并且开始训练

In [9]:
# 加载预训练模型
original_model = AutoModelForCausalLM.from_pretrained(
    model_path,                                # 预训练模型所存放的路径
    torch_dtype=compute_dtype,                 # 模型的精度
    quantization_config=quant_config,          # 使用什么样的量化配置
    attn_implementation="flash_attention_2",   # 是否要使用flash attention，这里的设置是使用flash attention
)

# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
original_model.gradient_checkpointing_enable()
# 2 - Using the prepare_model_for_kbit_training method from PEFT
original_model = prepare_model_for_kbit_training(original_model)
peft_model = get_peft_model(original_model, config)   # 基于LoRA的配置获取PEFT模型

peft_model.config.use_cache = False
peft_trainer = Trainer(                   # 定义Trainer
    model=peft_model,
    train_dataset=train_dataset,
    args=peft_training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

torch.cuda.empty_cache()

start_time = time.time()
peft_trainer.train()                      # 开始训练
end_time = time.time()

print(f"Training time: {end_time - start_time} seconds")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
100,1.2988
200,0.5077
300,0.476
400,0.4619
500,0.2921
600,0.3995
700,0.4471
800,0.3653
900,0.4141
1000,0.3251


Training time: 466.1714286804199 seconds
