# Install Libraries

In [1]:
import importlib.util

def is_installed(pkg_name):
  return importlib.util.find_spec(pkg_name) is not None

if not is_installed("bitsandbytes"): # 这里选一个pip install前后有变化的包进行验证
  !pip install \
  fsspec==2024.12.0 \
  gcsfs==2024.12.0 \
  datasets \
  transformers \
  accelerate \
  evaluate \
  bitsandbytes \
  trl \
  peft

Collecting fsspec==2024.12.0
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting gcsfs==2024.12.0
  Downloading gcsfs-2024.12.0-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidi

# Import Libraries

In [2]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTConfig, SFTTrainer
import bitsandbytes as bnb
import evaluate
from google.colab import drive
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load Model

In [3]:
# 量化配置
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # 启用 4bit 量化，将模型的线性层（Linear / Conv1D）替换成量化层 Linear4bit
    bnb_4bit_use_double_quant=True, # 启用嵌套量化，进一步压缩量化参数，减少存储开销 (Linear4bit内部计算逻辑)
    bnb_4bit_quant_type="nf4", # 4bit 量化格式有2种（nf4和fp4），其中nf4基于正态分布优化，通常效果更优
    bnb_4bit_compute_dtype=torch.bfloat16 # 设置计算时的数据类型，实际权重以 4bit 存储但会映射到 bfloat16 进行计算，也就是 Linear4bit 内部的中间计算使用 bfloat16
)

In [4]:
# 选择 distilbert/distilgpt2 作为基础模型
model_id = "distilbert/distilgpt2"

# 将整个模型加载到 GPU 0
device_map = {"": 0}

# 加载原始模型
original_model = AutoModelForCausalLM.from_pretrained(model_id)

# 加载量化模型（将量化配置应用在模型上）
quantized_model = AutoModelForCausalLM.from_pretrained(model_id,
                    quantization_config=bnb_config,
                    device_map=device_map,
                    use_cache = False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# 加载与模型对应的分词器，并设置填充标记为结束标记
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
tokenizer

GPT2TokenizerFast(name_or_path='distilbert/distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

# Comparing original_model and quantized_model

- 参数数量不会变（还是那么多矩阵元素）
  - 81912576
- 参数精度和大小变了（用 4-bit 表示）
  - 参数大小变化
    - Original size: 318.47 MB
      - 估算：81912576 * 4 bytes / (1024^2) = 308.66 MB
    - Quantized size: 101.49 MB
      - 估算
        - 0.5 bytes 的参数个数：42467328 = 6 layers * (768 * 2304 + 768 * 768 + 2 * 768 * 3072)
        - 2 bytes 的参数个数：39445248 = 81912576 - 42467328
        - （0.5 * 42467328 + 2 * 39445248 ） / (1024^2) = 95.49 MB
  - 模型结构变化
    - attn
      - (c_attn): Conv1D(nf=2304, nx=768) -> (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
      - (c_proj): Conv1D(nf=768, nx=768) -> (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
    - mlp
      - (c_fc): Conv1D(nf=3072, nx=768) -> (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
      - (c_proj): Conv1D(nf=768, nx=3072) -> (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
  - 参数精度变化
    - 量化前：
      - 所有参数全都是dtype=torch.float32（32位）
        - transformer.h.0.attn.c_attn.weight: torch.Size([768, 2304]), dtype=torch.float32
        - transformer.h.0.attn.c_proj.weight: torch.Size([768, 768]), dtype=torch.float32
        - transformer.h.0.mlp.c_fc.weight: torch.Size([768, 3072]), dtype=torch.float32
        - transformer.h.0.mlp.c_proj.weight: torch.Size([3072, 768]), dtype=torch.float32
    - 量化后：
      - dtype=torch.uint8（每层的4个地方变为4bit）实际存储中使用压缩技术，将2个4bit组合为int8
        - transformer.h.0.attn.c_attn.weight: torch.Size([884736, 1]), dtype=torch.uint8
        - transformer.h.0.attn.c_proj.weight: torch.Size([294912, 1]), dtype=torch.uint8
        - transformer.h.0.mlp.c_fc.weight: torch.Size([1179648, 1]), dtype=torch.uint8
        - transformer.h.0.mlp.c_proj.weight: torch.Size([1179648, 1]), dtype=torch.uint8
      - dtype=torch.float16（其余参数都变为float16）


变化解释
- 变化
  - 量化前：transformer.h.0.attn.c_attn.weight: torch.Size([768, 2304]), dtype=torch.float32
  - 量化后：transformer.h.0.attn.c_attn.weight: torch.Size([884736, 1]), dtype=torch.uint8
- 解释
  - 原始 float32 的矩阵 [768, 2304] → 总共参数数量是 768 * 2304 = 1,769,472
  - 用 4-bit 表示就是 1,769,472 * 0.5 byte = 884,736 bytes = 884736 uint8 （存储为 packed 的 uint8，每 byte 存两个 4-bit 权重）
  - [884736, 1]正是把原来的权重展开成1维后量化存储的结果




In [7]:
original_model.num_parameters()

81912576

In [8]:
quantized_model.num_parameters()

81912576

In [9]:
# 原始模型大小
print(f"Original size: {original_model.get_memory_footprint()/1024**2:.2f} MB")

# 量化模型大小
print(f"Quantized size: {quantized_model.get_memory_footprint()/1024**2:.2f} MB")

Original size: 318.47 MB
Quantized size: 101.49 MB


In [10]:
original_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
quantized_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=

In [12]:
for name, param in original_model.named_parameters():
    print(f"{name}: {param.shape}, dtype={param.dtype}")

transformer.wte.weight: torch.Size([50257, 768]), dtype=torch.float32
transformer.wpe.weight: torch.Size([1024, 768]), dtype=torch.float32
transformer.h.0.ln_1.weight: torch.Size([768]), dtype=torch.float32
transformer.h.0.ln_1.bias: torch.Size([768]), dtype=torch.float32
transformer.h.0.attn.c_attn.weight: torch.Size([768, 2304]), dtype=torch.float32
transformer.h.0.attn.c_attn.bias: torch.Size([2304]), dtype=torch.float32
transformer.h.0.attn.c_proj.weight: torch.Size([768, 768]), dtype=torch.float32
transformer.h.0.attn.c_proj.bias: torch.Size([768]), dtype=torch.float32
transformer.h.0.ln_2.weight: torch.Size([768]), dtype=torch.float32
transformer.h.0.ln_2.bias: torch.Size([768]), dtype=torch.float32
transformer.h.0.mlp.c_fc.weight: torch.Size([768, 3072]), dtype=torch.float32
transformer.h.0.mlp.c_fc.bias: torch.Size([3072]), dtype=torch.float32
transformer.h.0.mlp.c_proj.weight: torch.Size([3072, 768]), dtype=torch.float32
transformer.h.0.mlp.c_proj.bias: torch.Size([768]), dtyp

In [13]:
for name, param in quantized_model.named_parameters():
    print(f"{name}: {param.shape}, dtype={param.dtype}")

transformer.wte.weight: torch.Size([50257, 768]), dtype=torch.float16
transformer.wpe.weight: torch.Size([1024, 768]), dtype=torch.float16
transformer.h.0.ln_1.weight: torch.Size([768]), dtype=torch.float16
transformer.h.0.ln_1.bias: torch.Size([768]), dtype=torch.float16
transformer.h.0.attn.c_attn.weight: torch.Size([884736, 1]), dtype=torch.uint8
transformer.h.0.attn.c_attn.bias: torch.Size([2304]), dtype=torch.float16
transformer.h.0.attn.c_proj.weight: torch.Size([294912, 1]), dtype=torch.uint8
transformer.h.0.attn.c_proj.bias: torch.Size([768]), dtype=torch.float16
transformer.h.0.ln_2.weight: torch.Size([768]), dtype=torch.float16
transformer.h.0.ln_2.bias: torch.Size([768]), dtype=torch.float16
transformer.h.0.mlp.c_fc.weight: torch.Size([1179648, 1]), dtype=torch.uint8
transformer.h.0.mlp.c_fc.bias: torch.Size([3072]), dtype=torch.float16
transformer.h.0.mlp.c_proj.weight: torch.Size([1179648, 1]), dtype=torch.uint8
transformer.h.0.mlp.c_proj.bias: torch.Size([768]), dtype=tor

In [14]:
for name, module in quantized_model.named_modules():
    if isinstance(module, bnb.nn.Linear4bit):
        print(f"{name}:")
        print(f"  weight dtype: {module.weight.dtype}")
        print(f"  compute dtype: {module.compute_dtype}")

transformer.h.0.attn.c_attn:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.0.attn.c_proj:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.0.mlp.c_fc:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.0.mlp.c_proj:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.1.attn.c_attn:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.1.attn.c_proj:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.1.mlp.c_fc:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.1.mlp.c_proj:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.2.attn.c_attn:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.2.attn.c_proj:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.2.mlp.c_fc:
  weight dtype: torch.uint8
  compute dtype: torch.bfloat16
transformer.h.2.mlp.c_proj:
  we

# Preparing the Dataset

In [15]:
# 选择 yahma/alpaca-cleaned 作为数据集
dataset_name = "yahma/alpaca-cleaned"

# 加载数据集
full_dataset = load_dataset(dataset_name, split="train")

# 选取小规模子集（1000 条）
small_subset = full_dataset.shuffle(seed=42).select(range(1000))

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [16]:
full_dataset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 51760
})

In [17]:
small_subset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 1000
})

In [18]:
# 定义 Alpaca 数据集的 Prompt 模版
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# 定义 formatting_prompts_func 函数
def formatting_prompts_func(examples):

    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]

    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output)
        texts.append(text)
    return { "text" : texts }


# 应用 formatting_prompts_func 函数
small_subset = small_subset.map(formatting_prompts_func, batched=True,)

print(small_subset)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['output', 'input', 'instruction', 'text'],
    num_rows: 1000
})


In [19]:
# 对 "text" 应用 tokenizer；如果超长，截断到模型最大长度；所有样本 pad 到相同长度，方便 batch 训练
small_subset = small_subset.map(lambda samples: tokenizer(samples["text"], truncation=True, padding="max_length"), batched=True)

print(small_subset)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['output', 'input', 'instruction', 'text', 'input_ids', 'attention_mask'],
    num_rows: 1000
})


In [25]:
# 查看数据示例
print("数据示例:", small_subset[0])

数据示例: {'output': 'Early, she left the party.', 'input': 'She left the party early', 'instruction': 'Rearrange the following sentence to make the sentence more interesting.', 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nRearrange the following sentence to make the sentence more interesting.\n\n### Input:\nShe left the party early\n\n### Response:\nEarly, she left the party.', 'input_ids': [21106, 318, 281, 12064, 326, 8477, 257, 4876, 11, 20312, 351, 281, 5128, 326, 3769, 2252, 4732, 13, 19430, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 49, 451, 9521, 262, 1708, 6827, 284, 787, 262, 6827, 517, 3499, 13, 198, 198, 21017, 23412, 25, 198, 3347, 1364, 262, 2151, 1903, 198, 198, 21017, 18261, 25, 198, 20457, 11, 673, 1364, 262, 2151, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 502

# Fine-Tuning

In [26]:
# LoRA 参数配置
peft_config = LoraConfig(
    r=8, # 秩，越大表达能力越强，但参数也更多
    lora_alpha=16, # 缩放因子
    lora_dropout=0.05, # dropout 概率
    target_modules=["c_attn", "c_proj", "c_fc"],  # 需要插入 LoRA 的模块
    bias="none", # 是否训练 bias 项：否
    task_type="CAUSAL_LM", # 任务类型：因果语言建模
)

# 训练参数配置
training_args = SFTConfig(
    output_dir="outputs", # 输出路径
    logging_steps=1, # 多少steps记录一次日志
    num_train_epochs=3, # 训练轮数
    per_device_train_batch_size=2, # 每个设备的训练批次大小
    per_device_eval_batch_size=2, # 每个设备的验证批次大小
    gradient_accumulation_steps=5, # 梯度累积
    gradient_checkpointing=True, # 启用梯度检查点
    learning_rate=2e-4, # 学习率
    optim="adamw_8bit", # 优化器
    weight_decay=0.01, # 权重衰减
    max_grad_norm=0.3, # 梯度裁剪
    warmup_ratio=0.03, # 预热比例
    fp16=not torch.cuda.is_bf16_supported(), # 使用半精度训练
    bf16=torch.cuda.is_bf16_supported(),
    dataset_text_field="text",
)

# 实例化 SFTTrainer
trainer = SFTTrainer(
    model=quantized_model,
    train_dataset=small_subset,
    peft_config=peft_config,
    args=training_args,
)

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [27]:
trainer.model.print_trainable_parameters()

trainable params: 589,824 || all params: 82,502,400 || trainable%: 0.7149


- 基础模型参数量
  - 81912576
- 模型结构中的目标模块
  - attn
    - (c_attn): Conv1D(nf=2304, nx=768) -> (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
    - (c_proj): Conv1D(nf=768, nx=768) -> (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
  - mlp
    - (c_fc): Conv1D(nf=3072, nx=768) -> (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
    - (c_proj): Conv1D(nf=768, nx=3072) -> (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
- 参数之间的关系
  - 总参数量 82,502,400 - QLoRA可训练参数量 589,824 = 基础模型参数量 81912576
  - QLoRA可训练参数量 589,824 = 6层 * (24576 + 12288 + 30720 + 30720) = 6 * 98304
    - 共6层，每层4个目标模块
      - c_attn: 768 * 8 + 2304 * 8 = 24576
      - c_proj: 768 * 8 + 768 * 8 = 12288
      - c_fc: 3072 * 8 + 768 * 8 = 30720
      - c_proj: 768 * 8 + 3072 * 8 = 30720

In [28]:
# Train model
torch.cuda.empty_cache()
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mwhggydycq[0m ([33mwhggydycq-beihang-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,7.8139
2,8.1432
3,8.3673
4,8.1904
5,7.7668
6,7.9738
7,8.0458
8,7.3412
9,7.5425
10,7.3058


Step,Training Loss
1,7.8139
2,8.1432
3,8.3673
4,8.1904
5,7.7668
6,7.9738
7,8.0458
8,7.3412
9,7.5425
10,7.3058


TrainOutput(global_step=300, training_loss=1.0729552553594113, metrics={'train_runtime': 1098.6428, 'train_samples_per_second': 2.731, 'train_steps_per_second': 0.273, 'total_flos': 794761887744000.0, 'train_loss': 1.0729552553594113})

# Save Trained Model

In [64]:
# Save trained model
peft_model = "distilgpt2-qlora"

trainer.model.save_pretrained(peft_model)

In [65]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

peft_model = PeftModel.from_pretrained(base_model, peft_model)
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("merged_model", safe_serialization=True, max_shard_size="2GB")



# Inference with the fine-tuned model

In [91]:
def get_outputs(model, inputs):
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        eos_token_id=tokenizer.eos_token_id,
        temperature=0.5,  # 控制生成文本的随机性
        top_p=0.7,        # 核采样
        do_sample=True    # 启用采样
    )
    return outputs

In [89]:
input_sentences = tokenizer("The meaning of life is",return_tensors="pt").to('cuda')

In [92]:
foundational_outputs_sentence = get_outputs(merged_model,
                                            input_sentences)

print(tokenizer.batch_decode(foundational_outputs_sentence,
                             skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['The meaning of life is to be able to live in harmony with the world.']


In [93]:
import gc
del original_model
del quantized_model
del trainer
del small_subset
torch.cuda.empty_cache()
gc.collect()

3534

# QLoRA 和全参数微调⽅法对比

- 微调参数量：  
  - QLoRA：仅微调一小部分参数（LoRA Adapter），在本例中，可训练参数仅占总参数的 0.7149%
  - 全参数微调：更新所有模型参数，需要更多内存和计算资源
- 内存使用：  
  - QLoRA：使用 4-bit 量化，内存占用大幅减少
  - 全参数微调：需要全精度（例如 16 位或 32 位），通常需要较大的 GPU 内存
- 训练速度：  
  - QLoRA：由于参数少且精度低，训练更快  
  - 全参数微调：更新所有权重，训练时间较长
- 性能：  
  - QLoRA：由于量化和有限的参数更新，性能可能略低于全参数微调，但仍保留大部分能力
  - 全参数微调：因所有权重都被优化，性能可能更高
- 适用场景：  
  - QLoRA：适合资源受限环境或快速实验  
  - 全参数微调：适合资源充足且需最大精度的场景

# QLoRA 在该任务中的优势和潜在局限性

## 优势：
- 内存效率：4 位量化和 LoRA 减少了内存使用，使微调能在较小的 GPU 上运行（例如 Colab 免费版）
- 速度：由于参数少且精度低，训练速度更快，适合快速迭代
- 保留预训练知识：冻结大部分权重保留了基础模型的泛化能力，同时适配任务
- 适用于小数据集：对像 1,000 条 Alpaca 子集这样的小数据集效果良好，不易过拟合

## 局限性：
- 性能权衡：量化和 LoRA 可能导致性能略低于全参数微调，尤其在复杂任务中  
- 任务特定性：LoRA 适配器是任务特定的，切换任务需重新训练或维护多个适配器  
- 量化噪声：4-bit 精度引入噪声，可能影响输出质量  
- 超参数敏感性：调整 `r`、`lora_alpha` 和量化设置需要实验，增加了复杂性