# Phi-2 + QLoRA 训练实验

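本实验将使用QLoRA方法对Phi-2模型进行微调训练。注意：下文实际完成的训练使用的是通过Unsloth加载的4-bit量化模型 `unsloth/Llama-3.2-1B-bnb-4bit`；基于 `fast_train.sh` 的Phi-2训练单元保留在笔记本后半部分。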

**预计执行时间**: 4-5小时

**CUDA版本要求**: 12.4

In [2]:
# @title 安装Unsloth
!pip install unsloth accelerate transformers
!pip install datasets

Collecting unsloth
  Downloading unsloth-2025.3.8-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting unsloth_zoo>=2025.3.7 (from unsloth)
  Downloading unsloth_zoo-2025.3.7-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.16-py3-none-any.whl.metadata (9.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  Downloading

In [19]:
# @title 环境设置与验证
import torch
from unsloth import FastLanguageModel
import os

# Disable Weights & Biases reporting.
os.environ["WANDB_DISABLED"] = "true"

# Report the PyTorch build and, when a GPU is visible, its CUDA and memory details.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch版本: {torch.__version__}")
print(f"CUDA可用: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA版本: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU内存: {gpu_props.total_memory / 1e9:.2f} GB")

PyTorch版本: 2.6.0+cu124
CUDA可用: True
CUDA版本: 12.4
GPU: Tesla T4
GPU内存: 15.83 GB


In [20]:
# Check the CUDA toolkit (nvcc) version
!nvcc --version
print("\n")

# Check the driver, system information and GPU status
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


Sat Mar  8 03:08:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   75C    P0             34W /   70W |     102MiB /  15360MiB |      0%      Default |
|                     

In [5]:
# Clone the project repository
!git clone https://github.com/michaelearncoding/MSE718FinalProject-FinetuneAnalysisForLLMTechnique.git

Cloning into 'MSE718FinalProject-FinetuneAnalysisForLLMTechnique'...
remote: Enumerating objects: 66, done.[K
remote: Counting objects: 100% (1/1), done.[K
remote: Total 66 (delta 0), reused 0 (delta 0), pack-reused 65 (from 1)[K
Receiving objects: 100% (66/66), 74.84 MiB | 17.61 MiB/s, done.
Resolving deltas: 100% (16/16), done.
Updating files: 100% (40/40), done.


In [6]:
# Change into the cloned repository root (absolute Colab path)
%cd /content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique


/content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique


In [15]:
import os
import random

# Paths are relative to the repository root.
SOURCE_JSONL = "llm-peft-compare/data/alpaca_train.jsonl"
SUBSET_JSONL = "llm-peft-compare/data/alpaca_train_2k.jsonl"


def create_subset(source_path, subset_path, size=2000, seed=42):
    """Write a reproducible random subset of a JSONL file.

    Args:
        source_path: Path of the JSONL file to sample from.
        subset_path: Path the sampled lines are written to (overwritten).
        size: Maximum number of lines to keep; every line is kept when the
            file has fewer than ``size`` lines.
        seed: Seed passed to ``random.seed`` before sampling.

    Returns:
        The number of lines written, or ``None`` when ``source_path`` does
        not exist (nothing is written in that case).
    """
    if not os.path.exists(source_path):
        return None

    with open(source_path, "r", encoding="utf-8") as src:
        # Skip blank lines and force exactly one trailing newline per record, so a
        # final line without "\n" cannot be glued to the next record by writelines().
        lines = [line.rstrip("\r\n") + "\n" for line in src if line.strip()]

    random.seed(seed)
    sample = random.sample(lines, min(size, len(lines)))

    with open(subset_path, "w", encoding="utf-8") as dst:
        dst.writelines(sample)

    return len(sample)


# Create the 2K-sample subset.
subset_count = create_subset(SOURCE_JSONL, SUBSET_JSONL)
if subset_count is None:
    print("❌ JSONL文件不存在")
else:
    print(f"✅ 创建了2K数据子集，共{subset_count}条数据")

✅ 创建了2K数据子集，共2000条数据


In [16]:
# Debug: print the current working directory
!pwd
# Note of the project sub-directory: content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique/llm-peft-compare

/content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique


In [18]:
# @title 数据准备
import json
from datasets import Dataset

# Read the 2K JSONL subset: one JSON record per line.
with open("llm-peft-compare/data/alpaca_train_2k.jsonl", "r") as f:
    data = list(map(json.loads, f))

# Turn each record into a prompt/response pair; the "Input:" line is only
# included when the record has a non-empty input.
formatted_data = []
for item in data:
    instruction = item["instruction"]
    input_text = item.get("input", "")
    output = item["output"]

    prompt = (
        f"Instruction: {instruction}\nInput: {input_text}\nOutput: "
        if input_text
        else f"Instruction: {instruction}\nOutput: "
    )

    formatted_data.append({"prompt": prompt, "response": output})

# Build the Hugging Face dataset.
dataset = Dataset.from_list(formatted_data)
print(f"数据集准备完成，共{len(dataset)}个样本")

数据集准备完成，共2000个样本


In [21]:
# Create the required directory structure (-p makes this safe to re-run)
!mkdir -p llm-peft-compare/data
!mkdir -p llm-peft-compare/models
!mkdir -p llm-peft-compare/results/figures
!mkdir -p llm-peft-compare/results/model_comparison
!mkdir -p llm-peft-compare/results/raw_data

In [7]:
# # Prepare the dataset
# %cd llm-peft-compare

# # Check whether the data already exists; download it if not
# if not os.path.exists("data/alpaca_train.jsonl"):
#     !wget https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json -O data/alpaca_data.json
#     # Convert to JSONL format
#     !python -c "import json; data = json.load(open('data/alpaca_data.json')); [print(json.dumps(item)) for item in data]" > data/alpaca_train.jsonl

# # Create the 5K data subset
# if not os.path.exists("data/alpaca_train_5k.jsonl"):
#     !python scripts/create_subset_data.py --input data/alpaca_train.jsonl --output data/alpaca_train_5k.jsonl --size 5000

/content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique/llm-peft-compare


In [25]:
# Change into the llm-peft-compare project directory; the relative paths used below ("data/...", "models/...") resolve from here
%cd /content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique/llm-peft-compare

/content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique/llm-peft-compare


In [26]:
# Debug: print the current working directory
!pwd

/content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique/llm-peft-compare


## 开始训练: Llama-3.2-1B + QLoRA（使用Unsloth进行微调）


接下来我们直接在笔记本中使用Unsloth完成数据加载、LoRA配置与训练（模型为4-bit量化的Llama-3.2-1B）。通过`fast_train.sh`脚本、使用'small'模型尺寸（Phi-2）和'qlora'方法进行训练的单元见笔记本后半部分。

In [33]:
# @title 正确加载JSONL数据
import json
import os
from datasets import Dataset

# 确保数据目录存在
os.makedirs("data", exist_ok=True)

# 数据路径
data_path = "data/alpaca_train.jsonl"

# 确认文件是否存在
if not os.path.exists(data_path):
    print(f"文件不存在: {data_path}")
    print("尝试下载原始数据...")
    !wget https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json -O data/alpaca_data.json

    # 转换为JSONL
    if os.path.exists("data/alpaca_data.json"):
        with open("data/alpaca_data.json", "r") as f:
            data_list = json.load(f)

        with open(data_path, "w") as f:
            for item in data_list:
                f.write(json.dumps(item) + "\n")

        print(f"✅ 已创建JSONL文件: {data_path}")

# 以更严格的方式加载数据
data = []
with open(data_path, "r") as f:
    for i, line in enumerate(f):
        try:
            item = json.loads(line.strip())
            if isinstance(item, dict) and "instruction" in item and "output" in item:
                data.append(item)
            else:
                print(f"跳过第{i+1}行: 格式不正确")
        except json.JSONDecodeError:
            print(f"跳过第{i+1}行: JSON解析错误")

print(f"成功加载了{len(data)}条有效数据")

# 只取前2000条作为训练数据
training_data = data[:2000]
print(f"使用前2000条数据进行训练")

# 转换为Unsloth所需格式
formatted_data = []
for item in training_data:
    instruction = item["instruction"]
    input_text = item.get("input", "")
    output = item["output"]

    # 构建提示文本
    if input_text:
        prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput: "
    else:
        prompt = f"Instruction: {instruction}\nOutput: "

    formatted_data.append({"prompt": prompt, "response": output})

# 创建数据集
dataset = Dataset.from_list(formatted_data)
print(f"✅ 数据集创建成功，共{len(dataset)}个样本")


成功加载了52002条有效数据
使用前2000条数据进行训练
✅ 数据集创建成功，共2000个样本


In [40]:
# Show the dataset summary (feature names and row count)
dataset

Dataset({
    features: ['prompt', 'response'],
    num_rows: 2000
})

In [42]:
# Preview the first three responses
dataset['response'][:3]

['1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 'The three primary colors are red, blue, and yellow.',
 'An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.']

In [43]:
# Preview the first three prompts
dataset['prompt'][:3]

['Instruction: Give three tips for staying healthy.\nOutput: ',
 'Instruction: What are the three primary colors?\nOutput: ',
 'Instruction: Describe the structure of an atom.\nOutput: ']

In [37]:
# 1. Load the Llama-3.2-1B model and its tokenizer through Unsloth
from unsloth import FastLanguageModel
import torch

# Loader options, collected in one place before the call.
load_options = {
    "model_name": "unsloth/Llama-3.2-1B-bnb-4bit",
    "max_seq_length": 1024,
    "dtype": torch.float16,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

==((====))==  Unsloth 2025.3.8: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [38]:
# 2. Attach the LoRA adapter
# NOTE: this rebinds `model`, so re-running the cell wraps the already-wrapped model.
lora_options = dict(
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)
model = FastLanguageModel.get_peft_model(model, **lora_options)

Unsloth 2025.3.8 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [46]:
# 3. Configure the training arguments
# Imported in this cell so it does not depend on hidden kernel state.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="models/llama-3.2-1b-instruction-unsloth",
    num_train_epochs=1,
    per_device_train_batch_size=4,  # smaller model, so a larger batch fits
    gradient_accumulation_steps=4,  # so fewer accumulation steps are needed
    gradient_checkpointing=True,
    optim="adamw_torch",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    fp16=True,
    report_to="none",
)

In [50]:
# Formatting function for the trainer - returns a list of strings
def formatting_func(examples):
    """
    Render prompt/response pairs as chat-style training strings.

    Args:
        examples: Mapping with 'prompt' and 'response' entries. Each entry is
            either a list (a batch) or a single value, which is treated as a
            batch of one.

    Returns:
        list: One formatted string per prompt/response pair. Both parts are
        stripped of surrounding whitespace; no end-of-sequence token is
        appended here.

    NOTE(review): the <|system|>/<|user|>/<|assistant|> markers are plain text
    and may not match the official Llama 3.2 chat template - confirm this is
    the intended format.
    """
    prompts, responses = examples["prompt"], examples["response"]

    # A single example arrives as bare values; wrap both into one-item lists.
    if not isinstance(prompts, list):
        prompts, responses = [prompts], [responses]

    return [
        f"<|system|>\nYou are a helpful, accurate and truthful AI assistant.\n<|user|>\n{prompt.strip()}\n<|assistant|>\n{response.strip()}"
        for prompt, response in zip(prompts, responses)
    ]

In [51]:
# 5. Build the trainer
# Imported in this cell so it does not depend on hidden kernel state.
# NOTE(review): unsloth is imported by earlier cells; keep that import ahead of trl.
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,  # dataset built by the loading cell
    formatting_func=formatting_func,  # renders each example as one training string
    args=training_args,
    max_seq_length=1024,
    packing=False,
)

Tokenizing to ["text"] (num_proc=2):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [52]:
# Display the trainer object's repr
trainer

<unsloth_compiled_cache.UnslothSFTTrainer.UnslothSFTTrainer at 0x7d33c2133510>

In [53]:
# Show the training configuration
# NOTE: the model name and the LoRA rank are fixed text, not read from the objects.
summary_lines = [
    "训练设置:",
    f"- 模型: unsloth/Llama-3.2-1B-bnb-4bit",
    f"- 数据集样本数: {len(dataset)}",
    f"- 批量大小: {training_args.per_device_train_batch_size}",
    f"- 梯度累积: {training_args.gradient_accumulation_steps}",
    f"- 训练轮数: {training_args.num_train_epochs}",
    f"- LoRA秩: 16",
]
for summary_line in summary_lines:
    print(summary_line)

训练设置:
- 模型: unsloth/Llama-3.2-1B-bnb-4bit
- 数据集样本数: 2000
- 批量大小: 4
- 梯度累积: 4
- 训练轮数: 1
- LoRA秩: 16


In [54]:
# Count the trainable and total parameters in a single pass
trainable_params = 0
total_params = 0
for param in model.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count
print(f"可训练参数: {trainable_params:,} ({100 * trainable_params / total_params:.4f}% 占总参数)")


可训练参数: 11,272,192 (1.4821% 占总参数)


In [55]:
# 6. Start training (the return value of train() is shown as the cell output)
print("\n🔄 开始训练...")
trainer.train()


🔄 开始训练...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1 | Total steps = 125
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 11,272,192/760,547,328 (1.48% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.3229
20,1.3883
30,1.2772
40,1.2232
50,1.2163
60,1.2019
70,1.2247
80,1.1714
90,1.18
100,1.1584


TrainOutput(global_step=125, training_loss=1.2937623710632324, metrics={'train_runtime': 208.942, 'train_samples_per_second': 9.572, 'train_steps_per_second': 0.598, 'total_flos': 1855589126307840.0, 'train_loss': 1.2937623710632324})

In [56]:
# 7. Save the model and the tokenizer into the same directory
output_dir = "models/llama-3.2-1b-instruction-unsloth/final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"✅ 训练完成，模型保存在: {output_dir}")

✅ 训练完成，模型保存在: models/llama-3.2-1b-instruction-unsloth/final


In [57]:
# @title 压缩并下载训练好的模型
from google.colab import files
import os
import shutil

# 模型路径
model_path = "models/llama-3.2-1b-instruction-unsloth/final"

# 检查模型文件夹是否存在
if os.path.exists(model_path):
    # 压缩模型文件夹
    output_file = "llama-3.2-1b-instruction-unsloth.zip"
    print(f"正在压缩模型文件夹到 {output_file}...")
    !zip -r {output_file} {model_path}

    # 下载压缩包
    print("开始下载模型压缩包...")
    files.download(output_file)
    print("下载已启动，请检查浏览器下载区域")
else:
    print(f"❌ 错误：模型目录 {model_path} 不存在")

正在压缩模型文件夹到 llama-3.2-1b-instruction-unsloth.zip...
  adding: models/llama-3.2-1b-instruction-unsloth/final/ (stored 0%)
  adding: models/llama-3.2-1b-instruction-unsloth/final/README.md (deflated 66%)
  adding: models/llama-3.2-1b-instruction-unsloth/final/tokenizer.json (deflated 85%)
  adding: models/llama-3.2-1b-instruction-unsloth/final/tokenizer_config.json (deflated 96%)
  adding: models/llama-3.2-1b-instruction-unsloth/final/adapter_model.safetensors (deflated 8%)
  adding: models/llama-3.2-1b-instruction-unsloth/final/special_tokens_map.json (deflated 71%)
  adding: models/llama-3.2-1b-instruction-unsloth/final/adapter_config.json (deflated 56%)
开始下载模型压缩包...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

下载已启动，请检查浏览器下载区域


In [58]:
# Debug: print the current working directory
!pwd

/content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique/llm-peft-compare


In [59]:
# Merge the LoRA adapter into the base model
from unsloth import FastLanguageModel
import torch

# 1. Load the base model (this also rebinds `tokenizer`)
# NOTE(review): the model name ends in "-bnb-4bit", so the adapter is presumably
# merged into quantized weights later on - confirm this is intended.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-bnb-4bit",  # base model path
    max_seq_length=1024,
    dtype=torch.float16,
)

# 2. Load your LoRA adapter
# models/llama-3.2-1b-instruction-unsloth/final

# Disabled first attempt, kept for reference; the recorded output of this cell shows
# "TypeError: ... got an unexpected keyword argument 'model_path'".
# adapter_path = "models/llama-3.2-1b-instruction-unsloth/final"  # replace with the path of your downloaded adapter
# model = FastLanguageModel.from_pretrained(
#     model_path=adapter_path,
#     model=base_model,  # pass in the base model
#     tokenizer=tokenizer
# )

# # 3. Merge the LoRA weights into the base model
# merged_model = FastLanguageModel.merge_lora_to_base_model(model)

# # 4. Save the complete merged model
# output_path = "llama-3.2-1b-merged"
# merged_model.save_pretrained(output_path)
# tokenizer.save_pretrained(output_path)

# print(f"✅ 模型已合并并保存到: {output_path}")

==((====))==  Unsloth 2025.3.8: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
==((====))==  Unsloth 2025.3.8: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

TypeError: LlamaForCausalLM.__init__() got an unexpected keyword argument 'model_path'

In [61]:
from peft import PeftModel
# 2. Load your LoRA adapter
adapter_path = "models/llama-3.2-1b-instruction-unsloth/final"  # adapter path
# The correct approach is PeftModel.from_pretrained (this rebinds `model`)
model = PeftModel.from_pretrained(base_model, adapter_path)

In [62]:
# 3. Merge the LoRA weights into the base model
merged_model = model.merge_and_unload()



In [63]:
# 4. Save the complete merged model together with the tokenizer
output_path = "llama-3.2-1b-merged"
merged_model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

print(f"✅ 模型已合并并保存到: {output_path}")

✅ 模型已合并并保存到: llama-3.2-1b-merged


In [64]:
# Debug: print the current working directory
!pwd

/content/MSE718FinalProject-FinetuneAnalysisForLLMTechnique/llm-peft-compare


In [65]:
# 直接压缩并下载合并后的模型
from google.colab import files
import os

# 模型路径
model_path = "llama-3.2-1b-merged"

# 检查模型文件夹是否存在
if os.path.exists(model_path):
    # 查看模型大小（供参考）
    !du -sh {model_path}

    # 压缩模型文件夹
    output_file = "llama-3.2-1b-merged.zip"
    print(f"正在压缩模型到 {output_file}...")
    !zip -r {output_file} {model_path}

    # 下载压缩包
    print("开始下载模型压缩包...")
    files.download(output_file)
    print("下载已启动，请检查浏览器下载区域")
else:
    print(f"❌ 错误：模型目录 {model_path} 不存在")

997M	llama-3.2-1b-merged
正在压缩模型到 llama-3.2-1b-merged.zip...
  adding: llama-3.2-1b-merged/ (stored 0%)
  adding: llama-3.2-1b-merged/config.json (deflated 55%)
  adding: llama-3.2-1b-merged/tokenizer.json (deflated 85%)
  adding: llama-3.2-1b-merged/tokenizer_config.json (deflated 96%)
  adding: llama-3.2-1b-merged/generation_config.json (deflated 37%)
  adding: llama-3.2-1b-merged/special_tokens_map.json (deflated 71%)
  adding: llama-3.2-1b-merged/model.safetensors (deflated 13%)
开始下载模型压缩包...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

下载已启动，请检查浏览器下载区域


In [8]:
# Make the training script executable
!chmod +x scripts/fast_train.sh

In [14]:
# Run the training script with the 'small' model size and the 'qlora' method
# NOTE(review): the recorded output shows a "sed: can't read ..." error from the script - confirm the script works on this platform.
!./scripts/fast_train.sh small qlora

🚀 开始优化版LLM微调训练（使用5K数据子集）
⚠️ 未检测到Apple Silicon Mac，此脚本优化效果可能有限
使用配置:
- 模型: phi_2.7b
- 方法: qlora
- 批量大小: 4
- 梯度累积: 4
- 训练轮数: 1
sed: can't read s|TRAIN_FILE = os.path.join(PROJECT_ROOT, "data", "alpaca_train.jsonl")|TRAIN_FILE = os.path.join(PROJECT_ROOT, "data", "alpaca_train_5k.jsonl")|g: No such file or directory

🔄 开始训练 phi_2.7b 使用 qlora 方法...
执行命令: python scripts/train_instruction.py --method qlora --model_size small --batch_size 4 --gradient_accumulation_steps 4 --epochs 1
2025-03-08 02:27:59.283671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741400879.319944    3811 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741400879.330813    3811 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin 

In [24]:
# Run the bitsandbytes self-diagnostic (prints its CUDA setup report)
!python -m bitsandbytes

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++ BUG REPORT INFORMATION ++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++ OTHER +++++++++++++++++++++++++++
CUDA specs: CUDASpecs(highest_compute_capability=(7, 5), cuda_version_string='124', cuda_version_tuple=(12, 4))
PyTorch settings found: CUDA_VERSION=124, Highest Compute Capability: (7, 5).
To manually override the PyTorch CUDA version please see: https://github.com/TimDettmers/bitsandbytes/blob/main/docs/source/nonpytorchcuda.mdx
The directory listed in your path is found to be non-existent: /sys/fs/cgroup/memory.events /var/colab/cgroup/jupyter-children/memory.events
The directory listed in your path is found to be non-existent: //172.28.0.1
The directory listed in your path is found to be non-existent: 8013
The directory listed in your path is found to be non-existent: //colab.research.google.com/tun/m/cc48301118ce562b961b3c22d803539adc1e0c19/

In [29]:
# Install pinned versions of accelerate, peft, bitsandbytes, transformers and trl
# NOTE(review): these pins differ from the versions pulled in by the unsloth install
# (e.g. bitsandbytes 0.45.3) and the recorded output shows a pip dependency-resolver
# error - confirm this cell is still wanted.
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m103.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following depe

In [30]:
# # Completely uninstall the current version
# !pip uninstall -y bitsandbytes

# # Install a known-stable version
# !pip install bitsandbytes==0.40.0

# # Verify the installation
# import torch
# import bitsandbytes as bnb
# print(f"bitsandbytes版本: {bnb.__version__}")

In [31]:
# Verify that QLoRA (4-bit quantization) is usable
import torch
import bitsandbytes as bnb

# Build a small 4-bit quantized linear layer and run one forward pass on the GPU.
try:
    # The input lives on CUDA, so the layer has to be moved there too; otherwise the
    # check fails on a device mismatch even when bitsandbytes itself is healthy.
    # NOTE(review): bitsandbytes documents that the 4-bit quantization itself happens
    # when the layer is moved to the device - confirm against the installed version.
    layer = bnb.nn.Linear4bit(10, 10).cuda()
    test_input = torch.randn(1, 10).cuda()
    output = layer(test_input)
    print("✅ QLoRA测试成功! 4-bit量化正常工作")
except Exception as e:
    # Best-effort probe: report the failure instead of stopping the notebook.
    print(f"❌ QLoRA测试失败: {str(e)}")


The following directories listed in your path were found to be non-existent: {PosixPath('/sys/fs/cgroup/memory.events /var/colab/cgroup/jupyter-children/memory.events')}
The following directories listed in your path were found to be non-existent: {PosixPath('http'), PosixPath('8013'), PosixPath('//172.28.0.1')}
The following directories listed in your path were found to be non-existent: {PosixPath('//colab.research.google.com/tun/m/cc48301118ce562b961b3c22d803539adc1e0c19/gpu-t4-s-eoak20sgw5ru --tunnel_background_save_delay=10s --tunnel_periodic_background_save_frequency=30m0s --enable_output_coalescing=true --output_coalescing_required=true --enable_kernel_event_logging=true '), PosixPath('--logtostderr --listen_host=172.28.0.12 --target_host=172.28.0.12 --tunnel_background_save_url=https')}
The following directories listed in your path were found to be non-existent: {PosixPath('/datalab/web/pyright/typeshed-fallback/stdlib,/usr/local/lib/python3.10/dist-packages')}
The following dir

RuntimeError: 
        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues

In [17]:
# Report the bitsandbytes / CUDA / PyTorch versions and the visible CUDA device
import torch
import bitsandbytes as bnb
import subprocess
import os
from IPython.display import display, HTML

print(f"bitsandbytes版本: {bnb.__version__}")
print(f"CUDA版本: {torch.version.cuda}")
print(f"PyTorch版本: {torch.__version__}")

# Check for a CUDA device
if not torch.cuda.is_available():
    print("❌ 未检测到CUDA设备")
else:
    print(f"检测到CUDA设备: {torch.cuda.get_device_name(0)}")
    # mem_get_info() returns a pair; the first entry is reported here in GB.
    free_bytes = torch.cuda.mem_get_info()[0]
    print(f"CUDA可用内存: {free_bytes/1e9:.2f} GB")

False

The following directories listed in your path were found to be non-existent: {PosixPath('/sys/fs/cgroup/memory.events /var/colab/cgroup/jupyter-children/memory.events')}
The following directories listed in your path were found to be non-existent: {PosixPath('http'), PosixPath('8013'), PosixPath('//172.28.0.1')}
The following directories listed in your path were found to be non-existent: {PosixPath('//colab.research.google.com/tun/m/cc48301118ce562b961b3c22d803539adc1e0c19/gpu-t4-s-eoak20sgw5ru --tunnel_background_save_delay=10s --tunnel_periodic_background_save_frequency=30m0s --enable_output_coalescing=true --output_coalescing_required=true --enable_kernel_event_logging=true '), PosixPath('--logtostderr --listen_host=172.28.0.12 --target_host=172.28.0.12 --tunnel_background_save_url=https')}
The following directories listed in your path were found to be non-existent: {PosixPath('/datalab/web/pyright/typeshed-fallback/stdlib,/usr/local/lib/python3.10/dist-packages')}
The followi


python -m bitsandbytes


  warn(msg)
  warn(msg)


RuntimeError: 
        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues

In [16]:
# Install bitsandbytes pinned to 0.41.1
!pip install bitsandbytes==0.41.1

Collecting bitsandbytes==0.41.1
  Downloading bitsandbytes-0.41.1-py3-none-any.whl.metadata (9.8 kB)
Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.41.1


In [18]:
# Upgrade bitsandbytes to the latest release
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [20]:
# Re-import bitsandbytes and test it
# NOTE(review): the recorded output is an ImportError raised from bitsandbytes.utils;
# presumably the kernel needs a restart after the package upgrade - confirm.
import importlib
import bitsandbytes as bnb
importlib.reload(bnb)
print(f"bitsandbytes CUDA支持: {bnb.cuda_setup.get_compute_capability() is not None}")

ImportError: cannot import name 'pack_dict_to_tensor' from 'bitsandbytes.utils' (/usr/local/lib/python3.11/dist-packages/bitsandbytes/utils.py)

## 训练结束后的模型评估（可选）

如果您在脚本执行过程中选择了跳过评估，您可以在这里手动执行评估。

In [None]:
# Run the evaluation script
!./scripts/run_evaluation.sh small qlora false

## 模型合并（可选）

如果您想将LoRA权重合并到基础模型中，可以运行以下命令：

In [None]:
# Merge the model
!python scripts/save_merged_model.py --model_size small --method qlora

In [None]:
# Download the trained model (optional)
from google.colab import files

# Compress the model folder
!tar -czvf phi_2.7b-instruction-qlora.tar.gz models/phi_2.7b-instruction-qlora/

# Download the compressed model file
files.download('phi_2.7b-instruction-qlora.tar.gz')