In [1]:
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    DataCollatorForSeq2Seq,
    TextStreamer,
)
from peft import (
    LoraConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from accelerate import infer_auto_device_map
import random
import numpy as np
import torch

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
base_model = "/workspace/LLM-finetune/codeLLM/huggingface/unsloth/Qwen2.5-0.5B"

# max_memory={0: "1GiB", 1: "38GiB"}

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # load_in_8bit=True,
    # torch_dtype=torch.float16,
    device_map = "auto", 
    # max_memory=max_memory,
)
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    device_map = "auto", 
    # max_memory=max_memory,
)

In [4]:
# 1. Configuration
max_seq_length = 2048
dtype = None
load_in_4bit = True 
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

instruction = "Create a function to compute the sum of a list of integers."
input = "[1, 2, 3, 4, 5]"

In [5]:
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
print(EOS_TOKEN)
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

<|endoftext|>


In [6]:
dataset = load_dataset("json", data_files="/workspace/LLM-finetune/codeLLM/syntheticTraining/synthetic/Qwen2.5-Coder-1.5B-Instruct.json", split="train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [7]:
dataset[0]

{'instruction': '5 seconds after the execution of the code, displays a message "Hello World!" in the console.\nHere is how you can achieve this using JavaScript:\n\n```javascript\nsetTimeout(function() {\n    console.log("Hello World!");\n}, 500);',
 'input': '',
 'output': '```\n\nThis will wait for 5 seconds before executing the function inside `setTimeout`. Inside that function, it logs the string `"Hello World!"` to the console. \n\nPlease note that this solution uses JavaScript. If you are looking for a solution in Python, you would need to use the `time.sleep()` function from the `time` module. Here\'s an example:\n\n```',
 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n5 seconds after the execution of the code, displays a message "Hello World!" in the console.\nHere is how you can achieve this using JavaScript:\n\n```javascript\nsetTimeout(

In [8]:
model.train() # put model back into training mode
# model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

In [9]:
model.print_trainable_parameters()
# 获取当前模型占用的 GPU显存（差值为预留给 PyTorch 的显存）
memory_footprint_bytes = model.get_memory_footprint()
memory_footprint_mib = memory_footprint_bytes / (1024 ** 3)  # 转换为 GB

print(f"{memory_footprint_mib:.2f}GB")

trainable params: 2,162,688 || all params: 496,195,456 || trainable%: 0.4359
1.85GB


In [10]:
model_dir = "/workspace/LLM-finetune/codeLLM/outputs"
model_id  = "Qwen-0.5B"
training_args = SFTConfig(
        output_dir=f"{model_dir}/{model_id}-synthetic-Qwen2.5-Coder-1.5B-Instruct",  # 指定模型输出和保存的目录
        max_steps=500,
        per_device_train_batch_size=2,  # 每个设备上的训练批量大小
        # num_train_epochs=1,  # 训练的总轮数
        logging_steps=20,  # 指定日志记录的步长，用于跟踪训练进度
        learning_rate=2e-4,  # 学习率
        max_seq_length=1024,
        # fp16=True,  # 启用混合精度训练，可以提高训练速度，同时减少内存使用
        save_strategy="steps",
        warmup_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        # report_to="wandb",
        save_safetensors=True,
        seed=3407,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False,
    args=training_args,
)

nvidia-smi: line 14: /usr/bin/data-set: No such file or directory

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


In [11]:
# Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_stats = trainer.train()



GPU = NVIDIA A100-PCIE-40GB. Max memory = 39.394 GB.
1.959 GB of memory reserved.


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m446146024[0m ([33m446146024-sun-yat-sen-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
20,1.0256
40,0.5153
60,0.5149
80,0.5631
100,0.4904
120,0.4727
140,0.4407
160,0.4842
180,0.4354
200,0.4854


In [12]:
lora_model = trainer.model

In [13]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        instruction, # instruction
        input, # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

lora_model.eval()
with torch.no_grad():
    print(tokenizer.decode(lora_model.generate(**inputs, max_new_tokens=100)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Create a function to compute the sum of a list of integers.

### Input:
[1, 2, 3, 4, 5]

### Response:
```python
def sum_list(numbers):
    total = 0
    for num in numbers:
        total += num
    return total

# Test the function
print(sum_list([1, 2, 3, 4, 5]))
```


In [14]:
new_model = "/workspace/LLM-finetune/codeLLM/huggingface/unsloth/synthetic/Qwen2.5-0.5B-synthetic_Qwen15B-Coder-Instruct"
trainer.save_model(new_model)