<a href="https://colab.research.google.com/github/novalsunn123/finetune/blob/main/finetune1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
# Install Unsloth and the libraries it needs; the %%capture magic above hides pip's output.
import os
# The environment is treated as Colab when any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs only the packages named on the command line
    # (presumably to leave Colab's preinstalled torch stack untouched - TODO confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [3]:
from unsloth import FastLanguageModel
import torch

# Load the Qwen3-4B base model and its tokenizer through Unsloth.
MODEL_NAME = "unsloth/Qwen3-4B"
MAX_SEQ_LENGTH = 2048  # Context length - can be longer, but uses more memory

load_options = dict(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    load_in_4bit = True,      # 4-bit quantization uses much less memory
    load_in_8bit = False,     # 8-bit is a bit more accurate but uses 2x memory
    full_finetuning = False,  # Full fine-tuning is off; LoRA adapters are attached in a later cell
    # token = "hf_...",       # use one if using gated models
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.7: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.67k [00:00<?, ?B/s]

In [4]:
# Wrap the base model with LoRA adapters on the attention and MLP projection layers.
LORA_RANK = 32  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
    "gate_proj", "up_proj", "down_proj",      # MLP projections
]

model = FastLanguageModel.get_peft_model(
    model,
    r = LORA_RANK,
    target_modules = lora_target_modules,
    lora_alpha = 32,   # Best to choose alpha = rank or rank*2
    lora_dropout = 0,  # Supports any, but = 0 is optimized
    bias = "none",     # Supports any, but = "none" is optimized
    # "unsloth" checkpointing is advertised as using 30% less VRAM; use True or
    # "unsloth" for very long context.
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,   # Rank-stabilized LoRA is available but not used here
    loftq_config = None,  # LoftQ is available but not used here
)

Unsloth 2025.5.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [5]:
from huggingface_hub import snapshot_download

# Download the dataset repository into a local directory.
# `local_dir_use_symlinks` is intentionally not passed: recent huggingface_hub
# releases ignore it (files under `local_dir` are always real copies) and only
# emit a deprecation warning when it is supplied.
snapshot_download(
    repo_id="CyberNative/Code_Vulnerability_Security_DPO",
    repo_type="dataset",
    local_dir="./Code_Vulnerability_Security_DPO",
)

print("Dataset đã được tải vào ./Code_Vulnerability_Security_DPO")

.gitattributes:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

secure_programming_dpo.json:   0%|          | 0.00/6.87M [00:00<?, ?B/s]

Dataset đã được tải vào ./Code_Vulnerability_Security_DPO


In [6]:
import json
from datasets import Dataset

# Read the JSON Lines file: every non-empty line holds one JSON object.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default, and blank lines (e.g. a trailing newline at end of file) are
# skipped instead of crashing json.loads.
with open("./Code_Vulnerability_Security_DPO/secure_programming_dpo.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Convert the list of row dicts into a Hugging Face Dataset
reasoning_dataset = Dataset.from_list(data)

print(reasoning_dataset)

Dataset({
    features: ['lang', 'vulnerability', 'system', 'question', 'chosen', 'rejected'],
    num_rows: 4656
})


In [7]:
def generate_conversation(examples):
    """Turn a batch of dataset rows into multi-turn chat conversations.

    Args:
        examples: A batched mapping (as passed by ``Dataset.map(batched=True)``)
            with equally long lists under the keys ``"lang"``,
            ``"vulnerability"``, ``"question"``, ``"chosen"`` and ``"rejected"``.

    Returns:
        A dict with the single key ``"conversations"``; its value is a list with
        one conversation per input row. Each conversation is a list of five
        ``{"role", "content"}`` messages: system, user question, assistant
        answer (the ``chosen`` code), a follow-up user question, and an
        assistant answer showing the ``rejected`` code with its vulnerability.
    """
    # Distinct names for the per-row values avoid shadowing the column lists.
    rows = zip(
        examples["lang"],
        examples["vulnerability"],
        examples["question"],
        examples["chosen"],
        examples["rejected"],
    )
    conversations = []
    for lang, vuln, question, chosen, rejected in rows:
        conversations.append([
            # System turn: gives the assistant its language and vulnerability context
            {"role": "system", "content": f"You are a {lang} expert. Be aware of vulnerabilities like: {vuln}"},
            # User turn: the coding task
            {"role": "user", "content": question},
            # Assistant turn: the secure (chosen) solution
            {"role": "assistant", "content": chosen},
            # User turn: fixed follow-up question
            {"role": "user", "content": "What could go wrong if I write the code differently?"},
            # Assistant turn: the insecure (rejected) variant plus the row's own
            # vulnerability description. The explanation is taken from the row
            # rather than hard-coded, because the rows cover many languages and
            # vulnerability classes, not only undersized destination buffers.
            {"role": "assistant", "content": f"If you write it like this:\n{rejected}\nthe code is vulnerable: {vuln}"},
        ])
    return {"conversations": conversations}

In [8]:
# Build one conversation per row (batched map), then render each conversation
# to a plain string with the tokenizer's chat template (no tokenization yet).
conversation_dataset = reasoning_dataset.map(generate_conversation, batched = True)
reasoning_conversations = tokenizer.apply_chat_template(
    conversation_dataset["conversations"],
    tokenize = False,
)

Map:   0%|          | 0/4656 [00:00<?, ? examples/s]

In [9]:
# Display the first rendered conversation to sanity-check the chat-template formatting.
reasoning_conversations[0]

'<|im_start|>system\nYou are a c++ expert. Be aware of vulnerabilities like: Improper memory management in C++ can lead to buffer overflow vulnerabilities.<|im_end|>\n<|im_start|>user\nWrite a c++ code that defines a function named \'copyString\' which takes two character pointers as arguments. The first pointer points to the source string and the second one points to the destination string. This function should copy the content of the source string into the destination string. In the main function, declare two character arrays, one as the source string and another as the destination string. Call the \'copyString\' function with these two arrays as arguments.<|im_end|>\n<|im_start|>assistant\n```c++\n#include <cstring>\n\nvoid copyString(char* dest, const char* src) {\n    while ((*dest++ = *src++) != \'\\0\');\n}\n\nint main() {\n    char source[10] = "Hello!";\n    char destination[10]; // Make sure the destination array is large enough to hold the source string\n    copyString(desti

In [10]:
import pandas as pd
from datasets import Dataset

# Wrap the rendered conversation strings in a pandas Series named "text"
data = pd.Series(reasoning_conversations, name="text")

# Turn the single-column frame into a Dataset and shuffle it with a fixed seed
combined_dataset = Dataset.from_pandas(data.to_frame()).shuffle(seed=3407)

print(combined_dataset)

Dataset({
    features: ['text'],
    num_rows: 4656
})


In [11]:
from trl import SFTTrainer, SFTConfig

# Training hyperparameters for this short supervised fine-tuning run.
training_args = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 -> effective batch size of 8 on one GPU
    warmup_steps=5,
    max_steps=30,
    learning_rate=2e-4,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    report_to="none",
    output_dir="./qwen3-4b-finetuned",  # Directory where checkpoints are written
    save_strategy="steps",  # Save a checkpoint every `save_steps` steps
    save_steps=10,  # With max_steps=30 this yields 3 checkpoints
)

trainer = SFTTrainer(
    model=model,  # The LoRA-wrapped model created in the earlier cells
    tokenizer=tokenizer,  # The tokenizer loaded together with the model
    train_dataset=combined_dataset,  # The "text" dataset prepared above
    eval_dataset=None,
    args=training_args,
)

# Start training
trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/4656 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,656 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 66,060,288/4,000,000,000 (1.65% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.8681
2,2.1343
3,2.1185
4,1.8135
5,1.7818
6,1.299
7,1.1529
8,1.0109
9,0.8583
10,0.9356


TrainOutput(global_step=30, training_loss=0.8820606539646785, metrics={'train_runtime': 224.4643, 'train_samples_per_second': 1.069, 'train_steps_per_second': 0.134, 'total_flos': 2431240479301632.0, 'train_loss': 0.8820606539646785})

In [21]:
from transformers import TextStreamer

# Prompt: the system and user turns of the first training conversation.
messages = [
    {"role": "system", "content": "You are a c++ expert. Be aware of vulnerabilities like: Improper memory management in C++ can lead to buffer overflow vulnerabilities."},
    {"role": "user", "content": "Write a c++ code that defines a function named 'copyString' which takes two character pointers as arguments. The first pointer points to the source string and the second one points to the destination string. This function should copy the content of the source string into the destination string. In the main function, declare two character arrays, one as the source string and another as the destination string. Call the 'copyString' function with these two arrays as arguments."},
]

text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = True, # Thinking mode is switched ON here
)

model_inputs = tokenizer(text, return_tensors = "pt").to("cuda")
streamer = TextStreamer(tokenizer, skip_prompt = True)  # prints tokens as they are generated

# NOTE(review): these sampling values were labelled "for non thinking" although
# enable_thinking=True above - confirm which mode is intended.
_ = model.generate(
    **model_inputs,
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20,
    streamer = streamer,
)

```c++
#include <iostream>
#include <cstring>

void copyString(char* dest, const char* src) {
    std::strcpy(dest, src);
}

int main() {
    char source[] = "Hello, World!";
    char destination[30];

    copyString(destination, source);

    std::cout << destination << std::endl;

    return 0;
}
```<|im_end|>


In [22]:
from transformers import TextStreamer

# NOTE(review): this prompt holds only the follow-up question; in the training
# conversations it came after the system, user and assistant turns. Confirm
# that sending it without that context is intended.
messages = [
    {"role": "user", "content": "What could go wrong if I write the code differently?"},
]

text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = True, # Thinking mode is switched ON here
)

model_inputs = tokenizer(text, return_tensors = "pt").to("cuda")
streamer = TextStreamer(tokenizer, skip_prompt = True)  # prints tokens as they are generated

_ = model.generate(
    **model_inputs,
    max_new_tokens = 1024, # Increase for longer outputs!
    temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
    streamer = streamer,
)

```c
#include<stdio.h>
#include<stdlib.h>

int main() {
    int *a = (int*)malloc(10 * sizeof(int));
    if (a == NULL) {
        printf("Memory allocation failed.\n");
        return 1;
    }
    for (int i = 0; i < 10; i++) {
        scanf("%d", &a[i]);
    }
    printf("The sum is %d.\n", a[10]);
    free(a);
    return 0;
}
```
```c
#include<stdio.h>
#include<stdlib.h>

int main() {
    int *a = (int*)malloc(10 * sizeof(int));
    if (a == NULL) {
        printf("Memory allocation failed.\n");
        return 1;
    }
    for (int i = 0; i < 10; i++) {
        scanf("%d", &a[i]);
    }
    printf("The sum is %d.\n", a[10]);
    free(a);
    return 0;
}
```<|im_end|>


In [25]:
import os

# Directory the trainer writes its checkpoints to
output_dir = "./qwen3-4b-finetuned"

# Report a missing directory first, otherwise print a two-level listing
if not os.path.exists(output_dir):
    print(f"Thư mục {output_dir} không tồn tại. Có thể huấn luyện chưa hoàn thành.")
else:
    print(f"Thư mục {output_dir} tồn tại. Nội dung:")
    for entry in os.listdir(output_dir):
        print(entry)
        entry_path = os.path.join(output_dir, entry)
        # Only sub-directories (such as checkpoint-10) get their contents listed
        if not os.path.isdir(entry_path):
            continue
        print(f"  Nội dung của {entry}:")
        for child in os.listdir(entry_path):
            print(f"    {child}")

Thư mục ./qwen3-4b-finetuned tồn tại. Nội dung:
checkpoint-10
  Nội dung của checkpoint-10:
    scaler.pt
    rng_state.pth
    vocab.json
    optimizer.pt
    added_tokens.json
    adapter_model.safetensors
    tokenizer_config.json
    adapter_config.json
    scheduler.pt
    merges.txt
    tokenizer.json
    README.md
    trainer_state.json
    special_tokens_map.json
    training_args.bin
checkpoint-20
  Nội dung của checkpoint-20:
    scaler.pt
    rng_state.pth
    vocab.json
    optimizer.pt
    added_tokens.json
    adapter_model.safetensors
    tokenizer_config.json
    adapter_config.json
    scheduler.pt
    merges.txt
    tokenizer.json
    README.md
    trainer_state.json
    special_tokens_map.json
    training_args.bin
checkpoint-30
  Nội dung của checkpoint-30:
    scaler.pt
    rng_state.pth
    vocab.json
    optimizer.pt
    added_tokens.json
    adapter_model.safetensors
    tokenizer_config.json
    adapter_config.json
    scheduler.pt
    merges.txt
    tokenizer.

In [26]:
# Save the LoRA adapters and the tokenizer into the final checkpoint directory
adapter_dir = "./qwen3-4b-finetuned/checkpoint-30"
model.save_pretrained(adapter_dir)  # Local saving
tokenizer.save_pretrained(adapter_dir)
# model.push_to_hub("your_name/qwen3-4b-finetuned", token="...")  # Online saving
# tokenizer.push_to_hub("your_name/qwen3-4b-finetuned", token="...")  # Online saving

# Reload the saved LoRA adapters for later use (change False to True if needed)
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=adapter_dir,  # Path to the checkpoint saved above
        max_seq_length=2048,
        load_in_4bit=True,
    )

In [28]:
import os

# Directory that holds the training checkpoints
output_dir = "./qwen3-4b-finetuned"

# Report a missing directory first, otherwise list its top-level entries
if not os.path.exists(output_dir):
    print(f"Thư mục {output_dir} không tồn tại. Có thể lưu LoRA adapters chưa hoàn thành.")
else:
    print(f"Thư mục {output_dir} tồn tại. Nội dung:")
    for entry in os.listdir(output_dir):
        print(f"  {entry}")

Thư mục ./qwen3-4b-finetuned tồn tại. Nội dung:
  checkpoint-10
  checkpoint-20
  checkpoint-30
