In [None]:
!pip install -U trl transformers datasets bitsandbytes peft wandb accelerate

In [None]:
import gc
import os
import warnings

import bitsandbytes as bnb
import numpy as np
import torch
import transformers
import wandb
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import DPOTrainer, DPOConfig, SFTConfig, SFTTrainer

# Install a blanket "ignore" filter for Python warnings; note that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Hub identifier of the checkpoint to fine-tune (plain string: the original
# f-string had no placeholders).
model_name = "meta-llama/Llama-3.2-1B-Instruct"
# Local directory passed as `cache_dir` when downloading the model/tokenizer.
cache_dir = "./cache"

In [None]:
# Load the checkpoint named by `model_name` with 4-bit quantization for QLoRA.

# Quantization settings: 4-bit NF4 weights, bfloat16 compute dtype, and
# double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    # Map the whole model onto the currently selected CUDA device.
    device_map={"": torch.cuda.current_device()},
    # Never hard-code the access token: read it from the HF_TOKEN environment
    # variable. When it is unset (or empty) None is passed, leaving the library
    # to use its default credential lookup.
    token=os.environ.get("HF_TOKEN") or None,
    cache_dir=cache_dir,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
# Turn off `use_cache` on the model config before training; the trainer
# hyperparameters in this notebook enable gradient checkpointing.
base_model.config.use_cache = False
# Display the module tree of the loaded model.
base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=512, bias=False)
            (lora_dropout): ModuleDict(
       

In [None]:
# Load the tokenizer for the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=cache_dir,
)

# When the tokenizer defines no padding token, reuse the end-of-sequence token
# (both the token string and its id) for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
# Load the preference dataset (columns: id, question, chosen, rejected, as
# shown in the cell output).
dataset = load_dataset("thainq107/Vi-Alpaca-Preference")
# Display the split layout together with the first training record.
dataset, dataset["train"][0]

(DatasetDict({
     train: Dataset({
         features: ['id', 'question', 'chosen', 'rejected'],
         num_rows: 65017
     })
     test: Dataset({
         features: ['id', 'question', 'chosen', 'rejected'],
         num_rows: 2000
     })
 }),
 {'id': 'alpaca-7294',
  'question': 'Xác định và sửa lỗi ngữ pháp.\n\nTôi đã đi đến cửa hàng.',
  'chosen': 'Không có lỗi ngữ pháp. Câu này đã chính xác.',
  'rejected': 'Câu này không có lỗi ngữ pháp.'})

In [None]:
# QLoRA configuration: LoRA adapter settings handed to both the SFT and DPO
# trainers via `peft_config`.

# Names of the linear sub-modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,               # adapter rank
    lora_alpha=16,      # LoRA scaling numerator
    lora_dropout=0.05,  # dropout applied inside the adapters
    bias="none",        # leave bias terms untouched
)

## **SFT (Supervised Fine-Tuning)**

In [None]:
def formatting_prompt_with_chat_template(example):
    """Render one dataset row as a single chat-formatted training string.

    Builds a three-turn conversation (fixed system prompt, the row's
    ``question`` as the user turn, the row's ``chosen`` answer as the
    assistant turn) and passes it through the chat template of the
    module-level ``tokenizer``.

    Args:
        example: Mapping with at least the keys ``"question"`` and ``"chosen"``.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for
        ``tokenize=False`` and ``add_generation_prompt=False``.
    """
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {"role": "user", "content": example["question"]}
    assistant_turn = {"role": "assistant", "content": example["chosen"]}

    # The assistant answer is already part of the conversation, so no
    # generation prompt is appended.
    return tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )

In [None]:
# Show which checkpoint identifier `model_name` currently refers to.
model_name

'meta-llama/Llama-3.2-1B-Instruct'

In [None]:
# Trainer settings for the SFT stage; the dict is unpacked into the trainer
# config. Batch size 16 x 8 accumulation steps = 128 sequences per optimizer
# step on each device.
hyperparameters = dict(
    per_device_train_batch_size=16,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    learning_rate=3e-5,
    logging_steps=200,
    num_train_epochs=2,
    save_strategy="no",
    overwrite_output_dir=True,
    optim="paged_adamw_8bit",
    warmup_steps=200,
    bf16=True,
)
# Sequence-length limit handed to the trainer config.
MAX_LENGTH = 512

In [None]:
# Start a Weights & Biases run for the SFT stage.
# (`wandb` is imported in the notebook's import cell rather than here.)
wandb.init(
    project="vi-alpaca-preference",
    name="llama-3.2-1b-4bit-sft",
)


In [None]:
# Directory used as the trainer's `output_dir` and as the save location below.
SFT_OUTPUT_DIR = "Llama-3.2-1B-Instruct-sft"

# Trainer arguments: the shared hyperparameters plus the SFT-specific settings.
# NOTE(review): newer TRL releases may expect `max_length` instead of
# `max_seq_length` -- confirm against the installed TRL version.
sft_config = SFTConfig(
    output_dir=SFT_OUTPUT_DIR,
    max_seq_length=MAX_LENGTH,
    **hyperparameters,
)

# Passing `peft_config` lets the trainer attach the LoRA adapters to
# `base_model`; each row is turned into text by the formatting function.
sft_trainer = SFTTrainer(
    model=base_model,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=dataset["train"],
    formatting_func=formatting_prompt_with_chat_template,
)

sft_train_output = sft_trainer.train()

# `save_strategy` is "no", so nothing is written during training; persist the
# result explicitly so the run is not lost when the kernel stops.
sft_trainer.save_model(SFT_OUTPUT_DIR)

# Keep the training summary as the cell's displayed value.
sft_train_output



Step,Training Loss
200,1.9533
400,1.5253
600,1.4091
800,1.384
1000,1.3804


TrainOutput(global_step=1016, training_loss=1.5280118626872385, metrics={'train_runtime': 15463.4776, 'train_samples_per_second': 8.409, 'train_steps_per_second': 0.066, 'total_flos': 3.6872604907994726e+17, 'train_loss': 1.5280118626872385})

## **DPO (Direct Preference Optimization)**

In [None]:
# Load the starting policy for the DPO stage with 4-bit quantization.
# This cell rebinds `model_name`, `cache_dir`, `bnb_config` and `base_model`.
# NOTE(review): this loads a published Hub checkpoint, not the local
# SFT_OUTPUT_DIR from the SFT section -- confirm that is the intended SFT result.

model_name = "thainq107/Llama-3.2-1B-Instruct-sft"
cache_dir = "./cache"

# Quantization settings: 4-bit NF4 weights, bfloat16 compute dtype, and
# double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    # Map the whole model onto the currently selected CUDA device.
    device_map={"": torch.cuda.current_device()},
    # Never hard-code the access token: read it from the HF_TOKEN environment
    # variable. When it is unset (or empty) None is passed, leaving the library
    # to use its default credential lookup.
    token=os.environ.get("HF_TOKEN") or None,
    cache_dir=cache_dir,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
# Turn off `use_cache` on the model config before training; the trainer
# hyperparameters in this notebook enable gradient checkpointing.
base_model.config.use_cache = False
# Display the module tree of the loaded model.
base_model

In [None]:
def convert_to_conversational_preference_format(example):
    """Convert one raw preference row into conversational (chat-message) form.

    Args:
        example: Mapping with the keys ``"id"``, ``"question"``, ``"chosen"``
            and ``"rejected"``.

    Returns:
        A new dict with:
          * ``"id"``       -- copied from the input row,
          * ``"prompt"``   -- a fixed system message followed by the question
            as the user message,
          * ``"chosen"``   -- a one-message list holding the preferred answer,
          * ``"rejected"`` -- a one-message list holding the dispreferred answer.
        The input mapping is not modified.
    """

    def _assistant_turn(text):
        # Wrap a plain answer string as a single assistant message.
        return [{"role": "assistant", "content": text}]

    record = {"id": example["id"]}
    record["prompt"] = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": example["question"]},
    ]
    record["chosen"] = _assistant_turn(example["chosen"])
    record["rejected"] = _assistant_turn(example["rejected"])
    return record

# Apply the converter to the dataset; no columns are removed, so the original
# `question` column is kept alongside the new `prompt` column.
dpo_dataset = dataset.map(convert_to_conversational_preference_format)


In [None]:
# Inspect the first converted training record to check the conversational layout.
dpo_dataset['train'][0]

{'id': 'alpaca-7294',
 'question': 'Xác định và sửa lỗi ngữ pháp.\n\nTôi đã đi đến cửa hàng.',
 'chosen': [{'content': 'Không có lỗi ngữ pháp. Câu này đã chính xác.',
   'role': 'assistant'}],
 'rejected': [{'content': 'Câu này không có lỗi ngữ pháp.',
   'role': 'assistant'}],
 'prompt': [{'content': 'You are a helpful assistant.', 'role': 'system'},
  {'content': 'Xác định và sửa lỗi ngữ pháp.\n\nTôi đã đi đến cửa hàng.',
   'role': 'user'}]}

In [None]:
# Trainer settings for the DPO stage; the dict is unpacked into the trainer
# config. Batch size 8 x 8 accumulation steps = 64 preference pairs per
# optimizer step on each device.
# Note: this rebinds the `hyperparameters` name used by the SFT section, so
# re-run the SFT hyperparameter cell before re-running SFT.
hyperparameters = dict(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    learning_rate=3e-5,
    logging_steps=200,
    num_train_epochs=2,
    save_strategy="no",
    overwrite_output_dir=True,
    optim="paged_adamw_8bit",
    warmup_steps=200,
    bf16=True,
)
# Sequence-length limit handed to the trainer config.
MAX_LENGTH = 512

In [None]:
# Start a separate Weights & Biases run for the DPO stage.
# (`wandb` is imported in the notebook's import cell rather than here.)
wandb.init(
    project="vi-alpaca-preference",
    name="llama-3.2-1b-4bit-dpo",
)


In [None]:
# Directory used as the trainer's `output_dir` and as the save location below.
DPO_OUTPUT_DIR = "Llama-3.2-1B-Instruct-dpo"

# Trainer arguments: the shared hyperparameters plus the DPO-specific settings.
dpo_args = DPOConfig(
    output_dir=DPO_OUTPUT_DIR,
    max_length=MAX_LENGTH,
    **hyperparameters,
)

# No explicit `ref_model` is passed; only the policy model and the LoRA config
# are supplied, leaving the reference policy to the trainer's default handling.
dpo_trainer = DPOTrainer(
    model=base_model,
    args=dpo_args,
    train_dataset=dpo_dataset["train"],
    processing_class=tokenizer,
    peft_config=peft_config,
)

dpo_train_output = dpo_trainer.train()

# `save_strategy` is "no", so nothing is written during training; persist the
# result explicitly so the run is not lost when the kernel stops.
dpo_trainer.save_model(DPO_OUTPUT_DIR)

# Keep the training summary as the cell's displayed value.
dpo_train_output




Step,Training Loss
200,0.4967
400,0.303
600,0.2951
800,0.2833
1000,0.2728
1200,0.27
1400,0.2502
1600,0.2539
1800,0.2586
2000,0.2466


TrainOutput(global_step=2032, training_loss=0.2921085744861543, metrics={'train_runtime': 37665.8776, 'train_samples_per_second': 3.452, 'train_steps_per_second': 0.054, 'total_flos': 0.0, 'train_loss': 0.2921085744861543, 'epoch': 2.0})