In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes torch evaluate scikit-learn
!pip install -q transformers datasets accelerate peft bitsandbytes evaluate


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model
import evaluate





In [None]:
from datasets import load_dataset

# Fetch the "sentiment" configuration of TweetEval and show its splits.
dataset = load_dataset(path="tweet_eval", name="sentiment")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [None]:
# Tokenizer for the DistilBERT checkpoint; both the full fine-tuning and the
# LoRA trainers below use it.
tokenizer_cls = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)


Tokenizer (ใช้ร่วม Full + LoRA)

In [None]:
def tokenize_cls(example):
    """Tokenize the ``text`` field of an example (or batch of examples).

    Sequences are truncated to 128 tokens and left unpadded; padding is
    deferred to the data collator at batch time.
    """
    encode_options = {
        "truncation": True,
        "padding": False,
        "max_length": 128,
    }
    return tokenizer_cls(example["text"], **encode_options)

# Tokenize every split in batches, then rename the target column to `labels`.
tokenized_cls_ds = (
    dataset
    .map(tokenize_cls, batched=True)
    .rename_column("label", "labels")
)
# Switch the output format to torch tensors (applied in place).
tokenized_cls_ds.set_format(type="torch")


Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

In [None]:
# Training configuration used by the full fine-tuning and LoRA trainers below.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_steps=100,
    save_strategy="no",
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    # Disable external experiment trackers. Without this the run reported to
    # Weights & Biases, which needs a login and prints the account name in the
    # cell output. The QLoRA training arguments use the same setting.
    report_to="none",
)


MODEL 1: Full Fine-tuning

In [None]:
# DistilBERT with a 3-class sequence-classification head. The head weights are
# newly initialised, so the model has to be trained before use.
model_full = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
    num_labels=3,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Full fine-tuning on a fixed, seeded 500-example slice of the train split,
# with a 500-example validation slice attached for evaluation.
trainer_full = Trainer(
    model=model_full,
    args=training_args,
    train_dataset=tokenized_cls_ds["train"].shuffle(seed=42).select(range(500)),
    eval_dataset=tokenized_cls_ds["validation"].shuffle(seed=42).select(range(500)),
    # `tokenizer=` is deprecated in Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer_cls,
    # Examples were tokenized without padding, so pad per batch here.
    data_collator=DataCollatorWithPadding(tokenizer_cls),
)

trainer_full.train()


  trainer_full = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mphoomwichachai[0m ([33mphoomwichachai-bangkok-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


TrainOutput(global_step=63, training_loss=1.0099868774414062, metrics={'train_runtime': 231.7587, 'train_samples_per_second': 2.157, 'train_steps_per_second': 0.272, 'total_flos': 5709653684136.0, 'train_loss': 1.0099868774414062, 'epoch': 1.0})

MODEL 2: LoRA (PEFT)

In [None]:
# Fresh copy of the same DistilBERT 3-class classifier; LoRA adapters are
# attached to it in the next cell.
model_lora = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
    num_labels=3,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LoRA settings for sequence classification: rank-8 adapters on the attention
# query and value projections (`q_lin`, `v_lin`).
lora_config_cls = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["q_lin", "v_lin"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Wrap the base model with the adapters and report how many parameters train.
model_lora = get_peft_model(model_lora, lora_config_cls)
model_lora.print_trainable_parameters()


trainable params: 740,355 || all params: 67,696,134 || trainable%: 1.0936


In [None]:
# LoRA fine-tuning on the same seeded 500-example train / validation slices and
# the same training arguments as the full fine-tuning run, so the two runs are
# directly comparable.
trainer_lora = Trainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_cls_ds["train"].shuffle(seed=42).select(range(500)),
    eval_dataset=tokenized_cls_ds["validation"].shuffle(seed=42).select(range(500)),
    # `tokenizer=` is deprecated in Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer_cls,
    # Examples were tokenized without padding, so pad per batch here.
    data_collator=DataCollatorWithPadding(tokenizer_cls),
)

trainer_lora.train()


  trainer_lora = Trainer(


Step,Training Loss


TrainOutput(global_step=63, training_loss=1.0403854127914187, metrics={'train_runtime': 182.9968, 'train_samples_per_second': 2.732, 'train_steps_per_second': 0.344, 'total_flos': 5807682608976.0, 'train_loss': 1.0403854127914187, 'epoch': 1.0})

MODEL 3: QLoRA (IMPORTANT PART)

In [None]:
# from transformers import AutoTokenizer

# tokenizer_qlora = AutoTokenizer.from_pretrained("gpt2")

# # กำหนด pad_token เป็น eos_token
# tokenizer_qlora.pad_token = tokenizer_qlora.eos_token

# def tokenize_fn(example):
#     return tokenizer_qlora(
#         example["text"],
#         padding="max_length",
#         truncation=True,
#         max_length=64
#     )

# tokenized_qlora_ds = dataset.map(tokenize_fn, batched=True)


Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

In [None]:
# tokenizer_qlora.pad_token = tokenizer_qlora.eos_token

# ไม่ต้อง add_special_tokens
# tokenizer_qlora.add_special_tokens({'pad_token': '[PAD]'}) <-- ห้ามทำแบบนี้ถ้าไม่เพิ่ม embedding


In [None]:
# def format_prompt(example):
#     label_map = {0: "negative", 1: "neutral", 2: "positive"}
#     prompt = f"Text: {example['text']}\nSentiment: "
#     answer = label_map[example['label']]
#     full_text = prompt + answer

#     tokens = tokenizer_qlora(
#         full_text,
#         truncation=True,
#         padding="max_length",
#         max_length=128
#     )
#     # labels = input_ids (สำหรับ causal LM)
#     tokens["labels"] = tokens["input_ids"].copy()
#     return tokens



In [None]:
# tokenized_qlora_ds = tokenized_qlora_ds.map(format_prompt, batched=False)



Map:   0%|          | 0/45615 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# from transformers import DataCollatorWithPadding

# data_collator = DataCollatorWithPadding(tokenizer_qlora, padding=True)


In [None]:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )

# model_qlora = AutoModelForCausalLM.from_pretrained(
#     "gpt2",
#     quantization_config=bnb_config,
#     device_map="auto"
# )


In [None]:
# lora_config_qlora = LoraConfig(
#     r=8,
#     lora_alpha=16,
#     target_modules=["c_attn"],
#     lora_dropout=0.1,
#     task_type="CAUSAL_LM"
# )

# model_qlora = get_peft_model(model_qlora, lora_config_qlora)
# model_qlora.print_trainable_parameters()


In [None]:
# trainer_qlora = Trainer(
#     model=model_qlora,
#     args=training_args,
#     train_dataset=tokenized_qlora_ds["train"],
#     tokenizer=tokenizer_qlora,
#     data_collator=DataCollatorWithPadding(tokenizer_qlora),
# )

# trainer_qlora.train()


In [None]:
# ====== 1️⃣ Import libraries ======
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig

# ====== 2️⃣ Load dataset ======
# Example: SST-2 (sentiment) from the Hugging Face GLUE collection.
# Note: this rebinds `dataset`, which earlier cells bound to tweet_eval.
dataset = load_dataset("glue", "sst2")

# ====== 3️⃣ Load tokenizer ======
tokenizer_qlora = AutoTokenizer.from_pretrained("gpt2")
# Use the end-of-sequence token as the padding token.
tokenizer_qlora.pad_token = tokenizer_qlora.eos_token

# ====== 4️⃣ Tokenize dataset ======
def tokenize_fn(example):
    """Tokenize the SST-2 ``sentence`` field, truncated and padded to 64 tokens."""
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 64,
    }
    return tokenizer_qlora(example["sentence"], **encode_options)

# Tokenize every split in batches, then drop the examples whose label is -1.
tokenized_ds = dataset.map(tokenize_fn, batched=True).filter(
    lambda row: row["label"] != -1
)

# ====== 5️⃣ Map labels to text & prepare for causal LM ======
label_map = {0: "negative", 1: "positive"}

def format_prompt(example):
    """Turn one SST-2 example into a causal-LM training example.

    The text is ``"Text: <sentence>\\nSentiment: <negative|positive>"``,
    tokenized and padded/truncated to 64 tokens.

    Returns the tokenizer output plus ``labels``: a copy of ``input_ids`` in
    which every padding position is replaced by -100 so that it is ignored by
    the language-modelling loss.
    """
    prompt = f"Text: {example['sentence']}\nSentiment: "
    full_text = prompt + label_map[example['label']]

    # NOTE(review): truncation at 64 tokens cuts from the end by default, so a
    # very long sentence could lose its answer — confirm this is acceptable.
    tokens = tokenizer_qlora(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=64
    )
    # The pad token is the EOS token, so padding cannot be recognised by token
    # id; the attention mask is the reliable marker of real tokens.
    tokens["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

# Drop the raw columns after formatting. Trainer keeps a column named `label`,
# and DataCollatorWithPadding renames `label` to `labels`, which would replace
# the token-level labels built above with the 0/1 class id.
tokenized_ds = tokenized_ds.map(
    format_prompt,
    batched=False,
    remove_columns=dataset["train"].column_names,
)

# ====== 6️⃣ Prepare model with 4-bit quantization ======
# Imported here so this cell stays runnable on its own, like the rest of it.
from peft import prepare_model_for_kbit_training

# NF4 4-bit weights with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_qlora = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    quantization_config=bnb_config,
    device_map="auto"
)

# PEFT's documented preprocessing step for training a quantized model; it must
# run before the LoRA adapters are attached.
model_qlora = prepare_model_for_kbit_training(model_qlora)

# ====== 7️⃣ Setup LoRA ======
lora_config_qlora = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],  # GPT2 attention projection
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)

# Attach the adapters and report how many parameters will be trained.
model_qlora = get_peft_model(model_qlora, lora_config_qlora)
model_qlora.print_trainable_parameters()

# ====== 8️⃣ Data collator ======
# Examples are already padded to a fixed length during tokenization, so the
# collator mainly stacks them into batch tensors.
data_collator = DataCollatorWithPadding(tokenizer_qlora, padding=True)

# ====== 9️⃣ Training arguments ======
# Note: this rebinds `training_args`, the name the DistilBERT trainers earlier
# in the notebook were built with.
# NOTE(review): 512 sequences per device is a large batch for a single GPU —
# confirm it fits in memory on the target hardware.
training_args = TrainingArguments(
    output_dir="./qlora_gpt2",
    per_device_train_batch_size=512,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    learning_rate=2e-4,
    fp16=True,
    report_to="none",  # no external experiment tracker
    save_total_limit=2
)

# ====== 🔟 Trainer ======
trainer_qlora = Trainer(
    model=model_qlora,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    # `tokenizer=` is deprecated in Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer_qlora,
    data_collator=data_collator,
)

# ====== 1️⃣1️⃣ Start training ======
trainer_qlora.train()


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Filter:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

  trainer_qlora = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364
