<a href="https://colab.research.google.com/github/phunghxhcmute/GenerativeAI/blob/main/Apply_Lightweight_Fine_Tuning_to_a_Foundation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers \
accelerate \
evaluate \
tqdm \
datasets



In [None]:
!pip install peft



# Loading and Evaluating a Foundation Model

In [None]:
import argparse
import os

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from transformers import Trainer, TrainingArguments
from tqdm import tqdm
import numpy as np

In [None]:
# Experiment configuration shared by the cells below.
batch_size = 8
model_name_or_path = "gpt2"
task = "mrpc"
peft_type = PeftType.LORA
# Fall back to the CPU so the notebook still runs on runtimes without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
num_epochs = 2

# The classification head is randomly initialised when the model is loaded,
# so fix the seed to make evaluation and training results repeatable.
seed = 42
set_seed(seed)

## Load dataset and metrics

In [None]:
# `datasets.load_metric` is deprecated in favour of the `evaluate` library
# (already imported above), so the metrics are loaded through `evaluate.load`.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1, precision and recall.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"],
        "precision": precision_metric.compute(predictions=predictions, references=labels, average="macro")["precision"],
        "recall": recall_metric.compute(predictions=predictions, references=labels, average="macro")["recall"],
    }

  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
padding_side = "left"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
# Reuse the EOS token for padding so that batched inputs can be padded.
tokenizer.pad_token_id = tokenizer.eos_token_id

datasets = load_dataset("glue", task)
metric = evaluate.load("glue", task)


def tokenize_function(examples):
    """Tokenize a batch of sentence pairs.

    Every example is truncated and padded to exactly 512 tokens, so all rows
    have the same length and can be batched without further padding.
    """
    outputs = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    return outputs


tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
)

# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
# transformers library
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")


def collate_fn(examples):
    """Pad a batch of tokenized examples to its longest member and return PyTorch tensors."""
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")


# NOTE: `collate_fn` is a batch collator and is intentionally not mapped over the
# dataset: every row is already padded to a fixed length by `tokenize_function`,
# so re-padding the whole dataset would be a no-op extra pass.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Load pretrained model

In [None]:
# Pretrained backbone with a sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    return_dict=True,
)
# The model needs to know which token id is padding; the tokenizer pads with EOS.
model.config.pad_token_id = tokenizer.eos_token_id
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [None]:
# Arguments for the baseline run: evaluation only, no training settings.
training_args = TrainingArguments(
    do_eval=True,
    per_device_eval_batch_size=batch_size,
    output_dir="./results",
)

In [None]:
# Baseline Trainer: only an evaluation split is supplied.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets["validation"],
)

## Evaluate pretrained model

In [None]:
# Score the model on the validation split before any fine-tuning (baseline).
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 1.6439543962478638, 'eval_accuracy': 0.6151960784313726, 'eval_f1': 0.530845570732779, 'eval_precision': 0.5349556416343871, 'eval_recall': 0.5310911061098608, 'eval_runtime': 17.7376, 'eval_samples_per_second': 23.002, 'eval_steps_per_second': 2.875}


# Performing Parameter-Efficient Fine-Tuning

In [None]:
from peft import LoftQConfig, LoraConfig, get_peft_model

## LoRA config

In [None]:
from transformers import AutoConfig

# Base configuration; the padding token id is set up front so the wrapped
# model can handle padded batches.
config = AutoConfig.from_pretrained(model_name_or_path)
config.pad_token_id = tokenizer.eos_token_id

# Start again from fresh pretrained weights so the baseline model above is untouched.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=config)

# LoRA parameters.
# Setting ad-hoc `lora*` attributes on the model config has no effect: adapters are
# only injected when the model is wrapped with `get_peft_model` and a `LoraConfig`.
lora_config = LoraConfig(
    task_type="SEQ_CLS",  # sequence classification
    r=8,  # Rank of LoRA matrices
    lora_alpha=16,  # Scale for LoRA
    target_modules=["c_attn", "c_proj", "c_fc"],  # attention and MLP projections
    layers_to_transform=[0, 1, 2, 3],  # Layers to which LoRA is applied
    fan_in_fan_out=True,  # GPT-2 projections are Conv1D layers storing weights as (in, out)
)

# Wrap the base model so that only the adapter parameters are trained
# (PEFT reports the exact trainable/total parameter counts below).
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

## Train model with PEFT technique

In [None]:
import inspect

# Newer transformers releases name this argument `eval_strategy`; older ones only
# know `evaluation_strategy`. Pick whichever the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-3,
    # Reuse the shared batch size from the configuration cell.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; both strategies must match for
    # `load_best_model_at_end` to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},
)

In [None]:
# Trainer for fine-tuning: train on the train split, evaluate on validation.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
    # Reports accuracy / F1 / precision / recall at each evaluation.
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.622849,0.708333,0.516487,0.717263,0.551263


TrainOutput(global_step=459, training_loss=0.6987177697142225, metrics={'train_runtime': 415.232, 'train_samples_per_second': 8.834, 'train_steps_per_second': 1.105, 'total_flos': 958436481171456.0, 'train_loss': 0.6987177697142225, 'epoch': 1.0})

## Save model

In [None]:
# Directory the fine-tuned model is written to.
model_save_path = "./trained_model"
model.save_pretrained(save_directory=model_save_path)

## Load saved model

In [None]:

from peft import AutoPeftModelForSequenceClassification

# A PEFT checkpoint contains only the adapter weights plus an `adapter_config.json`;
# it must be restored through PEFT, which loads the base model and attaches the
# adapter. A directory without that file is treated as a full model checkpoint.
if os.path.isfile(os.path.join(model_save_path, "adapter_config.json")):
    model = AutoPeftModelForSequenceClassification.from_pretrained(model_save_path)
else:
    model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
model.config.pad_token_id = tokenizer.eos_token_id
model.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

## Evaluate new model

In [None]:
# Evaluation Trainer for the reloaded model.
trainer = Trainer(
    # Same arguments as the training phase.
    args=training_args,
    model=model,
    # Same metric function as in the earlier runs.
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets["validation"],
)


In [None]:
# Score the reloaded model on the validation split (overwrites the earlier `eval_results`).
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.62284916639328, 'eval_accuracy': 0.7083333333333334, 'eval_f1': 0.516486580690136, 'eval_precision': 0.717263306641545, 'eval_recall': 0.551262815703926, 'eval_runtime': 15.2402, 'eval_samples_per_second': 26.771, 'eval_steps_per_second': 3.346}
