# Parameter Efficient Fine Tuning - LoRA

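This notebook fine-tunes DistilBERT (`distilbert-base-uncased`) for sentiment classification on a subsample of the IMDB reviews dataset, using LoRA, a parameter-efficient fine-tuning (PEFT) technique.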

## Install Dependencies

In [1]:
!pip install peft
!pip install evaluate



## Import Libraries

In [2]:
import torch
import evaluate
import numpy as np

from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

## Create Dataset

In [3]:
# load imdb data
imdb_dataset = load_dataset("imdb")

# define subsample size
N = 1000

# fixed seed so the same reviews are drawn on every Restart & Run All
SEED = 42
rng = np.random.default_rng(SEED)

# generate indexes for random subsample:
#   - the upper bound is derived from the data instead of a hard-coded 24999
#   - replace=False guarantees N distinct rows (no duplicated reviews)
# the same indexes are applied to both splits, so they must be valid for the smaller one
n_available = min(len(imdb_dataset['train']), len(imdb_dataset['test']))
rand_idx = rng.choice(n_available, size=N, replace=False)

# extract train and test data (each split is indexed once, then split into columns)
train_rows = imdb_dataset['train'][rand_idx]
x_train = train_rows['text']
y_train = train_rows['label']

test_rows = imdb_dataset['test'][rand_idx]
x_test = test_rows['text']
y_test = test_rows['label']

# create new dataset; the subsampled test split serves as the validation set
dataset = DatasetDict({
                        'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                        'validation':Dataset.from_dict({'label':y_test,'text':x_test})
                      })

In [4]:
# display the subsampled dataset (splits, columns and row counts)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [5]:
# fraction of training examples whose label is 1 (labels are 0/1, so the mean is that fraction)
np.mean(dataset['train']['label'])

0.503

## Load Model

In [6]:
# checkpoint to fine-tune
model_checkpoint = 'distilbert-base-uncased'  #  'roberta-base'

# label maps: class index -> name, and the inverse derived from it
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

# build a sequence-classification model from model_checkpoint, one output per label
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# display architecture (the submodule names printed here, e.g. q_lin, are what LoRA's target_modules refers to later)
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Preprocess Data

In [8]:
# create the tokenizer that belongs to the chosen checkpoint
# add_prefix_space is a model-specific option aimed at tokenizers such as RoBERTa's
# NOTE(review): presumably has no effect for distilbert-base-uncased — confirm
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    # register a dedicated padding token in the tokenizer's vocabulary ...
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # ... and grow the model's embedding layer so the new token has an embedding row
    model.resize_token_embeddings(len(tokenizer))



In [9]:
# create tokenize function
def tokenize_function(examples):
    """Tokenize a batch of reviews.

    Args:
        examples: Batch of examples; only its "text" entry is read.

    Returns:
        The tokenizer output for the batch, with one variable-length
        sequence per example, each truncated to at most 512 tokens.
    """
    # extract text
    text = examples["text"]

    # when a review is longer than max_length, drop tokens from the beginning and keep the end
    tokenizer.truncation_side = "left"

    # No return_tensors here: the sequences are not padded, so they have different
    # lengths and do not form a rectangular array. Padding is applied per batch by
    # the data collator instead.
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
    )

    return tokenized_inputs

In [10]:
# tokenize training and validation datasets (batched=True hands tokenize_function many examples per call)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# display the resulting splits and columns
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [11]:
# create data collator (passed to the Trainer, where it pads the examples of each batch to equal length)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Model Evaluation

In [12]:
# load the accuracy metric; compute_metrics calls its .compute() method
accuracy = evaluate.load("accuracy")

In [13]:
# evaluation callback handed to the Trainer later
def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: Pair of (predictions, labels); predictions hold one score per
            class along axis 1.

    Returns:
        Whatever `accuracy.compute` returns for the predicted class ids
        and the reference labels.
    """
    class_scores, references = p
    # highest-scoring class per example
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

### Apply untrained model to text

In [14]:
# define list of examples
text_list = ["It was good.",
             "Not a fan, don't recommed.",
             "Better than the first one.",
             "This is not worth watching even once.",
             "This one is a pass."
            ]

print("Untrained model predictions:")
print("----------------------------")
# inference only: no_grad() avoids recording an autograd graph for every forward pass
with torch.no_grad():
    for text in text_list:
        # tokenize text into a tensor of token ids
        inputs = tokenizer.encode(text, return_tensors="pt")
        # compute logits (raw per-class scores) with a forward pass
        logits = model(inputs).logits
        # convert logits to label: highest-scoring class along the class dimension
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + id2label[predictions.item()])

Untrained model predictions:
----------------------------
It was good. - Negative
Not a fan, don't recommed. - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative


## Apply PEFT - LoRA

In [15]:
# LoRA configuration used to wrap the base model
peft_config = LoraConfig(
    # sequence classification: the model assigns a label to a whole input sequence
    task_type="SEQ_CLS",
    # rank of the LoRA update matrices; a lower rank means fewer trainable parameters
    r=4,
    # scaling factor applied to the LoRA update
    lora_alpha=32,
    # dropout probability used inside the LoRA layers
    lora_dropout=0.01,
    # only the attention query projection ('q_lin') receives LoRA adapters
    target_modules=['q_lin'],
)

In [16]:
# display the full LoRA configuration, including fields that were not set explicitly
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [17]:
# wrap the base model according to peft_config; from here on `model` is the wrapped model
# NOTE(review): this rebinds `model` from itself, so re-running the cell would wrap an
# already-wrapped model — re-run the model-loading cell first
model = get_peft_model(model, peft_config)
# report trainable vs. total parameter counts
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [18]:
# hyperparameters (consumed by TrainingArguments below)
lr = 1e-3       # learning rate
batch_size = 4  # number of samples per device in each batch (used for both training and evaluation)
num_epochs = 10 # number of times to iterate over the training dataset

In [19]:
# define training arguments
# checkpoints are written under a directory named after the base checkpoint
lora_output_dir = model_checkpoint + "-lora-text-classification"

training_args = TrainingArguments(
    output_dir=lora_output_dir,
    # optimisation
    learning_rate=lr,
    weight_decay=0.01,
    num_train_epochs=num_epochs,
    # batch size per device, same for training and evaluation
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and save a checkpoint at the end of every epoch
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # reload the best checkpoint once training finishes
    # NOTE(review): no metric_for_best_model is set, so "best" is presumably judged
    # by evaluation loss rather than accuracy — confirm
    load_best_model_at_end=True,
)



In [20]:
 # create trainer object
trainer = Trainer(
    model=model,                                  # The LoRA-wrapped model being fine-tuned.
    args=training_args,                           # The TrainingArguments object defined earlier.
    train_dataset=tokenized_dataset["train"],     # Tokenized training split.
    eval_dataset=tokenized_dataset["validation"], # Tokenized validation split, evaluated once per epoch.
    tokenizer=tokenizer,                          # The tokenizer associated with the model.
    data_collator=data_collator,                  # Dynamically pads the examples in each batch to equal length.
    compute_metrics=compute_metrics,              # Accuracy callback defined earlier.
)

# train model
trainer.train()                                   # starts the training process

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.492492,0.85
2,0.388000,0.445441,0.878
3,0.388000,0.668202,0.875
4,0.149300,0.6206,0.878
5,0.149300,0.788578,0.883
6,0.041600,0.941812,0.877
7,0.041600,0.918641,0.888
8,0.016100,0.981108,0.882
9,0.016100,0.972565,0.884
10,0.007400,1.005077,0.883


TrainOutput(global_step=2500, training_loss=0.12047329349517823, metrics={'train_runtime': 467.8911, 'train_samples_per_second': 21.372, 'train_steps_per_second': 5.343, 'total_flos': 1099854434924064.0, 'train_loss': 0.12047329349517823, 'epoch': 10.0})

## Generate Predictions

In [23]:
# pick the best available device instead of hard-coding one:
# CUDA GPU if present, otherwise Apple 'mps', otherwise CPU
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

model.to(device)
model.eval()  # make sure dropout is disabled so predictions are deterministic

print("Trained model predictions:")
print("--------------------------")
# inference only: no_grad() avoids recording an autograd graph for every forward pass
with torch.no_grad():
    for text in text_list:
        # the inputs must live on the same device as the model
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)

        logits = model(inputs).logits
        # index of the highest-scoring class for each row
        predictions = torch.max(logits,1).indices

        print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative


## Optional: Push Model to HuggingFace Hub

In [None]:
# option 1: notebook login (interactive; presumably prompts for a Hugging Face token)
from huggingface_hub import notebook_login
notebook_login() # ensure token gives write access

# # option 2: key login
# NOTE: if this option is used, do not commit the notebook with a real token pasted in
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

In [None]:
hf_name = 'nikhilkomakula' # your hf username or org name
# repo id in the form "<user or org>/<model name>"; you can name the model whatever you want
model_id = f"{hf_name}/{model_checkpoint}-lora-text-classification"

In [None]:
# upload the LoRA-wrapped model to the Hub repo named by model_id
model.push_to_hub(model_id) # save model

In [None]:
# Trainer.push_to_hub() takes a commit message as its first argument, not a repo id,
# so the repo id must not be passed here. The destination repo comes from the
# TrainingArguments (hub_model_id if set, otherwise the name of output_dir).
trainer.push_to_hub() # save trainer

## Optional: Load PEFT Model from HuggingFace Hub

In [None]:
# how to load peft model from hub for inference
# 1) the adapter config records which base checkpoint the adapter belongs to
config = PeftConfig.from_pretrained(model_id)
base_checkpoint = config.base_model_name_or_path

# 2) rebuild the base classifier with the same label maps
inference_model = AutoModelForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# 3) tokenizer of the base checkpoint (this rebinds `tokenizer`)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# 4) attach the adapter weights stored under model_id (this rebinds `model`)
model = PeftModel.from_pretrained(inference_model, model_id)