<a href="https://colab.research.google.com/github/meetp297/LLM_fine_tuning/blob/main/BERT_LORA_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Dependencies used by this notebook. Uncomment the lines below to install them
# (for example in a fresh Colab runtime) before running the remaining cells.
# !pip install transformers
# !pip install datasets
# !pip install evaluate
# !pip install accelerate
# !pip install transformers[torch]
# !pip install peft

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer
from peft import LoraConfig, PeftModel, get_peft_model
import evaluate
import numpy as np

In [None]:
# Download the MRPC sentence-pair task from the GLUE benchmark.
# Dataset overview: https://huggingface.co/datasets/glue/viewer/mrpc/train
raw_datasets = load_dataset(path="glue", name="mrpc")

# Display the available splits together with their columns and sizes.
raw_datasets


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [None]:
# Keep a handle on the training split and look at one raw example (index 1):
# two sentences, an integer label and the example's idx.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[1]

{'sentence1': "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 'sentence2': "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
 'label': 0,
 'idx': 1}

In [None]:
# Show the column types; `label` is a ClassLabel, so the integer labels carry names.
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint being fine-tuned.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Illustration only: tokenize each sentence column of the training split separately.
# The actual preprocessing tokenizes both sentences together as a pair.
tokenized_sentences_1, tokenized_sentences_2 = (
    tokenizer(raw_datasets["train"][column]) for column in ("sentence1", "sentence2")
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def tokenize_function(example):
    """Tokenize `sentence1` and `sentence2` together as a single sentence pair.

    Args:
        example: A mapping holding "sentence1" and "sentence2" (a single row or,
            when used with `map(..., batched=True)`, a batch of rows).

    Returns:
        The output of the module-level `tokenizer` for the pair, truncated to the
        tokenizer's maximum length. No padding is applied here.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

In [None]:
# Run tokenize_function over every split; batched=True passes a batch of rows per call.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)

# Display the splits again to see the columns added by tokenization.
tokenized_datasets

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [None]:
# Dynamic padding: DataCollatorWithPadding pads each batch only up to the longest
# sequence in that batch instead of the model's maximum sequence length, which
# saves both memory and computation.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [None]:
# Take the first eight training examples and drop the columns the collator cannot
# turn into tensors (the raw strings and the idx column).
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence1", "sentence2")
}

# Token count of each example before any padding is applied.
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

In [None]:
# Collate the eight examples into one batch and show the shape of every tensor;
# all sequences are padded to the longest example in the batch.
batch = data_collator(samples)
dict((name, tensor.shape) for name, tensor in batch.items())

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

In [None]:

# LoRA adapter configuration for the classification model.
peft_config = LoraConfig(
    task_type="SEQ_CLS",       # sequence classification
    r=4,                       # rank of the low-rank update matrices
    lora_alpha=32,             # scaling parameter applied to the LoRA update
    lora_dropout=0.01,         # dropout probability used inside the LoRA layers
    target_modules=["query"],  # attach adapters to the modules named "query"
)

In [None]:
# Training hyperparameters. Outputs and checkpoints go to the directory named
# checkpoint + "lora_text_classification" (the two parts are joined without a separator).
# Evaluation and checkpoint saving both happen once per epoch, and the best checkpoint
# is reloaded when training ends (judged by evaluation loss, the library default when
# no metric_for_best_model is given).
training_args = TrainingArguments(
    output_dir=checkpoint + "lora_text_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Load the pretrained checkpoint with a 2-label sequence-classification head and
# wrap it with the LoRA configuration so that it becomes a PEFT model.
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2),
    peft_config,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Report how many parameters are trainable versus the total after wrapping with LoRA.
model.print_trainable_parameters()

trainable params: 75,266 || all params: 109,559,044 || trainable%: 0.06869902953881196


In [None]:
# The GLUE/MRPC metric is created lazily on first use and then reused, so it is not
# re-loaded at every evaluation pass during training.
_mrpc_metric = None


def compute_metrics(eval_preds):
    """Compute the GLUE/MRPC metric (accuracy and F1) for one evaluation pass.

    Args:
        eval_preds: A pair `(logits, labels)` as handed over by the Trainer.
            `logits` holds one score per class for every example; `labels`
            holds the reference class ids.

    Returns:
        The dictionary produced by the metric's `compute` method.
    """
    global _mrpc_metric
    if _mrpc_metric is None:
        _mrpc_metric = evaluate.load("glue", "mrpc")

    logits, labels = eval_preds
    # If the predictions arrive as a tuple of outputs, the logits are the first element.
    if isinstance(logits, tuple):
        logits = logits[0]

    # The predicted class is the index of the largest logit in each row.
    predictions = np.argmax(logits, axis=-1)
    return _mrpc_metric.compute(predictions=predictions, references=labels)

In [None]:
# Bundle the model, hyperparameters, tokenized splits, dynamic-padding collator and
# the metric function into a single Trainer.
trainer = Trainer(
    model=model,                                    # PEFT-wrapped model
    args=training_args,                             # hyperparameters
    train_dataset=tokenized_datasets["train"],      # training data
    eval_dataset=tokenized_datasets["validation"],  # validation data
    data_collator=data_collator,                    # dynamic sequence padding
    tokenizer=tokenizer,                            # tokenizer
    compute_metrics=compute_metrics,                # evaluation metric function
)

In [None]:
# Run fine-tuning; the table below reports loss, accuracy and F1 after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6339,0.612733,0.681373,0.810496
2,0.6039,0.598395,0.683824,0.806015
3,0.5887,0.593021,0.688725,0.802488
4,0.5798,0.582981,0.693627,0.804382
5,0.5438,0.584305,0.710784,0.809677
6,0.5549,0.57683,0.710784,0.80719
7,0.5455,0.576103,0.713235,0.807882
8,0.5531,0.582202,0.720588,0.814935
9,0.5466,0.583022,0.718137,0.813008
10,0.5466,0.580323,0.720588,0.814332


TrainOutput(global_step=9170, training_loss=0.5668378707236786, metrics={'train_runtime': 445.748, 'train_samples_per_second': 82.289, 'train_steps_per_second': 20.572, 'total_flos': 1260275879143776.0, 'train_loss': 0.5668378707236786, 'epoch': 10.0})

In [None]:
# Run the trained model on the validation split. `predictions.predictions` holds the
# logits for labels 0 and 1 of every example; the predicted label is the index of the
# larger of the two logits.
predictions = trainer.predict(tokenized_datasets["validation"])
print(*(array.shape for array in (predictions.predictions, predictions.label_ids)))

(408, 2) (408,)


In [None]:
# Predicted class id for each example: the position of the largest logit in its row.
preds = predictions.predictions.argmax(axis=-1)

In [None]:
# Load the GLUE/MRPC metric with `evaluate` and score the argmax predictions against
# the reference labels; the result contains accuracy and F1.
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.7132352941176471, 'f1': 0.8078817733990148}