# Full-fine tuning BERT with glue-mrpc data

In [1]:
# Install the required version of datasets in case you have an older version
# You will need to choose "Kernel > Restart Kernel" from the menu after executing this cell
# ! pip install -q "datasets==2.15.0"

In [9]:
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer, 
    DataCollatorWithPadding, 
    TrainingArguments, 
    Trainer)

import torch

from torch.utils.data import DataLoader

model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Let's look at the first example!

In [10]:
from datasets import load_dataset

dataset_name = "glue"
task = "mrpc"
dataset = load_dataset(dataset_name, task)

In [11]:
feature_names = list(dataset['train'].features)
feature_names[0], feature_names[1]

('sentence1', 'sentence2')

## Pre-process datasets

Now we are going to process our datasets by converting all the text into tokens for our models.

In [12]:
# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [13]:
from peft import LoraConfig
# Initialize LoRA configuration
config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    bias="none",
)

from peft import get_peft_model
lora_model = get_peft_model(model, config)
lora_model.to(device)

PeftModel(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, o

In [14]:
def tokenize_function(examples):
    return tokenizer(examples[feature_names[0]], examples[feature_names[1]], 
                     padding="max_length", truncation=True, return_tensors="pt").to(device)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [15]:
for idx, example in enumerate(tokenized_dataset["validation"]):
    if idx == 0:      
        inputs = tokenizer(example[feature_names[0]], example[feature_names[1]], 
                           padding=True, truncation=True, return_tensors="pt").to(device)
        print(inputs)

{'input_ids': tensor([[  101,  2002,  2056,  1996,  9440,  2121,  7903,  2063, 11345,  2449,
          2987,  1005,  1056,  4906,  1996,  2194,  1005,  1055,  2146,  1011,
          2744,  3930,  5656,  1012,   102,  1000,  1996,  9440,  2121,  7903,
          2063, 11345,  2449,  2515,  2025,  4906,  2256,  2146,  1011,  2744,
          3930,  5656,  1012,   102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}


In [16]:
from sklearn.metrics import accuracy_score
import numpy as np
import torch

def evaluate(model, dataset, tokenizer):
    lora_model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for example in dataset:
            inputs = tokenizer(example[feature_names[0]], example[feature_names[1]], 
                           padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            predictions.append(np.argmax(logits.cpu().numpy(), axis=1).flatten()[0])
            references.append(example["label"])
    accuracy = accuracy_score(references, predictions)
    print(f"Accuracy: {accuracy}")

# Correct usage assuming dataset is processed to handle individual examples
evaluate(lora_model, tokenized_dataset["validation"], tokenizer) # .select(range(100))

Accuracy: 0.678921568627451


## Load and set up the model

In this case we are doing a full fine tuning, so we will want to unfreeze all parameters.

In [17]:
lora_model.print_trainable_parameters()
#lora_model.save_pretrained("bert-lora")
#from peft import AutoPeftModelForCausalLM
#lora_model = AutoPeftModelForCausalLM.from_pretrained("bert-lora")

trainable params: 294,912 || all params: 109,778,690 || trainable%: 0.2686423020715587


In [18]:
print(lora_model)

PeftModel(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, o

## Let's train it!

Now it's time to train our model. We'll use the `Trainer` class.

First we'll define a function to compute our accuracy metreic then we make the `Trainer`.

In this instance, we will fill in some of the training arguments


In [None]:
# Assuming `tokenized_dataset` is already prepared and is a Hugging Face dataset
# train_dataset = tokenized_dataset["train"] #.select(range(1000))  # Using a subset for demonstration
# eval_dataset = tokenized_dataset["validation"] #.select(range(100))
# 
# # Convert to PyTorch tensors and create DataLoader (if not using Trainer directly)
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# 
# train_loader = DataLoader(train_dataset, batch_size=8)
# eval_loader = DataLoader(eval_dataset, batch_size=8)

In [None]:
# from transformers import AdamW, get_scheduler
# optimizers=AdamW(model.parameters(), lr=1e-5, betas=(0.5, 0.999))
# for param in model.parameters():
#     param.requires_grad = True

In [19]:
lora_model

PeftModel(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, o

In [23]:
# for name, param in lora_model.named_parameters():
#     if param.requires_grad:
#         print(f"Parameter {name} does not require gradients.")

In [26]:
# Note: This step assumes that the concept of PEFT involves selective fine-tuning or using adapters.
# Actual implementation may vary based on available libraries or updates to Hugging Face Transformers.
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

training_args = TrainingArguments(
    output_dir="./results",          # Output directory for saving the model
    num_train_epochs=5,             # Total number of training epochs
    per_device_train_batch_size=8,   # Batch size per device during training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir="./logs",            # Directory for storing logs
    evaluation_strategy="epoch",     # Evaluate each `logging_steps`
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# For actual PEFT, you might need to adjust model parameters or use an adapter here.
trainer = Trainer(
    model=model, #lora_model
    args=training_args,
    train_dataset=tokenized_dataset["train"],  # Using a subset for demonstration: .select(range(1000))
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.444684,0.82598
2,No log,0.439338,0.823529
3,No log,0.432878,0.82598
4,No log,0.427932,0.823529
5,0.435300,0.424647,0.828431




TrainOutput(global_step=575, training_loss=0.43229012199070144, metrics={'train_runtime': 216.0617, 'train_samples_per_second': 84.883, 'train_steps_per_second': 2.661, 'total_flos': 4842072238940160.0, 'train_loss': 0.43229012199070144, 'epoch': 5.0})

In [24]:
# For actual PEFT, you might need to adjust model parameters or use an adapter here.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],  # Using a subset for demonstration: .select(range(1000))
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

trainer.train()

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']

## Evaluate the model

Evaluating the model is as simple as calling the evaluate method on the trainer object. This will run the model on the test set and compute the metrics we specified in the compute_metrics function.

In [27]:
# Show the performance of the model on the test set
# What do you think the evaluation accuracy will be?
trainer.evaluate()



{'eval_loss': 0.4246467053890228,
 'eval_accuracy': 0.8284313725490197,
 'eval_runtime': 2.1693,
 'eval_samples_per_second': 188.075,
 'eval_steps_per_second': 5.993,
 'epoch': 5.0}

### View the results

Let's look at a few examples

In [28]:
# Make a dataframe with the predictions and the text and the labels
import pandas as pd

items_for_manual_review = tokenized_dataset["test"].select(
    [0, 1, 10, 22, 31, 43, 100, 292, 350, 448, 487, 550]
)

results = trainer.predict(items_for_manual_review)

In [29]:
items_for_manual_review

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12
})

In [30]:
results.predictions.argmax(axis=1)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [31]:
df = pd.DataFrame(
    {
        "sentence1": [item["sentence1"] for item in items_for_manual_review],
        "sentence2": [item["sentence2"] for item in items_for_manual_review],
        "predictions": results.predictions.argmax(axis=1),
        "labels": results.label_ids,
    }
)
# Show all the cell
pd.set_option("display.max_colwidth", None)
df

Unnamed: 0,sentence1,sentence2,predictions,labels
0,"PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .",Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .,1,1
1,The world 's two largest automakers said their U.S. sales declined more than predicted last month as a late summer sales frenzy caused more of an industry backlash than expected .,Domestic sales at both GM and No. 2 Ford Motor Co. declined more than predicted as a late summer sales frenzy prompted a larger-than-expected industry backlash .,1,1
2,Consumers would still have to get a descrambling security card from their cable operator to plug into the set .,"To watch pay television , consumers would insert into the set a security card provided by their cable service .",1,1
3,""" Senator Clinton should be ashamed of herself for playing politics with the important issue of homeland security funding , "" he said .",""" She should be ashamed of herself for playing politics with this important issue , "" said state budget division spokesman Andrew Rush .",1,1
4,The daily Hurriyet said the raid aimed to foil a Turkish plot to kill an unnamed senior Iraqi official in Kirkuk .,"The daily Hurriyet said the raid aimed to foil a Turkish plot to kill an unnamed senior Iraqi Kurdish official in Kirkuk , but Gul has denied any Turkish plot .",1,0
5,"Last year , Congress passed similar , though less expensive , buyout legislation for peanut farmers , ending that Depression-era program .","Last year , Congress passed similar , though less expensive , buyout legislation for peanut farmers to end that program that also dated from the Depression years .",1,1
6,"Licensing revenue slid 21 percent , however , to $ 107.6 million .","License sales , a key measure of demand , fell 21 percent to $ 107.6 million .",1,1
7,He claimed Red Hat and the Free Software Foundation with trying to undermine U.S. copyright and patent law .,"In his letter , McBride charges the Free Software Foundation and Red Hat with trying to undermine U.S. copyright laws .",1,1
8,Squyres is principal investigator for the Athena payload - a collection of science instruments carted by each rover .,"Steve Squyres , a Cornell University scientist , is principal investigator for the missions ' science instruments .",1,1
9,The research firm earlier had forecast an increase of 4.9 percent .,The firm had predicted earlier this year a 4.9 percent increase .,1,1
