# Project: Apply Lightweight Fine-Tuning to a Foundation Model

## Prepare the Foundation Model


### Imports

In [1]:
import torch
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import datasets
import numpy as np

BASE_MODEL = "gpt2"

  from .autonotebook import tqdm as notebook_tqdm


### Dataset and compute functions

In [2]:
def get_dataset_split(name: str, name_split: str):
    dataset = datasets.load_dataset(name,  name_split, split='train').train_test_split(
        test_size=0.2, shuffle=True, seed=23

    )
    return dataset['train'], dataset['test']


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

def predict(text: str, model, tokenizer):
    input = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        output = model(**input)

    logits = output.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
    return predicted_class_idx


### Load and preprocess a dataset

In [3]:
train, test = get_dataset_split('financial_phrasebank', 'sentences_66agree')

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def tokenize(batch):
    return tokenizer(batch['sentence'], return_tensors="pt", truncation=True, max_length=512, padding=True)

### Load base model

In [8]:

label2id = {"neutral": 1, "positive": 2, "negative": 0}
id2label = {1: "neutral", 2: "positive", 0: "negative"}
base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})

base_model.config.pad_token_id = tokenizer.pad_token_id

for param in base_model.base_model.parameters():
    param.requires_grad = False




Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train base model

In [10]:
### Train Base Model
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# We need to remove the sentence column from the datasets. Because there is a bug in the current version of the library
train_base = train.map(tokenize, batched=True).remove_columns(["sentence"])
test_base = test.map(tokenize, batched=True).remove_columns(["sentence"])

# Rename the label column to labels because the trainer expects that name
train_base = train_base.rename_column("label", "labels")
test_base = test_base.rename_column("label", "labels")

training_args = TrainingArguments(
    output_dir="./data/financial_phrasebank",
    num_train_epochs=10,
    learning_rate=2e-3,

    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,

    weight_decay=0.01,

    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    remove_unused_columns=False,
    label_names=["labels"],
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_base,
    eval_dataset=test_base,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer)
)
trainer.train()
base_evaluation = trainer.evaluate()
print(base_evaluation)
base_model.save_pretrained("gpt-lightfinetuned")


  5%|▍         | 13/270 [01:31<30:05,  7.02s/it]
 10%|█         | 27/270 [00:25<03:19,  1.22it/s]
[A
[A
[A
[A
[A
[A
                                                

[A[A                                       
 10%|█         | 27/270 [00:30<03:19,  1.22it/s]
[A
[A

{'eval_loss': 0.6310212016105652, 'eval_accuracy': 0.7132701421800948, 'eval_runtime': 4.8194, 'eval_samples_per_second': 175.127, 'eval_steps_per_second': 1.452, 'epoch': 1.0}


 20%|██        | 54/270 [00:56<02:52,  1.25it/s]
[A
[A
[A
[A
[A
[A
                                                

[A[A                                       
 20%|██        | 54/270 [01:01<02:52,  1.25it/s]
[A
[A

{'eval_loss': 0.6037357449531555, 'eval_accuracy': 0.7369668246445498, 'eval_runtime': 4.7471, 'eval_samples_per_second': 177.794, 'eval_steps_per_second': 1.475, 'epoch': 2.0}


 30%|███       | 81/270 [01:27<02:32,  1.24it/s]
[A
[A
[A
[A
[A
[A
                                                

[A[A                                       
 30%|███       | 81/270 [01:32<02:32,  1.24it/s]
[A
[A

{'eval_loss': 0.60226970911026, 'eval_accuracy': 0.735781990521327, 'eval_runtime': 4.787, 'eval_samples_per_second': 176.311, 'eval_steps_per_second': 1.462, 'epoch': 3.0}


 40%|████      | 108/270 [01:59<02:10,  1.24it/s]
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                       
 40%|████      | 108/270 [02:04<02:10,  1.24it/s]
[A
[A

{'eval_loss': 0.5561473965644836, 'eval_accuracy': 0.759478672985782, 'eval_runtime': 4.9974, 'eval_samples_per_second': 168.887, 'eval_steps_per_second': 1.401, 'epoch': 4.0}


 50%|█████     | 135/270 [02:33<01:49,  1.23it/s]
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                       
 50%|█████     | 135/270 [02:38<01:49,  1.23it/s]
[A
[A

{'eval_loss': 0.5363507866859436, 'eval_accuracy': 0.7725118483412322, 'eval_runtime': 4.7383, 'eval_samples_per_second': 178.122, 'eval_steps_per_second': 1.477, 'epoch': 5.0}


 60%|██████    | 162/270 [03:05<01:26,  1.25it/s]
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                       
 60%|██████    | 162/270 [03:09<01:26,  1.25it/s]
[A
[A

{'eval_loss': 0.5357518792152405, 'eval_accuracy': 0.754739336492891, 'eval_runtime': 4.7566, 'eval_samples_per_second': 177.438, 'eval_steps_per_second': 1.472, 'epoch': 6.0}


 70%|███████   | 189/270 [03:37<01:07,  1.20it/s]
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                       
 70%|███████   | 189/270 [03:41<01:07,  1.20it/s]
[A
[A

{'eval_loss': 0.5431651473045349, 'eval_accuracy': 0.7725118483412322, 'eval_runtime': 4.8409, 'eval_samples_per_second': 174.348, 'eval_steps_per_second': 1.446, 'epoch': 7.0}


 80%|████████  | 216/270 [04:10<00:46,  1.16it/s]
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                       
 80%|████████  | 216/270 [04:15<00:46,  1.16it/s]
[A
[A

{'eval_loss': 0.5428694486618042, 'eval_accuracy': 0.7677725118483413, 'eval_runtime': 4.8347, 'eval_samples_per_second': 174.57, 'eval_steps_per_second': 1.448, 'epoch': 8.0}


 90%|█████████ | 243/270 [04:41<00:21,  1.25it/s]
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                       
 90%|█████████ | 243/270 [04:46<00:21,  1.25it/s]
[A
[A

{'eval_loss': 0.5203144550323486, 'eval_accuracy': 0.7665876777251185, 'eval_runtime': 4.7641, 'eval_samples_per_second': 177.159, 'eval_steps_per_second': 1.469, 'epoch': 9.0}


100%|██████████| 270/270 [05:13<00:00,  1.24it/s]
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                       
100%|██████████| 270/270 [05:18<00:00,  1.24it/s]
[A
[A

{'eval_loss': 0.5291668772697449, 'eval_accuracy': 0.7760663507109005, 'eval_runtime': 4.81, 'eval_samples_per_second': 175.466, 'eval_steps_per_second': 1.455, 'epoch': 10.0}


                                                 
100%|██████████| 270/270 [05:19<00:00,  1.18s/it]


{'train_runtime': 319.7794, 'train_samples_per_second': 105.479, 'train_steps_per_second': 0.844, 'train_loss': 0.5793722647207754, 'epoch': 10.0}


100%|██████████| 7/7 [00:03<00:00,  1.89it/s]

{'eval_loss': 0.5203144550323486, 'eval_accuracy': 0.7665876777251185, 'eval_runtime': 4.567, 'eval_samples_per_second': 184.804, 'eval_steps_per_second': 1.533, 'epoch': 10.0}





### Evaluate the pretrained model

In [12]:

base_model = AutoModelForSequenceClassification.from_pretrained("gpt-lightfinetuned")

test_df = pd.DataFrame(test)
test_small = test_df.sample(100)
test_small['predicted'] = test_small['sentence'].apply(lambda x: predict(x, base_model, tokenizer))
test_small['correct'] = test_small['predicted'] == test_small['label']
prediction_percentage = test_small['correct'].mean()
print(f"Accuracy: {prediction_percentage:.2f}")


Accuracy: 0.75


## Perform Lightweight Fine-Tuning

### Create a PEFT model

In [13]:
from peft import LoraConfig, get_peft_model
from peft import AutoPeftModelForCausalLM

config = LoraConfig()
number_labels = len(list(id2label.keys()))
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=number_labels,
    id2label=id2label,
    label2id=label2id
)
lora_model = get_peft_model(model, config)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})

lora_model.config.pad_token_id = tokenizer.pad_token_id


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train the PEFT model

In [15]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# We need to remove the sentence column from the datasets. Because there is a bug in the current version of the library
train_peft = train.map(tokenize, batched=True).remove_columns(["sentence"])
test_peft = test.map(tokenize, batched=True).remove_columns(["sentence"])

# Rename the label column to labels because the trainer expects that name
train_peft = train_peft.rename_column("label", "labels")
test_peft = test_peft.rename_column("label", "labels")

training_args = TrainingArguments(
    output_dir="./data/financial_phrasebank_peft",
    num_train_epochs=10,
    learning_rate=2e-3,

    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,

    weight_decay=0.01,

    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    remove_unused_columns=False,
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_peft,
    eval_dataset=test_peft,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer)
)
trainer.train()
peft_evaluation = trainer.evaluate()


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                
  3%|▎         | 9/270 [01:26<08:39,  1.99s/it]
[A

{'eval_loss': 0.6682124733924866, 'eval_accuracy': 0.7298578199052133, 'eval_runtime': 5.9436, 'eval_samples_per_second': 142.003, 'eval_steps_per_second': 2.355, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  3%|▎         | 9/270 [02:30<08:39,  1.99s/it]
[A

{'eval_loss': 0.40280744433403015, 'eval_accuracy': 0.8447867298578199, 'eval_runtime': 5.3442, 'eval_samples_per_second': 157.929, 'eval_steps_per_second': 2.62, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  3%|▎         | 9/270 [03:34<08:39,  1.99s/it]
[A

{'eval_loss': 0.37071385979652405, 'eval_accuracy': 0.8518957345971564, 'eval_runtime': 5.3252, 'eval_samples_per_second': 158.493, 'eval_steps_per_second': 2.629, 'epoch': 3.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  3%|▎         | 9/270 [04:37<08:39,  1.99s/it]
[A

{'eval_loss': 0.3019632399082184, 'eval_accuracy': 0.8838862559241706, 'eval_runtime': 5.3393, 'eval_samples_per_second': 158.073, 'eval_steps_per_second': 2.622, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  3%|▎         | 9/270 [05:40<08:39,  1.99s/it]
[A

{'eval_loss': 0.284106969833374, 'eval_accuracy': 0.9016587677725119, 'eval_runtime': 5.2995, 'eval_samples_per_second': 159.26, 'eval_steps_per_second': 2.642, 'epoch': 5.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  3%|▎         | 9/270 [06:44<08:39,  1.99s/it]
[A

{'eval_loss': 0.2787133753299713, 'eval_accuracy': 0.8992890995260664, 'eval_runtime': 5.6155, 'eval_samples_per_second': 150.298, 'eval_steps_per_second': 2.493, 'epoch': 6.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  3%|▎         | 9/270 [07:47<08:39,  1.99s/it]
[A

{'eval_loss': 0.33058205246925354, 'eval_accuracy': 0.8981042654028436, 'eval_runtime': 5.3342, 'eval_samples_per_second': 158.224, 'eval_steps_per_second': 2.625, 'epoch': 7.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  3%|▎         | 9/270 [08:50<08:39,  1.99s/it]
[A

{'eval_loss': 0.3149661421775818, 'eval_accuracy': 0.8945497630331753, 'eval_runtime': 5.312, 'eval_samples_per_second': 158.887, 'eval_steps_per_second': 2.636, 'epoch': 8.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  3%|▎         | 9/270 [09:53<08:39,  1.99s/it]
[A

{'eval_loss': 0.3133288323879242, 'eval_accuracy': 0.9016587677725119, 'eval_runtime': 5.2807, 'eval_samples_per_second': 159.829, 'eval_steps_per_second': 2.651, 'epoch': 9.0}



  3%|▎         | 9/270 [10:19<08:39,  1.99s/it]  

{'loss': 0.3113, 'grad_norm': 0.6535989046096802, 'learning_rate': 0.00011320754716981132, 'epoch': 9.43}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  3%|▎         | 9/270 [10:57<08:39,  1.99s/it]
[A

{'eval_loss': 0.3258794844150543, 'eval_accuracy': 0.8992890995260664, 'eval_runtime': 5.6941, 'eval_samples_per_second': 148.225, 'eval_steps_per_second': 2.459, 'epoch': 10.0}



100%|██████████| 530/530 [10:35<00:00,  1.20s/it]


{'train_runtime': 635.1817, 'train_samples_per_second': 53.103, 'train_steps_per_second': 0.834, 'train_loss': 0.3001336776985312, 'epoch': 10.0}


100%|██████████| 14/14 [00:04<00:00,  3.03it/s]


### Save the PEFT model

In [16]:
lora_model.save_pretrained('gpt-lora')

## Perform Inference Using the Fine-Tuned Model


### Load the saved PEFT model

In [17]:
lora_model_finetuned = AutoPeftModelForCausalLM.from_pretrained('gpt-lora')


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluate the fine-tuned model

In [18]:
test_df_finetuned = pd.DataFrame(test)
test_small_finetuned = test_df_finetuned.sample(100)
test_small_finetuned['predicted'] = test_small_finetuned['sentence'].apply(lambda x: predict(x, lora_model_finetuned, tokenizer))
test_small_finetuned['correct'] = test_small_finetuned['predicted'] == test_small_finetuned['label']
prediction_percentage_finetuned_peft = test_small_finetuned['correct'].mean()
print(f"Accuracy: {prediction_percentage_finetuned_peft:.2f}")

Accuracy: 0.72


## Comparisons

In [20]:
print(f"Base model: {base_evaluation}")
print(f"Peft model: {peft_evaluation}")

# Random 100 samples
print(f"Accuracy base model: {prediction_percentage:.2f}")
print(f"Accuracy peft model: {prediction_percentage_finetuned_peft:.2f}", f"Improvement: {prediction_percentage_finetuned_peft - prediction_percentage:.2f}")

Base model: {'eval_loss': 0.5203144550323486, 'eval_accuracy': 0.7665876777251185, 'eval_runtime': 4.567, 'eval_samples_per_second': 184.804, 'eval_steps_per_second': 1.533, 'epoch': 10.0}
Peft model: {'eval_loss': 0.2787133753299713, 'eval_accuracy': 0.8992890995260664, 'eval_runtime': 5.1844, 'eval_samples_per_second': 162.796, 'eval_steps_per_second': 2.7, 'epoch': 10.0}
Accuracy base model: 0.75
Accuracy peft model: 0.72 Improvement: -0.03
