# Loading and Evaluating a Foundation Model

In this project we'll fine-tune a foundation model to avoid building a model from the ground up and we'll improve upon it's base accuracy

In [1]:
# Useful imports
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


### Step 1. Loading the dataset

The dataset that we'll use for this project is "FinanceInc/auditor_sentiment" which is an auditor review sentiment collected by News Department

The dataset contains "train" and "test" splits (no validation in this case) and it has a total of 33.8k rows with a train/test split of 75/25

In [2]:
# Step 1. Load a dataset
splits = ["train", "test"]

datasets = {split: ds for split, ds in zip(splits,load_dataset("FinanceInc/auditor_sentiment",split=splits))}
datasets

{'train': Dataset({
     features: ['sentence', 'label'],
     num_rows: 3877
 }),
 'test': Dataset({
     features: ['sentence', 'label'],
     num_rows: 969
 })}

Sentences can be one of:

positive - (label = 2)

neutral - (label = 1)

negative - (label = 0)


Below we can see some sample sentences:

In [29]:
# Positive
positive = filter(lambda x: x["label"] == 2,datasets["train"])
# Neutral
neutral = filter(lambda x: x["label"] == 1,datasets["train"])
# Negative
negative = filter(lambda x: x["label"] == 0,datasets["train"])

print(f"Positive sentence example: {positive.__next__()}\n")
print(f"Neutral sentence example: {neutral.__next__()}\n")
print(f"Negative sentence example: {negative.__next__()}\n")

Positive sentence example: {'sentence': "Altia 's operating profit jumped to EUR 47 million from EUR 6.6 million .", 'label': 2}

Neutral sentence example: {'sentence': 'Vaisala , headquartered in Helsinki in Finland , develops and manufactures electronic measurement systems for meteorology , environmental sciences , traffic and industry .', 'label': 1}

Negative sentence example: {'sentence': 'Operating loss totalled EUR 0.9 mn , down from a profit of EUR 2.7 mn .', 'label': 0}



### Loading the model

We'll use the model "cardiffnlp/twitter-roberta-base-sentiment-latest" which is a RoBERTa-base model trained on ~124M tweets from January 2018 to December 2021, and finetuned for sentiment analysis with the TweetEval benchmark.

In [None]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"

#Tokenizing the dataset

tokenizer = AutoTokenizer.from_pretrained(MODEL)

tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = datasets[split].map(
        lambda x: tokenizer(x["sentence"], truncation=True), batched=True
    )
    
tokenized_dataset["train"]

Dataset({
    features: ['sentence', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3877
})

Loading the model:

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    num_labels=3,
    id2label={1:"NEUTRAL",0:"NEGATIVE",2:"POSITIVE"},
    label2id={"NEGATIVE":0,"NEUTRAL":1, "POSITIVE":2},
)

for param in model.base_model.parameters():
    param.requires_grad = False

model.classifier

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=3, bias=True)
)

### Evaluating the model

To evaluate the model we'll create a base Trainer and we'll directly evaluate the model using the "test" split.

As it can be seen we have an accuracy of ~65%

In [6]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions,axis=1)
    return {"accuracy":(predictions == labels).mean()}

args = TrainingArguments(
    output_dir="./data/sentiment_analysis",
    learning_rate=2e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

eval_results = trainer.evaluate(tokenized_dataset["test"])
eval_results

  trainer = Trainer(


{'eval_loss': 0.7283438444137573,
 'eval_model_preparation_time': 0.0039,
 'eval_accuracy': 0.6553147574819401,
 'eval_runtime': 7.7896,
 'eval_samples_per_second': 124.397,
 'eval_steps_per_second': 31.195}

# Performing Parameter-Efficient Fine-Tuning

Now we'll try to improve upon the 65% accuracy of the model using LoRA

### Creating a PEFT config

In [None]:
from peft import LoraConfig
config = LoraConfig()

### Creating a PEFT model

In [8]:
from transformers import AutoModelForSequenceClassification
from peft import get_peft_model

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
lora_model = get_peft_model(model, config)

lora_model.print_trainable_parameters()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 294,912 || all params: 124,942,851 || trainable%: 0.2360


### Training the model

In [9]:
args = TrainingArguments(
    output_dir="./data/sentiment_analysis_lora",
    learning_rate=2e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3
)

lora_trainer = Trainer(
    model = lora_model,
    args = args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

  lora_trainer = Trainer(


We can see how the training loss decreases over each epoch

In [10]:
lora_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5703,No log
2,0.387,No log
3,0.2692,No log


TrainOutput(global_step=2910, training_loss=0.3947048383889739, metrics={'train_runtime': 341.035, 'train_samples_per_second': 34.105, 'train_steps_per_second': 8.533, 'total_flos': 272640754848654.0, 'train_loss': 0.3947048383889739, 'epoch': 3.0})

### Saving the trained model

We save the model to be able to load it and perform inference with it:

In [11]:
lora_model.save_pretrained("roberta-peft")

# Performing Inference with a PEFT Model
### Loading the model

In [14]:
from peft import AutoPeftModelForSequenceClassification
lora_model_loaded = AutoPeftModelForSequenceClassification.from_pretrained("roberta-peft")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Evaluating the model

Using the fine-tuned model we can see that now we have an 85% accuracy on the test dataset, this is a much simpler process than training a model from the ground up!

In [15]:
args = TrainingArguments(
    output_dir="./data/sentiment_analysis_lora_peft",
    learning_rate=2e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

lora_trainer_loaded = Trainer(
    model = lora_model_loaded,
    args = args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

eval_results = lora_trainer_loaded.evaluate(tokenized_dataset["test"])
eval_results

  lora_trainer_loaded = Trainer(


{'eval_loss': 0.4507454037666321,
 'eval_model_preparation_time': 0.0077,
 'eval_accuracy': 0.8720330237358102,
 'eval_runtime': 8.8577,
 'eval_samples_per_second': 109.397,
 'eval_steps_per_second': 27.434}