# Training/Finetuning Transformer models on custom dataset:

## 1. Finetuning using the Trainer API from huggingface:

→ Trainer API is a high-level API that aids in training the transformer models easily

In [2]:
!pip install --q datasets
!pip install --q transformers
!pip install --q evaluate

**Custom dataset used: `sst2` from the `glue-benchmark`**

In [None]:
import torch
import numpy as np
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset

#Loading the dataset:
raw_dataset = load_dataset("glue", "sst2")

#Initializing the model and tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) #num_label=2; since SST-2 dataset has 2 labels: positive and negative


In [None]:
raw_dataset['train'][0]

### Preprocessing the dataset:

In [None]:
from transformers import DataCollatorWithPadding

def tokenize_data(dataset):
  return tokenizer(dataset["sentence"], truncation=True)

tokenized_dataset = raw_dataset.map(tokenize_data, batched=True)
data_collator = DataCollatorWithPadding(tokenizer)

### Creating base model for baseline purpose:

In [None]:
train_args = TrainingArguments(
    "test-trainer",
    num_train_epochs=1,
    fp16=True,
    learning_rate=1e-4,
    per_device_train_batch_size=16
)

trainer = Trainer(
    model,
    train_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

**Model Performance Evaluation**: <br>
- The model training only reports `training_loss` which is not too insightful, this is because the model was not passed with an evaluation strategy(evaluate at the end of steps or after each epoch),
<br>
- To create a better model that has better evaluation metrics.

- To better understand the model performance, making predictions with the trained model on the validation split of our dataset.<br>


In [None]:
#making predictions with the trained model:
predictions = trainer.predict(tokenized_dataset["validation"])

#Shape of prediction array and labels array:
print("Shape of Prediction array: ", predictions.predictions.shape)
print("Shape of labels: ", predictions.label_ids.shape)

### Building a component to compute metrics for evaluation using specific evaluation strategies:

In [None]:
pred = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "sst2")
metric.compute(predictions=pred, references=predictions.label_ids)

- The base model has an accuracy of ``0.89``, or `90%` accurate model

**Building a model with an evaluation strategy and defining a function that computes the metrics we need**:

In [None]:
#Defining a helper function to calculate metrics:

def calculate_metrics(evaluation_preds):
  metrics = evaluate.load("glue", "sst2")
  logits, labels = evaluation_preds
  preds = np.argmax(logits, axis=-1)
  return metrics.compute(predictions=preds, references=labels)

In [None]:
##Finalizing the model by updating training arguments and the trainer:

training_args = TrainingArguments(
    "final-trainer",
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
    eval_strategy="epoch",
    gradient_accumulation_steps=5
)


### ---------------

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=calculate_metrics,
)


### ---------------
#Training the model
trainer.train()

``Accuracy: 89.56%``<br>
``val_loss: 37%``<br>
``training loss: 6%`` <br>
*The model may be overfitting on the dataset*

---

## 2. Training Loop from scratch:<br>
- This section of the notebook contains training a transformer and defining its training loop from scratch in PyTorch.

In [3]:
### Imports
import numpy as np
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset

2024-08-08 14:17:04.315844: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-08 14:17:04.315961: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-08 14:17:04.460980: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:

### Loading up dataset and Initializing model:

dataset = load_dataset("glue", "sst2")
model_id = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

def tokenize_data(sample):
  return tokenizer(sample["sentence"], truncation=True)

tokenized_dataset = dataset.map(tokenize_data, batched=True)
data_collator = DataCollatorWithPadding(tokenizer)


###------------
# Removing unnecessary columns:
tokenized_dataset = tokenized_dataset.remove_columns(["sentence", "idx"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

#Set the datatype to torch tensors:
tokenized_dataset.set_format("torch")

#Checking the available columns in the dataset:
tokenized_dataset["train"].column_names


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

['labels', 'input_ids', 'attention_mask']

In [5]:
#Loading up the dataloaders:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)

evaluate_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=32,
    collate_fn=data_collator
)

#Grabbing a batch of data, to check if the preprocessing steps were executed correctly:
for batch in train_dataloader:
  break
{k : v.shape for k, v in batch.items()}


{'labels': torch.Size([32]),
 'input_ids': torch.Size([32, 32]),
 'attention_mask': torch.Size([32, 32])}

In [6]:
#Passing in a batch of data to check if the model is properly initialized:
output = model(**batch)
print("Loss: ", output.loss)
print("Shape of the Logits array: ", output.logits.shape)

Loss:  tensor(0.6794, grad_fn=<NllLossBackward0>)
Shape of the Logits array:  torch.Size([32, 2])


Most of the components are ready for the whole training loop: <br>
- Model ✅<br>
- Tokenizer ✅ <br>
- DataLoaders(training & validation) ✅ <br>
- Preprocessed dataset ✅<br>

Components that need to be added to complete the loop:<br>

- Optimizers(eg. AdamW) 🟡<br>
- Learning rate scheduler 🟡<br>
- Set default device to GPU 🟡

**Initializing the Optimizer(AdamW) and Learning Rate scheduler**

In [7]:
from transformers import AdamW
from transformers import get_scheduler

epochs = 5
num_training_steps = epochs * len(train_dataloader)
optimizer = AdamW(params=model.parameters(), lr=5e-6)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

#Checking number of training steps:
print(num_training_steps)


10525




In [8]:
## Setting default device to GPU:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#Shifting the model to GPU:
model.to(device)


print(device)

cuda


### Defining the custom training loop:

**here using mixed-precision to fasten up the training process**:

In [9]:
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

progress_bar = tqdm(range(num_training_steps))

#putting the model in training mode:
model.train()
for epoch in range(epochs):
  for batch in train_dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}

    #enabling mixed-precision:
    with autocast():
      output = model(**batch)
      loss = output.loss

    #Using the GradScaler() to create the backward pass through mixed-precision
    scaler.scale(loss).backward()

    scaler.step(optimizer)
    scaler.update()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  0%|          | 0/10525 [00:00<?, ?it/s]

### Defining a custom Evaluation Loop:

In [10]:
metrics = evaluate.load("glue", "sst2")

#Putting the model in evaluate mode:
model.eval()
for batch in evaluate_dataloader:
  batch = {k:v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    output = model(**batch)

  logits = output.logits
  predictions = torch.argmax(logits, dim=-1)
  metrics.add_batch(predictions=predictions, references=batch["labels"])

metrics.compute()

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.9426605504587156}

so, our `roberta-base` model performs at around `94%` accuracy.

### Modifying the training loop with Accelerate library:

In [11]:
from accelerate import Accelerator
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
from datasets import load_metric

accelerate = Accelerator(mixed_precision="fp16")
checkpoint = "roberta-base"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(params=model.parameters(), lr=6e-6)


train_dataloader, evaluate_dataloader, model, optimizer = accelerate.prepare(
    train_dataloader,
    evaluate_dataloader,
    model,
    optimizer
)

epochs = 6
num_training_steps = epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))


#Defining the custom-loop:
model.train()
for epoch in range(epochs):
  for batch in train_dataloader:
    output = model(**batch)
    loss = output.loss
    accelerate.backward(loss)

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)


#Defining custom evaluation loop:
metrics = load_metric("glue", "sst2")
model.eval()
for batch in evaluate_dataloader:
  with torch.no_grad():
    outputs = model(**batch)

  logits = outputs.logits
  predictions = torch.argmax(logits, dim=-1)
  metrics.add_batch(
      predictions=accelerate.gather(predictions),
      references=accelerate.gather(batch["labels"])
  )


metrics.compute()



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/12630 [00:00<?, ?it/s]

  metrics = load_metric("glue", "sst2")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

The repository for glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/glue.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


{'accuracy': 0.9380733944954128}