<a href="https://colab.research.google.com/github/pgurazada/advances-in-nlp/blob/main/transfer_learning_finetune_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objective

Illustrate how to fine-tune a DistilBERT model (a smaller, distilled version of BERT) for sentiment classification using the Hugging Face `transformers` package.

Note: This notebook should be run with a GPU. If you have access to a larger GPU, you can increase the training data size.

# Setup

In [None]:
! pip install -q datasets==2.20.0 \
                 accelerate==0.33.0 \
                 evaluate==0.4.2

In [1]:
import evaluate
import torch 

import numpy as np
import pandas as pd

from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)

from sklearn.model_selection import train_test_split

from datasets import Dataset

2024-08-28 10:49:53.755567: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-28 10:49:53.755643: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-28 10:49:53.757507: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-28 10:49:53.767915: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Method

We use the Hugging Face `Trainer` API (part of the `transformers` library) to run the fine-tuning.

# Data

In [2]:
# Relative path to the tab-separated reviews file; it must be present in the
# notebook's working directory.
data_file = 'labeled_sentiments_data.tsv'

In [3]:
# Read the tab-separated file into a DataFrame.
data_df = pd.read_csv(data_file, sep='\t')

In [4]:
# (rows, columns) of the loaded data.
data_df.shape

(25000, 3)

In [5]:
# Preview the first rows; the columns are `id`, `sentiment` and `review`.
data_df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


# Prepare Dataset

In [12]:
# Hold out 20% of the rows for validation.
# - random_state pins the shuffle so the split (and every metric computed from
#   it) is the same after a kernel restart.
# - stratify keeps the proportion of each sentiment class equal in both parts.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.2,
    random_state=42,
    stratify=data_df['sentiment'],
)

In [13]:
# Confirm the sizes of the two partitions.
train_df.shape, test_df.shape

((20000, 3), (5000, 3))

In [14]:
# Wrap the pandas frames as Hugging Face `Dataset` objects. The held-out
# `test_df` is used as the validation set during training.
# Note: the DataFrame index is carried over as an extra `__index_level_0__`
# column (visible in the dataset features printed further down).
sample_train_dataset = Dataset.from_pandas(train_df, split='train')
sample_validation_dataset = Dataset.from_pandas(test_df, split='valid')

In [15]:
# Load the tokenizer for the same checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")



In [16]:
def preprocess_function(examples):
    """Tokenize the ``review`` texts of a batch.

    Args:
        examples: A batch (mapping of column name -> values) that contains a
            ``"review"`` column with the raw texts.

    Returns:
        The tokenizer output for the batch, with truncation enabled so that
        over-long reviews are cut down. No padding is applied here.
    """
    reviews = examples["review"]
    encoded = tokenizer(reviews, truncation=True)
    return encoded

In [17]:
# Tokenize both splits, processing the rows in batches.
tokenized_train_dataset = sample_train_dataset.map(
    function=preprocess_function,
    batched=True,
)
tokenized_validation_dataset = sample_validation_dataset.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [18]:
def add_labels(examples):
    """Copy the ``sentiment`` column into a ``label`` column.

    The transformers package expects the target column to be called
    ``label``, so this makes sure it exists in the tokenized datasets.

    Args:
        examples: A batch (mapping of column name -> values) that contains a
            ``'sentiment'`` column.

    Returns:
        The same ``examples`` object, updated in place with a ``'label'``
        entry holding the sentiment values.
    """
    sentiment_values = examples['sentiment']
    examples['label'] = sentiment_values
    return examples

In [19]:
# Add the `label` column to both tokenized splits.
tokenized_train_dataset = tokenized_train_dataset.map(
    function=add_labels,
    batched=True,
)
tokenized_validation_dataset = tokenized_validation_dataset.map(
    function=add_labels,
    batched=True,
)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [20]:
# Two-way mapping between integer class ids and human-readable label names.
# The reverse mapping is derived from the forward one so they cannot drift apart.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [22]:
# Inspect the columns and row counts of both tokenized datasets.
tokenized_train_dataset, tokenized_validation_dataset

(Dataset({
     features: ['id', 'sentiment', 'review', '__index_level_0__', 'input_ids', 'attention_mask', 'label'],
     num_rows: 20000
 }),
 Dataset({
     features: ['id', 'sentiment', 'review', '__index_level_0__', 'input_ids', 'attention_mask', 'label'],
     num_rows: 5000
 }))

# Build Model

In [23]:
# Load the pretrained checkpoint with a sequence-classification head sized to
# the label mapping; the id/label dictionaries are stored on the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Print the module tree of the loaded model.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# Fine-Tune

In [25]:
# Load the accuracy metric from the `evaluate` library; it is used by
# `compute_metrics` during evaluation.
accuracy = evaluate.load("accuracy")

In [26]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions hold one
            score per class for each example (argmax is taken over axis 1).

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against the reference labels.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [28]:
training_args = TrainingArguments(
    output_dir="distilbert-sentiment",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Upper bound only: the early-stopping callback passed to the Trainer is
    # expected to end the run well before this.
    num_train_epochs=50,
    weight_decay=0.01,
    # Save and evaluate on the same schedule so each checkpoint has a matching
    # evaluation result for `load_best_model_at_end` to choose from.
    save_strategy="epoch",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    # Cap the number of checkpoints kept on disk. Without a limit, one full
    # checkpoint is written per epoch (up to 50). The best checkpoint is still
    # retained because `load_best_model_at_end` is enabled.
    save_total_limit=2,
    report_to="none"
)

In [29]:
# Assemble the training loop: model, hyper-parameters, data, metric and
# early stopping.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    # The datasets were tokenized without padding; presumably the Trainer uses
    # this tokenizer to pad each batch at collation time — TODO confirm.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop once the monitored evaluation metric has failed to improve for
    # 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [30]:
# Run fine-tuning. This is the long-running cell of the notebook.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2619,0.195515,0.9246
2,0.1727,0.215077,0.9328
3,0.1113,0.322593,0.9274
4,0.067,0.378072,0.929


TrainOutput(global_step=5000, training_loss=0.15461543693542482, metrics={'train_runtime': 3945.7716, 'train_samples_per_second': 253.436, 'train_steps_per_second': 15.84, 'total_flos': 1.0498774053786816e+16, 'train_loss': 0.15461543693542482, 'epoch': 4.0})

# Inference

In [31]:
# Two short example reviews used to sanity-check the fine-tuned model.
test_inputs = [
    "Awesome movie",
    "Great movie, great plot"
]

In [32]:
# Tokenize the examples (padded to a common length) and move the tensors to
# the device the model is on, rather than hard-coding 'cuda'. The cell then
# also works when the model is on CPU or another accelerator.
inputs = tokenizer(test_inputs, padding=True, return_tensors="pt").to(model.device)

In [33]:
# Put the model in evaluation mode so dropout is disabled and predictions are
# deterministic, then run the forward pass without tracking gradients.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

In [34]:
# Pick the highest-scoring class for each input.
predicted_class_ids = torch.argmax(logits, dim=-1)

In [35]:
# Show the predicted class ids; `id2label` maps them to label names.
predicted_class_ids

tensor([1, 1], device='cuda:0')