In [1]:
!pip install transformers datasets evaluate accelerate



# Task

Text classification is a common NLP task that assigns a label or class to text. One of the most popular forms of text classification is sentiment analysis, which assigns a label like positive, negative, or neutral to a sequence of text.

This guide will show how to:
1. Finetune DistilBERT on the IMDb dataset to determine whether a movie review is positive or negative.
2. Use your finetuned model for inference.

# Libraries

In [2]:
import torch
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline

# Pick the best available accelerator instead of assuming Apple-silicon MPS, so the notebook
# also runs on CUDA or CPU-only machines. The name `mps_device` is kept because later cells
# reference it, even though it may now hold a CUDA or CPU device.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch builds
if _mps_backend is not None and _mps_backend.is_available():
    mps_device = torch.device("mps")
elif torch.cuda.is_available():
    mps_device = torch.device("cuda")
else:
    mps_device = torch.device("cpu")

2024-01-16 21:31:34.768730: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Uncomment below to push to HF Hub
#from huggingface_hub import notebook_login
#notebook_login()

# Data Load

In [4]:
# Fetch the IMDb movie-review dataset through the Datasets library
imdb = load_dataset(path="imdb")

In [5]:
# Peek at a single record from the test split.
# Every record in this dataset has two fields:
#   text  - the movie review text
#   label - 0 for a negative review, 1 for a positive review
imdb["test"][12]

{'text': 'I first watched this movie back in the mid/late 80\'s, when I was a kid. We couldn\'t even get all the way through it. The dialog, the acting, everything about it was just beyond lame.<br /><br />Here are a few examples... imagine these spoken real dramatically, way over-acted: "Oreegon? You\'re going to Oreegon? Why would anyone want to go to Oreegon?"<br /><br />"Survivalists? Nobody ever told us about any survivalists!"<br /><br />This movie was SO bad, my sister and I rented it again for her 16th birthday party, just so our friends could sit around and laugh at how awful it was. I don\'t think we were able to finish it then either!',
 'label': 0}

# Preprocessing

In [6]:
# Tokenizer that matches the DistilBERT checkpoint used for fine-tuning
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples``.

    ``truncation=True`` is passed so that inputs are no longer than
    DistilBERT's maximum input length.
    """
    review_texts = examples["text"]
    return tokenizer(review_texts, truncation=True)

In [7]:
# Run the preprocessing function over every split; batched=True hands it
# several examples per call instead of one at a time
tokenized_imdb = imdb.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [8]:
# Create the data collator that assembles individual examples into a batch.
# NB: it is more efficient to dynamically pad the sentences to the longest length in a batch
# during collation than to pad the whole dataset to the maximum length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation

In [9]:
# Load the accuracy metric; compute_metrics (used by the training cell) relies on it
accuracy = evaluate.load(path="accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [10]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    ``eval_pred`` is unpacked into ``(predictions, labels)``. The arg-max of
    the predictions along axis 1 is taken as the predicted class id and is
    passed to ``accuracy.compute`` together with the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Training

In [11]:
# Two-way lookup between integer class ids and human-readable label names
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: class_id for class_id, name in id2label.items()}

In [12]:
# Load DistilBERT for sequence classification, telling it how many labels
# to expect and how class ids map to label names (and back)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
# Move the model onto the selected device; as the last expression of the cell
# this also displays the model architecture
model.to(mps_device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Define training hyperparameters in TrainingArguments.
# output_dir is the only required argument; it specifies where the model is saved.
# Set push_to_hub=True to push the model to the Hub (see the notebook_login cell above).
# With evaluation_strategy/save_strategy set to "epoch", the Trainer evaluates the accuracy and
# saves a training checkpoint at the end of each epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy` (and
# the Trainer's `tokenizer` argument to `processing_class`) — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="text_classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Pass the training arguments to Trainer along with the model, datasets, tokenizer, data collator
# and compute_metrics.
# NB: Trainer applies dynamic padding by default when a tokenizer is passed to it, so passing
# data_collator explicitly is optional; it is kept here to make the padding behaviour visible.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Call train() to fine-tune the model on the new dataset; keep the result so it can still be
# shown as the cell output below.
train_output = trainer.train()

# Epoch checkpoints are written to sub-folders of output_dir. Also write the final model (the
# best one, because load_best_model_at_end=True) and the tokenizer to output_dir itself so the
# folder can be loaded directly for inference.
trainer.save_model()
#trainer.push_to_hub() (uncomment to push to HF Hub)

train_output

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


# Inference

In [1]:
# Sample review with deliberately mixed sentiment, used to try out the fine-tuned model
inference_text = (
    "I have mixed feelings about that movie. On the one hand, it had a magical feel to it, "
    "but on the other hand it felt like someone was pulling the wool over our eyes. "
    "Bittersweet is what I'd call it."
)

In [None]:
# Load the fine-tuned model from the training output_dir. The path is relative to the working
# directory (the same "text_classification_model" folder the Trainer writes to); a leading "/"
# would point at the filesystem root instead.
# NOTE: the folder must contain saved model files (e.g. written by trainer.save_model()); otherwise
# point this at one of its checkpoint-* sub-folders.
trained_model_path = "text_classification_model"
classifier = pipeline("sentiment-analysis", model=trained_model_path)
classifier(inference_text)

# Replicating the inference pipeline

In [None]:
# Tokenize the text and return PyTorch tensors.
# Reuse trained_model_path (instead of repeating the path literal) so the tokenizer and the
# model are always loaded from the same location.
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
inputs = tokenizer(inference_text, return_tensors="pt")

In [None]:
# Reload the fine-tuned model and run a forward pass on the tokenized inputs.
# Gradients are not needed for inference, so the pass runs under torch.no_grad().
model = AutoModelForSequenceClassification.from_pretrained(trained_model_path)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

In [None]:
# Get the highest-scoring class and use id2label to convert it to a text label.
# argmax is taken over the last (class) dimension: a bare argmax() indexes the flattened tensor,
# which would silently yield a wrong id if logits ever held more than one row. With dim=-1,
# .item() raises instead when there is more than one prediction.
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]