In [1]:
!pip install transformers datasets evaluate accelerate



# Task

Text classification is a common NLP task that assigns a label or class to text. One of the most popular forms of text classification is sentiment analysis, which assigns a label like positive, negative, or neutral to a sequence of text.

This guide will show how to:
1. Finetune DistilBERT on the IMDb dataset to determine whether a movie review is positive or negative.
2. Use your finetuned model for inference.

# Libraries

In [9]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

# Pick the device tensors/models should be moved to.
# Use Apple's Metal (MPS) backend when this PyTorch build can actually use it;
# otherwise fall back to the CPU so the device stays valid on other machines.
# NOTE(review): no `model` exists at this point in the notebook, so the former
# commented-out `model.to(mps_device)` was dropped; move the model to this device
# in the cell where it is created, if that is still wanted.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

2024-01-15 21:23:44.213687: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Load

In [3]:
# Fetch the IMDb movie-review dataset through the Datasets library.
imdb = load_dataset(path="imdb")

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# Look at a single example from the test split.
# Each record in this dataset has two fields:
#   text:  the movie review text.
#   label: 0 for a negative review, 1 for a positive review.
imdb["test"][12]

{'text': 'I first watched this movie back in the mid/late 80\'s, when I was a kid. We couldn\'t even get all the way through it. The dialog, the acting, everything about it was just beyond lame.<br /><br />Here are a few examples... imagine these spoken real dramatically, way over-acted: "Oreegon? You\'re going to Oreegon? Why would anyone want to go to Oreegon?"<br /><br />"Survivalists? Nobody ever told us about any survivalists!"<br /><br />This movie was SO bad, my sister and I rented it again for her 16th birthday party, just so our friends could sit around and laugh at how awful it was. I don\'t think we were able to finish it then either!',
 'label': 0}

# Preprocessing

In [7]:
# Load the tokenizer for the DistilBERT checkpoint used in this guide.
# The preprocessing function defined next uses it to tokenize the review text and
# truncate sequences, so inputs are no longer than DistilBERT's maximum input length.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased"
)
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's ``tokenizer``.

    ``truncation=True`` is forwarded to the tokenizer so over-long inputs are
    truncated instead of being kept at full length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the review text
            (a batch of texts when called through ``map(..., batched=True)``).

    Returns:
        The result of the tokenizer call for those texts.
    """
    review_texts = examples["text"]
    encoded_batch = tokenizer(review_texts, truncation=True)
    return encoded_batch

In [8]:
# Run the preprocessing function over the entire dataset with map().
# batched=True hands the function batches of examples rather than one at a time.
tokenized_imdb = imdb.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [10]:
# Build the data collator that assembles individual examples into batches.
# NB: dynamically padding to the longest sequence in each batch during collation
# is more efficient than padding the whole dataset to the maximum length.
# padding=True is the collator's default; it is spelled out here to make that explicit.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)