This notebook fine-tunes a pretrained DistilBERT model to perform binary text classification on the Amazon Polarity reviews dataset. The performance is amazingly good off the shelf. I highly recommend you change the notebook runtime type to GPU.

# Prepare data

In [None]:
# Install Hugging Face  
!pip install datasets transformers[sentencepiece]

In [None]:
# Load dependencies
import numpy as np
from datasets import load_metric
from datasets import load_dataset
from transformers import Trainer
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

In [None]:
# Download the Amazon Polarity reviews dataset (later calls reuse the local cache)
raw_datasets = load_dataset(path="amazon_polarity")

Downloading:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/860 [00:00<?, ?B/s]

Downloading and preparing dataset amazon_polarity/amazon_polarity (download: 656.45 MiB, generated: 1.66 GiB, post-processed: Unknown size, total: 2.30 GiB) to /root/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/ac31acedf6cda6bc2aa50d448f48bbad69a3dd8efc607d2ff1a9e65c2476b4c1...


Downloading: 0.00B [00:00, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset amazon_polarity downloaded and prepared to /root/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/ac31acedf6cda6bc2aa50d448f48bbad69a3dd8efc607d2ff1a9e65c2476b4c1. Subsequent calls will reuse this data.


In [None]:
# Inspect the loaded object: a DatasetDict with a "train" and a "test" split,
# each exposing the 'label', 'title' and 'content' columns
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 400000
    })
})

# Prepare model and tokenize text data

In [None]:
# Hub checkpoint shared by the model and its tokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Sequence-classification model initialised from the checkpoint weights
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Matching tokenizer (the fast implementation is requested explicitly)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Tokenize
def tokenize_function(examples):
    """Tokenize the 'content' field of a batch of examples.

    The text is passed to the module-level ``tokenizer`` with
    ``truncation=True``; the tokenizer's output is returned unchanged.
    Only 'content' is tokenized - the 'title' column is not used.
    """
    review_texts = examples['content']
    return tokenizer(review_texts, truncation=True)

# Run the tokenizer over every split, handing it batches of examples per call
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

  0%|          | 0/3600 [00:00<?, ?ba/s]

  0%|          | 0/400 [00:00<?, ?ba/s]

In [None]:
# The full dataset is much too large to train on free Colab, so draw a fixed-size
# random subset from each split: one for training and one for held-out evaluation.
SUBSET_SIZE = 10000   # number of examples kept from each split
SHUFFLE_SEED = 42     # fixed seed so the same examples are selected on every run

small_train_dataset = tokenized_datasets["train"].shuffle(seed=SHUFFLE_SEED).select(range(SUBSET_SIZE))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=SHUFFLE_SEED).select(range(SUBSET_SIZE))

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/ac31acedf6cda6bc2aa50d448f48bbad69a3dd8efc607d2ff1a9e65c2476b4c1/cache-ba836544a759f279.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/ac31acedf6cda6bc2aa50d448f48bbad69a3dd8efc607d2ff1a9e65c2476b4c1/cache-4dc39232d3187669.arrow


In [None]:
# Show the first example of the training subset, including the fields added by the tokenizer
first_example = small_train_dataset[0]
print(first_example)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'content': "All the pretty people in this film. Even the Rudy character played by Michael Madsen. This is adapted from a Jim Thompson novel for cryin' out loud! These are supposed to be marginal characters, not fashion models. Though McQueen and McGraw were attractive (but check out McQueen's crummy prison haircut) they were believable in the role. Baldwin and Bassinger seem like movie stars trying to act like hard cases. Action wise, the robbery scene in the Pekinpah version was about 100 times more exciting and suspenseful than anything in this re-make.", 'input_ids': [101, 2035, 1996, 3492, 2111, 1999, 202

# Modeling

In [None]:
# Model trainer parameters

# Create evaluation metric
# `compute_metrics` reads the module-level `metric`, so it has to be loaded here;
# without this line evaluation fails with a NameError on a fresh kernel.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level ``metric``.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``. The predicted
            class is taken as the argmax of ``logits`` over its last axis.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes
        against the reference ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Set training arguments: use "test_trainer" as the output directory and
# evaluate once per epoch; every other setting keeps its default value
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Build the Trainer that ties together the model, its arguments, the data and the metric.
# Evaluation uses the held-out test subset (small_eval_dataset), not the training
# subset, so the reported accuracy is measured on examples the model was not trained on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell result
train_output = trainer.train()
train_output

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: content, title.
***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3750


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2884,0.0913,0.9736
2,0.1314,0.024242,0.9944
3,0.0374,0.009015,0.9984


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: content, title.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1500
C

TrainOutput(global_step=3750, training_loss=0.15189232584635418, metrics={'train_runtime': 769.7407, 'train_samples_per_second': 38.974, 'train_steps_per_second': 4.872, 'total_flos': 1396415431947744.0, 'train_loss': 0.15189232584635418, 'epoch': 3.0})

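# Result: DistilBERT Best Accuracy: 99.84% (recorded run, measured on the 10,000-example training subset)

Note: in the recorded run the `Trainer` was given `small_train_dataset` as its `eval_dataset`, so the "Validation Loss" and "Accuracy" columns above were computed on the training subset and are not a held-out validation score.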