In [6]:
"""
DistilBERT: a distilled version of BERT: smaller, faster, cheaper and lighter - https://arxiv.org/abs/1910.01108
imdb: Large Movie Review Dataset, 25k train + 25k test labelled reviews (plus 50k unlabelled), 217.35 MB https://huggingface.co/datasets/stanfordnlp/imdb
"""
try:
    import torch, transformers, datasets
except:
    %pip install transformers datasets torch

In [4]:
# Define IMDBTrainer class, loading distilbert-base-uncased
# and imdb dataset for fine-tuning task
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertTokenizer
from transformers import TrainingArguments, Trainer

from datasets import load_dataset


class IMDBTrainer:
    """
    Fine-tune distilbert-base-uncased for sequence classification on imdb.

    distilbert-base-uncased size: ~268 MB

    Usage: construct an instance, optionally call prepare_data(), then call
    train(). train() tokenizes the dataset itself when prepare_data() has not
    been run yet.
    """

    # Hugging Face identifier used to load both the model and the tokenizer.
    model_id = "distilbert-base-uncased"
    # Passed to TrainingArguments as output_dir.
    output_dir = "/tmp/MAI_IMDBTrainer"

    def __init__(self):
        """Load the pretrained model (num_labels=2), its tokenizer and the imdb dataset."""
        self.model = DistilBertForSequenceClassification.from_pretrained(
            self.model_id, num_labels=2
        )
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_id)
        self.dataset = load_dataset("imdb")
        # Set by prepare_data(); None means the dataset has not been tokenized yet.
        self.tokenized_datasets = None
        # Set by train() so the Trainer remains reachable after training.
        self.trainer = None

    def tokenize_function(self, examples):
        """
        Tokenize one batch of examples.

        Args:
            examples: mapping whose "text" entry is handed to the tokenizer.

        Returns:
            The tokenizer's output for that text, produced with
            padding="max_length" and truncation=True.
        """
        return self.tokenizer(examples["text"], padding="max_length", truncation=True)

    def prepare_data(self):
        """
        Tokenize the dataset and store the result on self.tokenized_datasets.

        Tokenizing 100,000 reviews takes some time (~3 minutes).
        """
        self.tokenized_datasets = self.dataset.map(self.tokenize_function, batched=True)

    def train(self):
        """
        Fine-tune the model on the tokenized "train" split.

        If prepare_data() has not been called yet, it is run first, so calling
        train() on a fresh instance does not fail on a missing attribute.

        Returns:
            Whatever Trainer.train() returns.
        """
        # getattr keeps this safe even for instances created without __init__.
        if getattr(self, "tokenized_datasets", None) is None:
            self.prepare_data()

        training_args = TrainingArguments(
            per_device_train_batch_size=64,
            output_dir=self.output_dir,
            learning_rate=2e-5,
            num_train_epochs=3,
        )
        # NOTE(review): no evaluation strategy is configured above, so the
        # "test" split is presumably only consumed when evaluate() is called on
        # self.trainer -- confirm against the TrainingArguments defaults.
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.tokenized_datasets["train"],
            eval_dataset=self.tokenized_datasets["test"],
        )
        return self.trainer.train()

In [3]:
# Build the trainer: the constructor loads the model, the tokenizer and the imdb dataset.
imdb_trainer = IMDBTrainer()
# Tokenize the dataset up front; the result is stored on the instance as tokenized_datasets.
imdb_trainer.prepare_data()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Start fine-tuning with the TrainingArguments configured inside IMDBTrainer.train.
imdb_trainer.train()