In [None]:
!pip install datasets
!pip install accelerate
!pip install evaluate

# Processing data

In [None]:
from datasets import load_dataset

# Fixed seed so the subsample and the train/test split are reproducible.
SEED = 42

# Work on a 1000-review subsample to keep the demo fast; drop the
# `.shuffle(...).select(...)` calls to train on the full train split instead.
#
# The shuffle is essential: the IMDB train split is stored grouped by label,
# so taking the first 1000 rows of the unshuffled split yields reviews of a
# single class. A model trained on that learns to always predict one label
# and reports a meaningless 100% accuracy.
imdb = (
    load_dataset("imdb", split="train")
    .shuffle(seed=SEED)
    .select(range(1000))
    .train_test_split(test_size=0.2, seed=SEED)
)

# Guard against a degenerate (single-class) training subset.
assert len(set(imdb["train"]["label"])) == 2, "training subset must contain both labels"


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Load the DistilBERT tokenizer that will preprocess the `text` field.
MODEL_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of reviews for DistilBERT.

    Sequences are truncated so that none is longer than 512 tokens,
    DistilBERT's maximum input length.

    Args:
        examples: A batch from the dataset; its "text" entry is passed to the
            tokenizer.

    Returns:
        The tokenizer output for the batch.
    """
    encoded = tokenizer(examples["text"], truncation=True, max_length=512)
    return encoded


In [None]:
# Tokenize both splits; `batched=True` hands `preprocess_function` a batch of
# examples per call instead of one example at a time.
tokenized_imdb = imdb.map(
    preprocess_function,
    batched=True,
)


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

# Dynamic padding: each batch is padded to the length of its longest sentence
# at collation time, rather than padding the whole dataset to the maximum length.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

# Evaluation

In [None]:
import evaluate

# Load the "accuracy" metric; `compute_metrics` uses it to score predictions.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predicted class of
            each example is taken as the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes against
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Map the expected class ids to their human-readable labels (`id2label`) and
# back again (`label2id`). The reverse map is derived from the forward one so
# the two can never drift apart.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: class_id for class_id, name in id2label.items()}

# Train

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load DistilBERT with a sequence-classification head via
# AutoModelForSequenceClassification, wiring in the id <-> label maps.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hyperparameters for fine-tuning. Checkpoints are written under `output_dir`.
training_args = TrainingArguments(
    output_dir="my_model",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate, save and log on the same (per-epoch) schedule.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; update this keyword when upgrading.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Without this the default step-based logging interval is never reached in
    # such a short run, and the results table shows "No log" for Training Loss.
    logging_strategy="epoch",
    # Reload the best checkpoint when training finishes (requires matching
    # evaluation and save strategies).
    load_best_model_at_end=True,
)

# Bundle model, data, tokenizer, collator and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning; the returned training summary is displayed as the cell output.
train_output = trainer.train()
train_output


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.077825,1.0
2,No log,0.034988,1.0


TrainOutput(global_step=50, training_loss=0.16878799438476563, metrics={'train_runtime': 82.1584, 'train_samples_per_second': 19.475, 'train_steps_per_second': 0.609, 'total_flos': 211947837849600.0, 'train_loss': 0.16878799438476563, 'epoch': 2.0})

# Inference

In [None]:
from transformers import pipeline


text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

# Save the trainer's current model (the best checkpoint, since
# `load_best_model_at_end=True`) together with the tokenizer into the
# configured output directory, then load the pipeline from there.
# This replaces a hard-coded "/content/my_model/checkpoint-50" path, which only
# exists on Colab and only when training runs for exactly 50 steps.
trainer.save_model()
classifier = pipeline("sentiment-analysis", model=trainer.args.output_dir)
classifier(text)

[{'label': 'NEGATIVE', 'score': 0.8981360793113708}]