<a href="https://colab.research.google.com/github/marekrei/bert_text_classification_example/blob/master/bert_text_classification_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Text classification example with BERT
# Created by Marek Rei
# Based on https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/chapter3/section4.ipynb
# Fine-tunes a pre-trained model for binary sentiment detection on the SST2 dataset.

# ---- Settings: everything a reader is expected to tweak lives in this cell ----
# Name of the pre-trained checkpoint to fine-tune.
# See https://huggingface.co/models for options.
checkpoint = "bert-base-uncased"

# Fraction of the training set to use.
# 1.0 uses the whole training set, but training then takes noticeably longer.
# Values below 1.0 keep only the first `ratio * len(train)` examples (see the data-loading cell).
train_data_sample_ratio = 0.1

# Example sentence to classify.
# Predictions for this sentence are printed both before and after training.
example_sentence = "this was by far the best movie of the year"

In [None]:
# Install the necessary libraries
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Import the libraries
import torch
import evaluate

from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AdamW
from transformers import AutoModelForSequenceClassification
from transformers import get_scheduler
from tqdm.auto import tqdm

In [None]:
# Pick the compute device: the GPU when CUDA is available, the CPU otherwise.
# If "cuda" is printed below, the notebook is running on a GPU.
# To run on the GPU in Colab, go to Edit -> Notebook settings -> Hardware accelerator and set it to "GPU".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
# Load the pre-trained checkpoint with a two-label classification head
# and move it onto the selected device in the same statement.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

In [None]:
# Load the SST2 task of the GLUE benchmark and clean up its cache files.
raw_datasets = load_dataset("glue", "sst2")
raw_datasets.cleanup_cache_files()

# Optionally keep only the first part of the training set.
# The split is already in memory, so it is sliced with `select` instead of
# calling `load_dataset` a second time; the same leading examples are kept.
if train_data_sample_ratio < 1.0:
    num_training_examples = int(train_data_sample_ratio * len(raw_datasets["train"]))
    if num_training_examples < 1:
        # Fail early: an empty training set would only surface later as an obscure error.
        raise ValueError(
            f"train_data_sample_ratio={train_data_sample_ratio} selects no training examples"
        )
    raw_datasets["train"] = raw_datasets["train"].select(range(num_training_examples))

# Load the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "sentence" field of `example` with the notebook's tokenizer.

    `example` is a mapping with a "sentence" entry; the entry is handed to the
    tokenizer unchanged, with truncation enabled, and the tokenizer's output
    is returned as-is.
    """
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True)

# Tokenize every split; with batched=True the function receives batches of examples.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Drop the columns the model does not accept and rename the target column
# to "labels", the name later cells read from each batch.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# DataCollatorWithPadding constructs batches that are padded to the length of the longest sentence in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Single source of truth for the batch size used by both loaders.
batch_size = 8

# Training batches are reshuffled on every pass; validation order is left fixed.
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator
)

In [None]:
# Pull a single batch from the training loader so we can look inside it.
# (Stays None if the loader yields nothing.)
example_batch = next(iter(train_dataloader), None)

# Shape of every tensor in that batch.
print({name: tensor.shape for name, tensor in example_batch.items()})


# Feed the batch through the model (not fine-tuned yet) and show
# the loss, the shape of the logits and the logits themselves.
outputs = model(**example_batch.to(device))
print("output.loss: ", outputs.loss)
print("output.logits.shape: ", outputs.logits.shape)
print("output.logits: ", outputs.logits)

# Generating predictions for an example sentence.
# Haven't trained the model yet so these will be random.
def print_example_predictions(example_sentence, example_model):
    """Classify one sentence with `example_model` and print the result.

    Prints the sentence, the raw logits, the softmax probabilities and the
    predicted label (class 0 is reported as "negative", class 1 as "positive").

    Args:
        example_sentence: Text to classify.
        example_model: Sequence classification model to query. It must live on
            the notebook's global `device`, because the inputs are moved there.

    Side effects:
        Switches `example_model` to evaluation mode and leaves it in that mode.
    """
    encoded = tokenize_function({"sentence": example_sentence})
    # Wrap every field in a list to add the batch dimension before building tensors.
    model_inputs = {key: torch.tensor([encoded[key]]).to(device) for key in encoded}

    # Use the model that was passed in rather than whatever global `model` happens to be.
    example_model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = example_model(**model_inputs).logits

    example_logits = logits.cpu().numpy()
    example_probabilities = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()

    print("Example sentence: ", example_sentence)
    print("Predicted logits: ", example_logits)
    print("Predicted probabilities: ", example_probabilities)
    print("Prediction: ", "negative" if example_probabilities[0][0] > example_probabilities[0][1] else "positive")

print_example_predictions(example_sentence, model)


{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 47]), 'token_type_ids': torch.Size([8, 47]), 'attention_mask': torch.Size([8, 47])}
output.loss:  tensor(0.7619, device='cuda:0', grad_fn=<NllLossBackward0>)
output.logits.shape:  torch.Size([8, 2])
output.logits:  tensor([[-0.0450,  0.3309],
        [-0.2020,  0.3849],
        [-0.3075,  0.1596],
        [-0.2679,  0.0848],
        [-0.3335,  0.0863],
        [-0.3113,  0.2367],
        [-0.3893,  0.0356],
        [-0.2937,  0.3148]], device='cuda:0', grad_fn=<AddmmBackward0>)
[[0.36277393 0.63722605]]
Example sentence:  this was by far the best movie of the year
Predicted logits:  [[-0.28468698  0.27865756]]
Predicted probabilities:  [[0.36277393 0.63722605]]
Prediction:  positive


In [None]:
# Setting up model training for fine-tuning.
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the defaults of the legacy
# transformers optimizer so the training hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)
# Linear schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Setting the model to training mode
model.train()

# Running the training.
# The progress bar is used as a context manager so it is always closed,
# even when training is interrupted or a step raises.
with tqdm(range(num_training_steps)) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Move every tensor of the batch onto the same device as the model.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            # Update the weights, advance the learning-rate schedule,
            # then clear the gradients before the next batch.
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

  0%|          | 0/2526 [00:00<?, ?it/s]

In [None]:
# Switch the model to evaluation mode before scoring the validation set
model.eval()

# Score the validation split with the GLUE SST2 metric.
metric = evaluate.load("glue", "sst2")
# No gradients are needed anywhere in evaluation, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # The predicted class is the index of the largest logit in each row.
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

print(metric.compute())

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.8922018348623854}


In [None]:
# Getting predictions for the example sentence again, now that we have trained the model.
# Compare this output with the one printed before training.
print_example_predictions(example_sentence, model)

[[2.9539500e-04 9.9970454e-01]]
Example sentence:  this was by far the best movie of the year
Predicted logits:  [[-3.9805658  4.1463356]]
Predicted probabilities:  [[2.9539500e-04 9.9970454e-01]]
Prediction:  positive
