In [None]:
!pip install transformers datasets

In [None]:
!pip install -U datasets huggingface_hub fsspec

In [None]:
!pip install evaluate

In [None]:
import random

import torch
import transformers
import numpy as np
import pandas as pd

## 1. Getting a dataset

Large Movie Review Dataset. This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.


* `text`: a string feature.
* `label`: a classification label, with possible values including `neg` (0), `pos` (1).

In [None]:
from datasets import load_dataset

ds = load_dataset("stanfordnlp/imdb")

In [None]:
print(ds)

In [None]:
# What features are there?
ds.column_names

In [None]:
# Access the training split
ds["train"]

In [None]:
ds["train"][0]

### 1.1 Inspect random examples from the dataset

In [None]:
import random

# Pick five distinct row positions from the training split
train_split = ds["train"]
random_indx = random.sample(range(len(train_split)), 5)

# Indexing the split with a list of positions gives the selected rows,
# accessed below column by column ("text", "label")
random_samples = train_split[random_indx]

print(f"[INFO] Random samples from dataset:\n")
for text, label in zip(random_samples["text"], random_samples["label"]):
    print(f"Text: {text} | Label: {label}")

In [None]:
# Get unique label values
ds["train"].unique("label")

In [None]:
# Check number of each label
from collections import Counter

Counter(ds["train"]["label"])

## 2. Prepare data for text classification

### 2.1 Tokenize text data

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = "distilbert/distilbert-base-uncased")

tokenizer

In [None]:
# Test the tokenizer
tokenizer("I love pizza")

### 2.2 Making a preprocessing function to tokenize text

In [None]:
def tokenize_text(examples):
  """Run the notebook's `tokenizer` over the "text" field of `examples`.

  Args:
    examples: A mapping with a "text" entry. In this notebook it is called
      through `ds.map(..., batched=True)`.

  Returns:
    Whatever `tokenizer` returns for that text when called with
    `padding="max_length"` and `truncation=True`.
  """
  texts = examples["text"]
  tokenizer_options = {"padding": "max_length", "truncation": True}
  return tokenizer(texts, **tokenizer_options)

In [None]:
# Tokenize every split of the dataset by applying tokenize_text in batched mode
tokenized_dataset = ds.map(tokenize_text, batched=True)

In [None]:
tokenized_dataset

In [None]:
# Compare the first tokenized example of the train and test splits, field by field
train_tokenized_sample = tokenized_dataset["train"][0]
test_tokenized_sample = tokenized_dataset["test"][0]

for key, train_value in train_tokenized_sample.items():
    print(f"[INFO] Key: {key}")
    print(f"Train sample: {train_value}")
    print(f"Test sample: {test_tokenized_sample[key]}")
    print("")

### 2.3 Setting up evaluation metrics

In [None]:
import evaluate
import numpy as np
from typing import Tuple

accuracy_metric = evaluate.load("accuracy")

def compute_accuracy(predictions_and_labels: Tuple[np.ndarray, np.ndarray]):
  """
  Computes the accuracy of a model by comparing the predictions and labels.

  Args:
    predictions_and_labels: A `(predictions, labels)` pair. `predictions` may
      hold either predicted class indices (1-D) or one row of per-class scores
      per example, such as logits or probabilities (2-D). Plain lists are
      accepted as well as NumPy arrays.

  Returns:
    The result of `accuracy_metric.compute(...)` for the predicted class
    indices against `labels`.
  """
  predictions, labels = predictions_and_labels

  # Coerce to an array so list/tuple inputs work too (they have no `.shape`/`.ndim`)
  predictions = np.asarray(predictions)

  # Per-class scores (logits or probabilities): keep the index of the highest
  # score in each row. Inputs that are already class indices are left untouched.
  if predictions.ndim >= 2:
    predictions = np.argmax(predictions, axis=1)

  return accuracy_metric.compute(predictions=predictions, references=labels)

## 3. Setting up a model for training

In [None]:
from transformers import AutoModelForSequenceClassification

# Setup model
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = "distilbert/distilbert-base-uncased",
    num_labels = 2
)

In [None]:
model

### 3.1 Create a directory for saving models

In [None]:
# Work out where the trained model will be stored
from pathlib import Path

# Parent folder for saved models; nothing happens if it already exists
models_dir = Path("models")
models_dir.mkdir(exist_ok=True)

# Folder name for this particular model
model_save_name = "text_clasification-imdb-distilbert-base-uncased"

# Full save location: <models_dir>/<model_save_name>
model_save_dir = models_dir / model_save_name

model_save_dir

### 3.2 Setting up training arguments with TrainingArguments

Parameters we are going to use for training our model:
 * `output_dir`: the output directory where the model predictions and checkpoints will be written.
 * `learning_rate`: the initial learning rate for the AdamW optimizer.
 * `per_device_train_batch_size`: the batch size per device for training.
 * `per_device_eval_batch_size`: the batch size per device for evaluation.
 * `num_train_epochs`: the total number of training epochs to perform.
 * `eval_strategy`: the evaluation strategy to adopt during training. Possible values are:
    * `"no"`: no evaluation is done during training.
    * `"steps"`: evaluation is done every `eval_steps`.
    * `"epoch"`: evaluation is done at the end of each epoch.
 * `save_strategy`: the checkpoint save strategy to adopt during training.
 * `save_total_limit`: if a value is passed, will limit the total amount of checkpoints.
 * `use_cpu`: whether to train on the CPU even when an accelerator is available (not set below, so the default device selection applies).
 * `seed`: random seed for reproducibility.
 * `load_best_model_at_end`: whether or not to load the best model found during training at the end of training.
 * `logging_strategy`: the logging strategy to adopt during training (`"no"`, `"steps"` or `"epoch"`).
 * `report_to`: the integrations to report results and logs to (`"none"` disables reporting).

In [None]:
from transformers import TrainingArguments
print(f"[INFO] Saving model checkpoints to: {model_save_dir}")

# Training configuration, grouped by what each setting controls
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir=model_save_dir,
    # Optimisation
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    # Evaluate, save and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Cap the number of checkpoints kept, and reload the best one when training ends
    save_total_limit=3,
    load_best_model_at_end=True,
    # Reproducibility
    seed=42,
    # No reporting integrations
    report_to="none",
)

### 3.3 Setting up an instance of Trainer

In [None]:
from transformers import Trainer

# Wire the model, training configuration, tokenized splits and metric together.
# NOTE(review): the test split is used as the evaluation set here, and
# `load_best_model_at_end=True` in the training arguments selects a checkpoint
# based on that evaluation. Consider holding out a validation split from the
# training data so the test split stays untouched.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_accuracy,
)

### 3.4 Training our text classification model

In [None]:
# Train a text classification model
results = trainer.train()

### 3.5 Save the model for later use

In [None]:
# Save model
print(f"[INFO] Saving model to {model_save_dir}")
trainer.save_model(output_dir = model_save_dir)

### 3.6 Inspecting the model training metrics

In [None]:
trainer_history_all = trainer.state.log_history
trainer_history_metrics = trainer_history_all[:-1] # get everything except the training time metrics

trainer_history_metrics[:4]

In [None]:
import pprint # import pretty print for nice printing of lists

# Buckets for the logged entries: training entries vs. evaluation entries
trainer_history_training_set = []
trainer_history_eval_set = []

# An entry counts as an evaluation entry when any of its keys contains "eval";
# everything else is treated as a training entry
for log_entry in trainer_history_metrics:
    is_eval_entry = any("eval" in key for key in log_entry)
    bucket = trainer_history_eval_set if is_eval_entry else trainer_history_training_set
    bucket.append(log_entry)

# Preview the first two entries of each bucket
print(f"[INFO] First two items in training set:")
pprint.pprint(trainer_history_training_set[:2])

print(f"\n[INFO] First two items in evaluation set:")
pprint.pprint(trainer_history_eval_set[:2])

In [None]:
# Create pandas DataFrames for the training and evaluation metrics
trainer_history_training_df = pd.DataFrame(trainer_history_training_set)
trainer_history_eval_df = pd.DataFrame(trainer_history_eval_set)

trainer_history_training_df

In [None]:
trainer_history_eval_df

In [None]:
# Draw the training and evaluation loss curves on one set of axes
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(trainer_history_training_df["epoch"], trainer_history_training_df["loss"], label="Training loss")
ax.plot(trainer_history_eval_df["epoch"], trainer_history_eval_df["eval_loss"], label="Evaluation loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Text classification with DistilBert training and evaluation loss over time")
ax.legend()
plt.show()