In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import numpy as np
import random
import pandas as pd
import evaluate
import torch
import torch.nn.functional as F

Load the IMDB dataset.
Create a tokenizer from the pre-trained BERT uncased model.
Use the pre-trained BERT uncased model with 2 labels, 1 for positive class and 0 for negative class.

In [2]:
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def describe_specific_dataset(data, data_type):
    """
    Examine some data. Count the total number of positive and negative samples and print some random samples.
    """
    total_records = len(data)
    num_positive_labels = np.sum(data['label'])
    num_negative_labels = total_records - num_positive_labels
    print(f"Dataset Type : {data_type}, Total Records : {total_records}, Positive : {num_positive_labels}, Negative : {num_negative_labels}")

    samples = random.sample(range(0, total_records), 7)

    for i in samples:
        text = data['text'][i]
        label = data['label'][i]
        print(f"Movie Review : {text}\n  Label : {label}\n")

Describe the training dataset.

In [4]:
def describe_dataset(dataset):
    describe_specific_dataset(dataset['train'], 'train')

In [5]:
describe_dataset(dataset)

Dataset Type : train, Total Records : 25000, Positive : 12500, Negative : 12500
Movie Review : Hybrid starts as water treatment planet security guard Aaron Scates (Cory Monteith) is involved in an accident which leaves him blind. Luckily it just so happens that brilliant scientist Dr. Andrea Hewitt (Justine Bateman) who works for Olaris has developed an operation to transplant organs from one species to another, Hewitt decides Aaron would be perfect for her first human experiment. Hewitt & her team transplant the eyes of a Wolf into Aaron & he miraculously regains his sight. Brilliant, right? Well, no not really since Aaron starts to go mad as he sees random images of Wolves & starts to develop a lust for blood. Aaron escapes the Olaris building & goes on the run but he is too valuable to just let go & a full scale search is mounted to capture him...<br /><br />Directed by Yelena Lanskaya this is yet another Sci-Fi Channel offering that is quite simply put terrible in every possible wa

Define a tokenizer function for the model.

In [6]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)
print(tokenized_dataset["train"].features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [7]:
"""
PyTorch expects the class lablels in a column named "labels".
set_format torch converts the required columns to pytorch tensors
"""
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [8]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch", # Run evaluation after every epoch
    save_strategy="epoch", # Save model checkpoint after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01, # L2 regularization
    logging_dir="./logs",
    logging_steps=100, # Logs training metrics after 100 steps
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

In [9]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    acc = accuracy.compute(predictions=preds, references=labels)
    f1_score = f1.compute(predictions=preds, references=labels, average='weighted')

    return {
        "accuracy": acc["accuracy"],
        "f1": f1_score["f1"]
    }


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(10000)),  # Subset for speed
    eval_dataset=tokenized_dataset["test"].select(range(2000)),  # Subset for speed
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [11]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2968,0.407714,0.885,0.938992
2,0.1486,0.292796,0.9315,0.964535


TrainOutput(global_step=2500, training_loss=0.2368379367828369, metrics={'train_runtime': 2872.9919, 'train_samples_per_second': 6.961, 'train_steps_per_second': 0.87, 'total_flos': 5262221107200000.0, 'train_loss': 0.2368379367828369, 'epoch': 2.0})

In [12]:
trainer.evaluate()

{'eval_loss': 0.29279595613479614,
 'eval_accuracy': 0.9315,
 'eval_f1': 0.9645353352316852,
 'eval_runtime': 72.2463,
 'eval_samples_per_second': 27.683,
 'eval_steps_per_second': 3.46,
 'epoch': 2.0}

In [15]:
# Retrieve the best saved model.
model_path = "./results/checkpoint-2500"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()
label_map = {0: "Negative", 1: "Positive"}

def predict_sentiment(review):
    """
    Given a review, compute the class probability (confidence) and the class.
    1 means positive review and 0 means negative review.
    """
    inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    print(f"  Logits : {logits}, Argmax : {torch.argmax(logits)}")
    predicted_class = torch.argmax(logits).item()
    probs = F.softmax(logits, dim=1)
    confidence = probs[0][predicted_class].item()
    print(f"  Probabilities : {probs}, confidence : {confidence}")
    return label_map[predicted_class]

movie_reviews = [
    "This was a boring and a poorly directed film.",
    """I really enjoyed this movie. The story, the plot and the acting was superlative. The ending felt a little stretched, 
    could have been reduced, but I liked it overall""",
    """OMG ! What a collosal waste of time ! Avoid !!!"""
]

for movie_review in movie_reviews:
    print(movie_review)
    print(f"Sentiment : {predict_sentiment(movie_review)}\n")


This was a boring and a poorly directed film.
  Logits : tensor([[ 2.6290, -3.3180]]), Argmax : 0
  Probabilities : tensor([[0.9974, 0.0026]]), confidence : 0.9973931312561035
Sentiment : Negative

I really enjoyed this movie. The story, the plot and the acting was superlative. The ending felt a little stretched, 
    could have been reduced, but I liked it overall
  Logits : tensor([[-2.8040,  3.0311]]), Argmax : 1
  Probabilities : tensor([[0.0029, 0.9971]]), confidence : 0.9970853924751282
Sentiment : Positive

OMG ! What a collosal waste of time ! Avoid !!!
  Logits : tensor([[ 2.3844, -3.0481]]), Argmax : 0
  Probabilities : tensor([[0.9956, 0.0044]]), confidence : 0.995647132396698
Sentiment : Negative

