In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import numpy as np
import random
import pandas as pd
import evaluate
import torch
import torch.nn.functional as F

Load the IMDB dataset.
Create a tokenizer from the pre-trained BERT uncased model.
Use the pre-trained BERT uncased model with 2 labels, 1 for positive class and 0 for negative class.

In [3]:
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def describe_specific_dataset(data, data_type):
    """
    Examine some data. Count the total number of positive and negative samples and print some random samples.
    """
    total_records = len(data)
    num_positive_labels = np.sum(data['label'])
    num_negative_labels = total_records - num_positive_labels
    print(f"Dataset Type : {data_type}, Total Records : {total_records}, Positive : {num_positive_labels}, Negative : {num_negative_labels}")

    samples = random.sample(range(0, total_records), 7)

    for i in samples:
        text = data['text'][i]
        label = data['label'][i]
        print(f"Movie Review : {text}\n  Label : {label}\n")

Describe the training dataset.

In [4]:
def describe_dataset(dataset):
    describe_specific_dataset(dataset['train'], 'train')

In [5]:
describe_dataset(dataset)

Dataset Type : train, Total Records : 25000, Positive : 12500, Negative : 12500
Movie Review : Flavia(Florinda Bolkan of "Don't Torture a Duckling" fame)is locked away in a convent of carnal desires by her father.Tired of all of the sadism she sees around her(rape of a young woman in a pigsty,sexual cravings,horse castration)Flavia decides to run from the convent with her Jewish friend from the outside,Abraham.The two don't get very far before they are captured and then brought back to be tortured and forced to repent.After punishment she joins up with a band of Muslims called the Tarantulas,who had invaded the convent prior and leads a crusade that turns into nothing short of a bloody battle behind the convent walls."Flavia the Heretic" is a well-directed and fairly notorious piece of Italian nunsploitation.The film is slightly gruesome and sleazy at times.The acting is great and the characters are well-developed.Overall,"Flavia the Heretic" is a genuinely moving and intelligent movie

Define a tokenizer function for the model.

In [6]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)
print(tokenized_dataset["train"].features)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [7]:
"""
PyTorch expects the class lablels in a column named "labels".
set_format torch converts the required columns to pytorch tensors
"""
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [8]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch", # Run evaluation after every epoch
    save_strategy="epoch", # Save model checkpoint after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01, # L2 regularization
    logging_dir="./logs",
    logging_steps=100, # Logs training metrics after 100 steps
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

In [9]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    acc = accuracy.compute(predictions=preds, references=labels)
    f1_score = f1.compute(predictions=preds, references=labels, average='weighted')

    return {
        "accuracy": acc["accuracy"],
        "f1": f1_score["f1"]
    }


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(15000)),  # Subset for speed
    eval_dataset=tokenized_dataset["test"].select(range(3000)),  # Subset for speed
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [11]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2583,0.360051,0.857667,0.923381
2,0.2022,0.274243,0.909333,0.952514


TrainOutput(global_step=1876, training_loss=0.2751789418364893, metrics={'train_runtime': 4240.4903, 'train_samples_per_second': 7.075, 'train_steps_per_second': 0.442, 'total_flos': 7893331660800000.0, 'train_loss': 0.2751789418364893, 'epoch': 2.0})

In [12]:
trainer.evaluate()

{'eval_loss': 0.274243026971817,
 'eval_accuracy': 0.9093333333333333,
 'eval_f1': 0.952513966480447,
 'eval_runtime': 109.6613,
 'eval_samples_per_second': 27.357,
 'eval_steps_per_second': 3.42,
 'epoch': 2.0}

In [6]:
# Retrieve the best saved model.
model_path = "./results/checkpoint-1876"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()
label_map = {0: "Negative", 1: "Positive"}

def predict_sentiment(review):
    """
    Given a review, compute the class probability (confidence) and the class.
    1 means positive review and 0 means negative review.
    """
    inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    print(f"  Logits : {logits}, Argmax : {torch.argmax(logits)}")
    predicted_class = torch.argmax(logits).item()
    probs = F.softmax(logits, dim=1)
    confidence = probs[0][predicted_class].item()
    print(f"  Probabilities : {probs}, confidence : {confidence}")
    return label_map[predicted_class]

movie_reviews = [
    "This was a boring and a poorly directed film.",
    """I really enjoyed this movie. The story, the plot and the acting was superlative. The ending felt a little stretched, 
    could have been reduced, but I liked it overall""",
    """OMG ! What a collosal waste of time ! Avoid !!!""",
    """Das war ein wirklich lustiger Film, die Charaktere haben gut gespielt. Es hat Spaß gemacht, ihn anzuschauen.""",
    """यह मूवी बहुत ख़राब है।  उन्होंने कुछ भी बना दिया है।  दो घंटे और तीस मिनट बर्बाद कर दिए।  मत देखो। """,
    """Este foi um dos melhores filmes que assisti nos últimos meses. A história é ótima e a reviravolta no final me prendeu na ponta da cadeira. A cinematografia, a música ambiente e as atuações também são incríveis.""",
    """C'est l'un des meilleurs films que j'ai vus depuis des mois. L'histoire est géniale et le rebondissement final m'a tenu en haleine. La photographie, la musique de fond et le jeu des acteurs sont également exceptionnels."""
]

for movie_review in movie_reviews:
    print(movie_review)
    print(f"Sentiment : {predict_sentiment(movie_review)}\n")


This was a boring and a poorly directed film.
  Logits : tensor([[ 2.4590, -2.4597]]), Argmax : 0
  Probabilities : tensor([[0.9927, 0.0073]]), confidence : 0.9927443265914917
Sentiment : Negative

I really enjoyed this movie. The story, the plot and the acting was superlative. The ending felt a little stretched, 
    could have been reduced, but I liked it overall
  Logits : tensor([[-2.7041,  2.5022]]), Argmax : 1
  Probabilities : tensor([[0.0055, 0.9945]]), confidence : 0.9945477843284607
Sentiment : Positive

OMG ! What a collosal waste of time ! Avoid !!!
  Logits : tensor([[ 2.2393, -2.3877]]), Argmax : 0
  Probabilities : tensor([[0.9903, 0.0097]]), confidence : 0.9903112649917603
Sentiment : Negative

Das war ein wirklich lustiger Film, die Charaktere haben gut gespielt. Es hat Spaß gemacht, ihn anzuschauen.
  Logits : tensor([[-2.3759,  2.2326]]), Argmax : 1
  Probabilities : tensor([[0.0099, 0.9901]]), confidence : 0.9901312589645386
Sentiment : Positive

यह मूवी बहुत ख़राब ह