In [5]:
"""
Purpose: Run a model that's fine-tuned to predict sentiment based on IMDB movie reviews.

DistilBERT: a distilled version of BERT: smaller, faster, cheaper and lighter - https://arxiv.org/abs/1910.01108
sst2: Stanford Sentiment Treebank (215,154 phrases) - https://huggingface.co/datasets/stanfordnlp/sst2
"""
import os
try:
    import torch, transformers, datasets
except:
    %pip install -q transformers datasets torch

def use_best_device():
    # Check if GPU is available
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
        torch.set_default_device(device)
    elif torch.backends.mps.is_available():
        device = "mps"   
    return device


print(f"PyTorch version: {torch.__version__}")
print("device:", use_best_device())
# print("HF_HOME:", os.environ.get("HF_HOME"))

PyTorch version: 2.2.2
device: mps


In [7]:
import torch
from transformers import AutoModelForSequenceClassification, DistilBertTokenizer


class MAI_SentimentAnalyzer:
    """
    Given a text, returns POSTIVE or NEGATIVE.

    Note: The probability values returned by this class may be extreme (close to 0 or 1),
    indicating a high confidence in the predicted sentiment. This is due to the nature of the
    fine-tuned DistilBERT model used, which has been trained on a specific dataset and may not
    generalize well to other datasets or text styles.
    """

    model_id = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

    def __init__(self, model_id=None):
        if model_id:
            self.model_id = model_id
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_id)
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_id)

    def get_prediction(self, review):
        """Given a review, return the predicted sentiment"""
        # Tokenize the review
        inputs = self.tokenizer(
            review,
            return_tensors="pt",
            max_length=512,
            padding="max_length",
            truncation=True,
        )

        # Predict
        with torch.no_grad():
            logits = self.model(**inputs).logits
            probs = torch.softmax(logits, dim=-1)
            predicted_class_id = logits.argmax().item()
            predicted_prob = probs[0, predicted_class_id].item()

        # ['NEGATIVE', 'POSITIVE']
        return self.model.config.id2label[predicted_class_id], predicted_prob


model_sentiment = MAI_SentimentAnalyzer()

# Check for negative
review = "This is too long for me.."
sentiment, prob = model_sentiment.get_prediction(review)
print(f"Review: {review} --> Predict Sentiment: {sentiment}")
assert sentiment == "NEGATIVE", "The prediction should be negative"

# Check for positive
review = "The movie was exciting!"
sentiment, prob = model_sentiment.get_prediction(review)
print(f"Review: {review} --> Predict Sentiment: {sentiment}")
assert sentiment == "POSITIVE", "The prediction should be positive"

# Check for positive
review = (
    "This food tastes slightly okay but would not be my favorite but not bad at all"
)
sentiment, prob = model_sentiment.get_prediction(review)
print(f"Review: {review} --> Predict Sentiment: {sentiment}")
assert sentiment == "POSITIVE", "The prediction should be slightly positive"

Review: This is too long for me.. --> Predict Sentiment: NEGATIVE 99.93
Review: The movie was exciting! --> Predict Sentiment: POSITIVE 99.99
Review: This food tastes slightly okay but would not be my favorite but not bad at all --> Predict Sentiment: POSITIVE 99.16


In [10]:
import random
from datasets import load_dataset

# Load the test split of the imdb dataset
dataset = load_dataset("imdb")["test"]

# Get the last few reviews
# reviews = dataset["text"][-10:]
# labels = dataset["label"][-10:]

# Get random reviews
random_indices = random.sample(range(len(dataset["text"])), 10)

# Select reviews and labels using the random indices
reviews = [dataset["text"][i] for i in random_indices]
labels = [dataset["label"][i] for i in random_indices]

# Predict Sentiment on above dataset
for review, label in zip(reviews, labels):
    sentiment, prob = model_sentiment.get_prediction(review)

    print(f"Review: {review[:100]} \n... {review[-100:]}")
    print(f"Predicted Sentiment: {sentiment}\n")

Review: With several name actors (Lance Henrikson, David Warner, Joe Don Baker), why was Jeffery Combs given 
... en play was completely lacking. The director should have recognized this and helped the movie along.
Predicted Sentiment: NEGATIVE

Review: I'll start off right at the beginning by saying "I like this movie." It's sweeping, it's grand, it's 
... recommend it. It makes you think despite some hammy acting. Have fun with this movie; it's worth it.
Predicted Sentiment: POSITIVE

Review: By the standards of Hollywood this movie was filmed and edited as Hollywood movies are and therefore 
... ovie, spend your $8.50 on other things, like a snow cone, which would be much more worth your while.
Predicted Sentiment: NEGATIVE

Review: It's been over 30 years now but I still remember that this movie was the worst I've ever seen. I wou 
... g "STARSHIP TROOPERS" and it came mighty close but it was still more entertaining than " POOR COW ".
Predicted Sentiment: POSITIVE

Review: During t