In [1]:
"""
DistilBERT: a distilled version of BERT: smaller, faster, cheaper and lighter - https://arxiv.org/abs/1910.01108
imdb: Large Movie Review Dataset, 50k train+test, 217.35 MB https://huggingface.co/datasets/stanfordnlp/imdb
"""
try:
    import torch, transformers, datasets, accelerate
except:
    # %pip install -q torch transformers 'datasets==2.18.0' accelerate
    %pip install -q torch transformers datasets accelerate


def use_best_device():
    # Check if GPU is available
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
        torch.set_default_device(device)
    elif torch.backends.mps.is_available():
        device = "mps"   
    return device

device = use_best_device()

print(f"PyTorch version: {torch.__version__}")
print(datasets.__version__)
print(f"device: {device}")
# print("HF_HOME:", os.environ.get("HF_HOME"))

PyTorch version: 2.2.2
2.19.0
device: mps


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer


class IMDB_Dataset:
    model_id = "distilbert-base-uncased"
    splits = ["train", "test"]
    limit_data = 1000
    dataset_id = "imdb"

    def __init__(self):
        # self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_id)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)

        # imdb has: 'train', 'test', 'unsupervised'
        self.dataset = load_dataset(self.dataset_id)
        del self.dataset["unsupervised"]
        print(self.dataset)

        # Load the train and test splits of the imdb dataset, 25k train, 25k test
        # self.dataset = {split: ds for split, ds in zip(self.splits, load_dataset(self.dataset_id, split=self.splits))}
        # {'train': Dataset({features: ['text', 'label'], num_rows: 25000}), 'test': Dataset({features: ['text', 'label'], num_rows: 25000})}

        for split in self.splits:
            self.dataset[split] = (
                self.dataset[split].shuffle(seed=42).select(range(self.limit_data))
            )

        # self.dataset = load_dataset("imdb")
        # print(self.dataset)

    def tokenize_function(self, examples):
        # return self.tokenizer(examples["text"], padding="max_length", truncation=True)
        return self.tokenizer(examples["text"], max_length=512, truncation=True)

    def prepare_data(self):
        """Tokenizing 100,000 reviews take some time (~3 minutes)."""
        # self.tokenized_datasets = self.dataset.map(self.tokenize_function, batched=True)

        self.tokenized_ds = {}
        for split in self.splits:
            self.tokenized_ds[split] = self.dataset[split].map(
                self.tokenize_function, batched=True
            )


imdb_dataset = IMDB_Dataset()
imdb_dataset.prepare_data()
print(imdb_dataset.tokenizer)
print(imdb_dataset.dataset)
print(imdb_dataset.tokenized_ds)

print(imdb_dataset.tokenized_ds["train"][0]["input_ids"])

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_row

In [3]:
# Define IMDBTrainer class, loading distilbert-base-uncased
# and imdb dataset for fine-tuning task
# from transformers import DistilBertForSequenceClassification
# from transformers import DistilBertTokenizer

from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import numpy as np


class IMDB_Finetune:
    """
    distilbert-base-uncased size: ~268 MB
    """

    model_id = "distilbert-base-uncased"
    output_dir = "/tmp/MAI_IMDBTrainer"
    splits = ["train", "test"]
    dataset_id = "imdb"

    def __init__(self):
        # self.model = DistilBertForSequenceClassification.from_pretrained(
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_id,
            num_labels=2,
            id2label={0: "NEGATIVE", 1: "POSITIVE"},
            label2id={"NEGATIVE": 0, "POSITIVE": 1},
        )
        print(device)
        self.model = self.model.to(device)

        # Freeze all the parameters of the base model
        for param in self.model.base_model.parameters():
            param.require_grad = False

    def compute_metrics(self, eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return {"accuracy": (predictions == labels).mean()}

    def train(self, tokenizer, tokenized_ds):
        training_args = TrainingArguments(
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            output_dir=self.output_dir,
            learning_rate=0.0001,
            num_train_epochs=5,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
        )
        print('training_args.device', training_args.device)
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_ds["train"],
            eval_dataset=tokenized_ds["test"],
            tokenizer=tokenizer,
            data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
            compute_metrics=self.compute_metrics,
        )
        self.trainer.train()

    def evaluate(self):
        return self.trainer.evaluate()

    def predict(self, x):
        return self.trainer.predict(x)


imdb_trainer = IMDB_Finetune()
# print(imdb_trainer.model)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mps


In [4]:
imdb_trainer.train(imdb_dataset.tokenizer, imdb_dataset.tokenized_ds)

mps
training_args.device mps


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.40268170833587646, 'eval_accuracy': 0.828, 'eval_runtime': 16.4133, 'eval_samples_per_second': 60.926, 'eval_steps_per_second': 7.616, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.5862114429473877, 'eval_accuracy': 0.837, 'eval_runtime': 10.558, 'eval_samples_per_second': 94.715, 'eval_steps_per_second': 11.839, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.8641316890716553, 'eval_accuracy': 0.817, 'eval_runtime': 10.6572, 'eval_samples_per_second': 93.834, 'eval_steps_per_second': 11.729, 'epoch': 3.0}
{'loss': 0.2812, 'grad_norm': 0.022829927504062653, 'learning_rate': 2e-05, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.9444220066070557, 'eval_accuracy': 0.826, 'eval_runtime': 10.6516, 'eval_samples_per_second': 93.883, 'eval_steps_per_second': 11.735, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.8869861960411072, 'eval_accuracy': 0.835, 'eval_runtime': 10.6388, 'eval_samples_per_second': 93.995, 'eval_steps_per_second': 11.749, 'epoch': 5.0}
{'train_runtime': 290.0418, 'train_samples_per_second': 17.239, 'train_steps_per_second': 2.155, 'train_loss': 0.23139438858032227, 'epoch': 5.0}


In [5]:
imdb_trainer.evaluate()

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.40268170833587646,
 'eval_accuracy': 0.828,
 'eval_runtime': 10.9041,
 'eval_samples_per_second': 91.709,
 'eval_steps_per_second': 11.464,
 'epoch': 5.0}

In [11]:
import pandas as pd

# Select specific items for manual review from the test split of the tokenized dataset
test_ds = imdb_dataset.tokenized_ds["test"]
items_for_manual_review = test_ds.select([0, 1, 2, 10, 50, 100])

# Generate predictions for the selected items using the trained model
results = imdb_trainer.predict(items_for_manual_review)

# Create a DataFrame with the original text messages, predicted labels, and actual labels
df = pd.DataFrame(
    {
        "text": [
            item["text"] for item in items_for_manual_review
        ],  # Extract the text messages
        "predictions": results.predictions.argmax(
            axis=1
        ),  # Convert softmax outputs to predicted label indices
        "labels": results.label_ids,  # Actual labels
    }
)

# Configure pandas to display the full text of messages in the DataFrame output
pd.set_option("display.max_colwidth", None)

# Display the DataFrame
df

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,predictions,labels
0,"<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, however, I realized that this story was about A Thousand Other Things besides just Acres. I started crying and couldn't stop until long after the movie ended. Thank you Jane, Laura and Jocelyn, for bringing us such a wonderfully subtle and compassionate movie! Thank you cast, for being involved and portraying the characters with such depth and gentleness!<br /><br />I recognized the Angry sister; the Runaway sister and the sister in Denial. I recognized the Abusive Husband and why he was there and then the Father, oh oh the Father... all superbly played. I also recognized myself and this movie was an eye-opener, a relief, a chance to face my OWN truth and finally doing something about it. I truly hope A Thousand Acres has had the same effect on some others out there.<br /><br />Since I didn't understand why the cover said the film was about sisters fighting over land -they weren't fighting each other at all- I watched it a second time. Then I was able to see that if one hadn't lived a similar story, one would easily miss the overwhelming undercurrent of dread and fear and the deep bond between the sisters that runs through it all. That is exactly the reason why people in general often overlook the truth about their neighbors for instance.<br /><br />But yet another reason why this movie is so perfect!<br /><br />I don't give a rat's ass (pardon my French) about to what extend the King Lear story is followed. All I know is that I can honestly say: this movie has changed my life.<br /><br />Keep up the good work guys, you CAN and DO make a difference.<br /><br />",1,1
1,"This is the latest entry in the long series of films with the French agent, O.S.S. 117 (the French answer to James Bond). The series was launched in the early 1950's, and spawned at least eight films (none of which was ever released in the U.S.). 'O.S.S.117:Cairo,Nest Of Spies' is a breezy little comedy that should not...repeat NOT, be taken too seriously. Our protagonist finds himself in the middle of a spy chase in Egypt (with Morroco doing stand in for Egypt) to find out about a long lost friend. What follows is the standard James Bond/Inspector Cloussou kind of antics. Although our man is something of an overt xenophobe,sexist,homophobe, it's treated as pure farce (as I said, don't take it too seriously). Although there is a bit of rough language & cartoon violence, it's basically okay for older kids (ages 12 & up). As previously stated in the subject line, just sit back,pass the popcorn & just enjoy.",1,1
2,"This movie was so frustrating. Everything seemed energetic and I was totally prepared to have a good time. I at least thought I'd be able to stand it. But, I was wrong. First, the weird looping? It was like watching ""America's Funniest Home Videos"". The damn parents. I hated them so much. The stereo-typical Latino family? I need to speak with the person responsible for this. We need to have a talk. That little girl who was always hanging on someone? I just hated her and had to mention it. Now, the final scene transcends, I must say. It's so gloriously bad and full of badness that it is a movie of its own. What crappy dancing. Horrible and beautiful at once.",0,0
3,"If you're a fan of Turkish and Middle Eastern music, you're in great luck. This film is a documentary of current music in Istanbul, spanning the traditional to the modern. It's very good. You could not do better if you went to Istanbul yourself. We get interviews with Orhan Gencebay, concert clips of modern musical icons, a road show with a Romani (Gypsy) audience, Turkish Hip Hop (surprisingly very very good), and much much more. Some of the best female vocalists I've ever heard. A Kurdish woman singing in a hamam (steam bath) who will rip your heart out. Lots of social and political background. If this is your thing, you'll have a grand time. I could barely sit still in the theatre.<br /><br />CD soundtrack now available on amazon. Pricey.",1,1
4,"Fantastic Chaplin movie with many memorable moments as Charlie joins the army to fight in WW 1.<br /><br />At first he goes to boot camp, where he has to learn how to handle his rifle and how to walk in line. That's a really funny scene as the tramp is not used to keeping his feet straight!<br /><br />Next thing you know he's in France in a trench. Hilarious scenes here include a starving Charlie eating the cheese of a mousetrap and reading a letter from home over someone's shoulder.<br /><br />When Charlie goes to sleep he finds his bunker all flooded and his roommate snoring. This is such a funny part! I can't really describe it, just watch the movie. When Charlie wakes up his legs feel numb so he tries to 'wake them up'. It had me rolling on the floor when it turns out his second leg still feels numb... while Charlie actually rubs his roommate's foot!<br /><br />The movie then turns a bit grim, as Charlie shoots a couple of Germans from his trench (although it's done in a very funny way) and him personating as a tree to get close to the enemy, saving a friend of his from a death squad.<br /><br />Last part is him getting a french girl in trouble by hiding in her house. He then has to save her and while doing so he captures the german kaiser as well. To do so he impersonates a german kolonel or something. I love it when Charlie is asked something in german and he's like nein nein nein. The soldier looks at him in a funny way so Charlie changes his mind: ja ja ja! The kaiser gets captured and Charlie is the hero... but then he wakes up again in bootcamp, it was just a sweet dream!<br /><br />Charlie did one of those 'dream-sequences' before (The Bank comes to mind) but who cares, this movie was so funny it had me laughing all the way. Chaplin also has something to say with this movie (as his later work became more of a social comment to several mishaps in the world) and is explained best in the last sentence of the movie: 'Peace on earth, good will to all mankind.'<br /><br />In short: a Charlie classic, very funny, timeless. 9/10.",1,1
5,"I wish I would have read more reviews and more opinions about this movie before I rented it. A waste of money. A waste of time. Very little dialog. The dialog was hard to understand in every way. The storyline and plot were both weak. The only thing that was nice at all was the cinematography.<br /><br />The characters were interesting. At the same time you will spend so much time trying to figure things out, because of the lack of dialog, that you will be rewinding the movie a lot. <br /><br />Do not watch this movie. It was a mess and will leave you feeling like a mess.<br /><br />You will say, what the heck was that, when the movie ends?",0,0
