# Project: IMDb movie review sentiment classification

## 1. Data preparation
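- Dataset: IMDb Reviews (25k train / 25k test; the loaded `DatasetDict` also contains a 50k-row `unsupervised` split that is not used for training or evaluation)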

In [1]:
from datasets import load_dataset

# Load the "imdb" dataset; the bare expression on the last line displays its splits.
dataset = load_dataset(path="imdb")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## 2. Tokenizer

In [3]:
from transformers import DistilBertTokenizer

# Tokenizer matching the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the "text" column of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    examples end up with the same length.

    Args:
        batch: a mapping with a "text" entry holding the raw review strings
            (called with batches of examples because of ``batched=True`` below).

    Returns:
        Whatever ``tokenizer`` returns for those texts; its fields are added
        as new columns by ``dataset.map``.
    """
    texts = batch["text"]
    return tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )


# Apply the tokenizer to every split of the DatasetDict, one batch at a time.
tokenized_dataset = dataset.map(function=tokenize, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

## 3. DataLoader

In [4]:
import torch
from torch.utils.data import DataLoader

# Return only the model inputs and the target, as PyTorch tensors; the other
# columns (e.g. the raw "text") are left out of what the loaders yield.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Training batches are shuffled; the test loader keeps the dataset order.
train_loader = DataLoader(dataset=tokenized_dataset["train"], batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=tokenized_dataset["test"], batch_size=16)

## 4. Define model

In [5]:
from transformers import DistilBertForSequenceClassification

# DistilBERT with a 2-class sequence-classification head. The classifier weights
# are newly initialised (see the warning printed below), so the model has to be
# fine-tuned before its predictions mean anything.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5. Train

In [6]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Fine-tune all model parameters with AdamW at a small learning rate.
optimizer = AdamW(model.parameters(), lr=2e-5)
# NOTE: `loss_fn` is not used by the loop below -- passing `labels=` to the model
# makes it return the loss itself (`outputs.loss`). Kept so the name stays defined.
loss_fn = CrossEntropyLoss()

# Pick the best available accelerator: CUDA GPU first, then Apple-silicon MPS,
# then CPU. `torch.backends.mps` is looked up defensively because older PyTorch
# builds do not ship that backend module.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)
model.to(device)

for epoch in range(2): # 2 epochs first
    model.train()  # training mode on every epoch, even if another cell called eval()
    total_loss = 0
    for batch in train_loader:
        # Move the batch to the same device as the model.
        input_ids = batch["input_ids"].to(device)
        attn_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Supplying `labels` makes the model compute the classification loss.
        outputs = model(input_ids, attention_mask=attn_mask, labels=labels)
        loss = outputs.loss
        # `.item()` yields a plain Python float, so no autograd graph is kept alive.
        total_loss += loss.item()

        # Standard step: clear old gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Mean loss per batch over the epoch.
    print(f"Epoch {epoch+1}, Train Loss: {total_loss/len(train_loader)}")

Using device: mps
Epoch 1, Train Loss: 0.34429277943224385
Epoch 2, Train Loss: 0.21415100438354664


## 6. Test

In [26]:
from sklearn.metrics import accuracy_score

# Put the model in evaluation mode (disables dropout).
model.eval()
preds, labels_all = [], []

# No backward pass follows, so gradient tracking is switched off.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attn_mask = batch["attention_mask"].to(device)
        # Labels are only compared on the host, so copying them to the
        # accelerator would just add a transfer in each direction per batch.
        labels = batch["label"]

        # No `labels=` here: only the logits are needed for prediction.
        outputs = model(input_ids, attention_mask=attn_mask)
        # Predicted class = index of the largest logit along the last dimension.
        predictions = torch.argmax(outputs.logits, dim=-1)

        preds.extend(predictions.cpu().numpy())
        # `.cpu()` is a no-op for tensors already on the host.
        labels_all.extend(labels.cpu().numpy())

print("Test Accuracy:", accuracy_score(labels_all, preds))

Test Accuracy: 0.87992


## 7. Inference on a custom input

In [27]:
# Classify one piece of free text with the fine-tuned model.
# text = "This movie was fantastic! Highly recommend."
text = "terri"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

# Set evaluation mode here as well, so the cell does not depend on an earlier
# cell having called `model.eval()`, and skip gradient tracking because no
# backward pass follows.
model.eval()
with torch.no_grad():
    output = model(**inputs)

# Class index with the largest logit; index 1 is reported as "Positive".
pred = torch.argmax(output.logits, dim=-1)
print("Predicted label:", "Positive" if pred.item() == 1 else "Negative")

Predicted label: Positive
