In [None]:
!pip install torch torchvision transformers datasets matplotlib pandas scikit-learn wordcloud
!pip install --upgrade --force-reinstall fsspec datasets

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from datasets import load_dataset
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
import random

# Apply one Matplotlib style sheet globally so every figure drawn below
# shares the same look.
plt.style.use('seaborn-v0_8-ticks')

In [None]:
def get_device():
    """Select the compute device for the model and report the choice.

    Returns:
        torch.device: ``cuda`` when a CUDA device is available, ``cpu`` otherwise.
    """
    if torch.cuda.is_available():
        chosen = torch.device('cuda')
    else:
        chosen = torch.device('cpu')
    print("Using device:", chosen)
    return chosen

In [None]:
def subsample_balanced(df, n_per_class, random_state=42):
    """Draw the same number of rows from every class of ``df``.

    Groups are iterated explicitly instead of going through
    ``groupby(...).apply(...)``: pandas 2.2 deprecates ``apply`` operating on
    the grouping column, and once that column is excluded the result would no
    longer contain ``'label'``. Iterating keeps every column and draws exactly
    the same rows (each class is sampled with the same ``random_state``).

    Args:
        df: DataFrame with a ``'label'`` column identifying the class.
        n_per_class: Number of rows to keep for each distinct label.
        random_state: Seed passed to ``DataFrame.sample`` for every class.

    Returns:
        A new DataFrame with ``n_per_class`` rows per label, ordered by label,
        with a fresh 0..N-1 index. ``df`` itself is not modified.

    Raises:
        KeyError: If ``df`` has no ``'label'`` column.
        ValueError: If any class has fewer than ``n_per_class`` rows.
    """
    sampled_groups = []
    for label_value, group in df.groupby('label'):
        if len(group) < n_per_class:
            raise ValueError(
                f"Class {label_value!r} has only {len(group)} rows; "
                f"cannot sample {n_per_class} without replacement."
            )
        sampled_groups.append(group.sample(n=n_per_class, random_state=random_state))

    if not sampled_groups:
        # No classes at all (empty input): return an empty frame with the
        # same columns rather than letting pd.concat fail on an empty list.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_groups).reset_index(drop=True)

In [None]:
def plot_class_distribution(df, colors):
    """Draw a bar chart of how many rows each label has.

    Bars are ordered by label value (0, 1, ...) rather than by frequency, so
    that the positional ``colors`` list and the "0=Neg, 1=Pos" axis caption
    always refer to the same class, even when the classes are not balanced.

    Args:
        df: DataFrame with a ``'label'`` column.
        colors: Bar colours, one per label, in ascending label order.
    """
    # value_counts() sorts by count; sort_index() restores label order.
    class_counts = df['label'].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(5, 3))
    class_counts.plot(kind='bar', color=colors, edgecolor='#18020c', ax=ax)
    ax.set_title('IMDB Class Balance', fontsize=13, color='#560bad')
    ax.set_xlabel('Sentiment (0=Neg, 1=Pos)')
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()

In [None]:

def plot_length_histogram(df, color):
    """Plot a histogram of review lengths measured in whitespace-separated words.

    The word counts are kept in a local Series; the caller's DataFrame is not
    modified (no ``num_words`` column is written back as a side effect).

    Args:
        df: DataFrame with a ``'text'`` column of strings.
        color: Fill colour for the histogram bars.
    """
    word_counts = df['text'].map(lambda text: len(text.split()))

    fig, ax = plt.subplots(figsize=(6, 2.8))
    ax.hist(word_counts, bins=30, color=color, edgecolor='#22223b', alpha=0.85)
    ax.set_title('Review Word Count Distribution', fontsize=11, color='#fb8b24')
    ax.set_xlabel('Words')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

In [None]:
def plot_wordcloud(df, label, cmap, title, tcolor):
    """Render a word cloud built from every review carrying ``label``.

    Args:
        df: DataFrame with ``'label'`` and ``'text'`` columns.
        label: Label value whose reviews are included.
        cmap: Matplotlib colormap name used to colour the words.
        title: Figure title.
        tcolor: Colour of the title text.
    """
    # Concatenate all matching reviews into one space-separated string.
    matches_label = df['label'] == label
    corpus = " ".join(df.loc[matches_label, 'text'])

    cloud = WordCloud(
        width=700,
        height=200,
        background_color='white',
        colormap=cmap,
    )
    cloud_image = cloud.generate(corpus)

    plt.figure(figsize=(7, 2.6))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=12, color=tcolor)
    plt.tight_layout()
    plt.show()

In [None]:
class IMDBBertDataset(Dataset):
    """Map-style dataset that tokenizes one review per ``__getitem__`` call.

    Defined at module level (not inside ``get_bert_dataset``) so that every
    dataset shares one class and instances are not local classes, which
    cannot be pickled for DataLoader worker processes.
    """

    def __init__(self, texts, labels, tokenizer, maxlen):
        """Store the raw inputs; tokenization happens lazily per item.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})."
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.maxlen = maxlen

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label.

        Returns:
            dict: every tensor produced by the tokenizer with its leading
            batch dimension removed, plus ``'labels'`` as a ``torch.long``
            scalar tensor.
        """
        enc = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.maxlen,
            return_tensors='pt'
        )
        # return_tensors='pt' yields tensors with a leading batch axis of
        # size 1; drop it so the DataLoader can stack items into batches.
        item = {key: value.squeeze(0) for key, value in enc.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


def get_bert_dataset(texts, labels, tokenizer, maxlen):
    """Build an ``IMDBBertDataset`` over parallel sequences of texts and labels.

    Args:
        texts: Indexable sequence of reviews (each is passed through ``str``).
        labels: Indexable sequence of integer class labels, same length as ``texts``.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping of tensors.
        maxlen: Length every example is padded or truncated to.

    Returns:
        IMDBBertDataset: dataset yielding one dict of tensors per example.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    return IMDBBertDataset(texts, labels, tokenizer, maxlen)


In [None]:
def train_bert(model, loader, optimizer, device, epochs):
    """Run the training loop and record the mean loss of every epoch.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels`` whose output exposes a ``.loss`` tensor.
        loader: Sized iterable of batches (dicts holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors).
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        device: Device each batch tensor is moved to.
        epochs: Number of passes over ``loader``.

    Returns:
        list: mean batch loss for each epoch, in order.
    """
    model.train()
    epoch_means = []
    tensor_keys = ('input_ids', 'attention_mask', 'labels')

    for epoch_no in range(1, epochs + 1):
        batch_losses = []
        for batch in loader:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in tensor_keys
            )
            optimizer.zero_grad()
            loss = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            ).loss
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())

        mean_loss = sum(batch_losses) / len(loader)
        epoch_means.append(mean_loss)
        print(f"Epoch {epoch_no}/{epochs} - Loss: {mean_loss:.4f}")

    return epoch_means

In [None]:
def plot_training_loss(loss_history, color):
    """Plot the per-epoch training loss as a line with square markers.

    Args:
        loss_history: Sequence of loss values, one per epoch.
        color: Line and marker colour.
    """
    # Epochs are numbered from 1 on the x axis.
    epoch_numbers = range(1, len(loss_history) + 1)

    figure = plt.figure(figsize=(6, 3.2))
    axes = figure.gca()
    axes.plot(epoch_numbers, loss_history, marker='s', color=color, linewidth=2)
    axes.set_title('Training Loss Curve', fontsize=13, color='#00b4d8')
    axes.set_xlabel('Epoch')
    axes.set_ylabel('Loss')
    plt.tight_layout()
    plt.show()


In [None]:
def evaluate_bert(model, loader, device):
    """Run the model over ``loader`` without gradients and collect predictions.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes a ``.logits`` tensor.
        loader: Iterable of batches (dicts holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors).
        device: Device each batch tensor is moved to.

    Returns:
        tuple: ``(true_labels, predicted_labels)`` as NumPy arrays, in the
        order the batches were yielded.
    """
    model.eval()
    truths, predictions = [], []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # The predicted class is the index of the largest logit per row.
            predicted = logits.argmax(dim=1)

            predictions.extend(predicted.cpu().numpy())
            truths.extend(gold.cpu().numpy())

    return np.array(truths), np.array(predictions)


In [None]:
def plot_cm(true_labels, pred_labels, class_names, cmap):
    """Draw the confusion matrix of predictions against true labels.

    Args:
        true_labels: Ground-truth class labels.
        pred_labels: Predicted class labels, aligned with ``true_labels``.
        class_names: Tick labels shown for the classes.
        cmap: Matplotlib colormap used for the matrix cells.
    """
    matrix = confusion_matrix(true_labels, pred_labels)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_names)

    fig, axis = plt.subplots(figsize=(5.5, 4.5))
    display.plot(cmap=cmap, ax=axis, colorbar=True)

    # Restyle the title, axis captions and tick labels after the display
    # object has drawn onto the axes.
    axis_caption_style = dict(fontsize=12, color='#fb8b24')
    plt.title('IMDB Confusion Matrix', fontsize=14, color='#3c096c')
    plt.xlabel('Predicted', **axis_caption_style)
    plt.ylabel('True', **axis_caption_style)
    for restyle_ticks in (plt.xticks, plt.yticks):
        restyle_ticks(fontsize=11, color='#212529')

    plt.grid(False)
    plt.tight_layout()
    plt.show()


In [None]:
def print_random_preds(test_texts, test_labels, pred_labels, n=5):
    """Print a random handful of reviews with their true and predicted sentiment.

    Args:
        test_texts: Sequence of review strings.
        test_labels: True labels aligned with ``test_texts`` (1 = positive).
        pred_labels: Predicted labels aligned with ``test_texts`` (1 = positive).
        n: Number of reviews to show. If fewer than ``n`` reviews exist, all
            of them are shown instead of raising ``ValueError``.
    """
    def _sentiment(value):
        # Any label other than 1 is reported as negative.
        return 'Positive' if value == 1 else 'Negative'

    snippet_len = 90
    # random.sample rejects a sample larger than the population, so clamp.
    sample_size = min(n, len(test_texts))
    idxs = random.sample(range(len(test_texts)), sample_size)

    print("\nRandom sample predictions:")
    for idx in idxs:
        review = test_texts[idx]
        snippet = review[:snippet_len].replace('\n', ' ')
        # Only mark the snippet as shortened when text was actually cut off.
        ellipsis = '...' if len(review) > snippet_len else ''
        print(f"\nReview: {snippet}{ellipsis}")
        print(f"True: {_sentiment(test_labels[idx])} | Predicted: {_sentiment(pred_labels[idx])}")


In [None]:
# --- MAIN WORKFLOW ---
def main(n_train_per_class=3000, n_test_per_class=400, max_len=128,
         batch_size=12, learning_rate=2e-5, epochs=10, seed=42):
    """Run the full pipeline: load IMDB, explore, fine-tune BERT, evaluate.

    All defaults equal the values that were previously hard-coded, so calling
    ``main()`` with no arguments runs the same experiment as before. The only
    addition is RNG seeding, which makes shuffling, classifier-head
    initialisation and the printed random samples repeatable between runs.

    Args:
        n_train_per_class: Training reviews kept per sentiment class.
        n_test_per_class: Test reviews kept per sentiment class.
        max_len: Token length every review is padded or truncated to.
        batch_size: Batch size for both the train and the test loader.
        learning_rate: AdamW learning rate.
        epochs: Number of training epochs.
        seed: Seed applied to ``random``, NumPy and PyTorch.
    """
    # Reproducibility: seed every RNG the pipeline touches.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Device
    device = get_device()

    # Data
    imdb_raw = load_dataset("imdb")
    reviews_train_df = pd.DataFrame(imdb_raw['train'])
    reviews_test_df = pd.DataFrame(imdb_raw['test'])

    # Balanced, small subsample for speed
    reviews_train_df = subsample_balanced(reviews_train_df, n_train_per_class)
    reviews_test_df = subsample_balanced(reviews_test_df, n_test_per_class)

    # Visualization
    plot_class_distribution(reviews_train_df, ['#5fa8d3', '#f07167'])
    plot_length_histogram(reviews_train_df, '#06d6a0')
    plot_wordcloud(reviews_train_df, 0, 'cool', 'Word Cloud: Negative Reviews', '#012a4a')
    plot_wordcloud(reviews_train_df, 1, 'autumn', 'Word Cloud: Positive Reviews', '#ae2012')

    # Tokenizer and datasets
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    train_texts = reviews_train_df['text'].tolist()
    train_labels = reviews_train_df['label'].tolist()
    test_texts = reviews_test_df['text'].tolist()
    test_labels = reviews_test_df['label'].tolist()

    train_dataset = get_bert_dataset(train_texts, train_labels, tokenizer, max_len)
    test_dataset = get_bert_dataset(test_texts, test_labels, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # The test loader is deliberately not shuffled: predictions must stay
    # aligned with the row order of reviews_test_df for the sample printout.
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Model, optimizer
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Training
    loss_history = train_bert(model, train_loader, optimizer, device, epochs=epochs)
    plot_training_loss(loss_history, '#f07167')

    # Evaluation
    y_true, y_pred = evaluate_bert(model, test_loader, device)
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"\nTest Accuracy: {acc:.4f}")
    print(f"Test F1 Score: {f1:.4f}")

    # Confusion Matrix
    plot_cm(y_true, y_pred, ['Negative', 'Positive'], cmap='YlGnBu')

    # Print 5 random predictions
    print_random_preds(test_texts, test_labels, y_pred, n=5)

# Execute the whole workflow (data loading, plots, training, evaluation)
# when this cell runs.
main()