<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/Basic_Transformer_Scratch_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing datasets lib:

In [3]:
!pip -q install datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import random

# Basic Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)  # (1, max_len, d_model)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)


# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_k = d_model // n_heads
        self.n_heads = n_heads
        self.qkv_proj = nn.Linear(d_model, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x):
        batch_size, seq_len, d_model = x.size()
        qkv = self.qkv_proj(x)  # (B, S, 3*d_model)
        qkv = qkv.view(batch_size, seq_len, self.n_heads, 3 * self.d_k).transpose(1, 2)  # (B, H, S, 3*d_k)
        q, k, v = qkv.chunk(3, dim=-1)

        scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)  # (B, H, S, S)
        attn = scores.softmax(dim=-1)
        context = attn @ v  # (B, H, S, d_k)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        return self.out_proj(context)


# Feed Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

    def forward(self, x):
        return self.net(x)


# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        x = self.norm1(x + self.attn(x))
        x = self.norm2(x + self.ffn(x))
        return x


# Transformer Encoder
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, max_len=100):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)
        ])
        self.classifier = nn.Linear(d_model, 1)  # Binary classification

    def forward(self, x):
        x = self.embed(x)
        x = self.pe(x)
        for layer in self.layers:
            x = layer(x)
        x = x.mean(dim=1)  # Global average pooling
        return torch.sigmoid(self.classifier(x)).squeeze(-1)

In [5]:
# === Save as train_sentiment_transformer.py ===
import torch
import torch.nn as nn
import math
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

# Hyperparameters
MAX_LEN = 64
BATCH_SIZE = 32
D_MODEL = 64
N_LAYERS = 2
N_HEADS = 4
D_FF = 128
EPOCHS = 5
MODEL_PATH = "best_transformer.pt"
VOCAB_SIZE = 30522  # For BERT tokenizer

# Define Positional Encoding, Attention, FeedForward, Transformer Classes...
# [Same code as previous message, omitted here for brevity]
# Use the definitions of: PositionalEncoding, MultiHeadAttention, FeedForward, EncoderLayer, TransformerEncoder

# Load IMDb dataset and tokenize
dataset = load_dataset("imdb", split="train[:5000]+test[:2000]")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(example):
    tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=MAX_LEN)
    return {"input_ids": tokens["input_ids"], "label": example["label"]}

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "label"])
train_set = dataset.select(range(5000))
val_set = dataset.select(range(5000, 7000))

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerEncoder(VOCAB_SIZE, D_MODEL, N_LAYERS, N_HEADS, D_FF, MAX_LEN).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCELoss()

best_val_loss = float("inf")
for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        x = batch["input_ids"].to(device)
        y = batch["label"].float().to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            x = batch["input_ids"].to(device)
            y = batch["label"].float().to(device)
            val_loss += loss_fn(model(x), y).item()
    val_loss /= len(val_loader)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), MODEL_PATH)
        print(f"Epoch {epoch+1}: ✅ Saved model (Val Loss = {val_loss:.4f})")
    else:
        print(f"Epoch {epoch+1}: Val Loss = {val_loss:.5f}")


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Epoch 1: ✅ Saved model (Val Loss = 0.0008)
Epoch 2: ✅ Saved model (Val Loss = 0.0003)
Epoch 3: ✅ Saved model (Val Loss = 0.0002)
Epoch 4: ✅ Saved model (Val Loss = 0.0001)
Epoch 5: ✅ Saved model (Val Loss = 0.0001)


Inference:

In [7]:
# === Save as inference_sentiment.py ===
import torch
from transformers import AutoTokenizer
# from train_sentiment_transformer import TransformerEncoder, MAX_LEN, D_MODEL, N_LAYERS, N_HEADS, D_FF, VOCAB_SIZE

def load_model(path="best_transformer.pt"):
    model = TransformerEncoder(VOCAB_SIZE, D_MODEL, N_LAYERS, N_HEADS, D_FF, MAX_LEN)
    model.load_state_dict(torch.load(path, map_location=torch.device("cpu")))
    model.eval()
    return model

def predict_sentiment(text, model, tokenizer):
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt")
    with torch.no_grad():
        pred = model(tokens["input_ids"])
        return "Positive" if pred.item() > 0.5 else "Negative"

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = load_model("best_transformer.pt")
    example = "This movie was surprisingly enjoyable and well-acted!"
    print(f"Prediction: {predict_sentiment(example, model, tokenizer)}")


Prediction: Negative


Inference on LMDB test set:

In [10]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

# Load the IMDb test split
test_dataset = load_dataset("imdb", split="test[:1000]")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_test = test_dataset.map(preprocess_function, batched=True)



# Set format for PyTorch
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

test_dataloader = DataLoader(tokenized_test, batch_size=32)


model = TransformerEncoder(vocab_size=30522, d_model=64, n_layers=2, n_heads=4, d_ff=128, max_len=512)
model.load_state_dict(torch.load("best_transformer.pt"))
model.eval()

correct = 0
total = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label'].float()

        outputs = model(input_ids)
        predictions = (outputs > 0.5).float()
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Test Accuracy: 100.00%
