<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/Transformer_Text_Classification_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Transformer with Sinusoidal Positional Encoding

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
import numpy as np

# Custom Transformer Encoder Layer with Positional Encoding: Sinusoidal Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=128):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-np.log(10000.0) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :].to(x.device)

# Transformer Model for Text Classification with Positional Encoding
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim, max_len)
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, dropout=0.1) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids) * attention_mask.unsqueeze(-1)
        embedded = self.positional_encoding(embedded)
        for layer in self.encoder_layers:
            embedded = layer(embedded)
        pooled_output = embedded.mean(dim=1)
        return self.fc(pooled_output)

# Sample dataset (Replace with real dataset)
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

def training_per_epoch(model, train_dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # if epoch%100 == 0:
    #     print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")
    return total_loss / len(train_dataloader)

def validating_per_epoch(model, train_dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total

# Data Preparation
train_texts = ["This is a great movie!", "Terrible film, I hated it.", "Pretty decent, I liked it.",
               "Absolutely fantastic!", "Worst movie ever.", "I loved every second.",
               "Not my cup of tea.", "Brilliant and engaging!", "Disappointing.", "Superb storyline."]
train_labels = [1, 0, 1, 1, 0, 1, 0, 1, 0, 1]

test_texts = ["Amazing experience!", "Could have been better.", "Loved the characters.",
              "Too predictable.", "A masterpiece!", "Boring and dull.", "Would watch again!",
              "Not worth my time.", "Excellent direction.", "Bad acting."]
test_labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=False)

# Model Initialization
vocab_size = tokenizer.vocab_size
embed_dim = 128
num_heads = 4
hidden_dim = 256
num_layers = 2
num_classes = 2
max_len = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# Training Loop
epochs = 1000
for epoch in range(epochs):
    training_loss = training_per_epoch(model, train_dataloader, criterion, optimizer)
    validation_acc = validating_per_epoch(model, train_dataloader, criterion)
    if epoch%50 == 0:
        print(f"Epoch {epoch + 1}, Training Loss: {training_loss:.4f}, Validation Acc:{validation_acc:.4f}")



Epoch 1, Training Loss: 0.7570, Validation Acc:0.5000
Epoch 51, Training Loss: 0.6447, Validation Acc:0.5000
Epoch 101, Training Loss: 0.5150, Validation Acc:0.5000
Epoch 151, Training Loss: 0.4650, Validation Acc:0.6000
Epoch 201, Training Loss: 0.4335, Validation Acc:0.5000
Epoch 251, Training Loss: 0.4050, Validation Acc:0.5000
Epoch 301, Training Loss: 0.3940, Validation Acc:0.5000
Epoch 351, Training Loss: 0.3540, Validation Acc:0.5000
Epoch 401, Training Loss: 0.3221, Validation Acc:0.6000
Epoch 451, Training Loss: 0.2965, Validation Acc:0.6000
Epoch 501, Training Loss: 0.2746, Validation Acc:0.7000
Epoch 551, Training Loss: 0.2532, Validation Acc:0.7000
Epoch 601, Training Loss: 0.2351, Validation Acc:0.7000
Epoch 651, Training Loss: 0.2202, Validation Acc:0.7000
Epoch 701, Training Loss: 0.2011, Validation Acc:0.7000
Epoch 751, Training Loss: 0.1888, Validation Acc:0.7000
Epoch 801, Training Loss: 0.1769, Validation Acc:0.7000
Epoch 851, Training Loss: 0.1701, Validation Acc:0.

#Transformer with Learned Positional Encoding

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
import numpy as np

# 2. Learned Positional Encoding
class LearnedPositionalEncoding(nn.Module):
    def __init__(self, max_len, embed_dim):
        super(LearnedPositionalEncoding, self).__init__()
        self.position_embeddings = nn.Embedding(max_len, embed_dim)

    def forward(self, x):
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
        return x + self.position_embeddings(positions)

# Transformer Model for Text Classification with Positional Encoding
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = LearnedPositionalEncoding(embed_dim, max_len)
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, dropout=0.1) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids) * attention_mask.unsqueeze(-1)
        embedded = self.positional_encoding(embedded)
        for layer in self.encoder_layers:
            embedded = layer(embedded)
        pooled_output = embedded.mean(dim=1)
        return self.fc(pooled_output)

# Sample dataset (Replace with real dataset)
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

def training_per_epoch(model, train_dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # if epoch%100 == 0:
    #     print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")
    return total_loss / len(train_dataloader)

def validating_per_epoch(model, train_dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total

# Data Preparation
train_texts = ["This is a great movie!", "Terrible film, I hated it.", "Pretty decent, I liked it.",
               "Absolutely fantastic!", "Worst movie ever.", "I loved every second.",
               "Not my cup of tea.", "Brilliant and engaging!", "Disappointing.", "Superb storyline."]
train_labels = [1, 0, 1, 1, 0, 1, 0, 1, 0, 1]

test_texts = ["Amazing experience!", "Could have been better.", "Loved the characters.",
              "Too predictable.", "A masterpiece!", "Boring and dull.", "Would watch again!",
              "Not worth my time.", "Excellent direction.", "Bad acting."]
test_labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=False)

# Model Initialization
vocab_size = tokenizer.vocab_size
embed_dim = 128
num_heads = 4
hidden_dim = 256
num_layers = 2
num_classes = 2
max_len = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# Training Loop
epochs = 1000
for epoch in range(epochs):
    training_loss = training_per_epoch(model, train_dataloader, criterion, optimizer)
    validation_acc = validating_per_epoch(model, train_dataloader, criterion)
    if epoch%50 == 0:
        print(f"Epoch {epoch + 1}, Training Loss: {training_loss:.4f}, Validation Acc:{validation_acc:.4f}")



Epoch 1, Training Loss: 0.7072, Validation Acc:0.5000
Epoch 51, Training Loss: 0.6503, Validation Acc:0.5000
Epoch 101, Training Loss: 0.4812, Validation Acc:0.5000
Epoch 151, Training Loss: 0.4222, Validation Acc:0.5000
Epoch 201, Training Loss: 0.3781, Validation Acc:0.5000
Epoch 251, Training Loss: 0.3943, Validation Acc:0.5000
Epoch 301, Training Loss: 0.3273, Validation Acc:0.5000
Epoch 351, Training Loss: 0.2745, Validation Acc:0.5000
Epoch 401, Training Loss: 0.2368, Validation Acc:0.6000
Epoch 451, Training Loss: 0.2299, Validation Acc:0.6000
Epoch 501, Training Loss: 0.2109, Validation Acc:0.5000
Epoch 551, Training Loss: 0.2049, Validation Acc:0.7000
Epoch 601, Training Loss: 0.1761, Validation Acc:0.5000
Epoch 651, Training Loss: 0.1623, Validation Acc:0.5000
Epoch 701, Training Loss: 0.1512, Validation Acc:0.6000
Epoch 751, Training Loss: 0.1455, Validation Acc:0.6000
Epoch 801, Training Loss: 0.1380, Validation Acc:0.6000
Epoch 851, Training Loss: 0.1350, Validation Acc:0.

#Transformer with RBF Kernel-Based Positional Encoding

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
import numpy as np

# RBF Kernel-Based Positional Encoding
class RBFPositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=128):
        super(RBFPositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        gamma = torch.exp(-position / max_len).squeeze()
        pe = torch.zeros(max_len, embed_dim)
        for i in range(embed_dim):
            pe[:, i] = torch.exp(-((position.squeeze() - i) ** 2) / (2 * gamma ** 2))
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :].to(x.device)

# Transformer Model for Text Classification with Positional Encoding
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = RBFPositionalEncoding(embed_dim, max_len)
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, dropout=0.1) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids) * attention_mask.unsqueeze(-1)
        embedded = self.positional_encoding(embedded)
        for layer in self.encoder_layers:
            embedded = layer(embedded)
        pooled_output = embedded.mean(dim=1)
        return self.fc(pooled_output)

# Sample dataset (Replace with real dataset)
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

def training_per_epoch(model, train_dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # if epoch%100 == 0:
    #     print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")
    return total_loss / len(train_dataloader)

def validating_per_epoch(model, train_dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total

# Data Preparation
train_texts = ["This is a great movie!", "Terrible film, I hated it.", "Pretty decent, I liked it.",
               "Absolutely fantastic!", "Worst movie ever.", "I loved every second.",
               "Not my cup of tea.", "Brilliant and engaging!", "Disappointing.", "Superb storyline."]
train_labels = [1, 0, 1, 1, 0, 1, 0, 1, 0, 1]

test_texts = ["Amazing experience!", "Could have been better.", "Loved the characters.",
              "Too predictable.", "A masterpiece!", "Boring and dull.", "Would watch again!",
              "Not worth my time.", "Excellent direction.", "Bad acting."]
test_labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=False)

# Model Initialization
vocab_size = tokenizer.vocab_size
embed_dim = 128
num_heads = 4
hidden_dim = 256
num_layers = 2
num_classes = 2
max_len = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# Training Loop
epochs = 1000
for epoch in range(epochs):
    training_loss = training_per_epoch(model, train_dataloader, criterion, optimizer)
    validation_acc = validating_per_epoch(model, train_dataloader, criterion)
    if epoch%50 == 0:
        print(f"Epoch {epoch + 1}, Training Loss: {training_loss:.4f}, Validation Acc:{validation_acc:.4f}")



Epoch 1, Training Loss: 0.6860, Validation Acc:0.5000
Epoch 51, Training Loss: 0.6058, Validation Acc:0.5000
Epoch 101, Training Loss: 0.4844, Validation Acc:0.5000
Epoch 151, Training Loss: 0.4339, Validation Acc:0.5000
Epoch 201, Training Loss: 0.3808, Validation Acc:0.5000
Epoch 251, Training Loss: 0.3544, Validation Acc:0.5000
Epoch 301, Training Loss: 0.3244, Validation Acc:0.4000
Epoch 351, Training Loss: 0.2758, Validation Acc:0.5000
Epoch 401, Training Loss: 0.2808, Validation Acc:0.6000
Epoch 451, Training Loss: 0.2351, Validation Acc:0.5000
Epoch 501, Training Loss: 0.1943, Validation Acc:0.5000
Epoch 551, Training Loss: 0.1887, Validation Acc:0.5000
Epoch 601, Training Loss: 0.1919, Validation Acc:0.5000
Epoch 651, Training Loss: 0.1623, Validation Acc:0.5000
Epoch 701, Training Loss: 0.1420, Validation Acc:0.7000
Epoch 751, Training Loss: 0.1387, Validation Acc:0.5000
Epoch 801, Training Loss: 0.1280, Validation Acc:0.5000
Epoch 851, Training Loss: 0.1174, Validation Acc:0.

#Transformer with Rotary Positional Encoding (RoPE)

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
import numpy as np

# 4. Rotary Positional Encoding (RoPE)
class RotaryPositionalEncoding(nn.Module):
    def __init__(self, embed_dim):
        super(RotaryPositionalEncoding, self).__init__()
        self.embed_dim = embed_dim

    def forward(self, x):
        seq_len = x.shape[1]
        theta = torch.exp(torch.arange(0, self.embed_dim, 2, dtype=torch.float, device=x.device) * (-np.log(10000.0) / self.embed_dim))
        theta = theta.unsqueeze(0).unsqueeze(0)
        m = torch.arange(seq_len, dtype=torch.float, device=x.device).unsqueeze(1)
        sin, cos = torch.sin(m * theta), torch.cos(m * theta)
        x_reshaped = x.view(x.shape[0], x.shape[1], -1, 2)
        x_rotated = torch.stack([-x_reshaped[..., 1], x_reshaped[..., 0]], dim=-1)
        x_out = x_reshaped * cos.unsqueeze(-1) + x_rotated * sin.unsqueeze(-1)
        return x_out.view(x.shape)

# Transformer Model for Text Classification with Positional Encoding
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = RotaryPositionalEncoding(embed_dim)
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, dropout=0.1) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids) * attention_mask.unsqueeze(-1)
        embedded = self.positional_encoding(embedded)
        for layer in self.encoder_layers:
            embedded = layer(embedded)
        pooled_output = embedded.mean(dim=1)
        return self.fc(pooled_output)

# Sample dataset (Replace with real dataset)
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

def training_per_epoch(model, train_dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # if epoch%100 == 0:
    #     print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")
    return total_loss / len(train_dataloader)

def validating_per_epoch(model, train_dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total

# Data Preparation
train_texts = ["This is a great movie!", "Terrible film, I hated it.", "Pretty decent, I liked it.",
               "Absolutely fantastic!", "Worst movie ever.", "I loved every second.",
               "Not my cup of tea.", "Brilliant and engaging!", "Disappointing.", "Superb storyline."]
train_labels = [1, 0, 1, 1, 0, 1, 0, 1, 0, 1]

test_texts = ["Amazing experience!", "Could have been better.", "Loved the characters.",
              "Too predictable.", "A masterpiece!", "Boring and dull.", "Would watch again!",
              "Not worth my time.", "Excellent direction.", "Bad acting."]
test_labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=False)

# Model Initialization
vocab_size = tokenizer.vocab_size
embed_dim = 128
num_heads = 4
hidden_dim = 256
num_layers = 2
num_classes = 2
max_len = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# Training Loop
epochs = 1000
for epoch in range(epochs):
    training_loss = training_per_epoch(model, train_dataloader, criterion, optimizer)
    validation_acc = validating_per_epoch(model, train_dataloader, criterion)
    if epoch%50 == 0:
        print(f"Epoch {epoch + 1}, Training Loss: {training_loss:.4f}, Validation Acc:{validation_acc:.4f}")



Epoch 1, Training Loss: 0.6756, Validation Acc:0.5000
Epoch 51, Training Loss: 0.6522, Validation Acc:0.5000
Epoch 101, Training Loss: 0.5556, Validation Acc:0.5000
Epoch 151, Training Loss: 0.5261, Validation Acc:0.6000
Epoch 201, Training Loss: 0.5151, Validation Acc:0.6000
Epoch 251, Training Loss: 0.4963, Validation Acc:0.6000
Epoch 301, Training Loss: 0.4733, Validation Acc:0.6000
Epoch 351, Training Loss: 0.4523, Validation Acc:0.7000
Epoch 401, Training Loss: 0.4391, Validation Acc:0.5000
Epoch 451, Training Loss: 0.4076, Validation Acc:0.7000
Epoch 501, Training Loss: 0.3945, Validation Acc:0.7000
Epoch 551, Training Loss: 0.3747, Validation Acc:0.7000
Epoch 601, Training Loss: 0.3542, Validation Acc:0.7000
Epoch 651, Training Loss: 0.3297, Validation Acc:0.7000
Epoch 701, Training Loss: 0.3035, Validation Acc:0.8000
Epoch 751, Training Loss: 0.2935, Validation Acc:0.7000
Epoch 801, Training Loss: 0.2826, Validation Acc:0.7000
Epoch 851, Training Loss: 0.2677, Validation Acc:0.

#Transformer with Mixture of Learned and Sinusoidal Encoding

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
import numpy as np


# 1. Sinusoidal Encoding
class SinusoidalPositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=128):
        super(SinusoidalPositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-np.log(10000.0) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :].to(x.device)

# 2. Learned Positional Encoding
class LearnedPositionalEncoding(nn.Module):
    def __init__(self, max_len, embed_dim):
        super(LearnedPositionalEncoding, self).__init__()
        self.position_embeddings = nn.Embedding(max_len, embed_dim)

    def forward(self, x):
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
        return x + self.position_embeddings(positions)

# 5. Mixture of Learned and Sinusoidal Encoding
class HybridPositionalEncoding(nn.Module):
    def __init__(self, max_len, embed_dim):
        super(HybridPositionalEncoding, self).__init__()
        self.sinusoidal_encoding = SinusoidalPositionalEncoding(embed_dim, max_len)
        self.learned_encoding = LearnedPositionalEncoding(max_len, embed_dim)

    def forward(self, x):
        return self.sinusoidal_encoding(x) + self.learned_encoding(x)

# Transformer Model for Text Classification with Positional Encoding
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = HybridPositionalEncoding(embed_dim, max_len)
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, dropout=0.1) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids) * attention_mask.unsqueeze(-1)
        embedded = self.positional_encoding(embedded)
        for layer in self.encoder_layers:
            embedded = layer(embedded)
        pooled_output = embedded.mean(dim=1)
        return self.fc(pooled_output)

# Sample dataset (Replace with real dataset)
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

def training_per_epoch(model, train_dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # if epoch%100 == 0:
    #     print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")
    return total_loss / len(train_dataloader)

def validating_per_epoch(model, train_dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total

# Data Preparation
train_texts = ["This is a great movie!", "Terrible film, I hated it.", "Pretty decent, I liked it.",
               "Absolutely fantastic!", "Worst movie ever.", "I loved every second.",
               "Not my cup of tea.", "Brilliant and engaging!", "Disappointing.", "Superb storyline."]
train_labels = [1, 0, 1, 1, 0, 1, 0, 1, 0, 1]

test_texts = ["Amazing experience!", "Could have been better.", "Loved the characters.",
              "Too predictable.", "A masterpiece!", "Boring and dull.", "Would watch again!",
              "Not worth my time.", "Excellent direction.", "Bad acting."]
test_labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=False)

# Model Initialization
vocab_size = tokenizer.vocab_size
embed_dim = 128
num_heads = 4
hidden_dim = 256
num_layers = 2
num_classes = 2
max_len = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# Training Loop
epochs = 1000
for epoch in range(epochs):
    training_loss = training_per_epoch(model, train_dataloader, criterion, optimizer)
    validation_acc = validating_per_epoch(model, train_dataloader, criterion)
    if epoch%50 == 0:
        print(f"Epoch {epoch + 1}, Training Loss: {training_loss:.4f}, Validation Acc:{validation_acc:.4f}")



Epoch 1, Training Loss: 0.7137, Validation Acc:0.5000
Epoch 51, Training Loss: 0.6291, Validation Acc:0.5000
Epoch 101, Training Loss: 0.4900, Validation Acc:0.5000
Epoch 151, Training Loss: 0.4372, Validation Acc:0.7000
Epoch 201, Training Loss: 0.3989, Validation Acc:0.5000
Epoch 251, Training Loss: 0.3634, Validation Acc:0.7000
Epoch 301, Training Loss: 0.3434, Validation Acc:0.7000
Epoch 351, Training Loss: 0.3322, Validation Acc:0.5000
Epoch 401, Training Loss: 0.3060, Validation Acc:0.7000
Epoch 451, Training Loss: 0.2752, Validation Acc:0.6000
Epoch 501, Training Loss: 0.2526, Validation Acc:0.5000
Epoch 551, Training Loss: 0.2554, Validation Acc:0.5000
Epoch 601, Training Loss: 0.2300, Validation Acc:0.6000
Epoch 651, Training Loss: 0.2315, Validation Acc:0.5000
Epoch 701, Training Loss: 0.1774, Validation Acc:0.5000
Epoch 751, Training Loss: 0.1831, Validation Acc:0.5000
Epoch 801, Training Loss: 0.1856, Validation Acc:0.5000
Epoch 851, Training Loss: 0.1544, Validation Acc:0.

#Everything from Scratch with Best Position Encoder (RoPE) in this Notebook

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
import numpy as np

# 4. Rotary Positional Encoding (RoPE)
class RotaryPositionalEncoding(nn.Module):
    def __init__(self, embed_dim):
        super(RotaryPositionalEncoding, self).__init__()
        self.embed_dim = embed_dim

    def forward(self, x):
        seq_len = x.shape[1]
        theta = torch.exp(torch.arange(0, self.embed_dim, 2, dtype=torch.float, device=x.device) * (-np.log(10000.0) / self.embed_dim))
        theta = theta.unsqueeze(0).unsqueeze(0)
        m = torch.arange(seq_len, dtype=torch.float, device=x.device).unsqueeze(1)
        sin, cos = torch.sin(m * theta), torch.cos(m * theta)
        x_reshaped = x.view(x.shape[0], x.shape[1], -1, 2)
        x_rotated = torch.stack([-x_reshaped[..., 1], x_reshaped[..., 0]], dim=-1)
        x_out = x_reshaped * cos.unsqueeze(-1) + x_rotated * sin.unsqueeze(-1)
        return x_out.view(x.shape)

# Custom Multi-Head Attention
class CustomMultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super(CustomMultiheadAttention, self).__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value):
        batch_size, seq_length, embed_dim = query.shape

        q = self.q_proj(query).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(key).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(value).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attn_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn_weights = torch.nn.functional.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        attn_output = torch.matmul(attn_weights, v).transpose(1, 2).contiguous().view(batch_size, seq_length, embed_dim)
        return self.out_proj(attn_output)

# Custom Transformer Encoder Layer
class CustomTransformerEncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, hidden_dim, dropout=0.1):
        super(CustomTransformerEncoderLayer, self).__init__()
        self.self_attn = CustomMultiheadAttention(embed_dim, num_heads, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        attn_output = self.self_attn(x, x, x)
        x = self.norm1(x + self.dropout(attn_output))
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))
        return x

# Transformer Model for Text Classification with Custom Transformer Encoder Layer
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = RotaryPositionalEncoding(embed_dim)
        self.encoder_layers = nn.ModuleList([
            CustomTransformerEncoderLayer(embed_dim, num_heads, hidden_dim) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids) * attention_mask.unsqueeze(-1)
        embedded = self.positional_encoding(embedded)
        for layer in self.encoder_layers:
            embedded = layer(embedded)
        pooled_output = embedded.mean(dim=1)
        return self.fc(pooled_output)


# Sample dataset (Replace with real dataset)
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

def training_per_epoch(model, train_dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # if epoch%100 == 0:
    #     print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")
    return total_loss / len(train_dataloader)

def validating_per_epoch(model, train_dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total

# Data Preparation
train_texts = ["This is a great movie!", "Terrible film, I hated it.", "Pretty decent, I liked it.",
               "Absolutely fantastic!", "Worst movie ever.", "I loved every second.",
               "Not my cup of tea.", "Brilliant and engaging!", "Disappointing.", "Superb storyline."]
train_labels = [1, 0, 1, 1, 0, 1, 0, 1, 0, 1]

test_texts = ["Amazing experience!", "Could have been better.", "Loved the characters.",
              "Too predictable.", "A masterpiece!", "Boring and dull.", "Would watch again!",
              "Not worth my time.", "Excellent direction.", "Bad acting."]
test_labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_texts, test_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=False)

# Model Initialization
vocab_size = tokenizer.vocab_size
embed_dim = 128
num_heads = 4
hidden_dim = 256
num_layers = 2
num_classes = 2
max_len = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)

# Training Loop
epochs = 1000
for epoch in range(epochs):
    training_loss = training_per_epoch(model, train_dataloader, criterion, optimizer)
    validation_acc = validating_per_epoch(model, train_dataloader, criterion)
    if epoch%50 == 0:
        print(f"Epoch {epoch + 1}, Training Loss: {training_loss:.4f}, Validation Acc:{validation_acc:.4f}")



Epoch 1, Training Loss: 0.7402, Validation Acc:0.5000
Epoch 51, Training Loss: 0.0010, Validation Acc:0.9000
Epoch 101, Training Loss: 0.0005, Validation Acc:0.9000
Epoch 151, Training Loss: 0.0003, Validation Acc:0.9000
Epoch 201, Training Loss: 0.0003, Validation Acc:0.9000
Epoch 251, Training Loss: 0.0002, Validation Acc:0.9000
Epoch 301, Training Loss: 0.0002, Validation Acc:0.9000
Epoch 351, Training Loss: 0.0002, Validation Acc:0.9000
Epoch 401, Training Loss: 0.0001, Validation Acc:0.9000
Epoch 451, Training Loss: 0.0001, Validation Acc:0.9000
Epoch 501, Training Loss: 0.0001, Validation Acc:0.9000
Epoch 551, Training Loss: 0.0001, Validation Acc:0.9000
Epoch 601, Training Loss: 0.0001, Validation Acc:0.9000
Epoch 651, Training Loss: 0.0001, Validation Acc:0.9000
Epoch 701, Training Loss: 0.0001, Validation Acc:0.9000
Epoch 751, Training Loss: 0.0001, Validation Acc:0.9000
Epoch 801, Training Loss: 0.0001, Validation Acc:0.9000
Epoch 851, Training Loss: 0.0001, Validation Acc:0.