## 🧪 Toy BERT-like Model from Scratch (PyTorch)
This notebook demonstrates a minimal BERT-style encoder implemented in pure PyTorch and trained on synthetic data to classify the parity (even/odd) of each token in a sequence.

### 1. Imports and Config

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import math
import random

### 2. Toy Dataset for Token Classification

In [None]:
# Token ids are sampled from [1, vocab_size); every sequence has seq_len tokens.
vocab_size = 20
seq_len = 8


class ToyTextDataset(Dataset):
    """Synthetic per-token classification dataset.

    Every example is a ``(tokens, labels)`` pair of 1-D tensors of length
    ``seq_len``; ``labels[i]`` is the parity (``tokens[i] % 2``) of the
    corresponding token, stored as int64.

    Args:
        size: Number of random examples generated up front.
    """

    def __init__(self, size=1000):
        # All examples are materialised eagerly and kept in memory.
        self.data = [self._make_example() for _ in range(size)]

    @staticmethod
    def _make_example():
        """Draw one random token sequence together with its parity labels."""
        token_ids = torch.randint(1, vocab_size, (seq_len,))
        parity = (token_ids % 2).long()  # fake task: predict parity of tokens
        return token_ids, parity

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


dataset = ToyTextDataset()
loader = DataLoader(dataset, batch_size=16, shuffle=True)

### 3. Positional Encoding

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is precomputed once for ``max_len``
    positions and stored as a non-trainable buffer.

    Args:
        dim: Size of the feature (last) dimension. May be even or odd.
        max_len: Number of positions to precompute; inputs must not have
            more than this many positions along dimension 1.
    """

    def __init__(self, dim, max_len=512):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2) * -(math.log(10000.0) / dim))
        angles = position * div_term  # (max_len, ceil(dim / 2))

        pe = torch.zeros(max_len, dim)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd ``dim`` there is one fewer cosine column than sine
        # column, so trim the angles to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(angles[:, : dim // 2])

        # Registering as a buffer lets ``module.to(device)`` move the table
        # with the model. ``persistent=False`` keeps it out of ``state_dict``,
        # so checkpoint keys are the same as when ``pe`` was a plain attribute.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, position, feature): the table is sliced
        along dimension 1 and broadcast over dimension 0.
        """
        # ``.to`` is a no-op once the module lives on the same device as ``x``;
        # it is kept so a module left on another device still works.
        return x + self.pe[:, : x.size(1)].to(x.device)

### 4. Toy BERT Block

In [None]:
class TransformerBlock(nn.Module):
    """One post-norm encoder layer: self-attention, then a feed-forward MLP.

    Each sub-layer is wrapped in a residual connection followed by
    ``LayerNorm``.

    Args:
        dim: Feature size of the input and output.
        heads: Number of attention heads.
        ff_hidden: Width of the hidden layer in the feed-forward MLP.
    """

    def __init__(self, dim, heads, ff_hidden):
        super().__init__()
        # batch_first=True: attention treats dimension 0 as the batch.
        self.attn = nn.MultiheadAttention(
            embed_dim=dim,
            num_heads=heads,
            batch_first=True,
        )
        mlp_layers = [
            nn.Linear(dim, ff_hidden),
            nn.ReLU(),
            nn.Linear(ff_hidden, dim),
        ]
        self.ff = nn.Sequential(*mlp_layers)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        # Self-attention: the same tensor serves as query, key and value.
        # The attention weights (second return value) are discarded.
        attended = self.attn(x, x, x)[0]
        after_attn = self.norm1(x + attended)
        return self.norm2(after_attn + self.ff(after_attn))

### 5. Full Toy BERT-like Model

In [None]:
class ToyBERT(nn.Module):
    """Small Transformer encoder with a two-way classification head per position.

    Pipeline: token embedding -> positional encoding -> ``n_layers`` stacked
    ``TransformerBlock`` layers -> linear layer producing 2 logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        dim: Embedding / hidden feature size.
        heads: Attention heads in each block.
        ff_hidden: Hidden width of each block's feed-forward MLP.
        n_layers: Number of stacked Transformer blocks.
        max_len: Number of positions precomputed by the positional encoding.
    """

    def __init__(self, vocab_size, dim=32, heads=4, ff_hidden=64, n_layers=2, max_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.pos_enc = PositionalEncoding(dim, max_len)

        encoder_layers = []
        for _ in range(n_layers):
            encoder_layers.append(TransformerBlock(dim, heads, ff_hidden))
        self.transformer_blocks = nn.Sequential(*encoder_layers)

        self.classifier = nn.Linear(dim, 2)  # predict parity

    def forward(self, x):
        """Map token indices ``x`` to logits with a trailing dimension of 2."""
        hidden = self.pos_enc(self.embedding(x))
        encoded = self.transformer_blocks(hidden)
        return self.classifier(encoded)

### 6. Training Loop

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ToyBERT(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    model.train()
    total_loss = 0  # sum of per-batch losses over this epoch

    for tokens, labels in loader:
        tokens = tokens.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()

        logits = model(tokens)
        # Collapse all leading dimensions so each position is one sample:
        # logits become (N, 2) and labels become (N,).
        flat_logits = logits.view(-1, 2)
        flat_labels = labels.view(-1)
        loss = criterion(flat_logits, flat_labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

### 7. Evaluation on a Sample

In [None]:
# Switch to inference mode and classify one freshly sampled sequence.
model.eval()

sample = torch.randint(1, vocab_size, (1, seq_len))
sample = sample.to(device)

# No gradients are needed for a single forward pass.
with torch.no_grad():
    logits = model(sample)
    pred = logits.argmax(dim=-1)  # index of the larger of the two logits

print("Input tokens:", sample.cpu().tolist())
print("Predicted labels:", pred.cpu().tolist())