<a href="https://colab.research.google.com/github/nisbenz/Mini-Transformer/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading, Cleaning & Tokenizer Training

- **Lowercase** both columns to normalize casing.
- **Filter out** sentence pairs where either side has ≥ 50 words (only pairs with < 50 words on both sides are kept) to reduce memory load and focus on simpler grammar.
- **Train** a shared BPE tokenizer on the cleaned corpus.

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import math
import os
import time
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.amp import autocast, GradScaler
from tokenizers import ByteLevelBPETokenizer

CSV_FILE = "train-00000-of-00001.csv"
VOCAB_SIZE = 5000
MAX_WORDS = 50  # pairs with this many (or more) whitespace-separated words on either side are dropped
TEMP_CORPUS = "temp_corpus.txt"

df = pd.read_csv(CSV_FILE)
print(f"Raw rows: {len(df)}")

# Drop pairs with a missing side *before* casting to str: astype(str) would
# otherwise turn NaN into the literal text "nan" and feed it to the tokenizer
# and the model as if it were a real sentence.
df = (
    df.dropna(subset=['english', 'darija'])
      .assign(
          english=lambda d: d['english'].astype(str).str.lower(),
          darija=lambda d: d['darija'].astype(str).str.lower(),
      )
)

# Keep only pairs where both sides are shorter than MAX_WORDS words.
is_short = (
    (df['english'].str.split().str.len() < MAX_WORDS) &
    (df['darija'].str.split().str.len() < MAX_WORDS)
)
df = df[is_short].reset_index(drop=True)
print(f"After filtering (< {MAX_WORDS} words): {len(df)} rows")

# One shared vocabulary: English and Darija sentences go into the same corpus.
all_text = df['english'].tolist() + df['darija'].tolist()

tokenizer = ByteLevelBPETokenizer()
try:
    # The trainer reads from files, so the corpus is written to a scratch file.
    with open(TEMP_CORPUS, "w", encoding="utf-8") as f:
        for line in all_text:
            f.write(line + "\n")

    tokenizer.train(
        files=[TEMP_CORPUS],
        vocab_size=VOCAB_SIZE,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    )
finally:
    # Remove the scratch file even if writing or training fails.
    if os.path.exists(TEMP_CORPUS):
        os.remove(TEMP_CORPUS)

os.makedirs("tokenizer", exist_ok=True)
tokenizer.save_model("tokenizer")

pad_id = tokenizer.token_to_id("<pad>")
print(f"Tokenizer trained — vocab size: {tokenizer.get_vocab_size()}, pad_id: {pad_id}")

Raw rows: 16089
After filtering (< 50 words): 16003 rows
Tokenizer trained — vocab size: 5000, pad_id: 1


# Dataset, Collation & DataLoader

In [2]:
class TranslationDataset(Dataset):
    """Map-style dataset yielding tokenized English/Darija sentence pairs.

    Each item is a dict with two 1-D ``torch.long`` tensors, ``"src"`` (from the
    ``english`` column) and ``"trg"`` (from the ``darija`` column). Both are
    wrapped as ``<s> ... </s>`` using ids looked up from the tokenizer.
    """

    def __init__(self, dataframe, tokenizer):
        self.df = dataframe
        self.tokenizer = tokenizer
        # Resolve the boundary-token ids once instead of on every lookup.
        self.sos_token = tokenizer.token_to_id("<s>")
        self.eos_token = tokenizer.token_to_id("</s>")

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` and wrap it with the start/end token ids."""
        token_ids = [self.sos_token, *self.tokenizer.encode(text).ids, self.eos_token]
        return torch.tensor(token_ids, dtype=torch.long)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            "src": self._encode(str(row['english'])),
            "trg": self._encode(str(row['darija'])),
        }


def get_collate_fn(pad_token_id):
    """Build a DataLoader ``collate_fn`` that right-pads with ``pad_token_id``.

    The returned function takes a list of ``{"src": tensor, "trg": tensor}``
    items and returns ``(src_padded, trg_padded)``. Each is a batch-first
    tensor padded to the longest sequence on its own side of the batch.
    """
    def _pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=pad_token_id)

    def collate_fn(batch):
        sources, targets = [], []
        for example in batch:
            sources.append(example['src'])
            targets.append(example['trg'])
        return _pad(sources), _pad(targets)

    return collate_fn


# Number of sentence pairs per training batch.
BATCH_SIZE = 32

# Wrap the filtered frame in the dataset and batch it; padding uses the
# tokenizer's <pad> id so the masks built later can recognize padded positions.
dataset = TranslationDataset(df, tokenizer)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=get_collate_fn(pad_id),
    num_workers=2,
    pin_memory=True
)

# Quick verification: pull one batch and show its shapes and first decoded pair.
# src_batch / trg_batch are reused later for the model's shape check.
data_iter = iter(dataloader)
src_batch, trg_batch = next(data_iter)

print("--- Data Pipeline Verification ---")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Source (English) Shape: {src_batch.shape}")
print(f"Target (Darija) Shape:  {trg_batch.shape}")
print(f"\nDecoded (English): {tokenizer.decode(src_batch[0].tolist())}")
print(f"Decoded (Darija):  {tokenizer.decode(trg_batch[0].tolist())}")

--- Data Pipeline Verification ---
Batch Size: 32
Source (English) Shape: torch.Size([32, 79])
Target (Darija) Shape:  torch.Size([32, 71])

Decoded (English): like halloween
Decoded (Darija):  b7al halloween


#  "Tiny" Pre-LN Transformer (D_MODEL=256, D_FF=512, 3 Layers)

| Parameter | Value |
|-----------|-------|
| `D_MODEL` | 256 |
| `D_FF` | 512 |
| `N_HEAD` | 4 |
| `NUM_LAYERS` | 3 |
| `dropout` | 0.3 |
| LayerNorm | **Pre-LN** |

In [3]:

# Model hyper-parameters, passed to the encoder/decoder constructors below.
D_MODEL    = 256   # embedding / residual-stream width
N_HEAD     = 4     # attention heads per attention module
D_FF       = 512   # hidden width of each feed-forward sub-layer
NUM_LAYERS = 3     # layers in the encoder and in the decoder
MAX_LEN    = 256   # rows in the positional-encoding table
DROPOUT    = 0.3   # dropout rate used by every encoder/decoder layer

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")


class TransformerEmbedding(nn.Module):
    """Token embedding scaled by ``sqrt(d_model)`` plus sinusoidal positions.

    Args:
        vocab_size: number of rows in the token embedding table.
        d_model: embedding width. Odd widths are supported; the last sine
            column then has no cosine partner.
        max_len: longest sequence the precomputed positional table can serve.

    Raises:
        ValueError: from ``forward`` when the input is longer than ``max_len``.
    """

    def __init__(self, vocab_size, d_model, max_len=5000):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # trim the angle matrix; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Buffer (not a parameter): follows .to(device) but is never trained.
        self.register_buffer('pe', pe.unsqueeze(0))
        self.d_model = d_model

    def forward(self, x):
        """Embed token ids ``x`` (batch-first) and add positional encodings."""
        seq_len = x.size(1)
        table_len = self.pe.size(1)
        if seq_len > table_len:
            # Fail with a readable message instead of an opaque broadcast error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional table size {table_len}"
            )
        x = self.embedding(x) * math.sqrt(self.d_model)
        return x + self.pe[:, :seq_len]


class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention with ``n_head`` parallel heads.

    Args:
        d_model: width of the query/key/value inputs and of the output.
        n_head: number of heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: if ``n_head`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, d_model, n_head):
        super().__init__()
        # Without this check the floor division below silently drops features
        # and the failure only shows up later as a confusing shape error.
        if n_head <= 0 or d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_head ({n_head})"
            )
        self.n_head = n_head
        self.d_head = d_model // n_head
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

    def _split_heads(self, x, batch_size):
        """Reshape (B, T, d_model) into (B, n_head, T, d_head)."""
        return x.view(batch_size, -1, self.n_head, self.d_head).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Attend from ``q`` over ``k``/``v``.

        ``mask`` may be ``None``; otherwise positions where ``mask == 0`` get a
        score of ``-inf`` and therefore zero attention weight. It must broadcast
        against the (B, n_head, T_q, T_k) score tensor.
        """
        B = q.size(0)
        Q = self._split_heads(self.w_q(q), B)
        K = self._split_heads(self.w_k(k), B)
        V = self._split_heads(self.w_v(v), B)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_head)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = torch.softmax(scores, dim=-1)

        context = torch.matmul(attn_weights, V)
        # Merge the heads back: (B, n_head, T_q, d_head) -> (B, T_q, d_model).
        context = context.transpose(1, 2).contiguous().view(B, -1, self.n_head * self.d_head)
        return self.w_o(context)


class EncoderLayer(nn.Module):
    """Pre-LN encoder block: self-attention followed by a feed-forward net.

    Each sub-layer normalizes its input, applies dropout to its output and adds
    the result back onto the residual stream.
    """

    def __init__(self, d_model, n_head, d_ff, dropout=0.3):
        super().__init__()
        # Sub-layer 1: multi-head self-attention.
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        # Sub-layer 2: feed-forward network (d_model -> d_ff -> d_model).
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
        )
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run both sub-layers on ``x``; ``mask`` is passed to the attention."""
        attn_in = self.norm1(x)
        attn_out = self.self_attn(attn_in, attn_in, attn_in, mask)
        x = x + self.dropout1(attn_out)

        ffn_out = self.ffn(self.norm2(x))
        return x + self.dropout2(ffn_out)

class TransformerEncoder(nn.Module):
    """Embedding, a stack of Pre-LN encoder layers, and a final LayerNorm.

    Args:
        vocab_size: size of the source vocabulary.
        d_model: model width.
        n_head: attention heads per layer.
        d_ff: hidden width of each feed-forward sub-layer.
        num_layers: number of stacked ``EncoderLayer`` blocks.
        max_len: size of the positional-encoding table.
        dropout: dropout rate for every layer. ``None`` (the default) falls
            back to the notebook-level ``DROPOUT`` constant, so existing
            six-argument calls behave as before.
    """

    def __init__(self, vocab_size, d_model, n_head, d_ff, num_layers, max_len, dropout=None):
        super().__init__()
        if dropout is None:
            dropout = DROPOUT
        self.embedding = TransformerEmbedding(vocab_size, d_model, max_len)
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_head, d_ff, dropout) for _ in range(num_layers)
        ])
        # Pre-LN layers leave the residual stream un-normalized, so normalize
        # once after the last layer.
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        """Encode token ids ``x``; ``mask`` is passed to every layer."""
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class DecoderLayer(nn.Module):
    """Pre-LN decoder block: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers normalizes its input, applies dropout to its
    output and adds the result back onto the residual stream.
    """

    def __init__(self, d_model, n_head, d_ff, dropout=0.3):
        super().__init__()
        # Sub-layer 1: self-attention over the target sequence.
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        # Sub-layer 2: attention from the target over the encoder output.
        self.cross_attn = MultiHeadAttention(d_model, n_head)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)
        # Sub-layer 3: feed-forward network (d_model -> d_ff -> d_model).
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
        )
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, trg_mask):
        """Apply the three sub-layers in order.

        ``trg_mask`` restricts the self-attention; ``src_mask`` restricts the
        cross-attention over ``enc_output``.
        """
        self_in = self.norm1(x)
        self_out = self.self_attn(self_in, self_in, self_in, trg_mask)
        x = x + self.dropout1(self_out)

        cross_out = self.cross_attn(self.norm2(x), enc_output, enc_output, src_mask)
        x = x + self.dropout2(cross_out)

        ffn_out = self.ffn(self.norm3(x))
        return x + self.dropout3(ffn_out)

class TransformerDecoder(nn.Module):
    """Embedding, Pre-LN decoder layers, final LayerNorm and vocab projection.

    Args:
        vocab_size: size of the target vocabulary (also the output width).
        d_model: model width.
        n_head: attention heads per attention module.
        d_ff: hidden width of each feed-forward sub-layer.
        num_layers: number of stacked ``DecoderLayer`` blocks.
        max_len: size of the positional-encoding table.
        dropout: dropout rate for every layer. ``None`` (the default) falls
            back to the notebook-level ``DROPOUT`` constant, so existing
            six-argument calls behave as before.
    """

    def __init__(self, vocab_size, d_model, n_head, d_ff, num_layers, max_len, dropout=None):
        super().__init__()
        if dropout is None:
            dropout = DROPOUT
        self.embedding = TransformerEmbedding(vocab_size, d_model, max_len)
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, n_head, d_ff, dropout) for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)
        # Projects each position's hidden state to unnormalized vocab scores.
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x, enc_output, src_mask, trg_mask):
        """Return per-position vocabulary logits for target token ids ``x``."""
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, enc_output, src_mask, trg_mask)
        x = self.norm(x)
        return self.fc_out(x)

def make_src_mask(src, pad_token_id):
    """Return a boolean mask that is True wherever ``src`` is not padding.

    Two singleton axes are inserted after the batch axis, so a (B, T) input
    yields a (B, 1, 1, T) mask that broadcasts over heads and query positions.
    """
    not_pad = src != pad_token_id
    return not_pad[:, None, None]

def make_trg_mask(trg, pad_token_id):
    """Combine the padding mask with a causal (lower-triangular) mask.

    For a (B, T) input the result is a (B, 1, T, T) boolean tensor: entry
    [b, 0, i, j] is True only when key position j is not padding and j <= i.
    """
    not_pad = (trg != pad_token_id)[:, None, None]
    steps = torch.arange(trg.shape[1], device=trg.device)
    # Row i may look at column j only when j <= i (lower triangle incl. diagonal).
    causal = steps.unsqueeze(0) <= steps.unsqueeze(1)
    return not_pad & causal


class Transformer(nn.Module):
    """Encoder-decoder wrapper that builds its own attention masks.

    Holds the encoder and decoder modules plus the padding ids used to derive
    the source and target masks on every forward pass.
    """

    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder
        self.src_pad_idx, self.trg_pad_idx = src_pad_idx, trg_pad_idx
        self.device = device

    def forward(self, src, trg):
        """Encode ``src`` and return the decoder's output for ``trg``."""
        source_mask = make_src_mask(src, self.src_pad_idx)
        memory = self.encoder(src, source_mask)
        target_mask = make_trg_mask(trg, self.trg_pad_idx)
        return self.decoder(trg, memory, source_mask, target_mask)


# Source and target share one tokenizer, so both sides use the same vocab size.
V = tokenizer.get_vocab_size()

enc = TransformerEncoder(V, D_MODEL, N_HEAD, D_FF, NUM_LAYERS, MAX_LEN)
dec = TransformerDecoder(V, D_MODEL, N_HEAD, D_FF, NUM_LAYERS, MAX_LEN)

# The same <pad> id is used for the source and the target padding masks.
model = Transformer(enc, dec, pad_id, pad_id, device).to(device)

def initialize_weights(m):
    """Xavier-uniform init for any module weight with more than one dimension.

    Intended for ``model.apply``. Modules without a ``weight`` attribute, with
    ``weight`` set to ``None`` (e.g. ``LayerNorm(elementwise_affine=False)``),
    or with a 1-D weight are left untouched.
    """
    weight = getattr(m, 'weight', None)
    # hasattr alone is not enough: the attribute can exist and still be None.
    if isinstance(weight, torch.Tensor) and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)
model.apply(initialize_weights)

# Shape check on the batch fetched during data-pipeline verification.
src = src_batch.to(device)
trg = trg_batch.to(device)
# Only the output shape is inspected, so skip autograd: otherwise the global
# `out` would keep the whole computation graph alive on the device.
with torch.no_grad():
    out = model(src, trg)

total_params = sum(p.numel() for p in model.parameters())
print(f"\n--- Tiny Transformer Verification ---")
print(f"Layers: {NUM_LAYERS}  |  D_MODEL: {D_MODEL}  |  D_FF: {D_FF}")
print(f"Total parameters: {total_params:,}")
print(f"Source Shape: {src.shape}")
print(f"Target Shape: {trg.shape}")
print(f"Output Shape: {out.shape}")

Device: cuda

--- Tiny Transformer Verification ---
Layers: 3  |  D_MODEL: 256  |  D_FF: 512
Total parameters: 7,799,688
Source Shape: torch.Size([32, 79])
Target Shape: torch.Size([32, 71])
Output Shape: torch.Size([32, 71, 5000])


# Optimized Training Loop

- **Label smoothing** = 0.1 (prevents overconfident, loopy output)
- **OneCycleLR** with `max_lr = 0.0007` (built-in warmup)
- **Mixed precision** via `torch.amp` for T4 speed
- **20 epochs**

In [4]:
# ── Training config ──
EPOCHS        = 20
CLIP          = 1.0
LEARNING_RATE = 0.0007

# fp16 autocast and loss scaling only apply on CUDA. On any other device both
# are disabled explicitly so the same cell still runs, in plain fp32.
USE_AMP = device.type == 'cuda'

optimizer = torch.optim.Adam(
    model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.98), eps=1e-9
)

# One scheduler step per batch: warm up for the first 10% of all steps from
# max_lr / 10 to max_lr, then anneal.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=LEARNING_RATE,
    steps_per_epoch=len(dataloader),
    epochs=EPOCHS,
    pct_start=0.1,
    div_factor=10,
    three_phase=False
)

# Label smoothing prevents overconfidence → less repetition
criterion = nn.CrossEntropyLoss(ignore_index=pad_id, label_smoothing=0.1)

scaler = GradScaler(device.type, enabled=USE_AMP)
model.train()

print(f"Training on {device}  |  {EPOCHS} epochs  |  max_lr={LEARNING_RATE}")
print(f"Batches/epoch: {len(dataloader)}  |  Label smoothing: 0.1")
print("=" * 60)

for epoch in range(EPOCHS):
    start_time = time.time()
    epoch_loss = 0

    for i, (src, trg) in enumerate(dataloader):
        src = src.to(device)
        trg = trg.to(device)

        # Teacher forcing: the decoder sees tokens [0, T-1) and must predict
        # tokens [1, T), i.e. the same sequence shifted left by one.
        trg_input = trg[:, :-1]
        trg_label = trg[:, 1:]

        optimizer.zero_grad()

        with autocast(device_type=device.type, dtype=torch.float16, enabled=USE_AMP):
            output = model(src, trg_input)
            output_dim = output.shape[-1]
            # Flatten batch and time so every token is one classification row.
            output = output.reshape(-1, output_dim)
            trg_label = trg_label.reshape(-1)
            loss = criterion(output, trg_label)

        scaler.scale(loss).backward()
        # Unscale before clipping so CLIP applies to the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        # Read the scalar once; each .item() call forces a device sync.
        loss_value = loss.item()
        epoch_loss += loss_value

        if i % 50 == 0:
            lr = scheduler.get_last_lr()[0]
            print(f"  Epoch {epoch+1} | Batch {i} | Loss: {loss_value:.4f} | LR: {lr:.6f}")

    avg_loss = epoch_loss / len(dataloader)
    elapsed = time.time() - start_time
    print(f"→ Epoch {epoch+1}/{EPOCHS}  |  Avg Loss: {avg_loss:.4f}  |  Time: {elapsed:.1f}s")
    print("-" * 60)

    # Per-epoch checkpoint: model weights only, no optimizer/scheduler state.
    torch.save(model.state_dict(), f'checkpoint_epoch_{epoch+1}.pth')

print("\n Training complete!")

Training on cuda  |  20 epochs  |  max_lr=0.0007
Batches/epoch: 501  |  Label smoothing: 0.1
  Epoch 1 | Batch 0 | Loss: 8.5877 | LR: 0.000070
  Epoch 1 | Batch 50 | Loss: 7.8900 | LR: 0.000074
  Epoch 1 | Batch 100 | Loss: 7.4271 | LR: 0.000086
  Epoch 1 | Batch 150 | Loss: 7.0696 | LR: 0.000105
  Epoch 1 | Batch 200 | Loss: 6.9824 | LR: 0.000131
  Epoch 1 | Batch 250 | Loss: 6.6938 | LR: 0.000163
  Epoch 1 | Batch 300 | Loss: 6.7024 | LR: 0.000200
  Epoch 1 | Batch 350 | Loss: 6.5870 | LR: 0.000243
  Epoch 1 | Batch 400 | Loss: 6.4407 | LR: 0.000288
  Epoch 1 | Batch 450 | Loss: 6.4131 | LR: 0.000336
  Epoch 1 | Batch 500 | Loss: 6.2443 | LR: 0.000385
→ Epoch 1/20  |  Avg Loss: 6.9821  |  Time: 22.6s
------------------------------------------------------------
  Epoch 2 | Batch 0 | Loss: 6.3774 | LR: 0.000386
  Epoch 2 | Batch 50 | Loss: 6.1604 | LR: 0.000436
  Epoch 2 | Batch 100 | Loss: 6.1391 | LR: 0.000484
  Epoch 2 | Batch 150 | Loss: 6.3022 | LR: 0.000529
  Epoch 2 | Batch 200 

#  Beam Search Inference (beam_size = 3)

Beam search explores multiple candidate translations simultaneously
and picks the one with the highest overall probability.

In [5]:
def translate_beam(sentence, model, tokenizer, device, beam_size=3, max_len=50):
    """Translate an English sentence to Darija using beam search.

    Args:
        sentence: raw English text; lower-cased before tokenization.
        model: trained ``Transformer``; it is switched to eval mode.
        tokenizer: tokenizer providing the ``<s>``, ``</s>`` and ``<pad>`` ids.
        device: device on which the input tensors are created.
        beam_size: number of hypotheses kept after each step (must be >= 1).
        max_len: maximum number of decoding steps.

    Returns:
        The decoded text of the hypothesis with the highest cumulative
        log-probability, without the leading ``<s>`` and trailing ``</s>``.

    Raises:
        ValueError: if ``beam_size`` is smaller than 1.
    """
    if beam_size < 1:
        # With no candidates the beam list would be empty and indexing it fails.
        raise ValueError(f"beam_size must be >= 1, got {beam_size}")

    model.eval()

    sos_id = tokenizer.token_to_id("<s>")
    eos_id = tokenizer.token_to_id("</s>")
    # Take the pad id from the tokenizer passed in, not from a notebook global,
    # so the function works with whatever tokenizer the caller supplies.
    pad_token_id = tokenizer.token_to_id("<pad>")

    # Encode source with <s> ... </s> wrapping (matches training format)
    tokens = [sos_id] + tokenizer.encode(sentence.lower()).ids + [eos_id]
    src_tensor = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)
    src_mask = make_src_mask(src_tensor, pad_token_id)

    with torch.no_grad():
        # The source is encoded once and reused for every decoding step.
        enc_output = model.encoder(src_tensor, src_mask)

        # Each beam: (token_sequence, cumulative_log_prob)
        beams = [([sos_id], 0.0)]

        for _ in range(max_len):
            new_beams = []
            for seq, score in beams:
                # If this beam already ended, carry it forward
                if seq[-1] == eos_id:
                    new_beams.append((seq, score))
                    continue

                trg_tensor = torch.tensor(seq, dtype=torch.long, device=device).unsqueeze(0)
                trg_mask = make_trg_mask(trg_tensor, pad_token_id)
                out = model.decoder(trg_tensor, enc_output, src_mask, trg_mask)

                # Only the last position predicts the next token.
                log_probs = torch.log_softmax(out[0, -1, :], dim=-1)
                # topk cannot return more entries than the vocabulary holds.
                k = min(beam_size, log_probs.size(-1))
                top_probs, top_idx = torch.topk(log_probs, k)

                # One host transfer per tensor instead of one .item() per entry.
                for token_id, log_prob in zip(top_idx.tolist(), top_probs.tolist()):
                    new_beams.append((seq + [token_id], score + log_prob))

            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]

            if all(b[0][-1] == eos_id for b in beams):
                break

    best_seq = beams[0][0]
    if best_seq[0] == sos_id:
        best_seq = best_seq[1:]
    if best_seq and best_seq[-1] == eos_id:
        best_seq = best_seq[:-1]
    return tokenizer.decode(best_seq)


# ── Test Translations ──
# Spot-check the trained model on a few short English inputs.
model.to(device)

test_sentences = [
    "How are you?",
    "my king",
    "hello there",
    "What is your name?",
    "Thank you very much"
]

print("\n" + "=" * 50)
print("  Beam Search Translations (beam_size=3)")
print("=" * 50)
for sent in test_sentences:
    # Uses translate_beam's defaults (beam_size=3, max_len=50).
    result = translate_beam(sent, model, tokenizer, device)
    print(f"  EN: {sent}")
    print(f"  DR: {result}")
    print("-" * 50)


  Beam Search Translations (beam_size=3)
  EN: How are you?
  DR: kidayr?
--------------------------------------------------
  EN: my king
  DR: ra7da
--------------------------------------------------
  EN: hello there
  DR: ahlan m3ak
--------------------------------------------------
  EN: What is your name?
  DR: chno smittek?
--------------------------------------------------
  EN: Thank you very much
  DR: chokran bzaf
--------------------------------------------------


# Cell 6 — Save & Export Model

In [6]:
import shutil
from google.colab import files

# Collect the model weights and the tokenizer files in one folder.
os.makedirs("export_model", exist_ok=True)
torch.save(model.state_dict(), "export_model/model.pth")
tokenizer.save_model("export_model")

# Zip the folder into darija_translator.zip in the working directory.
shutil.make_archive("darija_translator", 'zip', "export_model")

# Colab-only helper (imported from google.colab) that starts a browser download.
files.download('darija_translator.zip')
print("Download started...")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download started...
