<a href="https://colab.research.google.com/github/ncminhbka/ML-Projects/blob/main/TFM_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import requests

# Fetch the English-French sentence-pair archive once; later runs reuse the local copy.
url = "https://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip"
if not os.path.exists("fra-eng.zip"):
    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() prevents an HTTP error page from being saved as the archive.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open("fra-eng.zip", "wb") as f:
        f.write(response.content)

In [None]:
# -o overwrites existing files without prompting, so re-running this cell cannot stall on unzip's "replace?" question.
!unzip -o fra-eng.zip

Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


In [None]:
import unicodedata
def normalize(line):
    """Normalize one raw corpus line and return its (english, french) pair.

    The line is stripped, lower-cased and NFKC-normalized, then split on TAB
    characters. Only the first two fields are used, so corpus variants that
    append extra TAB-separated columns (for example an attribution field)
    are accepted as well.

    Args:
        line: One line of text laid out as english<TAB>french[<TAB>extra...].

    Returns:
        Tuple ``(eng, fra)`` of lower-cased, whitespace-stripped strings.

    Raises:
        ValueError: If the line does not contain a TAB separator.
    """
    line = unicodedata.normalize("NFKC", line.strip().lower())
    fields = line.split("\t")
    if len(fields) < 2:
        raise ValueError(f"expected 'english<TAB>french', got {line!r}")
    eng, fra = fields[0], fields[1]
    return eng.lower().strip(), fra.lower().strip()

In [None]:
# Read the corpus and keep one normalized (english, french) tuple per line.
with open("fra.txt", "r", encoding="utf-8") as corpus:
    text_pairs = [(eng, fra) for eng, fra in map(normalize, corpus)]

In [None]:
# Sanity check: each entry of text_pairs is an (english, french) tuple.
type(text_pairs[0])

tuple

In [None]:
import tokenizers

# Reuse previously trained tokenizers when both JSON files exist; otherwise
# train one byte-level BPE tokenizer per language and cache it on disk.
if all(os.path.exists(path) for path in ("en_tokenizer.json", "fr_tokenizer.json")):
    en_tokenizer = tokenizers.Tokenizer.from_file("en_tokenizer.json")
    fr_tokenizer = tokenizers.Tokenizer.from_file("fr_tokenizer.json")
else:
    en_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
    fr_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())

    for tok in (en_tokenizer, fr_tokenizer):
        # Byte-level pre-tokenizer; a space is added at the beginning of each sentence.
        tok.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=True)
        # Matching decoder, so the word-boundary symbol "Ġ" is removed when decoding.
        tok.decoder = tokenizers.decoders.ByteLevel()

    # One trainer configuration is shared by both languages.
    VOCAB_SIZE = 8000
    trainer = tokenizers.trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=["[start]", "[end]", "[pad]"],
        show_progress=True,
    )
    for tok, column in ((en_tokenizer, 0), (fr_tokenizer, 1)):
        tok.train_from_iterator([pair[column] for pair in text_pairs], trainer=trainer)

    # Enable "[pad]" padding, then save the trained tokenizers.
    for tok, path in ((en_tokenizer, "en_tokenizer.json"), (fr_tokenizer, "fr_tokenizer.json")):
        tok.enable_padding(pad_id=tok.token_to_id("[pad]"), pad_token="[pad]")
        tok.save(path, pretty=True)

In [None]:
# Pick the French side of one corpus pair to try the tokenizer on.
fr_sample = text_pairs[120][1]

In [None]:
# Round-trip the sample through the French tokenizer, wrapped in the start/end markers.
encoded = fr_tokenizer.encode(" ".join(("[start]", fr_sample, "[end]")))
for label, value in (
    ("Original", fr_sample),
    ("Tokens", encoded.tokens),
    ("IDs", encoded.ids),
    ("Decoded", fr_tokenizer.decode(encoded.ids)),
):
    print(f"{label}: {value}")

Original: fous le camp !
Tokens: ['[start]', 'Ġfous', 'Ġle', 'Ġcamp', 'Ġ!', 'Ġ', '[end]']
IDs: [0, 4169, 127, 2294, 219, 74, 1]
Decoded:  fous le camp ! 


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    """Map-style dataset over raw (english, french) sentence pairs.

    Items are returned as strings; the French side is wrapped in the
    "[start]" / "[end]" marker tokens. No tokenization happens here.
    """

    def __init__(self, text_pairs):
        # Sequence of (english, french) string tuples, kept by reference.
        self.text_pairs = text_pairs

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.text_pairs)

    def __getitem__(self, idx):
        """Return ``(english, "[start] <french> [end]")`` for pair ``idx``."""
        source, target = self.text_pairs[idx]
        return source, " ".join(("[start]", target, "[end]"))




from torch.nn.utils.rnn import pad_sequence


def collate_fn(batch):
    """Tokenize a batch of (english, french) strings into two padded ID tensors.

    Args:
        batch: Iterable of ``(english, french)`` string pairs.

    Returns:
        Tuple ``(en_pad, fr_pad)``; each is a batch-first tensor of token IDs
        padded with the "[pad]" ID of the corresponding tokenizer.
    """
    en_str, fr_str = zip(*batch)

    def _encode_and_pad(tokenizer, sentences):
        # Encode every sentence, then right-pad the ID sequences to a common length.
        encodings = tokenizer.encode_batch(sentences, add_special_tokens=True)
        id_tensors = [torch.tensor(encoding.ids) for encoding in encodings]
        return pad_sequence(
            id_tensors, batch_first=True, padding_value=tokenizer.token_to_id("[pad]")
        )

    return _encode_and_pad(en_tokenizer, en_str), _encode_and_pad(fr_tokenizer, fr_str)

# Shuffled batches of raw sentence pairs; collate_fn tokenizes and pads each batch.
BATCH_SIZE = 32
dataset = TranslationDataset(text_pairs)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [None]:
import torch
import torch.nn as nn

class TokenAndPositionalEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned absolute-position embedding.

    Args:
        max_len: Number of rows in the position table; inputs may not be
            longer than this.
        vocab_size: Number of rows in the token embedding table.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, max_len, vocab_size, embedding_dim):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        # max_len only needs to be an upper bound on the sequence lengths seen
        # at run time, so that every position has a row to look up.
        self.positional_embedding = nn.Embedding(max_len, embedding_dim)

    def forward(self, x):
        """Embed a batch of token IDs.

        Args:
            x: Integer tensor of token IDs with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, embedding_dim).

        Raises:
            IndexError: If seq_len exceeds the size of the position table.
        """
        _, seq_len = x.size()  # seq_len is the length of this batch, not max_len
        max_len = self.positional_embedding.num_embeddings
        if seq_len > max_len:
            # Fail early with a readable message; on CUDA the raw lookup would
            # otherwise surface as an opaque device-side assertion.
            raise IndexError(
                f"sequence length {seq_len} exceeds positional table size {max_len}"
            )
        positions = torch.arange(seq_len, dtype=torch.long, device=x.device)
        # The (seq_len, dim) position vectors broadcast over the batch dimension.
        return self.token_embedding(x) + self.positional_embedding(positions)

In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and
    LayerNorm (normalization is applied after the residual sum).
    """

    def __init__(self, embedding_dim, num_heads, ffdims, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=embedding_dim, num_heads=num_heads, batch_first=True
        )
        self.ffn = nn.Sequential(
            nn.Linear(embedding_dim, ffdims),
            nn.ReLU(),
            nn.Linear(ffdims, embedding_dim),
        )
        self.norm1 = nn.LayerNorm(embedding_dim, eps=1e-6)
        self.norm2 = nn.LayerNorm(embedding_dim, eps=1e-6)
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)

    def forward(self, x, mask=None):
        """Apply the layer to ``x``; ``mask`` is passed to attention as key_padding_mask."""
        attended = self.attn(x, x, x, key_padding_mask=mask)[0]
        hidden = self.norm1(x + self.dropout1(attended))
        return self.norm2(hidden + self.dropout2(self.ffn(hidden)))

In [None]:
class Encoder(nn.Module):
    """Token + position embedding followed by ``num_layers`` EncoderBlock layers."""

    def __init__(self, num_layers, embedding_dim, num_heads, ffdims, src_vocab_size, max_len, dropout=0.1):
        super().__init__()
        self.embedding = TokenAndPositionalEmbedding(max_len, src_vocab_size, embedding_dim)
        layers = []
        for _ in range(num_layers):
            layers.append(EncoderBlock(embedding_dim, num_heads, ffdims, dropout))
        self.encoders = nn.ModuleList(layers)

    def forward(self, x, mask=None):
        """Embed token IDs ``x`` and run them through every layer with the same ``mask``."""
        hidden = self.embedding(x)
        for layer in self.encoders:
            hidden = layer(hidden, mask)
        return hidden

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each sub-layer is followed by dropout, a residual connection and
    LayerNorm (normalization is applied after the residual sum).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.masked_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim, bias=True),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim, bias=True),
        )
        self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm_3 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_key_padding_mask, tgt_key_padding_mask, causal_mask):
        """Apply the layer to decoder states ``x`` given the encoder output.

        ``causal_mask`` blocks attention to future target positions;
        the two key-padding masks block attention to PAD tokens in the
        target and in the source respectively.
        """
        # 1) Masked self-attention: the decoder only looks at the past.
        self_attended = self.masked_attn(
            x, x, x,
            attn_mask=causal_mask,
            key_padding_mask=tgt_key_padding_mask,
        )[0]
        hidden = self.layernorm_1(x + self.dropout_1(self_attended))

        # 2) Cross-attention: queries come from the decoder, keys and values
        #    from the encoder output.
        cross_attended = self.cross_attn(
            hidden, enc_output, enc_output,
            key_padding_mask=src_key_padding_mask,
        )[0]
        hidden = self.layernorm_2(hidden + self.dropout_2(cross_attended))

        # 3) Position-wise feed-forward network.
        return self.layernorm_3(hidden + self.dropout_3(self.ffn(hidden)))


In [None]:
class Decoder(nn.Module):
    """Token + position embedding followed by ``num_layers`` DecoderBlock layers."""

    def __init__(self, num_layers, embed_dim, num_heads, ff_dim, tgt_vocab_size, max_len, dropout=0.1):
        super().__init__()
        self.embedding = TokenAndPositionalEmbedding(max_len, tgt_vocab_size, embed_dim)
        layers = []
        for _ in range(num_layers):
            layers.append(DecoderBlock(embed_dim, num_heads, ff_dim, dropout))
        self.decoders = nn.ModuleList(layers)

    def forward(self, x, enc_output, src_key_padding_mask, tgt_key_padding_mask, causal_mask):
        """Embed target IDs ``x`` and run every layer with the same encoder output and masks."""
        hidden = self.embedding(x)
        for layer in self.decoders:
            hidden = layer(hidden, enc_output, src_key_padding_mask, tgt_key_padding_mask, causal_mask)
        return hidden

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token IDs to target-vocabulary logits.

    Args:
        embed_dim: Embedding / hidden size used throughout the model.
        num_heads: Number of attention heads per attention layer.
        ff_dim: Hidden size of the feed-forward sub-layers.
        num_layers: Number of encoder layers and of decoder layers.
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary (width of the output logits).
        max_len: Size of the positional embedding tables.
        dropout: Dropout probability passed to the encoder and decoder.
        device: Default device for masks built by ``generate_causal_mask``
            when the caller does not pass one.
        pad_id: ID of the padding token. When left as ``None`` the
            module-level ``PAD_ID`` is looked up at call time, which is the
            behavior existing callers rely on.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, num_layers, src_vocab_size, tgt_vocab_size, max_len, dropout=0.1, device = 'cpu', pad_id=None):
        super().__init__()
        self.encoder = Encoder(num_layers, embed_dim, num_heads, ff_dim, src_vocab_size, max_len, dropout)
        self.decoder = Decoder(num_layers, embed_dim, num_heads, ff_dim, tgt_vocab_size, max_len, dropout)
        self.final_layer = nn.Linear(embed_dim, tgt_vocab_size)
        self.device = device
        self.pad_id = pad_id

    def generate_causal_mask(self, seq_len, device=None):
        """Return a (seq_len, seq_len) additive mask: -inf above the diagonal, 0 elsewhere.

        ``device`` defaults to ``self.device``. No special case is needed for
        ``seq_len == 1``: triu(diagonal=1) already yields ``[[0.]]``.
        """
        if device is None:
            device = self.device
        return torch.triu(torch.full((seq_len, seq_len), float('-inf'), device=device), diagonal=1)

    def generate_key_padding_mask(self, src):
        """Return a bool tensor shaped like ``src`` that is True where ``src`` holds the pad ID.

        Comparing against a plain int keeps the mask on ``src``'s own device.
        """
        pad = PAD_ID if self.pad_id is None else self.pad_id
        return src == pad

    def forward(self, src, tgt):
        """Compute target-vocabulary logits.

        Args:
            src: Source token IDs, shape (batch, src_len).
            tgt: Target token IDs fed to the decoder, shape (batch, tgt_len).

        Returns:
            Output of the final linear layer applied to the decoder states.
        """
        src_key_padding_mask = self.generate_key_padding_mask(src)
        tgt_key_padding_mask = self.generate_key_padding_mask(tgt)
        # Build the causal mask on the inputs' device so it stays correct even
        # if the model was moved after construction.
        causal_mask = self.generate_causal_mask(tgt.size(1), device=tgt.device)
        enc_out = self.encoder(src, mask = src_key_padding_mask)
        dec_out = self.decoder(tgt, enc_out, src_key_padding_mask = src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask, causal_mask = causal_mask)
        return self.final_layer(dec_out)

In [None]:
# Look up the ID the English tokenizer assigns to "[pad]".
en_tokenizer.token_to_id("[pad]")

2

In [None]:
# Look up the ID the French tokenizer assigns to "[pad]".
fr_tokenizer.token_to_id("[pad]")

2

In [None]:
# Module-level padding ID read by Transformer.generate_key_padding_mask; must be defined before the model is called.
PAD_ID = en_tokenizer.token_to_id("[pad]")

In [None]:
# Hyper-parameters handed to Transformer(**model_config); the model is moved to the chosen device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_config = dict(
    embed_dim=256,
    num_heads=8,
    ff_dim=1024,
    num_layers=4,
    src_vocab_size=en_tokenizer.get_vocab_size(),
    tgt_vocab_size=fr_tokenizer.get_vocab_size(),
    max_len=256,
    dropout=0.1,
    device=device,
)
model = Transformer(**model_config).to(device)

In [None]:
import torch.optim as optim
N_EPOCHS = 60
# Peak learning rate. A peak of 5e-3 did not train this model (the loss
# plateaued and decoding collapsed to one repeated token); the later working
# configuration in this notebook peaks at roughly 1e-3, so stay below that.
LR = 5e-4
WARMUP_STEPS = 1000
CLIP_NORM = 5.0

# Padding positions in the target are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=fr_tokenizer.token_to_id("[pad]"))

optimizer = optim.Adam(model.parameters(), lr=LR)
# Learning-rate schedule: linear warm-up from 1% of LR to LR over WARMUP_STEPS
# scheduler steps, then cosine decay to zero over the remaining steps.
warmup_scheduler = optim.lr_scheduler.LinearLR(
    optimizer, start_factor=0.01, end_factor=1.0, total_iters=WARMUP_STEPS)
# max(1, ...) keeps T_max positive when the run is shorter than the warm-up.
cosine_scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=max(1, N_EPOCHS * len(dataloader) - WARMUP_STEPS), eta_min=0)
scheduler = optim.lr_scheduler.SequentialLR(
    optimizer, schedulers=[warmup_scheduler, cosine_scheduler], milestones=[WARMUP_STEPS])

In [None]:


# Train for N_EPOCHS passes over the dataloader; the scheduler is stepped once per batch.
for epoch in range(N_EPOCHS):
    model.train()
    epoch_loss = 0
    for en_ids, fr_ids in dataloader:
        # Move the "sentences" to device
        en_ids = en_ids.to(device)
        fr_ids = fr_ids.to(device)
        # zero the grad, then forward pass
        optimizer.zero_grad()
        outputs = model(en_ids, fr_ids) # logits: (batch, padded target length, target vocab size)
        # compute the loss: compare 3D logits to 2D targets
        loss = loss_fn(outputs[:, :-1, :].reshape(-1, outputs.shape[-1]), fr_ids[:, 1:].reshape(-1)) # drop the last logit step and the first target token, so position t is scored against token t+1
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM, error_if_nonfinite=False)
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{N_EPOCHS}; Avg loss {epoch_loss/len(dataloader)}")




Epoch 1/60; Avg loss 5.393344523827737
Epoch 2/60; Avg loss 5.328317748480046
Epoch 3/60; Avg loss 5.294869009982391
Epoch 4/60; Avg loss 5.278003667246764
Epoch 5/60; Avg loss 5.271035464084311
Epoch 6/60; Avg loss 5.2698560476805225
Epoch 7/60; Avg loss 5.272284278325507


KeyboardInterrupt: 

In [None]:
# Qualitative check: greedy-decode a few randomly chosen corpus pairs.
import random
model.eval()
N_SAMPLES = 5
MAX_LEN = 60
with torch.no_grad():
    # The decoder input starts out as the "[start]" token only.
    start_token = torch.tensor([fr_tokenizer.token_to_id("[start]")]).to(device)
    for en, true_fr in random.sample(dataset.text_pairs, N_SAMPLES):
        # Encode the sampled English sentence and add a batch dimension.
        en_ids = torch.tensor(en_tokenizer.encode(en).ids).unsqueeze(0).to(device)

        # Run the encoder once per sentence.
        src_mask = model.generate_key_padding_mask(en_ids)
        enc_out = model.encoder(en_ids, src_mask)

        # Grow the French sequence one token at a time.
        fr_ids = start_token.unsqueeze(0)
        for _ in range(MAX_LEN):
            x = model.decoder(
                fr_ids,
                enc_out,
                src_mask,
                model.generate_key_padding_mask(fr_ids),
                model.generate_causal_mask(fr_ids.size(1)),
            )
            # Greedy choice: arg-max over the vocabulary, newest position only.
            next_id = model.final_layer(x).argmax(dim=-1)[:, -1:]
            fr_ids = torch.cat([fr_ids, next_id], axis=-1)
            if fr_ids[0, -1] == fr_tokenizer.token_to_id("[end]"):
                break

        # Decode the predicted IDs
        pred_fr = fr_tokenizer.decode(fr_ids[0].tolist())
        print(f"English: {en}")
        print(f"French: {true_fr}")
        print(f"Predicted: {pred_fr}")
        print()

English: he's a total wreck.
French: c'est une vraie loque.
Predicted:  je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je

English: it looks like no one's home.
French: on dirait que personne n'est à la maison.
Predicted:  je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je

English: everybody except me knew what was going on.
French: tout le monde sauf moi savait ce qu'il se passait.
Predicted:  je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je je

English: you can't believe anybody.
French: vous ne pouvez pas croire à n'importe qui.
Predicted:  je je je je je je je je je je je je je je je je je je je je je

In [None]:
# -*- coding: utf-8 -*-
"""TFM Machine Translation - fixed and optimized

Modifications made:
- Reduced model size for faster training
- Pre-tokenize dataset once (avoid tokenization inside collate_fn)
- Fixed tokenizer `add_special_tokens` usage
- Added dropout in embedding
- Fixed key_padding_mask dtype and shapes
- Replaced SequentialLR warmup with Transformer-style LambdaLR
- Fixed decoding step to use only last token's logits
- Reduced max_len to 128
"""

import os
import requests
import unicodedata
import random

# Download data if not present
url = "https://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip"
if not os.path.exists("fra-eng.zip"):
    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() prevents an HTTP error page from being saved as the archive.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open("fra-eng.zip", "wb") as f:
        f.write(response.content)

# Extract with the standard library so this step is valid Python both inside
# and outside IPython, and so extraction errors are raised instead of being
# silently swallowed. Existing files are overwritten.
import zipfile

with zipfile.ZipFile("fra-eng.zip") as archive:
    archive.extractall()


def normalize(line):
    """Normalize one raw corpus line and return its (english, french) pair.

    The line is stripped and NFKC-normalized, then split on TAB characters;
    each side is lower-cased and stripped. Only the first two fields are
    used, so corpus variants that append extra TAB-separated columns (for
    example an attribution field) are accepted as well.

    Args:
        line: One line of text laid out as english<TAB>french[<TAB>extra...].

    Returns:
        Tuple ``(eng, fra)`` of lower-cased, whitespace-stripped strings.

    Raises:
        ValueError: If the line does not contain a TAB separator.
    """
    line = unicodedata.normalize("NFKC", line.strip())
    fields = line.split("\t")
    if len(fields) < 2:
        raise ValueError(f"expected 'english<TAB>french', got {line!r}")
    eng, fra = fields[0], fields[1]
    return eng.lower().strip(), fra.lower().strip()

# Read the corpus and keep one normalized (english, french) tuple per line.
with open("fra.txt", "r", encoding="utf-8") as corpus:
    text_pairs = [(eng, fra) for eng, fra in map(normalize, corpus)]

# Tokenizer setup (train if not exists)
import tokenizers

# Reuse previously trained tokenizers when both JSON files exist; otherwise
# train one byte-level BPE tokenizer per language and cache it on disk.
if all(os.path.exists(path) for path in ("en_tokenizer.json", "fr_tokenizer.json")):
    en_tokenizer = tokenizers.Tokenizer.from_file("en_tokenizer.json")
    fr_tokenizer = tokenizers.Tokenizer.from_file("fr_tokenizer.json")
else:
    en_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
    fr_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())

    for tok in (en_tokenizer, fr_tokenizer):
        # Byte-level pre-tokenizer with a leading space, and the matching decoder.
        tok.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=True)
        tok.decoder = tokenizers.decoders.ByteLevel()

    # One trainer configuration is shared by both languages.
    VOCAB_SIZE = 8000
    trainer = tokenizers.trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=["[start]", "[end]", "[pad]"],
        show_progress=True,
    )
    for tok, column in ((en_tokenizer, 0), (fr_tokenizer, 1)):
        tok.train_from_iterator([pair[column] for pair in text_pairs], trainer=trainer)

    # Enable padding with our [pad] token, then save each tokenizer.
    for tok, path in ((en_tokenizer, "en_tokenizer.json"), (fr_tokenizer, "fr_tokenizer.json")):
        tok.enable_padding(pad_id=tok.token_to_id("[pad]"), pad_token="[pad]")
        tok.save(path, pretty=True)

# Quick inspect
# Tokenize one French sample, wrapped in the start/end markers, and show its first tokens.
fr_sample = text_pairs[120][1]
encoded = fr_tokenizer.encode(" ".join(("[start]", fr_sample, "[end]")))
print("Example: ", fr_sample)
print("Tokens (sample):", encoded.tokens[:20])

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

# Pre-tokenize all data to avoid runtime tokenization overhead
# Special-token IDs, all taken from the French tokenizer.
start_id, end_id, pad_id = (
    fr_tokenizer.token_to_id(token) for token in ("[start]", "[end]", "[pad]")
)

# One (english_ids, french_ids) pair of long tensors per sentence pair; the
# French IDs are framed by start_id / end_id.
PRE_TOKENIZED = [
    (
        torch.tensor(en_tokenizer.encode(eng).ids, dtype=torch.long),
        torch.tensor([start_id, *fr_tokenizer.encode(fra).ids, end_id], dtype=torch.long),
    )
    for eng, fra in text_pairs
]

class EncodedTranslationDataset(Dataset):
    """Map-style dataset over already-encoded sentence pairs.

    Items are returned exactly as stored in ``encoded_pairs``.
    """

    def __init__(self, encoded_pairs):
        # Indexable collection of encoded pairs, kept by reference.
        self.pairs = encoded_pairs

    def __len__(self):
        """Return the number of stored pairs."""
        stored = self.pairs
        return len(stored)

    def __getitem__(self, idx):
        """Return the stored pair at position ``idx``."""
        stored = self.pairs
        return stored[idx]

# collate only pads tensors
def collate_fn(batch):
    """Pad a batch of pre-encoded (english_ids, french_ids) tensor pairs.

    Returns:
        Tuple ``(en_pad, fr_pad)`` of batch-first tensors, each padded with
        the "[pad]" ID of the corresponding tokenizer.
    """
    en_seqs, fr_seqs = zip(*batch)

    def _pad(sequences, tokenizer):
        # Right-pad all sequences to the longest one in the batch.
        return pad_sequence(
            sequences, batch_first=True, padding_value=tokenizer.token_to_id("[pad]")
        )

    return _pad(en_seqs, en_tokenizer), _pad(fr_seqs, fr_tokenizer)

# Shuffled batches over the pre-tokenized pairs; collate_fn pads each batch.
BATCH_SIZE = 32
dataset = EncodedTranslationDataset(PRE_TOKENIZED)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Transformer implementation (slimmed and fixes)
class TokenAndPositionalEmbedding(nn.Module):
    """Learned token embedding plus learned absolute-position embedding, followed by dropout.

    Args:
        max_len: Number of rows in the position table; inputs may not be
            longer than this.
        vocab_size: Number of rows in the token embedding table.
        embedding_dim: Size of each embedding vector.
        dropout: Dropout probability applied to the summed embeddings.
    """

    def __init__(self, max_len, vocab_size, embedding_dim, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_embedding = nn.Embedding(max_len, embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Embed a batch of token IDs.

        Args:
            x: Integer tensor of token IDs with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, embedding_dim).

        Raises:
            IndexError: If seq_len exceeds the size of the position table.
        """
        _, seq_len = x.size()
        max_len = self.positional_embedding.num_embeddings
        if seq_len > max_len:
            # Fail early with a readable message; on CUDA the raw lookup would
            # otherwise surface as an opaque device-side assertion.
            raise IndexError(
                f"sequence length {seq_len} exceeds positional table size {max_len}"
            )
        positions = torch.arange(seq_len, dtype=torch.long, device=x.device)
        # The (seq_len, dim) position vectors broadcast over the batch dimension.
        return self.dropout(self.token_embedding(x) + self.positional_embedding(positions))

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and
    LayerNorm (normalization is applied after the residual sum).
    """

    def __init__(self, embedding_dim, num_heads, ffdims, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=embedding_dim, num_heads=num_heads, batch_first=True
        )
        self.ffn = nn.Sequential(
            nn.Linear(embedding_dim, ffdims),
            nn.ReLU(),
            nn.Linear(ffdims, embedding_dim),
        )
        self.norm1 = nn.LayerNorm(embedding_dim, eps=1e-6)
        self.norm2 = nn.LayerNorm(embedding_dim, eps=1e-6)
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)

    def forward(self, x, key_padding_mask=None):
        """Apply the layer to ``x``.

        ``key_padding_mask`` is a (batch, seq) bool tensor where True marks a
        PAD position that attention should ignore.
        """
        attended = self.attn(x, x, x, key_padding_mask=key_padding_mask)[0]
        hidden = self.norm1(x + self.dropout1(attended))
        return self.norm2(hidden + self.dropout2(self.ffn(hidden)))

class Encoder(nn.Module):
    """Token + position embedding followed by ``num_layers`` EncoderBlock layers."""

    def __init__(self, num_layers, embedding_dim, num_heads, ffdims, src_vocab_size, max_len, dropout=0.1):
        super().__init__()
        self.embedding = TokenAndPositionalEmbedding(max_len, src_vocab_size, embedding_dim, dropout)
        layers = []
        for _ in range(num_layers):
            layers.append(EncoderBlock(embedding_dim, num_heads, ffdims, dropout))
        self.encoders = nn.ModuleList(layers)

    def forward(self, x, key_padding_mask=None):
        """Embed token IDs ``x`` and run every layer with the same padding mask."""
        hidden = self.embedding(x)
        for layer in self.encoders:
            hidden = layer(hidden, key_padding_mask=key_padding_mask)
        return hidden

class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each sub-layer is followed by dropout, a residual connection and
    LayerNorm (normalization is applied after the residual sum).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.masked_attn = nn.MultiheadAttention(
            embed_dim=embed_dim, num_heads=num_heads, batch_first=True
        )
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=embed_dim, num_heads=num_heads, batch_first=True
        )
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim, bias=True),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim, bias=True),
        )
        self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm_3 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_key_padding_mask, tgt_key_padding_mask, causal_mask):
        """Apply the layer to decoder states ``x`` given the encoder output.

        ``causal_mask`` is a (tgt_len, tgt_len) attention mask that prevents
        looking at future positions; the key-padding masks are (batch, len)
        bool tensors for the target and the source respectively.
        """
        # Masked self-attention over the target sequence.
        self_attended = self.masked_attn(
            x, x, x,
            attn_mask=causal_mask,
            key_padding_mask=tgt_key_padding_mask,
        )[0]
        hidden = self.layernorm_1(x + self.dropout_1(self_attended))

        # Cross-attention: decoder queries against encoder keys and values.
        cross_attended = self.cross_attn(
            hidden, enc_output, enc_output,
            key_padding_mask=src_key_padding_mask,
        )[0]
        hidden = self.layernorm_2(hidden + self.dropout_2(cross_attended))

        # Position-wise feed-forward network.
        return self.layernorm_3(hidden + self.dropout_3(self.ffn(hidden)))

class Decoder(nn.Module):
    """Token + position embedding followed by ``num_layers`` DecoderBlock layers."""

    def __init__(self, num_layers, embed_dim, num_heads, ff_dim, tgt_vocab_size, max_len, dropout=0.1):
        super().__init__()
        self.embedding = TokenAndPositionalEmbedding(max_len, tgt_vocab_size, embed_dim, dropout)
        layers = []
        for _ in range(num_layers):
            layers.append(DecoderBlock(embed_dim, num_heads, ff_dim, dropout))
        self.decoders = nn.ModuleList(layers)

    def forward(self, x, enc_output, src_key_padding_mask, tgt_key_padding_mask, causal_mask):
        """Embed target IDs ``x`` and run every layer with the same encoder output and masks."""
        hidden = self.embedding(x)
        for layer in self.decoders:
            hidden = layer(hidden, enc_output, src_key_padding_mask, tgt_key_padding_mask, causal_mask)
        return hidden

class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping source token IDs to target-vocabulary logits.

    Args:
        embed_dim: Embedding / hidden size used throughout the model.
        num_heads: Number of attention heads per attention layer.
        ff_dim: Hidden size of the feed-forward sub-layers.
        num_layers: Number of encoder layers and of decoder layers.
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary (width of the output logits).
        max_len: Size of the positional embedding tables.
        dropout: Dropout probability passed to the encoder and decoder.
        device: Default device for masks built by ``generate_causal_mask``
            when the caller does not pass one.
        pad_id: ID of the padding token. When left as ``None`` the
            module-level ``pad_id`` is looked up at call time, which is the
            behavior existing callers rely on.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, num_layers, src_vocab_size, tgt_vocab_size, max_len, dropout=0.1, device='cpu', pad_id=None):
        super().__init__()
        self.encoder = Encoder(num_layers, embed_dim, num_heads, ff_dim, src_vocab_size, max_len, dropout)
        self.decoder = Decoder(num_layers, embed_dim, num_heads, ff_dim, tgt_vocab_size, max_len, dropout)
        self.final_layer = nn.Linear(embed_dim, tgt_vocab_size)
        self.device = device
        self.pad_id = pad_id

    def generate_causal_mask(self, seq_len, device=None):
        """Return a (seq_len, seq_len) additive mask: -inf above the diagonal, 0 elsewhere.

        ``device`` defaults to ``self.device``. No special case is needed for
        ``seq_len == 1``: triu(diagonal=1) already yields ``[[0.]]``.
        """
        if device is None:
            device = self.device
        return torch.triu(torch.full((seq_len, seq_len), float('-inf'), device=device), diagonal=1)

    def generate_key_padding_mask(self, src):
        """Return a bool tensor shaped like ``src`` that is True where ``src`` holds the pad ID.

        ``src`` is (batch, seq); the comparison result is already boolean and
        lives on ``src``'s device.
        """
        # Inside this method the bare name refers to the module-level pad_id.
        pad = pad_id if self.pad_id is None else self.pad_id
        return src == pad

    def forward(self, src, tgt):
        """Compute target-vocabulary logits.

        Args:
            src: Source token IDs, shape (batch, src_len).
            tgt: Target token IDs fed to the decoder, shape (batch, tgt_len).

        Returns:
            Output of the final linear layer applied to the decoder states.
        """
        src_key_padding_mask = self.generate_key_padding_mask(src)  # (batch, src_len)
        tgt_key_padding_mask = self.generate_key_padding_mask(tgt)  # (batch, tgt_len)
        # Build the causal mask on the inputs' device so it stays correct even
        # if the model was moved after construction.
        causal_mask = self.generate_causal_mask(tgt.size(1), device=tgt.device)  # (tgt_len, tgt_len)
        enc_out = self.encoder(src, key_padding_mask=src_key_padding_mask)
        dec_out = self.decoder(tgt, enc_out, src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask, causal_mask=causal_mask)
        return self.final_layer(dec_out)

# Model config (reduced for faster training)
# Hyper-parameters handed to Transformer(**model_config); the model is moved to the chosen device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_config = dict(
    embed_dim=256,
    num_heads=4,
    ff_dim=1024,
    num_layers=3,
    src_vocab_size=en_tokenizer.get_vocab_size(),
    tgt_vocab_size=fr_tokenizer.get_vocab_size(),
    max_len=128,
    dropout=0.1,
    device=device,
)
model = Transformer(**model_config).to(device)

# Training setup
import torch.optim as optim
N_EPOCHS = 20
WARMUP_STEPS = 4000
CLIP_NORM = 1.0

# PAD targets are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)

# Base lr is 1.0, so the factor returned by lr_lambda is the effective learning rate.
optimizer = optim.Adam(model.parameters(), lr=1.0)


def lr_lambda(step):
    """Transformer warm-up schedule factor for a 0-indexed scheduler ``step``.

    Returns ``embed_dim ** -0.5 * min(s ** -0.5, s * WARMUP_STEPS ** -1.5)``
    with ``s = step + 1`` (clamped to at least 1).
    """
    one_based = max(step + 1, 1)
    model_scale = model_config['embed_dim'] ** -0.5
    decay_term = one_based ** -0.5
    warmup_term = one_based * (WARMUP_STEPS ** -1.5)
    return model_scale * min(decay_term, warmup_term)


scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# Training loop
# One optimizer step and one scheduler step per batch; the mean batch loss is printed per epoch.
for epoch in range(N_EPOCHS):
    model.train()
    epoch_loss = 0.0
    for en_ids, fr_ids in dataloader:
        en_ids = en_ids.to(device)
        fr_ids = fr_ids.to(device)

        # decoder input is fr_ids[:, :-1], target is fr_ids[:, 1:]
        # (the logits at position t are scored against the token at t+1)
        decoder_input = fr_ids[:, :-1]
        decoder_target = fr_ids[:, 1:]

        optimizer.zero_grad()
        outputs = model(en_ids, decoder_input)  # (bs, tgt_len-1, vocab)

        # Flatten batch and time so the loss compares 2D logits with 1D targets.
        loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), decoder_target.reshape(-1))
        loss.backward()
        # Clip the global gradient norm to CLIP_NORM before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{N_EPOCHS}; Avg loss {avg_loss:.4f}")

# Simple greedy decoding for evaluation on a few samples
model.eval()
N_SAMPLES = 5
MAX_LEN = 60
with torch.no_grad():
    start_token = torch.tensor([start_id]).to(device)
    for en_ids, true_fr_ids in random.sample(PRE_TOKENIZED, N_SAMPLES):
        # Add a batch dimension and run the encoder once per sentence.
        en_ids = en_ids.unsqueeze(0).to(device)
        src_mask = model.generate_key_padding_mask(en_ids)
        enc_out = model.encoder(en_ids, src_mask)

        # Start from the start token, shape (1, 1), and append one token per step.
        fr_ids = start_token.unsqueeze(0)
        for _ in range(MAX_LEN):
            x = model.decoder(
                fr_ids,
                enc_out,
                src_mask,
                model.generate_key_padding_mask(fr_ids),
                model.generate_causal_mask(fr_ids.size(1)),
            )
            # Logits are (1, seq_len, vocab); only the newest position is used.
            last_step_logits = model.final_layer(x)[:, -1, :]
            next_token = last_step_logits.argmax(dim=-1, keepdim=True)  # (1, 1)
            fr_ids = torch.cat([fr_ids, next_token], dim=-1)
            if fr_ids[0, -1].item() == end_id:
                break

        # Decode source, reference and prediction back to text.
        eng_text = en_tokenizer.decode(en_ids[0].tolist())
        true_text = fr_tokenizer.decode(true_fr_ids.tolist())
        pred_text = fr_tokenizer.decode(fr_ids[0].tolist())

        print("English:", eng_text)
        print("True French:", true_text)
        print("Predicted:", pred_text)
        print()


Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 
Example:  fous le camp !
Tokens (sample): ['[start]', 'Ġfous', 'Ġle', 'Ġcamp', 'Ġ!', 'Ġ', '[end]']




Epoch 1/20; Avg loss 2.9733
Epoch 2/20; Avg loss 1.5412
Epoch 3/20; Avg loss 1.1615
Epoch 4/20; Avg loss 0.9641
Epoch 5/20; Avg loss 0.8381
Epoch 6/20; Avg loss 0.7467
Epoch 7/20; Avg loss 0.6764
Epoch 8/20; Avg loss 0.6201
Epoch 9/20; Avg loss 0.5743
Epoch 10/20; Avg loss 0.5360
Epoch 11/20; Avg loss 0.5018
Epoch 12/20; Avg loss 0.4736
Epoch 13/20; Avg loss 0.4489
Epoch 14/20; Avg loss 0.4257
Epoch 15/20; Avg loss 0.4063
Epoch 16/20; Avg loss 0.3889
Epoch 17/20; Avg loss 0.3726
Epoch 18/20; Avg loss 0.3589
Epoch 19/20; Avg loss 0.3466
Epoch 20/20; Avg loss 0.3340
English:  she's drop-dead gorgeous.
True French:  elle est bonne à tomber raide.
Predicted:  elle est bonne à tomber.

English:  you can't judge a book by its cover.
True French:  on ne doit pas juger un livre sur sa reliure.
Predicted:  on ne peut pas juger un livre sur sa couverture.

English:  he is probably still alive.
True French:  il est probablement toujours vivant.
Predicted:  il est probablement encore en vie.

Engl

In [None]:
# Simple greedy decoding for evaluation on a few samples
model.eval()
N_SAMPLES = 5
MAX_LEN = 60
with torch.no_grad():
    start_token = torch.tensor([start_id]).to(device)
    for en_ids, true_fr_ids in random.sample(PRE_TOKENIZED, N_SAMPLES):
        # Add a batch dimension and run the encoder once per sentence.
        en_ids = en_ids.unsqueeze(0).to(device)
        src_mask = model.generate_key_padding_mask(en_ids)
        enc_out = model.encoder(en_ids, src_mask)

        # Start from the start token, shape (1, 1), and append one token per step.
        fr_ids = start_token.unsqueeze(0)
        for _ in range(MAX_LEN):
            x = model.decoder(
                fr_ids,
                enc_out,
                src_mask,
                model.generate_key_padding_mask(fr_ids),
                model.generate_causal_mask(fr_ids.size(1)),
            )
            # Logits are (1, seq_len, vocab); only the newest position is used.
            last_step_logits = model.final_layer(x)[:, -1, :]
            next_token = last_step_logits.argmax(dim=-1, keepdim=True)  # (1, 1)
            fr_ids = torch.cat([fr_ids, next_token], dim=-1)
            if fr_ids[0, -1].item() == end_id:
                break

        # Decode source, reference and prediction back to text.
        eng_text = en_tokenizer.decode(en_ids[0].tolist())
        true_text = fr_tokenizer.decode(true_fr_ids.tolist())
        pred_text = fr_tokenizer.decode(fr_ids[0].tolist())

        print("English:", eng_text)
        print("True French:", true_text)
        print("Predicted:", pred_text)
        print()




English:  i want this building locked.
True French:  je veux que cet immeuble soit verrouillé.
Predicted:  je veux que cet immeuble soit verrouillé.

English:  i think it's time for me to abandon that plan.
True French:  je pense qu'il est temps pour moi de laisser tomber ce projet.
Predicted:  je pense qu'il est temps pour moi d'abandonner ce projet.

English:  i need to talk to you about an urgent matter.
True French:  il me faut vous entretenir d'une affaire pressante.
Predicted:  il me faut vous entretenir d'une affaire pressante.

English:  haven't you caused enough trouble already?
True French:  t'as pas causé assez de problèmes comme ça ?
Predicted:  n'as-tu pas assez causé de problèmes ?

English:  don't you want to see the world?
True French:  ne veux-tu pas voir le monde ?
Predicted:  ne voulez-vous pas voir le monde ?



In [None]:


def translate_sentence(model, en_tokenizer, fr_tokenizer, sentence, max_len=60, device="cpu"):
    """Greedy-decode a French translation for one English sentence.

    The sentence is stripped, NFKC-normalized and lower-cased before
    tokenization, because the corpus the tokenizers and model were built from
    is preprocessed that way; raw capitalized input would otherwise be
    tokenized differently from anything seen during training.

    Args:
        model: Object exposing ``eval``, ``encoder``, ``decoder``,
            ``final_layer``, ``generate_key_padding_mask`` and
            ``generate_causal_mask`` as used below.
        en_tokenizer: Source tokenizer; ``encode(text).ids`` is used.
        fr_tokenizer: Target tokenizer; ``token_to_id`` and ``decode`` are used.
        sentence: English input text.
        max_len: Maximum number of tokens generated after the start token.
        device: Device the input tensors are created on; it has to match the
            device the model lives on.

    Returns:
        The text ``fr_tokenizer.decode`` produces for the generated IDs.

    Note:
        Calls ``model.eval()`` and leaves the model in evaluation mode.
    """
    model.eval()
    with torch.no_grad():
        start_id = fr_tokenizer.token_to_id("[start]")
        end_id = fr_tokenizer.token_to_id("[end]")

        # Apply the same text normalization as the training corpus.
        sentence = unicodedata.normalize("NFKC", sentence.strip()).lower().strip()

        # Encode English input and add a batch dimension.
        en_ids = torch.tensor(en_tokenizer.encode(sentence).ids, dtype=torch.long).unsqueeze(0).to(device)

        # Encoder forward pass (once per sentence).
        src_mask = model.generate_key_padding_mask(en_ids)
        enc_out = model.encoder(en_ids, src_mask)

        # Start decoding from the start token and append one token per step.
        fr_ids = torch.tensor([[start_id]], dtype=torch.long, device=device)
        for _ in range(max_len):
            causal_mask = model.generate_causal_mask(fr_ids.size(1))
            tgt_mask = model.generate_key_padding_mask(fr_ids)
            x = model.decoder(fr_ids, enc_out, src_mask, tgt_mask, causal_mask)
            logits = model.final_layer(x)
            # Greedy choice from the newest position only.
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            fr_ids = torch.cat([fr_ids, next_token], dim=-1)
            if fr_ids[0, -1].item() == end_id:
                break

        pred_text = fr_tokenizer.decode(fr_ids[0].tolist())
        return pred_text


device = "cuda" if torch.cuda.is_available() else "cpu"
# Named `sentence` rather than `input` so the built-in input() is not shadowed
# for the rest of the notebook session.
sentence = "Excuse me, where is the nearest coffee shop?"
result = translate_sentence(model, en_tokenizer, fr_tokenizer, sentence, device=device)

print(result)

 it-moi quand est le café le plus proche ?
