In [30]:
import re
import time
import math
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F

from transformer import Transformer

In [38]:
EXCEL_PATH = "/content/english_to_urdu_dataset.xlsx"
TOKENIZER_EN_PREFIX = "/content/tokenizer_en"   # prefix used when saved
TOKENIZER_UR_PREFIX = "/content/tokenizer_ur"
BUILD_TOKENIZERS_IF_MISSING = True
TARGET_VOCAB_SIZE = 2 ** 13

BATCH_SIZE = 32
MAX_SEQ_LEN = 40           # must match transformer's max_sequence_length used at init
D_MODEL = 256
FFN_HIDDEN = 512
NUM_HEADS = 8
NUM_LAYERS = 4
DROP_PROB = 0.1
LR = 1e-4
EPOCHS = 6
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NEG_INF = -1e9

In [33]:
df = pd.read_excel(EXCEL_PATH, engine='openpyxl')  # engine explicit for .xlsx

# detect likely columns
possible_cols = [c.lower() for c in df.columns]
if 'english' in possible_cols:
    en_col = df.columns[possible_cols.index('english')]
elif 'eng' in possible_cols:
    en_col = df.columns[possible_cols.index('eng')]
elif 'English' in df.columns:
    en_col = 'English'
else:
    # If your excel has other names, update here
    en_col = df.columns[0]

if 'urdu' in possible_cols:
    ur_col = df.columns[possible_cols.index('urdu')]
elif 'Urdu' in df.columns:
    ur_col = 'Urdu'
else:
    ur_col = df.columns[1]

df = df[[en_col, ur_col]].rename(columns={en_col: 'eng', ur_col: 'urdu'})

In [34]:
# cleaning functions
def clean_english(text):
    if isinstance(text, str):
        return re.sub(r'[^a-z0-9\s]', '', text.lower()).strip()
    return ''

def clean_urdu(text):
    if isinstance(text, str):
        # remove ASCII letters/digits/punctuation; keep unicode (Urdu)
        return re.sub(r'[A-Za-z0-9!"#$%&\'()*+,\-./:;<=>?@\[\]\\^_`{|}~]', '', text).strip()
    return ''

df['eng'] = df['eng'].apply(clean_english)
df['urdu'] = df['urdu'].apply(clean_urdu)
df = df[(df['eng'] != '') & (df['urdu'] != '')].reset_index(drop=True)

print("Total cleaned pairs:", len(df))

Total cleaned pairs: 9101


In [39]:
import tensorflow_datasets as tfds

try:
    tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.load_from_file(TOKENIZER_EN_PREFIX)
    tokenizer_ur = tfds.deprecated.text.SubwordTextEncoder.load_from_file(TOKENIZER_UR_PREFIX)
    print("Loaded tokenizers from disk.")
except Exception as e:
    print("Couldn't load tokenizers from disk:", e)
    if not BUILD_TOKENIZERS_IF_MISSING:
        raise RuntimeError("Set BUILD_TOKENIZERS_IF_MISSING=True to build tokenizers from the dataset.")
    # Build tokenizers from training corpus (slow)
    english_corpus = df['eng'].tolist()
    urdu_corpus = df['urdu'].tolist()
    tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(english_corpus, target_vocab_size=TARGET_VOCAB_SIZE)
    tokenizer_ur = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(urdu_corpus, target_vocab_size=TARGET_VOCAB_SIZE)
    tokenizer_en.save_to_file(TOKENIZER_EN_PREFIX)
    tokenizer_ur.save_to_file(TOKENIZER_UR_PREFIX)
    print("Built & saved tokenizers.")

# Helper: map tokenizer ids -> subword text piece (string)
# SubwordTextEncoder uses .subwords list (length = vocab_size)
subwords_en = list(tokenizer_en.subwords)
subwords_ur = list(tokenizer_ur.subwords)

# Add special tokens as textual tokens for your SentenceEmbedding mapping
START_TOKEN = "<S>"
END_TOKEN = "</S>"
PADDING_TOKEN = "<PAD>"

# English token strings and mapping to indices (for SentenceEmbedding)
english_token_list = subwords_en + [START_TOKEN, END_TOKEN, PADDING_TOKEN]
english_to_index = {tok: idx for idx, tok in enumerate(english_token_list)}
index_to_english = {idx: tok for tok, idx in english_to_index.items()}

# Urdu token strings and mapping to indices
urdu_token_list = subwords_ur + [START_TOKEN, END_TOKEN, PADDING_TOKEN]
urdu_to_index = {tok: idx for idx, tok in enumerate(urdu_token_list)}
index_to_urdu = {idx: tok for tok, idx in urdu_to_index.items()}

print("English tokens:", len(english_token_list), " Urdu tokens:", len(urdu_token_list))

# PAD index used by loss
PAD_IDX_URDU = urdu_to_index[PADDING_TOKEN]

Couldn't load tokenizers from disk: /content/tokenizer_en.subwords; No such file or directory
Built & saved tokenizers.
English tokens: 7347  Urdu tokens: 8717


In [47]:
class EngUrDataset(Dataset):
    def __init__(self, eng_texts, ur_texts):
        assert len(eng_texts) == len(ur_texts)
        self.eng = eng_texts
        self.ur = ur_texts

    def __len__(self):
        return len(self.eng)

    def __getitem__(self, idx):
        en_sentence = self.eng[idx]
        ur_sentence = self.ur[idx]
        # Encode to subword ids (integers)
        en_ids = tokenizer_en.encode(en_sentence)
        ur_ids = tokenizer_ur.encode(ur_sentence)
        # Convert ids to subword strings (these are the tokens our SentenceEmbedding maps)
        en_tokens = []
        for i in en_ids:
            if i < len(subwords_en):
                en_tokens.append(subwords_en[i])
            elif i == len(subwords_en):  # START token
                en_tokens.append("<S>")
            elif i == len(subwords_en) + 1:  # END token
                en_tokens.append("</S>")

        ur_tokens = []
        for i in ur_ids:
            if i < len(subwords_ur):
                ur_tokens.append(subwords_ur[i])
            elif i == len(subwords_ur):  # START token
                ur_tokens.append("<S>")
            elif i == len(subwords_ur) + 1:  # END token
                ur_tokens.append("</S>")

        return en_tokens, ur_tokens

def collate_batch(batch):
    # returns lists of token-lists
    en_batch, ur_batch = zip(*batch)
    return list(en_batch), list(ur_batch)

# Train/val split
from sklearn.model_selection import train_test_split
train_texts, val_texts = train_test_split(df, test_size=0.1, random_state=42)
train_ds = EngUrDataset(train_texts['eng'].tolist(), train_texts['urdu'].tolist())
val_ds = EngUrDataset(val_texts['eng'].tolist(), val_texts['urdu'].tolist())

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [41]:
def create_masks(eng_batch, ur_batch, max_sequence_length):
    """
    eng_batch, ur_batch: lists of token-lists (e.g. [ ['this','is','a'], ... ])
    returns encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask
    Each mask will be a float tensor with NEG_INF for masked positions and 0 for allowed.
    Shapes: (batch, max_seq, max_seq)
    """
    batch_size = len(eng_batch)
    look_ahead_mask = torch.triu(torch.ones((max_sequence_length, max_sequence_length), dtype=torch.bool), diagonal=1)
    # same shape as earlier
    encoder_padding_mask = torch.zeros((batch_size, max_sequence_length, max_sequence_length), dtype=torch.bool)
    decoder_padding_mask_self = torch.zeros((batch_size, max_sequence_length, max_sequence_length), dtype=torch.bool)
    decoder_padding_mask_cross = torch.zeros((batch_size, max_sequence_length, max_sequence_length), dtype=torch.bool)

    for idx in range(batch_size):
        eng_len = len(eng_batch[idx]) + 1 + 1  # start + tokens + end -> SentenceEmbedding will add start & end
        ur_len = len(ur_batch[idx]) + 1 + 1

        # indices that will be padding (positions after actual length)
        eng_pad_positions = np.arange(eng_len, max_sequence_length)
        ur_pad_positions = np.arange(ur_len, max_sequence_length)

        # For encoder padding mask: mask rows/cols corresponding to padding tokens.
        encoder_padding_mask[idx, :, eng_pad_positions] = True
        encoder_padding_mask[idx, eng_pad_positions, :] = True

        # Decoder self-attention padding mask (prevent padding tokens from attending and being attended to)
        decoder_padding_mask_self[idx, :, ur_pad_positions] = True
        decoder_padding_mask_self[idx, ur_pad_positions, :] = True

        # Cross-attention: prevent decoder positions (including padded) attending to encoder padding
        decoder_padding_mask_cross[idx, :, eng_pad_positions] = True
        decoder_padding_mask_cross[idx, ur_pad_positions, :] = True

    # convert to float masks with NEG_INF for masked positions (matching your scaling usage)
    encoder_self_attention_mask = torch.where(encoder_padding_mask, torch.tensor(NEG_INF), torch.tensor(0.0))
    # combine lookahead + decoder padding for decoder self-attention
    # look_ahead_mask has shape (max_seq, max_seq) boolean; expand to batch
    look_ahead_mask_bool = look_ahead_mask.unsqueeze(0).expand(batch_size, -1, -1)
    decoder_self_attention_mask = torch.where(look_ahead_mask_bool | decoder_padding_mask_self,
                                             torch.tensor(NEG_INF), torch.tensor(0.0))
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross, torch.tensor(NEG_INF), torch.tensor(0.0))

    return encoder_self_attention_mask.to(DEVICE), decoder_self_attention_mask.to(DEVICE), decoder_cross_attention_mask.to(DEVICE)

In [42]:
src_vocab_size_for_model = len(english_token_list)   # equals embedding vocab inside SentenceEmbedding
tgt_vocab_size_for_model = len(urdu_token_list)

model = Transformer(
    d_model=D_MODEL,
    ffn_hidden=FFN_HIDDEN,
    num_heads=NUM_HEADS,
    drop_prob=DROP_PROB,
    num_layers=NUM_LAYERS,
    max_sequence_length=MAX_SEQ_LEN,
    kn_vocab_size=tgt_vocab_size_for_model,
    english_to_index=english_to_index,
    urdu_to_index=urdu_to_index,
    START_TOKEN=START_TOKEN,
    END_TOKEN=END_TOKEN,
    PADDING_TOKEN=PADDING_TOKEN
).to(DEVICE)

# loss & optimizer
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX_URDU, reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [43]:
def train_one_epoch(model, loader, epoch):
    model.train()
    total_loss = 0.0
    total_tokens = 0
    start = time.time()

    for batch_idx, (eng_batch, ur_batch) in enumerate(loader):
        # eng_batch, ur_batch are lists of token-lists (strings), length = batch_size
        batch_size = len(eng_batch)

        # Create masks: these assume SentenceEmbedding will add start/end tokens
        enc_mask, dec_self_mask, dec_cross_mask = create_masks(eng_batch, ur_batch, MAX_SEQ_LEN)

        optimizer.zero_grad()

        preds = model(
            x=eng_batch,
            y=ur_batch,
            encoder_self_attention_mask=enc_mask,
            decoder_self_attention_mask=dec_self_mask,
            decoder_cross_attention_mask=dec_cross_mask,
            enc_start_token=True,
            enc_end_token=True,
            dec_start_token=True,
            dec_end_token=True
        )  # preds shape: (batch, seq_len, vocab_size) as floats (logits)

        # Build target labels: we need indices (ints) of the target tokens after SentenceEmbedding tokenization
        # We'll reuse the SentenceEmbedding logic: sequence length will equal MAX_SEQ_LEN inside model outputs.
        # But to compute loss we must get the integer labels tensor:
        # We'll convert ur_batch token-lists -> tensor of indices using urdu_to_index, letting SentenceEmbedding's padding match.
        def batch_to_indices(batch_tokens, mapping):
            # batch_tokens: list of token-lists (strings) for each example (no start/end)
            # mapping maps token-string -> int index in embedding
            res = []
            for tokens in batch_tokens:
                idxs = [mapping[t] for t in tokens]               # tokens only (SentenceEmbedding will add start/end)
                # SentenceEmbedding.batch_tokenize will insert start & end if flags true, then pad to max_sequence_length
                # But here we must mimic exactly what SentenceEmbedding produces:
                seq = []
                seq.append(mapping[START_TOKEN])
                seq.extend(idxs)
                seq.append(mapping[END_TOKEN])
                # pad
                while len(seq) < MAX_SEQ_LEN:
                    seq.append(mapping[PADDING_TOKEN])
                # truncate if > MAX_SEQ_LEN
                seq = seq[:MAX_SEQ_LEN]
                res.append(seq)
            return torch.tensor(res, dtype=torch.long, device=DEVICE)

        target_indices = batch_to_indices(ur_batch, urdu_to_index)  # shape (batch, MAX_SEQ_LEN)

        # preds: (batch, seq_len, vocab)
        # We will compute loss on all token positions, but exclude PAD positions
        preds_flat = preds.view(-1, preds.size(-1))
        labels_flat = target_indices.view(-1)

        # compute per-token loss with reduction='sum' then divide by actual tokens
        loss_sum = criterion(preds_flat, labels_flat)
        non_pad = (labels_flat != PAD_IDX_URDU).sum().item()
        loss = loss_sum / max(1, non_pad)
        loss.backward()
        optimizer.step()

        total_loss += loss_sum.item()
        total_tokens += non_pad

        if batch_idx % 50 == 0:
            print(f"Epoch {epoch} Batch {batch_idx} Loss per token: {loss.item():.4f}")

    epoch_loss = total_loss / max(1, total_tokens)
    print(f"Epoch {epoch} finished in {time.time()-start:.1f}s, avg loss per token: {epoch_loss:.4f}")
    return epoch_loss

In [44]:
def translate_sentence(model, eng_sentence, max_length=MAX_SEQ_LEN):
    model.eval()
    with torch.no_grad():
        # prepare english as token-list (no start/end)
        en_ids = tokenizer_en.encode(eng_sentence)
        en_tokens = [subwords_en[i] for i in en_ids]
        # initial ur sentence is empty string (model will add start token if dec_start_token=True)
        ur_sentence = ("",)   # as tuple of one string like your earlier code used
        for step in range(max_length):
            enc_mask, dec_mask, cross_mask = create_masks([en_tokens], [[]], max_length)
            preds = model(
                x=[en_tokens],
                y=ur_sentence,   # tuple/list; SentenceEmbedding expects sequences; pass as tuple of 1 element
                encoder_self_attention_mask=enc_mask,
                decoder_self_attention_mask=dec_mask,
                decoder_cross_attention_mask=cross_mask,
                enc_start_token=True,
                enc_end_token=True,
                dec_start_token=True,
                dec_end_token=False
            )  # outputs shape (1, max_len, vocab)
            # take next token at current step
            logits_at_step = preds[0, step]   # (vocab,)
            next_idx = torch.argmax(logits_at_step).item()
            next_token_str = index_to_urdu[next_idx]
            # stop if END_TOKEN
            if next_token_str == END_TOKEN:
                break
            # append token (SentenceEmbedding expects token-lists; we can append string to the single string tuple)
            ur_sentence = (ur_sentence[0] + next_token_str, )  # keep format consistent with encoder code
        # ur_sentence is a tuple with concatenated token-strings — convert token-strings back to string using tokenizer_ur.decode
        # However we constructed tokens as raw subword string tokens so it's tricky to decode; instead, build list of subword tokens:
        # We will extract tokens from the built ur_sentence[0] by splitting on token boundaries isn't trivial.
        # Simpler: we can re-run generation collecting token strings in a list instead:
    return None  # We will implement improved inference below

In [45]:
# Better greedy inference that accumulates tokens in list form:
def translate_greedy(model, eng_sentence, max_length=MAX_SEQ_LEN):
    model.eval()
    with torch.no_grad():
        en_ids = tokenizer_en.encode(eng_sentence)
        en_tokens = [subwords_en[i] for i in en_ids]
        ur_tokens = []  # list of generated subword tokens (strings), not joined text
        for step in range(max_length):
            # pass current ur_tokens as batch of one example (note: pass as list-of-strings where each example is list tokens OR empty string)
            enc_mask, dec_mask, cross_mask = create_masks([en_tokens], [ur_tokens], max_length)
            # model expects batch of token-lists. For target we pass a tuple/list with one element that is a list of token strings.
            preds = model(
                x=[en_tokens],
                y=[ur_tokens],   # batch of one, each is a list of token strings
                encoder_self_attention_mask=enc_mask,
                decoder_self_attention_mask=dec_mask,
                decoder_cross_attention_mask=cross_mask,
                enc_start_token=True,
                enc_end_token=True,
                dec_start_token=True,
                dec_end_token=False
            )  # shape (1, max_len, vocab)
            logits = preds[0, len(ur_tokens)]   # logits for next position
            next_idx = torch.argmax(logits).item()
            next_token = index_to_urdu[next_idx]
            if next_token == END_TOKEN:
                break
            if next_token == PADDING_TOKEN:
                break
            ur_tokens.append(next_token)
        # Now ur_tokens is a list of subword token-strings; we need to convert to text
        # Find id of each subword token in tokenizer_ur.subwords and then decode
        ur_ids = []
        for t in ur_tokens:
            if t in urdu_to_index and urdu_to_index[t] < tokenizer_ur.vocab_size:
                ur_ids.append(urdu_to_index[t])
            else:
                # token might be special; skip
                pass
        # decode to string with tokenizer
        ur_text = tokenizer_ur.decode(ur_ids)
        return ur_text

In [48]:
for epoch in range(1, EPOCHS + 1):
    train_loss = train_one_epoch(model, train_loader, epoch)
    # (optional) quick val: translate a few samples
    sample_src = val_texts['eng'].iloc[:3].tolist()
    for s in sample_src:
        translation = translate_greedy(model, s, max_length=MAX_SEQ_LEN)
        print("SRC:", s)
        print("PRED:", translation)
    # you can also save checkpoint
    torch.save(model.state_dict(), f"/content/transformer_eng_ur_epoch{epoch}.pt")

print("Training finished.")

RuntimeError: stack expects each tensor to be equal size, but got [40] at entry 0 and [43] at entry 22

In [51]:
# Full corrected training cell for English -> Urdu (PyTorch)
import re, time, math
import torch, numpy as np, pandas as pd
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# ----- Hyperparams & paths -----
EXCEL_PATH = "/content/english_to_urdu_dataset.xlsx"
TOKENIZER_EN_PREFIX = "/content/tokenizer_en"
TOKENIZER_UR_PREFIX = "/content/tokenizer_ur"
BUILD_TOKENIZERS_IF_MISSING = True
TARGET_VOCAB_SIZE = 2**13

BATCH_SIZE = 32
MAX_SEQ_LEN = 40          # must match transformer's max_sequence_length
D_MODEL = 256
FFN_HIDDEN = 512
NUM_HEADS = 8
NUM_LAYERS = 4
DROP_PROB = 0.1
LR = 1e-4
EPOCHS = 6
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NEG_INF = -1e9

# ----- Load transformer class -----
from transformer import Transformer

# ----- Load & clean dataset -----
df = pd.read_excel(EXCEL_PATH, engine='openpyxl')
# If your columns are already 'eng' and 'urdu' this still works
if 'eng' not in df.columns or 'urdu' not in df.columns:
    df = df.rename(columns={'English': 'eng', 'Urdu': 'urdu'})

def clean_english(text):
    if isinstance(text, str):
        return re.sub(r'[^a-z0-9\s]', '', text.lower()).strip()
    return ''

def clean_urdu(text):
    if isinstance(text, str):
        return re.sub(r'[A-Za-z0-9!"#$%&\'()*+,\-./:;<=>?@\[\]\\^_`{|}~]', '', text).strip()
    return ''

df['eng'] = df['eng'].apply(clean_english)
df['urdu'] = df['urdu'].apply(clean_urdu)
df = df[(df['eng'] != '') & (df['urdu'] != '')].reset_index(drop=True)
print("✅ Total cleaned pairs:", len(df))

# ----- Train / val split (reset indices) -----
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
print("Training pairs:", len(train_df), "Validation pairs:", len(val_df))

# ----- Build / Load tokenizers -----
!pip install -q tensorflow_datasets
import tensorflow_datasets as tfds

try:
    tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.load_from_file(TOKENIZER_EN_PREFIX)
    tokenizer_ur = tfds.deprecated.text.SubwordTextEncoder.load_from_file(TOKENIZER_UR_PREFIX)
    print("✅ Loaded tokenizers from disk.")
except Exception as e:
    print("⚠ Couldn't load tokenizers:", e)
    if not BUILD_TOKENIZERS_IF_MISSING:
        raise RuntimeError("Set BUILD_TOKENIZERS_IF_MISSING=True to build tokenizers.")
    tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        train_df['eng'].tolist(), target_vocab_size=TARGET_VOCAB_SIZE)
    tokenizer_ur = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        train_df['urdu'].tolist(), target_vocab_size=TARGET_VOCAB_SIZE)
    tokenizer_en.save_to_file(TOKENIZER_EN_PREFIX)
    tokenizer_ur.save_to_file(TOKENIZER_UR_PREFIX)
    print("✅ Built & saved tokenizers.")

subwords_en = list(tokenizer_en.subwords)   # length == tokenizer_en.vocab_size
subwords_ur = list(tokenizer_ur.subwords)

# special tokens used in SentenceEmbedding mapping (strings)
START_TOKEN = "<S>"
END_TOKEN = "</S>"
PAD_TOKEN = "<PAD>"

# build token->index maps that match your transformer's SentenceEmbedding
english_token_list = subwords_en + [START_TOKEN, END_TOKEN, PAD_TOKEN]
urdu_token_list = subwords_ur + [START_TOKEN, END_TOKEN, PAD_TOKEN]
english_to_index = {t: i for i, t in enumerate(english_token_list)}
urdu_to_index  = {t: i for i, t in enumerate(urdu_token_list)}
index_to_urdu  = {i: t for t, i in urdu_to_index.items()}

PAD_IDX_URDU = urdu_to_index[PAD_TOKEN]

print("✅ English tokens:", len(english_token_list), "Urdu tokens:", len(urdu_token_list))

# ----- Dataset (returns lists of subword strings) -----
class TranslationDataset(Dataset):
    def __init__(self, df, tokenizer_en, tokenizer_ur, subwords_en, subwords_ur, max_len=MAX_SEQ_LEN):
        self.df = df.reset_index(drop=True)  # safe .iloc usage
        self.tokenizer_en = tokenizer_en
        self.tokenizer_ur = tokenizer_ur
        self.subwords_en = subwords_en
        self.subwords_ur = subwords_ur
        # reserve 2 slots for start/end tokens when passing into model's SentenceEmbedding
        self.max_tokens = max_len - 2

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        en_text = str(row['eng'])
        ur_text = str(row['urdu'])

        en_ids = self.tokenizer_en.encode(en_text)
        ur_ids = self.tokenizer_ur.encode(ur_text)

        # truncate IDs so after adding start/end they fit into MAX_SEQ_LEN
        en_ids = en_ids[:self.max_tokens]
        ur_ids = ur_ids[:self.max_tokens]

        # map ids -> subword strings, ignore any reserved indices (>= vocab_size)
        en_tokens = []
        for i in en_ids:
            if i < len(self.subwords_en):
                en_tokens.append(self.subwords_en[i])
            # else ignore (we won't include start/end here)
        ur_tokens = []
        for i in ur_ids:
            if i < len(self.subwords_ur):
                ur_tokens.append(self.subwords_ur[i])

        return en_tokens, ur_tokens

def collate_fn(batch):
    # batch is list of (en_tokens, ur_tokens)
    en_batch, ur_batch = zip(*batch)
    return list(en_batch), list(ur_batch)

# ----- DataLoaders -----
train_dataset = TranslationDataset(train_df, tokenizer_en, tokenizer_ur, subwords_en, subwords_ur, max_len=MAX_SEQ_LEN)
val_dataset   = TranslationDataset(val_df,   tokenizer_en, tokenizer_ur, subwords_en, subwords_ur, max_len=MAX_SEQ_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, drop_last=True)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# ----- Mask creation (returns float masks with NEG_INF where masked) -----
def create_masks(eng_batch, ur_batch, max_len):
    batch_size = len(eng_batch)
    look_ahead = torch.triu(torch.ones((max_len, max_len), dtype=torch.bool), diagonal=1)
    enc_pad = torch.zeros((batch_size, max_len, max_len), dtype=torch.bool)
    dec_pad_self = torch.zeros_like(enc_pad)
    dec_pad_cross = torch.zeros_like(enc_pad)

    for i in range(batch_size):
        e_len = len(eng_batch[i]) + 2   # +start +end
        u_len = len(ur_batch[i]) + 2
        e_pad = np.arange(e_len, max_len)
        u_pad = np.arange(u_len, max_len)
        if len(e_pad) > 0:
            enc_pad[i, :, e_pad] = enc_pad[i, e_pad, :] = True
        if len(u_pad) > 0:
            dec_pad_self[i, :, u_pad] = dec_pad_self[i, u_pad, :] = True
            dec_pad_cross[i, u_pad, :] = True
            dec_pad_cross[i, :, e_pad] = enc_pad[i, :, e_pad]  # keep cross consistent

    enc_mask = torch.where(enc_pad, torch.tensor(NEG_INF), torch.tensor(0.0))
    look_ahead = look_ahead.unsqueeze(0).expand(batch_size, -1, -1)
    dec_self = torch.where(look_ahead | dec_pad_self, torch.tensor(NEG_INF), torch.tensor(0.0))
    dec_cross = torch.where(dec_pad_cross, torch.tensor(NEG_INF), torch.tensor(0.0))
    return enc_mask.to(DEVICE), dec_self.to(DEVICE), dec_cross.to(DEVICE)

# ----- Instantiate model -----
model = Transformer(
    d_model=D_MODEL,
    ffn_hidden=FFN_HIDDEN,
    num_heads=NUM_HEADS,
    drop_prob=DROP_PROB,
    num_layers=NUM_LAYERS,
    max_sequence_length=MAX_SEQ_LEN,
    kn_vocab_size=len(urdu_token_list),
    english_to_index=english_to_index,
    urdu_to_index=urdu_to_index,
    START_TOKEN=START_TOKEN,
    END_TOKEN=END_TOKEN,
    PADDING_TOKEN=PAD_TOKEN
).to(DEVICE)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX_URDU, reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# ----- helper to convert batch token-strings -> indices according to urdu_to_index (used for loss) -----
def batch_to_indices(batch_tokens, mapping, max_len=MAX_SEQ_LEN):
    res = []
    for tokens in batch_tokens:
        seq = [mapping[START_TOKEN]] + [mapping.get(t, mapping[PAD_TOKEN]) for t in tokens] + [mapping[END_TOKEN]]
        seq = seq[:max_len] + [mapping[PAD_TOKEN]] * max(0, max_len - len(seq))
        res.append(seq)
    return torch.tensor(res, dtype=torch.long, device=DEVICE)

# ----- Training step -----
def train_one_epoch(model, loader, epoch):
    model.train()
    total_loss = 0.0
    total_tokens = 0
    t0 = time.time()
    for batch_idx, (eng_batch, ur_batch) in enumerate(loader):
        enc_mask, dec_self_mask, dec_cross_mask = create_masks(eng_batch, ur_batch, MAX_SEQ_LEN)
        optimizer.zero_grad()
        preds = model(
            x=eng_batch, y=ur_batch,
            encoder_self_attention_mask=enc_mask,
            decoder_self_attention_mask=dec_self_mask,
            decoder_cross_attention_mask=dec_cross_mask,
            enc_start_token=True, enc_end_token=True,
            dec_start_token=True, dec_end_token=True
        )  # (batch, seq_len, vocab)

        target_indices = batch_to_indices(ur_batch, urdu_to_index, max_len=MAX_SEQ_LEN)
        preds_flat = preds.view(-1, preds.size(-1))
        labels_flat = target_indices.view(-1)
        loss_sum = criterion(preds_flat, labels_flat)
        non_pad = (labels_flat != PAD_IDX_URDU).sum().item()
        loss = loss_sum / max(1, non_pad)
        loss.backward()
        optimizer.step()

        total_loss += loss_sum.item()
        total_tokens += non_pad

        if batch_idx % 50 == 0:
            print(f"Epoch {epoch} Batch {batch_idx} Loss/token {loss.item():.4f}")

    avg_loss = total_loss / max(1, total_tokens)
    print(f"Epoch {epoch} finished in {time.time()-t0:.1f}s | Avg loss/token: {avg_loss:.4f}")
    return avg_loss

# ----- Greedy translate -----
def translate_greedy(model, eng_sentence, max_length=MAX_SEQ_LEN):
    model.eval()
    with torch.no_grad():
        en_ids = tokenizer_en.encode(eng_sentence)[:(max_length-2)]
        en_tokens = [subwords_en[i] for i in en_ids if i < len(subwords_en)]
        ur_tokens = []
        for step in range(max_length):
            enc_mask, dec_mask, cross_mask = create_masks([en_tokens], [ur_tokens], max_length)
            preds = model(
                x=[en_tokens], y=[ur_tokens],
                encoder_self_attention_mask=enc_mask,
                decoder_self_attention_mask=dec_mask,
                decoder_cross_attention_mask=cross_mask,
                enc_start_token=True, enc_end_token=True,
                dec_start_token=True, dec_end_token=False
            )
            logits = preds[0, len(ur_tokens)]
            next_idx = torch.argmax(logits).item()
            next_token = index_to_urdu.get(next_idx, PAD_TOKEN)
            if next_token in {END_TOKEN, PAD_TOKEN}:
                break
            ur_tokens.append(next_token)
        # convert ur_tokens (subword strings) back to ids for tokenizer decode
        ur_ids = [urdu_to_index[t] for t in ur_tokens if t in urdu_to_index and urdu_to_index[t] < tokenizer_ur.vocab_size]
        return tokenizer_ur.decode(ur_ids)

# ----- Train loop -----
for epoch in range(1, EPOCHS+1):
    train_loss = train_one_epoch(model, train_loader, epoch)
    print("\nSample translations from validation set:")
    for s in val_df['eng'].iloc[:3].tolist():
        print("EN:", s)
        print("PRED UR:", translate_greedy(model, s))
    torch.save(model.state_dict(), f"/content/transformer_eng_ur_epoch{epoch}.pt")

print("🎉 Training finished.")

✅ Total cleaned pairs: 9101
Training pairs: 8190 Validation pairs: 911
✅ Loaded tokenizers from disk.
✅ English tokens: 7048 Urdu tokens: 8354
Epoch 1 Batch 0 Loss/token 9.0867
Epoch 1 Batch 50 Loss/token 5.7523
Epoch 1 Batch 100 Loss/token 4.0772
Epoch 1 Batch 150 Loss/token 3.3466
Epoch 1 Batch 200 Loss/token 2.7486
Epoch 1 Batch 250 Loss/token 2.6196
Epoch 1 finished in 15.5s | Avg loss/token: 4.2577

Sample translations from validation set:
EN: for this cause many are weak and sickly among you  and many sleep
PRED UR:  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ، 
EN: toms mad
PRED UR:  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ، 
EN: honour widows that are widows indeed
PRED UR:  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ،  ، 
Epoch 2 Batch 0 Loss/token 2.3217
Epo