In [None]:
!git clone https://github.com/amir9ume/urdu_ghazals_rekhta.git

Cloning into 'urdu_ghazals_rekhta'...
remote: Enumerating objects: 112, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 112 (delta 7), reused 6 (delta 6), pack-reused 103 (from 1)[K
Receiving objects: 100% (112/112), 2.03 MiB | 20.22 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [None]:
!unzip -q /content/urdu_ghazals_rekhta/dataset/dataset.zip -d /content/urdu_ghazals_rekhta/dataset/

In [None]:
import os
import re

# path to dataset
base_path = "/content/urdu_ghazals_rekhta/dataset/dataset"

def normalize_urdu(text):
    # Normalize common Urdu characters
    text = re.sub("[يى]", "ی", text)   # different forms of 'yeh'
    text = re.sub("[ۀہۂھ]", "ہ", text)  # standardize 'heh'
    text = re.sub("[ك]", "ک", text)     # Arabic kaaf -> Urdu kaaf
    text = re.sub("[ۃ]", "ہ", text)

    # Remove diacritics
    text = re.sub(r"[\u064B-\u0652]", "", text)

    # Remove punctuation (optional: keep , ? ! if useful for modeling)
    text = re.sub(r"[^\w\sءاآأإا-ی]", " ", text)

    # Collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Test
sample = "یِہ  كہاں  هے؟"   # intentionally messy
print("Before:", sample)
print("After :", normalize_urdu(sample))


Before: یِہ  كہاں  هے؟
After : یہ کہاں هے


In [None]:
urdu_sentences = []
roman_sentences = []

for poet in os.listdir(base_path):
    poet_path = os.path.join(base_path, poet)
    urdu_path = os.path.join(poet_path, "ur")
    eng_path  = os.path.join(poet_path, "en")

    if not os.path.exists(urdu_path) or not os.path.exists(eng_path):
        continue

    # match files by name in urdu and eng
    for fname in os.listdir(urdu_path):
        urdu_file = os.path.join(urdu_path, fname)
        eng_file  = os.path.join(eng_path, fname)

        if not os.path.exists(eng_file):
            continue

        with open(urdu_file, "r", encoding="utf-8") as f1, open(eng_file, "r", encoding="utf-8") as f2:
            urdu_lines = f1.readlines()
            eng_lines  = f2.readlines()

            for u, e in zip(urdu_lines, eng_lines):
                u = normalize_urdu(u.strip())
                e = e.strip().lower()   # roman target → lowercase

                if u and e:
                    urdu_sentences.append(u)
                    roman_sentences.append(e)

print("Total pairs:", len(urdu_sentences))
print("Example Urdu  :", urdu_sentences[0])
print("Example Roman :", roman_sentences[0])


Total pairs: 21003
Example Urdu  : خمار موسم خوشبو حد چمن میں کہلا
Example Roman : ḳhumār-e-mausam-e-ḳhushbū had-e-chaman meñ khulā


In [None]:
import pandas as pd

df = pd.DataFrame({"urdu": urdu_sentences, "roman": roman_sentences})
df.to_csv("/content/urdu_roman_dataset.csv", index=False, encoding="utf-8")

print(df.head())


                                    urdu  \
0        خمار موسم خوشبو حد چمن میں کہلا   
1      مری غزل کا خزانہ ترے بدن میں کہلا   
2  تم اس کا حسن کبہی اس کی بزم میں دیکہو   
3     کہ ماہتاب سدا شب کے پیرہن میں کہلا   
4      عجب نشہ تہا مگر اس کی بخشش لب میں   

                                              roman  
0  ḳhumār-e-mausam-e-ḳhushbū had-e-chaman meñ khulā  
1       mirī ġhazal kā ḳhazāna tire badan meñ khulā  
2         tum us kā husn kabhī us kī bazm meñ dekho  
3         ki māhtāb sadā shab ke pairahan meñ khulā  
4    ajab nasha thā magar us kī baḳhshish-e-lab meñ  


In [None]:
import sentencepiece as spm

# Write Urdu text to file for training tokenizer
with open("urdu.txt", "w", encoding="utf-8") as f:
    for line in urdu_sentences:
        f.write(line + "\n")

# Write Roman Urdu text
with open("roman.txt", "w", encoding="utf-8") as f:
    for line in roman_sentences:
        f.write(line + "\n")

# Train BPE model for Urdu (vocab size ~8000)
spm.SentencePieceTrainer.train(
    input="urdu.txt",
    model_prefix="urdu_bpe",
    vocab_size=8000,
    character_coverage=0.995,  # covers almost all Urdu chars
    model_type="bpe"
)

# Train BPE model for Roman Urdu (vocab size ~8000)
spm.SentencePieceTrainer.train(
    input="roman.txt",
    model_prefix="roman_bpe",
    vocab_size=8000,
    character_coverage=1.0,
    model_type="bpe"
)

# Load trained tokenizers
sp_urdu = spm.SentencePieceProcessor(model_file="urdu_bpe.model")
sp_roman = spm.SentencePieceProcessor(model_file="roman_bpe.model")

# Example encoding & decoding
sample_urdu = urdu_sentences[0]
sample_roman = roman_sentences[0]

print("Urdu tokens :", sp_urdu.encode(sample_urdu, out_type=str))
print("Roman tokens:", sp_roman.encode(sample_roman, out_type=str))


Urdu tokens : ['▁خمار', '▁موسم', '▁خوشبو', '▁حد', '▁چمن', '▁میں', '▁کہلا']
Roman tokens: ['▁ḳhumār', '-', 'e', '-', 'mausam', '-', 'e', '-', 'ḳhushbū', '▁had', '-', 'e', '-', 'chaman', '▁meñ', '▁khulā']


In [None]:
# Colab prerequisites
# !pip install sentencepiece torch torchvision --quiet

import os
import random
import math
import time
import sentencepiece as spm
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# ========== Config (change as needed) ==========
CSV_PATH = "/content/urdu_roman_dataset.csv"   # your CSV
URDU_MODEL = "urdu_bpe.model"   # or train and point to these
ROMAN_MODEL = "roman_bpe.model"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)

# ========== Utility: load or train SentencePiece ==========
# If you already have urdu_bpe.model and roman_bpe.model, skip training.
def train_sp_if_missing(sentences, model_prefix, vocab_size=8000, model_type="bpe", character_coverage=1.0):
    if os.path.exists(model_prefix + ".model"):
        print(f"{model_prefix}.model found, loading existing.")
        sp = spm.SentencePieceProcessor(model_file=model_prefix + ".model")
        return sp
    # create tmp file for sentencepiece training
    tmp_txt = model_prefix + ".txt"
    with open(tmp_txt, "w", encoding="utf-8") as f:
        for s in sentences:
            f.write(s + "\n")
    spm.SentencePieceTrainer.train(
        input=tmp_txt,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        character_coverage=character_coverage,
        model_type=model_type,
        pad_id=0, unk_id=1, bos_id=2, eos_id=3  # set reserved ids
    )
    sp = spm.SentencePieceProcessor(model_file=model_prefix + ".model")
    os.remove(tmp_txt)
    return sp

# ========== Dataset + tokenizer wrapper ==========
class ParallelDataset(Dataset):
    def __init__(self, urdu_texts, roman_texts, sp_urdu, sp_roman, max_len=200):
        assert len(urdu_texts) == len(roman_texts)
        self.urdu = urdu_texts
        self.roman = roman_texts
        self.sp_urdu = sp_urdu
        self.sp_roman = sp_roman
        self.max_len = max_len

        # use SentencePiece's built-in IDs for special tokens (as we set in trainer)
        # pad_id = 0, unk_id = 1, bos_id = 2, eos_id = 3 (if trained with these)
        self.pad_id_urdu = 0
        self.pad_id_roman = 0
        self.bos_id_roman = 2
        self.eos_id_roman = 3
        self.bos_id_urdu = 2
        self.eos_id_urdu = 3

    def __len__(self):
        return len(self.urdu)

    def __getitem__(self, idx):
        src = self.urdu[idx]
        tgt = self.roman[idx]

        # encode as IDs (SentencePiece returns list of ids)
        src_ids = self.sp_urdu.encode(src, out_type=int)
        tgt_ids = self.sp_roman.encode(tgt, out_type=int)

        # add bos/eos to target (decoder requires sos/bos)
        tgt_input = [self.bos_id_roman] + tgt_ids
        tgt_output = tgt_ids + [self.eos_id_roman]

        # clip long sequences
        if len(src_ids) > self.max_len:
            src_ids = src_ids[:self.max_len]
        if len(tgt_input) > self.max_len:
            tgt_input = tgt_input[:self.max_len]
            tgt_output = tgt_output[:self.max_len]

        return torch.tensor(src_ids, dtype=torch.long), torch.tensor(tgt_input, dtype=torch.long), torch.tensor(tgt_output, dtype=torch.long)

def collate_fn(batch):
    # batch items: (src_ids, tgt_input, tgt_output)
    srcs, t_inps, t_outs = zip(*batch)
    src_lens = [s.size(0) for s in srcs]
    # pad sequences
    srcs_padded = pad_sequence(srcs, batch_first=True, padding_value=0)  # pad_id assumed 0
    t_inps_padded = pad_sequence(t_inps, batch_first=True, padding_value=0)
    t_outs_padded = pad_sequence(t_outs, batch_first=True, padding_value=0)
    return srcs_padded, torch.tensor(src_lens), t_inps_padded, t_outs_padded

# ========== Model Components ==========
class EncoderBiLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, enc_hidden, num_layers=2, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=enc_hidden,
                           num_layers=num_layers,
                           batch_first=True,
                           bidirectional=True,
                           dropout=dropout if num_layers>1 else 0.0)
        self.num_layers = num_layers
        self.enc_hidden = enc_hidden
        self.bidirectional = True
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_lengths):
        # src: (B, T)
        embedded = self.dropout(self.embedding(src))  # (B, T, E)
        # pack
        packed = nn.utils.rnn.pack_padded_sequence(embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (h_n, c_n) = self.rnn(packed)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # (B, T, hidden*2)
        # h_n shape: (num_layers*2, B, enc_hidden)
        return out, (h_n, c_n)

class DecoderLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, dec_hidden, num_layers=4, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=dec_hidden,
                           num_layers=num_layers,
                           batch_first=True,
                           dropout=dropout if num_layers>1 else 0.0)
        self.out = nn.Linear(dec_hidden, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers
        self.dec_hidden = dec_hidden

    def forward(self, tgt_input, hidden):
        # tgt_input: (B, T)  -- token ids including BOS at start
        emb = self.dropout(self.embedding(tgt_input))  # (B, T, E)
        outputs, (h_n, c_n) = self.rnn(emb, hidden)  # outputs: (B, T, dec_hidden)
        logits = self.out(outputs)  # (B, T, V)
        return logits, (h_n, c_n)

# Helper to map encoder final hidden -> decoder initial hidden
class EncToDecInit(nn.Module):
    def __init__(self, enc_layers, enc_hidden, dec_layers, dec_hidden, bidirectional=True):
        super().__init__()
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.bidirectional = bidirectional
        self.enc_hidden = enc_hidden
        self.dec_hidden = dec_hidden

        # encoder hidden has shape (num_layers*directions, B, enc_hidden)
        enc_total_layers = enc_layers * (2 if bidirectional else 1)
        # We'll flatten enc hidden (for each layer) and project per-decoder-layer
        # Create linear to transform concatenated forward+back states into decoder hidden size
        self.h_proj = nn.Linear(enc_hidden * (2 if bidirectional else 1), dec_hidden)
        self.c_proj = nn.Linear(enc_hidden * (2 if bidirectional else 1), dec_hidden)

    def forward(self, h_enc, c_enc):
        # h_enc: (enc_total_layers, B, enc_hidden)
        enc_total_layers = h_enc.size(0)
        B = h_enc.size(1)
        # We'll create decoder initial hidden by taking the top `enc_layers` pairs and projecting them
        # Simple strategy: for each encoder-layer index i (0..enc_layers-1) combine forward+back (if bidir)
        # and produce a vector for each decoder layer (repeat/cycle if decoder has more layers)
        # First fold forward/back into single vector per encoder layer
        if self.bidirectional:
            # h_enc shape (enc_layers*2, B, H). Pair them (0,1), (2,3), ...
            h_pairs = []
            c_pairs = []
            for i in range(0, enc_total_layers, 2):
                h_f = h_enc[i]   # (B, H)
                h_b = h_enc[i+1]
                h_cat = torch.cat([h_f, h_b], dim=-1)  # (B, 2H)
                h_pairs.append(h_cat)

                c_f = c_enc[i]
                c_b = c_enc[i+1]
                c_cat = torch.cat([c_f, c_b], dim=-1)
                c_pairs.append(c_cat)
            # h_pairs is list len enc_layers, each (B, 2H)
            h_pairs = torch.stack(h_pairs, dim=0)
            c_pairs = torch.stack(c_pairs, dim=0)
        else:
            h_pairs = h_enc
            c_pairs = c_enc

        # Now project each encoder-layer pair to decoder hidden size
        # We'll generate dec_layers outputs by repeating cycle if needed
        h_dec_list = []
        c_dec_list = []
        for i in range(self.dec_layers):
            src_layer = i % h_pairs.size(0)  # cycle through encoder layers
            h_src = h_pairs[src_layer]  # (B, 2H) or (B, H)
            c_src = c_pairs[src_layer]
            h_proj = torch.tanh(self.h_proj(h_src))  # (B, dec_hidden)
            c_proj = torch.tanh(self.c_proj(c_src))
            h_dec_list.append(h_proj.unsqueeze(0))
            c_dec_list.append(c_proj.unsqueeze(0))
        h_dec = torch.cat(h_dec_list, dim=0)  # (dec_layers, B, dec_hidden)
        c_dec = torch.cat(c_dec_list, dim=0)
        return (h_dec.contiguous(), c_dec.contiguous())

# ========== Training / Eval Routines ==========
def train_epoch(encoder, dec_init, decoder, dataloader, optimizer, criterion, teacher_forcing_ratio=0.5):
    encoder.train()
    decoder.train()
    dec_init.train()
    total_loss = 0.0
    for srcs, src_lens, t_inps, t_outs in tqdm(dataloader, leave=False):
        srcs = srcs.to(DEVICE)
        src_lens = src_lens.to(DEVICE)
        t_inps = t_inps.to(DEVICE)
        t_outs = t_outs.to(DEVICE)

        optimizer.zero_grad()
        enc_outputs, (h_n, c_n) = encoder(srcs, src_lens)  # we don't use attention, just initial state

        # map to decoder initial hidden
        h0, c0 = dec_init(h_n, c_n)

        # forward decoder in teacher-forcing manner
        logits, _ = decoder(t_inps, (h0, c0))  # returns (B, T, V)
        B, T, V = logits.size()
        logits_flat = logits.view(-1, V)
        t_outs_flat = t_outs.view(-1)

        loss = criterion(logits_flat, t_outs_flat)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(list(encoder.parameters()) + list(decoder.parameters()) + list(dec_init.parameters()), 1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(encoder, dec_init, decoder, dataloader, criterion):
    encoder.eval()
    decoder.eval()
    dec_init.eval()
    total_loss = 0.0
    with torch.no_grad():
        for srcs, src_lens, t_inps, t_outs in tqdm(dataloader, leave=False):
            srcs = srcs.to(DEVICE)
            src_lens = src_lens.to(DEVICE)
            t_inps = t_inps.to(DEVICE)
            t_outs = t_outs.to(DEVICE)

            enc_outputs, (h_n, c_n) = encoder(srcs, src_lens)
            h0, c0 = dec_init(h_n, c_n)
            logits, _ = decoder(t_inps, (h0, c0))
            B, T, V = logits.size()
            loss = criterion(logits.view(-1, V), t_outs.view(-1))
            total_loss += loss.item()
    return total_loss / len(dataloader)

# Early stopping helper
class EarlyStopping:
    def __init__(self, patience=5, delta=0.0):
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.delta = delta

    def step(self, metric):  # metric: validation loss (lower is better)
        score = -metric
        if self.best_score is None:
            self.best_score = score
            return False
        if score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                return True
            return False
        else:
            self.best_score = score
            self.counter = 0
            return False

# ========== Full pipeline: load CSV, split, create dataloaders ==========
def load_and_prepare(csv_path, sp_urdu, sp_roman, split_seed=SEED):
    df = pd.read_csv(csv_path, encoding="utf-8")
    # expected columns: 'urdu' and 'roman'
    urdu_texts = df['urdu'].astype(str).tolist()
    roman_texts = df['roman'].astype(str).tolist()
    # Shuffle and split 50/25/25
    combined = list(zip(urdu_texts, roman_texts))
    random.Random(split_seed).shuffle(combined)
    n = len(combined)
    n_train = int(0.50 * n)
    n_val = int(0.25 * n)
    train_set = combined[:n_train]
    val_set = combined[n_train:n_train+n_val]
    test_set = combined[n_train+n_val:]
    print(f"Total: {n}, train: {len(train_set)}, val: {len(val_set)}, test: {len(test_set)}")

    train_urdu, train_roman = zip(*train_set)
    val_urdu, val_roman = zip(*val_set)
    test_urdu, test_roman = zip(*test_set)

    return (list(train_urdu), list(train_roman)), (list(val_urdu), list(val_roman)), (list(test_urdu), list(test_roman))

# ========== Experiment / Runner ==========
def run_experiment(config):
    print("\n=== Experiment:", config, "===")
    # Load SP models (or train if not present)
    # Provide sentences only if models missing; here we train if missing
    df = pd.read_csv(CSV_PATH, encoding="utf-8")
    urdu_texts = df['urdu'].astype(str).tolist()
    roman_texts = df['roman'].astype(str).tolist()

    sp_urdu = train_sp_if_missing(urdu_texts, "urdu_bpe", vocab_size=config['vocab_size'], character_coverage=0.995)
    sp_roman = train_sp_if_missing(roman_texts, "roman_bpe", vocab_size=config['vocab_size'], character_coverage=1.0)

    # prepare datasets
    (train_urdu, train_roman), (val_urdu, val_roman), (test_urdu, test_roman) = load_and_prepare(CSV_PATH, sp_urdu, sp_roman)
    train_ds = ParallelDataset(train_urdu, train_roman, sp_urdu, sp_roman, max_len=config['max_len'])
    val_ds   = ParallelDataset(val_urdu, val_roman, sp_urdu, sp_roman, max_len=config['max_len'])
    test_ds  = ParallelDataset(test_urdu, test_roman, sp_urdu, sp_roman, max_len=config['max_len'])

    train_loader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True, collate_fn=collate_fn)
    val_loader   = DataLoader(val_ds, batch_size=config['batch_size'], shuffle=False, collate_fn=collate_fn)
    test_loader  = DataLoader(test_ds, batch_size=config['batch_size'], shuffle=False, collate_fn=collate_fn)

    # vocab sizes from sentencepiece processor
    src_vocab = sp_urdu.get_piece_size()
    tgt_vocab = sp_roman.get_piece_size()
    print("vocab sizes (src,tgt):", src_vocab, tgt_vocab)

    # build models
    encoder = EncoderBiLSTM(vocab_size=src_vocab, emb_dim=config['emb_dim'], enc_hidden=config['enc_hidden'],
                            num_layers=config['enc_layers'], dropout=config['dropout']).to(DEVICE)
    decoder = DecoderLSTM(vocab_size=tgt_vocab, emb_dim=config['emb_dim'], dec_hidden=config['dec_hidden'],
                          num_layers=config['dec_layers'], dropout=config['dropout']).to(DEVICE)

    dec_init = EncToDecInit(enc_layers=config['enc_layers'], enc_hidden=config['enc_hidden'],
                            dec_layers=config['dec_layers'], dec_hidden=config['dec_hidden']).to(DEVICE)

    # criterion & optimizer
    pad_idx = 0
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()) + list(dec_init.parameters()), lr=config['lr'])

    # training loop
    best_val_loss = float("inf")
    early_stopper = EarlyStopping(patience=config['patience'], delta=0.0001)
    for epoch in range(1, config['epochs']+1):
        t0 = time.time()
        train_loss = train_epoch(encoder, dec_init, decoder, train_loader, optimizer, criterion, teacher_forcing_ratio=config['tf'])
        val_loss = evaluate(encoder, dec_init, decoder, val_loader, criterion)
        t1 = time.time()
        print(f"Epoch {epoch} | train_loss {train_loss:.4f} | val_loss {val_loss:.4f} | time {t1-t0:.1f}s")

        # checkpoint best
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                'encoder': encoder.state_dict(),
                'decoder': decoder.state_dict(),
                'dec_init': dec_init.state_dict(),
                'config': config
            }, f"best_model_exp_{config['name']}.pt")
            print("  Saved best model.")

        # early stopping
        if early_stopper.step(val_loss):
            print("Early stopping triggered. Stopping training.")
            break

    # final evaluation on test
    test_loss = evaluate(encoder, dec_init, decoder, test_loader, criterion)
    print("Final test loss:", test_loss)
    return {'train_loss': train_loss, 'val_loss': val_loss, 'test_loss': test_loss}

# ========== Example experiments (modify per your assignment) ==========
if __name__ == "__main__":
    # pick three experiments (you must report & compare)
    experiments = [
        {   # Exp A: moderate embedding, medium hidden
            'name': 'A_emb128_h256',
            'vocab_size': 8000,
            'emb_dim': 128,
            'enc_hidden': 256,
            'dec_hidden': 256,
            'enc_layers': 2,    # as required in assignment
            'dec_layers': 4,    # as required
            'dropout': 0.3,
            'lr': 1e-3,
            'batch_size': 64,
            'epochs': 30,
            'patience': 5,
            'tf': 0.5,  # teacher forcing ratio
            'max_len': 120
        },
        {   # Exp B: larger embedding, larger hidden, smaller lr
            'name': 'B_emb256_h512',
            'vocab_size': 8000,
            'emb_dim': 256,
            'enc_hidden': 512,
            'dec_hidden': 512,
            'enc_layers': 2,
            'dec_layers': 4,
            'dropout': 0.3,
            'lr': 5e-4,
            'batch_size': 64,
            'epochs': 30,
            'patience': 6,
            'tf': 0.5,
            'max_len': 120
        },
        {   # Exp C: smaller lr, higher dropout
            'name': 'C_emb256_h512_drop50',
            'vocab_size': 8000,
            'emb_dim': 256,
            'enc_hidden': 512,
            'dec_hidden': 512,
            'enc_layers': 2,
            'dec_layers': 4,
            'dropout': 0.5,
            'lr': 1e-4,
            'batch_size': 32,
            'epochs': 40,
            'patience': 8,
            'tf': 0.5,
            'max_len': 120
        }
    ]

    results = []
    for cfg in experiments:
        res = run_experiment(cfg)
        results.append((cfg['name'], res))
    print("All experiment results:", results)



=== Experiment: {'name': 'A_emb128_h256', 'vocab_size': 8000, 'emb_dim': 128, 'enc_hidden': 256, 'dec_hidden': 256, 'enc_layers': 2, 'dec_layers': 4, 'dropout': 0.3, 'lr': 0.001, 'batch_size': 64, 'epochs': 30, 'patience': 5, 'tf': 0.5, 'max_len': 120} ===
urdu_bpe.model found, loading existing.
roman_bpe.model found, loading existing.
Total: 21003, train: 10501, val: 5250, test: 5252
vocab sizes (src,tgt): 8000 8000




Epoch 1 | train_loss 6.3151 | val_loss 6.0435 | time 7.4s
  Saved best model.




Epoch 2 | train_loss 5.9728 | val_loss 6.0192 | time 5.5s
  Saved best model.




Epoch 3 | train_loss 5.8010 | val_loss 5.6085 | time 6.0s
  Saved best model.




Epoch 4 | train_loss 5.4038 | val_loss 5.3301 | time 5.5s
  Saved best model.




Epoch 5 | train_loss 5.1379 | val_loss 5.1567 | time 6.0s
  Saved best model.




Epoch 6 | train_loss 4.9380 | val_loss 5.0205 | time 6.2s
  Saved best model.




Epoch 7 | train_loss 4.7460 | val_loss 4.8623 | time 6.0s
  Saved best model.




Epoch 8 | train_loss 4.5602 | val_loss 4.7408 | time 5.5s
  Saved best model.




Epoch 9 | train_loss 4.3919 | val_loss 4.6085 | time 6.1s
  Saved best model.




Epoch 10 | train_loss 4.2182 | val_loss 4.5134 | time 5.5s
  Saved best model.




Epoch 11 | train_loss 4.0480 | val_loss 4.4004 | time 6.0s
  Saved best model.




Epoch 12 | train_loss 3.8805 | val_loss 4.3200 | time 5.6s
  Saved best model.




Epoch 13 | train_loss 3.7347 | val_loss 4.2801 | time 5.8s
  Saved best model.




Epoch 14 | train_loss 3.5966 | val_loss 4.1709 | time 5.8s
  Saved best model.




Epoch 15 | train_loss 3.4704 | val_loss 4.1290 | time 5.6s
  Saved best model.




Epoch 16 | train_loss 3.3422 | val_loss 4.0632 | time 5.9s
  Saved best model.




Epoch 17 | train_loss 3.2303 | val_loss 4.0409 | time 5.5s
  Saved best model.




Epoch 18 | train_loss 3.1229 | val_loss 3.9994 | time 6.2s
  Saved best model.




Epoch 19 | train_loss 3.0093 | val_loss 3.9951 | time 5.6s
  Saved best model.




Epoch 20 | train_loss 2.9236 | val_loss 3.9743 | time 6.1s
  Saved best model.




Epoch 21 | train_loss 2.8199 | val_loss 3.9598 | time 5.6s
  Saved best model.




Epoch 22 | train_loss 2.7212 | val_loss 3.9557 | time 6.2s
  Saved best model.




Epoch 23 | train_loss 2.6378 | val_loss 3.9539 | time 5.6s
  Saved best model.




Epoch 24 | train_loss 2.5466 | val_loss 3.9368 | time 6.2s
  Saved best model.




Epoch 25 | train_loss 2.4621 | val_loss 3.9367 | time 5.6s
  Saved best model.




Epoch 26 | train_loss 2.3843 | val_loss 3.9408 | time 6.1s




Epoch 27 | train_loss 2.3050 | val_loss 3.9315 | time 5.6s
  Saved best model.




Epoch 28 | train_loss 2.2291 | val_loss 3.9463 | time 6.2s




Epoch 29 | train_loss 2.1562 | val_loss 3.9509 | time 5.6s




Epoch 30 | train_loss 2.0862 | val_loss 3.9588 | time 6.1s




Final test loss: 3.887688068022211

=== Experiment: {'name': 'B_emb256_h512', 'vocab_size': 8000, 'emb_dim': 256, 'enc_hidden': 512, 'dec_hidden': 512, 'enc_layers': 2, 'dec_layers': 4, 'dropout': 0.3, 'lr': 0.0005, 'batch_size': 64, 'epochs': 30, 'patience': 6, 'tf': 0.5, 'max_len': 120} ===
urdu_bpe.model found, loading existing.
roman_bpe.model found, loading existing.
Total: 21003, train: 10501, val: 5250, test: 5252
vocab sizes (src,tgt): 8000 8000




Epoch 1 | train_loss 6.3002 | val_loss 6.0367 | time 11.4s
  Saved best model.




Epoch 2 | train_loss 5.9536 | val_loss 6.0005 | time 11.6s
  Saved best model.




Epoch 3 | train_loss 5.8680 | val_loss 5.8845 | time 11.7s
  Saved best model.




Epoch 4 | train_loss 5.5140 | val_loss 5.3994 | time 11.6s
  Saved best model.




Epoch 5 | train_loss 5.0623 | val_loss 5.0294 | time 11.7s
  Saved best model.




Epoch 6 | train_loss 4.6965 | val_loss 4.7319 | time 11.6s
  Saved best model.




Epoch 7 | train_loss 4.3803 | val_loss 4.5236 | time 11.6s
  Saved best model.




Epoch 8 | train_loss 4.1107 | val_loss 4.3619 | time 11.5s
  Saved best model.




Epoch 9 | train_loss 3.8636 | val_loss 4.2137 | time 11.6s
  Saved best model.




Epoch 10 | train_loss 3.6454 | val_loss 4.0877 | time 11.5s
  Saved best model.




Epoch 11 | train_loss 3.4389 | val_loss 4.0346 | time 11.6s
  Saved best model.




Epoch 12 | train_loss 3.2521 | val_loss 3.9068 | time 11.6s
  Saved best model.




Epoch 13 | train_loss 3.0678 | val_loss 3.8688 | time 11.6s
  Saved best model.




Epoch 14 | train_loss 2.9006 | val_loss 3.8062 | time 11.6s
  Saved best model.




Epoch 15 | train_loss 2.7409 | val_loss 3.7637 | time 11.6s
  Saved best model.




Epoch 16 | train_loss 2.5908 | val_loss 3.7584 | time 11.6s
  Saved best model.




Epoch 17 | train_loss 2.4433 | val_loss 3.7511 | time 11.6s
  Saved best model.




Epoch 18 | train_loss 2.3055 | val_loss 3.7022 | time 11.6s
  Saved best model.




Epoch 19 | train_loss 2.1682 | val_loss 3.7066 | time 11.6s




Epoch 20 | train_loss 2.0358 | val_loss 3.6888 | time 11.6s
  Saved best model.




Epoch 21 | train_loss 1.9153 | val_loss 3.7123 | time 11.6s




Epoch 22 | train_loss 1.7973 | val_loss 3.6841 | time 11.6s
  Saved best model.




Epoch 23 | train_loss 1.6844 | val_loss 3.7020 | time 11.7s




Epoch 24 | train_loss 1.5739 | val_loss 3.7117 | time 11.8s




Epoch 25 | train_loss 1.4759 | val_loss 3.7552 | time 11.8s




Epoch 26 | train_loss 1.3776 | val_loss 3.7170 | time 11.8s




Epoch 27 | train_loss 1.2837 | val_loss 3.7401 | time 11.7s




Epoch 28 | train_loss 1.1953 | val_loss 3.7658 | time 11.6s
Early stopping triggered. Stopping training.




Final test loss: 3.683548941669694

=== Experiment: {'name': 'C_emb256_h512_drop50', 'vocab_size': 8000, 'emb_dim': 256, 'enc_hidden': 512, 'dec_hidden': 512, 'enc_layers': 2, 'dec_layers': 4, 'dropout': 0.5, 'lr': 0.0001, 'batch_size': 32, 'epochs': 40, 'patience': 8, 'tf': 0.5, 'max_len': 120} ===
urdu_bpe.model found, loading existing.
roman_bpe.model found, loading existing.
Total: 21003, train: 10501, val: 5250, test: 5252
vocab sizes (src,tgt): 8000 8000




Epoch 1 | train_loss 6.4366 | val_loss 6.0297 | time 14.8s
  Saved best model.




Epoch 2 | train_loss 5.8621 | val_loss 5.7346 | time 14.9s
  Saved best model.




Epoch 3 | train_loss 5.5375 | val_loss 5.4437 | time 14.9s
  Saved best model.




Epoch 4 | train_loss 5.2900 | val_loss 5.2589 | time 15.0s
  Saved best model.




Epoch 5 | train_loss 5.0928 | val_loss 5.0656 | time 15.0s
  Saved best model.




Epoch 6 | train_loss 4.9011 | val_loss 4.9034 | time 14.9s
  Saved best model.




Epoch 7 | train_loss 4.7326 | val_loss 4.7581 | time 14.9s
  Saved best model.




Epoch 8 | train_loss 4.5840 | val_loss 4.6461 | time 14.8s
  Saved best model.




Epoch 9 | train_loss 4.4507 | val_loss 4.5513 | time 15.2s
  Saved best model.




Epoch 10 | train_loss 4.3338 | val_loss 4.4574 | time 14.9s
  Saved best model.




Epoch 11 | train_loss 4.2232 | val_loss 4.3874 | time 14.8s
  Saved best model.




Epoch 12 | train_loss 4.1180 | val_loss 4.3234 | time 14.8s
  Saved best model.




Epoch 13 | train_loss 4.0215 | val_loss 4.2600 | time 15.1s
  Saved best model.




Epoch 14 | train_loss 3.9262 | val_loss 4.2055 | time 14.9s
  Saved best model.




Epoch 15 | train_loss 3.8414 | val_loss 4.1494 | time 14.9s
  Saved best model.




Epoch 16 | train_loss 3.7562 | val_loss 4.1092 | time 14.8s
  Saved best model.




Epoch 17 | train_loss 3.6715 | val_loss 4.0717 | time 14.9s
  Saved best model.




Epoch 18 | train_loss 3.5931 | val_loss 4.0287 | time 15.2s
  Saved best model.




Epoch 19 | train_loss 3.5128 | val_loss 3.9784 | time 14.8s
  Saved best model.




Epoch 20 | train_loss 3.4421 | val_loss 3.9384 | time 14.9s
  Saved best model.




Epoch 21 | train_loss 3.3671 | val_loss 3.9213 | time 14.9s
  Saved best model.




Epoch 22 | train_loss 3.2942 | val_loss 3.8758 | time 15.2s
  Saved best model.




Epoch 23 | train_loss 3.2244 | val_loss 3.8427 | time 14.9s
  Saved best model.




Epoch 24 | train_loss 3.1571 | val_loss 3.8228 | time 14.8s
  Saved best model.




Epoch 25 | train_loss 3.0897 | val_loss 3.8050 | time 14.8s
  Saved best model.




Epoch 26 | train_loss 3.0202 | val_loss 3.7719 | time 14.9s
  Saved best model.




Epoch 27 | train_loss 2.9603 | val_loss 3.7469 | time 15.2s
  Saved best model.




Epoch 28 | train_loss 2.8958 | val_loss 3.7169 | time 14.9s
  Saved best model.




Epoch 29 | train_loss 2.8361 | val_loss 3.7077 | time 14.9s
  Saved best model.




Epoch 30 | train_loss 2.7793 | val_loss 3.6893 | time 14.9s
  Saved best model.




Epoch 31 | train_loss 2.7169 | val_loss 3.6734 | time 15.3s
  Saved best model.




Epoch 32 | train_loss 2.6617 | val_loss 3.6558 | time 14.9s
  Saved best model.




Epoch 33 | train_loss 2.6066 | val_loss 3.6463 | time 14.9s
  Saved best model.




Epoch 34 | train_loss 2.5463 | val_loss 3.6244 | time 14.9s
  Saved best model.




Epoch 35 | train_loss 2.4921 | val_loss 3.6271 | time 15.1s




Epoch 36 | train_loss 2.4437 | val_loss 3.6007 | time 15.1s
  Saved best model.




Epoch 37 | train_loss 2.3881 | val_loss 3.5962 | time 15.0s
  Saved best model.




Epoch 38 | train_loss 2.3359 | val_loss 3.5983 | time 14.9s




Epoch 39 | train_loss 2.2843 | val_loss 3.5956 | time 14.9s
  Saved best model.




Epoch 40 | train_loss 2.2327 | val_loss 3.5849 | time 15.2s
  Saved best model.


                                                 

Final test loss: 3.51837947585366
All experiment results: [('A_emb128_h256', {'train_loss': 2.0861891428629558, 'val_loss': 3.9588143595729965, 'test_loss': 3.887688068022211}), ('B_emb256_h512', {'train_loss': 1.1952744223854759, 'val_loss': 3.7657759160880584, 'test_loss': 3.683548941669694}), ('C_emb256_h512_drop50', {'train_loss': 2.232714864258346, 'val_loss': 3.5848672606728296, 'test_loss': 3.51837947585366})]




In [None]:
import torch

# 1. Load trained models
checkpoint = torch.load("best_model_exp_A_emb128_h256.pt", map_location=DEVICE)
config = checkpoint["config"]

src_vocab = checkpoint["config"]["vocab_size"]
tgt_vocab = checkpoint["config"]["vocab_size"]

encoder = EncoderBiLSTM(vocab_size=sp_urdu.get_piece_size(),
                        emb_dim=config['emb_dim'],
                        enc_hidden=config['enc_hidden'],
                        num_layers=config['enc_layers'],
                        dropout=config['dropout']).to(DEVICE)

decoder = DecoderLSTM(vocab_size=sp_roman.get_piece_size(),
                      emb_dim=config['emb_dim'],
                      dec_hidden=config['dec_hidden'],
                      num_layers=config['dec_layers'],
                      dropout=config['dropout']).to(DEVICE)

dec_init = EncToDecInit(enc_layers=config['enc_layers'],
                        enc_hidden=config['enc_hidden'],
                        dec_layers=config['dec_layers'],
                        dec_hidden=config['dec_hidden']).to(DEVICE)

encoder.load_state_dict(checkpoint['encoder'])
decoder.load_state_dict(checkpoint['decoder'])
dec_init.load_state_dict(checkpoint['dec_init'])

encoder.eval()
decoder.eval()
dec_init.eval()

# 2. Predict function
def translate_sentence(sentence, sp_urdu, sp_roman, encoder, dec_init, decoder, max_len=120):
    # encode input
    src_ids = sp_urdu.encode(sentence, out_type=int)
    src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(DEVICE)
    src_len = torch.tensor([len(src_ids)]).to(DEVICE)

    with torch.no_grad():
        _, (h_n, c_n) = encoder(src_tensor, src_len)
        h0, c0 = dec_init(h_n, c_n)

    # start with <bos>
    bos_id = sp_roman.bos_id()
    eos_id = sp_roman.eos_id()
    tgt_ids = [bos_id]
    hidden = (h0, c0)

    for _ in range(max_len):
        inp = torch.tensor([tgt_ids], dtype=torch.long).to(DEVICE)
        logits, hidden = decoder(inp, hidden)
        next_token = logits[:, -1, :].argmax(dim=-1).item()
        if next_token == eos_id:
            break
        tgt_ids.append(next_token)

    # decode ids back to text
    return sp_roman.decode(tgt_ids[1:])  # remove BOS

# 3. Example
urdu_sentence = "ہزاروں خواہشیں ایسی کہ ہر خواہش پہ دم نکلے"
print("Predicted Roman Urdu:", translate_sentence(urdu_sentence, sp_urdu, sp_roman, encoder, dec_init, decoder))


Predicted Roman Urdu: hazāroñ sī ik k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k k


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level tokenizer (use num_words to limit vocab size)
tokenizer_urdu = Tokenizer(num_words=10000, oov_token="<unk>")
tokenizer_roman = Tokenizer(num_words=10000, oov_token="<unk>")

tokenizer_urdu.fit_on_texts(urdu_sentences)
tokenizer_roman.fit_on_texts(roman_sentences)

# Convert to sequences
input_sequences = tokenizer_urdu.texts_to_sequences(urdu_sentences)
target_sequences = tokenizer_roman.texts_to_sequences(roman_sentences)

# Pad
max_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding="post")
target_sequences = pad_sequences(target_sequences, maxlen=max_len, padding="post")


