Setup and Dependencies

In [1]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score

# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Special tokens
PAD, BOS, EOS, UNK = 0, 1, 2, 3

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
en_ur_pairs = [
 ('I am going to school', 'میں سکول جا رہا ہوں'),
 ('He is a doctor', 'وہ ایک ڈاکٹر ہے'),
 ('It is a beautiful day', 'یہ ایک خوبصورت دن ہے'),
 ('The book is on the table', 'کتاب میز پر ہے'),
 ('Are you okay?', 'کیا آپ ٹھیک ہیں؟'),
 ('I need to drink water', 'مجھے پانی پینا ہے'),
 ('We will go to the market tomorrow', 'ہم کل بازار جائیں گے'),
 ('Where do you live?', 'تم کہاں رہتے ہو؟'),
 ('My name is Mahnoor', 'میرا نام ماہ نور ہے'),
 ('Thank you', 'آپ کا شکریہ'),
 ('Close the door', 'دروازہ بند کر دو'),
 ('I need help', 'مجھے مدد کی ضرورت ہے'),
 ('Did you eat food?', 'کیا تم نے کھانا کھایا؟'),
 ('I watched a movie', 'میں نے فلم دیکھی'),
 ('It is raining outside', 'باہر بارش ہو رہی ہے'),
 ('I am tired', 'میں تھک گیا ہوں'),
 ('She is very intelligent', 'وہ بہت ذہین ہے'),
 ('We are all friends', 'ہم سب دوست ہیں'),
 ('Your shawl is very beautiful', 'تمہاری شال بہت خوبصورت ہے'),
 ('I have a dog', 'میرے پاس ایک کتا ہے'),
 ('This book is very interesting', 'یہ کتاب بہت دلچسپ ہے'),
 ('I like technology', 'مجھے ٹیکنالوجی پسند ہے'),
 ('Where are you going?', 'تم کہاں جا رہے ہو؟'),
 ('My parents are happy', 'میرے والدین خوش ہیں'),
 ('What do you want?', 'آپ کو کیا چاہیے؟'),
 ('I am watching you', 'میں تمہیں دیکھ رہا ہوں'),
 ('We are all learning', 'ہم سب کچھ سیکھ رہے ہیں'),
 ('This game was very fun', 'یہ کھیل بہت مزے کا تھا'),
 ("My friend's name is Ali", 'میرے دوست کا نام علی ہے'),
 ('What are you studying?', 'تم کیا پڑھ رہے ہو؟'),
 ('I have started learning a new language',
  'میں نے نئی زبان سیکھنی شروع کی ہے'),
 ('Come with me', 'میرے ساتھ چلوں'),
 ('We will meet soon', 'ہماری ملاقات جلد ہوگی'),
 ('She cooks very well', 'وہ بہت اچھا کھانا پکاتی ہے'),
 ('This is my favorite color', 'یہ میرا پسندیدہ رنگ ہے'),
 ('How old are you?', 'تم کتنے سال کے ہو؟'),
 ('I am very happy', 'میں بہت خوش ہوں'),
 ('Do you like tea?', 'کیا آپ کو چائے پسند ہے؟'),
 ('I have finished the book', 'میں نے کتاب مکمل کر لی'),
 ('This is all very difficult', 'یہ سب کچھ بہت مشکل ہے'),
 ('I have enough time', 'میرے پاس کافی وقت ہے'),
 ('How much do you need to travel?', 'تمہیں کتنا سفر کرنا ہے؟'),
 ('We have started a new project', 'ہم نے ایک نیا منصوبہ شروع کیا ہے'),
 ('How many people are attending the event?',
  'کتنے لوگ اس تقریب میں شامل ہیں؟'),
 ('Did you complete your work?', 'کیا آپ نے اپنا کام مکمل کیا؟'),
 ('I bought new clothes', 'میں نے نئے کپڑے خریدے ہیں'),
 ('He is very hardworking', 'وہ بہت محنتی ہے'),
 ('My friend helped me', 'میرے دوست نے میری مدد کی'),
 ('Where do you work?', 'آپ کہاں کام کرتے ہیں؟'),
 ('My father is an engineer', 'میرے والد ایک انجینئر ہیں'),
 ('She took care of me', 'اس نے میرا خیال رکھا'),
 ('Do you have time?', 'کیا آپ کے پاس وقت ہے؟'),
 ('We are all very happy', 'ہم سب بہت خوش ہیں'),
 ('I will come in a little while', 'میں ابھی تھوڑی دیر میں آ رہا ہوں'),
 ('This game is very interesting to me', 'یہ کھیل میرے لیے بہت دلچسپ ہے'),
 ('How was your day?', 'آپ کا دن کیسا رہا؟'),
 ('I made a beautiful picture', 'میں نے ایک خوبصورت تصویر بنائی'),
 ('This place is very beautiful', 'یہ جگہ بہت خوبصورت ہے'),
 ('What do you need?', 'تمہیں کیا چاہیے؟'),
 ('We have started a new plan', 'ہم نے ایک نیا منصوبہ شروع کیا ہے'),
 ('How long have you been here?', 'تم کتنی دیر سے یہاں ہو؟'),
 ('I want to talk to you', 'میں تم سے بات کرنا چاہتا ہوں'),
 ('Come with me', 'میرے ساتھ چلیں'),
 ('This work is very important', 'یہ کام بہت ضروری ہے'),
 ('He studies very well', 'وہ بہت اچھا پڑھتا ہے'),
 ('I have bought a new phone', 'میں نے نیا موبائل خریدا ہے'),
 ('Do you read books?', 'کیا آپ کتابیں پڑھتے ہیں؟'),
 ('We will go to the park tomorrow', 'ہم کل پارک جائیں گے'),
 ('I need your help', 'مجھے تمہاری مدد کی ضرورت ہے'),
 ('Where are you going?', 'آپ کہاں جا رہے ہیں؟'),
 ('He always helps me', 'وہ ہمیشہ میری مدد کرتا ہے'),
 ('My parents are very loving', 'میرے والدین بہت محبت کرنے والے ہیں'),
 ('We have moved to a new city', 'ہم ایک نئے شہر میں منتقل ہو گئے ہیں'),
 ('We should always tell the truth', 'ہمیں ہمیشہ سچ بولنا چاہیے'),
 ('What do you have?', 'تمہارے پاس کیا ہے؟'),
 ('I did a lot of work today', 'میں نے آج بہت کام کیا'),
 ('I have a new computer', 'میرے پاس ایک نئے کمپیوٹر ہے'),
 ('How long has it been?', 'تمہیں کتنی دیر ہو گئی؟'),
 ('He is very cheerful', 'وہ بہت خوش مزاج ہے'),
 ('Our team worked very hard', 'ہماری ٹیم نے بہت محنت کی'),
 ('He keeps smiling always', 'وہ ہمیشہ مسکراتا رہتا ہے'),
 ('We had a great time', 'ہم نے بہت اچھا وقت گزارا'),
 ('My friends are very helpful', 'میرے دوست بہت مددگار ہیں'),
 ('My parents are very kind', 'میرے والدین بہت مہربان ہیں'),
 ('We made a beautiful garden', 'ہم نے ایک خوبصورت باغ بنایا'),
 ('It is important to solve this issue', 'یہ مسئلہ حل کرنا ضروری ہے'),
 ('Where do you need to go?', 'تم نے کہاں جانا ہے؟'),
 ('I am happy to meet you', 'میں تم سے مل کر خوش ہوں'),
 ('This question is very interesting', 'یہ سوال بہت دلچسپ ہے'),
 ('I am coming right now', 'میں ابھی آ رہا ہوں'),
 ('I told you', 'میں نے تمہیں کہا تھا'),
 ('How are you?', 'تم کیسے ہو؟'),
 ('Did you buy a new phone?', 'کیا تم نے نیا موبائل خریدا؟'),
 ('This is a very difficult question', 'یہ بہت مشکل سوال ہے'),
 ('My friends are very intelligent', 'میرے دوست بہت ذہین ہیں'),
 ('What do you think about all this?', 'تمہارے خیال میں یہ سب کچھ کیسا ہے؟'),
 ('We help each other', 'ہم سب ایک دوسرے کی مدد کرتے ہیں'),
 ('I love you very much', 'میں تم سے بہت محبت کرتا ہوں'),
 ('Have a pleasant day', 'آپ کا دن خوش گوار گزرے'),
 ('You are very hardworking', 'تم بہت محنتی ہو'),
 ('This is very important for us', 'یہ ہمارے لئے بہت اہم ہے'),
 ('I am coming right now', 'میں ابھی آ رہا ہوں')
 ]

 # English-to-Urdu is just reversed
ur_en_pairs = [(ur, en) for en, ur in en_ur_pairs]

In [2]:
# Dummy dataset: Urdu-English pairs
en_ur_pairs = [
    ("hello", "ہیلو"),
    ("how are you", "آپ کیسے ہیں"),
    ("thank you", "شکریہ"),
    ("good morning", "صبح بخیر"),
]
# English-to-Urdu is just reversed
ur_en_pairs = [(ur, en) for en, ur in en_ur_pairs]


Transformer Seq2Seq Code (Single Notebook Script)

1. Data Preparation:
- We define a small Urdu-English dataset.
- Each sentence pair is a translation of the other.
2. Vocabulary Building:
- We manually assign indices to words.
- <pad>: padding, <sos>: start of sentence, <eos>: end of sentence, <unk>: unknown token.

In [3]:
def build_vocab(sentences):
    tokens = set()
    for sent in sentences:
        tokens.update(sent.split())
    vocab = {tok: i+4 for i, tok in enumerate(sorted(tokens))}
    vocab['<pad>'], vocab['<bos>'], vocab['<eos>'], vocab['<unk>'] = PAD, BOS, EOS, UNK
    return vocab

src_vocab_en = build_vocab([e for e, _ in en_ur_pairs])
tgt_vocab_ur = build_vocab([u for _, u in en_ur_pairs])
src_vocab_ur = build_vocab([u for u, _ in ur_en_pairs])
tgt_vocab_en = build_vocab([e for _, e in ur_en_pairs])

inv_tgt_vocab_ur = {i: tok for tok, i in tgt_vocab_ur.items()}
inv_tgt_vocab_en = {i: tok for tok, i in tgt_vocab_en.items()}


3. Encoding Sentences to Tensors
- This function converts a sentence into a fixed-length vector of word indices.
- All sentences are padded to max_len.
4. Positional Encoding
- Since transformers don’t use recurrence (like RNNs), we inject position information using sine/cosine patterns.
-Added to word embeddings.
5. Transformer Seq2Seq Model
- We create a standard encoder-decoder transformer using PyTorch’s built-in module.
- src and tgt inputs are passed with positional encodings.
- The output is passed through a linear layer to predict the next token.

Attention Mechanism
The transformer:

- Computes self-attention in encoder and decoder to find word relationships in each sentence.
- Computes cross-attention in the decoder to relate target words with source words.

In [4]:
# Dataset class
def tokenize_and_encode(sentence, vocab):
    return [vocab.get(tok, UNK) for tok in sentence.split()]

class TranslationDataset(Dataset):
    def __init__(self, pairs, src_vocab, tgt_vocab, max_len=10):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self): return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        src_ids = [BOS] + tokenize_and_encode(src, self.src_vocab)[:self.max_len-2] + [EOS]
        tgt_ids = [BOS] + tokenize_and_encode(tgt, self.tgt_vocab)[:self.max_len-2] + [EOS]
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

In [5]:
# Collate fn for padding
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_lens = [len(x) for x in src_batch]
    tgt_lens = [len(x) for x in tgt_batch]
    max_src, max_tgt = max(src_lens), max(tgt_lens)
    src_padded = torch.stack([torch.cat([x, x.new_full((max_src-len(x),), PAD)]) for x in src_batch])
    tgt_padded = torch.stack([torch.cat([x, x.new_full((max_tgt-len(x),), PAD)]) for x in tgt_batch])
    return src_padded, tgt_padded

In [6]:
# Custom Multi-Head Attention with gating
class CustomMultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        self.mha = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.gate = nn.Parameter(torch.rand(1))

    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
        attn_output, _ = self.mha(query, key, value,
                                  attn_mask=attn_mask,
                                  key_padding_mask=key_padding_mask)
        # gated residual
        return self.gate*attn_output + (1 - self.gate)*query

# Encoder and Decoder layers using custom attention
class CustomEncoderLayer(nn.Module):
    def __init__(self, emb_size, nhead, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.self_attn = CustomMultiheadAttention(emb_size, nhead, dropout)
        self.linear1 = nn.Linear(emb_size, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, emb_size)
        self.norm1 = nn.LayerNorm(emb_size)
        self.norm2 = nn.LayerNorm(emb_size)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # src: seq_len, batch, emb_size
        src2 = self.self_attn(src, src, src,
                              attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(torch.relu(self.linear1(src))))
        src = src + self.dropout2(src2)
        return self.norm2(src)


In [7]:
class CustomDecoderLayer(nn.Module):
    def __init__(self, emb_size, nhead, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.self_attn = CustomMultiheadAttention(emb_size, nhead, dropout)
        self.multihead_attn = CustomMultiheadAttention(emb_size, nhead, dropout)
        self.linear1 = nn.Linear(emb_size, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, emb_size)
        self.norm1 = nn.LayerNorm(emb_size)
        self.norm2 = nn.LayerNorm(emb_size)
        self.norm3 = nn.LayerNorm(emb_size)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_key_padding_mask=None,
                tgt_key_padding_mask=None):
        tgt2 = self.self_attn(tgt, tgt, tgt,
                              attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2 = self.multihead_attn(tgt, memory, memory,
                                   key_padding_mask=memory_key_padding_mask)
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(torch.relu(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        return self.norm3(tgt)


6. Training Loop
- For each pair:
-- Input: Urdu sentence
-- Output: English sentence (shifted for decoder input vs target)

- We use CrossEntropyLoss (ignoring <pad>) to train the network to predict the next token.
- Optimizer: Adam
7. Translation Function
-Start with <sos>, and use the model to generate one token at a time.

-Each output is fed back into the model (greedy decoding).

In [8]:
# Full Seq2Seq with custom layers
class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers, num_decoder_layers,
                 emb_size, nhead, src_vocab_size, tgt_vocab_size,
                 dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.emb_size = emb_size
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size)
        self.encoder_layers = nn.ModuleList([
            CustomEncoderLayer(emb_size, nhead, dim_feedforward, dropout)
            for _ in range(num_encoder_layers)
        ])
        self.decoder_layers = nn.ModuleList([
            CustomDecoderLayer(emb_size, nhead, dim_feedforward, dropout)
            for _ in range(num_decoder_layers)
        ])
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

    def encode(self, src, src_mask, src_key_padding_mask):
        src_emb = self.positional_encoding(self.src_tok_emb(src) * math.sqrt(self.emb_size))
        memory = src_emb.transpose(0, 1)
        for layer in self.encoder_layers:
            memory = layer(memory, src_mask, src_key_padding_mask)
        return memory

    def decode(self, tgt, memory, tgt_mask,
               tgt_key_padding_mask, memory_key_padding_mask):
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt) * math.sqrt(self.emb_size))
        output = tgt_emb.transpose(0, 1)
        for layer in self.decoder_layers:
            output = layer(output, memory, tgt_mask,
                           memory_key_padding_mask,
                           tgt_key_padding_mask)
        return output

    def forward(self, src, tgt, src_mask, tgt_mask,
                src_key_padding_mask, tgt_key_padding_mask,
                memory_key_padding_mask):
        memory = self.encode(src, src_mask, src_key_padding_mask)
        decoder_output = self.decode(tgt, memory,
                                     tgt_mask,
                                     tgt_key_padding_mask,
                                     memory_key_padding_mask)
        out = self.generator(decoder_output.transpose(0, 1))
        return out


8. Evaluation Metrics
- BLEU Score: Measures n-gram precision against reference.
- METEOR Score: Also considers synonyms and stem matches (more suitable for low-resource languages).
- Both are calculated for generated vs reference sentences.



In [9]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, emb_size)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, emb_size, 2).float() *
                             (-math.log(10000.0) / emb_size))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0).to(device)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return x

In [10]:
# Mask generation

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

# Greedy decode for inference
def greedy_decode(model, src_sentence, src_vocab, tgt_vocab,
                  inv_tgt_vocab, max_len=10):
    model.eval()
    tokens = [BOS] + [src_vocab.get(tok, UNK) for tok in src_sentence.split()] + [EOS]
    src = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
    src_mask = torch.zeros(len(tokens), len(tokens)).type(torch.bool).to(device)
    src_key_padding_mask = (src == PAD)
    memory = model.encode(src, src_mask, src_key_padding_mask)
    ys = torch.tensor([[BOS]], dtype=torch.long).to(device)
    for i in range(max_len):
        tgt_mask = generate_square_subsequent_mask(ys.size(1)).to(device)
        out = model.decode(ys, memory, tgt_mask,
                           tgt_key_padding_mask=(ys==PAD),
                           memory_key_padding_mask=src_key_padding_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        next_word = prob.argmax(1).item()
        ys = torch.cat([ys, torch.tensor([[next_word]]).to(device)], dim=1)
        if next_word == EOS:
            break
    translated = [inv_tgt_vocab.get(idx, '<unk>') for idx in ys.squeeze().tolist()]
    return ' '.join(translated[1:-1])

9. Custom Input Testing
- Lets you input your own Urdu or English sentence, get the model's prediction, and evaluate it with BLEU and METEOR.
- Results are stored in a CSV file.

In [11]:
# Training and evaluation

def train_model(model, dataloader, optimizer, loss_fn, num_epochs=20):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input, tgt_out = tgt[:, :-1], tgt[:, 1:]
            src_mask = torch.zeros((src.size(1), src.size(1))).type(torch.bool).to(device)
            tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device)
            src_pad_mask = (src == PAD)
            tgt_pad_mask = (tgt_input == PAD)
            logits = model(src, tgt_input, src_mask, tgt_mask,
                           src_pad_mask, tgt_pad_mask, src_pad_mask)
            optimizer.zero_grad()
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")


def evaluate(model, pairs, src_vocab, tgt_vocab, inv_tgt_vocab):
    bleu_scores, meteor_scores = [], []
    for src_sent, tgt_sent in pairs:
        pred = greedy_decode(model, src_sent, src_vocab, tgt_vocab, inv_tgt_vocab)
        ref = [tgt_sent.split()]
        bleu_scores.append(sentence_bleu(ref, pred.split()))
        meteor_scores.append(meteor_score(ref, pred.split()))
    return sum(bleu_scores)/len(bleu_scores), sum(meteor_scores)/len(meteor_scores)


In [12]:
def main():
    # Urdu->English
    dataset_ur_en = TranslationDataset(ur_en_pairs, src_vocab_ur, tgt_vocab_en)
    loader_ur_en = DataLoader(dataset_ur_en, batch_size=2, collate_fn=collate_fn)
    model_ur_en = Seq2SeqTransformer(2, 2, emb_size=64, nhead=4,
                                     src_vocab_size=len(src_vocab_ur),
                                     tgt_vocab_size=len(tgt_vocab_en)).to(device)
    optimizer_ur_en = optim.Adam(model_ur_en.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD)
    print("Training Urdu->English...")
    train_model(model_ur_en, loader_ur_en, optimizer_ur_en, loss_fn, num_epochs=50)
    bleu_ur_en, meteor_ur_en = evaluate(model_ur_en, ur_en_pairs,
                                        src_vocab_ur, tgt_vocab_en, inv_tgt_vocab_en)
    print(f"Urdu->English BLEU: {bleu_ur_en:.4f}, METEOR: {meteor_ur_en:.4f}")

    # English->Urdu
    dataset_en_ur = TranslationDataset(en_ur_pairs, src_vocab_en, tgt_vocab_ur)
    loader_en_ur = DataLoader(dataset_en_ur, batch_size=2, collate_fn=collate_fn)
    model_en_ur = Seq2SeqTransformer(2, 2, emb_size=64, nhead=4,
                                     src_vocab_size=len(src_vocab_en),
                                     tgt_vocab_size=len(tgt_vocab_ur)).to(device)
    optimizer_en_ur = optim.Adam(model_en_ur.parameters(), lr=0.001)
    print("Training English->Urdu...")
    train_model(model_en_ur, loader_en_ur, optimizer_en_ur, loss_fn, num_epochs=50)
    bleu_en_ur, meteor_en_ur = evaluate(model_en_ur, en_ur_pairs,
                                        src_vocab_en, tgt_vocab_ur, inv_tgt_vocab_ur)
    print(f"English->Urdu BLEU: {bleu_en_ur:.4f}, METEOR: {meteor_en_ur:.4f}")

    # Test demo sentences
    test_sentences_ur = ["آپ کیسے ہیں", "صبح بخیر"]
    for sent in test_sentences_ur:
        print(f"Urdu->English '{sent}' -> {greedy_decode(model_ur_en, sent, src_vocab_ur, tgt_vocab_en, inv_tgt_vocab_en)}")
    test_sentences_en = ["good morning", "thank you"]
    for sent in test_sentences_en:
        print(f"English->Urdu '{sent}' -> {greedy_decode(model_en_ur, sent, src_vocab_en, tgt_vocab_ur, inv_tgt_vocab_ur)}")

if __name__ == "__main__":
    main()


Training Urdu->English...




Epoch 1, Loss: 4.5494
Epoch 2, Loss: 3.6744
Epoch 3, Loss: 3.1150
Epoch 4, Loss: 2.6542
Epoch 5, Loss: 2.2812
Epoch 6, Loss: 1.9221
Epoch 7, Loss: 1.6436
Epoch 8, Loss: 1.3533
Epoch 9, Loss: 1.1666
Epoch 10, Loss: 0.9620
Epoch 11, Loss: 0.7592
Epoch 12, Loss: 0.6458
Epoch 13, Loss: 0.5509
Epoch 14, Loss: 0.4834
Epoch 15, Loss: 0.3901
Epoch 16, Loss: 0.3227
Epoch 17, Loss: 0.2678
Epoch 18, Loss: 0.2153
Epoch 19, Loss: 0.1928
Epoch 20, Loss: 0.1476
Epoch 21, Loss: 0.1166
Epoch 22, Loss: 0.1047
Epoch 23, Loss: 0.1205
Epoch 24, Loss: 0.0896
Epoch 25, Loss: 0.0685
Epoch 26, Loss: 0.0620
Epoch 27, Loss: 0.0532
Epoch 28, Loss: 0.0549
Epoch 29, Loss: 0.0370
Epoch 30, Loss: 0.0334
Epoch 31, Loss: 0.0402
Epoch 32, Loss: 0.0329
Epoch 33, Loss: 0.0260
Epoch 34, Loss: 0.0302
Epoch 35, Loss: 0.0246
Epoch 36, Loss: 0.0289
Epoch 37, Loss: 0.0614
Epoch 38, Loss: 0.0604
Epoch 39, Loss: 0.0448
Epoch 40, Loss: 0.0819
Epoch 41, Loss: 0.1503
Epoch 42, Loss: 0.2260
Epoch 43, Loss: 0.2546
Epoch 44, Loss: 0.17

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Urdu->English BLEU: 0.8898, METEOR: 0.9897
Training English->Urdu...
Epoch 1, Loss: 4.6388
Epoch 2, Loss: 3.8041
Epoch 3, Loss: 3.0956
Epoch 4, Loss: 2.5871
Epoch 5, Loss: 2.1756
Epoch 6, Loss: 1.7753
Epoch 7, Loss: 1.4849
Epoch 8, Loss: 1.2640
Epoch 9, Loss: 1.0521
Epoch 10, Loss: 0.8263
Epoch 11, Loss: 0.6659
Epoch 12, Loss: 0.5491
Epoch 13, Loss: 0.4457
Epoch 14, Loss: 0.3620
Epoch 15, Loss: 0.2994
Epoch 16, Loss: 0.2314
Epoch 17, Loss: 0.2027
Epoch 18, Loss: 0.1479
Epoch 19, Loss: 0.1443
Epoch 20, Loss: 0.1364
Epoch 21, Loss: 0.1221
Epoch 22, Loss: 0.1641
Epoch 23, Loss: 0.1044
Epoch 24, Loss: 0.1035
Epoch 25, Loss: 0.1094
Epoch 26, Loss: 0.0898
Epoch 27, Loss: 0.0897
Epoch 28, Loss: 0.0610
Epoch 29, Loss: 0.0781
Epoch 30, Loss: 0.0669
Epoch 31, Loss: 0.1108
Epoch 32, Loss: 0.0847
Epoch 33, Loss: 0.0524
Epoch 34, Loss: 0.0621
Epoch 35, Loss: 0.0394
Epoch 36, Loss: 0.0343
Epoch 37, Loss: 0.0554
Epoch 38, Loss: 0.0536
Epoch 39, Loss: 0.0444
Epoch 40, Loss: 0.0281
Epoch 41, Loss: 0.02