In [None]:
!pip install sacrebleu --quiet

import torch
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Only ask CUDA for the device name when CUDA is actually in use:
# torch.cuda.get_device_name(0) raises on a CPU-only runtime.
print("Device:", device, torch.cuda.get_device_name(0) if device.type == "cuda" else "(no GPU)")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDevice: cuda Tesla T4


In [None]:
from google.colab import files

# Upload the BPE-encoded JSON splits through Colab's file picker.
# Later cells read them back as /content/train_bpe.json, /content/val_bpe.json
# and /content/test_bpe.json.
uploaded = files.upload()


Saving test_bpe.json to test_bpe.json
Saving train_bpe.json to train_bpe.json
Saving val_bpe.json to val_bpe.json


In [None]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2  # special-token ids; presumably must match the ids used when the *_bpe.json files were encoded - TODO confirm

class BpeDataset(Dataset):
    """Map-style dataset over a JSON file holding a list of encoded sentence pairs.

    Every record is read through its 'urdu_ids' (source) and 'roman_ids'
    (target) entries, each a list of integer token ids.
    """

    def __init__(self, path):
        # The whole file is parsed once and kept in memory as `self.data`.
        with open(path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
        self.data = records

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the (source, target) pair at `idx` as two 1-D long tensors."""
        record = self.data[idx]
        return tuple(
            torch.tensor(record[key], dtype=torch.long)
            for key in ('urdu_ids', 'roman_ids')
        )

def collate_fn(batch):
    """Turn a list of (src, trg) tensor pairs into padded batch tensors.

    Returns:
        (src_pad, src_lens, trg_pad, trg_lens): the padded tensors are
        batch-first and filled with PAD_IDX; the length tensors hold the
        unpadded length of every sequence.
    """
    src_seqs, trg_seqs = zip(*batch)

    def _pad(seqs):
        # Right-pad every sequence to the longest one in this batch.
        return pad_sequence(seqs, batch_first=True, padding_value=PAD_IDX)

    src_lens = torch.tensor(list(map(len, src_seqs)))
    trg_lens = torch.tensor(list(map(len, trg_seqs)))
    return _pad(src_seqs), src_lens, _pad(trg_seqs), trg_lens

# Paths for Colab: the three splits are read from /content.
train_ds = BpeDataset("/content/train_bpe.json")
val_ds   = BpeDataset("/content/val_bpe.json")
test_ds  = BpeDataset("/content/test_bpe.json")

# Batches of 16 for this first pipeline check; only the training loader shuffles.
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Loss function (ignores PAD tokens during training).
# NOTE: this module-level `criterion` is the one test_model() reads as a global;
# run_training() builds its own loss (with label smoothing) internally.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)


In [None]:
# Sanity check only: pull one batch and show the padded shapes and the
# first ten unpadded lengths on each side.
src, src_lens, trg, trg_lens = next(iter(train_loader))
print("SRC batch:", src.shape)
print("TRG batch:", trg.shape)
print("SRC lens:", src_lens[:10])
print("TRG lens:", trg_lens[:10])


SRC batch: torch.Size([16, 11])
TRG batch: torch.Size([16, 16])
SRC lens: tensor([10,  8,  7, 10, 11,  8, 10,  8, 11, 11])
TRG lens: tensor([12, 10, 12, 13, 14, 16, 13, 14, 13, 16])


In [None]:
# Find actual vocab sizes from dataset: largest id seen in the training split, plus one.
# NOTE(review): ids that only occur in val/test are not counted here.
max_src_id = max(max(item['urdu_ids']) for item in train_ds.data)
max_trg_id = max(max(item['roman_ids']) for item in train_ds.data)

SRC_VOCAB_SIZE = max_src_id + 1
TRG_VOCAB_SIZE = max_trg_id + 1

print("SRC vocab size:", SRC_VOCAB_SIZE)
print("TRG vocab size:", TRG_VOCAB_SIZE)

# Sanity check batch
src_pad, src_lens, trg_pad, trg_lens = next(iter(train_loader))
print("SRC shape:", src_pad.shape)
print("TRG shape:", trg_pad.shape)

# Create fake logits with correct vocab size: [batch, trg_len, TRG_VOCAB_SIZE] of random values
logits = torch.randn(trg_pad.size(0), trg_pad.size(1), TRG_VOCAB_SIZE)

# Compute loss ignoring PAD_IDX: flatten to [batch*trg_len, vocab] vs [batch*trg_len].
# This only verifies that shapes and the ignore_index wiring work; the value itself is from random logits.
out_dim = logits.shape[-1]
loss = criterion(logits.reshape(-1, out_dim), trg_pad.reshape(-1))
print("Sample loss:", loss.item())



SRC vocab size: 15999
TRG vocab size: 15978
SRC shape: torch.Size([16, 11])
TRG shape: torch.Size([16, 14])
Sample loss: 10.13171100616455


In [None]:
class Attention(nn.Module):
    """Projected dot-product attention over bidirectional encoder outputs.

    Args:
        enc_hid_dim: per-direction hidden size of the encoder; the encoder
            outputs passed to `forward` must have `enc_hid_dim * 2` features.
        dec_hid_dim: decoder hidden size; also the size of the returned context.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        # encoder is bidirectional → enc_hid_dim*2
        self.enc_proj = nn.Linear(enc_hid_dim*2, dec_hid_dim)
        self.attn = nn.Linear(dec_hid_dim, dec_hid_dim)

    def forward(self, hidden, encoder_outputs, mask=None):
        """Compute a context vector for one decoding step.

        Args:
            hidden: decoder state, [batch, dec_hid].
            encoder_outputs: [batch, src_len, enc_hid*2].
            mask: optional [batch, src_len]; positions equal to 0 are excluded
                from the attention distribution.

        Returns:
            (context, attn_weights): context is [batch, dec_hid] (a weighted sum
            of the *projected* encoder outputs), attn_weights is [batch, src_len].
        """
        # project encoder outputs → [batch, src_len, dec_hid]
        proj_enc = self.enc_proj(encoder_outputs)

        # transform hidden → [batch, dec_hid, 1]
        query = self.attn(hidden).unsqueeze(2)

        # scores: [batch, src_len]
        scores = torch.bmm(proj_enc, query).squeeze(2)

        if mask is not None:
            # Use the most negative value representable in the scores' dtype.
            # A hard-coded -1e9 does not fit in float16, so masked_fill would
            # raise when the scores are half precision (e.g. under autocast).
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn_weights = torch.softmax(scores, dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), proj_enc).squeeze(1)  # [batch, dec_hid]

        return context, attn_weights



In [None]:
# ---------- paste this whole block into one Colab cell ----------
import json, math, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.optim import Adam
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import sacrebleu

# ---- constants ----
# Special-token ids used for padding, start-of-sequence and end-of-sequence.
PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2
# Module-level device that the training/eval helpers below read as a global.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---- Dataset loader (expects /content/train_bpe.json etc) ----
class BpeDataset(Dataset):
    """In-memory dataset of encoded sentence pairs loaded from one JSON file.

    The file must contain a list of records; each record is accessed via its
    'urdu_ids' and 'roman_ids' lists of token ids.
    """

    def __init__(self, path):
        with open(path, 'r', encoding='utf-8') as fh:
            self.data = json.load(fh)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return (src, trg) long tensors for the record at `idx`."""
        entry = self.data[idx]

        def as_long(ids):
            return torch.tensor(ids, dtype=torch.long)

        return as_long(entry['urdu_ids']), as_long(entry['roman_ids'])

def collate_fn(batch):
    """Collate (src, trg) tensor pairs into batch-first padded tensors.

    Returns:
        (src_pad, src_lens, trg_pad, trg_lens), where the padded tensors use
        PAD_IDX as filler and the length tensors (dtype long) record each
        sequence's original length.
    """
    src_items, trg_items = zip(*batch)
    padded, lengths = [], []
    # Same treatment for both sides: record true lengths, then right-pad.
    for group in (src_items, trg_items):
        lengths.append(torch.tensor([len(seq) for seq in group], dtype=torch.long))
        padded.append(pad_sequence(group, batch_first=True, padding_value=PAD_IDX))
    return padded[0], lengths[0], padded[1], lengths[1]

# Load the three splits from /content (module-level names used by later cells).
train_ds = BpeDataset("/content/train_bpe.json")
val_ds   = BpeDataset("/content/val_bpe.json")
test_ds  = BpeDataset("/content/test_bpe.json")

# Batch size shared by the training and validation loaders; only training is shuffled.
BATCH = 64
train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=BATCH, shuffle=False, collate_fn=collate_fn)

# ---- utilities: build vocab sizes and id->token maps from your JSON files ----
def get_vocab_size(path, id_key):
    """Return the smallest vocabulary size that covers every id in a JSON split.

    Args:
        path: JSON file containing a list of records.
        id_key: record key whose value is a list of integer token ids.

    Returns:
        Largest id found plus one, or 0 when the file holds no ids at all.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Records with an empty id list are skipped and an empty dataset falls
    # back to the default, because max() of an empty sequence raises ValueError.
    max_id = max((max(item[id_key]) for item in data if item[id_key]), default=-1)
    return max_id + 1

def build_id2tok(paths, id_key, token_key):
    """Build an id -> token-string lookup from the id/token lists in JSON files.

    Args:
        paths: JSON files to scan, in order.
        id_key: record key holding the list of token ids.
        token_key: record key holding the list of token strings.
            Assumes ids and tokens are aligned position-by-position
            (ids[i] encodes toks[i]) - TODO confirm against the encoder.

    Returns:
        dict mapping str(id) -> token. The first token seen for an id wins.
        Entries for PAD_IDX / SOS_IDX / EOS_IDX are added if not already present.
        Ids that never appear next to a token are left out, so lookups should
        use `.get(key, "<UNK>")`.
    """
    id2tok = {}
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for item in data:
            # zip() stops at the shorter list, so an id with no token at its
            # position is skipped instead of being pinned to "<UNK>" - a later
            # example that does carry the token can still define it.
            for idx, tok in zip(item[id_key], item[token_key]):
                # if mapping conflict occurs, keep first seen mapping (should be consistent)
                id2tok.setdefault(str(idx), tok)
    # ensure special tokens
    id2tok.setdefault(str(PAD_IDX), "<PAD>")
    id2tok.setdefault(str(SOS_IDX), "<SOS>")
    id2tok.setdefault(str(EOS_IDX), "<EOS>")
    return id2tok

# Size the embedding / output layers from ALL splits, not just train: the model
# is also fed val/test ids, and an id larger than the training maximum would
# index past the embedding table (source) or the output layer (target).
_SPLIT_PATHS = ["/content/train_bpe.json", "/content/val_bpe.json", "/content/test_bpe.json"]
INPUT_DIM  = max(get_vocab_size(p, "urdu_ids") for p in _SPLIT_PATHS)
OUTPUT_DIM = max(get_vocab_size(p, "roman_ids") for p in _SPLIT_PATHS)
# Target-side id -> token lookup used to turn predicted ids back into text.
tgt_id2tok = build_id2tok(_SPLIT_PATHS, "roman_ids", "roman_tokens")

print("SRC vocab size:", INPUT_DIM)
print("TRG vocab size:", OUTPUT_DIM)

# ---- Model definitions (BiLSTM encoder 2 layers, LSTM decoder 4 layers) ----
class EncoderBiLSTM(nn.Module):
    """Bidirectional LSTM encoder over padded, batch-first id sequences.

    Args:
        input_dim: source vocabulary size (number of embedding rows).
        emb_dim: embedding size.
        hid_dim: per-direction LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers; ignored when n_layers == 1.
        pad_idx: id of the padding token (its embedding is kept at zero).
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=2, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        # nn.LSTM applies dropout only between stacked layers, and warns when a
        # non-zero value is passed to a single-layer LSTM - so pass 0 in that case.
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers,
                            bidirectional=True, batch_first=True,
                            dropout=dropout if n_layers > 1 else 0.0)
        # Bridges that squeeze the concatenated fwd/bwd final states to hid_dim.
        self.fc_h = nn.Linear(hid_dim*2, hid_dim)
        self.fc_c = nn.Linear(hid_dim*2, hid_dim)

    def forward(self, src, src_len):
        """Encode a batch.

        Args:
            src: [b, src_len] padded token ids.
            src_len: [b] true lengths (moved to CPU for packing).

        Returns:
            enc_out: [b, max(src_len), hid*2] per-position outputs.
            (h0, c0): each [1, b, hid] - a single-layer initial state built
            from the last layer's forward and backward final states.
        """
        emb = self.embedding(src)                            # [b, src_len, emb]
        # Packing makes the LSTM skip padded positions.
        packed = pack_padded_sequence(emb, src_len.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (h, c) = self.lstm(packed)
        enc_out, _ = pad_packed_sequence(packed_out, batch_first=True)
        # h[-2] / h[-1] are the last layer's forward / backward final states.
        h_cat = torch.cat((h[-2,:,:], h[-1,:,:]), dim=1)     # [b, hid*2]
        c_cat = torch.cat((c[-2,:,:], c[-1,:,:]), dim=1)
        h0 = torch.tanh(self.fc_h(h_cat)).unsqueeze(0)       # [1,b,hid]
        c0 = torch.tanh(self.fc_c(c_cat)).unsqueeze(0)
        return enc_out, (h0, c0)

class DecoderWithAttention(nn.Module):
    """LSTM decoder that attends over the encoder outputs at every step.

    Args:
        output_dim: target vocabulary size.
        emb_dim: embedding size.
        hid_dim: LSTM hidden size (also the attention context size).
        n_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers; ignored when n_layers == 1.
        pad_idx: id of the padding token.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=2, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        # The LSTM input is the token embedding concatenated with the context.
        # Dropout only acts between stacked layers; passing a non-zero value to
        # a single-layer LSTM just triggers a warning, so use 0 in that case.
        self.lstm = nn.LSTM(emb_dim + hid_dim, hid_dim, num_layers=n_layers, batch_first=True,
                            dropout=dropout if n_layers > 1 else 0.0)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.n_layers = n_layers
        self.attention = Attention(hid_dim, hid_dim)

    def forward_step(self, input_token, hidden, cell, encoder_outputs, mask=None):
        """Run one decoding step.

        Args:
            input_token: [b] ids of the previous target token.
            hidden, cell: LSTM state, each [n_layers, b, hid].
            encoder_outputs: [b, src_len, hid*2].
            mask: optional [b, src_len] attention mask, forwarded to Attention.

        Returns:
            (pred, h, c, attn_weights): pred is [b, output_dim] logits; h and c
            are the updated LSTM state.
        """
        emb = self.embedding(input_token).unsqueeze(1)  # [b,1,emb]
        context, attn_weights = self.attention(hidden[-1], encoder_outputs, mask)  # use last layer hidden
        context = context.unsqueeze(1)  # [b,1,hid]
        rnn_input = torch.cat([emb, context], dim=2)   # [b,1,emb+hid]
        out, (h, c) = self.lstm(rnn_input, (hidden, cell))
        pred = self.fc_out(out.squeeze(1))
        return pred, h, c, attn_weights

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step.

    Args:
        encoder: module returning `(enc_out, (h, c))` from `(src, src_len)`.
        decoder: module exposing `forward_step(...)`, `fc_out` and `n_layers`.
        device: kept for reference; tensors are created on the input's device.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_len, trg=None, teacher_forcing=0.5, max_len=100):
        """Decode a batch.

        Args:
            src: [b, src_len] source ids.
            src_len: [b] true source lengths.
            trg: optional [b, trg_len] target ids; column 0 is used as the first
                decoder input. When None, decoding starts from SOS_IDX and runs
                greedily for `max_len` positions.
            teacher_forcing: per-step probability of feeding the gold token
                (only used when `trg` is given).
            max_len: number of output positions when `trg` is None.

        Returns:
            [b, trg_len, vocab] logits. Position 0 is never predicted and stays
            all zeros.
        """
        bsz = src.size(0)
        trg_len = trg.size(1) if trg is not None else max_len
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(bsz, trg_len, vocab_size, device=src.device)

        enc_out, (h, c) = self.encoder(src, src_len)

        # The encoder hands back a single-layer state ([1, b, hid]) while the
        # decoder LSTM needs one state per layer ([n_layers, b, hid]); nn.LSTM
        # rejects a mismatched initial state. Seed every decoder layer with
        # the same encoder summary.
        n_dec_layers = getattr(self.decoder, "n_layers", h.size(0))
        if h.size(0) == 1 and n_dec_layers > 1:
            h = h.repeat(n_dec_layers, 1, 1)
            c = c.repeat(n_dec_layers, 1, 1)

        input_token = trg[:,0] if trg is not None else torch.full((bsz,), SOS_IDX, dtype=torch.long, device=src.device)

        # NOTE(review): no attention mask is passed, so padded source positions
        # can receive attention weight.
        for t in range(1, trg_len):
            pred, h, c, _ = self.decoder.forward_step(input_token, h, c, enc_out)
            outputs[:, t, :] = pred
            if trg is not None:
                # One coin flip per step decides gold token vs. model prediction.
                if torch.rand(1).item() < teacher_forcing:
                    input_token = trg[:, t]
                else:
                    input_token = pred.argmax(1)
            else:
                input_token = pred.argmax(1)
        return outputs


Device: cuda
SRC vocab size: 15999
TRG vocab size: 15978


In [None]:

# ---- training helpers ----
def ids_to_string(ids, id2tok):
    """Convert a sequence of token ids into a plain-text string.

    PAD_IDX and SOS_IDX entries are dropped, decoding stops at the first
    EOS_IDX, and ids missing from `id2tok` (keyed by str(id)) become "<UNK>".
    The pieces are concatenated, "▁" markers are turned into spaces, and the
    result is stripped.
    """
    pieces = []
    for token_id in ids:
        if token_id == PAD_IDX or token_id == SOS_IDX:
            continue
        if token_id == EOS_IDX:
            break
        pieces.append(id2tok.get(str(token_id), "<UNK>"))
    joined = "".join(pieces)
    return joined.replace("▁", " ").strip()

def levenshtein(a,b):
    """Return the edit distance (insert / delete / substitute, unit cost) between two sequences."""
    width = len(b)
    # previous_row[j] = distance between the first i-1 items of `a` and the first j of `b`.
    previous_row = list(range(width + 1))
    for i, item_a in enumerate(a, start=1):
        current_row = [i]
        for j, item_b in enumerate(b, start=1):
            deletion = previous_row[j] + 1
            substitution = previous_row[j - 1] + (item_a != item_b)
            insertion = current_row[j - 1] + 1
            current_row.append(min(deletion, substitution, insertion))
        previous_row = current_row
    return previous_row[width]

def compute_metrics(preds, refs, val_loss):
    """Summarise predictions as (BLEU, CER, perplexity).

    Args:
        preds: predicted strings.
        refs: reference strings, paired with `preds` by position.
        val_loss: mean loss; perplexity is exp(val_loss).

    Returns:
        bleu: sacrebleu corpus BLEU score (0.0 when `preds` is empty).
        cer: mean over sentences of edit distance / reference length
            (1.0 when `preds` is empty).
        ppl: exp(val_loss), or inf if that overflows.
    """
    if len(preds) > 0:
        bleu = sacrebleu.corpus_bleu(preds, [refs]).score
        # max(1, ...) keeps an empty reference from dividing by zero.
        cers = [levenshtein(p, r) / max(1, len(r)) for p, r in zip(preds, refs)]
    else:
        bleu = 0.0
        cers = [1.0]
    cer = sum(cers) / len(cers)

    try:
        ppl = math.exp(val_loss)
    except OverflowError:
        ppl = float('inf')
    return bleu, cer, ppl

# ---- train / eval functions (AMP-safe) ----
def train_epoch(model, loader, optimizer, criterion, scaler, tf_ratio):
    """Train for one pass over `loader` with mixed precision and gradient clipping.

    Args:
        model: seq2seq model called as model(src, src_len, trg, teacher_forcing=...).
        loader: yields (src, src_len, trg, trg_len) batches.
        optimizer: optimizer over the model's parameters.
        criterion: loss over flattened logits / targets.
        scaler: AMP gradient scaler.
        tf_ratio: teacher-forcing probability passed to the model.

    Returns:
        Mean batch loss over the epoch.

    Reads the module-level `device`.
    """
    model.train()
    total_loss = 0.0
    # Mixed precision only makes sense on the GPU; on CPU run in full precision.
    use_amp = device.type == "cuda"
    for src, src_len, trg, _ in tqdm(loader, desc="train", leave=False):
        src, src_len, trg = src.to(device), src_len.to(device), trg.to(device)
        optimizer.zero_grad()
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast(),
        # which emits a FutureWarning on every call.
        with torch.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(src, src_len, trg, teacher_forcing=tf_ratio)
            out_dim = outputs.shape[-1]
            # Position 0 is the start token and is never predicted, so score
            # positions 1.. only.
            loss = criterion(outputs[:,1:,:].reshape(-1, out_dim), trg[:,1:].reshape(-1))
        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion, id2tok):
    """Greedy-decode every batch in `loader` and collect loss and text.

    Args:
        model: seq2seq model; called with trg=None so it decodes freely for
            as many positions as the reference has.
        loader: yields (src, src_len, trg, trg_len) batches.
        criterion: loss over flattened logits / targets.
        id2tok: str(id) -> token lookup used to render strings.

    Returns:
        (mean batch loss, predicted strings, reference strings).
        The loss is measured on free-running (not teacher-forced) outputs.

    Reads the module-level `device`.
    """
    model.eval()
    total_loss = 0.0
    preds_str = []
    refs_str = []
    with torch.no_grad():
        for src, src_len, trg, _ in tqdm(loader, desc="eval", leave=False):
            src, src_len, trg = src.to(device), src_len.to(device), trg.to(device)
            outputs = model(src, src_len, trg=None, teacher_forcing=0.0, max_len=trg.size(1))
            out_dim = outputs.shape[-1]
            # The model never writes position 0 (its logits stay all-zero), so
            # scoring it would add a constant uniform-distribution penalty per
            # sentence. Score positions 1.. only, exactly as train_epoch does.
            loss = criterion(outputs[:,1:,:].reshape(-1,out_dim), trg[:,1:].reshape(-1))
            total_loss += loss.item()
            top = outputs.argmax(-1).cpu().tolist()
            refs = trg.cpu().tolist()
            for pred_ids, ref_ids in zip(top, refs):
                # Drop the unpredicted position 0 instead of relying on its
                # argmax happening to equal the padding id.
                preds_str.append(ids_to_string(pred_ids[1:], id2tok))
                refs_str.append(ids_to_string(ref_ids, id2tok))
    return total_loss / len(loader), preds_str, refs_str



In [None]:
import matplotlib.pyplot as plt

def run_training(INPUT_DIM, OUTPUT_DIM,
                 emb_dim=256, hid_dim=512,
                 enc_layers=4, dec_layers=4,
                 dropout=0.3, lr=5e-4,
                 epochs=12, save_dir="/content",
                 resume_from=None):
    """Build the attention seq2seq model, train it, and checkpoint every epoch.

    Args:
        INPUT_DIM, OUTPUT_DIM: source / target vocabulary sizes.
        emb_dim, hid_dim: embedding and hidden sizes shared by encoder and decoder.
        enc_layers, dec_layers: LSTM depth of the encoder / decoder.
        dropout: dropout passed to both LSTMs.
        lr: Adam learning rate (a resumed optimizer state is loaded on top of it).
        epochs: number of epochs to run in THIS call (added after the resumed epoch).
        save_dir: directory for `checkpoint_epoch{N}.pt` and `best_checkpoint.pt`.
        resume_from: optional checkpoint path written by a previous call; the
            architecture arguments must describe the same model.

    Returns:
        (history, model): history maps "train_loss", "val_loss", "bleu", "cer"
        and "ppl" to per-epoch lists.

    Reads the module-level `device`, `train_loader`, `val_loader`, `tgt_id2tok`
    and `PAD_IDX`.
    """
    import os  # local import: only needed here, to make sure save_dir exists

    os.makedirs(save_dir, exist_ok=True)

    # 1) Build model with attention
    encoder = EncoderBiLSTM(INPUT_DIM, emb_dim, hid_dim,
                            n_layers=enc_layers, dropout=dropout, pad_idx=PAD_IDX)
    decoder = DecoderWithAttention(OUTPUT_DIM, emb_dim, hid_dim,
                                   n_layers=dec_layers, dropout=dropout, pad_idx=PAD_IDX)
    model = Seq2Seq(encoder, decoder, device).to(device)

    optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-6)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)

    scaler = GradScaler()   # AMP scaler

    # Created before the resume block so its state can be restored too.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=3
    )

    start_epoch = 0
    best_val_loss = float("inf")
    best_epoch = -1

    # Resume from checkpoint if provided
    if resume_from is not None:
        checkpoint = torch.load(resume_from, map_location=device)
        model.load_state_dict(checkpoint["model_state"])
        optimizer.load_state_dict(checkpoint["optimizer_state"])
        scaler.load_state_dict(checkpoint["scaler_state"])
        # Older checkpoints have no scheduler state; keep a fresh scheduler then.
        if "scheduler_state" in checkpoint:
            scheduler.load_state_dict(checkpoint["scheduler_state"])
        start_epoch = checkpoint["epoch"] + 1
        best_val_loss = checkpoint.get("best_val_loss", float("inf"))
        best_epoch = checkpoint.get("best_epoch", -1)
        print(f"Resumed training from epoch {start_epoch} using {resume_from}")

    # 2) Track history
    history = {"train_loss": [], "val_loss": [], "bleu": [], "cer": [], "ppl": []}

    # 3) Training loop
    for epoch in range(start_epoch, start_epoch + epochs):
        # Teacher forcing decays linearly from 1 towards 0 over the whole schedule.
        tf = max(0.0, 1 - epoch / max(1, start_epoch + epochs))   # teacher forcing decay

        train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler, tf)
        val_loss, preds, refs = evaluate(model, val_loader, criterion, tgt_id2tok)
        bleu, cer, ppl = compute_metrics(preds, refs, val_loss)

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["bleu"].append(bleu)
        history["cer"].append(cer)
        history["ppl"].append(ppl)

        print(f"[E{epoch+1}] train_loss={train_loss:.4f} "
              f"val_loss={val_loss:.4f} BLEU={bleu:.2f} CER={cer:.4f} PPL={ppl:.2f}")

        # Update the running best BEFORE building the checkpoint, so the saved
        # metadata describes this epoch rather than the previous best.
        is_best = val_loss < best_val_loss
        if is_best:
            best_val_loss = val_loss
            best_epoch = epoch + 1   # 1-based, matches the printed epoch number

        # Step the scheduler before saving so the checkpoint carries any
        # learning-rate reduction triggered by this epoch.
        scheduler.step(val_loss)

        # save full checkpoint ("epoch" is 0-based; resume continues at epoch + 1)
        checkpoint = {
            "epoch": epoch,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "scaler_state": scaler.state_dict(),
            "scheduler_state": scheduler.state_dict(),
            "best_val_loss": best_val_loss,
            "best_epoch": best_epoch
        }
        torch.save(checkpoint, f"{save_dir}/checkpoint_epoch{epoch+1}.pt")

        # save best checkpoint
        if is_best:
            torch.save(checkpoint, f"{save_dir}/best_checkpoint.pt")
            print(f"==> Saved best model at epoch {best_epoch} (val_loss={best_val_loss:.4f})")

    # 4) (Optional) Plotting can go here if you want curves

    return history, model


In [None]:
# Baseline run with run_training's defaults: emb_dim=256, hid_dim=512,
# 4 encoder / 4 decoder layers, dropout=0.3, lr=5e-4, 12 epochs.
history, model = run_training(INPUT_DIM, OUTPUT_DIM)

  scaler = GradScaler()   # correct usage
  with autocast():


[E1] train_loss=6.6281 val_loss=6.6963 BLEU=0.03 CER=0.7517 PPL=809.44
==> Saved best model at epoch 1 (val_loss=6.6963)




[E2] train_loss=6.3598 val_loss=6.6922 BLEU=0.04 CER=0.7503 PPL=806.12
==> Saved best model at epoch 2 (val_loss=6.6922)




[E3] train_loss=6.3292 val_loss=6.7029 BLEU=0.03 CER=0.7940 PPL=814.80




[E4] train_loss=6.2776 val_loss=6.7267 BLEU=0.03 CER=0.8082 PPL=834.40




[E5] train_loss=6.2363 val_loss=6.6827 BLEU=0.04 CER=0.7488 PPL=798.49
==> Saved best model at epoch 5 (val_loss=6.6827)




[E6] train_loss=6.1999 val_loss=6.7344 BLEU=0.03 CER=0.7796 PPL=840.81




[E7] train_loss=6.1894 val_loss=6.6761 BLEU=0.03 CER=0.7517 PPL=793.22
==> Saved best model at epoch 7 (val_loss=6.6761)




[E8] train_loss=6.1917 val_loss=6.6782 BLEU=0.03 CER=0.7544 PPL=794.92




[E9] train_loss=6.1799 val_loss=6.6417 BLEU=0.04 CER=0.7487 PPL=766.39
==> Saved best model at epoch 9 (val_loss=6.6417)




[E10] train_loss=6.1575 val_loss=6.6445 BLEU=0.04 CER=0.8955 PPL=768.52




[E11] train_loss=6.1652 val_loss=6.6144 BLEU=0.04 CER=0.7880 PPL=745.77
==> Saved best model at epoch 11 (val_loss=6.6144)




[E12] train_loss=6.1537 val_loss=6.5733 BLEU=0.05 CER=0.7531 PPL=715.74
==> Saved best model at epoch 12 (val_loss=6.5733)


In [None]:
def test_model(model, test_path, batch_size=64):
    """Evaluate `model` on a JSON split, print the metrics and a few examples.

    Args:
        model: trained seq2seq model.
        test_path: path to the JSON split to evaluate.
        batch_size: evaluation batch size.

    Returns:
        (bleu, cer, ppl, preds, refs)

    Reads the module-level `criterion` and `tgt_id2tok`.
    NOTE(review): that `criterion` is the notebook-level loss, not the
    label-smoothed one built inside run_training, so the loss/PPL printed here
    is not directly comparable with the validation numbers logged during training.
    """
    # load dataset
    test_ds = BpeDataset(test_path)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # evaluate
    test_loss, preds, refs = evaluate(model, test_loader, criterion, tgt_id2tok)
    bleu, cer, ppl = compute_metrics(preds, refs, test_loss)

    print("\n=== Test Set Results ===")
    print(f"Loss: {test_loss:.4f}")
    print(f"BLEU: {bleu:.2f}")
    print(f"CER:  {cer:.4f}")
    print(f"PPL:  {ppl:.2f}")

    # Show some examples: at most five, so a split with fewer sentences does
    # not raise IndexError.
    for ref, prd in zip(refs[:5], preds[:5]):
        print(f"REF: {ref}")
        print(f"PRD: {prd}")
        print("-"*40)

    return bleu, cer, ppl, preds, refs


In [None]:
def translate_sentence(model, urdu_ids, tgt_id2tok, max_len=100, ref_ids=None):
    """Greedy-decode one encoded source sentence and print the result.

    Args:
        model: trained seq2seq model.
        urdu_ids: non-empty list of source token ids.
        tgt_id2tok: str(id) -> token lookup for the target side.
        max_len: number of positions to decode.
        ref_ids: optional reference target ids; when given, BLEU and CER
            against the reference are printed as well.

    Returns:
        The predicted string.

    Raises:
        ValueError: if `urdu_ids` is empty.

    Reads the module-level `device`.
    """
    if len(urdu_ids) == 0:
        # A zero-length sequence cannot be packed by the encoder; fail with a
        # clear message instead of an error from deep inside the LSTM.
        raise ValueError("urdu_ids must contain at least one token id")

    model.eval()
    src = torch.tensor(urdu_ids, dtype=torch.long).unsqueeze(0).to(device)
    src_len = torch.tensor([len(urdu_ids)], dtype=torch.long).to(device)

    # forward (no teacher forcing)
    with torch.no_grad():
        outputs = model(src, src_len, trg=None, teacher_forcing=0.0, max_len=max_len)
        top = outputs.argmax(-1).squeeze(0).tolist()

    # decode ids -> string; position 0 is never predicted by the model, so skip it
    pred_str = ids_to_string(top[1:], tgt_id2tok)

    # optional metrics if reference provided
    if ref_ids is not None:
        ref_str = ids_to_string(ref_ids, tgt_id2tok)
        # No loss is computed for a single sentence, so the perplexity returned
        # here would just be exp(0) = 1; it is discarded rather than printed.
        bleu, cer, _ = compute_metrics([pred_str], [ref_str], val_loss=0.0)
        print("\n=== Single Sentence Test ===")
        print("REF:", ref_str)
        print("PRD:", pred_str)
        print(f"BLEU: {bleu:.2f}, CER: {cer:.4f}")
    else:
        print("\n=== Single Sentence Test ===")
        print("PRD:", pred_str)

    return pred_str


In [None]:
# --- load best model ---
# `model` is the object returned by the most recent run_training call; the
# checkpoint is presumably from that same run, since load_state_dict needs
# matching layer shapes - TODO confirm when re-running cells out of order.
checkpoint = torch.load("/content/best_checkpoint.pt", map_location=device)
model.load_state_dict(checkpoint["model_state"])
model.to(device)

# --- test on whole test_bpe.json ---
test_model(model, "/content/test_bpe.json")

# --- test on a single Urdu sentence ---
# Example: take one example from test_ds (the module-level test dataset)
sample = test_ds[0]
urdu_ids = sample[0].tolist()     # input ids
ref_ids  = sample[1].tolist()     # ground truth roman ids
translate_sentence(model, urdu_ids, tgt_id2tok, ref_ids=ref_ids)


In [None]:
# Experiment: wider embeddings (512) and a lower learning rate (1e-4);
# `epochs` is not passed, so the default of 12 applies.
history, model = run_training(INPUT_DIM, OUTPUT_DIM, emb_dim=512, hid_dim=512,
                 enc_layers=4, dec_layers=4,
                 dropout=0.3, lr=1e-4, )

  scaler = GradScaler()   # correct usage
  with autocast():


[E1] train_loss=7.0891 val_loss=6.7016 BLEU=0.03 CER=0.7700 PPL=813.70
==> Saved best model at epoch 1 (val_loss=6.7016)




[E2] train_loss=6.3436 val_loss=6.7194 BLEU=0.04 CER=0.7712 PPL=828.31




[E3] train_loss=6.3019 val_loss=6.6952 BLEU=0.04 CER=0.7546 PPL=808.48
==> Saved best model at epoch 3 (val_loss=6.6952)




[E4] train_loss=6.2775 val_loss=6.6530 BLEU=0.04 CER=0.7508 PPL=775.08
==> Saved best model at epoch 4 (val_loss=6.6530)




[E5] train_loss=6.2576 val_loss=6.7048 BLEU=0.04 CER=0.7604 PPL=816.35




[E6] train_loss=6.2409 val_loss=6.6127 BLEU=0.04 CER=0.7386 PPL=744.48
==> Saved best model at epoch 6 (val_loss=6.6127)




[E7] train_loss=6.2302 val_loss=6.6249 BLEU=0.04 CER=0.7424 PPL=753.66




[E8] train_loss=6.2121 val_loss=6.6023 BLEU=0.04 CER=0.7462 PPL=736.75
==> Saved best model at epoch 8 (val_loss=6.6023)




[E9] train_loss=6.1968 val_loss=6.5817 BLEU=0.04 CER=0.7576 PPL=721.77
==> Saved best model at epoch 9 (val_loss=6.5817)




[E10] train_loss=6.1655 val_loss=6.5646 BLEU=0.04 CER=0.8667 PPL=709.53
==> Saved best model at epoch 10 (val_loss=6.5646)




[E11] train_loss=6.1319 val_loss=6.5466 BLEU=0.04 CER=0.8644 PPL=696.89
==> Saved best model at epoch 11 (val_loss=6.5466)




[E12] train_loss=6.0952 val_loss=6.5386 BLEU=0.05 CER=0.8360 PPL=691.32
==> Saved best model at epoch 12 (val_loss=6.5386)


In [None]:
# Continue the previous experiment from best_checkpoint.pt for 20 MORE epochs
# (`epochs` counts additional epochs after the restored one). The architecture
# arguments repeat the earlier run so the saved weights fit the rebuilt model.
# NOTE(review): the optimizer state is reloaded from the checkpoint, which
# presumably takes precedence over the `lr` passed here - confirm.
history, model = run_training(INPUT_DIM, OUTPUT_DIM, emb_dim=512, hid_dim=512,
                 enc_layers=4, dec_layers=4,
                 dropout=0.3, lr=1e-4, resume_from="/content/best_checkpoint.pt", epochs=20)

  scaler = GradScaler()   # correct usage


Resumed training from epoch 12 using /content/best_checkpoint.pt


  with autocast():


[E13] train_loss=6.0442 val_loss=6.5260 BLEU=0.06 CER=0.9293 PPL=682.66
==> Saved best model at epoch 13 (val_loss=6.5260)




[E14] train_loss=6.0021 val_loss=6.4947 BLEU=0.07 CER=0.8145 PPL=661.63
==> Saved best model at epoch 14 (val_loss=6.4947)




[E15] train_loss=5.9628 val_loss=6.5067 BLEU=0.10 CER=0.8138 PPL=669.58




[E16] train_loss=5.9139 val_loss=6.5039 BLEU=0.10 CER=0.8632 PPL=667.77




[E17] train_loss=5.8666 val_loss=6.4739 BLEU=0.11 CER=0.8886 PPL=648.03
==> Saved best model at epoch 17 (val_loss=6.4739)




[E18] train_loss=5.8193 val_loss=6.4185 BLEU=0.13 CER=0.8216 PPL=613.07
==> Saved best model at epoch 18 (val_loss=6.4185)




[E19] train_loss=5.7766 val_loss=6.4172 BLEU=0.13 CER=0.8912 PPL=612.28
==> Saved best model at epoch 19 (val_loss=6.4172)




[E20] train_loss=5.7343 val_loss=6.3859 BLEU=0.12 CER=0.8020 PPL=593.40
==> Saved best model at epoch 20 (val_loss=6.3859)




[E21] train_loss=5.6908 val_loss=6.3558 BLEU=0.14 CER=0.7928 PPL=575.81
==> Saved best model at epoch 21 (val_loss=6.3558)




[E22] train_loss=5.6411 val_loss=6.3215 BLEU=0.17 CER=0.7282 PPL=556.38
==> Saved best model at epoch 22 (val_loss=6.3215)




[E23] train_loss=5.5897 val_loss=6.3180 BLEU=0.16 CER=0.7430 PPL=554.45
==> Saved best model at epoch 23 (val_loss=6.3180)




[E24] train_loss=5.5421 val_loss=6.2746 BLEU=0.27 CER=0.7314 PPL=530.93
==> Saved best model at epoch 24 (val_loss=6.2746)




[E25] train_loss=5.4948 val_loss=6.2464 BLEU=0.36 CER=0.7026 PPL=516.14
==> Saved best model at epoch 25 (val_loss=6.2464)




[E26] train_loss=5.4445 val_loss=6.2614 BLEU=0.39 CER=0.7290 PPL=523.94




[E27] train_loss=5.3981 val_loss=6.2324 BLEU=0.53 CER=0.7028 PPL=508.96
==> Saved best model at epoch 27 (val_loss=6.2324)




[E28] train_loss=5.3562 val_loss=6.1974 BLEU=0.73 CER=0.7005 PPL=491.45
==> Saved best model at epoch 28 (val_loss=6.1974)




[E29] train_loss=5.3072 val_loss=6.2023 BLEU=0.92 CER=0.7005 PPL=493.88




[E30] train_loss=5.2621 val_loss=6.1624 BLEU=0.92 CER=0.6885 PPL=474.57
==> Saved best model at epoch 30 (val_loss=6.1624)




[E31] train_loss=5.2176 val_loss=6.1901 BLEU=1.42 CER=0.6996 PPL=487.87




[E32] train_loss=5.1723 val_loss=6.1154 BLEU=1.55 CER=0.6741 PPL=452.80
==> Saved best model at epoch 32 (val_loss=6.1154)


In [None]:
# Fresh run (no resume): smaller hidden size (256), deeper encoder (6 layers), 8 epochs.
# Because best_val_loss starts at infinity, this run overwrites
# /content/best_checkpoint.pt as soon as an epoch yields a finite validation loss.
history, model = run_training(INPUT_DIM, OUTPUT_DIM, emb_dim=512, hid_dim=256,
                 enc_layers=6, dec_layers=4,
                 dropout=0.3, lr=1e-4, epochs=8)

  scaler = GradScaler()   # correct usage
  with autocast():


[E1] train_loss=7.5925 val_loss=6.7545 BLEU=0.00 CER=0.9081 PPL=857.94
==> Saved best model at epoch 1 (val_loss=6.7545)




[E2] train_loss=6.4043 val_loss=6.7107 BLEU=0.03 CER=0.7859 PPL=821.18
==> Saved best model at epoch 2 (val_loss=6.7107)




[E3] train_loss=6.3631 val_loss=6.6800 BLEU=0.04 CER=0.7485 PPL=796.29
==> Saved best model at epoch 3 (val_loss=6.6800)




[E4] train_loss=6.3319 val_loss=6.6673 BLEU=0.04 CER=0.7477 PPL=786.24
==> Saved best model at epoch 4 (val_loss=6.6673)




[E5] train_loss=6.3076 val_loss=6.6398 BLEU=0.03 CER=0.7422 PPL=764.93
==> Saved best model at epoch 5 (val_loss=6.6398)




[E6] train_loss=6.2859 val_loss=6.6231 BLEU=0.03 CER=0.7395 PPL=752.26
==> Saved best model at epoch 6 (val_loss=6.6231)




[E7] train_loss=6.2661 val_loss=6.6141 BLEU=0.03 CER=0.7414 PPL=745.54
==> Saved best model at epoch 7 (val_loss=6.6141)




[E8] train_loss=6.2495 val_loss=6.6113 BLEU=0.04 CER=0.7501 PPL=743.44
==> Saved best model at epoch 8 (val_loss=6.6113)


In [None]:
# Fresh run: emb_dim=256, hid_dim=256, 2 encoder / 4 decoder layers, lighter dropout (0.1), 12 epochs.
history, model = run_training(INPUT_DIM, OUTPUT_DIM, emb_dim=256, hid_dim=256,
                 enc_layers=2, dec_layers=4,
                 dropout=0.1, lr=1e-4, epochs=12)

  scaler = GradScaler()   # correct usage
  with autocast():


[E1] train_loss=7.5720 val_loss=6.7509 BLEU=0.00 CER=0.9081 PPL=854.87
==> Saved best model at epoch 1 (val_loss=6.7509)




[E2] train_loss=6.3882 val_loss=6.7160 BLEU=0.03 CER=0.7469 PPL=825.49
==> Saved best model at epoch 2 (val_loss=6.7160)




[E3] train_loss=6.3329 val_loss=6.6835 BLEU=0.04 CER=0.7285 PPL=799.11
==> Saved best model at epoch 3 (val_loss=6.6835)




[E4] train_loss=6.2944 val_loss=6.6217 BLEU=0.04 CER=0.7297 PPL=751.26
==> Saved best model at epoch 4 (val_loss=6.6217)




[E5] train_loss=6.2507 val_loss=6.6263 BLEU=0.05 CER=0.7591 PPL=754.67




[E6] train_loss=6.2119 val_loss=6.5613 BLEU=0.07 CER=0.7452 PPL=707.19
==> Saved best model at epoch 6 (val_loss=6.5613)




[E7] train_loss=6.1706 val_loss=6.5263 BLEU=0.07 CER=0.7315 PPL=682.87
==> Saved best model at epoch 7 (val_loss=6.5263)




[E8] train_loss=6.1241 val_loss=6.4928 BLEU=0.09 CER=0.7511 PPL=660.35
==> Saved best model at epoch 8 (val_loss=6.4928)




[E9] train_loss=6.0685 val_loss=6.4558 BLEU=0.11 CER=0.7367 PPL=636.38
==> Saved best model at epoch 9 (val_loss=6.4558)




[E10] train_loss=6.0180 val_loss=6.4446 BLEU=0.12 CER=0.7551 PPL=629.30
==> Saved best model at epoch 10 (val_loss=6.4446)




[E11] train_loss=5.9668 val_loss=6.3859 BLEU=0.16 CER=0.7246 PPL=593.40
==> Saved best model at epoch 11 (val_loss=6.3859)




[E12] train_loss=5.9187 val_loss=6.3680 BLEU=0.17 CER=0.7422 PPL=582.92
==> Saved best model at epoch 12 (val_loss=6.3680)


In [None]:
# Continue the emb_dim=256 / hid_dim=256 run from best_checkpoint.pt for 20 more epochs.
# The architecture arguments repeat the previous cell so the checkpoint's weights fit.
history, model = run_training(INPUT_DIM, OUTPUT_DIM, emb_dim=256, hid_dim=256,
                 enc_layers=2, dec_layers=4,
                 dropout=0.1, lr=1e-4, resume_from="/content/best_checkpoint.pt", epochs=20)

  scaler = GradScaler()   # correct usage


Resumed training from epoch 12 using /content/best_checkpoint.pt


  with autocast():


[E13] train_loss=5.8661 val_loss=6.3230 BLEU=0.16 CER=0.7172 PPL=557.26
==> Saved best model at epoch 13 (val_loss=6.3230)




[E14] train_loss=5.8186 val_loss=6.3081 BLEU=0.20 CER=0.7300 PPL=549.03
==> Saved best model at epoch 14 (val_loss=6.3081)




[E15] train_loss=5.7716 val_loss=6.2629 BLEU=0.21 CER=0.7107 PPL=524.76
==> Saved best model at epoch 15 (val_loss=6.2629)




[E16] train_loss=5.7199 val_loss=6.2375 BLEU=0.23 CER=0.6992 PPL=511.58
==> Saved best model at epoch 16 (val_loss=6.2375)




[E17] train_loss=5.6712 val_loss=6.2187 BLEU=0.24 CER=0.7012 PPL=502.04
==> Saved best model at epoch 17 (val_loss=6.2187)




[E18] train_loss=5.6257 val_loss=6.2102 BLEU=0.41 CER=0.7256 PPL=497.79
==> Saved best model at epoch 18 (val_loss=6.2102)




[E19] train_loss=5.5796 val_loss=6.1698 BLEU=0.45 CER=0.7054 PPL=478.08
==> Saved best model at epoch 19 (val_loss=6.1698)




[E20] train_loss=5.5382 val_loss=6.1776 BLEU=0.47 CER=0.7064 PPL=481.85




[E21] train_loss=5.4969 val_loss=6.1251 BLEU=0.60 CER=0.6773 PPL=457.19
==> Saved best model at epoch 21 (val_loss=6.1251)




[E22] train_loss=5.4561 val_loss=6.1141 BLEU=0.65 CER=0.6855 PPL=452.20
==> Saved best model at epoch 22 (val_loss=6.1141)




[E23] train_loss=5.4179 val_loss=6.0904 BLEU=0.89 CER=0.6884 PPL=441.61
==> Saved best model at epoch 23 (val_loss=6.0904)




[E24] train_loss=5.3725 val_loss=6.0728 BLEU=1.16 CER=0.6956 PPL=433.92
==> Saved best model at epoch 24 (val_loss=6.0728)




[E25] train_loss=5.3337 val_loss=6.0675 BLEU=1.26 CER=0.6864 PPL=431.61
==> Saved best model at epoch 25 (val_loss=6.0675)




[E26] train_loss=5.2926 val_loss=6.0346 BLEU=1.50 CER=0.6708 PPL=417.64
==> Saved best model at epoch 26 (val_loss=6.0346)




[E27] train_loss=5.2525 val_loss=6.0312 BLEU=1.69 CER=0.6855 PPL=416.23
==> Saved best model at epoch 27 (val_loss=6.0312)




[E28] train_loss=5.2123 val_loss=5.9987 BLEU=2.09 CER=0.6720 PPL=402.90
==> Saved best model at epoch 28 (val_loss=5.9987)




[E29] train_loss=5.1768 val_loss=5.9857 BLEU=2.42 CER=0.6649 PPL=397.69
==> Saved best model at epoch 29 (val_loss=5.9857)




[E30] train_loss=5.1378 val_loss=5.9874 BLEU=2.40 CER=0.6842 PPL=398.38




[E31] train_loss=5.0997 val_loss=5.9663 BLEU=2.58 CER=0.6792 PPL=390.08
==> Saved best model at epoch 31 (val_loss=5.9663)




[E32] train_loss=5.0623 val_loss=5.9487 BLEU=2.70 CER=0.6482 PPL=383.27
==> Saved best model at epoch 32 (val_loss=5.9487)


In [None]:
# Fresh run with a smaller model: emb_dim=128, hid_dim=256, 1 encoder / 2 decoder layers, 12 epochs.
history, model = run_training(INPUT_DIM, OUTPUT_DIM, emb_dim=128, hid_dim=256,
                 enc_layers=1, dec_layers=2,
                 dropout=0.1, lr=1e-4, epochs=12)

  scaler = GradScaler()   # correct usage
  with autocast():


[E1] train_loss=7.6501 val_loss=6.7549 BLEU=0.00 CER=0.9709 PPL=858.27
==> Saved best model at epoch 1 (val_loss=6.7549)




[E2] train_loss=6.3934 val_loss=6.7156 BLEU=0.03 CER=0.7524 PPL=825.21
==> Saved best model at epoch 2 (val_loss=6.7156)




[E3] train_loss=6.3327 val_loss=6.6718 BLEU=0.04 CER=0.7460 PPL=789.80
==> Saved best model at epoch 3 (val_loss=6.6718)




[E4] train_loss=6.2777 val_loss=6.6295 BLEU=0.05 CER=0.7889 PPL=757.11
==> Saved best model at epoch 4 (val_loss=6.6295)




[E5] train_loss=6.2256 val_loss=6.6044 BLEU=0.06 CER=0.8146 PPL=738.35
==> Saved best model at epoch 5 (val_loss=6.6044)




[E6] train_loss=6.1764 val_loss=6.5395 BLEU=0.06 CER=0.8012 PPL=691.94
==> Saved best model at epoch 6 (val_loss=6.5395)




[E7] train_loss=6.1271 val_loss=6.5003 BLEU=0.07 CER=0.7897 PPL=665.37
==> Saved best model at epoch 7 (val_loss=6.5003)




[E8] train_loss=6.0756 val_loss=6.4501 BLEU=0.08 CER=0.7636 PPL=632.77
==> Saved best model at epoch 8 (val_loss=6.4501)




[E9] train_loss=6.0271 val_loss=6.4162 BLEU=0.13 CER=0.7403 PPL=611.66
==> Saved best model at epoch 9 (val_loss=6.4162)




[E10] train_loss=5.9787 val_loss=6.3810 BLEU=0.14 CER=0.7359 PPL=590.54
==> Saved best model at epoch 10 (val_loss=6.3810)




[E11] train_loss=5.9357 val_loss=6.3547 BLEU=0.16 CER=0.7480 PPL=575.17
==> Saved best model at epoch 11 (val_loss=6.3547)




[E12] train_loss=5.8979 val_loss=6.3237 BLEU=0.17 CER=0.7264 PPL=557.64
==> Saved best model at epoch 12 (val_loss=6.3237)


In [None]:
# Experiment: smaller model (64-dim embeddings, 128 hidden units) with the same
# 1-encoder / 2-decoder layer layout, dropout 0.1 and lr 1e-4.
# `epochs` is not passed, so run_training's own default applies.
history, model = run_training(
    INPUT_DIM,
    OUTPUT_DIM,
    emb_dim=64,
    hid_dim=128,
    enc_layers=1,
    dec_layers=2,
    dropout=0.1,
    lr=1e-4,
)

  scaler = GradScaler()   # correct usage
  with autocast():


[E1] train_loss=8.4675 val_loss=7.0117 BLEU=0.00 CER=1.0000 PPL=1109.52
==> Saved best model at epoch 1 (val_loss=7.0117)




[E2] train_loss=6.5243 val_loss=6.7773 BLEU=0.00 CER=1.0000 PPL=877.73
==> Saved best model at epoch 2 (val_loss=6.7773)




[E3] train_loss=6.4388 val_loss=6.7513 BLEU=0.00 CER=1.0000 PPL=855.13
==> Saved best model at epoch 3 (val_loss=6.7513)




[E4] train_loss=6.4127 val_loss=6.7309 BLEU=0.00 CER=0.9773 PPL=837.93
==> Saved best model at epoch 4 (val_loss=6.7309)




[E5] train_loss=6.3864 val_loss=6.7085 BLEU=0.02 CER=0.8398 PPL=819.35
==> Saved best model at epoch 5 (val_loss=6.7085)




[E6] train_loss=6.3623 val_loss=6.6878 BLEU=0.03 CER=0.7690 PPL=802.54
==> Saved best model at epoch 6 (val_loss=6.6878)




[E7] train_loss=6.3387 val_loss=6.6746 BLEU=0.03 CER=0.7494 PPL=792.05
==> Saved best model at epoch 7 (val_loss=6.6746)




[E8] train_loss=6.3146 val_loss=6.6515 BLEU=0.04 CER=0.7514 PPL=773.98
==> Saved best model at epoch 8 (val_loss=6.6515)




[E9] train_loss=6.2976 val_loss=6.6333 BLEU=0.04 CER=0.7634 PPL=759.97
==> Saved best model at epoch 9 (val_loss=6.6333)




[E10] train_loss=6.2793 val_loss=6.6191 BLEU=0.03 CER=0.7728 PPL=749.27
==> Saved best model at epoch 10 (val_loss=6.6191)




[E11] train_loss=6.2623 val_loss=6.6055 BLEU=0.04 CER=0.7752 PPL=739.17
==> Saved best model at epoch 11 (val_loss=6.6055)




[E12] train_loss=6.2466 val_loss=6.5957 BLEU=0.04 CER=0.7777 PPL=731.97
==> Saved best model at epoch 12 (val_loss=6.5957)


In [None]:
# Experiment: same 64/128 model, but with heavier dropout (0.5) and a larger
# learning rate (5e-4). `epochs` is left at run_training's default.
history, model = run_training(
    INPUT_DIM,
    OUTPUT_DIM,
    emb_dim=64,
    hid_dim=128,
    enc_layers=1,
    dec_layers=2,
    dropout=0.5,
    lr=5e-4,
)

  scaler = GradScaler()   # correct usage
  with autocast():


[E1] train_loss=7.0309 val_loss=6.7329 BLEU=0.02 CER=0.8380 PPL=839.60
==> Saved best model at epoch 1 (val_loss=6.7329)




[E2] train_loss=6.3676 val_loss=6.7024 BLEU=0.03 CER=0.7464 PPL=814.34
==> Saved best model at epoch 2 (val_loss=6.7024)




[E3] train_loss=6.3126 val_loss=6.6971 BLEU=0.04 CER=0.7511 PPL=810.08
==> Saved best model at epoch 3 (val_loss=6.6971)




[E4] train_loss=6.2728 val_loss=6.6990 BLEU=0.05 CER=0.7569 PPL=811.60




[E5] train_loss=6.2292 val_loss=6.6316 BLEU=0.05 CER=0.7626 PPL=758.73
==> Saved best model at epoch 5 (val_loss=6.6316)




[E6] train_loss=6.1850 val_loss=6.5592 BLEU=0.06 CER=0.7799 PPL=705.70
==> Saved best model at epoch 6 (val_loss=6.5592)




[E7] train_loss=6.1361 val_loss=6.5183 BLEU=0.07 CER=0.8055 PPL=677.41
==> Saved best model at epoch 7 (val_loss=6.5183)




[E8] train_loss=6.0846 val_loss=6.4764 BLEU=0.08 CER=0.7702 PPL=649.63
==> Saved best model at epoch 8 (val_loss=6.4764)




[E9] train_loss=6.0288 val_loss=6.4285 BLEU=0.10 CER=0.7784 PPL=619.24
==> Saved best model at epoch 9 (val_loss=6.4285)




[E10] train_loss=5.9688 val_loss=6.3811 BLEU=0.11 CER=0.7261 PPL=590.57
==> Saved best model at epoch 10 (val_loss=6.3811)




[E11] train_loss=5.9037 val_loss=6.3345 BLEU=0.23 CER=0.7353 PPL=563.66
==> Saved best model at epoch 11 (val_loss=6.3345)




[E12] train_loss=5.8345 val_loss=6.2885 BLEU=0.20 CER=0.7219 PPL=538.37
==> Saved best model at epoch 12 (val_loss=6.2885)


In [None]:
# Continue training from a saved checkpoint instead of starting from scratch,
# passing lr=1e-4 for this phase.
# NOTE(review): the dims/layer counts here presumably have to match the model
# that wrote the checkpoint — confirm against run_training's resume logic.
# NOTE(review): whether lr=1e-4 takes effect or is overridden by restored
# optimizer state depends on run_training — TODO confirm.
history, model = run_training(
    INPUT_DIM,
    OUTPUT_DIM,
    emb_dim=64,
    hid_dim=128,
    enc_layers=1,
    dec_layers=2,
    dropout=0.5,
    lr=1e-4,
    resume_from="/content/best_checkpoint.pt",
)

  scaler = GradScaler()   # correct usage


Resumed training from epoch 12 using /content/best_checkpoint.pt


  with autocast():


[E13] train_loss=5.7301 val_loss=6.2475 BLEU=0.24 CER=0.7187 PPL=516.74
==> Saved best model at epoch 13 (val_loss=6.2475)




[E14] train_loss=5.6489 val_loss=6.2022 BLEU=0.39 CER=0.6974 PPL=493.82
==> Saved best model at epoch 14 (val_loss=6.2022)




[E15] train_loss=5.5722 val_loss=6.1517 BLEU=0.45 CER=0.6695 PPL=469.52
==> Saved best model at epoch 15 (val_loss=6.1517)




[E16] train_loss=5.4996 val_loss=6.1099 BLEU=0.77 CER=0.6825 PPL=450.29
==> Saved best model at epoch 16 (val_loss=6.1099)




[E17] train_loss=5.4304 val_loss=6.0709 BLEU=0.81 CER=0.6900 PPL=433.09
==> Saved best model at epoch 17 (val_loss=6.0709)




[E18] train_loss=5.3583 val_loss=6.0338 BLEU=1.23 CER=0.6652 PPL=417.31
==> Saved best model at epoch 18 (val_loss=6.0338)




[E19] train_loss=5.2972 val_loss=6.0337 BLEU=1.64 CER=0.6890 PPL=417.27
==> Saved best model at epoch 19 (val_loss=6.0337)




[E20] train_loss=5.2294 val_loss=5.9770 BLEU=1.59 CER=0.6607 PPL=394.27
==> Saved best model at epoch 20 (val_loss=5.9770)




[E21] train_loss=5.1681 val_loss=5.9533 BLEU=2.15 CER=0.6640 PPL=385.01
==> Saved best model at epoch 21 (val_loss=5.9533)




[E22] train_loss=5.1092 val_loss=5.9566 BLEU=2.24 CER=0.6720 PPL=386.28




[E23] train_loss=5.0481 val_loss=5.9089 BLEU=2.64 CER=0.6434 PPL=368.30
==> Saved best model at epoch 23 (val_loss=5.9089)




[E24] train_loss=4.9896 val_loss=5.9063 BLEU=3.08 CER=0.6563 PPL=367.33
==> Saved best model at epoch 24 (val_loss=5.9063)


In [None]:
# Experiment: narrower recurrent state (hid_dim=64) with dropout 0.5, lr 5e-4.
# NOTE(review): the recorded output of this cell is
#   TypeError: DecoderLSTM.forward_step() takes 4 positional arguments but 5 were given
# so the failure is in the model/training code (a forward_step call site and its
# signature disagree), not in these hyperparameters. Fix that definition before
# re-running this cell.
history, model = run_training(
    INPUT_DIM,
    OUTPUT_DIM,
    emb_dim=64,
    hid_dim=64,
    enc_layers=1,
    dec_layers=2,
    dropout=0.5,
    lr=5e-4,
)

  scaler = GradScaler()   # correct usage
  with autocast():


TypeError: DecoderLSTM.forward_step() takes 4 positional arguments but 5 were given

In [None]:
# Experiment: 64-dim embeddings, 128 hidden units, 1 encoder layer / 2 decoder
# layers, dropout 0.5, lr 1e-4.
# NOTE(review): the recorded output of this cell is
#   RuntimeError: Expected hidden[0] size (2, 64, 128), got [1, 64, 128]
# Presumably the 2-layer decoder LSTM is being handed the 1-layer encoder state
# unchanged — confirm in the model definition. Either adapt the encoder state to
# dec_layers there, or use matching enc_layers/dec_layers values here.
history, model = run_training(
    INPUT_DIM,
    OUTPUT_DIM,
    emb_dim=64,
    hid_dim=128,
    enc_layers=1,
    dec_layers=2,
    dropout=0.5,
    lr=1e-4,
)


  scaler = GradScaler()   # AMP scaler
  with autocast():


RuntimeError: Expected hidden[0] size (2, 64, 128), got [1, 64, 128]