<a href="https://colab.research.google.com/github/manasdeshpande125/da6401_assignment_3/blob/main/DLASG3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch, os, random, numpy as np, pandas as pd
# Report whether a GPU is visible and pick the device used by later cells.
_cuda_ok = torch.cuda.is_available()
print("CUDA available:", _cuda_ok)
if _cuda_ok:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Seed every random number generator touched by the notebook with one value.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


**Download dataset and unzip it**

In [None]:
import urllib.request, tarfile, pathlib, shutil

URL = "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"
TAR = "dakshina.tar"
if not pathlib.Path(TAR).exists():
    # Download under a temporary name and rename only after the transfer has
    # finished: an interrupted download must not leave a truncated file named
    # TAR, because the existence check above would then skip the download on
    # the next run and the extraction below would fail on a corrupt archive.
    _partial_tar = TAR + ".part"
    urllib.request.urlretrieve(URL, _partial_tar)
    os.replace(_partial_tar, TAR)
    print("Downloaded.")

# Only the Marathi lexicon files are needed, so extract just that sub-tree.
with tarfile.open(TAR) as t:
    members = [m for m in t.getmembers() if m.name.startswith("dakshina_dataset_v1.0/mr/lexicons/")]
    if hasattr(tarfile, "data_filter"):
        # Interpreters that ship extraction filters: use the "data" filter so
        # archive members cannot write outside the destination directory.
        t.extractall(members=members, filter="data")
    else:
        # Older interpreters without extraction filters keep the old call.
        t.extractall(members=members)
DATA_ROOT = pathlib.Path("dakshina_dataset_v1.0/mr/lexicons")
print("Files:", os.listdir(DATA_ROOT))

**Classes for loading and preparing the dataset**

In [None]:
class Vocabulary:
    """Character vocabularies for the two columns of a tab-separated lexicon.

    The file is read without a header row into two string columns named
    ``src_lang`` and ``trg_lang``; rows with a missing value are dropped.
    For each column, every distinct character receives an id starting at 3
    (characters taken in sorted order).  Ids 0, 1 and 2 are reserved for
    ``<s>``, ``<pad>`` and ``<unk>`` respectively.

    NOTE(review): only two column names are passed to ``read_csv``; if the
    TSV carries more than two columns the names may not line up with the
    first two fields -- confirm the file layout.
    """

    def __init__(self, file_path, src_lang, trg_lang):
        frame = pd.read_csv(
            file_path,
            sep="\t",
            header=None,
            names=[src_lang, trg_lang],
            dtype=str,
        )
        frame = frame.dropna()

        self.df = frame
        self.src_lang = src_lang
        self.trg_lang = trg_lang

        self.src_vocab = self._char_table(frame[src_lang])
        self.trg_vocab = self._char_table(frame[trg_lang])

        # The char2idx names are aliases of the vocab dicts (same objects);
        # the idx2char dicts are their inverses, special tokens included.
        self.s_char2idx = self.src_vocab
        self.s_idx2char = {idx: ch for ch, idx in self.src_vocab.items()}
        self.t_char2idx = self.trg_vocab
        self.t_idx2char = {idx: ch for ch, idx in self.trg_vocab.items()}

    @staticmethod
    def _char_table(words):
        """Map each distinct character of ``words`` to an id, then add specials."""
        table = {}
        for idx, ch in enumerate(sorted(set("".join(words))), start=3):
            table[ch] = idx
        table["<pad>"] = 1
        table["<unk>"] = 2
        table["<s>"] = 0
        return table

    def get(self):
        """Return ``(src_vocab, trg_vocab, t_char2idx, t_idx2char, s_char2idx, s_idx2char)``."""
        return (
            self.src_vocab,
            self.trg_vocab,
            self.t_char2idx,
            self.t_idx2char,
            self.s_char2idx,
            self.s_idx2char,
        )


In [None]:
from torch.utils.data import Dataset

class TransliterationDataset(Dataset):
    """Word pairs from a tab-separated lexicon, encoded as fixed-length id tensors.

    The file is read without a header into two string columns; rows with a
    missing value are dropped.  Each word is encoded as ``<s>`` followed by
    its character ids and then ``<pad>`` ids, up to the longest word of that
    column in this file plus one.
    """

    def __init__(self, file_path, src_lang, trg_lang,
                 src_vocab, trg_vocab, t_char2idx):
        frame = pd.read_csv(
            file_path,
            sep="\t",
            header=None,
            names=[src_lang, trg_lang],
            dtype=str,
        ).dropna()
        self.df = frame.reset_index(drop=True)
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.t_char2idx = t_char2idx
        # "+ 1" leaves room for the leading <s> token.
        self.max_src_len = frame[src_lang].str.len().max() + 1
        self.max_trg_len = frame[trg_lang].str.len().max() + 1

    def __len__(self):
        return len(self.df)

    def _encode(self, word, vocab, max_len):
        """Return a ``torch.long`` tensor: ``<s>``, the ids of ``word``, then padding."""
        char_ids = [vocab.get(ch, vocab["<unk>"]) for ch in word]
        start = [vocab["<s>"]]
        padding = [vocab["<pad>"]] * (max_len - len(char_ids) - 1)
        return torch.tensor(start + char_ids + padding, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(src_ids, trg_ids, src_len, trg_len)``; lengths count the ``<s>`` token."""
        record = self.df.iloc[idx]
        src_word = record.iloc[0]
        trg_word = record.iloc[1]
        return (
            self._encode(src_word, self.src_vocab, self.max_src_len),
            self._encode(trg_word, self.trg_vocab, self.max_trg_len),
            len(src_word) + 1,
            len(trg_word) + 1,
        )


In [None]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class DakshinaDataModule(pl.LightningDataModule):
    """Lightning data module serving the train/dev/test lexicon files under DATA_ROOT."""

    def __init__(self, batch_size=128):
        super().__init__()
        self.batch_size = batch_size
        self.train_path = DATA_ROOT / "mr.translit.sampled.train.tsv"
        self.dev_path = DATA_ROOT / "mr.translit.sampled.dev.tsv"
        self.test_path = DATA_ROOT / "mr.translit.sampled.test.tsv"

    def prepare_data(self):
        # Nothing to download here: the files are fetched by an earlier cell.
        pass

    def setup(self, stage=None):
        """Build the vocabularies from the training file, then the three datasets."""
        (
            self.src_vocab,
            self.trg_vocab,
            self.t_char2idx,
            self.t_idx2char,
            self.s_char2idx,
            self.s_idx2char,
        ) = Vocabulary(self.train_path, 'src', 'trg').get()

        # All splits are encoded with the vocabularies of the training file.
        self.train_ds = self._dataset_for(self.train_path)
        self.val_ds = self._dataset_for(self.dev_path)
        self.test_ds = self._dataset_for(self.test_path)

    def _dataset_for(self, path):
        """Wrap one lexicon file in a TransliterationDataset."""
        return TransliterationDataset(path, 'src', 'trg',
                                      self.src_vocab, self.trg_vocab,
                                      self.t_char2idx)

    def train_dataloader(self):
        # Only the training split is shuffled.
        return DataLoader(self.train_ds, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=self.batch_size)


In [None]:
# import torch.nn as nn
# import torch.nn.functional as F

# def make_rnn(cell_type, *args, **kw):
#     return {"rnn": nn.RNN,
#             "lstm": nn.LSTM,
#             "gru": nn.GRU}[cell_type.lower()](*args, **kw)

# class Encoder(nn.Module):
#     def __init__(self, input_dim, emb_dim, hid_dim, n_layers, cell, dropout):
#         super().__init__()
#         self.embedding = nn.Embedding(input_dim, emb_dim)
#         self.rnn = make_rnn(cell, emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=False)
#         self.dropout = nn.Dropout(dropout)
#     def forward(self, src):
#         emb = self.dropout(self.embedding(src))
#         return self.rnn(emb)

# class Decoder(nn.Module):
#     def __init__(self, output_dim, emb_dim, hid_dim, n_layers, cell, dropout):
#         super().__init__()
#         self.output_dim = output_dim
#         self.embedding = nn.Embedding(output_dim, emb_dim)
#         self.rnn = make_rnn(cell, emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=False)
#         self.fc = nn.Linear(hid_dim, output_dim)
#         self.dropout = nn.Dropout(dropout)
#     def forward(self, inp, hidden):
#         emb = self.dropout(self.embedding(inp))
#         out, hidden = self.rnn(emb, hidden)
#         pred = F.log_softmax(self.fc(out.squeeze(0)), dim=1)
#         return pred, hidden

# class Seq2SeqLightning(pl.LightningModule):
#     def __init__(self, src_vocab, trg_vocab,
#                  emb_dim=128, hid_dim=256, n_layers=2,
#                  cell="lstm", dropout=0.2, lr=1e-3, tf_ratio=0.5):
#         super().__init__()
#         self.save_hyperparameters()
#         self.encoder = Encoder(len(src_vocab), emb_dim, hid_dim, n_layers, cell, dropout)
#         self.decoder = Decoder(len(trg_vocab), emb_dim, hid_dim, n_layers, cell, dropout)
#         self.criterion = nn.NLLLoss(ignore_index=trg_vocab["<pad>"])
#         self.tf_ratio = tf_ratio

#     def forward(self, src, trg, teacher_forcing=0.5):
#         batch, trg_len = src.shape[1], trg.shape[0]
#         outputs = torch.zeros(trg_len, batch, len(self.decoder.embedding.weight)).to(src.device)
#         enc_out, hidden = self.encoder(src)
#         inp = trg[0].unsqueeze(0)
#         for t in range(1, trg_len):
#             pred, hidden = self.decoder(inp, hidden)
#             outputs[t] = pred
#             inp = trg[t].unsqueeze(0) if (torch.rand(1) < teacher_forcing) else pred.argmax(1).unsqueeze(0)
#         return outputs

#     def _step(self, batch, stage):
#         src, trg, _, _ = batch
#         src, trg = src.permute(1,0), trg.permute(1,0)
#         logits = self(src, trg, self.tf_ratio if stage=="train" else 0.0)
#         loss = self.criterion(logits[1:].reshape(-1, logits.shape[2]),
#                               trg[1:].reshape(-1))
#         self.log(f"{stage}_loss", loss, prog_bar=True)
#         return loss

#     def training_step(self, batch, _):   return self._step(batch, "train")
#     def validation_step(self, batch, _): self._step(batch, "val")
#     def test_step(self, batch, _):       self._step(batch, "test")

#     def configure_optimizers(self):
#         return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

# class Encoder(nn.Module):
#     def __init__(self, input_dim, emb_dim, hid_dim, n_layers, cell, dropout):
#         super().__init__()
#         self.embedding = nn.Embedding(input_dim, emb_dim)
#         self.rnn       = make_rnn(cell, emb_dim, hid_dim, n_layers,
#                                   dropout=dropout, batch_first=False)
#         self.dropout   = nn.Dropout(dropout)
#     def forward(self, src):
#         return self.rnn(self.dropout(self.embedding(src)))

# class Decoder(nn.Module):
#     def __init__(self, output_dim, emb_dim, hid_dim, n_layers, cell, dropout):
#         super().__init__()
#         self.embedding = nn.Embedding(output_dim, emb_dim)
#         self.rnn       = make_rnn(cell, emb_dim, hid_dim, n_layers,
#                                   dropout=dropout, batch_first=False)
#         self.fc        = nn.Linear(hid_dim, output_dim)
#         self.dropout   = nn.Dropout(dropout)
#     def forward(self, inp, hidden):
#         emb = self.dropout(self.embedding(inp))
#         out, hidden = self.rnn(emb, hidden)
#         pred = F.log_softmax(self.fc(out.squeeze(0)), dim=1)
#         return pred, hidden

**Encoder, Decoder and Seq2Seq Classes for Transliteration**

In [None]:
import torch.nn as nn, torch.nn.functional as F
import pytorch_lightning as pl

def make_rnn(cell_type, *args, **kw):
    """Instantiate the recurrent layer named by ``cell_type``.

    ``cell_type`` is matched case-insensitively against ``"rnn"``, ``"lstm"``
    and ``"gru"``; any other name raises ``KeyError``.  All remaining
    arguments are forwarded to the layer's constructor.
    """
    registry = {"rnn": nn.RNN, "lstm": nn.LSTM, "gru": nn.GRU}
    layer_cls = registry[cell_type.lower()]
    return layer_cls(*args, **kw)

class Encoder(nn.Module):
    """Embedding followed by a (possibly bidirectional) RNN/GRU/LSTM stack.

    Args:
        input_dim: Size of the embedding table.
        embed_dim: Embedding width.
        hid_dim: Hidden width of the recurrent layers.
        n_layers: Number of stacked recurrent layers.
        cell_type: ``"rnn"``, ``"gru"`` or ``"lstm"`` (case-insensitive).
        bidirectional: Whether the recurrent stack runs in both directions.
        dropout: Probability for the embedding dropout and, when there is
            more than one layer, for the dropout between recurrent layers.
    """

    def __init__(self, input_dim: int, embed_dim: int,
                 hid_dim: int, n_layers: int,
                 cell_type: str = "lstm",
                 bidirectional: bool = False,
                 dropout: float = 0.2):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        if bidirectional:
            self.dir = 2
        else:
            self.dir = 1

        self.embedding = nn.Embedding(input_dim, embed_dim)

        cell_classes = {"rnn": nn.RNN, "gru": nn.GRU, "lstm": nn.LSTM}
        # Inter-layer dropout is only requested for stacks of two or more layers.
        inter_layer_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = cell_classes[cell_type.lower()](
            embed_dim,
            hid_dim,
            n_layers,
            bidirectional=bidirectional,
            dropout=inter_layer_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type.lower()

    def forward(self, src):
        """Embed ``src`` (``[seq_len, batch]`` ids) and return the RNN's ``(output, state)``."""
        tokens = self.embedding(src)
        return self.rnn(self.dropout(tokens))


class Decoder(nn.Module):
    """Single-step decoder: embedding, unidirectional RNN stack, output projection.

    Args:
        trg_vocab: Target vocabulary mapping; kept on the module so callers
            can look up special-token ids.
        output_dim: Size of the embedding table and of the output layer.
        embed_dim: Embedding width.
        hid_dim: Hidden width of the recurrent layers.
        n_layers: Number of stacked recurrent layers.
        cell_type: ``"rnn"``, ``"gru"`` or ``"lstm"`` (case-insensitive).
        bidirectional: Stored as an attribute only; the recurrent stack built
            here is always unidirectional.
        dropout: Probability for the embedding dropout and, when there is
            more than one layer, for the dropout between recurrent layers.
    """

    def __init__(self, trg_vocab, output_dim, embed_dim, hid_dim, n_layers,
                 cell_type: str = "lstm", bidirectional: bool = False, dropout: float = 0.2):
        super().__init__()
        self.trg_vocab = trg_vocab
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, embed_dim)

        cell_classes = {"rnn": nn.RNN, "gru": nn.GRU, "lstm": nn.LSTM}
        inter_layer_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = cell_classes[cell_type.lower()](
            embed_dim, hid_dim, n_layers, dropout=inter_layer_dropout
        )
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type.lower()
        self.bidirectional = bidirectional

    def forward(self, inp, hidden):
        """Run one decoding step.

        ``inp`` holds the current token ids as ``[1, batch]``.  Returns the
        log-probabilities over the output vocabulary (``[batch, output_dim]``)
        and the updated recurrent state (a tensor, or an ``(h, c)`` tuple for
        LSTM).
        """
        step_input = self.dropout(self.embedding(inp))
        # Every cell type returns an (output, state) pair, so one unpacking
        # covers LSTM as well as RNN/GRU.
        rnn_out, hidden = self.rnn(step_input, hidden)
        scores = self.fc_out(rnn_out.squeeze(0))
        return F.log_softmax(scores, dim=1), hidden

class Seq2SeqLightning(pl.LightningModule):
    """Lightning wrapper that trains an encoder/decoder pair with teacher forcing.

    Args:
        encoder: Module returning ``(outputs, state)`` for a source batch.
        decoder: Single-step decoder exposing ``trg_vocab``, ``output_dim``
            and ``rnn``.
        cell_type: ``"rnn"``, ``"gru"`` or ``"lstm"``; decides whether the
            encoder state is treated as an ``(h, c)`` tuple.
        bidirectional: Whether the encoder state holds two directions that
            must be merged before it is handed to the decoder.
        device: Accepted for call compatibility; not used by this class.
        learning_rate: Optimizer learning rate.
        optim_name: ``"adam"`` or ``"nadam"``.
        tf_ratio: Teacher-forcing probability used during training steps.
    """

    def __init__(
        self,
        encoder: Encoder,
        decoder: Decoder,
        cell_type: str = "lstm",
        bidirectional: bool = False,
        device: str = "cpu",
        learning_rate: float = 1e-3,
        optim_name: str = "adam",
        tf_ratio: float = 0.5,
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type.lower()
        self.bidirectional = bidirectional
        self.learning_rate = learning_rate
        self.optim_name = optim_name.lower()
        self.tf_ratio = tf_ratio
        # Padding positions are excluded from the loss and from the accuracy.
        self.criterion = nn.CrossEntropyLoss(ignore_index=decoder.trg_vocab["<pad>"])
        self.pad_idx = decoder.trg_vocab["<pad>"]

    def _merge_bidir(self, h):
        """Average forward and backward encoder states.

        Turns ``[layers*2, batch, hid]`` into ``[layers, batch, hid]`` when
        the encoder is bidirectional; otherwise ``h`` is returned unchanged.
        For LSTM, ``h`` is the ``(hidden, cell)`` tuple and both are merged.
        """
        if not self.bidirectional:
            return h

        n_layers = self.decoder.rnn.num_layers

        def average_directions(state):
            return state.view(n_layers, 2, -1, state.size(-1)).mean(1)

        if self.cell_type == "lstm":
            return (average_directions(h[0]), average_directions(h[1]))
        return average_directions(h)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step and return ``[trg_len, batch, vocab]`` scores.

        Row 0 of the result stays zero: decoding starts from ``trg[0]`` and
        the first prediction is written at index 1.
        """
        trg_len = trg.size(0)
        batch_size = src.size(1)
        outputs = torch.zeros(trg_len, batch_size, self.decoder.output_dim,
                              device=self.device)

        _, enc_hidden = self.encoder(src)
        dec_hidden = self._merge_bidir(enc_hidden)

        # The first decoder input is the start token row of the target.
        dec_inp = trg[0].unsqueeze(0)
        for t in range(1, trg_len):
            dec_out, dec_hidden = self.decoder(dec_inp, dec_hidden)
            outputs[t] = dec_out
            # One draw per time step decides for the whole batch whether the
            # gold token or the model's own prediction is fed back.
            if torch.rand(1).item() < teacher_forcing_ratio:
                next_tokens = trg[t]
            else:
                next_tokens = dec_out.argmax(1)
            dec_inp = next_tokens.unsqueeze(0)

        return outputs

    def configure_optimizers(self):
        """Return the optimizer named by ``optim_name``; raise ValueError otherwise."""
        opt = self.optim_name.lower()
        if opt == "adam":
            return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        if opt == "nadam":
            return torch.optim.NAdam(self.parameters(), lr=self.learning_rate)
        raise ValueError(f"Unknown optimizer '{opt}'")

    def _accuracy(self, logits, trg):
        """Fraction of non-padding target tokens predicted correctly (start row excluded by caller)."""
        not_pad = trg != self.pad_idx
        hits = (logits.argmax(1) == trg) & not_pad
        return hits.float().sum() / not_pad.float().sum()

    def _step(self, batch, stage):
        """Shared train/val/test step: compute, log and return the loss."""
        src, trg, _, _ = batch
        # Batches arrive batch-first; the model works time-first.
        src = src.permute(1, 0)
        trg = trg.permute(1, 0)

        # Teacher forcing is only active while training.
        ratio = self.tf_ratio if stage == "train" else 0.0
        logits = self(src, trg, ratio)

        flat_logits = logits[1:].reshape(-1, logits.shape[2])
        flat_trg = trg[1:].reshape(-1)
        loss = self.criterion(flat_logits, flat_trg)
        acc = self._accuracy(flat_logits, flat_trg)

        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_acc", acc, prog_bar=True)
        return loss

    def training_step(self, batch, _):
        return self._step(batch, "train")

    def validation_step(self, batch, _):
        self._step(batch, "val")

    def test_step(self, batch, _):
        self._step(batch, "test")



In [None]:
# from pytorch_lightning import Trainer
# from pytorch_lightning.callbacks import ModelCheckpoint

# def run_training(hparams):
#     dm = DakshinaDataModule(batch_size=hparams["batch_size"])
#     dm.prepare_data(); dm.setup()

#     model = Seq2SeqLightning(dm.src_vocab, dm.trg_vocab,
#                              emb_dim=hparams["emb_dim"],
#                              hid_dim=hparams["hid_dim"],
#                              n_layers=hparams["n_layers"],
#                              cell=hparams["cell"],
#                              dropout=hparams["dropout"],
#                              lr=hparams["lr"],
#                              tf_ratio=hparams["tf_ratio"])

#     ckpt = ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min")
#     trainer = Trainer(max_epochs=hparams["epochs"],
#                       callbacks=[ckpt],
#                       accelerator="auto", devices=1)
#     trainer.fit(model, dm)
#     trainer.test(model, dm)

# default_hparams = dict(batch_size=128, emb_dim=128, hid_dim=256,
#                        n_layers=2, cell="lstm", dropout=0.2,
#                        lr=1e-3, tf_ratio=0.5, epochs=10)

# run_training(default_hparams)

 #################################################################################################################

# def run_training(hparams: dict):
#     """
#     Train + evaluate one experiment using the hyper-parameters coming either
#     from the sweep or from the manual `default_hparams` below.
#     """
#     # 1) Data
#     dm = DakshinaDataModule(batch_size=hparams["batch_size"])
#     dm.prepare_data(); dm.setup()

#     # 2) Model
#     model = Seq2SeqLightning(
#         src_vocab        = dm.src_vocab,
#         trg_vocab        = dm.trg_vocab,
#         emb_dim          = hparams["embedding_size"],
#         hid_dim          = hparams["hidden_size"],
#         n_layers         = hparams["num_layers"],
#         cell             = hparams["cell_type"],     # "lstm" | "gru" | "rnn"
#         dropout          = hparams["dropout"],
#         bidirectional    = hparams["bidirectional"],
#         lr               = hparams["learning_rate"],
#         optim_name       = hparams["optim"],         # "adam" | "nadam"
#         tf_ratio         = hparams["teacher_forcing"]
#     )

#     # 3) Trainer
#     ckpt = ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min")
#     trainer = Trainer(
#         max_epochs = hparams["epochs"],
#         callbacks  = [ckpt],
#         accelerator= "auto",
#         devices    = 1
#     )

#     # 4) Fit + Test
#     trainer.fit(model, dm)
#     trainer.test(model, dm)            # uses best checkpoint by default
 #################################################################################################################


  # def configure_optimizers(self):
  #     return torch.optim.Adam(self.parameters(), learning_rate=self.hparams.learning_rate)

  # def __init__(self, src_vocab, trg_vocab,
  #              emb_dim=128, hid_dim=256, n_layers=2,
  #              cell="lstm", dropout=0.2, lr=1e-3, tf_ratio=0.5):
  #     super().__init__()
  #     self.save_hyperparameters()
  #     self.encoder  = Encoder(len(src_vocab), emb_dim, hid_dim, n_layers, cell, dropout)
  #     self.decoder  = Decoder(len(trg_vocab), emb_dim, hid_dim, n_layers, cell, dropout)
  #     self.criterion= nn.NLLLoss(ignore_index=trg_vocab["<pad>"])
  #     self.pad_idx  = trg_vocab["<pad>"]
  #     self.tf_ratio = tf_ratio

  # # ---------- forward ----------
  # def forward(self, src, trg, teacher_forcing):
  #     trg_len, batch = trg.shape
  #     vocab_size = len(self.decoder.embedding.weight)
  #     outputs = torch.zeros(trg_len, batch, vocab_size, device=src.device)

  #     _, hidden = self.encoder(src)
  #     inp = trg[0].unsqueeze(0)           # <s>

  #     for t in range(1, trg_len):
  #         pred, hidden = self.decoder(inp, hidden)
  #         outputs[t]   = pred
  #         inp = trg[t].unsqueeze(0) if (torch.rand(1) < teacher_forcing) \
  #               else pred.argmax(1).unsqueeze(0)
  #     return outputs

  # def __init__(self, encoder: Encoder, decoder: Decoder,
  #              cell_type: str = "lstm", bidirectional: bool = False, device="cpu"):
  #     super().__init__()
  #     self.encoder, self.decoder = encoder, decoder
  #     self.cell_type = cell_type.lower()
  #     self.bidirectional = bidirectional
  #     self.device = device

**Training Function and Sweeps**

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping


def run_training(hparams: dict):
    """Train the attention-free seq2seq model for one set of hyper-parameters.

    Args:
        hparams: Mapping providing ``batch_size``, ``embedding_size``,
            ``hidden_size``, ``num_layers``, ``bidirectional``, ``cell_type``,
            ``dropout``, ``learning_rate``, ``optim``, ``teacher_forcing`` and
            ``epochs``.  ``weight_decay``, ``activation_type`` and
            ``loss_type`` are optional and only show up in the run name.

    Returns:
        None.  Metrics are reported through the W&B logger.
    """
    # Local import: this function is defined in a cell that comes before the
    # one importing wandb at module level.
    import wandb

    # 1) Data
    dm = DakshinaDataModule(batch_size=hparams["batch_size"])
    dm.prepare_data()
    dm.setup()

    # 2) Build encoder & decoder separately
    SRC_VOCAB = len(dm.src_vocab)
    TRG_VOCAB = len(dm.trg_vocab)

    encoder = Encoder(
        input_dim=SRC_VOCAB,
        embed_dim=hparams["embedding_size"],
        hid_dim=hparams["hidden_size"],
        n_layers=hparams["num_layers"],
        bidirectional=hparams["bidirectional"],
        cell_type=hparams["cell_type"],
        dropout=hparams["dropout"],
    )

    # The decoder is always unidirectional; a bidirectional encoder state is
    # merged inside the Lightning module.
    decoder = Decoder(
        trg_vocab=dm.trg_vocab,
        output_dim=TRG_VOCAB,
        embed_dim=hparams["embedding_size"],
        hid_dim=hparams["hidden_size"],
        n_layers=hparams["num_layers"],
        bidirectional=False,
        cell_type=hparams["cell_type"],
        dropout=hparams["dropout"],
    )

    model = Seq2SeqLightning(
        encoder=encoder,
        decoder=decoder,
        cell_type=hparams["cell_type"],
        bidirectional=hparams["bidirectional"],
        device="cuda" if torch.cuda.is_available() else "cpu",
        learning_rate=hparams["learning_rate"],
        optim_name=hparams["optim"],
        tf_ratio=hparams["teacher_forcing"],
    )

    # 3) Callbacks and logging
    # The checkpoint keeps the epoch with the lowest validation loss.
    ckpt = ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min")
    run_name = (
        f"e_{hparams['epochs']}_lr_{hparams['learning_rate']}_"
        f"wd_{hparams.get('weight_decay', 0)}_o_{hparams['optim']}_"
        f"bs_{hparams['batch_size']}_ac_{hparams.get('activation_type', 'na')}_"
        f"los_{hparams.get('loss_type', 'ce')}"
    )

    # Passing the name to the logger covers a standalone call, where the
    # logger starts the run itself.
    wandb_logger = WandbLogger(
        project="DA6401_Assignment_3",
        name=run_name,
        config=hparams,
        log_model=True,
    )
    # Inside a sweep a run is already active (and reused by the logger), so
    # rename it directly.  Outside a sweep ``wandb.run`` is None at this point
    # and assigning to its ``name`` would raise AttributeError.
    if wandb.run is not None:
        wandb.run.name = run_name

    # Stop when validation accuracy has not improved for 3 epochs.
    early_stop = EarlyStopping(
        monitor="val_acc",
        patience=3,
        mode="max",  # higher accuracy is better
        verbose=True,
    )
    trainer = Trainer(
        max_epochs=hparams["epochs"],
        callbacks=[ckpt, early_stop],
        accelerator="auto",
        devices=1,
        logger=wandb_logger,
    )

    # 4) Fit (the test split is intentionally not evaluated here)
    trainer.fit(model, dm)


# Baseline hyper-parameters for a single manual (non-sweep) training run.
default_hparams = {
    "cell_type": "lstm",
    "dropout": 0.2,
    "embedding_size": 128,
    "num_layers": 2,
    "batch_size": 128,
    "hidden_size": 256,
    "bidirectional": False,
    "learning_rate": 1e-3,
    "epochs": 10,
    "optim": "adam",
    "teacher_forcing": 0.5,
}

# run_training(default_hparams)



In [None]:
import wandb, yaml, json
# Credentials must not live in the notebook.  Without an explicit key,
# wandb.login() uses the WANDB_API_KEY environment variable or previously
# stored credentials, and otherwise prompts for the key.
# NOTE(review): the API key that used to be hard-coded on this line has been
# exposed in version control and should be revoked and rotated.
wandb.login()

# sweep_config = {
#     "method": "bayes",
#     "metric": {"name": "val_loss", "goal": "minimize"},
#     "parameters": {
#         "batch_size": {"values": [32, 64, 128]},
#         "emb_dim":    {"values": [32,64, 128, 256]},
#         "hid_dim":    {"values": [128, 256, 512]},
#         "n_layers":   {"values": [1, 2, 3]},
#         "cell":       {"values": ["lstm", "gru","rnn"]},
#         "dropout":    {"values": [0.2, 0.3]},
#         "lr":         {"values": [1e-3, 5e-4]},
#         "tf_ratio":   {"values": [0.2, 0.5]},
#         "epochs":     {"value": [10,13,15]}
#     }
# }

# Discrete options the sweep may choose from, one list per hyper-parameter.
_SEQ2SEQ_SEARCH_SPACE = {
    "cell_type": ["lstm", "gru", "rnn"],
    "dropout": [0.0, 0.1, 0.2, 0.5],
    "embedding_size": [64, 128, 256, 512],
    "num_layers": [2, 3, 4],
    "batch_size": [32, 64, 128],
    "hidden_size": [128, 256, 512],
    "bidirectional": [True, False],
    "learning_rate": [1e-3, 2e-3, 1e-4, 2e-4],
    "epochs": [7, 10, 13],
    "optim": ["adam", "nadam"],
    "teacher_forcing": [0.2, 0.5, 0.7],
}

# Bayesian search that maximizes the logged validation accuracy.
sweep_config = {
    "method": "bayes",           # or "random", "grid", …
    "metric": {"name": "val_acc", "goal": "maximize"},
    "parameters": {
        name: {"values": options}
        for name, options in _SEQ2SEQ_SEARCH_SPACE.items()
    },
}

def sweep_train():
    """Run one sweep trial: open a W&B run and train with the config it carries."""
    with wandb.init() as active_run:
        # Copy the run's config into a plain dict before handing it on.
        trial_hparams = dict(active_run.config)
        run_training(trial_hparams)

# Launch the sweep: register sweep_config with W&B, print the sweep id, and
# let a local agent call sweep_train for up to 75 trials.
# NOTE: these statements are live, so running this cell starts the sweep
# immediately; comment them out to define the sweep without launching it.
sweep_id = wandb.sweep(sweep_config, project="DA6401_Assignment_3",entity="cs24m024-iit-madras")
print(sweep_id)
wandb.agent(sweep_id, function=sweep_train, count=75)


**Attention Mechanism**

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class DotAttention(nn.Module):
    """Scaled dot-product attention of a decoder state over encoder outputs.

    Scores are the dot products between the decoder state and every encoder
    output, multiplied by ``1 / sqrt(hid_dim)`` and normalised with a softmax
    over the source time axis.  The module has no learnable parameters.
    """

    def __init__(self, hid_dim):
        super().__init__()
        self.scale = 1.0 / (hid_dim ** 0.5)

    def forward(self, dec_hidden, enc_outputs, src_mask=None):
        """Return ``(context, weights)``.

        Args:
            dec_hidden: Decoder state, ``[1, B, H]``.
            enc_outputs: Encoder outputs, ``[Tsrc, B, H]``.
            src_mask: Optional ``[Tsrc, B]`` tensor; positions equal to 0 are
                excluded from the attention.

        Returns:
            ``context`` of shape ``[1, B, H]`` and ``weights`` of shape
            ``[B, Tsrc]``.
        """
        energy = torch.einsum('lbh,tbh->lbt', dec_hidden, enc_outputs) * self.scale
        if src_mask is not None:
            # Masked positions get a large negative score so that the softmax
            # assigns them (numerically) zero weight.
            blocked = src_mask.T.unsqueeze(0) == 0
            energy = energy.masked_fill(blocked, -1e9)
        weights = F.softmax(energy, dim=2)
        context = torch.einsum('lbt,tbh->lbh', weights, enc_outputs)
        return context, weights.squeeze(0)


In [None]:
class AttnDecoder(nn.Module):
    """Single-step decoder that attends over the encoder outputs.

    At every step the attention context is concatenated with the token
    embedding as input to the recurrent stack, and again with the recurrent
    output before the final projection.

    Args:
        trg_vocab: Target vocabulary mapping; kept on the module so callers
            can look up special-token ids.
        output_dim: Size of the embedding table and of the output layer.
        embed_dim: Embedding width.
        hid_dim: Hidden width; must equal the width of the encoder outputs
            because attention scores are plain dot products.
        n_layers: Number of stacked recurrent layers.
        cell_type: ``"rnn"``, ``"gru"`` or ``"lstm"`` (case-insensitive).
            Any other name raises ``KeyError``.
        dropout: Dropout probability applied to the token embedding.
    """

    def __init__(self, trg_vocab, output_dim, embed_dim,
                 hid_dim, n_layers=1, cell_type="lstm", dropout=0.2):
        super().__init__()
        self.trg_vocab = trg_vocab
        self.output_dim = output_dim
        self.cell_type = cell_type.lower()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        # Honour the requested cell type.  Building an LSTM unconditionally
        # breaks as soon as the encoder is a GRU/RNN, whose state is a single
        # tensor rather than the (h, c) tuple an LSTM expects.
        rnn_cls = {"rnn": nn.RNN, "gru": nn.GRU, "lstm": nn.LSTM}[self.cell_type]
        self.rnn = rnn_cls(embed_dim + hid_dim, hid_dim, n_layers)
        self.attn = DotAttention(hid_dim)
        self.fc_out = nn.Linear(hid_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inp, hidden, enc_out, src_mask=None):
        """Run one decoding step.

        Args:
            inp: Current token ids, ``[1, B]``.
            hidden: Recurrent state: a ``[layers, B, H]`` tensor, or an
                ``(h, c)`` tuple of such tensors for LSTM.
            enc_out: Encoder outputs, ``[Tsrc, B, H]``.
            src_mask: Optional mask forwarded to the attention module.

        Returns:
            Log-probabilities ``[B, output_dim]``, the updated recurrent
            state, and the attention weights ``[B, Tsrc]``.
        """
        emb = self.dropout(self.embedding(inp))           # [1,B,E]
        state = hidden[0] if isinstance(hidden, tuple) else hidden
        # Attend with the top layer only.  The query must be [1,B,H] so that
        # the context can be concatenated with the [1,B,E] embedding; for a
        # single-layer stack this slice is the whole state.
        query = state[-1:]
        ctx, attn_w = self.attn(query, enc_out, src_mask)
        rnn_in = torch.cat([emb, ctx], dim=2)
        rnn_out, hidden = self.rnn(rnn_in, hidden)
        logits = self.fc_out(torch.cat([rnn_out.squeeze(0), ctx.squeeze(0)], dim=1))
        return F.log_softmax(logits, dim=1), hidden, attn_w


In [None]:
class Seq2SeqAttn(pl.LightningModule):
    """Lightning wrapper for an encoder plus an attention decoder.

    Args:
        encoder: Module returning ``(outputs, state)`` for a source batch.
        decoder: Single-step attention decoder exposing ``trg_vocab``,
            ``fc_out`` and ``rnn``; called as ``decoder(inp, state, enc_out)``.
        cell_type: ``"rnn"``, ``"gru"`` or ``"lstm"``; decides whether the
            encoder state is treated as an ``(h, c)`` tuple.
        bidirectional: Whether the encoder state holds two directions that
            must be merged before decoding.
        device: Accepted for call compatibility; not used by this class.
        learning_rate: Optimizer learning rate.
        optim_name: ``"adam"`` or ``"nadam"``.
        tf_ratio: Teacher-forcing probability used during training steps.
    """

    def __init__(
        self,
        encoder: Encoder,
        decoder: Decoder,
        cell_type: str = "lstm",
        bidirectional: bool = False,
        device: str = "cpu",
        learning_rate: float = 1e-3,
        optim_name: str = "adam",
        tf_ratio: float = 0.5,
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type.lower()
        self.bidirectional = bidirectional
        self.learning_rate = learning_rate
        self.optim_name = optim_name.lower()
        self.tf_ratio = tf_ratio
        # Padding positions are excluded from the loss and from the accuracy.
        self.criterion = nn.CrossEntropyLoss(ignore_index=decoder.trg_vocab["<pad>"])
        self.pad_idx = decoder.trg_vocab["<pad>"]

    def _merge_bidir(self, h):
        """Average forward and backward encoder states.

        Turns ``[layers*2, batch, hid]`` into ``[layers, batch, hid]`` when
        the encoder is bidirectional; otherwise ``h`` is returned unchanged.
        For LSTM, ``h`` is the ``(hidden, cell)`` tuple and both are merged.
        """
        if not self.bidirectional:
            return h

        n_layers = self.decoder.rnn.num_layers

        def average_directions(state):
            return state.view(n_layers, 2, -1, state.size(-1)).mean(1)

        if self.cell_type == "lstm":
            return (average_directions(h[0]), average_directions(h[1]))
        return average_directions(h)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step.

        Returns the ``[trg_len, batch, vocab]`` scores (row 0 stays zero) and
        the stacked attention weights ``[trg_len-1, batch, src_len]``, which
        are detached and moved to the CPU.
        """
        trg_len = trg.size(0)
        batch = src.size(1)
        vocab = self.decoder.fc_out.out_features
        outputs = torch.zeros(trg_len, batch, vocab, device=src.device)
        attn_maps = []

        enc_out, enc_hidden = self.encoder(src)
        hidden = self._merge_bidir(enc_hidden)

        # The first decoder input is the start token row of the target.
        dec_inp = trg[0].unsqueeze(0)
        for t in range(1, trg_len):
            dec_out, hidden, attn_w = self.decoder(dec_inp, hidden, enc_out)
            outputs[t] = dec_out
            attn_maps.append(attn_w.detach().cpu())
            # One draw per time step decides for the whole batch whether the
            # gold token or the model's own prediction is fed back.
            if torch.rand(1).item() < teacher_forcing_ratio:
                next_tokens = trg[t]
            else:
                next_tokens = dec_out.argmax(1)
            dec_inp = next_tokens.unsqueeze(0)

        return outputs, torch.stack(attn_maps)

    def _step(self, batch, stage):
        """Shared train/val/test step: compute, log and return the loss."""
        src, trg, _, _ = batch
        # Batches arrive batch-first; the model works time-first.
        src = src.permute(1, 0)
        trg = trg.permute(1, 0)

        # Teacher forcing is only active while training.
        ratio = self.tf_ratio if stage == "train" else 0.0
        logits, _ = self(src, trg, ratio)

        flat_logits = logits[1:].reshape(-1, logits.shape[2])
        flat_trg = trg[1:].reshape(-1)
        loss = self.criterion(flat_logits, flat_trg)
        acc = self._accuracy(flat_logits, flat_trg)

        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_acc", acc, prog_bar=True)
        return loss

    # ---------- lightning hooks ----------
    def training_step(self, batch, _):
        return self._step(batch, "train")

    def validation_step(self, batch, _):
        self._step(batch, "val")

    def test_step(self, batch, _):
        self._step(batch, "test")

    def configure_optimizers(self):
        """Return the optimizer named by ``optim_name``; raise ValueError otherwise."""
        opt = self.optim_name.lower()
        if opt == "adam":
            return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        if opt == "nadam":
            return torch.optim.NAdam(self.parameters(), lr=self.learning_rate)
        raise ValueError(f"Unknown optimizer '{opt}'")

    # ---------- helpers ----------
    def _accuracy(self, logits, trg):
        """Fraction of non-padding target tokens predicted correctly (start row excluded by caller)."""
        not_pad = trg != self.pad_idx
        hits = (logits.argmax(1) == trg) & not_pad
        return hits.float().sum() / not_pad.float().sum()


**Training Function and Sweeps (Attention Model)**

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping

def run_training_attn(hp):
    """Train and test the attention seq2seq model for one hyper-parameter set.

    Args:
        hp: Mapping providing ``batch_size``, ``embedding_size``,
            ``hidden_size``, ``cell_type``, ``dropout``, ``learning_rate``,
            ``optim``, ``teacher_forcing`` and ``epochs``.  ``weight_decay``,
            ``activation_type`` and ``loss_type`` are optional and only show
            up in the run name.  ``num_layers`` and ``bidirectional`` are not
            read: encoder and decoder are always built single-layer and the
            encoder unidirectional.

    Returns:
        The trained model and the data module, as ``(model, dm)``.
    """
    dm = DakshinaDataModule(batch_size=hp["batch_size"])
    dm.prepare_data()
    dm.setup()

    enc = Encoder(len(dm.src_vocab), hp["embedding_size"],
                  hp["hidden_size"], 1, hp["cell_type"],
                  bidirectional=False, dropout=hp["dropout"])
    dec = AttnDecoder(dm.trg_vocab, len(dm.trg_vocab),
                      hp["embedding_size"], hp["hidden_size"],
                      1, hp["cell_type"], hp["dropout"])

    model = Seq2SeqAttn(enc, dec, cell_type=hp["cell_type"],
                        learning_rate=hp["learning_rate"],
                        optim_name=hp["optim"], tf_ratio=hp["teacher_forcing"])

    # The checkpoint keeps the epoch with the lowest validation loss.
    ckpt = ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min")
    run_name = (
        f"e_{hp['epochs']}_lr_{hp['learning_rate']}_"
        f"wd_{hp.get('weight_decay', 0)}_o_{hp['optim']}_"
        f"bs_{hp['batch_size']}_ac_{hp.get('activation_type', 'na')}_"
        f"los_{hp.get('loss_type', 'ce')}"
    )

    # Passing the name to the logger covers a standalone call, where the
    # logger starts the run itself.
    wandb_logger = WandbLogger(
        project="DA6401_Assignment_3",
        name=run_name,
        config=hp,
        log_model=True,
    )
    # Stop when validation accuracy has not improved for 3 epochs.
    early_stop = EarlyStopping(
        monitor="val_acc",
        patience=3,
        mode="max",  # higher accuracy is better
        verbose=True,
    )
    # Inside a sweep a run is already active (and reused by the logger), so
    # rename it directly.  When this function is called on its own,
    # ``wandb.run`` is None here and assigning to its ``name`` would raise
    # AttributeError.
    if wandb.run is not None:
        wandb.run.name = run_name

    trainer = Trainer(
        max_epochs=hp["epochs"],
        callbacks=[ckpt, early_stop],
        accelerator="auto",
        devices=1,
        logger=wandb_logger,
    )

    trainer.fit(model, dm)
    trainer.test(model, dm)
    return model, dm


# Start from the baseline settings and override three of them for the
# attention model, then train it once.
attn_hparams = {
    **default_hparams,
    "num_layers": 1,
    "hidden_size": 256,
    "learning_rate": 5e-4,
}
best_model, dm = run_training_attn(attn_hparams)


In [None]:
# Discrete options the sweep may choose from, one list per hyper-parameter.
_ATTN_SEARCH_SPACE = {
    "cell_type": ["lstm", "gru"],
    "dropout": [0.0, 0.1, 0.2, 0.5],
    "embedding_size": [64, 128, 256, 512],
    "num_layers": [2, 3, 4],
    "batch_size": [32, 64, 128],
    "hidden_size": [128, 256, 512],
    "bidirectional": [True, False],
    "learning_rate": [1e-3, 2e-3, 1e-4, 2e-4],
    "epochs": [7, 10, 13],
    "optim": ["adam", "nadam"],
    "teacher_forcing": [0.2, 0.5, 0.7],
}

# Bayesian search that maximizes the logged validation accuracy.
sweep_config = {
    "method": "bayes",           # or "random", "grid", …
    "metric": {"name": "val_acc", "goal": "maximize"},
    "parameters": {
        name: {"values": options}
        for name, options in _ATTN_SEARCH_SPACE.items()
    },
}

def sweep_train():
    """Run one sweep trial: open a W&B run and train with the config it carries.

    NOTE(review): this trial calls ``run_training`` (the model without
    attention) although the cell belongs to the attention section --
    confirm whether ``run_training_attn`` was intended here.
    """
    with wandb.init() as active_run:
        # Copy the run's config into a plain dict before handing it on.
        trial_hparams = dict(active_run.config)
        run_training(trial_hparams)

# Launch the sweep: register sweep_config with W&B, print the sweep id, and
# let a local agent call sweep_train for up to 35 trials.
# NOTE: these statements are live, so running this cell starts the sweep
# immediately; comment them out to define the sweep without launching it.
sweep_id = wandb.sweep(sweep_config, project="DA6401_Assignment_3",entity="cs24m024-iit-madras")
print(sweep_id)
wandb.agent(sweep_id, function=sweep_train, count=35)