In [1]:
  !pip install wandb




In [2]:
import wandb


In [3]:
!wandb login 58a0b576fd5221cd0d63b154deaabbe535e853c6

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
# =======================
# Imports and Sweep Config
# =======================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import os
import math

sweep_config = {
    'method': 'bayes',
    'metric': {
        'name': 'val_loss',
        'goal': 'minimize'
    },
    'parameters': {
        'embedding_dim': {'values': [32,64,128,256]},
        'hidden_dim': {'values': [32,64, 128,256 ]},
        'enc_layers': {'values': [1, 2,3]},
        'dec_layers': {'values': [1, 2,3]},
        'cell_type': {'values': ['GRU', 'LSTM', 'RNN']},
        'dropout': {'values': [0.2, 0.3,0.5]},
        'epochs': {'values': [10, 15]},
        'beam_size': {'values': [1, 3, 5]}
    }
}
# Sweep
# =======================
default_config = {
    'embedding_dim': 32,
    'hidden_dim': 64,
    'enc_layers': 1,
    'dec_layers': 1,
    'cell_type': 'LSTM',
    'dropout': 0.2,
    'epochs': 10,
    'beam_size': 1
}


# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.char2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2char = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.size
                    self.idx2char[self.size] = char
                    self.size += 1

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, idxs):
        return ''.join([self.idx2char[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab, out_vocab):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        inp_vocab.build([p[0] for p in self.pairs])
        out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.char2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y)

def collate_fn(batch):
    x_batch, y_batch = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens)

# =======================
# Encoder and Decoder
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

# =======================
# Seq2Seq Model with Beam Search
# =======================
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)
        enc_hidden = self.encoder(src[0], src[1])

        if self.cell_type == "LSTM":
            h, c = enc_hidden
            h = self._match_layers(h)
            c = self._match_layers(c)
            dec_hidden = (h, c)
        else:
            dec_hidden = self._match_layers(enc_hidden)

        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

    def _match_layers(self, hidden):
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

# =======================
# Train & Eval
# =======================
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, total_correct, total_count = 0, 0, 0
    for src, trg, src_lens, _ in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model((src, src_lens), trg)
        output_dim = output.shape[-1]
        loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
        pred = output.argmax(2)
        correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
        total_correct += correct
        total_count += (trg[:, 1:] != 0).sum().item()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    acc = 100.0 * total_correct / total_count
    print(f"Train Loss: {total_loss / len(loader):.4f}, Acc: {acc:.2f}%")
    return total_loss / len(loader), acc

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_correct, total_count = 0, 0, 0
    with torch.no_grad():
        for src, trg, src_lens, _ in loader:
            src, trg = src.to(device), trg.to(device)
            output = model((src, src_lens), trg, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
            pred = output.argmax(2)
            correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
            total_correct += correct
            total_count += (trg[:, 1:] != 0).sum().item()
            total_loss += loss.item()
    acc = 100.0 * total_correct / total_count
    print(f"Val Loss: {total_loss / len(loader):.4f}, Acc: {acc:.2f}%")
    return total_loss / len(loader), acc

# =======================
# Main
# =======================
def main():
    wandb.init(config=default_config, project="dakshina-transliteration")
    config = wandb.config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inp_vocab, out_vocab = Vocab(), Vocab()
    train_path = "/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    dev_path = "/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
    train_data = TransliterationDataset(train_path, inp_vocab, out_vocab)
    dev_data = TransliterationDataset(dev_path, inp_vocab, out_vocab)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=32, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(inp_vocab.size, config.embedding_dim, config.hidden_dim, config.enc_layers, config.cell_type, config.dropout)
    decoder = Decoder(out_vocab.size, config.embedding_dim, config.hidden_dim, config.dec_layers, config.cell_type, config.dropout)
    model = Seq2Seq(encoder, decoder, config.enc_layers, config.dec_layers, config.cell_type, device).to(device)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(config.epochs):
        print(f"Epoch {epoch+1}/{config.epochs}")
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)
        wandb.log({"train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss, "val_acc": val_acc, "epoch": epoch+1})

# =======================

if __name__ == '__main__':
    sweep_id = wandb.sweep(sweep_config, project="dakshina-transliteration")
    wandb.agent(sweep_id, function=main,count=30)


Create sweep with ID: v20mtq8o
Sweep URL: https://wandb.ai/manglesh_dl_ass3/dakshina-transliteration/sweeps/v20mtq8o


[34m[1mwandb[0m: Agent Starting Run: wsb4nmzt with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 2.0825, Acc: 39.08%
Val Loss: 1.3258, Acc: 58.51%
Epoch 2/15
Train Loss: 0.9195, Acc: 71.55%
Val Loss: 0.9491, Acc: 69.49%
Epoch 3/15
Train Loss: 0.6898, Acc: 78.14%
Val Loss: 0.8675, Acc: 71.97%
Epoch 4/15
Train Loss: 0.5977, Acc: 80.81%
Val Loss: 0.8566, Acc: 72.93%
Epoch 5/15
Train Loss: 0.5505, Acc: 82.18%
Val Loss: 0.8085, Acc: 73.42%
Epoch 6/15
Train Loss: 0.5183, Acc: 82.97%
Val Loss: 0.8133, Acc: 74.15%
Epoch 7/15
Train Loss: 0.4930, Acc: 83.72%
Val Loss: 0.8226, Acc: 74.18%
Epoch 8/15
Train Loss: 0.4748, Acc: 84.11%
Val Loss: 0.8079, Acc: 75.05%
Epoch 9/15
Train Loss: 0.4575, Acc: 84.62%
Val Loss: 0.8134, Acc: 74.60%
Epoch 10/15
Train Loss: 0.4436, Acc: 84.93%
Val Loss: 0.8148, Acc: 74.71%
Epoch 11/15
Train Loss: 0.4348, Acc: 85.06%
Val Loss: 0.8074, Acc: 74.70%
Epoch 12/15
Train Loss: 0.4261, Acc: 85.28%
Val Loss: 0.7970, Acc: 75.18%
Epoch 13/15
Train Loss: 0.4113, Acc: 85.72%
Val Loss: 0.7827, Acc: 75.33%
Epoch 14/15
Train Loss: 0.4088, Acc: 85.66%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇██████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▆▇▇▇██████████
val_loss,█▃▂▂▁▁▂▁▁▁▁▁▁▁▁

0,1
epoch,14.0
train_acc,85.96733
train_loss,0.39715
val_acc,75.1888
val_loss,0.79417


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 2ixqhn81 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64


Epoch 1/5
Train Loss: 1.7502, Acc: 46.96%
Val Loss: 1.2688, Acc: 58.44%
Epoch 2/5
Train Loss: 1.0550, Acc: 66.24%
Val Loss: 1.0417, Acc: 65.82%
Epoch 3/5
Train Loss: 0.9009, Acc: 71.06%
Val Loss: 0.9590, Acc: 68.40%
Epoch 4/5
Train Loss: 0.8281, Acc: 73.26%
Val Loss: 0.9137, Acc: 69.59%
Epoch 5/5
Train Loss: 0.7783, Acc: 74.79%
Val Loss: 0.9114, Acc: 70.18%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▇██
val_loss,█▄▂▁▁

0,1
epoch,4.0
train_acc,74.78942
train_loss,0.77833
val_acc,70.18026
val_loss,0.91135


[34m[1mwandb[0m: Agent Starting Run: y7mw5wki with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32


Epoch 1/10
Train Loss: 2.4506, Acc: 28.60%
Val Loss: 2.3585, Acc: 32.04%
Epoch 2/10
Train Loss: 2.2253, Acc: 34.43%
Val Loss: 2.2833, Acc: 33.98%
Epoch 3/10
Train Loss: 2.1650, Acc: 35.85%
Val Loss: 2.2069, Acc: 35.93%
Epoch 4/10
Train Loss: 2.0924, Acc: 37.62%
Val Loss: 2.1449, Acc: 36.79%
Epoch 5/10
Train Loss: 2.0462, Acc: 38.88%
Val Loss: 2.0781, Acc: 38.40%
Epoch 6/10
Train Loss: 2.0057, Acc: 40.26%
Val Loss: 2.0491, Acc: 39.64%
Epoch 7/10
Train Loss: 1.9603, Acc: 42.28%
Val Loss: 2.0106, Acc: 41.34%
Epoch 8/10
Train Loss: 1.9316, Acc: 43.08%
Val Loss: 1.9953, Acc: 41.44%
Epoch 9/10
Train Loss: 1.9030, Acc: 43.88%
Val Loss: 1.9790, Acc: 41.78%
Epoch 10/10
Train Loss: 1.8927, Acc: 44.09%
Val Loss: 1.9687, Acc: 42.16%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▄▅▆▆▇███
train_loss,█▅▄▄▃▂▂▁▁▁
val_acc,▁▂▄▄▅▆▇▇██
val_loss,█▇▅▄▃▂▂▁▁▁

0,1
epoch,9.0
train_acc,44.08948
train_loss,1.89268
val_acc,42.16313
val_loss,1.96872


[34m[1mwandb[0m: Agent Starting Run: cpggv2aa with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 64


Epoch 1/15
Train Loss: 2.4296, Acc: 29.30%
Val Loss: 2.4483, Acc: 28.91%
Epoch 2/15
Train Loss: 2.2491, Acc: 33.43%
Val Loss: 2.3167, Acc: 33.05%
Epoch 3/15
Train Loss: 2.1553, Acc: 36.07%
Val Loss: 2.2516, Acc: 34.99%
Epoch 4/15
Train Loss: 2.0934, Acc: 37.47%
Val Loss: 2.2106, Acc: 35.46%
Epoch 5/15
Train Loss: 2.0510, Acc: 38.55%
Val Loss: 2.1825, Acc: 36.19%
Epoch 6/15
Train Loss: 2.0193, Acc: 39.54%
Val Loss: 2.1490, Acc: 36.69%
Epoch 7/15
Train Loss: 1.9975, Acc: 40.19%
Val Loss: 2.1323, Acc: 37.09%
Epoch 8/15
Train Loss: 1.9707, Acc: 40.88%
Val Loss: 2.1333, Acc: 36.64%
Epoch 9/15
Train Loss: 1.9529, Acc: 41.51%
Val Loss: 2.1053, Acc: 37.82%
Epoch 10/15
Train Loss: 1.9338, Acc: 42.20%
Val Loss: 2.0841, Acc: 38.16%
Epoch 11/15
Train Loss: 1.9128, Acc: 42.76%
Val Loss: 2.0891, Acc: 38.32%
Epoch 12/15
Train Loss: 1.8951, Acc: 43.40%
Val Loss: 2.0671, Acc: 38.81%
Epoch 13/15
Train Loss: 1.8761, Acc: 43.99%
Val Loss: 2.0317, Acc: 39.49%
Epoch 14/15
Train Loss: 1.8531, Acc: 44.54%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▃▄▅▅▆▆▆▇▇▇▇███
train_loss,█▆▅▄▃▃▃▂▂▂▂▂▁▁▁
val_acc,▁▄▅▅▆▆▆▆▇▇▇▇███
val_loss,█▆▅▄▄▃▃▃▂▂▂▂▁▁▁

0,1
epoch,14.0
train_acc,44.69564
train_loss,1.84749
val_acc,39.68056
val_loss,2.01893


[34m[1mwandb[0m: Agent Starting Run: s93dnnvr with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128


Epoch 1/5
Train Loss: 1.6026, Acc: 51.91%
Val Loss: 1.1162, Acc: 63.92%
Epoch 2/5
Train Loss: 0.7917, Acc: 74.80%
Val Loss: 0.9233, Acc: 69.43%
Epoch 3/5
Train Loss: 0.6500, Acc: 79.12%
Val Loss: 0.8664, Acc: 71.23%
Epoch 4/5
Train Loss: 0.5800, Acc: 81.17%
Val Loss: 0.8460, Acc: 72.54%
Epoch 5/5
Train Loss: 0.5396, Acc: 82.33%
Val Loss: 0.8280, Acc: 72.91%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▇██
val_loss,█▃▂▁▁

0,1
epoch,4.0
train_acc,82.33231
train_loss,0.53963
val_acc,72.91456
val_loss,0.82799


[34m[1mwandb[0m: Agent Starting Run: c7phxgx3 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 2.1928, Acc: 36.27%
Val Loss: 2.0194, Acc: 40.99%
Epoch 2/15
Train Loss: 1.8480, Acc: 45.05%
Val Loss: 1.8561, Acc: 44.26%
Epoch 3/15
Train Loss: 1.7108, Acc: 48.71%
Val Loss: 1.7596, Acc: 46.70%
Epoch 4/15
Train Loss: 1.6330, Acc: 51.01%
Val Loss: 1.6704, Acc: 49.08%
Epoch 5/15
Train Loss: 1.5705, Acc: 52.53%
Val Loss: 1.6294, Acc: 50.02%
Epoch 6/15
Train Loss: 1.5402, Acc: 53.42%
Val Loss: 1.6337, Acc: 50.10%
Epoch 7/15
Train Loss: 1.5016, Acc: 54.42%
Val Loss: 1.5614, Acc: 51.53%
Epoch 8/15
Train Loss: 1.4697, Acc: 55.29%
Val Loss: 1.5708, Acc: 51.32%
Epoch 9/15
Train Loss: 1.4440, Acc: 55.98%
Val Loss: 1.5273, Acc: 52.37%
Epoch 10/15
Train Loss: 1.4306, Acc: 56.34%
Val Loss: 1.5351, Acc: 51.99%
Epoch 11/15
Train Loss: 1.4176, Acc: 56.72%
Val Loss: 1.5036, Acc: 52.68%
Epoch 12/15
Train Loss: 1.4077, Acc: 56.95%
Val Loss: 1.4994, Acc: 53.33%
Epoch 13/15
Train Loss: 1.3979, Acc: 57.34%
Val Loss: 1.4758, Acc: 53.70%
Epoch 14/15
Train Loss: 1.3856, Acc: 57.64%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▄▅▆▆▇▇▇▇▇█████
train_loss,█▅▄▃▃▂▂▂▂▁▁▁▁▁▁
val_acc,▁▃▄▅▆▆▇▆▇▇▇████
val_loss,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁

0,1
epoch,14.0
train_acc,57.90171
train_loss,1.37417
val_acc,54.14485
val_loss,1.47183


[34m[1mwandb[0m: Agent Starting Run: mdfn8jgi with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128


Epoch 1/5
Train Loss: 1.6647, Acc: 50.03%
Val Loss: 1.1321, Acc: 63.25%
Epoch 2/5
Train Loss: 0.8866, Acc: 71.67%
Val Loss: 0.9365, Acc: 69.35%
Epoch 3/5
Train Loss: 0.7338, Acc: 76.44%
Val Loss: 0.8489, Acc: 71.74%
Epoch 4/5
Train Loss: 0.6720, Acc: 78.16%
Val Loss: 0.8475, Acc: 72.48%
Epoch 5/5
Train Loss: 0.6210, Acc: 79.91%
Val Loss: 0.8597, Acc: 73.03%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▇██
val_loss,█▃▁▁▁

0,1
epoch,4.0
train_acc,79.91099
train_loss,0.621
val_acc,73.0274
val_loss,0.85969


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: yzdktqg1 with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/10
Train Loss: 1.4949, Acc: 54.43%
Val Loss: 1.0108, Acc: 66.61%
Epoch 2/10
Train Loss: 0.7993, Acc: 74.16%
Val Loss: 0.9004, Acc: 71.05%
Epoch 3/10
Train Loss: 0.6753, Acc: 77.88%
Val Loss: 0.8264, Acc: 72.49%
Epoch 4/10
Train Loss: 0.6133, Acc: 79.82%
Val Loss: 0.8048, Acc: 73.52%
Epoch 5/10
Train Loss: 0.5765, Acc: 80.92%
Val Loss: 0.7943, Acc: 73.92%
Epoch 6/10
Train Loss: 0.5477, Acc: 81.74%
Val Loss: 0.7958, Acc: 74.46%
Epoch 7/10
Train Loss: 0.5268, Acc: 82.45%
Val Loss: 0.7900, Acc: 74.62%
Epoch 8/10
Train Loss: 0.5077, Acc: 82.99%
Val Loss: 0.8101, Acc: 74.65%
Epoch 9/10
Train Loss: 0.4991, Acc: 83.13%
Val Loss: 0.7730, Acc: 74.94%
Epoch 10/10
Train Loss: 0.4873, Acc: 83.45%
Val Loss: 0.8140, Acc: 74.83%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▅▆▇▇█████
val_loss,█▅▃▂▂▂▂▂▁▂

0,1
epoch,9.0
train_acc,83.44696
train_loss,0.48732
val_acc,74.83001
val_loss,0.81399


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6v4q86gi with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/10
Train Loss: 1.1691, Acc: 63.36%
Val Loss: 0.9227, Acc: 69.85%
Epoch 2/10
Train Loss: 0.6730, Acc: 78.32%
Val Loss: 0.8552, Acc: 72.37%
Epoch 3/10
Train Loss: 0.5871, Acc: 81.01%
Val Loss: 0.8597, Acc: 73.07%
Epoch 4/10
Train Loss: 0.5521, Acc: 81.81%
Val Loss: 0.7985, Acc: 73.95%
Epoch 5/10
Train Loss: 0.5238, Acc: 82.61%
Val Loss: 0.8191, Acc: 73.92%
Epoch 6/10
Train Loss: 0.4991, Acc: 83.41%
Val Loss: 0.8063, Acc: 74.35%
Epoch 7/10
Train Loss: 0.4846, Acc: 83.85%
Val Loss: 0.8067, Acc: 74.72%
Epoch 8/10
Train Loss: 0.4724, Acc: 84.07%
Val Loss: 0.8054, Acc: 74.65%
Epoch 9/10
Train Loss: 0.4642, Acc: 84.29%
Val Loss: 0.8036, Acc: 74.46%
Epoch 10/10
Train Loss: 0.4567, Acc: 84.49%
Val Loss: 0.8016, Acc: 74.79%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▅▆▇▇▇████
val_loss,█▄▄▁▂▁▁▁▁▁

0,1
epoch,9.0
train_acc,84.49097
train_loss,0.45675
val_acc,74.78661
val_loss,0.80163


[34m[1mwandb[0m: Agent Starting Run: 9mxghioj with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.5387, Acc: 53.66%
Val Loss: 1.0157, Acc: 66.38%
Epoch 2/5
Train Loss: 0.7359, Acc: 76.50%
Val Loss: 0.8643, Acc: 71.10%
Epoch 3/5
Train Loss: 0.6083, Acc: 80.28%
Val Loss: 0.8303, Acc: 73.38%
Epoch 4/5
Train Loss: 0.5424, Acc: 82.29%
Val Loss: 0.8440, Acc: 73.87%
Epoch 5/5
Train Loss: 0.5111, Acc: 83.03%
Val Loss: 0.8144, Acc: 73.94%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▇██
val_loss,█▃▂▂▁

0,1
epoch,4.0
train_acc,83.0251
train_loss,0.51105
val_acc,73.93594
val_loss,0.81439


[34m[1mwandb[0m: Agent Starting Run: 6e0rd4pd with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.2077, Acc: 62.26%
Val Loss: 0.9611, Acc: 69.07%
Epoch 2/5
Train Loss: 0.7005, Acc: 77.47%
Val Loss: 0.8762, Acc: 71.98%
Epoch 3/5
Train Loss: 0.6164, Acc: 80.05%
Val Loss: 0.8381, Acc: 72.79%
Epoch 4/5
Train Loss: 0.5751, Acc: 81.18%
Val Loss: 0.8352, Acc: 73.98%
Epoch 5/5
Train Loss: 0.5442, Acc: 82.03%
Val Loss: 0.8194, Acc: 73.64%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▆██
val_loss,█▄▂▂▁

0,1
epoch,4.0
train_acc,82.03489
train_loss,0.54424
val_acc,73.64081
val_loss,0.81942


[34m[1mwandb[0m: Agent Starting Run: h3se8kcc with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/10
Train Loss: 1.3687, Acc: 57.96%
Val Loss: 1.0196, Acc: 66.46%
Epoch 2/10
Train Loss: 0.7008, Acc: 77.50%
Val Loss: 0.9031, Acc: 70.90%
Epoch 3/10
Train Loss: 0.5930, Acc: 80.71%
Val Loss: 0.8726, Acc: 71.32%
Epoch 4/10
Train Loss: 0.5405, Acc: 82.24%
Val Loss: 0.8555, Acc: 71.81%
Epoch 5/10
Train Loss: 0.5028, Acc: 83.29%
Val Loss: 0.8577, Acc: 72.48%
Epoch 6/10
Train Loss: 0.4800, Acc: 83.86%
Val Loss: 0.8457, Acc: 72.41%
Epoch 7/10
Train Loss: 0.4595, Acc: 84.35%
Val Loss: 0.8342, Acc: 73.56%
Epoch 8/10
Train Loss: 0.4464, Acc: 84.67%
Val Loss: 0.8451, Acc: 73.28%
Epoch 9/10
Train Loss: 0.4266, Acc: 85.21%
Val Loss: 0.8540, Acc: 73.75%
Epoch 10/10
Train Loss: 0.4220, Acc: 85.17%
Val Loss: 0.8459, Acc: 73.88%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▅▆▆▇▇█▇██
val_loss,█▄▂▂▂▁▁▁▂▁

0,1
epoch,9.0
train_acc,85.17494
train_loss,0.42201
val_acc,73.88386
val_loss,0.84592


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 93x32whp with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128


Epoch 1/10
Train Loss: 1.5989, Acc: 50.70%
Val Loss: 1.1110, Acc: 63.73%
Epoch 2/10
Train Loss: 0.9738, Acc: 68.48%
Val Loss: 0.9532, Acc: 68.01%
Epoch 3/10
Train Loss: 0.8394, Acc: 72.86%
Val Loss: 0.8966, Acc: 69.93%
Epoch 4/10
Train Loss: 0.7765, Acc: 74.71%
Val Loss: 0.8683, Acc: 70.85%
Epoch 5/10
Train Loss: 0.7329, Acc: 76.11%
Val Loss: 0.8600, Acc: 72.05%
Epoch 6/10
Train Loss: 0.7061, Acc: 76.94%
Val Loss: 0.8368, Acc: 72.22%
Epoch 7/10
Train Loss: 0.6828, Acc: 77.69%
Val Loss: 0.8354, Acc: 72.63%
Epoch 8/10
Train Loss: 0.6633, Acc: 78.28%
Val Loss: 0.8226, Acc: 72.73%
Epoch 9/10
Train Loss: 0.6497, Acc: 78.68%
Val Loss: 0.8495, Acc: 72.72%
Epoch 10/10
Train Loss: 0.6350, Acc: 79.14%
Val Loss: 0.8033, Acc: 73.62%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▆▇▇▇████
train_loss,█▃▂▂▂▂▁▁▁▁
val_acc,▁▄▅▆▇▇▇▇▇█
val_loss,█▄▃▂▂▂▂▁▂▁

0,1
epoch,9.0
train_acc,79.13571
train_loss,0.63497
val_acc,73.62055
val_loss,0.80327


[34m[1mwandb[0m: Agent Starting Run: na8lqf5i with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.2831, Acc: 60.11%
Val Loss: 0.9653, Acc: 68.32%
Epoch 2/15
Train Loss: 0.7339, Acc: 76.29%
Val Loss: 0.8807, Acc: 71.14%
Epoch 3/15
Train Loss: 0.6398, Acc: 79.11%
Val Loss: 0.8348, Acc: 72.79%
Epoch 4/15
Train Loss: 0.5940, Acc: 80.61%
Val Loss: 0.8643, Acc: 73.01%
Epoch 5/15
Train Loss: 0.5623, Acc: 81.59%
Val Loss: 0.8042, Acc: 73.76%
Epoch 6/15
Train Loss: 0.5445, Acc: 82.06%
Val Loss: 0.8092, Acc: 74.13%
Epoch 7/15
Train Loss: 0.5270, Acc: 82.52%
Val Loss: 0.8173, Acc: 74.10%
Epoch 8/15
Train Loss: 0.5221, Acc: 82.58%
Val Loss: 0.7860, Acc: 74.44%
Epoch 9/15
Train Loss: 0.5035, Acc: 83.12%
Val Loss: 0.8010, Acc: 74.79%
Epoch 10/15
Train Loss: 0.4973, Acc: 83.28%
Val Loss: 0.8133, Acc: 74.45%
Epoch 11/15
Train Loss: 0.4931, Acc: 83.37%
Val Loss: 0.8074, Acc: 74.34%
Epoch 12/15
Train Loss: 0.4797, Acc: 83.82%
Val Loss: 0.8168, Acc: 74.14%
Epoch 13/15
Train Loss: 0.4825, Acc: 83.66%
Val Loss: 0.8013, Acc: 74.77%
Epoch 14/15
Train Loss: 0.4755, Acc: 83.85%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▄▆▆▇▇▇▇█▇▇▇███
val_loss,█▅▃▄▂▂▂▁▂▂▂▂▂▂▃

0,1
epoch,14.0
train_acc,83.91517
train_loss,0.47345
val_acc,74.70849
val_loss,0.82885


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 2qgxgy1j with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.1821, Acc: 63.06%
Val Loss: 1.0186, Acc: 67.94%
Epoch 2/5
Train Loss: 0.6779, Acc: 78.20%
Val Loss: 0.8875, Acc: 72.26%
Epoch 3/5
Train Loss: 0.5966, Acc: 80.54%
Val Loss: 0.8498, Acc: 73.30%
Epoch 4/5
Train Loss: 0.5494, Acc: 82.00%
Val Loss: 0.8490, Acc: 72.88%
Epoch 5/5
Train Loss: 0.5250, Acc: 82.60%
Val Loss: 0.8435, Acc: 73.81%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▆▇▇█
val_loss,█▃▁▁▁

0,1
epoch,4.0
train_acc,82.60049
train_loss,0.52496
val_acc,73.80573
val_loss,0.84354


[34m[1mwandb[0m: Agent Starting Run: qm5ip602 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.4594, Acc: 55.06%
Val Loss: 1.0502, Acc: 64.84%
Epoch 2/15
Train Loss: 0.7703, Acc: 75.33%
Val Loss: 0.9589, Acc: 69.77%
Epoch 3/15
Train Loss: 0.6567, Acc: 78.75%
Val Loss: 0.8875, Acc: 71.57%
Epoch 4/15
Train Loss: 0.6050, Acc: 80.27%
Val Loss: 0.8487, Acc: 72.85%
Epoch 5/15
Train Loss: 0.5662, Acc: 81.50%
Val Loss: 0.8450, Acc: 72.94%
Epoch 6/15
Train Loss: 0.5447, Acc: 82.06%
Val Loss: 0.8121, Acc: 73.33%
Epoch 7/15
Train Loss: 0.5264, Acc: 82.51%
Val Loss: 0.8142, Acc: 73.58%
Epoch 8/15
Train Loss: 0.5101, Acc: 83.00%
Val Loss: 0.8516, Acc: 73.57%
Epoch 9/15
Train Loss: 0.5007, Acc: 83.20%
Val Loss: 0.8229, Acc: 74.10%
Epoch 10/15
Train Loss: 0.4876, Acc: 83.54%
Val Loss: 0.8210, Acc: 74.26%
Epoch 11/15
Train Loss: 0.4812, Acc: 83.74%
Val Loss: 0.8676, Acc: 73.62%
Epoch 12/15
Train Loss: 0.4680, Acc: 84.09%
Val Loss: 0.8342, Acc: 74.58%
Epoch 13/15
Train Loss: 0.4708, Acc: 83.90%
Val Loss: 0.8413, Acc: 74.39%
Epoch 14/15
Train Loss: 0.4591, Acc: 84.37%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▅▆▇▇▇▇▇██▇████
val_loss,█▅▃▂▂▁▁▂▁▁▃▂▂▁▁

0,1
epoch,14.0
train_acc,84.44435
train_loss,0.45432
val_acc,74.19924
val_loss,0.81168


[34m[1mwandb[0m: Agent Starting Run: 8mdn4sr1 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.5655, Acc: 52.81%
Val Loss: 1.0226, Acc: 66.54%
Epoch 2/5
Train Loss: 0.7731, Acc: 75.34%
Val Loss: 0.8930, Acc: 70.87%
Epoch 3/5
Train Loss: 0.6386, Acc: 79.31%
Val Loss: 0.8331, Acc: 72.57%
Epoch 4/5
Train Loss: 0.5738, Acc: 81.22%
Val Loss: 0.8097, Acc: 73.56%
Epoch 5/5
Train Loss: 0.5285, Acc: 82.64%
Val Loss: 0.7897, Acc: 74.13%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▇▇█
val_loss,█▄▂▂▁

0,1
epoch,4.0
train_acc,82.64215
train_loss,0.52851
val_acc,74.1298
val_loss,0.78973


[34m[1mwandb[0m: Agent Starting Run: 4j7pujrh with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.2566, Acc: 61.17%
Val Loss: 1.0171, Acc: 68.59%
Epoch 2/5
Train Loss: 0.7013, Acc: 77.49%
Val Loss: 0.8720, Acc: 72.20%
Epoch 3/5
Train Loss: 0.6108, Acc: 80.15%
Val Loss: 0.8416, Acc: 72.77%
Epoch 4/5
Train Loss: 0.5633, Acc: 81.53%
Val Loss: 0.8619, Acc: 73.03%
Epoch 5/5
Train Loss: 0.5358, Acc: 82.24%
Val Loss: 0.8205, Acc: 73.57%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▆▇▇█
val_loss,█▃▂▂▁

0,1
epoch,4.0
train_acc,82.23906
train_loss,0.53577
val_acc,73.56847
val_loss,0.82051


[34m[1mwandb[0m: Agent Starting Run: wpyri5h3 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.1927, Acc: 63.24%
Val Loss: 0.9575, Acc: 68.31%
Epoch 2/5
Train Loss: 0.6462, Acc: 79.32%
Val Loss: 0.9375, Acc: 69.12%
Epoch 3/5
Train Loss: 0.5630, Acc: 81.62%
Val Loss: 0.8434, Acc: 72.33%
Epoch 4/5
Train Loss: 0.5140, Acc: 83.05%
Val Loss: 0.8380, Acc: 72.32%
Epoch 5/5
Train Loss: 0.4862, Acc: 83.72%
Val Loss: 0.8393, Acc: 72.78%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▂▇▇█
val_loss,█▇▁▁▁

0,1
epoch,4.0
train_acc,83.71596
train_loss,0.48623
val_acc,72.77857
val_loss,0.83926


[34m[1mwandb[0m: Agent Starting Run: bp55cpdg with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.3234, Acc: 59.34%
Val Loss: 1.0504, Acc: 65.20%
Epoch 2/5
Train Loss: 0.7303, Acc: 76.64%
Val Loss: 0.9293, Acc: 69.90%
Epoch 3/5
Train Loss: 0.6242, Acc: 79.91%
Val Loss: 0.8975, Acc: 70.57%
Epoch 4/5
Train Loss: 0.5681, Acc: 81.56%
Val Loss: 0.8656, Acc: 72.00%
Epoch 5/5
Train Loss: 0.5338, Acc: 82.47%
Val Loss: 0.8549, Acc: 72.14%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▆▆██
val_loss,█▄▃▁▁

0,1
epoch,4.0
train_acc,82.46778
train_loss,0.53382
val_acc,72.14201
val_loss,0.8549


[34m[1mwandb[0m: Agent Starting Run: kwjb79ue with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64


Epoch 1/5
Train Loss: 1.8932, Acc: 43.34%
Val Loss: 1.3541, Acc: 56.55%
Epoch 2/5
Train Loss: 1.1608, Acc: 63.00%
Val Loss: 1.0905, Acc: 64.28%
Epoch 3/5
Train Loss: 0.9705, Acc: 68.85%
Val Loss: 0.9865, Acc: 67.77%
Epoch 4/5
Train Loss: 0.8885, Acc: 71.28%
Val Loss: 0.9239, Acc: 69.40%
Epoch 5/5
Train Loss: 0.8336, Acc: 73.06%
Val Loss: 0.9080, Acc: 70.14%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▇██
val_loss,█▄▂▁▁

0,1
epoch,4.0
train_acc,73.05896
train_loss,0.83359
val_acc,70.13686
val_loss,0.90804


[34m[1mwandb[0m: Agent Starting Run: viqo96vq with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 128


Epoch 1/5
Train Loss: 1.3696, Acc: 58.08%
Val Loss: 0.9991, Acc: 67.54%
Epoch 2/5
Train Loss: 0.7283, Acc: 76.79%
Val Loss: 0.9239, Acc: 70.00%
Epoch 3/5
Train Loss: 0.6158, Acc: 80.20%
Val Loss: 0.8600, Acc: 71.77%
Epoch 4/5
Train Loss: 0.5671, Acc: 81.61%
Val Loss: 0.8462, Acc: 72.39%
Epoch 5/5
Train Loss: 0.5302, Acc: 82.70%
Val Loss: 0.8400, Acc: 72.97%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▄▆▇█
val_loss,█▅▂▁▁

0,1
epoch,4.0
train_acc,82.69844
train_loss,0.53022
val_acc,72.97243
val_loss,0.84002


[34m[1mwandb[0m: Agent Starting Run: df4qh91u with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/10
Train Loss: 1.1223, Acc: 64.98%
Val Loss: 0.9470, Acc: 69.40%
Epoch 2/10
Train Loss: 0.6253, Acc: 79.88%
Val Loss: 0.8581, Acc: 71.72%
Epoch 3/10
Train Loss: 0.5516, Acc: 81.98%
Val Loss: 0.8287, Acc: 72.58%
Epoch 4/10
Train Loss: 0.5030, Acc: 83.40%
Val Loss: 0.8474, Acc: 73.64%
Epoch 5/10
Train Loss: 0.4709, Acc: 84.33%
Val Loss: 0.8352, Acc: 73.30%
Epoch 6/10
Train Loss: 0.4559, Acc: 84.57%
Val Loss: 0.8252, Acc: 73.88%
Epoch 7/10
Train Loss: 0.4406, Acc: 84.80%
Val Loss: 0.8419, Acc: 73.95%
Epoch 8/10
Train Loss: 0.4227, Acc: 85.32%
Val Loss: 0.8040, Acc: 74.21%
Epoch 9/10
Train Loss: 0.4141, Acc: 85.38%
Val Loss: 0.8220, Acc: 74.67%
Epoch 10/10
Train Loss: 0.4078, Acc: 85.57%
Val Loss: 0.8206, Acc: 74.45%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▄▅▇▆▇▇▇██
val_loss,█▄▂▃▃▂▃▁▂▂

0,1
epoch,9.0
train_acc,85.5692
train_loss,0.40781
val_acc,74.44808
val_loss,0.82057


[34m[1mwandb[0m: Agent Starting Run: si2mjai3 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.5202, Acc: 54.11%
Val Loss: 1.0293, Acc: 66.53%
Epoch 2/5
Train Loss: 0.7638, Acc: 75.54%
Val Loss: 0.8660, Acc: 71.10%
Epoch 3/5
Train Loss: 0.6291, Acc: 79.59%
Val Loss: 0.8252, Acc: 72.54%
Epoch 4/5
Train Loss: 0.5680, Acc: 81.46%
Val Loss: 0.8253, Acc: 73.66%
Epoch 5/5
Train Loss: 0.5300, Acc: 82.54%
Val Loss: 0.7759, Acc: 74.44%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▆▇█
val_loss,█▃▂▂▁

0,1
epoch,4.0
train_acc,82.53952
train_loss,0.53002
val_acc,74.4394
val_loss,0.77588


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lbf58de6 with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.1976, Acc: 61.10%
Val Loss: 0.9506, Acc: 68.34%
Epoch 2/5
Train Loss: 0.7888, Acc: 74.12%
Val Loss: 0.8974, Acc: 70.63%
Epoch 3/5
Train Loss: 0.7119, Acc: 76.58%
Val Loss: 0.8968, Acc: 71.20%
Epoch 4/5
Train Loss: 0.6738, Acc: 77.77%
Val Loss: 0.8276, Acc: 72.28%
Epoch 5/5
Train Loss: 0.6493, Acc: 78.61%
Val Loss: 0.8282, Acc: 72.84%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▅▇█
val_loss,█▅▅▁▁

0,1
epoch,4.0
train_acc,78.61232
train_loss,0.64935
val_acc,72.83933
val_loss,0.82817


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: d3cfz6ym with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 64


Epoch 1/5
Train Loss: 1.9298, Acc: 43.00%
Val Loss: 1.4465, Acc: 54.02%
Epoch 2/5
Train Loss: 1.2337, Acc: 60.49%
Val Loss: 1.1447, Acc: 62.02%
Epoch 3/5
Train Loss: 1.0446, Acc: 66.10%
Val Loss: 1.0398, Acc: 65.37%
Epoch 4/5
Train Loss: 0.9535, Acc: 68.94%
Val Loss: 1.0037, Acc: 66.66%
Epoch 5/5
Train Loss: 0.8959, Acc: 70.81%
Val Loss: 0.9793, Acc: 67.57%


0,1
epoch,▁▃▅▆█
train_acc,▁▅▇██
train_loss,█▃▂▁▁
val_acc,▁▅▇██
val_loss,█▃▂▁▁

0,1
epoch,4.0
train_acc,70.80842
train_loss,0.89588
val_acc,67.57038
val_loss,0.97933


[34m[1mwandb[0m: Agent Starting Run: ypy0s3n4 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 2.2275, Acc: 35.02%
Val Loss: 2.0111, Acc: 39.99%
Epoch 2/5
Train Loss: 1.7435, Acc: 47.72%
Val Loss: 1.7462, Acc: 46.38%
Epoch 3/5
Train Loss: 1.5380, Acc: 53.13%
Val Loss: 1.6359, Acc: 48.55%
Epoch 4/5
Train Loss: 1.4197, Acc: 56.42%
Val Loss: 1.4956, Acc: 52.99%
Epoch 5/5
Train Loss: 1.3434, Acc: 58.48%
Val Loss: 1.4179, Acc: 54.28%


0,1
epoch,▁▃▅▆█
train_acc,▁▅▆▇█
train_loss,█▄▃▂▁
val_acc,▁▄▅▇█
val_loss,█▅▄▂▁

0,1
epoch,4.0
train_acc,58.47807
train_loss,1.34336
val_acc,54.27505
val_loss,1.41791


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8lcxgxgi with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.3282, Acc: 59.21%
Val Loss: 1.0283, Acc: 65.84%
Epoch 2/5
Train Loss: 0.7387, Acc: 76.30%
Val Loss: 0.9445, Acc: 69.16%
Epoch 3/5
Train Loss: 0.6292, Acc: 79.67%
Val Loss: 0.8831, Acc: 71.29%
Epoch 4/5
Train Loss: 0.5748, Acc: 81.24%
Val Loss: 0.8733, Acc: 71.74%
Epoch 5/5
Train Loss: 0.5425, Acc: 82.18%
Val Loss: 0.8736, Acc: 72.07%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▇██
val_loss,█▄▁▁▁

0,1
epoch,4.0
train_acc,82.17726
train_loss,0.54248
val_acc,72.06967
val_loss,0.87359


[34m[1mwandb[0m: Agent Starting Run: 3p59dqeb with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.2460, Acc: 61.52%
Val Loss: 0.9307, Acc: 69.60%
Epoch 2/15
Train Loss: 0.7007, Acc: 77.43%
Val Loss: 0.8519, Acc: 72.06%
Epoch 3/15
Train Loss: 0.6100, Acc: 80.14%
Val Loss: 0.8310, Acc: 72.93%
Epoch 4/15
Train Loss: 0.5634, Acc: 81.56%
Val Loss: 0.8386, Acc: 72.93%
Epoch 5/15
Train Loss: 0.5345, Acc: 82.34%
Val Loss: 0.8077, Acc: 73.68%
Epoch 6/15
Train Loss: 0.5144, Acc: 82.96%
Val Loss: 0.8220, Acc: 73.77%
Epoch 7/15
Train Loss: 0.4985, Acc: 83.37%
Val Loss: 0.8059, Acc: 74.37%
Epoch 8/15
Train Loss: 0.4906, Acc: 83.47%
Val Loss: 0.8029, Acc: 74.74%
Epoch 9/15
Train Loss: 0.4769, Acc: 83.83%
Val Loss: 0.7883, Acc: 74.73%
Epoch 10/15
Train Loss: 0.4699, Acc: 84.05%
Val Loss: 0.8301, Acc: 74.33%
Epoch 11/15
Train Loss: 0.4638, Acc: 84.21%
Val Loss: 0.8094, Acc: 74.88%
Epoch 12/15
Train Loss: 0.4563, Acc: 84.35%
Val Loss: 0.7953, Acc: 74.24%
Epoch 13/15
Train Loss: 0.4531, Acc: 84.48%
Val Loss: 0.8083, Acc: 74.52%
Epoch 14/15
Train Loss: 0.4428, Acc: 84.74%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▅▅▆▇▇██▇█▇█▇█
val_loss,█▄▃▃▂▃▂▂▁▃▂▁▂▂▄

0,1
epoch,14.0
train_acc,84.87337
train_loss,0.43811
val_acc,74.63904
val_loss,0.84283


[34m[1mwandb[0m: Agent Starting Run: mv89x1co with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/5
Train Loss: 1.2265, Acc: 60.90%
Val Loss: 0.9564, Acc: 68.44%
Epoch 2/5
Train Loss: 0.7646, Acc: 75.01%
Val Loss: 0.8854, Acc: 71.33%
Epoch 3/5
Train Loss: 0.6826, Acc: 77.61%
Val Loss: 0.8508, Acc: 72.69%
Epoch 4/5
Train Loss: 0.6403, Acc: 79.04%
Val Loss: 0.8639, Acc: 72.75%
Epoch 5/5
Train Loss: 0.6128, Acc: 79.90%
Val Loss: 0.8203, Acc: 73.61%


0,1
epoch,▁▃▅▆█
train_acc,▁▆▇██
train_loss,█▃▂▁▁
val_acc,▁▅▇▇█
val_loss,█▄▃▃▁

0,1
epoch,4.0
train_acc,79.90023
train_loss,0.61277
val_acc,73.61477
val_loss,0.82032


In [4]:
# =======================
# Imports and Sweep Config
# =======================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import os
import math

sweep_config = {
    'method': 'bayes',
    'metric': {
        'name': 'val_loss',
        'goal': 'minimize'
    },
    'parameters': {
        'embedding_dim': {'values': [64,128,256]},
        'hidden_dim': {'values': [64, 128,256 ]},
        'enc_layers': {'values': [1, 2,3]},
        'dec_layers': {'values': [1, 2,3]},
        'cell_type': {'values': ['GRU', 'LSTM', 'RNN']},
        'dropout': {'values': [0.2, 0.3,0.5]},
        'epochs': {'values': [20, 15]},
        'beam_size': {'values': [1, 3, 5]}
    }
}
# Sweep
# =======================
default_config = {
    'embedding_dim': 32,
    'hidden_dim': 64,
    'enc_layers': 1,
    'dec_layers': 1,
    'cell_type': 'LSTM',
    'dropout': 0.2,
    'epochs': 10,
    'beam_size': 1
}


# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.char2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2char = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.size
                    self.idx2char[self.size] = char
                    self.size += 1

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, idxs):
        return ''.join([self.idx2char[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab, out_vocab):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        inp_vocab.build([p[0] for p in self.pairs])
        out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.char2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y)

def collate_fn(batch):
    x_batch, y_batch = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens)

# =======================
# Encoder and Decoder
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

# =======================
# Seq2Seq Model with Beam Search
# =======================
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)
        enc_hidden = self.encoder(src[0], src[1])

        if self.cell_type == "LSTM":
            h, c = enc_hidden
            h = self._match_layers(h)
            c = self._match_layers(c)
            dec_hidden = (h, c)
        else:
            dec_hidden = self._match_layers(enc_hidden)

        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

    def _match_layers(self, hidden):
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

# =======================
# Train & Eval
# =======================
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, total_correct, total_count = 0, 0, 0
    for src, trg, src_lens, _ in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model((src, src_lens), trg)
        output_dim = output.shape[-1]
        loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
        pred = output.argmax(2)
        correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
        total_correct += correct
        total_count += (trg[:, 1:] != 0).sum().item()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    acc = 100.0 * total_correct / total_count
    print(f"Train Loss: {total_loss / len(loader):.4f}, Acc: {acc:.2f}%")
    return total_loss / len(loader), acc

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_correct, total_count = 0, 0, 0
    with torch.no_grad():
        for src, trg, src_lens, _ in loader:
            src, trg = src.to(device), trg.to(device)
            output = model((src, src_lens), trg, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
            pred = output.argmax(2)
            correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
            total_correct += correct
            total_count += (trg[:, 1:] != 0).sum().item()
            total_loss += loss.item()
    acc = 100.0 * total_correct / total_count
    print(f"Val Loss: {total_loss / len(loader):.4f}, Acc: {acc:.2f}%")
    return total_loss / len(loader), acc

# =======================
# Main
# =======================
def main():
    wandb.init(config=default_config, project="dakshina-transliteration")
    config = wandb.config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inp_vocab, out_vocab = Vocab(), Vocab()
    train_path = "/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    dev_path = "/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
    train_data = TransliterationDataset(train_path, inp_vocab, out_vocab)
    dev_data = TransliterationDataset(dev_path, inp_vocab, out_vocab)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=32, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(inp_vocab.size, config.embedding_dim, config.hidden_dim, config.enc_layers, config.cell_type, config.dropout)
    decoder = Decoder(out_vocab.size, config.embedding_dim, config.hidden_dim, config.dec_layers, config.cell_type, config.dropout)
    model = Seq2Seq(encoder, decoder, config.enc_layers, config.dec_layers, config.cell_type, device).to(device)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(config.epochs):
        print(f"Epoch {epoch+1}/{config.epochs}")
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)
        wandb.log({"train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss, "val_acc": val_acc, "epoch": epoch+1})

# =======================

if __name__ == '__main__':
    sweep_id = wandb.sweep(sweep_config, project="dakshina-transliteration")
    wandb.agent(sweep_id, function=main,count=10)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: q6l1non5
Sweep URL: https://wandb.ai/manglesh_dl_ass3/dakshina-transliteration/sweeps/q6l1non5


[34m[1mwandb[0m: Agent Starting Run: t64a62c3 with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: Currently logged in as: [33mmanglesh_dlass3[0m ([33mmanglesh_dl_ass3[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/20
Train Loss: 1.9554, Acc: 42.42%
Val Loss: 1.7184, Acc: 46.60%
Epoch 2/20
Train Loss: 1.5413, Acc: 52.60%
Val Loss: 1.5627, Acc: 50.22%
Epoch 3/20
Train Loss: 1.3921, Acc: 56.85%
Val Loss: 1.4492, Acc: 53.92%
Epoch 4/20
Train Loss: 1.3088, Acc: 59.20%
Val Loss: 1.3864, Acc: 55.18%
Epoch 5/20
Train Loss: 1.2589, Acc: 60.54%
Val Loss: 1.3533, Acc: 56.45%
Epoch 6/20
Train Loss: 1.2227, Acc: 61.63%
Val Loss: 1.3185, Acc: 57.52%
Epoch 7/20
Train Loss: 1.1909, Acc: 62.43%
Val Loss: 1.2978, Acc: 57.83%
Epoch 8/20
Train Loss: 1.1754, Acc: 62.80%
Val Loss: 1.3139, Acc: 57.95%
Epoch 9/20
Train Loss: 1.1584, Acc: 63.26%
Val Loss: 1.3047, Acc: 58.31%
Epoch 10/20
Train Loss: 1.1579, Acc: 63.28%
Val Loss: 1.2658, Acc: 58.91%
Epoch 11/20
Train Loss: 1.1431, Acc: 63.65%
Val Loss: 1.2666, Acc: 58.62%
Epoch 12/20
Train Loss: 1.1346, Acc: 63.93%
Val Loss: 1.2665, Acc: 59.07%
Epoch 13/20
Train Loss: 1.1267, Acc: 64.30%
Val Loss: 1.2796, Acc: 58.97%
Epoch 14/20
Train Loss: 1.1270, Acc: 64.23%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▄▆▆▇▇▇▇████████████
train_loss,█▅▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▅▆▆▇▇▇▇█▇██▇████▇█
val_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,20.0
train_acc,64.44335
train_loss,1.11939
val_acc,58.94795
val_loss,1.27341


[34m[1mwandb[0m: Agent Starting Run: 43bhqd7m with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/20
Train Loss: 1.2089, Acc: 62.40%
Val Loss: 1.0044, Acc: 66.71%
Epoch 2/20
Train Loss: 0.7046, Acc: 77.35%
Val Loss: 0.9199, Acc: 70.11%
Epoch 3/20
Train Loss: 0.6128, Acc: 80.12%
Val Loss: 0.8909, Acc: 70.57%
Epoch 4/20
Train Loss: 0.5654, Acc: 81.55%
Val Loss: 0.8812, Acc: 71.34%
Epoch 5/20
Train Loss: 0.5351, Acc: 82.43%
Val Loss: 0.8653, Acc: 72.36%
Epoch 6/20
Train Loss: 0.5084, Acc: 83.16%
Val Loss: 0.8699, Acc: 71.60%
Epoch 7/20
Train Loss: 0.4888, Acc: 83.65%
Val Loss: 0.8993, Acc: 72.44%
Epoch 8/20
Train Loss: 0.4742, Acc: 84.01%
Val Loss: 0.8733, Acc: 72.06%
Epoch 9/20
Train Loss: 0.4585, Acc: 84.46%
Val Loss: 0.8709, Acc: 72.51%
Epoch 10/20
Train Loss: 0.4489, Acc: 84.60%
Val Loss: 0.8825, Acc: 72.18%
Epoch 11/20
Train Loss: 0.4411, Acc: 84.71%
Val Loss: 0.8838, Acc: 71.96%
Epoch 12/20
Train Loss: 0.4308, Acc: 85.04%
Val Loss: 0.8976, Acc: 72.58%
Epoch 13/20
Train Loss: 0.4262, Acc: 85.14%
Val Loss: 0.8819, Acc: 72.63%
Epoch 14/20
Train Loss: 0.4173, Acc: 85.30%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▅▆▇▇▇▇▇████████████
train_loss,█▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▅▆▇▇█▇█▇▇██▇█▇▇█▇█
val_loss,█▄▂▂▁▁▃▁▁▂▂▃▂▃▁▄▃▃▄▄

0,1
epoch,20.0
train_acc,86.02775
train_loss,0.38659
val_acc,72.84801
val_loss,0.92941


[34m[1mwandb[0m: Agent Starting Run: e7g40ay4 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 128


Epoch 1/20
Train Loss: 2.4500, Acc: 28.85%
Val Loss: 2.4601, Acc: 28.56%
Epoch 2/20
Train Loss: 2.3409, Acc: 31.34%
Val Loss: 2.4687, Acc: 28.86%
Epoch 3/20
Train Loss: 2.3143, Acc: 31.78%
Val Loss: 2.4092, Acc: 29.53%
Epoch 4/20
Train Loss: 2.2936, Acc: 32.03%
Val Loss: 2.4002, Acc: 29.70%
Epoch 5/20
Train Loss: 2.2704, Acc: 32.39%
Val Loss: 2.3694, Acc: 29.94%
Epoch 6/20
Train Loss: 2.2433, Acc: 33.27%
Val Loss: 2.3417, Acc: 30.40%
Epoch 7/20
Train Loss: 2.2204, Acc: 33.78%
Val Loss: 2.3271, Acc: 30.81%
Epoch 8/20
Train Loss: 2.2067, Acc: 34.07%
Val Loss: 2.3166, Acc: 31.04%
Epoch 9/20
Train Loss: 2.1921, Acc: 34.47%
Val Loss: 2.3318, Acc: 30.08%
Epoch 10/20
Train Loss: 2.1785, Acc: 34.84%
Val Loss: 2.2838, Acc: 31.51%
Epoch 11/20
Train Loss: 2.1686, Acc: 35.11%
Val Loss: 2.2786, Acc: 31.32%
Epoch 12/20
Train Loss: 2.1576, Acc: 35.43%
Val Loss: 2.2727, Acc: 31.99%
Epoch 13/20
Train Loss: 2.1538, Acc: 35.70%
Val Loss: 2.2715, Acc: 31.96%
Epoch 14/20
Train Loss: 2.1487, Acc: 35.84%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▃▄▄▄▅▅▆▆▆▇▇▇▇▇▇████
train_loss,█▆▅▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▂▃▃▃▄▅▅▄▆▆▇▇▆▆▆▇▇█▆
val_loss,██▆▆▅▄▃▃▄▂▂▂▂▂▂▂▁▁▁▂

0,1
epoch,20.0
train_acc,36.59542
train_loss,2.11748
val_acc,31.78149
val_loss,2.26875


[34m[1mwandb[0m: Agent Starting Run: c7io9ymw with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/20
Train Loss: 1.5128, Acc: 53.67%
Val Loss: 1.0247, Acc: 66.20%
Epoch 2/20
Train Loss: 0.7835, Acc: 74.70%
Val Loss: 0.8963, Acc: 70.52%
Epoch 3/20
Train Loss: 0.6661, Acc: 78.44%
Val Loss: 0.8549, Acc: 71.99%
Epoch 4/20
Train Loss: 0.6077, Acc: 80.13%
Val Loss: 0.8247, Acc: 73.12%
Epoch 5/20
Train Loss: 0.5660, Acc: 81.45%
Val Loss: 0.8111, Acc: 73.62%
Epoch 6/20
Train Loss: 0.5420, Acc: 82.15%
Val Loss: 0.7976, Acc: 73.75%
Epoch 7/20
Train Loss: 0.5224, Acc: 82.66%
Val Loss: 0.8217, Acc: 73.50%
Epoch 8/20
Train Loss: 0.4994, Acc: 83.36%
Val Loss: 0.8166, Acc: 74.01%
Epoch 9/20
Train Loss: 0.4875, Acc: 83.70%
Val Loss: 0.8052, Acc: 74.29%
Epoch 10/20
Train Loss: 0.4792, Acc: 83.79%
Val Loss: 0.7925, Acc: 74.62%
Epoch 11/20
Train Loss: 0.4688, Acc: 84.05%
Val Loss: 0.8034, Acc: 74.40%
Epoch 12/20
Train Loss: 0.4586, Acc: 84.36%
Val Loss: 0.8043, Acc: 74.71%
Epoch 13/20
Train Loss: 0.4470, Acc: 84.70%
Val Loss: 0.7969, Acc: 74.60%
Epoch 14/20
Train Loss: 0.4453, Acc: 84.69%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▆▆▇▇▇▇█████████████
train_loss,█▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▄▆▆▇▇▇▇▇█▇██▇██████
val_loss,█▄▃▂▂▁▂▂▁▁▁▁▁▁▂▁▂▂▁▂

0,1
epoch,20.0
train_acc,85.3929
train_loss,0.41227
val_acc,74.77214
val_loss,0.81614


[34m[1mwandb[0m: Agent Starting Run: c85kqv7k with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/20
Train Loss: 1.1142, Acc: 64.86%
Val Loss: 1.0020, Acc: 67.36%
Epoch 2/20
Train Loss: 0.6762, Acc: 78.09%
Val Loss: 0.8703, Acc: 71.29%
Epoch 3/20
Train Loss: 0.5970, Acc: 80.56%
Val Loss: 0.8354, Acc: 72.93%
Epoch 4/20
Train Loss: 0.5581, Acc: 81.70%
Val Loss: 0.8565, Acc: 72.84%
Epoch 5/20
Train Loss: 0.5316, Acc: 82.40%
Val Loss: 0.8047, Acc: 73.68%
Epoch 6/20
Train Loss: 0.5105, Acc: 83.05%
Val Loss: 0.8650, Acc: 73.44%
Epoch 7/20
Train Loss: 0.4946, Acc: 83.45%
Val Loss: 0.8216, Acc: 73.90%
Epoch 8/20
Train Loss: 0.4833, Acc: 83.69%
Val Loss: 0.8341, Acc: 74.03%
Epoch 9/20
Train Loss: 0.4706, Acc: 84.07%
Val Loss: 0.8223, Acc: 74.26%
Epoch 10/20
Train Loss: 0.4656, Acc: 84.13%
Val Loss: 0.8191, Acc: 74.33%
Epoch 11/20
Train Loss: 0.4578, Acc: 84.32%
Val Loss: 0.8504, Acc: 73.84%
Epoch 12/20
Train Loss: 0.4480, Acc: 84.67%
Val Loss: 0.8551, Acc: 74.53%
Epoch 13/20
Train Loss: 0.4431, Acc: 84.73%
Val Loss: 0.8584, Acc: 74.01%
Epoch 14/20
Train Loss: 0.4395, Acc: 84.85%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▆▆▇▇▇▇█████████████
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇▇███▇█▇███████
val_loss,█▃▂▃▁▃▂▂▂▂▃▃▃▂▂▃▂▂▃▂

0,1
epoch,20.0
train_acc,85.10375
train_loss,0.4244
val_acc,74.21371
val_loss,0.84367


[34m[1mwandb[0m: Agent Starting Run: uusrluco with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.2784, Acc: 60.16%
Val Loss: 0.9592, Acc: 67.37%
Epoch 2/15
Train Loss: 0.7300, Acc: 76.16%
Val Loss: 0.8863, Acc: 70.76%
Epoch 3/15
Train Loss: 0.6225, Acc: 79.61%
Val Loss: 0.8220, Acc: 73.20%
Epoch 4/15
Train Loss: 0.5709, Acc: 81.22%
Val Loss: 0.7955, Acc: 74.00%
Epoch 5/15
Train Loss: 0.5343, Acc: 82.26%
Val Loss: 0.8036, Acc: 74.23%
Epoch 6/15
Train Loss: 0.5075, Acc: 83.06%
Val Loss: 0.8155, Acc: 74.17%
Epoch 7/15
Train Loss: 0.4901, Acc: 83.58%
Val Loss: 0.8072, Acc: 74.55%
Epoch 8/15
Train Loss: 0.4723, Acc: 84.02%
Val Loss: 0.7788, Acc: 75.01%
Epoch 9/15
Train Loss: 0.4612, Acc: 84.25%
Val Loss: 0.7711, Acc: 74.84%
Epoch 10/15
Train Loss: 0.4470, Acc: 84.72%
Val Loss: 0.7934, Acc: 74.72%
Epoch 11/15
Train Loss: 0.4407, Acc: 84.77%
Val Loss: 0.7924, Acc: 74.49%
Epoch 12/15
Train Loss: 0.4334, Acc: 84.91%
Val Loss: 0.7796, Acc: 74.98%
Epoch 13/15
Train Loss: 0.4254, Acc: 85.13%
Val Loss: 0.7988, Acc: 75.26%
Epoch 14/15
Train Loss: 0.4186, Acc: 85.32%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▆▇▇▇▇███▇████
val_loss,█▅▃▂▂▃▂▁▁▂▂▁▂▂▂

0,1
epoch,15.0
train_acc,85.44449
train_loss,0.41267
val_acc,74.97758
val_loss,0.78508


[34m[1mwandb[0m: Agent Starting Run: auhmazwj with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.2531, Acc: 61.12%
Val Loss: 0.9923, Acc: 66.54%
Epoch 2/15
Train Loss: 0.7041, Acc: 77.19%
Val Loss: 0.9107, Acc: 70.33%
Epoch 3/15
Train Loss: 0.5975, Acc: 80.67%
Val Loss: 0.8480, Acc: 71.72%
Epoch 4/15
Train Loss: 0.5443, Acc: 82.15%
Val Loss: 0.8583, Acc: 72.36%
Epoch 5/15
Train Loss: 0.5079, Acc: 83.24%
Val Loss: 0.8587, Acc: 72.00%
Epoch 6/15
Train Loss: 0.4793, Acc: 84.04%
Val Loss: 0.8415, Acc: 72.81%
Epoch 7/15
Train Loss: 0.4575, Acc: 84.58%
Val Loss: 0.8272, Acc: 72.99%
Epoch 8/15
Train Loss: 0.4458, Acc: 84.80%
Val Loss: 0.8464, Acc: 72.68%
Epoch 9/15
Train Loss: 0.4274, Acc: 85.30%
Val Loss: 0.8649, Acc: 73.26%
Epoch 10/15
Train Loss: 0.4155, Acc: 85.53%
Val Loss: 0.8249, Acc: 72.98%
Epoch 11/15
Train Loss: 0.4023, Acc: 85.88%
Val Loss: 0.8622, Acc: 73.24%
Epoch 12/15
Train Loss: 0.3925, Acc: 86.01%
Val Loss: 0.8631, Acc: 73.50%
Epoch 13/15
Train Loss: 0.3876, Acc: 86.09%
Val Loss: 0.8518, Acc: 73.48%
Epoch 14/15
Train Loss: 0.3782, Acc: 86.32%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▅▆▇▆▇▇▇█▇█████
val_loss,█▅▂▂▂▂▁▂▃▁▃▃▂▁▃

0,1
epoch,15.0
train_acc,86.46588
train_loss,0.37031
val_acc,73.47299
val_loss,0.86123


[34m[1mwandb[0m: Agent Starting Run: 06wd9s9q with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.2404, Acc: 61.32%
Val Loss: 1.0299, Acc: 66.60%
Epoch 2/15
Train Loss: 0.7024, Acc: 77.37%
Val Loss: 0.9366, Acc: 69.63%
Epoch 3/15
Train Loss: 0.5955, Acc: 80.70%
Val Loss: 0.9018, Acc: 71.22%
Epoch 4/15
Train Loss: 0.5368, Acc: 82.48%
Val Loss: 0.8550, Acc: 72.43%
Epoch 5/15
Train Loss: 0.5084, Acc: 83.21%
Val Loss: 0.8227, Acc: 73.14%
Epoch 6/15
Train Loss: 0.4794, Acc: 84.07%
Val Loss: 0.8357, Acc: 72.74%
Epoch 7/15
Train Loss: 0.4645, Acc: 84.33%
Val Loss: 0.8322, Acc: 73.36%
Epoch 8/15
Train Loss: 0.4445, Acc: 84.87%
Val Loss: 0.8412, Acc: 72.96%
Epoch 9/15
Train Loss: 0.4277, Acc: 85.29%
Val Loss: 0.8504, Acc: 73.55%
Epoch 10/15
Train Loss: 0.4206, Acc: 85.30%
Val Loss: 0.8272, Acc: 73.47%
Epoch 11/15
Train Loss: 0.4021, Acc: 85.83%
Val Loss: 0.8659, Acc: 72.86%
Epoch 12/15
Train Loss: 0.3951, Acc: 86.00%
Val Loss: 0.8559, Acc: 72.98%
Epoch 13/15
Train Loss: 0.3908, Acc: 86.03%
Val Loss: 0.8343, Acc: 73.26%
Epoch 14/15
Train Loss: 0.3788, Acc: 86.35%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▄▆▇█▇█▇██▇▇███
val_loss,█▅▄▂▁▁▁▂▂▁▂▂▁▂▂

0,1
epoch,15.0
train_acc,86.26999
train_loss,0.37821
val_acc,73.61477
val_loss,0.84769


[34m[1mwandb[0m: Agent Starting Run: wwp9svdf with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.3025, Acc: 58.90%
Val Loss: 0.9886, Acc: 67.72%
Epoch 2/15
Train Loss: 0.8000, Acc: 74.11%
Val Loss: 0.8877, Acc: 71.44%
Epoch 3/15
Train Loss: 0.7076, Acc: 77.02%
Val Loss: 0.8823, Acc: 71.97%
Epoch 4/15
Train Loss: 0.6651, Acc: 78.39%
Val Loss: 0.8240, Acc: 72.54%
Epoch 5/15
Train Loss: 0.6355, Acc: 79.34%
Val Loss: 0.8382, Acc: 73.07%
Epoch 6/15
Train Loss: 0.6140, Acc: 80.00%
Val Loss: 0.8411, Acc: 73.22%
Epoch 7/15
Train Loss: 0.6091, Acc: 80.12%
Val Loss: 0.8301, Acc: 73.74%
Epoch 8/15
Train Loss: 0.5941, Acc: 80.48%
Val Loss: 0.8555, Acc: 73.52%
Epoch 9/15
Train Loss: 0.5839, Acc: 80.81%
Val Loss: 0.8194, Acc: 73.89%
Epoch 10/15
Train Loss: 0.5770, Acc: 80.96%
Val Loss: 0.7889, Acc: 74.27%
Epoch 11/15
Train Loss: 0.5721, Acc: 81.22%
Val Loss: 0.8292, Acc: 74.45%
Epoch 12/15
Train Loss: 0.5699, Acc: 81.22%
Val Loss: 0.8168, Acc: 73.96%
Epoch 13/15
Train Loss: 0.5597, Acc: 81.57%
Val Loss: 0.8346, Acc: 73.96%
Epoch 14/15
Train Loss: 0.5593, Acc: 81.63%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▅▅▆▆▇▇▇▇██▇▇██
val_loss,█▄▄▂▃▃▂▃▂▁▂▂▃▂▂

0,1
epoch,15.0
train_acc,81.38486
train_loss,0.56288
val_acc,74.55224
val_loss,0.80338


[34m[1mwandb[0m: Agent Starting Run: 1zswwpkt with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/20


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Train Loss: 1.2082, Acc: 62.37%
Val Loss: 1.0177, Acc: 66.39%
Epoch 2/20
Train Loss: 0.7004, Acc: 77.47%
Val Loss: 0.9014, Acc: 70.19%
Epoch 3/20
Train Loss: 0.6098, Acc: 80.10%
Val Loss: 0.8823, Acc: 71.00%
Epoch 4/20


In [7]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import random
import numpy as np

# =======================
# Fixed Best Configuration
# =======================
config = {
    'embedding_dim': 128,
    'hidden_dim': 256,
    'enc_layers': 3,
    'dec_layers': 3,
    'cell_type': 'LSTM',
    'dropout': 0.2,
    'epochs': 15,
    'beam_size': 5
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wandb.init(project="dakshina-transliteration", config=config)
config = wandb.config

# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.char2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2char = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.size
                    self.idx2char[self.size] = char
                    self.size += 1

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, idxs):
        return ''.join([self.idx2char[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab=None, out_vocab=None):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        if inp_vocab and out_vocab:
            inp_vocab.build([p[0] for p in self.pairs])
            out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.char2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y)

def collate_fn(batch):
    x_batch, y_batch = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens)

# =======================
# Encoder / Decoder / Seq2Seq
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)

        enc_hidden = self.encoder(src[0], src[1])

        if self.cell_type == "LSTM":
            h, c = enc_hidden
            h = self._match_layers(h)
            c = self._match_layers(c)
            dec_hidden = (h, c)
        else:
            dec_hidden = self._match_layers(enc_hidden)

        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if random.random() < teacher_forcing_ratio else top1
        return outputs

    def _match_layers(self, hidden):
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

# =======================
# Train / Evaluate
# =======================
def compute_accuracy(preds, targets):
    preds = preds.argmax(-1)
    correct = ((preds == targets) & (targets != 0)).sum().item()
    total = (targets != 0).sum().item()
    return correct / total

def train_eval(model, loader, criterion, optimizer, is_train):
    model.train() if is_train else model.eval()
    total_loss, total_acc = 0, 0
    with torch.set_grad_enabled(is_train):
        for src, trg, src_lens, _ in loader:
            src, trg = src.to(device), trg.to(device)
            if is_train: optimizer.zero_grad()
            output = model((src, src_lens), trg)
            loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
            acc = compute_accuracy(output[:, 1:], trg[:, 1:])
            if is_train:
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
            total_acc += acc
    return total_loss / len(loader), total_acc / len(loader)

# =======================
# Train and Save Best
# =======================
inp_vocab, out_vocab = Vocab(), Vocab()
train_set = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv", inp_vocab, out_vocab)
dev_set = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv", inp_vocab, out_vocab)
test_set = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv", inp_vocab, out_vocab)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_set, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, collate_fn=collate_fn)

encoder = Encoder(inp_vocab.size, config.embedding_dim, config.hidden_dim, config.enc_layers, config.cell_type, config.dropout)
decoder = Decoder(out_vocab.size, config.embedding_dim, config.hidden_dim, config.dec_layers, config.cell_type, config.dropout)
model = Seq2Seq(encoder, decoder, config.enc_layers, config.dec_layers, config.cell_type, device).to(device)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=0)

best_val_acc = 0.0
for epoch in range(config.epochs):
    train_loss, train_acc = train_eval(model, train_loader, criterion, optimizer, is_train=True)
    val_loss, val_acc = train_eval(model, dev_loader, criterion, optimizer, is_train=False)
    wandb.log({"train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss, "val_acc": val_acc, "epoch": epoch})
    print(f"Epoch {epoch}: Train Loss={train_loss:.4f} Acc={train_acc:.4f}, Val Loss={val_loss:.4f} Acc={val_acc:.4f}")
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")

# =======================
# Test Evaluation
# =======================
model.load_state_dict(torch.load("best_model.pt"))
model.eval()

os.makedirs("predictions_vanilla", exist_ok=True)
with open("predictions_vanilla/preds.txt", "w", encoding="utf-8") as f:
    correct, total = 0, 0
    samples = []
    for src, trg, src_lens, _ in test_loader:
        src, trg = src.to(device), trg.to(device)
        output = model((src, src_lens), trg, teacher_forcing_ratio=0.0)
        pred_idxs = output.argmax(-1)[0].tolist()
        true_idxs = trg[0].tolist()
        pred_str = out_vocab.decode(pred_idxs)
        true_str = out_vocab.decode(true_idxs)
        input_str = inp_vocab.decode(src[0].tolist())
        f.write(f"{input_str}\t{true_str}\t{pred_str}\n")
        if pred_str == true_str:
            correct += 1
        total += 1
        samples.append((input_str, true_str, pred_str))
    test_acc = correct / total
    wandb.log({"test_accuracy": test_acc})
    print("Test Accuracy:", test_acc)

# Sample Grid (for visualization)
print("\nSample Predictions:")
print("{:<20} | {:<20} | {:<20}".format("Input", "Reference", "Prediction"))
print("=" * 65)
for s in random.sample(samples, 10):
    print("{:<20} | {:<20} | {:<20}".format(*s))


Epoch 0: Train Loss=1.2103 Acc=0.6221, Val Loss=0.7996 Acc=0.7455
Epoch 1: Train Loss=0.7029 Acc=0.7740, Val Loss=0.6784 Acc=0.7765
Epoch 2: Train Loss=0.6081 Acc=0.8035, Val Loss=0.6208 Acc=0.8025
Epoch 3: Train Loss=0.5603 Acc=0.8173, Val Loss=0.6208 Acc=0.8002
Epoch 4: Train Loss=0.5335 Acc=0.8238, Val Loss=0.6086 Acc=0.8038
Epoch 5: Train Loss=0.5105 Acc=0.8311, Val Loss=0.6006 Acc=0.8093
Epoch 6: Train Loss=0.4914 Acc=0.8355, Val Loss=0.5927 Acc=0.8080
Epoch 7: Train Loss=0.4737 Acc=0.8404, Val Loss=0.6052 Acc=0.8078
Epoch 8: Train Loss=0.4590 Acc=0.8435, Val Loss=0.6019 Acc=0.8034
Epoch 9: Train Loss=0.4495 Acc=0.8457, Val Loss=0.5979 Acc=0.8129
Epoch 10: Train Loss=0.4416 Acc=0.8482, Val Loss=0.6096 Acc=0.8070
Epoch 11: Train Loss=0.4337 Acc=0.8492, Val Loss=0.6045 Acc=0.8113
Epoch 12: Train Loss=0.4265 Acc=0.8510, Val Loss=0.5946 Acc=0.8153
Epoch 13: Train Loss=0.4222 Acc=0.8518, Val Loss=0.6018 Acc=0.8093
Epoch 14: Train Loss=0.4120 Acc=0.8546, Val Loss=0.6180 Acc=0.8116
Epoch

In [18]:
# =======================
# Imports and Sweep Config
# =======================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import os
import math
from tqdm import tqdm

os.makedirs("predictions_vanilla", exist_ok=True)

best_config = {
    'embedding_dim': 128,
    'hidden_dim': 256,
    'enc_layers': 3,
    'dec_layers': 3,
    'cell_type': 'LSTM',
    'dropout': 0.2,
    'epochs': 15,
    'beam_size': 5
}

# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.char2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2char = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.size
                    self.idx2char[self.size] = char
                    self.size += 1

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, idxs):
        return ''.join([self.idx2char[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab, out_vocab, is_test=False):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        if not is_test:
            inp_vocab.build([p[0] for p in self.pairs])
            out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.char2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y), lat, dev

def collate_fn(batch):
    x_batch, y_batch, lat, dev = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens), lat, dev

# =======================
# Encoder, Decoder, Seq2Seq
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)
        enc_hidden = self.encoder(src[0], src[1])
        if self.cell_type == "LSTM":
            h, c = enc_hidden
            h = self._match_layers(h)
            c = self._match_layers(c)
            dec_hidden = (h, c)
        else:
            dec_hidden = self._match_layers(enc_hidden)
        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

    def _match_layers(self, hidden):
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

    def predict(self, src_tensor, src_len, max_len=30):
        self.eval()
        with torch.no_grad():
            enc_hidden = self.encoder(src_tensor.unsqueeze(0), torch.tensor([src_len]))
            if self.cell_type == "LSTM":
                h, c = enc_hidden
                h = self._match_layers(h)
                c = self._match_layers(c)
                dec_hidden = (h, c)
            else:
                dec_hidden = self._match_layers(enc_hidden)
            input_token = torch.tensor([2]).to(self.device)  # <sos>
            output_seq = []
            for _ in range(max_len):
                output, dec_hidden = self.decoder(input_token, dec_hidden)
                top1 = output.argmax(1)
                if top1.item() == 2: break  # <eos>
                output_seq.append(top1.item())
                input_token = top1
        return output_seq

# =======================
# Train & Eval
# =======================
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, total_correct, total_count = 0, 0, 0
    for src, trg, src_lens, _, _, _ in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model((src, src_lens), trg)
        loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
        pred = output.argmax(2)
        correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
        total_correct += correct
        total_count += (trg[:, 1:] != 0).sum().item()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader), 100.0 * total_correct / total_count

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_correct, total_count = 0, 0, 0
    with torch.no_grad():
        for src, trg, src_lens, _, _, _ in loader:
            src, trg = src.to(device), trg.to(device)
            output = model((src, src_lens), trg, teacher_forcing_ratio=0)
            loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
            pred = output.argmax(2)
            correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
            total_correct += correct
            total_count += (trg[:, 1:] != 0).sum().item()
            total_loss += loss.item()
    return total_loss / len(loader), 100.0 * total_correct / total_count

# =======================
# Main
# =======================
# =======================
# Main
# =======================
def main():
    wandb.init(project="dakshina-transliteration")
    config = best_config  # Use best_config directly and avoid wandb.config

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inp_vocab, out_vocab = Vocab(), Vocab()
    train_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv", inp_vocab, out_vocab)
    dev_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv", inp_vocab, out_vocab)
    test_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv", inp_vocab, out_vocab, is_test=True)

    train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=32, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(inp_vocab.size, config['embedding_dim'], config['hidden_dim'],
                      config['enc_layers'], config['cell_type'], config['dropout'])
    decoder = Decoder(out_vocab.size, config['embedding_dim'], config['hidden_dim'],
                      config['dec_layers'], config['cell_type'], config['dropout'])
    model = Seq2Seq(encoder, decoder, config['enc_layers'], config['dec_layers'], config['cell_type'], device).to(device)

    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_val_acc = 0.0
    for epoch in range(config['epochs']):
        print(f"Epoch {epoch+1}/{config['epochs']}")
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)

        print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%")
        print(f"Val   Loss: {val_loss:.4f}, Accuracy: {val_acc:.2f}%")

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc
        })

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "best_model.pth")
            print("✅ Best model saved.")

    # Evaluation on test set
    print("\n🔍 Evaluating on test set with best model:")
    model.load_state_dict(torch.load("best_model.pth"))
    model.eval()
    total_correct, total_count = 0, 0

    with open("predictions_vanilla/test_predictions.txt", "w", encoding="utf-8") as f:
        for src, _, src_lens, _, lat, gold in test_loader:
            src = src.to(device)
            pred_ids = model.predict(src[0], src_lens[0].item())
            pred = out_vocab.decode(pred_ids)
            f.write(f"{lat[0]}\t{gold[0]}\t{pred}\n")
            if pred == gold[0]:
                total_correct += 1
            total_count += 1

    test_acc = 100.0 * total_correct / total_count
    print(f"📊 Test Accuracy: {test_acc:.2f}%")
    wandb.log({"test_acc": test_acc})

if __name__ == "__main__":
    main()




0,1
train_acc,▁▆▇███
train_loss,█▃▂▂▁▁
val_acc,▁▆▇▇██
val_loss,█▄▂▂▁▂

0,1
train_acc,83.12333
train_loss,0.50879
val_acc,72.13333
val_loss,0.86891


Epoch 1/15
Train Loss: 1.2916, Accuracy: 60.52%
Val   Loss: 0.9746, Accuracy: 68.25%
✅ Best model saved.
Epoch 2/15
Train Loss: 0.6719, Accuracy: 78.24%
Val   Loss: 0.8701, Accuracy: 71.92%
✅ Best model saved.
Epoch 3/15
Train Loss: 0.5746, Accuracy: 81.27%
Val   Loss: 0.8041, Accuracy: 73.33%
✅ Best model saved.
Epoch 4/15
Train Loss: 0.5236, Accuracy: 82.72%
Val   Loss: 0.8114, Accuracy: 73.86%
✅ Best model saved.
Epoch 5/15
Train Loss: 0.4914, Accuracy: 83.66%
Val   Loss: 0.8050, Accuracy: 74.60%
✅ Best model saved.
Epoch 6/15
Train Loss: 0.4701, Accuracy: 84.21%
Val   Loss: 0.8052, Accuracy: 74.57%
Epoch 7/15
Train Loss: 0.4553, Accuracy: 84.49%
Val   Loss: 0.8100, Accuracy: 74.55%
Epoch 8/15
Train Loss: 0.4394, Accuracy: 84.89%
Val   Loss: 0.7896, Accuracy: 74.93%
✅ Best model saved.
Epoch 9/15
Train Loss: 0.4285, Accuracy: 85.05%
Val   Loss: 0.7989, Accuracy: 74.77%
Epoch 10/15
Train Loss: 0.4173, Accuracy: 85.34%
Val   Loss: 0.7853, Accuracy: 75.38%
✅ Best model saved.
Epoch 11/