In [None]:
  !pip install wandb


In [None]:
import wandb


In [None]:
!wandb login 58a0b576fd5221cd0d63b154deaabbe535e853c6

CHARACTER LEVEL WITHOUT ATTENTION

In [4]:
# =======================
# Imports and Sweep Config
# =======================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import os
import math

sweep_config = {
    'method': 'bayes',
    'metric': {
        'name': 'val_loss',
        'goal': 'minimize'
    },
    'parameters': {
        'embedding_dim': {'values': [64,128,256]},
        'hidden_dim': {'values': [64, 128,256 ]},
        'enc_layers': {'values': [1, 2,3]},
        'dec_layers': {'values': [1, 2,3]},
        'cell_type': {'values': ['GRU', 'LSTM', 'RNN']},
        'dropout': {'values': [0.2, 0.3,0.5]},
        'epochs': {'values': [20, 15]},
        'beam_size': {'values': [1, 3, 5]},
        'batch_size': {'values': [64, 128, 256]},
        'learning_rate': {'values': [0.001, 0.0005, 0.0001]}
    }
}

default_config = {
    'embedding_dim': 32,
    'hidden_dim': 64,
    'enc_layers': 1,
    'dec_layers': 1,
    'cell_type': 'LSTM',
    'dropout': 0.2,
    'epochs': 10,
    'beam_size': 1,
    'batch_size': 64,
    'learning_rate': 0.001
}


# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.char2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2char = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.size
                    self.idx2char[self.size] = char
                    self.size += 1

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, idxs):
        return ''.join([self.idx2char[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab, out_vocab):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        inp_vocab.build([p[0] for p in self.pairs])
        out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.char2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y)

def collate_fn(batch):
    x_batch, y_batch = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens)

# =======================
# Encoder and Decoder
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

# =======================
# Seq2Seq Model with Beam Search
# =======================
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)
        enc_hidden = self.encoder(src[0], src[1])

        if self.cell_type == "LSTM":
            h, c = enc_hidden
            h = self._match_layers(h)
            c = self._match_layers(c)
            dec_hidden = (h, c)
        else:
            dec_hidden = self._match_layers(enc_hidden)

        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

    def _match_layers(self, hidden):
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

# =======================
# Train & Eval
# =======================
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, total_correct, total_count = 0, 0, 0
    for src, trg, src_lens, _ in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model((src, src_lens), trg)
        output_dim = output.shape[-1]
        loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
        pred = output.argmax(2)
        correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
        total_correct += correct
        total_count += (trg[:, 1:] != 0).sum().item()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    acc = 100.0 * total_correct / total_count
    print(f"Train Loss: {total_loss / len(loader):.4f}, Acc: {acc:.2f}%")
    return total_loss / len(loader), acc

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_correct, total_count = 0, 0, 0
    with torch.no_grad():
        for src, trg, src_lens, _ in loader:
            src, trg = src.to(device), trg.to(device)
            output = model((src, src_lens), trg, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
            pred = output.argmax(2)
            correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
            total_correct += correct
            total_count += (trg[:, 1:] != 0).sum().item()
            total_loss += loss.item()
    acc = 100.0 * total_correct / total_count
    print(f"Val Loss: {total_loss / len(loader):.4f}, Acc: {acc:.2f}%")
    return total_loss / len(loader), acc

# =======================
# Main
# =======================
def main():
    wandb.init(config=default_config, project="dakshina-transliteration")
    config = wandb.config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inp_vocab, out_vocab = Vocab(), Vocab()
    train_path = "/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    dev_path = "/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
    train_data = TransliterationDataset(train_path, inp_vocab, out_vocab)
    dev_data = TransliterationDataset(dev_path, inp_vocab, out_vocab)
    
    # Use config.batch_size for DataLoader
    train_loader = DataLoader(train_data, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(inp_vocab.size, config.embedding_dim, config.hidden_dim, config.enc_layers, config.cell_type, config.dropout)
    decoder = Decoder(out_vocab.size, config.embedding_dim, config.hidden_dim, config.dec_layers, config.cell_type, config.dropout)
    model = Seq2Seq(encoder, decoder, config.enc_layers, config.dec_layers, config.cell_type, device).to(device)
    
    # Use config.learning_rate for optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(config.epochs):
        print(f"Epoch {epoch+1}/{config.epochs}")
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)
        wandb.log({"train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss, "val_acc": val_acc, "epoch": epoch+1})

# =======================

if __name__ == '__main__':
    sweep_id = wandb.sweep(sweep_config, project="dakshina-translit")
    wandb.agent(sweep_id, function=main,count=30)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: q6l1non5
Sweep URL: https://wandb.ai/manglesh_dl_ass3/dakshina-transliteration/sweeps/q6l1non5


[34m[1mwandb[0m: Agent Starting Run: t64a62c3 with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: Currently logged in as: [33mmanglesh_dlass3[0m ([33mmanglesh_dl_ass3[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/20
Train Loss: 1.9554, Acc: 42.42%
Val Loss: 1.7184, Acc: 46.60%
Epoch 2/20
Train Loss: 1.5413, Acc: 52.60%
Val Loss: 1.5627, Acc: 50.22%
Epoch 3/20
Train Loss: 1.3921, Acc: 56.85%
Val Loss: 1.4492, Acc: 53.92%
Epoch 4/20
Train Loss: 1.3088, Acc: 59.20%
Val Loss: 1.3864, Acc: 55.18%
Epoch 5/20
Train Loss: 1.2589, Acc: 60.54%
Val Loss: 1.3533, Acc: 56.45%
Epoch 6/20
Train Loss: 1.2227, Acc: 61.63%
Val Loss: 1.3185, Acc: 57.52%
Epoch 7/20
Train Loss: 1.1909, Acc: 62.43%
Val Loss: 1.2978, Acc: 57.83%
Epoch 8/20
Train Loss: 1.1754, Acc: 62.80%
Val Loss: 1.3139, Acc: 57.95%
Epoch 9/20
Train Loss: 1.1584, Acc: 63.26%
Val Loss: 1.3047, Acc: 58.31%
Epoch 10/20
Train Loss: 1.1579, Acc: 63.28%
Val Loss: 1.2658, Acc: 58.91%
Epoch 11/20
Train Loss: 1.1431, Acc: 63.65%
Val Loss: 1.2666, Acc: 58.62%
Epoch 12/20
Train Loss: 1.1346, Acc: 63.93%
Val Loss: 1.2665, Acc: 59.07%
Epoch 13/20
Train Loss: 1.1267, Acc: 64.30%
Val Loss: 1.2796, Acc: 58.97%
Epoch 14/20
Train Loss: 1.1270, Acc: 64.23%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▄▆▆▇▇▇▇████████████
train_loss,█▅▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▅▆▆▇▇▇▇█▇██▇████▇█
val_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,20.0
train_acc,64.44335
train_loss,1.11939
val_acc,58.94795
val_loss,1.27341


[34m[1mwandb[0m: Agent Starting Run: 43bhqd7m with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/20
Train Loss: 1.2089, Acc: 62.40%
Val Loss: 1.0044, Acc: 66.71%
Epoch 2/20
Train Loss: 0.7046, Acc: 77.35%
Val Loss: 0.9199, Acc: 70.11%
Epoch 3/20
Train Loss: 0.6128, Acc: 80.12%
Val Loss: 0.8909, Acc: 70.57%
Epoch 4/20
Train Loss: 0.5654, Acc: 81.55%
Val Loss: 0.8812, Acc: 71.34%
Epoch 5/20
Train Loss: 0.5351, Acc: 82.43%
Val Loss: 0.8653, Acc: 72.36%
Epoch 6/20
Train Loss: 0.5084, Acc: 83.16%
Val Loss: 0.8699, Acc: 71.60%
Epoch 7/20
Train Loss: 0.4888, Acc: 83.65%
Val Loss: 0.8993, Acc: 72.44%
Epoch 8/20
Train Loss: 0.4742, Acc: 84.01%
Val Loss: 0.8733, Acc: 72.06%
Epoch 9/20
Train Loss: 0.4585, Acc: 84.46%
Val Loss: 0.8709, Acc: 72.51%
Epoch 10/20
Train Loss: 0.4489, Acc: 84.60%
Val Loss: 0.8825, Acc: 72.18%
Epoch 11/20
Train Loss: 0.4411, Acc: 84.71%
Val Loss: 0.8838, Acc: 71.96%
Epoch 12/20
Train Loss: 0.4308, Acc: 85.04%
Val Loss: 0.8976, Acc: 72.58%
Epoch 13/20
Train Loss: 0.4262, Acc: 85.14%
Val Loss: 0.8819, Acc: 72.63%
Epoch 14/20
Train Loss: 0.4173, Acc: 85.30%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▅▆▇▇▇▇▇████████████
train_loss,█▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▅▆▇▇█▇█▇▇██▇█▇▇█▇█
val_loss,█▄▂▂▁▁▃▁▁▂▂▃▂▃▁▄▃▃▄▄

0,1
epoch,20.0
train_acc,86.02775
train_loss,0.38659
val_acc,72.84801
val_loss,0.92941


[34m[1mwandb[0m: Agent Starting Run: e7g40ay4 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 128


Epoch 1/20
Train Loss: 2.4500, Acc: 28.85%
Val Loss: 2.4601, Acc: 28.56%
Epoch 2/20
Train Loss: 2.3409, Acc: 31.34%
Val Loss: 2.4687, Acc: 28.86%
Epoch 3/20
Train Loss: 2.3143, Acc: 31.78%
Val Loss: 2.4092, Acc: 29.53%
Epoch 4/20
Train Loss: 2.2936, Acc: 32.03%
Val Loss: 2.4002, Acc: 29.70%
Epoch 5/20
Train Loss: 2.2704, Acc: 32.39%
Val Loss: 2.3694, Acc: 29.94%
Epoch 6/20
Train Loss: 2.2433, Acc: 33.27%
Val Loss: 2.3417, Acc: 30.40%
Epoch 7/20
Train Loss: 2.2204, Acc: 33.78%
Val Loss: 2.3271, Acc: 30.81%
Epoch 8/20
Train Loss: 2.2067, Acc: 34.07%
Val Loss: 2.3166, Acc: 31.04%
Epoch 9/20
Train Loss: 2.1921, Acc: 34.47%
Val Loss: 2.3318, Acc: 30.08%
Epoch 10/20
Train Loss: 2.1785, Acc: 34.84%
Val Loss: 2.2838, Acc: 31.51%
Epoch 11/20
Train Loss: 2.1686, Acc: 35.11%
Val Loss: 2.2786, Acc: 31.32%
Epoch 12/20
Train Loss: 2.1576, Acc: 35.43%
Val Loss: 2.2727, Acc: 31.99%
Epoch 13/20
Train Loss: 2.1538, Acc: 35.70%
Val Loss: 2.2715, Acc: 31.96%
Epoch 14/20
Train Loss: 2.1487, Acc: 35.84%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▃▄▄▄▅▅▆▆▆▇▇▇▇▇▇████
train_loss,█▆▅▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁
val_acc,▁▂▃▃▃▄▅▅▄▆▆▇▇▆▆▆▇▇█▆
val_loss,██▆▆▅▄▃▃▄▂▂▂▂▂▂▂▁▁▁▂

0,1
epoch,20.0
train_acc,36.59542
train_loss,2.11748
val_acc,31.78149
val_loss,2.26875


[34m[1mwandb[0m: Agent Starting Run: c7io9ymw with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/20
Train Loss: 1.5128, Acc: 53.67%
Val Loss: 1.0247, Acc: 66.20%
Epoch 2/20
Train Loss: 0.7835, Acc: 74.70%
Val Loss: 0.8963, Acc: 70.52%
Epoch 3/20
Train Loss: 0.6661, Acc: 78.44%
Val Loss: 0.8549, Acc: 71.99%
Epoch 4/20
Train Loss: 0.6077, Acc: 80.13%
Val Loss: 0.8247, Acc: 73.12%
Epoch 5/20
Train Loss: 0.5660, Acc: 81.45%
Val Loss: 0.8111, Acc: 73.62%
Epoch 6/20
Train Loss: 0.5420, Acc: 82.15%
Val Loss: 0.7976, Acc: 73.75%
Epoch 7/20
Train Loss: 0.5224, Acc: 82.66%
Val Loss: 0.8217, Acc: 73.50%
Epoch 8/20
Train Loss: 0.4994, Acc: 83.36%
Val Loss: 0.8166, Acc: 74.01%
Epoch 9/20
Train Loss: 0.4875, Acc: 83.70%
Val Loss: 0.8052, Acc: 74.29%
Epoch 10/20
Train Loss: 0.4792, Acc: 83.79%
Val Loss: 0.7925, Acc: 74.62%
Epoch 11/20
Train Loss: 0.4688, Acc: 84.05%
Val Loss: 0.8034, Acc: 74.40%
Epoch 12/20
Train Loss: 0.4586, Acc: 84.36%
Val Loss: 0.8043, Acc: 74.71%
Epoch 13/20
Train Loss: 0.4470, Acc: 84.70%
Val Loss: 0.7969, Acc: 74.60%
Epoch 14/20
Train Loss: 0.4453, Acc: 84.69%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▆▆▇▇▇▇█████████████
train_loss,█▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▄▆▆▇▇▇▇▇█▇██▇██████
val_loss,█▄▃▂▂▁▂▂▁▁▁▁▁▁▂▁▂▂▁▂

0,1
epoch,20.0
train_acc,85.3929
train_loss,0.41227
val_acc,74.77214
val_loss,0.81614


[34m[1mwandb[0m: Agent Starting Run: c85kqv7k with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/20
Train Loss: 1.1142, Acc: 64.86%
Val Loss: 1.0020, Acc: 67.36%
Epoch 2/20
Train Loss: 0.6762, Acc: 78.09%
Val Loss: 0.8703, Acc: 71.29%
Epoch 3/20
Train Loss: 0.5970, Acc: 80.56%
Val Loss: 0.8354, Acc: 72.93%
Epoch 4/20
Train Loss: 0.5581, Acc: 81.70%
Val Loss: 0.8565, Acc: 72.84%
Epoch 5/20
Train Loss: 0.5316, Acc: 82.40%
Val Loss: 0.8047, Acc: 73.68%
Epoch 6/20
Train Loss: 0.5105, Acc: 83.05%
Val Loss: 0.8650, Acc: 73.44%
Epoch 7/20
Train Loss: 0.4946, Acc: 83.45%
Val Loss: 0.8216, Acc: 73.90%
Epoch 8/20
Train Loss: 0.4833, Acc: 83.69%
Val Loss: 0.8341, Acc: 74.03%
Epoch 9/20
Train Loss: 0.4706, Acc: 84.07%
Val Loss: 0.8223, Acc: 74.26%
Epoch 10/20
Train Loss: 0.4656, Acc: 84.13%
Val Loss: 0.8191, Acc: 74.33%
Epoch 11/20
Train Loss: 0.4578, Acc: 84.32%
Val Loss: 0.8504, Acc: 73.84%
Epoch 12/20
Train Loss: 0.4480, Acc: 84.67%
Val Loss: 0.8551, Acc: 74.53%
Epoch 13/20
Train Loss: 0.4431, Acc: 84.73%
Val Loss: 0.8584, Acc: 74.01%
Epoch 14/20
Train Loss: 0.4395, Acc: 84.85%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▆▆▇▇▇▇█████████████
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇▇███▇█▇███████
val_loss,█▃▂▃▁▃▂▂▂▂▃▃▃▂▂▃▂▂▃▂

0,1
epoch,20.0
train_acc,85.10375
train_loss,0.4244
val_acc,74.21371
val_loss,0.84367


[34m[1mwandb[0m: Agent Starting Run: uusrluco with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.2784, Acc: 60.16%
Val Loss: 0.9592, Acc: 67.37%
Epoch 2/15
Train Loss: 0.7300, Acc: 76.16%
Val Loss: 0.8863, Acc: 70.76%
Epoch 3/15
Train Loss: 0.6225, Acc: 79.61%
Val Loss: 0.8220, Acc: 73.20%
Epoch 4/15
Train Loss: 0.5709, Acc: 81.22%
Val Loss: 0.7955, Acc: 74.00%
Epoch 5/15
Train Loss: 0.5343, Acc: 82.26%
Val Loss: 0.8036, Acc: 74.23%
Epoch 6/15
Train Loss: 0.5075, Acc: 83.06%
Val Loss: 0.8155, Acc: 74.17%
Epoch 7/15
Train Loss: 0.4901, Acc: 83.58%
Val Loss: 0.8072, Acc: 74.55%
Epoch 8/15
Train Loss: 0.4723, Acc: 84.02%
Val Loss: 0.7788, Acc: 75.01%
Epoch 9/15
Train Loss: 0.4612, Acc: 84.25%
Val Loss: 0.7711, Acc: 74.84%
Epoch 10/15
Train Loss: 0.4470, Acc: 84.72%
Val Loss: 0.7934, Acc: 74.72%
Epoch 11/15
Train Loss: 0.4407, Acc: 84.77%
Val Loss: 0.7924, Acc: 74.49%
Epoch 12/15
Train Loss: 0.4334, Acc: 84.91%
Val Loss: 0.7796, Acc: 74.98%
Epoch 13/15
Train Loss: 0.4254, Acc: 85.13%
Val Loss: 0.7988, Acc: 75.26%
Epoch 14/15
Train Loss: 0.4186, Acc: 85.32%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▆▇▇▇▇███▇████
val_loss,█▅▃▂▂▃▂▁▁▂▂▁▂▂▂

0,1
epoch,15.0
train_acc,85.44449
train_loss,0.41267
val_acc,74.97758
val_loss,0.78508


[34m[1mwandb[0m: Agent Starting Run: auhmazwj with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.2531, Acc: 61.12%
Val Loss: 0.9923, Acc: 66.54%
Epoch 2/15
Train Loss: 0.7041, Acc: 77.19%
Val Loss: 0.9107, Acc: 70.33%
Epoch 3/15
Train Loss: 0.5975, Acc: 80.67%
Val Loss: 0.8480, Acc: 71.72%
Epoch 4/15
Train Loss: 0.5443, Acc: 82.15%
Val Loss: 0.8583, Acc: 72.36%
Epoch 5/15
Train Loss: 0.5079, Acc: 83.24%
Val Loss: 0.8587, Acc: 72.00%
Epoch 6/15
Train Loss: 0.4793, Acc: 84.04%
Val Loss: 0.8415, Acc: 72.81%
Epoch 7/15
Train Loss: 0.4575, Acc: 84.58%
Val Loss: 0.8272, Acc: 72.99%
Epoch 8/15
Train Loss: 0.4458, Acc: 84.80%
Val Loss: 0.8464, Acc: 72.68%
Epoch 9/15
Train Loss: 0.4274, Acc: 85.30%
Val Loss: 0.8649, Acc: 73.26%
Epoch 10/15
Train Loss: 0.4155, Acc: 85.53%
Val Loss: 0.8249, Acc: 72.98%
Epoch 11/15
Train Loss: 0.4023, Acc: 85.88%
Val Loss: 0.8622, Acc: 73.24%
Epoch 12/15
Train Loss: 0.3925, Acc: 86.01%
Val Loss: 0.8631, Acc: 73.50%
Epoch 13/15
Train Loss: 0.3876, Acc: 86.09%
Val Loss: 0.8518, Acc: 73.48%
Epoch 14/15
Train Loss: 0.3782, Acc: 86.32%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▅▆▇▆▇▇▇█▇█████
val_loss,█▅▂▂▂▂▁▂▃▁▃▃▂▁▃

0,1
epoch,15.0
train_acc,86.46588
train_loss,0.37031
val_acc,73.47299
val_loss,0.86123


[34m[1mwandb[0m: Agent Starting Run: 06wd9s9q with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.2404, Acc: 61.32%
Val Loss: 1.0299, Acc: 66.60%
Epoch 2/15
Train Loss: 0.7024, Acc: 77.37%
Val Loss: 0.9366, Acc: 69.63%
Epoch 3/15
Train Loss: 0.5955, Acc: 80.70%
Val Loss: 0.9018, Acc: 71.22%
Epoch 4/15
Train Loss: 0.5368, Acc: 82.48%
Val Loss: 0.8550, Acc: 72.43%
Epoch 5/15
Train Loss: 0.5084, Acc: 83.21%
Val Loss: 0.8227, Acc: 73.14%
Epoch 6/15
Train Loss: 0.4794, Acc: 84.07%
Val Loss: 0.8357, Acc: 72.74%
Epoch 7/15
Train Loss: 0.4645, Acc: 84.33%
Val Loss: 0.8322, Acc: 73.36%
Epoch 8/15
Train Loss: 0.4445, Acc: 84.87%
Val Loss: 0.8412, Acc: 72.96%
Epoch 9/15
Train Loss: 0.4277, Acc: 85.29%
Val Loss: 0.8504, Acc: 73.55%
Epoch 10/15
Train Loss: 0.4206, Acc: 85.30%
Val Loss: 0.8272, Acc: 73.47%
Epoch 11/15
Train Loss: 0.4021, Acc: 85.83%
Val Loss: 0.8659, Acc: 72.86%
Epoch 12/15
Train Loss: 0.3951, Acc: 86.00%
Val Loss: 0.8559, Acc: 72.98%
Epoch 13/15
Train Loss: 0.3908, Acc: 86.03%
Val Loss: 0.8343, Acc: 73.26%
Epoch 14/15
Train Loss: 0.3788, Acc: 86.35%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▄▆▇█▇█▇██▇▇███
val_loss,█▅▄▂▁▁▁▂▂▁▂▂▁▂▂

0,1
epoch,15.0
train_acc,86.26999
train_loss,0.37821
val_acc,73.61477
val_loss,0.84769


[34m[1mwandb[0m: Agent Starting Run: wwp9svdf with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/15
Train Loss: 1.3025, Acc: 58.90%
Val Loss: 0.9886, Acc: 67.72%
Epoch 2/15
Train Loss: 0.8000, Acc: 74.11%
Val Loss: 0.8877, Acc: 71.44%
Epoch 3/15
Train Loss: 0.7076, Acc: 77.02%
Val Loss: 0.8823, Acc: 71.97%
Epoch 4/15
Train Loss: 0.6651, Acc: 78.39%
Val Loss: 0.8240, Acc: 72.54%
Epoch 5/15
Train Loss: 0.6355, Acc: 79.34%
Val Loss: 0.8382, Acc: 73.07%
Epoch 6/15
Train Loss: 0.6140, Acc: 80.00%
Val Loss: 0.8411, Acc: 73.22%
Epoch 7/15
Train Loss: 0.6091, Acc: 80.12%
Val Loss: 0.8301, Acc: 73.74%
Epoch 8/15
Train Loss: 0.5941, Acc: 80.48%
Val Loss: 0.8555, Acc: 73.52%
Epoch 9/15
Train Loss: 0.5839, Acc: 80.81%
Val Loss: 0.8194, Acc: 73.89%
Epoch 10/15
Train Loss: 0.5770, Acc: 80.96%
Val Loss: 0.7889, Acc: 74.27%
Epoch 11/15
Train Loss: 0.5721, Acc: 81.22%
Val Loss: 0.8292, Acc: 74.45%
Epoch 12/15
Train Loss: 0.5699, Acc: 81.22%
Val Loss: 0.8168, Acc: 73.96%
Epoch 13/15
Train Loss: 0.5597, Acc: 81.57%
Val Loss: 0.8346, Acc: 73.96%
Epoch 14/15
Train Loss: 0.5593, Acc: 81.63%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▅▅▆▆▇▇▇▇██▇▇██
val_loss,█▄▄▂▃▃▂▃▂▁▂▂▃▂▂

0,1
epoch,15.0
train_acc,81.38486
train_loss,0.56288
val_acc,74.55224
val_loss,0.80338


[34m[1mwandb[0m: Agent Starting Run: 1zswwpkt with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256


Epoch 1/20


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Train Loss: 1.2082, Acc: 62.37%
Val Loss: 1.0177, Acc: 66.39%
Epoch 2/20
Train Loss: 0.7004, Acc: 77.47%
Val Loss: 0.9014, Acc: 70.19%
Epoch 3/20
Train Loss: 0.6098, Acc: 80.10%
Val Loss: 0.8823, Acc: 71.00%
Epoch 4/20


**TESTING without Attention**

In [7]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import random
import numpy as np

# =======================
# Fixed Best Configuration
# =======================
best_config = {
    'embedding_dim': 128,
    'hidden_dim': 256,
    'enc_layers': 3,
    'dec_layers': 3,
    'cell_type': 'LSTM',
    'dropout': 0.2,
    'epochs': 15,
    'beam_size': 5,
    'batch_size': 128,  # Added
    'learning_rate': 0.001  # Added
}


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wandb.init(project="dakshina-transliteration", config=config)
config = wandb.config

# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.char2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2char = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.size
                    self.idx2char[self.size] = char
                    self.size += 1

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, idxs):
        return ''.join([self.idx2char[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab=None, out_vocab=None):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        if inp_vocab and out_vocab:
            inp_vocab.build([p[0] for p in self.pairs])
            out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.char2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y)

def collate_fn(batch):
    x_batch, y_batch = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens)

# =======================
# Encoder / Decoder / Seq2Seq
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)

        enc_hidden = self.encoder(src[0], src[1])

        if self.cell_type == "LSTM":
            h, c = enc_hidden
            h = self._match_layers(h)
            c = self._match_layers(c)
            dec_hidden = (h, c)
        else:
            dec_hidden = self._match_layers(enc_hidden)

        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if random.random() < teacher_forcing_ratio else top1
        return outputs

    def _match_layers(self, hidden):
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

# =======================
# Train / Evaluate
# =======================
def compute_accuracy(preds, targets):
    preds = preds.argmax(-1)
    correct = ((preds == targets) & (targets != 0)).sum().item()
    total = (targets != 0).sum().item()
    return correct / total

def train_eval(model, loader, criterion, optimizer, is_train):
    model.train() if is_train else model.eval()
    total_loss, total_acc = 0, 0
    with torch.set_grad_enabled(is_train):
        for src, trg, src_lens, _ in loader:
            src, trg = src.to(device), trg.to(device)
            if is_train: optimizer.zero_grad()
            output = model((src, src_lens), trg)
            loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
            acc = compute_accuracy(output[:, 1:], trg[:, 1:])
            if is_train:
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
            total_acc += acc
    return total_loss / len(loader), total_acc / len(loader)

# =======================
# Train and Save Best
# =======================
inp_vocab, out_vocab = Vocab(), Vocab()
train_set = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv", inp_vocab, out_vocab)
dev_set = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv", inp_vocab, out_vocab)
test_set = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv", inp_vocab, out_vocab)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_set, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, collate_fn=collate_fn)

encoder = Encoder(inp_vocab.size, config.embedding_dim, config.hidden_dim, config.enc_layers, config.cell_type, config.dropout)
decoder = Decoder(out_vocab.size, config.embedding_dim, config.hidden_dim, config.dec_layers, config.cell_type, config.dropout)
model = Seq2Seq(encoder, decoder, config.enc_layers, config.dec_layers, config.cell_type, device).to(device)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=0)

best_val_acc = 0.0
for epoch in range(config.epochs):
    train_loss, train_acc = train_eval(model, train_loader, criterion, optimizer, is_train=True)
    val_loss, val_acc = train_eval(model, dev_loader, criterion, optimizer, is_train=False)
    wandb.log({"train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss, "val_acc": val_acc, "epoch": epoch})
    print(f"Epoch {epoch}: Train Loss={train_loss:.4f} Acc={train_acc:.4f}, Val Loss={val_loss:.4f} Acc={val_acc:.4f}")
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")

# =======================
# Test Evaluation
# =======================
model.load_state_dict(torch.load("best_model.pt"))
model.eval()

os.makedirs("predictions_vanilla", exist_ok=True)
with open("predictions_vanilla/preds.txt", "w", encoding="utf-8") as f:
    correct, total = 0, 0
    samples = []
    for src, trg, src_lens, _ in test_loader:
        src, trg = src.to(device), trg.to(device)
        output = model((src, src_lens), trg, teacher_forcing_ratio=0.0)
        pred_idxs = output.argmax(-1)[0].tolist()
        true_idxs = trg[0].tolist()
        pred_str = out_vocab.decode(pred_idxs)
        true_str = out_vocab.decode(true_idxs)
        input_str = inp_vocab.decode(src[0].tolist())
        f.write(f"{input_str}\t{true_str}\t{pred_str}\n")
        if pred_str == true_str:
            correct += 1
        total += 1
        samples.append((input_str, true_str, pred_str))
    test_acc = correct / total
    wandb.log({"test_accuracy": test_acc})
    print("Test Accuracy:", test_acc)

# Sample Grid (for visualization)
print("\nSample Predictions:")
print("{:<20} | {:<20} | {:<20}".format("Input", "Reference", "Prediction"))
print("=" * 65)
for s in random.sample(samples, 10):
    print("{:<20} | {:<20} | {:<20}".format(*s))


Epoch 0: Train Loss=1.2103 Acc=0.6221, Val Loss=0.7996 Acc=0.7455
Epoch 1: Train Loss=0.7029 Acc=0.7740, Val Loss=0.6784 Acc=0.7765
Epoch 2: Train Loss=0.6081 Acc=0.8035, Val Loss=0.6208 Acc=0.8025
Epoch 3: Train Loss=0.5603 Acc=0.8173, Val Loss=0.6208 Acc=0.8002
Epoch 4: Train Loss=0.5335 Acc=0.8238, Val Loss=0.6086 Acc=0.8038
Epoch 5: Train Loss=0.5105 Acc=0.8311, Val Loss=0.6006 Acc=0.8093
Epoch 6: Train Loss=0.4914 Acc=0.8355, Val Loss=0.5927 Acc=0.8080
Epoch 7: Train Loss=0.4737 Acc=0.8404, Val Loss=0.6052 Acc=0.8078
Epoch 8: Train Loss=0.4590 Acc=0.8435, Val Loss=0.6019 Acc=0.8034
Epoch 9: Train Loss=0.4495 Acc=0.8457, Val Loss=0.5979 Acc=0.8129
Epoch 10: Train Loss=0.4416 Acc=0.8482, Val Loss=0.6096 Acc=0.8070
Epoch 11: Train Loss=0.4337 Acc=0.8492, Val Loss=0.6045 Acc=0.8113
Epoch 12: Train Loss=0.4265 Acc=0.8510, Val Loss=0.5946 Acc=0.8153
Epoch 13: Train Loss=0.4222 Acc=0.8518, Val Loss=0.6018 Acc=0.8093
Epoch 14: Train Loss=0.4120 Acc=0.8546, Val Loss=0.6180 Acc=0.8116
Epoch

In [18]:
# =======================
# Imports and Sweep Config
# =======================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import os
import math
from tqdm import tqdm

os.makedirs("predictions_vanilla", exist_ok=True)

best_config = {
    'embedding_dim': 128,
    'hidden_dim': 256,
    'enc_layers': 3,
    'dec_layers': 3,
    'cell_type': 'LSTM',
    'dropout': 0.2,
    'epochs': 15,
    'beam_size': 5
    'batch_size': 128,  # Added
    'learning_rate': 0.001  # Added
}

# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.char2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2char = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.size
                    self.idx2char[self.size] = char
                    self.size += 1

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, idxs):
        return ''.join([self.idx2char[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab, out_vocab, is_test=False):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        if not is_test:
            inp_vocab.build([p[0] for p in self.pairs])
            out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.char2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y), lat, dev

def collate_fn(batch):
    x_batch, y_batch, lat, dev = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens), lat, dev

# =======================
# Encoder, Decoder, Seq2Seq
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)
        enc_hidden = self.encoder(src[0], src[1])
        if self.cell_type == "LSTM":
            h, c = enc_hidden
            h = self._match_layers(h)
            c = self._match_layers(c)
            dec_hidden = (h, c)
        else:
            dec_hidden = self._match_layers(enc_hidden)
        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

    def _match_layers(self, hidden):
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

    def predict(self, src_tensor, src_len, max_len=30):
        self.eval()
        with torch.no_grad():
            enc_hidden = self.encoder(src_tensor.unsqueeze(0), torch.tensor([src_len]))
            if self.cell_type == "LSTM":
                h, c = enc_hidden
                h = self._match_layers(h)
                c = self._match_layers(c)
                dec_hidden = (h, c)
            else:
                dec_hidden = self._match_layers(enc_hidden)
            input_token = torch.tensor([2]).to(self.device)  # <sos>
            output_seq = []
            for _ in range(max_len):
                output, dec_hidden = self.decoder(input_token, dec_hidden)
                top1 = output.argmax(1)
                if top1.item() == 2: break  # <eos>
                output_seq.append(top1.item())
                input_token = top1
        return output_seq

# =======================
# Train & Eval
# =======================
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, total_correct, total_count = 0, 0, 0
    for src, trg, src_lens, _, _, _ in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model((src, src_lens), trg)
        loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
        pred = output.argmax(2)
        correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
        total_correct += correct
        total_count += (trg[:, 1:] != 0).sum().item()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader), 100.0 * total_correct / total_count

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_correct, total_count = 0, 0, 0
    with torch.no_grad():
        for src, trg, src_lens, _, _, _ in loader:
            src, trg = src.to(device), trg.to(device)
            output = model((src, src_lens), trg, teacher_forcing_ratio=0)
            loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
            pred = output.argmax(2)
            correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
            total_correct += correct
            total_count += (trg[:, 1:] != 0).sum().item()
            total_loss += loss.item()
    return total_loss / len(loader), 100.0 * total_correct / total_count

# =======================
# Main
# =======================
def main():
    wandb.init(project="dakshina-transliteration")
    config = best_config

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inp_vocab, out_vocab = Vocab(), Vocab()
    train_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv", inp_vocab, out_vacab)
    dev_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv", inp_vocab, out_vocab)
    test_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv", inp_vocab, out_vocab, is_test=True)

    # Update DataLoader batch sizes
    train_loader = DataLoader(train_data, batch_size=config['batch_size'], shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=config['batch_size'], shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_fn)

    # Update optimizer with learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_val_acc = 0.0
    for epoch in range(config['epochs']):
        print(f"Epoch {epoch+1}/{config['epochs']}")
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)

        print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%")
        print(f"Val   Loss: {val_loss:.4f}, Accuracy: {val_acc:.2f}%")

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc
        })

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "best_model.pth")
            print("Best model saved.")

    
    # Evaluation on test set
    print("\n Evaluating on test set with best model:")
    model.load_state_dict(torch.load("best_model.pth"))
    model.eval()
    total_correct, total_count = 0, 0

    with open("predictions_vanilla/test_predictions.txt", "w", encoding="utf-8") as f:
        for src, _, src_lens, _, lat, gold in test_loader:
            src = src.to(device)
            pred_ids = model.predict(src[0], src_lens[0].item())
            pred = out_vocab.decode(pred_ids)
            f.write(f"{lat[0]}\t{gold[0]}\t{pred}\n")
            if pred == gold[0]:
                total_correct += 1
            total_count += 1

    test_acc = 100.0 * total_correct / total_count
    print(f"📊 Test Accuracy: {test_acc:.2f}%")
    wandb.log({"test_acc": test_acc})


     # After test evaluation, add random samples display
    print("\nRandom Test Samples Predictions:")
    import random
    random_indices = random.sample(range(len(test_data)), 20)
    
    with open("predictions_vanilla/test_predictions.txt", "a", encoding="utf-8") as f:
        f.write("\n\nRandom Sample Predictions:\n")
        for idx in random_indices:
            x, y, lat, dev = test_data[idx]
            src_tensor = x.to(device)
            pred_ids = model.predict(src_tensor, len(x))
            pred = out_vocab.decode(pred_ids)
            
            print(f"Input: {lat}")
            print(f"True: {dev}")
            print(f"Pred: {pred}\n")
            
            f.write(f"Input: {lat}\n")
            f.write(f"True: {dev}\n")
            f.write(f"Pred: {pred}\n\n")


if __name__ == "__main__":
    main()




0,1
train_acc,▁▆▇███
train_loss,█▃▂▂▁▁
val_acc,▁▆▇▇██
val_loss,█▄▂▂▁▂

0,1
train_acc,83.12333
train_loss,0.50879
val_acc,72.13333
val_loss,0.86891


Epoch 1/15
Train Loss: 1.2916, Accuracy: 60.52%
Val   Loss: 0.9746, Accuracy: 68.25%
✅ Best model saved.
Epoch 2/15
Train Loss: 0.6719, Accuracy: 78.24%
Val   Loss: 0.8701, Accuracy: 71.92%
✅ Best model saved.
Epoch 3/15
Train Loss: 0.5746, Accuracy: 81.27%
Val   Loss: 0.8041, Accuracy: 73.33%
✅ Best model saved.
Epoch 4/15
Train Loss: 0.5236, Accuracy: 82.72%
Val   Loss: 0.8114, Accuracy: 73.86%
✅ Best model saved.
Epoch 5/15
Train Loss: 0.4914, Accuracy: 83.66%
Val   Loss: 0.8050, Accuracy: 74.60%
✅ Best model saved.
Epoch 6/15
Train Loss: 0.4701, Accuracy: 84.21%
Val   Loss: 0.8052, Accuracy: 74.57%
Epoch 7/15
Train Loss: 0.4553, Accuracy: 84.49%
Val   Loss: 0.8100, Accuracy: 74.55%
Epoch 8/15
Train Loss: 0.4394, Accuracy: 84.89%
Val   Loss: 0.7896, Accuracy: 74.93%
✅ Best model saved.
Epoch 9/15
Train Loss: 0.4285, Accuracy: 85.05%
Val   Loss: 0.7989, Accuracy: 74.77%
Epoch 10/15
Train Loss: 0.4173, Accuracy: 85.34%
Val   Loss: 0.7853, Accuracy: 75.38%
✅ Best model saved.
Epoch 11/

In [None]:
# =======================
# Imports and Sweep Config
# =======================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import os

sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': {
        'embedding_dim': {'values': [64, 128,256]},
        'hidden_dim': {'values': [64, 128,256]},
        'enc_layers': {'values': [1, 2,3]},
        'dec_layers': {'values': [1, 2,3]},
        'cell_type': {'values': ['GRU', 'LSTM', 'RNN']},
        'dropout': {'values': [0.0,0.2, 0.3]},
        'epochs': {'values': [10,15]},
        'beam_size': {'values': [1, 3, 5]}
    }
}

default_config = {
    'embedding_dim': 64,
    'hidden_dim': 64,
    'enc_layers': 1,
    'dec_layers': 1,
    'cell_type': 'LSTM',
    'dropout': 0.2,
    'epochs': 10,
    'beam_size': 1
}

# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.word2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2word = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for word in text.split():
                if word not in self.word2idx:
                    self.word2idx[word] = self.size
                    self.idx2word[self.size] = word
                    self.size += 1

    def encode(self, text):
        return [self.word2idx[w] for w in text.split()]

    def decode(self, idxs):
        return ' '.join([self.idx2word[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab, out_vocab):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        inp_vocab.build([p[0] for p in self.pairs])
        out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self): return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.word2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.word2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y)

def collate_fn(batch):
    x_batch, y_batch = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens)

# =======================
# Encoder, Decoder
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        emb = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        emb = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(emb, hidden)
        return self.fc(output.squeeze(1)), hidden

# =======================
# Seq2Seq Model + Beam Search
# =======================
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.cell_type = cell_type
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)
        enc_hidden = self.encoder(src[0], src[1])
        dec_hidden = self._match_layers(enc_hidden)
        input_token = trg[:, 0]

        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

    def beam_search(self, src, src_len, sos_idx, eos_idx, beam_size, max_len=20):
        enc_hidden = self.encoder(src, src_len)
        dec_hidden = self._match_layers(enc_hidden)

        sequences = [[sos_idx]]
        scores = [0.0]

        for _ in range(max_len):
            all_candidates = []
            for i, seq in enumerate(sequences):
                input_token = torch.tensor([seq[-1]]).to(self.device)
                output, new_hidden = self.decoder(input_token, dec_hidden)
                probs = torch.log_softmax(output, dim=-1)
                topk = torch.topk(probs, beam_size)
                for j in range(beam_size):
                    candidate = seq + [topk.indices[0][j].item()]
                    score = scores[i] + topk.values[0][j].item()
                    all_candidates.append((candidate, score))
            ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
            sequences, scores = zip(*ordered[:beam_size])
            if all(seq[-1] == eos_idx for seq in sequences):
                break
        return sequences[0]

    def _match_layers(self, hidden):
        if self.cell_type == 'LSTM':
            h, c = hidden
            return self._pad_layers(h), self._pad_layers(c)
        return self._pad_layers(hidden)

    def _pad_layers(self, h):
        if self.enc_layers == self.dec_layers:
            return h
        elif self.enc_layers > self.dec_layers:
            return h[:self.dec_layers]
        else:
            pad = h.new_zeros((self.dec_layers - self.enc_layers, *h.shape[1:]))
            return torch.cat([h, pad], dim=0)

# =======================
# Train and Evaluate
# =======================
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, total_correct, total_count = 0, 0, 0
    for src, trg, src_lens, _ in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model((src, src_lens), trg)
        loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
        pred = output.argmax(2)
        correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
        total_correct += correct
        total_count += (trg[:, 1:] != 0).sum().item()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader), 100.0 * total_correct / total_count

def evaluate(model, loader, criterion, device, out_vocab, beam_size):
    model.eval()
    total_loss, total_correct, total_count = 0, 0, 0
    with torch.no_grad():
        for src, trg, src_lens, _ in loader:
            src, trg = src.to(device), trg.to(device)
            if beam_size > 1:
                for i in range(src.size(0)):
                    pred_seq = model.beam_search(src[i].unsqueeze(0), src_lens[i:i+1], 1, 2, beam_size)
                    gold_seq = trg[i].tolist()
                    total_correct += sum(p == g for p, g in zip(pred_seq[1:], gold_seq[1:]) if g != 0)
                    total_count += sum(1 for g in gold_seq[1:] if g != 0)
                total_loss += 0  # No loss in beam search mode
            else:
                output = model((src, src_lens), trg, teacher_forcing_ratio=0)
                loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
                pred = output.argmax(2)
                correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
                total_correct += correct
                total_count += (trg[:, 1:] != 0).sum().item()
                total_loss += loss.item()
    return total_loss / len(loader), 100.0 * total_correct / total_count

# =======================
# Main
# =======================
def main():
    wandb.init(config=default_config, project="word-level-transliteration")
    config = wandb.config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inp_vocab, out_vocab = Vocab(), Vocab()
    train_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv", inp_vocab, out_vocab)
    dev_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv", inp_vocab, out_vocab)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=32, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(inp_vocab.size, config.embedding_dim, config.hidden_dim, config.enc_layers, config.cell_type, config.dropout)
    decoder = Decoder(out_vocab.size, config.embedding_dim, config.hidden_dim, config.dec_layers, config.cell_type, config.dropout)
    model = Seq2Seq(encoder, decoder, config.enc_layers, config.dec_layers, config.cell_type, device).to(device)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(config.epochs):
        print(f"Epoch {epoch+1}/{config.epochs}")
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device, out_vocab, config.beam_size)
        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f} Acc={train_acc:.4f}, Val Loss={val_loss:.4f} Acc={val_acc:.4f}")

        wandb.log({"train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss, "val_acc": val_acc, "epoch": epoch+1})

# =======================
if __name__ == '__main__':
    sweep_id = wandb.sweep(sweep_config, project="word-level-transliteration")
    wandb.agent(sweep_id, function=main, count=5)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: v8nboqsd
Sweep URL: https://wandb.ai/manglesh_dl_ass3/word-level-transliteration/sweeps/v8nboqsd


[34m[1mwandb[0m: Agent Starting Run: 2eokon6t with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: Currently logged in as: [33mmanglesh_dlass3[0m ([33mmanglesh_dl_ass3[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/15
Epoch 1: Train Loss=5.7940 Acc=49.5804, Val Loss=0.0000 Acc=48.1758
Epoch 2/15
Epoch 2: Train Loss=5.3633 Acc=50.0000, Val Loss=0.0000 Acc=43.5865
Epoch 3/15
Epoch 3: Train Loss=5.0938 Acc=50.0181, Val Loss=0.0000 Acc=39.4103
Epoch 4/15
Epoch 4: Train Loss=4.5842 Acc=50.3699, Val Loss=0.0000 Acc=39.0776
Epoch 5/15
Epoch 5: Train Loss=3.9932 Acc=52.3686, Val Loss=0.0000 Acc=37.5516
Epoch 6/15
Epoch 6: Train Loss=3.4184 Acc=57.0390, Val Loss=0.0000 Acc=38.8596
Epoch 7/15
Epoch 7: Train Loss=2.8836 Acc=62.4333, Val Loss=0.0000 Acc=39.7774
Epoch 8/15
Epoch 8: Train Loss=2.3975 Acc=66.3096, Val Loss=0.0000 Acc=40.8215
Epoch 9/15
Epoch 9: Train Loss=1.9742 Acc=68.3739, Val Loss=0.0000 Acc=41.7393
Epoch 10/15
Epoch 10: Train Loss=1.6258 Acc=69.0944, Val Loss=0.0000 Acc=42.0262
Epoch 11/15
Epoch 11: Train Loss=1.3579 Acc=69.3105, Val Loss=0.0000 Acc=43.0014
Epoch 12/15
Epoch 12: Train Loss=1.1646 Acc=69.4926, Val Loss=0.0000 Acc=43.0702
Epoch 13/15
Epoch 13: Train Loss=1.0331 Acc=69

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▁▁▁▂▄▆▇███████
train_loss,█▇▇▆▅▅▄▃▃▂▂▁▁▁▁
val_acc,█▅▂▂▁▂▂▃▄▄▅▅▅▅▅
val_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,69.48466
train_loss,0.88627
val_acc,43.6324
val_loss,0.0


[34m[1mwandb[0m: Agent Starting Run: qagkeha0 with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 64


Epoch 1/15
Epoch 1: Train Loss=6.1236 Acc=49.6120, Val Loss=0.0000 Acc=26.2391
Epoch 2/15
Epoch 2: Train Loss=5.5682 Acc=50.0011, Val Loss=0.0000 Acc=25.8720
Epoch 3/15
Epoch 3: Train Loss=5.4350 Acc=50.0011, Val Loss=0.0000 Acc=28.7976
Epoch 4/15
Epoch 4: Train Loss=5.3200 Acc=50.0034, Val Loss=0.0000 Acc=30.7480
Epoch 5/15


In [4]:
# =======================
# Imports and Sweep Config
# =======================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import os
import math

sweep_config = {
    'method': 'bayes',
    'metric': {
        'name': 'val_loss',
        'goal': 'minimize'
    },
    'parameters': {
        'embedding_dim': {'values': [32,64,128,256]},
        'hidden_dim': {'values': [64, 128,256 ]},
        'enc_layers': {'values': [1, 2,3]},
        'dec_layers': {'values': [1, 2,3]},
        'cell_type': {'values': ['GRU', 'LSTM', 'RNN']},
        'dropout': {'values': [0.0,0.2, 0.3,0.5]},
        'epochs': {'values': [10,20, 15]},
        'beam_size': {'values': [1, 3, 5]},
        'batch_size': {'values': [64, 128, 256]},
        'learning_rate': {'values': [0.001, 0.0005, 0.0001]}
    }
}

default_config = {
    'embedding_dim': 32,
    'hidden_dim': 64,
    'enc_layers': 1,
    'dec_layers': 1,
    'cell_type': 'LSTM',
    'dropout': 0.2,
    'epochs': 10,
    'beam_size': 1,
    'batch_size': 64,
    'learning_rate': 0.001
}


# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.char2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2char = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.size
                    self.idx2char[self.size] = char
                    self.size += 1

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, idxs):
        return ''.join([self.idx2char[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab, out_vocab):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        inp_vocab.build([p[0] for p in self.pairs])
        out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.char2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y)

def collate_fn(batch):
    x_batch, y_batch = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens)

# =======================
# Encoder and Decoder
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

# =======================
# Seq2Seq Model with Beam Search
# =======================
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)
        enc_hidden = self.encoder(src[0], src[1])

        if self.cell_type == "LSTM":
            h, c = enc_hidden
            h = self._match_layers(h)
            c = self._match_layers(c)
            dec_hidden = (h, c)
        else:
            dec_hidden = self._match_layers(enc_hidden)

        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

    def _match_layers(self, hidden):
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

# =======================
# Train & Eval
# =======================
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, total_correct, total_count = 0, 0, 0
    for src, trg, src_lens, _ in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model((src, src_lens), trg)
        output_dim = output.shape[-1]
        loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
        pred = output.argmax(2)
        correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
        total_correct += correct
        total_count += (trg[:, 1:] != 0).sum().item()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    acc = 100.0 * total_correct / total_count
    print(f"Train Loss: {total_loss / len(loader):.4f}, Acc: {acc:.2f}%")
    return total_loss / len(loader), acc

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_correct, total_count = 0, 0, 0
    with torch.no_grad():
        for src, trg, src_lens, _ in loader:
            src, trg = src.to(device), trg.to(device)
            output = model((src, src_lens), trg, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
            pred = output.argmax(2)
            correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
            total_correct += correct
            total_count += (trg[:, 1:] != 0).sum().item()
            total_loss += loss.item()
    acc = 100.0 * total_correct / total_count
    print(f"Val Loss: {total_loss / len(loader):.4f}, Acc: {acc:.2f}%")
    return total_loss / len(loader), acc

# =======================
# Main
# =======================
def main():
    wandb.init(config=default_config, project="dakshina-transliteration")
    config = wandb.config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inp_vocab, out_vocab = Vocab(), Vocab()
    train_path = "/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
    dev_path = "/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
    train_data = TransliterationDataset(train_path, inp_vocab, out_vocab)
    dev_data = TransliterationDataset(dev_path, inp_vocab, out_vocab)
    
    # Use config.batch_size for DataLoader
    train_loader = DataLoader(train_data, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

    encoder = Encoder(inp_vocab.size, config.embedding_dim, config.hidden_dim, config.enc_layers, config.cell_type, config.dropout)
    decoder = Decoder(out_vocab.size, config.embedding_dim, config.hidden_dim, config.dec_layers, config.cell_type, config.dropout)
    model = Seq2Seq(encoder, decoder, config.enc_layers, config.dec_layers, config.cell_type, device).to(device)
    
    # Use config.learning_rate for optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(config.epochs):
        print(f"Epoch {epoch+1}/{config.epochs}")
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)
        wandb.log({"train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss, "val_acc": val_acc, "epoch": epoch+1})

# =======================

if __name__ == '__main__':
    sweep_id = wandb.sweep(sweep_config, project="dakshina-translit")
    wandb.agent(sweep_id, function=main,count=30)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: 2yk21c98
Sweep URL: https://wandb.ai/manglesh_dl_ass3/dakshina-translit/sweeps/2yk21c98


[34m[1mwandb[0m: Agent Starting Run: f8lsoyx5 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: Currently logged in as: [33mmanglesh_dlass3[0m ([33mmanglesh_dl_ass3[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/15
Train Loss: 2.8135, Acc: 23.51%
Val Loss: 2.5705, Acc: 28.12%
Epoch 2/15
Train Loss: 2.4345, Acc: 29.73%
Val Loss: 2.3086, Acc: 30.87%
Epoch 3/15
Train Loss: 2.2041, Acc: 32.47%
Val Loss: 2.1289, Acc: 32.56%
Epoch 4/15
Train Loss: 2.0241, Acc: 35.56%
Val Loss: 2.0125, Acc: 34.26%
Epoch 5/15
Train Loss: 1.8849, Acc: 38.74%
Val Loss: 1.9114, Acc: 36.35%
Epoch 6/15
Train Loss: 1.7606, Acc: 42.41%
Val Loss: 1.8224, Acc: 38.88%
Epoch 7/15
Train Loss: 1.6551, Acc: 45.66%
Val Loss: 1.7421, Acc: 41.13%
Epoch 8/15
Train Loss: 1.5627, Acc: 48.60%
Val Loss: 1.6578, Acc: 44.00%
Epoch 9/15
Train Loss: 1.4863, Acc: 50.87%
Val Loss: 1.5707, Acc: 47.06%
Epoch 10/15
Train Loss: 1.4028, Acc: 53.94%
Val Loss: 1.4997, Acc: 49.84%
Epoch 11/15
Train Loss: 1.3305, Acc: 56.48%
Val Loss: 1.4181, Acc: 52.45%
Epoch 12/15
Train Loss: 1.2608, Acc: 58.98%
Val Loss: 1.3476, Acc: 55.49%
Epoch 13/15
Train Loss: 1.2049, Acc: 60.94%
Val Loss: 1.2937, Acc: 56.91%
Epoch 14/15
Train Loss: 1.1443, Acc: 63.20%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▂▃▃▄▄▅▅▆▆▇▇▇██
train_loss,█▆▆▅▄▄▃▃▃▂▂▂▁▁▁
val_acc,▁▂▂▂▃▃▄▄▅▆▆▇▇██
val_loss,█▇▆▅▅▄▄▃▃▃▂▂▁▁▁

0,1
epoch,15.0
train_acc,64.67042
train_loss,1.09736
val_acc,60.21238
val_loss,1.19925


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: dteoe9dh with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/20
Train Loss: 2.2577, Acc: 33.81%
Val Loss: 1.8775, Acc: 39.00%
Epoch 2/20
Train Loss: 1.5278, Acc: 50.62%
Val Loss: 1.5635, Acc: 48.27%
Epoch 3/20
Train Loss: 1.2740, Acc: 58.72%
Val Loss: 1.3934, Acc: 54.09%
Epoch 4/20
Train Loss: 1.1179, Acc: 64.05%
Val Loss: 1.2667, Acc: 58.63%
Epoch 5/20
Train Loss: 1.0101, Acc: 67.65%
Val Loss: 1.1977, Acc: 61.08%
Epoch 6/20
Train Loss: 0.9265, Acc: 70.48%
Val Loss: 1.1232, Acc: 63.40%
Epoch 7/20
Train Loss: 0.8674, Acc: 72.41%
Val Loss: 1.0720, Acc: 64.79%
Epoch 8/20
Train Loss: 0.8083, Acc: 74.40%
Val Loss: 1.0452, Acc: 65.60%
Epoch 9/20
Train Loss: 0.7741, Acc: 75.33%
Val Loss: 1.0145, Acc: 66.78%
Epoch 10/20
Train Loss: 0.7444, Acc: 76.19%
Val Loss: 0.9946, Acc: 67.19%
Epoch 11/20
Train Loss: 0.7136, Acc: 77.26%
Val Loss: 0.9856, Acc: 67.79%
Epoch 12/20
Train Loss: 0.6835, Acc: 78.26%
Val Loss: 0.9504, Acc: 68.51%
Epoch 13/20
Train Loss: 0.6657, Acc: 78.76%
Val Loss: 0.9413, Acc: 69.25%
Epoch 14/20
Train Loss: 0.6476, Acc: 79.26%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▃▅▅▆▆▇▇▇▇▇▇████████
train_loss,█▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▃▄▅▆▆▇▇▇▇▇▇████████
val_loss,█▆▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
epoch,20.0
train_acc,81.97143
train_loss,0.56146
val_acc,70.86311
val_loss,0.89417


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5fr42dqj with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/15
Train Loss: 2.8228, Acc: 22.32%
Val Loss: 2.6427, Acc: 25.90%
Epoch 2/15
Train Loss: 2.4641, Acc: 30.08%
Val Loss: 2.4051, Acc: 31.34%
Epoch 3/15
Train Loss: 2.2427, Acc: 35.23%
Val Loss: 2.2665, Acc: 33.58%
Epoch 4/15
Train Loss: 2.1051, Acc: 38.24%
Val Loss: 2.1652, Acc: 35.20%
Epoch 5/15
Train Loss: 1.9974, Acc: 40.49%
Val Loss: 2.0863, Acc: 36.22%
Epoch 6/15
Train Loss: 1.9087, Acc: 42.28%
Val Loss: 2.0062, Acc: 37.97%
Epoch 7/15
Train Loss: 1.8243, Acc: 44.30%
Val Loss: 1.9386, Acc: 39.49%
Epoch 8/15
Train Loss: 1.7495, Acc: 46.09%
Val Loss: 1.8720, Acc: 41.01%
Epoch 9/15
Train Loss: 1.6797, Acc: 47.84%
Val Loss: 1.8153, Acc: 42.46%
Epoch 10/15
Train Loss: 1.6159, Acc: 49.61%
Val Loss: 1.7517, Acc: 44.19%
Epoch 11/15
Train Loss: 1.5563, Acc: 51.27%
Val Loss: 1.7019, Acc: 45.46%
Epoch 12/15
Train Loss: 1.5039, Acc: 52.75%
Val Loss: 1.6616, Acc: 46.51%
Epoch 13/15
Train Loss: 1.4591, Acc: 54.10%
Val Loss: 1.6110, Acc: 47.95%
Epoch 14/15
Train Loss: 1.4088, Acc: 55.58%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▃▄▄▅▅▅▆▆▇▇▇▇██
train_loss,█▆▅▅▄▄▃▃▂▂▂▂▁▁▁
val_acc,▁▃▃▄▄▄▅▅▆▆▇▇▇██
val_loss,█▆▆▅▄▄▄▃▃▂▂▂▁▁▁

0,1
epoch,15.0
train_acc,56.65684
train_loss,1.37214
val_acc,50.25318
val_loss,1.53863


[34m[1mwandb[0m: Agent Starting Run: g9wdvcj7 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.4299, Acc: 55.76%
Val Loss: 1.0009, Acc: 66.51%
Epoch 2/15
Train Loss: 0.7540, Acc: 75.56%
Val Loss: 0.9032, Acc: 70.34%
Epoch 3/15
Train Loss: 0.6351, Acc: 79.36%
Val Loss: 0.8520, Acc: 71.82%
Epoch 4/15
Train Loss: 0.5792, Acc: 81.04%
Val Loss: 0.8505, Acc: 72.46%
Epoch 5/15
Train Loss: 0.5405, Acc: 82.22%
Val Loss: 0.8121, Acc: 73.14%
Epoch 6/15
Train Loss: 0.5166, Acc: 82.82%
Val Loss: 0.8129, Acc: 73.83%
Epoch 7/15
Train Loss: 0.4904, Acc: 83.70%
Val Loss: 0.8076, Acc: 73.98%
Epoch 8/15
Train Loss: 0.4758, Acc: 84.13%
Val Loss: 0.8095, Acc: 73.94%
Epoch 9/15
Train Loss: 0.4650, Acc: 84.25%
Val Loss: 0.8191, Acc: 74.25%
Epoch 10/15
Train Loss: 0.4528, Acc: 84.60%
Val Loss: 0.8058, Acc: 74.45%
Epoch 11/15
Train Loss: 0.4332, Acc: 85.16%
Val Loss: 0.8232, Acc: 74.03%
Epoch 12/15
Train Loss: 0.4322, Acc: 85.17%
Val Loss: 0.8190, Acc: 74.21%
Epoch 13/15
Train Loss: 0.4241, Acc: 85.40%
Val Loss: 0.8095, Acc: 74.25%
Epoch 14/15
Train Loss: 0.4123, Acc: 85.62%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▆▆▇▇▇▇██▇████
val_loss,█▄▃▃▁▁▁▁▁▁▂▁▁▃▁

0,1
epoch,15.0
train_acc,85.47126
train_loss,0.41433
val_acc,74.68534
val_loss,0.80871


[34m[1mwandb[0m: Agent Starting Run: ak8uq9uh with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/20
Train Loss: 2.7816, Acc: 24.42%
Val Loss: 2.5395, Acc: 28.75%
Epoch 2/20
Train Loss: 2.4379, Acc: 29.77%
Val Loss: 2.3549, Acc: 30.73%
Epoch 3/20
Train Loss: 2.2707, Acc: 32.13%
Val Loss: 2.2359, Acc: 31.73%
Epoch 4/20
Train Loss: 2.1306, Acc: 34.52%
Val Loss: 2.1341, Acc: 33.17%
Epoch 5/20
Train Loss: 2.0118, Acc: 36.99%
Val Loss: 2.0551, Acc: 34.16%
Epoch 6/20
Train Loss: 1.9255, Acc: 38.79%
Val Loss: 1.9935, Acc: 34.97%
Epoch 7/20
Train Loss: 1.8439, Acc: 40.88%
Val Loss: 1.9283, Acc: 36.83%
Epoch 8/20
Train Loss: 1.7730, Acc: 42.80%
Val Loss: 1.8754, Acc: 38.09%
Epoch 9/20
Train Loss: 1.7043, Acc: 44.84%
Val Loss: 1.7996, Acc: 40.74%
Epoch 10/20
Train Loss: 1.6360, Acc: 46.99%
Val Loss: 1.7418, Acc: 42.34%
Epoch 11/20
Train Loss: 1.5757, Acc: 48.87%
Val Loss: 1.6795, Acc: 44.64%
Epoch 12/20
Train Loss: 1.5212, Acc: 50.52%
Val Loss: 1.6173, Acc: 46.73%
Epoch 13/20
Train Loss: 1.4588, Acc: 52.64%
Val Loss: 1.5578, Acc: 48.56%
Epoch 14/20
Train Loss: 1.4004, Acc: 54.71%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇▇███
train_loss,█▇▆▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁
val_acc,▁▁▂▂▂▂▃▃▄▄▅▅▆▆▆▇▇▇██
val_loss,█▇▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁

0,1
epoch,20.0
train_acc,62.92534
train_loss,1.14731
val_acc,59.48902
val_loss,1.23102


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: kk3m58a6 with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/20
Train Loss: 2.6833, Acc: 24.82%
Val Loss: 2.4851, Acc: 28.21%
Epoch 2/20
Train Loss: 2.1716, Acc: 36.15%
Val Loss: 2.1096, Acc: 36.65%
Epoch 3/20
Train Loss: 1.8521, Acc: 43.91%
Val Loss: 1.8682, Acc: 42.03%
Epoch 4/20
Train Loss: 1.6465, Acc: 49.00%
Val Loss: 1.7460, Acc: 44.78%
Epoch 5/20
Train Loss: 1.5112, Acc: 52.33%
Val Loss: 1.6029, Acc: 47.44%
Epoch 6/20
Train Loss: 1.4144, Acc: 54.95%
Val Loss: 1.5326, Acc: 50.31%
Epoch 7/20
Train Loss: 1.3292, Acc: 57.53%
Val Loss: 1.4553, Acc: 51.48%
Epoch 8/20
Train Loss: 1.2563, Acc: 59.96%
Val Loss: 1.3879, Acc: 53.88%
Epoch 9/20
Train Loss: 1.2101, Acc: 61.21%
Val Loss: 1.3446, Acc: 55.28%
Epoch 10/20
Train Loss: 1.1497, Acc: 63.31%
Val Loss: 1.3139, Acc: 56.07%
Epoch 11/20
Train Loss: 1.1158, Acc: 64.27%
Val Loss: 1.2713, Acc: 57.67%
Epoch 12/20
Train Loss: 1.0862, Acc: 65.19%
Val Loss: 1.2793, Acc: 58.15%
Epoch 13/20
Train Loss: 1.0538, Acc: 66.20%
Val Loss: 1.2270, Acc: 59.37%
Epoch 14/20
Train Loss: 1.0205, Acc: 67.37%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▃▄▅▅▆▆▆▇▇▇▇▇███████
train_loss,█▆▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▃▄▄▅▅▆▆▆▇▇▇▇▇██████
val_loss,█▆▅▄▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,20.0
train_acc,70.62467
train_loss,0.91335
val_acc,63.25338
val_loss,1.09876


[34m[1mwandb[0m: Agent Starting Run: 474z9jpv with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/20
Train Loss: 2.2773, Acc: 34.01%
Val Loss: 1.8953, Acc: 38.51%
Epoch 2/20
Train Loss: 1.5351, Acc: 51.10%
Val Loss: 1.5801, Acc: 48.04%
Epoch 3/20
Train Loss: 1.2716, Acc: 59.28%
Val Loss: 1.3953, Acc: 54.43%
Epoch 4/20
Train Loss: 1.1060, Acc: 64.71%
Val Loss: 1.2811, Acc: 58.22%
Epoch 5/20
Train Loss: 1.0088, Acc: 67.62%
Val Loss: 1.1764, Acc: 61.24%
Epoch 6/20
Train Loss: 0.9210, Acc: 70.53%
Val Loss: 1.1285, Acc: 62.94%
Epoch 7/20
Train Loss: 0.8559, Acc: 72.60%
Val Loss: 1.0966, Acc: 64.01%
Epoch 8/20
Train Loss: 0.8151, Acc: 73.78%
Val Loss: 1.0346, Acc: 65.69%
Epoch 9/20
Train Loss: 0.7731, Acc: 75.14%
Val Loss: 1.0130, Acc: 66.76%
Epoch 10/20
Train Loss: 0.7338, Acc: 76.51%
Val Loss: 0.9812, Acc: 67.63%
Epoch 11/20
Train Loss: 0.7109, Acc: 77.15%
Val Loss: 0.9620, Acc: 68.47%
Epoch 12/20
Train Loss: 0.6836, Acc: 78.08%
Val Loss: 0.9529, Acc: 68.59%
Epoch 13/20
Train Loss: 0.6621, Acc: 78.71%
Val Loss: 0.9454, Acc: 68.90%
Epoch 14/20
Train Loss: 0.6456, Acc: 79.25%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▃▅▅▆▆▇▇▇▇▇▇████████
train_loss,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▃▄▅▆▆▇▇▇▇▇█████████
val_loss,█▆▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,20.0
train_acc,81.87763
train_loss,0.56334
val_acc,70.90073
val_loss,0.90142


[34m[1mwandb[0m: Agent Starting Run: c1urnk2t with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.2421, Acc: 60.84%
Val Loss: 0.9920, Acc: 66.09%
Epoch 2/15
Train Loss: 0.7083, Acc: 76.85%
Val Loss: 0.8844, Acc: 71.10%
Epoch 3/15
Train Loss: 0.6031, Acc: 80.23%
Val Loss: 0.8312, Acc: 71.75%
Epoch 4/15
Train Loss: 0.5552, Acc: 81.67%
Val Loss: 0.8320, Acc: 72.84%
Epoch 5/15
Train Loss: 0.5106, Acc: 83.11%
Val Loss: 0.8236, Acc: 73.51%
Epoch 6/15
Train Loss: 0.4871, Acc: 83.75%
Val Loss: 0.8094, Acc: 73.92%
Epoch 7/15
Train Loss: 0.4658, Acc: 84.25%
Val Loss: 0.8225, Acc: 73.83%
Epoch 8/15
Train Loss: 0.4492, Acc: 84.61%
Val Loss: 0.8164, Acc: 73.77%
Epoch 9/15
Train Loss: 0.4396, Acc: 84.90%
Val Loss: 0.8154, Acc: 74.14%
Epoch 10/15
Train Loss: 0.4275, Acc: 85.13%
Val Loss: 0.8244, Acc: 74.05%
Epoch 11/15
Train Loss: 0.4151, Acc: 85.49%
Val Loss: 0.8501, Acc: 74.24%
Epoch 12/15
Train Loss: 0.4024, Acc: 85.76%
Val Loss: 0.8028, Acc: 74.53%
Epoch 13/15
Train Loss: 0.3994, Acc: 85.86%
Val Loss: 0.8249, Acc: 74.01%
Epoch 14/15
Train Loss: 0.3877, Acc: 86.12%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▅▆▇▇▇▇▇██████▇
val_loss,█▄▂▂▂▁▂▂▁▂▃▁▂▁▂

0,1
epoch,15.0
train_acc,86.10721
train_loss,0.38636
val_acc,73.77101
val_loss,0.83076


[34m[1mwandb[0m: Agent Starting Run: n0ba6hlh with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.8939, Acc: 43.98%
Val Loss: 1.1589, Acc: 61.76%
Epoch 2/15
Train Loss: 0.9003, Acc: 70.89%
Val Loss: 0.9160, Acc: 68.57%
Epoch 3/15
Train Loss: 0.7167, Acc: 76.69%
Val Loss: 0.8615, Acc: 71.31%
Epoch 4/15
Train Loss: 0.6412, Acc: 79.02%
Val Loss: 0.8244, Acc: 73.00%
Epoch 5/15
Train Loss: 0.5855, Acc: 80.77%
Val Loss: 0.8009, Acc: 73.55%
Epoch 6/15
Train Loss: 0.5574, Acc: 81.52%
Val Loss: 0.8118, Acc: 73.89%
Epoch 7/15
Train Loss: 0.5282, Acc: 82.40%
Val Loss: 0.7962, Acc: 74.36%
Epoch 8/15
Train Loss: 0.5060, Acc: 83.07%
Val Loss: 0.7950, Acc: 74.07%
Epoch 9/15
Train Loss: 0.4856, Acc: 83.69%
Val Loss: 0.7987, Acc: 74.62%
Epoch 10/15
Train Loss: 0.4757, Acc: 83.93%
Val Loss: 0.7627, Acc: 75.09%
Epoch 11/15
Train Loss: 0.4615, Acc: 84.36%
Val Loss: 0.7614, Acc: 74.96%
Epoch 12/15
Train Loss: 0.4533, Acc: 84.48%
Val Loss: 0.8043, Acc: 74.76%
Epoch 13/15
Train Loss: 0.4375, Acc: 84.95%
Val Loss: 0.7885, Acc: 74.61%
Epoch 14/15
Train Loss: 0.4305, Acc: 85.12%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▇▇▇█▇███████
val_loss,█▄▃▂▂▂▂▂▂▁▁▂▁▁▁

0,1
epoch,15.0
train_acc,85.19011
train_loss,0.42496
val_acc,75.09042
val_loss,0.76437


[34m[1mwandb[0m: Agent Starting Run: u3lluckt with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.9789, Acc: 41.10%
Val Loss: 1.2306, Acc: 58.50%
Epoch 2/15
Train Loss: 0.9919, Acc: 67.66%
Val Loss: 0.9386, Acc: 68.30%
Epoch 3/15
Train Loss: 0.8052, Acc: 73.55%
Val Loss: 0.8596, Acc: 70.52%
Epoch 4/15
Train Loss: 0.7135, Acc: 76.58%
Val Loss: 0.8184, Acc: 72.35%
Epoch 5/15
Train Loss: 0.6585, Acc: 78.24%
Val Loss: 0.8232, Acc: 73.08%
Epoch 6/15
Train Loss: 0.6162, Acc: 79.76%
Val Loss: 0.7991, Acc: 73.41%
Epoch 7/15
Train Loss: 0.5856, Acc: 80.73%
Val Loss: 0.7853, Acc: 73.81%
Epoch 8/15
Train Loss: 0.5660, Acc: 81.23%
Val Loss: 0.8013, Acc: 74.04%
Epoch 9/15
Train Loss: 0.5508, Acc: 81.66%
Val Loss: 0.7773, Acc: 74.74%
Epoch 10/15
Train Loss: 0.5332, Acc: 82.19%
Val Loss: 0.7989, Acc: 74.57%
Epoch 11/15
Train Loss: 0.5218, Acc: 82.58%
Val Loss: 0.7869, Acc: 74.70%
Epoch 12/15
Train Loss: 0.5189, Acc: 82.50%
Val Loss: 0.7664, Acc: 74.47%
Epoch 13/15
Train Loss: 0.5047, Acc: 82.99%
Val Loss: 0.7696, Acc: 75.14%
Epoch 14/15
Train Loss: 0.4958, Acc: 83.29%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇█████████
train_loss,█▃▃▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▇▇▇▇████████
val_loss,█▄▂▂▂▁▁▂▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,83.73804
train_loss,0.48231
val_acc,74.91971
val_loss,0.77202


[34m[1mwandb[0m: Agent Starting Run: yox3f4pr with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 2.1898, Acc: 36.27%
Val Loss: 2.0792, Acc: 41.18%
Epoch 2/15
Train Loss: 1.8874, Acc: 44.36%
Val Loss: 1.9383, Acc: 43.13%
Epoch 3/15
Train Loss: 1.7658, Acc: 47.31%
Val Loss: 1.7935, Acc: 45.61%
Epoch 4/15
Train Loss: 1.6580, Acc: 50.30%
Val Loss: 1.7302, Acc: 47.44%
Epoch 5/15
Train Loss: 1.5867, Acc: 52.02%
Val Loss: 1.6727, Acc: 48.92%
Epoch 6/15
Train Loss: 1.5410, Acc: 53.21%
Val Loss: 1.6465, Acc: 49.10%
Epoch 7/15
Train Loss: 1.5025, Acc: 54.10%
Val Loss: 1.6208, Acc: 49.74%
Epoch 8/15
Train Loss: 1.4725, Acc: 54.99%
Val Loss: 1.6094, Acc: 50.05%
Epoch 9/15
Train Loss: 1.4404, Acc: 55.83%
Val Loss: 1.5648, Acc: 51.22%
Epoch 10/15
Train Loss: 1.4140, Acc: 56.50%
Val Loss: 1.5393, Acc: 52.18%
Epoch 11/15
Train Loss: 1.3953, Acc: 57.00%
Val Loss: 1.5437, Acc: 52.46%
Epoch 12/15
Train Loss: 1.3712, Acc: 57.69%
Val Loss: 1.5086, Acc: 52.82%
Epoch 13/15
Train Loss: 1.3512, Acc: 58.26%
Val Loss: 1.5014, Acc: 52.79%
Epoch 14/15
Train Loss: 1.3345, Acc: 58.80%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▃▄▅▆▆▆▇▇▇▇████
train_loss,█▆▅▄▃▃▂▂▂▂▂▁▁▁▁
val_acc,▁▂▃▄▅▅▅▆▆▇▇▇▇▇█
val_loss,█▆▅▄▃▃▃▃▂▂▂▂▂▁▁

0,1
epoch,15.0
train_acc,59.29501
train_loss,1.31797
val_acc,54.54703
val_loss,1.45694


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: cspkb8s3 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.9495, Acc: 42.36%
Val Loss: 1.2082, Acc: 60.83%
Epoch 2/15
Train Loss: 0.9175, Acc: 70.61%
Val Loss: 0.9184, Acc: 69.41%
Epoch 3/15
Train Loss: 0.7210, Acc: 76.76%
Val Loss: 0.8686, Acc: 71.34%
Epoch 4/15
Train Loss: 0.6349, Acc: 79.48%
Val Loss: 0.8205, Acc: 72.65%
Epoch 5/15
Train Loss: 0.5794, Acc: 81.16%
Val Loss: 0.8144, Acc: 73.72%
Epoch 6/15
Train Loss: 0.5507, Acc: 81.93%
Val Loss: 0.7915, Acc: 74.03%
Epoch 7/15
Train Loss: 0.5167, Acc: 83.00%
Val Loss: 0.7979, Acc: 74.31%
Epoch 8/15
Train Loss: 0.5009, Acc: 83.31%
Val Loss: 0.8038, Acc: 74.81%
Epoch 9/15
Train Loss: 0.4795, Acc: 84.01%
Val Loss: 0.8039, Acc: 74.90%
Epoch 10/15
Train Loss: 0.4692, Acc: 84.20%
Val Loss: 0.7717, Acc: 75.29%
Epoch 11/15
Train Loss: 0.4575, Acc: 84.46%
Val Loss: 0.8012, Acc: 75.02%
Epoch 12/15
Train Loss: 0.4391, Acc: 85.04%
Val Loss: 0.8074, Acc: 74.86%
Epoch 13/15
Train Loss: 0.4308, Acc: 85.19%
Val Loss: 0.8024, Acc: 75.19%
Epoch 14/15
Train Loss: 0.4281, Acc: 85.18%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▇▇▇▇████████
val_loss,█▃▃▂▂▁▁▂▂▁▁▂▁▁▁

0,1
epoch,15.0
train_acc,85.64176
train_loss,0.41331
val_acc,75.46078
val_loss,0.7881


[34m[1mwandb[0m: Agent Starting Run: ycmh85vr with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.4875, Acc: 54.08%
Val Loss: 1.0604, Acc: 62.82%
Epoch 2/15
Train Loss: 0.8074, Acc: 73.59%
Val Loss: 0.9146, Acc: 69.20%
Epoch 3/15
Train Loss: 0.6775, Acc: 77.77%
Val Loss: 0.8877, Acc: 70.45%
Epoch 4/15
Train Loss: 0.6076, Acc: 79.98%
Val Loss: 0.8471, Acc: 71.66%
Epoch 5/15
Train Loss: 0.5615, Acc: 81.47%
Val Loss: 0.8351, Acc: 72.40%
Epoch 6/15
Train Loss: 0.5266, Acc: 82.51%
Val Loss: 0.8168, Acc: 73.41%
Epoch 7/15
Train Loss: 0.4957, Acc: 83.51%
Val Loss: 0.8146, Acc: 73.91%
Epoch 8/15
Train Loss: 0.4860, Acc: 83.69%
Val Loss: 0.8162, Acc: 73.74%
Epoch 9/15
Train Loss: 0.4733, Acc: 83.95%
Val Loss: 0.8253, Acc: 74.00%
Epoch 10/15
Train Loss: 0.4531, Acc: 84.59%
Val Loss: 0.7964, Acc: 73.95%
Epoch 11/15
Train Loss: 0.4403, Acc: 85.01%
Val Loss: 0.8265, Acc: 73.71%
Epoch 12/15
Train Loss: 0.4328, Acc: 85.12%
Val Loss: 0.8187, Acc: 74.16%
Epoch 13/15
Train Loss: 0.4206, Acc: 85.39%
Val Loss: 0.8213, Acc: 74.44%
Epoch 14/15
Train Loss: 0.4153, Acc: 85.49%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇█████████
val_loss,█▄▃▂▂▂▁▂▂▁▂▂▂▁▂

0,1
epoch,15.0
train_acc,85.84814
train_loss,0.40241
val_acc,74.40178
val_loss,0.82827


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: yeey4ogj with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/15
Train Loss: 2.4779, Acc: 28.59%
Val Loss: 2.2635, Acc: 34.37%
Epoch 2/15
Train Loss: 2.0328, Acc: 39.84%
Val Loss: 1.9597, Acc: 41.49%
Epoch 3/15
Train Loss: 1.8329, Acc: 45.07%
Val Loss: 1.8043, Acc: 44.83%
Epoch 4/15
Train Loss: 1.6957, Acc: 48.52%
Val Loss: 1.6953, Acc: 47.42%
Epoch 5/15
Train Loss: 1.5883, Acc: 51.15%
Val Loss: 1.6138, Acc: 48.74%
Epoch 6/15
Train Loss: 1.4976, Acc: 53.52%
Val Loss: 1.4868, Acc: 52.18%
Epoch 7/15
Train Loss: 1.4190, Acc: 55.69%
Val Loss: 1.4376, Acc: 53.53%
Epoch 8/15
Train Loss: 1.3523, Acc: 57.70%
Val Loss: 1.3848, Acc: 55.01%
Epoch 9/15
Train Loss: 1.2998, Acc: 59.14%
Val Loss: 1.3185, Acc: 56.95%
Epoch 10/15
Train Loss: 1.2415, Acc: 61.00%
Val Loss: 1.3044, Acc: 57.49%
Epoch 11/15
Train Loss: 1.2058, Acc: 61.92%
Val Loss: 1.2462, Acc: 59.09%
Epoch 12/15
Train Loss: 1.1639, Acc: 63.09%
Val Loss: 1.2240, Acc: 59.90%
Epoch 13/15
Train Loss: 1.1168, Acc: 64.67%
Val Loss: 1.1982, Acc: 61.26%
Epoch 14/15
Train Loss: 1.0893, Acc: 65.50%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▃▄▅▅▆▆▆▇▇▇▇███
train_loss,█▆▅▄▄▃▃▂▂▂▂▂▁▁▁
val_acc,▁▃▄▄▅▅▆▆▇▇▇▇███
val_loss,█▆▅▄▄▃▃▂▂▂▂▁▁▁▁

0,1
epoch,15.0
train_acc,66.56421
train_loss,1.05484
val_acc,62.56763
val_loss,1.16361


[34m[1mwandb[0m: Agent Starting Run: dsh73upq with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 2.4359, Acc: 29.42%
Val Loss: 2.2767, Acc: 34.49%
Epoch 2/15
Train Loss: 2.0975, Acc: 38.72%
Val Loss: 2.0787, Acc: 39.70%
Epoch 3/15
Train Loss: 1.9221, Acc: 43.28%
Val Loss: 1.8858, Acc: 43.93%
Epoch 4/15
Train Loss: 1.7799, Acc: 46.72%
Val Loss: 1.7879, Acc: 45.67%
Epoch 5/15
Train Loss: 1.6906, Acc: 48.56%
Val Loss: 1.6876, Acc: 47.50%
Epoch 6/15
Train Loss: 1.6022, Acc: 50.70%
Val Loss: 1.6231, Acc: 49.06%
Epoch 7/15
Train Loss: 1.5171, Acc: 53.24%
Val Loss: 1.5824, Acc: 50.01%
Epoch 8/15
Train Loss: 1.4696, Acc: 54.33%
Val Loss: 1.5066, Acc: 52.16%
Epoch 9/15
Train Loss: 1.4006, Acc: 56.40%
Val Loss: 1.4793, Acc: 53.58%
Epoch 10/15
Train Loss: 1.3501, Acc: 57.94%
Val Loss: 1.4142, Acc: 55.08%
Epoch 11/15
Train Loss: 1.3097, Acc: 58.95%
Val Loss: 1.3537, Acc: 56.68%
Epoch 12/15
Train Loss: 1.2714, Acc: 60.05%
Val Loss: 1.3411, Acc: 57.67%
Epoch 13/15
Train Loss: 1.2300, Acc: 61.30%
Val Loss: 1.3058, Acc: 58.10%
Epoch 14/15
Train Loss: 1.1969, Acc: 62.28%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▃▄▅▅▅▆▆▇▇▇▇███
train_loss,█▆▅▄▄▃▃▃▂▂▂▂▁▁▁
val_acc,▁▂▄▄▅▅▅▆▆▇▇▇███
val_loss,█▇▅▅▄▄▃▃▃▂▂▂▁▁▁

0,1
epoch,15.0
train_acc,63.25697
train_loss,1.16489
val_acc,59.67709
val_loss,1.2508


[34m[1mwandb[0m: Agent Starting Run: t3d7y2bh with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 2.2682, Acc: 35.28%
Val Loss: 2.0886, Acc: 41.21%
Epoch 2/15
Train Loss: 1.8362, Acc: 46.21%
Val Loss: 1.7592, Acc: 47.63%
Epoch 3/15
Train Loss: 1.5636, Acc: 52.58%
Val Loss: 1.6207, Acc: 49.82%
Epoch 4/15
Train Loss: 1.4038, Acc: 56.76%
Val Loss: 1.4162, Acc: 55.28%
Epoch 5/15
Train Loss: 1.2600, Acc: 61.05%
Val Loss: 1.3002, Acc: 58.55%
Epoch 6/15
Train Loss: 1.1544, Acc: 64.00%
Val Loss: 1.2392, Acc: 59.92%
Epoch 7/15
Train Loss: 1.0710, Acc: 66.37%
Val Loss: 1.1272, Acc: 63.07%
Epoch 8/15
Train Loss: 1.0126, Acc: 67.90%
Val Loss: 1.0986, Acc: 64.59%
Epoch 9/15
Train Loss: 0.9639, Acc: 69.29%
Val Loss: 1.0859, Acc: 64.73%
Epoch 10/15
Train Loss: 0.9125, Acc: 70.92%
Val Loss: 1.0558, Acc: 65.45%
Epoch 11/15
Train Loss: 0.8940, Acc: 71.35%
Val Loss: 1.0086, Acc: 66.97%
Epoch 12/15
Train Loss: 0.8599, Acc: 72.42%
Val Loss: 1.0068, Acc: 66.37%
Epoch 13/15
Train Loss: 0.8387, Acc: 73.03%
Val Loss: 1.0050, Acc: 67.10%
Epoch 14/15
Train Loss: 0.8202, Acc: 73.55%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▃▄▅▆▆▇▇▇▇▇████
train_loss,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁
val_acc,▁▃▃▅▆▆▇▇▇▇█████
val_loss,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁

0,1
epoch,15.0
train_acc,74.51434
train_loss,0.79342
val_acc,67.67744
val_loss,0.97203


[34m[1mwandb[0m: Agent Starting Run: t7d1otzw with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 2.2499, Acc: 35.41%
Val Loss: 2.0772, Acc: 40.73%
Epoch 2/15
Train Loss: 1.7658, Acc: 47.55%
Val Loss: 1.7414, Acc: 47.10%
Epoch 3/15
Train Loss: 1.5230, Acc: 53.58%
Val Loss: 1.5572, Acc: 51.54%
Epoch 4/15
Train Loss: 1.3745, Acc: 57.71%
Val Loss: 1.4855, Acc: 53.90%
Epoch 5/15
Train Loss: 1.2518, Acc: 61.20%
Val Loss: 1.3554, Acc: 56.93%
Epoch 6/15
Train Loss: 1.1693, Acc: 63.33%
Val Loss: 1.2624, Acc: 59.49%
Epoch 7/15
Train Loss: 1.0902, Acc: 65.75%
Val Loss: 1.2501, Acc: 60.09%
Epoch 8/15
Train Loss: 1.0221, Acc: 67.88%
Val Loss: 1.1752, Acc: 61.73%
Epoch 9/15
Train Loss: 0.9713, Acc: 69.32%
Val Loss: 1.1364, Acc: 63.13%
Epoch 10/15
Train Loss: 0.9362, Acc: 70.18%
Val Loss: 1.1187, Acc: 63.73%
Epoch 11/15
Train Loss: 0.8912, Acc: 71.77%
Val Loss: 1.1154, Acc: 64.02%
Epoch 12/15
Train Loss: 0.8784, Acc: 71.87%
Val Loss: 1.1177, Acc: 64.27%
Epoch 13/15
Train Loss: 0.8613, Acc: 72.31%
Val Loss: 1.0788, Acc: 65.21%
Epoch 14/15
Train Loss: 0.8284, Acc: 73.43%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▃▄▅▆▆▇▇▇▇█████
train_loss,█▆▄▄▃▃▂▂▂▂▁▁▁▁▁
val_acc,▁▃▄▅▆▆▆▇▇▇█████
val_loss,█▆▄▄▃▂▂▂▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,73.90101
train_loss,0.80975
val_acc,65.14858
val_loss,1.06963


[34m[1mwandb[0m: Agent Starting Run: ndzqxg9t with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.5748, Acc: 52.53%
Val Loss: 0.9994, Acc: 66.27%
Epoch 2/15
Train Loss: 0.7779, Acc: 74.87%
Val Loss: 0.8744, Acc: 70.71%
Epoch 3/15
Train Loss: 0.6548, Acc: 78.67%
Val Loss: 0.8564, Acc: 72.56%
Epoch 4/15
Train Loss: 0.5948, Acc: 80.51%
Val Loss: 0.8241, Acc: 73.05%
Epoch 5/15
Train Loss: 0.5535, Acc: 81.73%
Val Loss: 0.7969, Acc: 73.72%
Epoch 6/15
Train Loss: 0.5156, Acc: 82.92%
Val Loss: 0.8008, Acc: 74.03%
Epoch 7/15
Train Loss: 0.5019, Acc: 83.15%
Val Loss: 0.7917, Acc: 74.47%
Epoch 8/15
Train Loss: 0.4846, Acc: 83.70%
Val Loss: 0.8085, Acc: 74.74%
Epoch 9/15
Train Loss: 0.4669, Acc: 84.23%
Val Loss: 0.7874, Acc: 74.38%
Epoch 10/15
Train Loss: 0.4588, Acc: 84.34%
Val Loss: 0.7936, Acc: 74.73%
Epoch 11/15
Train Loss: 0.4516, Acc: 84.43%
Val Loss: 0.7677, Acc: 75.24%
Epoch 12/15
Train Loss: 0.4460, Acc: 84.57%
Val Loss: 0.7663, Acc: 74.80%
Epoch 13/15
Train Loss: 0.4296, Acc: 85.05%
Val Loss: 0.8121, Acc: 75.10%
Epoch 14/15
Train Loss: 0.4257, Acc: 85.14%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▆▆▇▇▇█▇██████
val_loss,█▄▄▃▂▂▂▂▂▂▁▁▂▁▂

0,1
epoch,15.0
train_acc,85.29854
train_loss,0.41819
val_acc,75.18012
val_loss,0.79751


[34m[1mwandb[0m: Agent Starting Run: n91f3nbw with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 2.0697, Acc: 38.83%
Val Loss: 1.3035, Acc: 56.78%
Epoch 2/15
Train Loss: 1.0324, Acc: 66.50%
Val Loss: 0.9715, Acc: 67.27%
Epoch 3/15
Train Loss: 0.8065, Acc: 73.86%
Val Loss: 0.8763, Acc: 70.66%
Epoch 4/15
Train Loss: 0.7194, Acc: 76.54%
Val Loss: 0.8536, Acc: 71.96%
Epoch 5/15
Train Loss: 0.6619, Acc: 78.35%
Val Loss: 0.8173, Acc: 72.66%
Epoch 6/15
Train Loss: 0.6304, Acc: 79.19%
Val Loss: 0.8055, Acc: 73.59%
Epoch 7/15
Train Loss: 0.5896, Acc: 80.63%
Val Loss: 0.7730, Acc: 73.73%
Epoch 8/15
Train Loss: 0.5714, Acc: 81.17%
Val Loss: 0.7883, Acc: 74.25%
Epoch 9/15
Train Loss: 0.5473, Acc: 81.93%
Val Loss: 0.7717, Acc: 74.20%
Epoch 10/15
Train Loss: 0.5459, Acc: 81.75%
Val Loss: 0.7695, Acc: 74.62%
Epoch 11/15
Train Loss: 0.5261, Acc: 82.39%
Val Loss: 0.7846, Acc: 74.80%
Epoch 12/15
Train Loss: 0.5136, Acc: 82.77%
Val Loss: 0.7499, Acc: 75.06%
Epoch 13/15
Train Loss: 0.5041, Acc: 83.03%
Val Loss: 0.7636, Acc: 74.85%
Epoch 14/15
Train Loss: 0.4955, Acc: 83.27%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇█████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▇▇▇▇████████
val_loss,█▄▃▂▂▂▁▁▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,83.64947
train_loss,0.48463
val_acc,75.16276
val_loss,0.7586


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: wiunnrml with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.4075, Acc: 57.00%
Val Loss: 0.9825, Acc: 66.98%
Epoch 2/15
Train Loss: 0.7508, Acc: 75.43%
Val Loss: 0.8848, Acc: 71.24%
Epoch 3/15
Train Loss: 0.6288, Acc: 79.42%
Val Loss: 0.8568, Acc: 72.12%
Epoch 4/15
Train Loss: 0.5769, Acc: 80.99%
Val Loss: 0.8112, Acc: 73.55%
Epoch 5/15
Train Loss: 0.5293, Acc: 82.42%
Val Loss: 0.8164, Acc: 74.09%
Epoch 6/15
Train Loss: 0.5102, Acc: 82.94%
Val Loss: 0.8174, Acc: 74.16%
Epoch 7/15
Train Loss: 0.4872, Acc: 83.66%
Val Loss: 0.7903, Acc: 74.25%
Epoch 8/15
Train Loss: 0.4709, Acc: 84.06%
Val Loss: 0.7975, Acc: 74.18%
Epoch 9/15
Train Loss: 0.4517, Acc: 84.61%
Val Loss: 0.7870, Acc: 74.74%
Epoch 10/15
Train Loss: 0.4503, Acc: 84.45%
Val Loss: 0.8142, Acc: 74.53%
Epoch 11/15
Train Loss: 0.4344, Acc: 85.00%
Val Loss: 0.7999, Acc: 74.61%
Epoch 12/15
Train Loss: 0.4257, Acc: 85.15%
Val Loss: 0.7921, Acc: 75.08%
Epoch 13/15
Train Loss: 0.4199, Acc: 85.28%
Val Loss: 0.7777, Acc: 75.06%
Epoch 14/15
Train Loss: 0.4130, Acc: 85.40%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▅▅▇▇▇▇▇███████
val_loss,█▅▄▂▂▂▁▂▁▂▂▁▁▂▂

0,1
epoch,15.0
train_acc,85.78054
train_loss,0.39988
val_acc,74.99204
val_loss,0.80303


[34m[1mwandb[0m: Agent Starting Run: dq5dejol with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 2.2723, Acc: 33.76%
Val Loss: 1.6611, Acc: 46.52%
Epoch 2/15
Train Loss: 1.1243, Acc: 64.51%
Val Loss: 0.9851, Acc: 66.76%
Epoch 3/15
Train Loss: 0.8078, Acc: 74.15%
Val Loss: 0.8910, Acc: 69.73%
Epoch 4/15
Train Loss: 0.7063, Acc: 77.24%
Val Loss: 0.8634, Acc: 71.87%
Epoch 5/15
Train Loss: 0.6480, Acc: 78.97%
Val Loss: 0.8635, Acc: 72.42%
Epoch 6/15
Train Loss: 0.6060, Acc: 80.26%
Val Loss: 0.8092, Acc: 73.38%
Epoch 7/15
Train Loss: 0.5741, Acc: 81.26%
Val Loss: 0.8110, Acc: 73.33%
Epoch 8/15
Train Loss: 0.5546, Acc: 81.69%
Val Loss: 0.7988, Acc: 73.51%
Epoch 9/15
Train Loss: 0.5321, Acc: 82.50%
Val Loss: 0.8005, Acc: 73.80%
Epoch 10/15
Train Loss: 0.5160, Acc: 82.83%
Val Loss: 0.8004, Acc: 73.99%
Epoch 11/15
Train Loss: 0.5000, Acc: 83.30%
Val Loss: 0.7949, Acc: 74.60%
Epoch 12/15
Train Loss: 0.4797, Acc: 83.99%
Val Loss: 0.7820, Acc: 74.35%
Epoch 13/15
Train Loss: 0.4780, Acc: 83.89%
Val Loss: 0.7930, Acc: 74.81%
Epoch 14/15
Train Loss: 0.4650, Acc: 84.30%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▇▇▇▇█████████
train_loss,█▄▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▆▇▇▇██████████
val_loss,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,84.39192
train_loss,0.46049
val_acc,74.78082
val_loss,0.77535


[34m[1mwandb[0m: Agent Starting Run: bq87zo6n with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.8840, Acc: 42.05%
Val Loss: 1.2589, Acc: 57.22%
Epoch 2/15
Train Loss: 0.9581, Acc: 68.99%
Val Loss: 0.9719, Acc: 67.92%
Epoch 3/15
Train Loss: 0.7767, Acc: 74.65%
Val Loss: 0.8680, Acc: 70.16%
Epoch 4/15
Train Loss: 0.6934, Acc: 77.27%
Val Loss: 0.8606, Acc: 71.62%
Epoch 5/15
Train Loss: 0.6335, Acc: 79.23%
Val Loss: 0.8462, Acc: 72.15%
Epoch 6/15
Train Loss: 0.5985, Acc: 80.32%
Val Loss: 0.8459, Acc: 72.48%
Epoch 7/15
Train Loss: 0.5783, Acc: 80.78%
Val Loss: 0.8358, Acc: 73.12%
Epoch 8/15
Train Loss: 0.5341, Acc: 82.42%
Val Loss: 0.8179, Acc: 73.58%
Epoch 9/15
Train Loss: 0.5213, Acc: 82.73%
Val Loss: 0.8088, Acc: 73.93%
Epoch 10/15
Train Loss: 0.5090, Acc: 82.92%
Val Loss: 0.7994, Acc: 74.42%
Epoch 11/15
Train Loss: 0.4976, Acc: 83.31%
Val Loss: 0.8001, Acc: 74.23%
Epoch 12/15
Train Loss: 0.4858, Acc: 83.69%
Val Loss: 0.7979, Acc: 74.61%
Epoch 13/15
Train Loss: 0.4718, Acc: 84.07%
Val Loss: 0.7996, Acc: 74.22%
Epoch 14/15
Train Loss: 0.4697, Acc: 84.05%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▅▆▇▇▇▇▇███████
val_loss,█▄▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,84.25177
train_loss,0.46381
val_acc,74.22818
val_loss,0.80138


[34m[1mwandb[0m: Agent Starting Run: 20c74d9l with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.6042, Acc: 49.49%
Val Loss: 1.1109, Acc: 63.48%
Epoch 2/15
Train Loss: 0.9266, Acc: 69.35%
Val Loss: 0.9430, Acc: 69.15%
Epoch 3/15
Train Loss: 0.7767, Acc: 74.51%
Val Loss: 0.8828, Acc: 71.03%
Epoch 4/15
Train Loss: 0.7042, Acc: 76.92%
Val Loss: 0.8755, Acc: 71.69%
Epoch 5/15
Train Loss: 0.6747, Acc: 77.82%
Val Loss: 0.8389, Acc: 72.47%
Epoch 6/15
Train Loss: 0.6330, Acc: 79.22%
Val Loss: 0.8136, Acc: 73.18%
Epoch 7/15
Train Loss: 0.6083, Acc: 79.97%
Val Loss: 0.8472, Acc: 73.56%
Epoch 8/15
Train Loss: 0.5917, Acc: 80.47%
Val Loss: 0.8117, Acc: 74.25%
Epoch 9/15
Train Loss: 0.5757, Acc: 80.97%
Val Loss: 0.8260, Acc: 74.11%
Epoch 10/15
Train Loss: 0.5689, Acc: 81.19%
Val Loss: 0.8286, Acc: 74.31%
Epoch 11/15
Train Loss: 0.5596, Acc: 81.36%
Val Loss: 0.8347, Acc: 73.96%
Epoch 12/15
Train Loss: 0.5463, Acc: 81.84%
Val Loss: 0.7965, Acc: 74.44%
Epoch 13/15
Train Loss: 0.5382, Acc: 82.09%
Val Loss: 0.8068, Acc: 74.00%
Epoch 14/15
Train Loss: 0.5279, Acc: 82.34%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇▇████████
val_loss,█▄▃▃▂▁▂▁▂▂▂▁▁▂▁

0,1
epoch,15.0
train_acc,82.20705
train_loss,0.5304
val_acc,74.59564
val_loss,0.81478


[34m[1mwandb[0m: Agent Starting Run: zp3gbqsj with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/15
Train Loss: 2.0434, Acc: 39.54%
Val Loss: 1.3052, Acc: 56.15%
Epoch 2/15
Train Loss: 1.0172, Acc: 66.97%
Val Loss: 1.0009, Acc: 66.29%
Epoch 3/15
Train Loss: 0.8097, Acc: 73.46%
Val Loss: 0.9110, Acc: 68.83%
Epoch 4/15
Train Loss: 0.6948, Acc: 77.32%
Val Loss: 0.8604, Acc: 71.12%
Epoch 5/15
Train Loss: 0.6502, Acc: 78.61%
Val Loss: 0.8340, Acc: 72.05%
Epoch 6/15
Train Loss: 0.6044, Acc: 80.07%
Val Loss: 0.8069, Acc: 72.46%
Epoch 7/15
Train Loss: 0.5601, Acc: 81.53%
Val Loss: 0.8260, Acc: 73.23%
Epoch 8/15
Train Loss: 0.5446, Acc: 81.86%
Val Loss: 0.7847, Acc: 73.90%
Epoch 9/15
Train Loss: 0.5268, Acc: 82.45%
Val Loss: 0.7909, Acc: 73.78%
Epoch 10/15
Train Loss: 0.5058, Acc: 83.10%
Val Loss: 0.7896, Acc: 74.02%
Epoch 11/15
Train Loss: 0.4900, Acc: 83.55%
Val Loss: 0.7946, Acc: 73.81%
Epoch 12/15
Train Loss: 0.4710, Acc: 84.08%
Val Loss: 0.7975, Acc: 74.31%
Epoch 13/15
Train Loss: 0.4646, Acc: 84.18%
Val Loss: 0.8092, Acc: 73.77%
Epoch 14/15
Train Loss: 0.4536, Acc: 84.57%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▅▆▇▇▇▇████████
val_loss,█▄▃▂▂▁▂▁▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,84.86813
train_loss,0.44054
val_acc,74.95443
val_loss,0.78912


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4n3kcgza with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/15
Train Loss: 1.4188, Acc: 55.46%
Val Loss: 1.0102, Acc: 65.15%
Epoch 2/15
Train Loss: 0.8032, Acc: 73.79%
Val Loss: 0.8963, Acc: 70.61%
Epoch 3/15
Train Loss: 0.6785, Acc: 77.84%
Val Loss: 0.8566, Acc: 71.85%
Epoch 4/15
Train Loss: 0.6161, Acc: 79.84%
Val Loss: 0.8493, Acc: 72.79%
Epoch 5/15
Train Loss: 0.5730, Acc: 81.26%
Val Loss: 0.8139, Acc: 73.80%
Epoch 6/15
Train Loss: 0.5433, Acc: 82.16%
Val Loss: 0.8070, Acc: 74.17%
Epoch 7/15
Train Loss: 0.5213, Acc: 82.79%
Val Loss: 0.7850, Acc: 74.55%
Epoch 8/15
Train Loss: 0.5079, Acc: 83.05%
Val Loss: 0.8056, Acc: 74.43%
Epoch 9/15
Train Loss: 0.4917, Acc: 83.53%
Val Loss: 0.7877, Acc: 74.95%
Epoch 10/15
Train Loss: 0.4772, Acc: 83.91%
Val Loss: 0.7868, Acc: 74.96%
Epoch 11/15
Train Loss: 0.4645, Acc: 84.26%
Val Loss: 0.7927, Acc: 74.79%
Epoch 12/15
Train Loss: 0.4558, Acc: 84.49%
Val Loss: 0.8180, Acc: 74.81%
Epoch 13/15
Train Loss: 0.4469, Acc: 84.70%
Val Loss: 0.7861, Acc: 75.18%
Epoch 14/15
Train Loss: 0.4359, Acc: 85.00%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇▇▇███████
val_loss,█▄▃▃▂▂▁▂▁▁▁▂▁▁▂

0,1
epoch,15.0
train_acc,85.16693
train_loss,0.42968
val_acc,74.94285
val_loss,0.82594


[34m[1mwandb[0m: Agent Starting Run: 3z4tr0a9 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.8365, Acc: 42.66%
Val Loss: 1.2305, Acc: 59.14%
Epoch 2/15
Train Loss: 1.0175, Acc: 66.46%
Val Loss: 0.9715, Acc: 67.22%
Epoch 3/15
Train Loss: 0.8351, Acc: 72.66%
Val Loss: 0.9157, Acc: 69.43%
Epoch 4/15
Train Loss: 0.7555, Acc: 75.28%
Val Loss: 0.8797, Acc: 71.08%
Epoch 5/15
Train Loss: 0.7061, Acc: 76.91%
Val Loss: 0.8834, Acc: 71.80%
Epoch 6/15
Train Loss: 0.6676, Acc: 78.18%
Val Loss: 0.8530, Acc: 71.61%
Epoch 7/15
Train Loss: 0.6433, Acc: 78.99%
Val Loss: 0.8187, Acc: 72.71%
Epoch 8/15
Train Loss: 0.6262, Acc: 79.53%
Val Loss: 0.8503, Acc: 72.69%
Epoch 9/15
Train Loss: 0.6034, Acc: 80.33%
Val Loss: 0.8206, Acc: 73.31%
Epoch 10/15
Train Loss: 0.5906, Acc: 80.66%
Val Loss: 0.8420, Acc: 73.24%
Epoch 11/15
Train Loss: 0.5805, Acc: 81.02%
Val Loss: 0.8375, Acc: 73.66%
Epoch 12/15
Train Loss: 0.5621, Acc: 81.64%
Val Loss: 0.8375, Acc: 73.56%
Epoch 13/15
Train Loss: 0.5625, Acc: 81.58%
Val Loss: 0.8530, Acc: 73.62%
Epoch 14/15
Train Loss: 0.5560, Acc: 81.62%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▅▆▇▇▇▇▇███████
val_loss,█▄▃▂▂▂▁▂▁▁▁▁▂▁▁

0,1
epoch,15.0
train_acc,81.92094
train_loss,0.54673
val_acc,74.05457
val_loss,0.82128


[34m[1mwandb[0m: Agent Starting Run: 16kiufuf with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.8954, Acc: 43.58%
Val Loss: 1.1600, Acc: 61.84%
Epoch 2/15
Train Loss: 0.8947, Acc: 71.12%
Val Loss: 0.9200, Acc: 68.91%
Epoch 3/15
Train Loss: 0.7209, Acc: 76.52%
Val Loss: 0.8705, Acc: 71.29%
Epoch 4/15
Train Loss: 0.6402, Acc: 79.06%
Val Loss: 0.8347, Acc: 72.30%
Epoch 5/15
Train Loss: 0.5912, Acc: 80.64%
Val Loss: 0.8118, Acc: 73.12%
Epoch 6/15
Train Loss: 0.5526, Acc: 81.83%
Val Loss: 0.8015, Acc: 73.71%
Epoch 7/15
Train Loss: 0.5269, Acc: 82.56%
Val Loss: 0.7727, Acc: 74.23%
Epoch 8/15
Train Loss: 0.5038, Acc: 83.22%
Val Loss: 0.7811, Acc: 74.63%
Epoch 9/15
Train Loss: 0.4896, Acc: 83.55%
Val Loss: 0.8056, Acc: 74.51%
Epoch 10/15
Train Loss: 0.4752, Acc: 83.93%
Val Loss: 0.7956, Acc: 74.51%
Epoch 11/15
Train Loss: 0.4635, Acc: 84.25%
Val Loss: 0.7699, Acc: 74.82%
Epoch 12/15
Train Loss: 0.4494, Acc: 84.60%
Val Loss: 0.7782, Acc: 74.72%
Epoch 13/15
Train Loss: 0.4445, Acc: 84.76%
Val Loss: 0.7772, Acc: 75.35%
Epoch 14/15
Train Loss: 0.4325, Acc: 85.08%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇▇████████
val_loss,█▄▃▂▂▂▁▁▂▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,85.07589
train_loss,0.42885
val_acc,75.24667
val_loss,0.7732


[34m[1mwandb[0m: Agent Starting Run: sm12c672 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/20
Train Loss: 1.6876, Acc: 47.55%
Val Loss: 1.0979, Acc: 62.50%
Epoch 2/20
Train Loss: 0.8810, Acc: 71.40%
Val Loss: 0.9687, Acc: 68.77%
Epoch 3/20
Train Loss: 0.7360, Acc: 76.11%
Val Loss: 0.9171, Acc: 70.65%
Epoch 4/20
Train Loss: 0.6704, Acc: 78.17%
Val Loss: 0.8979, Acc: 71.54%
Epoch 5/20
Train Loss: 0.6235, Acc: 79.65%
Val Loss: 0.8618, Acc: 72.51%
Epoch 6/20
Train Loss: 0.5931, Acc: 80.61%
Val Loss: 0.8318, Acc: 73.12%
Epoch 7/20
Train Loss: 0.5717, Acc: 81.16%
Val Loss: 0.8411, Acc: 73.42%
Epoch 8/20
Train Loss: 0.5481, Acc: 81.93%
Val Loss: 0.8301, Acc: 73.59%
Epoch 9/20
Train Loss: 0.5387, Acc: 82.10%
Val Loss: 0.8412, Acc: 73.18%
Epoch 10/20
Train Loss: 0.5175, Acc: 82.80%
Val Loss: 0.8556, Acc: 73.80%
Epoch 11/20
Train Loss: 0.5038, Acc: 83.23%
Val Loss: 0.8368, Acc: 74.17%
Epoch 12/20
Train Loss: 0.5102, Acc: 82.92%
Val Loss: 0.7940, Acc: 74.34%
Epoch 13/20
Train Loss: 0.4984, Acc: 83.26%
Val Loss: 0.8230, Acc: 74.26%
Epoch 14/20
Train Loss: 0.4841, Acc: 83.71%
Val

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▆▆▇▇▇▇█████████████
train_loss,█▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▆▇▇▇▇▇▇██████████
val_loss,█▅▄▃▃▂▂▂▂▂▂▁▂▂▃▁▂▂▁▁

0,1
epoch,20.0
train_acc,84.53319
train_loss,0.45333
val_acc,74.41046
val_loss,0.79422


[34m[1mwandb[0m: Agent Starting Run: a6fsw9dg with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.2661, Acc: 59.65%
Val Loss: 1.0002, Acc: 66.62%
Epoch 2/15
Train Loss: 0.7886, Acc: 74.16%
Val Loss: 0.8904, Acc: 70.81%
Epoch 3/15
Train Loss: 0.6842, Acc: 77.68%
Val Loss: 0.8809, Acc: 71.32%
Epoch 4/15
Train Loss: 0.6292, Acc: 79.39%
Val Loss: 0.8741, Acc: 72.36%
Epoch 5/15
Train Loss: 0.5966, Acc: 80.38%
Val Loss: 0.8798, Acc: 73.05%
Epoch 6/15
Train Loss: 0.5733, Acc: 81.13%
Val Loss: 0.8426, Acc: 72.72%
Epoch 7/15
Train Loss: 0.5643, Acc: 81.36%
Val Loss: 0.8669, Acc: 73.15%
Epoch 8/15
Train Loss: 0.5520, Acc: 81.72%
Val Loss: 0.8347, Acc: 73.32%
Epoch 9/15
Train Loss: 0.5412, Acc: 82.00%
Val Loss: 0.8197, Acc: 73.34%
Epoch 10/15
Train Loss: 0.5219, Acc: 82.68%
Val Loss: 0.8537, Acc: 73.57%
Epoch 11/15
Train Loss: 0.5224, Acc: 82.59%
Val Loss: 0.8548, Acc: 73.11%
Epoch 12/15
Train Loss: 0.5123, Acc: 82.92%
Val Loss: 0.8518, Acc: 73.37%
Epoch 13/15
Train Loss: 0.5099, Acc: 82.93%
Val Loss: 0.8633, Acc: 73.76%
Epoch 14/15
Train Loss: 0.4981, Acc: 83.23%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▅▅▆▇▇▇▇▇█▇▇███
val_loss,█▄▃▃▃▂▃▂▁▂▂▂▃▂▁

0,1
epoch,15.0
train_acc,83.12333
train_loss,0.5026
val_acc,74.08929
val_loss,0.81907


[34m[1mwandb[0m: Agent Starting Run: tg4tt48f with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.4077, Acc: 54.87%
Val Loss: 1.0090, Acc: 66.14%
Epoch 2/15
Train Loss: 0.8640, Acc: 71.54%
Val Loss: 0.9221, Acc: 69.50%
Epoch 3/15
Train Loss: 0.7513, Acc: 75.29%
Val Loss: 0.8711, Acc: 71.03%
Epoch 4/15
Train Loss: 0.6924, Acc: 77.31%
Val Loss: 0.8669, Acc: 71.88%
Epoch 5/15
Train Loss: 0.6581, Acc: 78.29%
Val Loss: 0.8526, Acc: 72.66%
Epoch 6/15
Train Loss: 0.6410, Acc: 78.81%
Val Loss: 0.8505, Acc: 73.46%
Epoch 7/15
Train Loss: 0.6177, Acc: 79.61%
Val Loss: 0.8647, Acc: 73.26%
Epoch 8/15
Train Loss: 0.6004, Acc: 80.14%
Val Loss: 0.8810, Acc: 73.43%
Epoch 9/15
Train Loss: 0.5909, Acc: 80.56%
Val Loss: 0.8552, Acc: 73.15%
Epoch 10/15
Train Loss: 0.5896, Acc: 80.41%
Val Loss: 0.8119, Acc: 73.61%
Epoch 11/15
Train Loss: 0.5795, Acc: 80.69%
Val Loss: 0.8255, Acc: 73.75%
Epoch 12/15
Train Loss: 0.5674, Acc: 81.18%
Val Loss: 0.8329, Acc: 73.74%
Epoch 13/15
Train Loss: 0.5686, Acc: 81.12%
Val Loss: 0.8025, Acc: 74.07%
Epoch 14/15
Train Loss: 0.5660, Acc: 81.03%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇▇████████
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▇▇▇▇▇▇▇▇███
val_loss,█▅▃▃▃▃▃▄▃▁▂▂▁▂▂

0,1
epoch,15.0
train_acc,81.72257
train_loss,0.54951
val_acc,74.37863
val_loss,0.81987


In [10]:
# =======================
# Imports and Sweep Config
# =======================
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import wandb
import os
import math
from tqdm import tqdm

os.makedirs("predictions_vanilla", exist_ok=True)

best_config = {
    'embedding_dim': 128,
    'hidden_dim': 256,
    'enc_layers': 2,
    'dec_layers': 3,
    'cell_type': 'LSTM',
    'dropout': 0.5,
    'epochs': 15,
    'beam_size': 3,
    'batch_size': 64,  # Added
    'learning_rate': 0.0005  # Added
}

# =======================
# Vocabulary
# =======================
class Vocab:
    def __init__(self):
        self.char2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2}
        self.idx2char = {0: "<pad>", 1: "<sos>", 2: "<eos>"}
        self.size = 3

    def build(self, texts):
        for text in texts:
            for char in text:
                if char not in self.char2idx:
                    self.char2idx[char] = self.size
                    self.idx2char[self.size] = char
                    self.size += 1

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, idxs):
        return ''.join([self.idx2char[i] for i in idxs if i > 2])

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    def __init__(self, filepath, inp_vocab, out_vocab, is_test=False):
        self.pairs = []
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                lat, dev = fields[0], fields[1]
                self.pairs.append((lat, dev))
        if not is_test:
            inp_vocab.build([p[0] for p in self.pairs])
            out_vocab.build([p[1] for p in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        lat, dev = self.pairs[idx]
        x = self.inp_vocab.encode(lat)
        y = [self.out_vocab.char2idx["<sos>"]] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(x), torch.tensor(y), lat, dev

def collate_fn(batch):
    x_batch, y_batch, lat, dev = zip(*batch)
    x_lens = [len(x) for x in x_batch]
    y_lens = [len(y) for y in y_batch]
    x_pad = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
    y_pad = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return x_pad, y_pad, torch.tensor(x_lens), torch.tensor(y_lens), lat, dev

# =======================
# Encoder, Decoder, Seq2Seq
# =======================
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token).unsqueeze(1)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)
        enc_hidden = self.encoder(src[0], src[1])
        if self.cell_type == "LSTM":
            h, c = enc_hidden
            h = self._match_layers(h)
            c = self._match_layers(c)
            dec_hidden = (h, c)
        else:
            dec_hidden = self._match_layers(enc_hidden)
        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden = self.decoder(input_token, dec_hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

    def _match_layers(self, hidden):
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

    def predict(self, src_tensor, src_len, max_len=30):
        self.eval()
        with torch.no_grad():
            enc_hidden = self.encoder(src_tensor.unsqueeze(0), torch.tensor([src_len]))
            if self.cell_type == "LSTM":
                h, c = enc_hidden
                h = self._match_layers(h)
                c = self._match_layers(c)
                dec_hidden = (h, c)
            else:
                dec_hidden = self._match_layers(enc_hidden)
            input_token = torch.tensor([2]).to(self.device)  # <sos>
            output_seq = []
            for _ in range(max_len):
                output, dec_hidden = self.decoder(input_token, dec_hidden)
                top1 = output.argmax(1)
                if top1.item() == 2: break  # <eos>
                output_seq.append(top1.item())
                input_token = top1
        return output_seq

# =======================
# Train & Eval
# =======================
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, total_correct, total_count = 0, 0, 0
    for src, trg, src_lens, _, _, _ in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model((src, src_lens), trg)
        loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
        pred = output.argmax(2)
        correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
        total_correct += correct
        total_count += (trg[:, 1:] != 0).sum().item()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader), 100.0 * total_correct / total_count

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_correct, total_count = 0, 0, 0
    with torch.no_grad():
        for src, trg, src_lens, _, _, _ in loader:
            src, trg = src.to(device), trg.to(device)
            output = model((src, src_lens), trg, teacher_forcing_ratio=0)
            loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
            pred = output.argmax(2)
            correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
            total_correct += correct
            total_count += (trg[:, 1:] != 0).sum().item()
            total_loss += loss.item()
    return total_loss / len(loader), 100.0 * total_correct / total_count

# =======================
# Main
# =======================
def main():
    wandb.init(project="dakshina-translit-test")
    config = best_config

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inp_vocab, out_vocab = Vocab(), Vocab()
    train_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv", inp_vocab, out_vocab)
    dev_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv", inp_vocab, out_vocab)
    test_data = TransliterationDataset("/kaggle/input/dataset-01/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv", inp_vocab, out_vocab, is_test=True)

    # Update DataLoader batch sizes
    train_loader = DataLoader(train_data, batch_size=config['batch_size'], shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=config['batch_size'], shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_fn)
     # First create model
    encoder = Encoder(inp_vocab.size, config['embedding_dim'], config['hidden_dim'],
                      config['enc_layers'], config['cell_type'], config['dropout'])
    decoder = Decoder(out_vocab.size, config['embedding_dim'], config['hidden_dim'],
                      config['dec_layers'], config['cell_type'], config['dropout'])
    model = Seq2Seq(encoder, decoder, config['enc_layers'], config['dec_layers'], config['cell_type'], device).to(device)
    # Update optimizer with learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_val_acc = 0.0
    for epoch in range(config['epochs']):
        print(f"Epoch {epoch+1}/{config['epochs']}")
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)

        print(f"Train Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%")
        print(f"Val   Loss: {val_loss:.4f}, Accuracy: {val_acc:.2f}%")

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc
        })

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "best_model.pth")
            print("Best model saved.")

    
    # Evaluation on test set
    print("\n Evaluating on test set with best model:")
    model.load_state_dict(torch.load("best_model.pth"))
    model.eval()
    total_correct, total_count = 0, 0

    with open("predictions_vanilla/test_predictions.txt", "w", encoding="utf-8") as f:
        for src, _, src_lens, _, lat, gold in test_loader:
            src = src.to(device)
            pred_ids = model.predict(src[0], src_lens[0].item())
            pred = out_vocab.decode(pred_ids)
            f.write(f"{lat[0]}\t{gold[0]}\t{pred}\n")
            if pred == gold[0]:
                total_correct += 1
            total_count += 1

    test_acc = 100.0 * total_correct / total_count
    print(f"📊 Test Accuracy: {test_acc:.2f}%")
    wandb.log({"test_acc": test_acc})


     # After test evaluation, add random samples display
    print("\nRandom Test Samples Predictions:")
    import random
    random_indices = random.sample(range(len(test_data)), 30)
    
    with open("predictions_vanilla/test_predictions.txt", "a", encoding="utf-8") as f:
        f.write("\n\nRandom Sample Predictions:\n")
        for idx in random_indices:
            x, y, lat, dev = test_data[idx]
            src_tensor = x.to(device)
            pred_ids = model.predict(src_tensor, len(x))
            pred = out_vocab.decode(pred_ids)
            
            print(f"Input: {lat}")
            print(f"True: {dev}")
            print(f"Pred: {pred}\n")
            
            f.write(f"Input: {lat}\n")
            f.write(f"True: {dev}\n")
            f.write(f"Pred: {pred}\n\n")


if __name__ == "__main__":
    main()




Epoch 1/15
Train Loss: 1.7807, Accuracy: 47.10%
Val   Loss: 1.0987, Accuracy: 62.92%
Best model saved.
Epoch 2/15
Train Loss: 0.8313, Accuracy: 73.17%
Val   Loss: 0.8883, Accuracy: 69.75%
Best model saved.
Epoch 3/15
Train Loss: 0.6634, Accuracy: 78.55%
Val   Loss: 0.8675, Accuracy: 71.87%
Best model saved.
Epoch 4/15
Train Loss: 0.5937, Accuracy: 80.57%
Val   Loss: 0.8368, Accuracy: 72.91%
Best model saved.
Epoch 5/15
Train Loss: 0.5427, Accuracy: 82.14%
Val   Loss: 0.8239, Accuracy: 73.44%
Best model saved.
Epoch 6/15
Train Loss: 0.5105, Accuracy: 83.07%
Val   Loss: 0.8186, Accuracy: 73.39%
Epoch 7/15
Train Loss: 0.4891, Accuracy: 83.70%
Val   Loss: 0.7836, Accuracy: 74.55%
Best model saved.
Epoch 8/15
Train Loss: 0.4709, Accuracy: 84.08%
Val   Loss: 0.7878, Accuracy: 74.21%
Epoch 9/15
Train Loss: 0.4516, Accuracy: 84.61%
Val   Loss: 0.7778, Accuracy: 74.79%
Best model saved.
Epoch 10/15
Train Loss: 0.4297, Accuracy: 85.32%
Val   Loss: 0.8025, Accuracy: 74.56%
Epoch 11/15
Train Loss: