In [11]:
  !pip install wandb
import wandb
!wandb login 58a0b576fd5221cd0d63b154deaabbe535e853c6

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# =======================
# Imports and Sweep Config
# =======================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import wandb
import os
import math

# Candidate values for every hyperparameter explored by the sweep.
_SWEEP_CHOICES = {
    'embedding_dim': [32, 64, 128, 256],
    'hidden_dim': [32, 64, 128, 256],
    'enc_layers': [1, 2, 3],
    'dec_layers': [1, 2, 3],
    'cell_type': ['GRU', 'LSTM', 'RNN'],
    'dropout': [0.2, 0.3, 0.5],
    'epochs': [10, 15],
    'beam_size': [1, 3, 5],
    'attention_type': ['dot', 'general', 'concat'],
    'batch_size': [64, 128, 256],
    'learning_rate': [0.001, 0.0005, 0.0001],
}

# W&B sweep definition: Bayesian search that minimises the logged `val_loss`.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': {
        name: {'values': choices} for name, choices in _SWEEP_CHOICES.items()
    },
}

# =======================
# Default Config
# =======================
# Fallback hyperparameters handed to wandb.init(); a sweep agent overrides
# these with the values it samples for each run.
default_config = dict(
    embedding_dim=32,
    hidden_dim=64,
    enc_layers=1,
    dec_layers=1,
    cell_type='LSTM',
    dropout=0.2,
    epochs=10,
    beam_size=1,
    attention_type='general',
    batch_size=64,
    learning_rate=0.001,
)


# =======================
# Vocabulary
# =======================
class Vocab:
    """Character-level vocabulary with three reserved special tokens.

    Indices 0, 1 and 2 are reserved for ``<pad>``, ``<sos>`` and ``<eos>``;
    every other character receives the next free index the first time
    ``build`` encounters it.
    """

    def __init__(self):
        specials = ("<pad>", "<sos>", "<eos>")
        self.char2idx = {token: index for index, token in enumerate(specials)}
        self.idx2char = dict(enumerate(specials))
        self.size = len(specials)

    def build(self, texts):
        """Register every previously unseen character found in ``texts``."""
        for text in texts:
            for symbol in text:
                if symbol in self.char2idx:
                    continue
                new_index = self.size
                self.char2idx[symbol] = new_index
                self.idx2char[new_index] = symbol
                self.size = new_index + 1

    def encode(self, text):
        """Map a string to its list of indices (KeyError on unknown characters)."""
        indices = []
        for symbol in text:
            indices.append(self.char2idx[symbol])
        return indices

    def decode(self, idxs):
        """Join the characters for ``idxs``, dropping the special tokens (index <= 2)."""
        return ''.join(self.idx2char[index] for index in idxs if index > 2)

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    """Pairs of (source, target) strings read from a tab-separated file.

    Only the first two columns of each line are used; lines with fewer than
    two tab-separated fields are skipped. Constructing the dataset also
    extends both vocabularies with the characters found in the file.
    """

    def __init__(self, filepath, inp_vocab, out_vocab):
        with open(filepath, encoding='utf-8') as handle:
            rows = (raw.strip().split('\t') for raw in handle)
            # Keep (column 0, column 1); ignore malformed / blank lines.
            self.pairs = [(cols[0], cols[1]) for cols in rows if len(cols) >= 2]

        inp_vocab.build([source for source, _ in self.pairs])
        out_vocab.build([target for _, target in self.pairs])
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return (source indices, <sos> + target indices + <eos>) as tensors."""
        source, target = self.pairs[idx]
        specials = self.out_vocab.char2idx
        source_ids = self.inp_vocab.encode(source)
        target_ids = [specials["<sos>"], *self.out_vocab.encode(target), specials["<eos>"]]
        return torch.tensor(source_ids), torch.tensor(target_ids)

def collate_fn(batch):
    """Collate (source, target) tensor pairs into zero-padded batch tensors.

    Returns ``(padded sources, padded targets, source lengths, target lengths)``
    where both padded tensors are batch-first and padded with index 0.
    """
    sources, targets = zip(*batch)
    pad = nn.utils.rnn.pad_sequence
    padded_sources = pad(sources, batch_first=True, padding_value=0)
    padded_targets = pad(targets, batch_first=True, padding_value=0)
    source_lengths = torch.tensor(list(map(len, sources)))
    target_lengths = torch.tensor(list(map(len, targets)))
    return padded_sources, padded_targets, source_lengths, target_lengths

# =======================
# Attention Mechanism
# =======================
class Attention(nn.Module):
    """Luong-style attention over encoder outputs.

    Supported scoring functions (``attention_type``):
        * ``'dot'``     -- score = h . e
        * ``'general'`` -- score = (W h) . e
        * ``'concat'``  -- score = v . tanh(W [h; e])

    Args:
        hidden_dim: size of both the decoder hidden state and the encoder outputs.
        attention_type: one of ``SUPPORTED_TYPES``.

    Raises:
        ValueError: if ``attention_type`` is not supported. Failing here avoids
            a confusing unbound-variable error on the first forward pass.
    """

    SUPPORTED_TYPES = ('dot', 'general', 'concat')

    def __init__(self, hidden_dim, attention_type='general'):
        super().__init__()
        if attention_type not in self.SUPPORTED_TYPES:
            raise ValueError(
                f"Unsupported attention_type {attention_type!r}; "
                f"expected one of {self.SUPPORTED_TYPES}"
            )
        self.attention_type = attention_type

        if attention_type == 'general':
            self.attn = nn.Linear(hidden_dim, hidden_dim)
        elif attention_type == 'concat':
            self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
            self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask=None):
        """Compute the context vector for one decoding step.

        Args:
            hidden: decoder query, ``[batch_size, hidden_dim]``.
            encoder_outputs: ``[batch_size, src_len, hidden_dim]``.
            mask: optional ``[batch_size, src_len]`` tensor; positions equal
                to 0 are excluded from the attention distribution.

        Returns:
            ``(context, attention_weights)`` with shapes
            ``[batch_size, hidden_dim]`` and ``[batch_size, src_len]``.
        """
        src_len = encoder_outputs.size(1)

        if self.attention_type == 'dot':
            # Plain dot product between the query and every encoder state.
            energy = torch.bmm(encoder_outputs, hidden.unsqueeze(2)).squeeze(2)
        elif self.attention_type == 'general':
            # Project the query first, then take the dot product.
            energy = torch.bmm(encoder_outputs, self.attn(hidden).unsqueeze(2)).squeeze(2)
        else:  # 'concat' (validated in __init__)
            # expand() gives a broadcast view instead of materialising src_len
            # copies of the query; torch.cat copies the data it needs anyway.
            hidden_expanded = hidden.unsqueeze(1).expand(-1, src_len, -1)
            combined = torch.cat((hidden_expanded, encoder_outputs), dim=2)
            energy = self.v(torch.tanh(self.attn(combined))).squeeze(2)
        # energy: [batch_size, src_len]

        if mask is not None:
            # A large negative score drives the softmax weight of padding to 0.
            energy = energy.masked_fill(mask == 0, -1e10)

        attention_weights = F.softmax(energy, dim=1)

        # Weighted sum of encoder states -> [batch_size, hidden_dim]
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)

        return context, attention_weights

# =======================
# Encoder and Decoder
# =======================
class Encoder(nn.Module):
    """Unidirectional RNN encoder over embedded source tokens.

    Args:
        vocab_size: number of source symbols (index 0 is the padding index).
        emb_dim: embedding size.
        hidden_dim: RNN hidden size.
        num_layers: number of stacked RNN layers.
        cell_type: ``"GRU"``, ``"LSTM"`` or ``"RNN"`` (KeyError otherwise).
        dropout: inter-layer dropout; only applied when ``num_layers > 1``.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.cell_type = cell_type
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        rnn_class = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers, batch_first=True, bidirectional=False, dropout=dropout if num_layers > 1 else 0)

    def forward(self, x, lengths):
        """Encode a padded batch.

        Args:
            x: ``[batch_size, src_len]`` token indices.
            lengths: true (unpadded) length of every sequence in ``x``.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` is
            ``[batch_size, src_len, hidden_dim]`` (zeros at padded positions)
            and ``hidden`` is the RNN's final state (a ``(h, c)`` tuple for LSTM).
        """
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        # total_length pins the time dimension to the width of `x`. Without it
        # the outputs shrink to the longest sequence in the batch, which breaks
        # alignment with attention masks built from x.size(1) whenever the
        # batch is padded wider than its longest sequence.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, total_length=x.size(1))

        return outputs, hidden

class AttentionDecoder(nn.Module):
    """Single-step RNN decoder that attends over the encoder outputs.

    At every step the top-layer hidden state queries the attention module;
    the resulting context vector is fed to the RNN together with the token
    embedding and is concatenated with the new top-layer state for the
    output projection.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout, attention_type):
        super().__init__()
        self.cell_type = cell_type
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

        # The RNN consumes [embedding ; context], hence emb_dim + hidden_dim.
        recurrent_cells = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.rnn = recurrent_cells[cell_type](
            emb_dim + hidden_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        self.attention = Attention(hidden_dim, attention_type)

        # The projection sees [top-layer hidden ; context] -> vocabulary logits.
        self.out = nn.Linear(hidden_dim * 2, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def _top_layer_state(self, hidden):
        """Return the last layer's hidden state (the ``h`` part for LSTM)."""
        state = hidden[0] if self.cell_type == "LSTM" else hidden
        return state[-1]

    def forward(self, input_token, hidden, encoder_outputs, mask=None):
        """Run one decoding step.

        Args:
            input_token: ``[batch_size]`` indices of the previous target token.
            hidden: recurrent state, ``[num_layers, batch_size, hidden_dim]``
                (a tuple of two such tensors for LSTM).
            encoder_outputs: ``[batch_size, src_len, hidden_dim]``.
            mask: optional attention mask forwarded to the attention module.

        Returns:
            ``(prediction, hidden, attention_weights)``: vocabulary logits,
            the updated recurrent state and the attention distribution.
        """
        # Attend using the state *before* this step's RNN update.
        query = self._top_layer_state(hidden)
        context, attention_weights = self.attention(query, encoder_outputs, mask)

        # [batch_size, 1, emb_dim + hidden_dim]
        token_embedding = self.embedding(input_token)
        step_input = torch.cat((token_embedding, context), dim=1).unsqueeze(1)

        _, hidden = self.rnn(step_input, hidden)

        # Combine the refreshed top-layer state with the context for prediction.
        features = torch.cat((self._top_layer_state(hidden), context), dim=1)
        prediction = self.out(self.dropout(features))

        return prediction, hidden, attention_weights

# =======================
# Seq2Seq Model with Beam Search
# =======================
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper with teacher-forced training and beam search.

    Args:
        encoder: module called as ``encoder(src, src_lens)`` returning
            ``(encoder_outputs, hidden)``.
        decoder: module called as ``decoder(token, hidden, encoder_outputs, mask)``
            returning ``(logits, hidden, attention_weights)``; it must expose
            ``decoder.out.out_features`` (the target vocabulary size).
        enc_layers: number of encoder RNN layers.
        dec_layers: number of decoder RNN layers.
        cell_type: ``"LSTM"`` (state is an ``(h, c)`` tuple) or any other
            value (state is a single tensor).
        device: device on which masks and output buffers are created.
        beam_size: beam width used by ``beam_search``.
    """

    def __init__(self, encoder, decoder, enc_layers, dec_layers, cell_type, device, beam_size=1):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.cell_type = cell_type
        self.enc_layers = enc_layers
        self.dec_layers = dec_layers
        self.device = device
        self.beam_size = beam_size

    def create_mask(self, src, src_lens):
        """Return a ``[batch_size, src_len]`` mask: 1 for real tokens, 0 for padding."""
        max_len = src.size(1)
        lengths = torch.as_tensor(src_lens, device=self.device).view(-1, 1)
        positions = torch.arange(max_len, device=self.device).unsqueeze(0)
        # Position j of row i is valid iff j < length_i (vectorised over the batch).
        return (positions < lengths).to(torch.get_default_dtype())

    def _init_decoder_hidden(self, enc_hidden):
        """Adapt the encoder's final state to the decoder's layer count."""
        if self.cell_type == "LSTM":
            h, c = enc_hidden
            return (self._match_layers(h), self._match_layers(c))
        return self._match_layers(enc_hidden)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step with optional teacher forcing.

        Args:
            src: tuple ``(src_data, src_lens)``.
            trg: ``[batch_size, trg_len]`` target indices starting with <sos>.
            teacher_forcing_ratio: probability, drawn once per time step for
                the whole batch, of feeding the gold token instead of the
                model's own argmax prediction.

        Returns:
            ``(outputs, attentions)`` with shapes
            ``[batch_size, trg_len, vocab_size]`` and
            ``[batch_size, trg_len, src_len]``; index 0 along the time axis
            is left as zeros because decoding starts at step 1.
        """
        src_data, src_lens = src
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features

        outputs = torch.zeros(batch_size, trg_len, vocab_size, device=self.device)
        attentions = torch.zeros(batch_size, trg_len, src_data.size(1), device=self.device)

        encoder_outputs, enc_hidden = self.encoder(src_data, src_lens)
        mask = self.create_mask(src_data, src_lens)
        dec_hidden = self._init_decoder_hidden(enc_hidden)

        # First input to the decoder is the <sos> token.
        input_token = trg[:, 0]

        for t in range(1, trg_len):
            output, dec_hidden, attn_weights = self.decoder(
                input_token, dec_hidden, encoder_outputs, mask
            )

            outputs[:, t] = output
            attentions[:, t] = attn_weights

            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input_token = trg[:, t] if teacher_force else top1

        return outputs, attentions

    def _match_layers(self, hidden):
        """Truncate or zero-pad ``hidden`` along dim 0 to ``dec_layers`` layers."""
        if self.enc_layers == self.dec_layers:
            return hidden
        elif self.enc_layers > self.dec_layers:
            return hidden[:self.dec_layers]
        else:
            pad = hidden.new_zeros((self.dec_layers - self.enc_layers, *hidden.shape[1:]))
            return torch.cat([hidden, pad], dim=0)

    @torch.no_grad()  # inference only: avoid building an autograd graph per beam
    def beam_search(self, src, max_len=50, sos_idx=1, eos_idx=2):
        """Decode a single source sequence with beam search.

        Beams are ranked by their accumulated log-probability (no length
        normalisation). Dropout is only disabled if the caller has put the
        model in eval mode.

        Args:
            src: tuple ``(src_data, src_lens)`` with a batch size of exactly 1.
            max_len: maximum number of decoding steps.
            sos_idx: index of the start token.
            eos_idx: index of the end token.

        Returns:
            The best sequence as a list of ints, starting with ``sos_idx``.
        """
        src_data, src_lens = src
        batch_size = src_data.size(0)
        assert batch_size == 1, "Beam search only supports batch size of 1 for now"

        encoder_outputs, enc_hidden = self.encoder(src_data, src_lens)
        mask = self.create_mask(src_data, src_lens)
        dec_hidden = self._init_decoder_hidden(enc_hidden)

        beams = [{
            "score": 0.0,
            "seq": [sos_idx],
            "hidden": dec_hidden
        }]

        for _ in range(max_len):
            new_beams = []
            for beam in beams:
                seq = beam["seq"]
                if seq[-1] == eos_idx:
                    # Finished hypotheses are carried over unchanged.
                    new_beams.append(beam)
                    continue

                input_token = torch.tensor([seq[-1]], device=self.device)
                output, new_hidden, _ = self.decoder(input_token, beam["hidden"], encoder_outputs, mask)

                log_probs = F.log_softmax(output, dim=1).squeeze(0)
                # topk raises if k exceeds the vocabulary size, so clamp it.
                k = min(self.beam_size, log_probs.size(0))
                topk_log_probs, topk_indices = torch.topk(log_probs, k)

                for log_prob, idx in zip(topk_log_probs, topk_indices):
                    new_beams.append({
                        "score": beam["score"] + log_prob.item(),
                        "seq": beam["seq"] + [idx.item()],
                        "hidden": new_hidden
                    })

            # Keep top `beam_size` beams
            beams = sorted(new_beams, key=lambda x: x["score"], reverse=True)[:self.beam_size]

            # Early stopping if all beams end with <eos>
            if all(beam["seq"][-1] == eos_idx for beam in beams):
                break

        best_beam = max(beams, key=lambda x: x["score"])
        return best_beam["seq"]





# =======================
# Train & Eval
# =======================
def train(model, loader, criterion, optimizer, device, max_grad_norm=None):
    """Run one training epoch and report the mean loss and token accuracy.

    Args:
        model: called as ``model((src, src_lens), trg)`` and expected to
            return ``(outputs, _)`` with ``outputs`` shaped
            ``[batch, trg_len, vocab]``.
        loader: iterable yielding ``(src, trg, src_lens, trg_lens)`` batches.
        criterion: loss applied to the flattened logits and targets.
        optimizer: optimizer stepped once per batch.
        device: device the source/target tensors are moved to.
        max_grad_norm: if given, gradients are clipped to this global norm
            before every optimizer step; ``None`` (default) disables clipping.

    Returns:
        ``(mean batch loss, accuracy in percent)``. Accuracy counts only
        non-padding target positions (index != 0). Both values are 0.0 when
        there is nothing to average over, instead of raising
        ZeroDivisionError.
    """
    model.train()
    total_loss, total_correct, total_count = 0.0, 0, 0
    num_batches = 0
    for src, trg, src_lens, _ in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output, _ = model((src, src_lens), trg)
        output_dim = output.shape[-1]
        # Time step 0 is the <sos> slot, which the model never predicts.
        loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
        pred = output.argmax(2)
        correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
        total_correct += correct
        total_count += (trg[:, 1:] != 0).sum().item()
        loss.backward()
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    # Guard against an empty loader / all-padding targets.
    avg_loss = total_loss / num_batches if num_batches else 0.0
    acc = 100.0 * total_correct / total_count if total_count else 0.0
    print(f"Train Loss: {avg_loss:.4f}, Acc: {acc:.2f}%")
    return avg_loss, acc

def evaluate(model, loader, criterion, device):
    """Evaluate the model without teacher forcing.

    The model is put in eval mode and decoded with
    ``teacher_forcing_ratio=0``, i.e. every step is fed the model's own
    previous prediction, so the reported loss is a free-running loss.

    Args:
        model: called as ``model((src, src_lens), trg, teacher_forcing_ratio=0)``
            and expected to return ``(outputs, _)`` with ``outputs`` shaped
            ``[batch, trg_len, vocab]``.
        loader: iterable yielding ``(src, trg, src_lens, trg_lens)`` batches.
        criterion: loss applied to the flattened logits and targets.
        device: device the source/target tensors are moved to.

    Returns:
        ``(mean batch loss, accuracy in percent)``. Accuracy counts only
        non-padding target positions (index != 0). Both values are 0.0 when
        there is nothing to average over, instead of raising
        ZeroDivisionError.
    """
    model.eval()
    total_loss, total_correct, total_count = 0.0, 0, 0
    num_batches = 0
    with torch.no_grad():
        for src, trg, src_lens, _ in loader:
            src, trg = src.to(device), trg.to(device)
            output, _ = model((src, src_lens), trg, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            # Time step 0 is the <sos> slot, which the model never predicts.
            loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
            pred = output.argmax(2)
            correct = ((pred[:, 1:] == trg[:, 1:]) & (trg[:, 1:] != 0)).sum().item()
            total_correct += correct
            total_count += (trg[:, 1:] != 0).sum().item()
            total_loss += loss.item()
            num_batches += 1
    # Guard against an empty loader / all-padding targets.
    avg_loss = total_loss / num_batches if num_batches else 0.0
    acc = 100.0 * total_correct / total_count if total_count else 0.0
    print(f"Val Loss: {avg_loss:.4f}, Acc: {acc:.2f}%")
    return avg_loss, acc

# =======================
# Main
# =======================
def main():
    """Train and validate one model configuration, logging metrics to W&B.

    Hyperparameters are read from ``wandb.config`` (seeded with
    ``default_config``). Both vocabularies are extended by the training and
    the dev file before the model is built, so the embedding tables cover the
    characters of both splits.
    """
    wandb.init(config=default_config, project="dakshina-transliteration")
    cfg = wandb.config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Data -------------------------------------------------------------
    source_vocab, target_vocab = Vocab(), Vocab()
    train_path = "/kaggle/input/devnagiridata/hi.translit.sampled.train.tsv"
    dev_path = "/kaggle/input/devnagiridata/hi.translit.sampled.dev.tsv"

    train_set = TransliterationDataset(train_path, source_vocab, target_vocab)
    dev_set = TransliterationDataset(dev_path, source_vocab, target_vocab)

    train_loader = DataLoader(
        train_set, batch_size=cfg.batch_size, shuffle=True, collate_fn=collate_fn
    )
    dev_loader = DataLoader(
        dev_set, batch_size=cfg.batch_size, shuffle=False, collate_fn=collate_fn
    )

    # --- Model ------------------------------------------------------------
    encoder = Encoder(
        source_vocab.size,
        cfg.embedding_dim,
        cfg.hidden_dim,
        cfg.enc_layers,
        cfg.cell_type,
        cfg.dropout,
    )
    decoder = AttentionDecoder(
        target_vocab.size,
        cfg.embedding_dim,
        cfg.hidden_dim,
        cfg.dec_layers,
        cfg.cell_type,
        cfg.dropout,
        cfg.attention_type,
    )
    model = Seq2Seq(
        encoder,
        decoder,
        cfg.enc_layers,
        cfg.dec_layers,
        cfg.cell_type,
        device,
        beam_size=cfg.beam_size,
    ).to(device)

    # --- Optimisation -----------------------------------------------------
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)
    # Index 0 is <pad>; padded target positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(1, cfg.epochs + 1):
        print(f"Epoch {epoch}/{cfg.epochs}")
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)
        metrics = {
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc,
            "epoch": epoch,
        }
        wandb.log(metrics)

# =======================
if __name__ == '__main__':
    # Register the sweep with W&B, then let a single agent run 30 trials,
    # each of which calls main() with a freshly sampled configuration.
    sweep_id = wandb.sweep(
        sweep_config,
        project="dakshina-transliteration-attention",
    )
    wandb.agent(sweep_id, function=main, count=30)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: 64q660zv
Sweep URL: https://wandb.ai/manglesh_dl_ass3/dakshina-transliteration-attention/sweeps/64q660zv


[34m[1mwandb[0m: Agent Starting Run: kk1paf4n with config:
[34m[1mwandb[0m: 	attention_type: general
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: Currently logged in as: [33mmanglesh_dlass3[0m ([33mmanglesh_dl_ass3[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/10
Train Loss: 2.5110, Acc: 29.00%
Val Loss: 1.9818, Acc: 41.74%
Epoch 2/10
Train Loss: 1.7008, Acc: 50.11%
Val Loss: 1.3793, Acc: 56.66%
Epoch 3/10
Train Loss: 1.3229, Acc: 61.12%
Val Loss: 1.1319, Acc: 65.53%
Epoch 4/10
Train Loss: 1.1588, Acc: 66.26%
Val Loss: 1.0422, Acc: 68.36%
Epoch 5/10
Train Loss: 1.0605, Acc: 69.47%
Val Loss: 1.0014, Acc: 70.01%
Epoch 6/10
Train Loss: 1.0092, Acc: 71.01%
Val Loss: 0.9782, Acc: 70.04%
Epoch 7/10
Train Loss: 0.9667, Acc: 72.27%
Val Loss: 0.9751, Acc: 70.81%
Epoch 8/10
Train Loss: 0.9433, Acc: 72.94%
Val Loss: 0.9551, Acc: 71.02%
Epoch 9/10
Train Loss: 0.9219, Acc: 73.60%
Val Loss: 0.9505, Acc: 71.29%
Epoch 10/10
Train Loss: 0.9076, Acc: 74.02%
Val Loss: 0.9340, Acc: 71.47%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▇▇█████
train_loss,█▄▃▂▂▁▁▁▁▁
val_acc,▁▅▇▇██████
val_loss,█▄▂▂▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,74.02213
train_loss,0.90762
val_acc,71.46784
val_loss,0.93404


[34m[1mwandb[0m: Agent Starting Run: z7kry7ze with config:
[34m[1mwandb[0m: 	attention_type: dot
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/10
Train Loss: 2.8594, Acc: 20.93%
Val Loss: 2.5821, Acc: 25.49%
Epoch 2/10
Train Loss: 2.4010, Acc: 28.89%
Val Loss: 2.2919, Acc: 29.30%
Epoch 3/10
Train Loss: 2.0285, Acc: 38.11%
Val Loss: 1.9301, Acc: 38.68%
Epoch 4/10
Train Loss: 1.6875, Acc: 49.23%
Val Loss: 1.6945, Acc: 47.08%
Epoch 5/10
Train Loss: 1.4732, Acc: 56.66%
Val Loss: 1.5609, Acc: 51.94%
Epoch 6/10
Train Loss: 1.3461, Acc: 60.52%
Val Loss: 1.4823, Acc: 54.67%
Epoch 7/10
Train Loss: 1.2665, Acc: 62.74%
Val Loss: 1.4252, Acc: 56.46%
Epoch 8/10
Train Loss: 1.2155, Acc: 64.04%
Val Loss: 1.3827, Acc: 57.94%
Epoch 9/10
Train Loss: 1.1562, Acc: 65.99%
Val Loss: 1.3538, Acc: 59.08%
Epoch 10/10
Train Loss: 1.1214, Acc: 67.00%
Val Loss: 1.3155, Acc: 60.74%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▄▅▆▇▇███
train_loss,█▆▅▃▂▂▂▁▁▁
val_acc,▁▂▄▅▆▇▇▇██
val_loss,█▆▄▃▂▂▂▁▁▁

0,1
epoch,10.0
train_acc,66.99628
train_loss,1.12138
val_acc,60.74188
val_loss,1.31552


[34m[1mwandb[0m: Agent Starting Run: zd0d9rsu with config:
[34m[1mwandb[0m: 	attention_type: dot
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 2.5174, Acc: 25.81%
Val Loss: 2.1532, Acc: 33.30%
Epoch 2/15
Train Loss: 1.9083, Acc: 40.98%
Val Loss: 1.7132, Acc: 45.88%
Epoch 3/15
Train Loss: 1.5536, Acc: 51.78%
Val Loss: 1.4826, Acc: 53.44%
Epoch 4/15
Train Loss: 1.3771, Acc: 57.32%
Val Loss: 1.3707, Acc: 57.18%
Epoch 5/15
Train Loss: 1.2759, Acc: 60.66%
Val Loss: 1.2924, Acc: 59.76%
Epoch 6/15
Train Loss: 1.2133, Acc: 62.79%
Val Loss: 1.2253, Acc: 61.99%
Epoch 7/15
Train Loss: 1.1586, Acc: 64.62%
Val Loss: 1.1762, Acc: 64.34%
Epoch 8/15
Train Loss: 1.1129, Acc: 66.35%
Val Loss: 1.1528, Acc: 64.93%
Epoch 9/15
Train Loss: 1.0816, Acc: 67.48%
Val Loss: 1.1283, Acc: 66.03%
Epoch 10/15
Train Loss: 1.0536, Acc: 68.38%
Val Loss: 1.1321, Acc: 66.47%
Epoch 11/15
Train Loss: 1.0402, Acc: 68.86%
Val Loss: 1.1032, Acc: 66.82%
Epoch 12/15
Train Loss: 1.0175, Acc: 69.68%
Val Loss: 1.0828, Acc: 67.74%
Epoch 13/15
Train Loss: 0.9974, Acc: 70.33%
Val Loss: 1.0473, Acc: 67.78%
Epoch 14/15
Train Loss: 0.9874, Acc: 70.58%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▃▅▆▆▇▇▇▇██████
train_loss,█▅▄▃▂▂▂▂▂▁▁▁▁▁▁
val_acc,▁▃▅▆▆▇▇▇▇██████
val_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁

0,1
epoch,15.0
train_acc,71.37816
train_loss,0.9642
val_acc,68.62359
val_loss,1.04909


[34m[1mwandb[0m: Agent Starting Run: yhqr0gl6 with config:
[34m[1mwandb[0m: 	attention_type: general
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/15
Train Loss: 1.4411, Acc: 58.91%
Val Loss: 1.0458, Acc: 69.22%
Epoch 2/15
Train Loss: 0.8480, Acc: 75.06%
Val Loss: 0.9443, Acc: 71.39%
Epoch 3/15
Train Loss: 0.7820, Acc: 76.61%
Val Loss: 0.9259, Acc: 71.28%
Epoch 4/15
Train Loss: 0.7430, Acc: 77.54%
Val Loss: 0.9123, Acc: 72.36%
Epoch 5/15
Train Loss: 0.6997, Acc: 78.80%
Val Loss: 0.8711, Acc: 73.25%
Epoch 6/15
Train Loss: 0.7317, Acc: 77.84%
Val Loss: 0.8825, Acc: 73.04%
Epoch 7/15
Train Loss: 0.6677, Acc: 79.61%
Val Loss: 0.8589, Acc: 73.20%
Epoch 8/15
Train Loss: 0.6786, Acc: 79.02%
Val Loss: 0.8512, Acc: 73.68%
Epoch 9/15
Train Loss: 0.7135, Acc: 78.02%
Val Loss: 0.8835, Acc: 72.50%
Epoch 10/15
Train Loss: 0.6985, Acc: 78.56%
Val Loss: 0.8656, Acc: 73.36%
Epoch 11/15
Train Loss: 0.6477, Acc: 80.06%
Val Loss: 0.8398, Acc: 74.10%
Epoch 12/15
Train Loss: 0.6419, Acc: 80.11%
Val Loss: 0.8249, Acc: 74.11%
Epoch 13/15
Train Loss: 0.6534, Acc: 79.87%
Val Loss: 0.8633, Acc: 73.83%
Epoch 14/15
Train Loss: 0.6161, Acc: 81.04%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█▇▇▇█████
train_loss,█▃▂▂▂▂▁▂▂▂▁▁▁▁▁
val_acc,▁▄▄▅▇▆▇▇▆▇███▇▇
val_loss,█▅▄▄▂▃▂▂▃▂▁▁▂▁▁

0,1
epoch,15.0
train_acc,80.08619
train_loss,0.64332
val_acc,73.67553
val_loss,0.83605


[34m[1mwandb[0m: Agent Starting Run: w75cha9e with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.4732, Acc: 55.43%
Val Loss: 1.0160, Acc: 69.63%
Epoch 2/15
Train Loss: 0.8437, Acc: 74.68%
Val Loss: 0.9017, Acc: 71.98%
Epoch 3/15
Train Loss: 0.7513, Acc: 77.02%
Val Loss: 0.8873, Acc: 72.51%
Epoch 4/15
Train Loss: 0.6997, Acc: 78.38%
Val Loss: 0.8793, Acc: 73.36%
Epoch 5/15
Train Loss: 0.6681, Acc: 79.17%
Val Loss: 0.8600, Acc: 73.56%
Epoch 6/15
Train Loss: 0.6406, Acc: 79.86%
Val Loss: 0.8276, Acc: 74.35%
Epoch 7/15
Train Loss: 0.6221, Acc: 80.42%
Val Loss: 0.8281, Acc: 74.25%
Epoch 8/15
Train Loss: 0.6135, Acc: 80.58%
Val Loss: 0.8139, Acc: 74.38%
Epoch 9/15
Train Loss: 0.5996, Acc: 81.07%
Val Loss: 0.7871, Acc: 74.82%
Epoch 10/15
Train Loss: 0.5896, Acc: 81.21%
Val Loss: 0.8049, Acc: 75.28%
Epoch 11/15
Train Loss: 0.5782, Acc: 81.61%
Val Loss: 0.8288, Acc: 75.32%
Epoch 12/15
Train Loss: 0.5746, Acc: 81.61%
Val Loss: 0.8136, Acc: 75.42%
Epoch 13/15
Train Loss: 0.5725, Acc: 81.62%
Val Loss: 0.8254, Acc: 75.34%
Epoch 14/15
Train Loss: 0.5710, Acc: 81.58%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇▇████████
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▄▅▆▆▆▆▇██████
val_loss,█▅▄▄▃▂▂▂▁▂▂▂▂▁▁

0,1
epoch,15.0
train_acc,82.3508
train_loss,0.55287
val_acc,75.70672
val_loss,0.78246


[34m[1mwandb[0m: Agent Starting Run: n67ny8ll with config:
[34m[1mwandb[0m: 	attention_type: general
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.5483, Acc: 54.28%
Val Loss: 1.3092, Acc: 59.25%
Epoch 2/15
Train Loss: 1.3283, Acc: 60.53%
Val Loss: 1.3366, Acc: 58.21%
Epoch 3/15
Train Loss: 1.2633, Acc: 62.46%
Val Loss: 1.1851, Acc: 62.79%
Epoch 4/15
Train Loss: 1.0925, Acc: 67.59%
Val Loss: 1.1708, Acc: 63.94%
Epoch 5/15
Train Loss: 1.2435, Acc: 63.32%
Val Loss: 1.1732, Acc: 63.95%
Epoch 6/15
Train Loss: 1.2622, Acc: 62.71%
Val Loss: 1.2109, Acc: 63.22%
Epoch 7/15
Train Loss: 1.1317, Acc: 66.46%
Val Loss: 1.1001, Acc: 66.65%
Epoch 8/15
Train Loss: 1.0772, Acc: 68.46%
Val Loss: 1.2319, Acc: 62.06%
Epoch 9/15
Train Loss: 1.1424, Acc: 66.46%
Val Loss: 1.1441, Acc: 64.76%
Epoch 10/15
Train Loss: 1.1595, Acc: 66.05%
Val Loss: 1.3471, Acc: 60.34%
Epoch 11/15
Train Loss: 1.2318, Acc: 64.25%
Val Loss: 1.1066, Acc: 66.90%
Epoch 12/15
Train Loss: 1.0795, Acc: 68.77%
Val Loss: 1.0838, Acc: 67.83%
Epoch 13/15
Train Loss: 1.1133, Acc: 67.77%
Val Loss: 1.1428, Acc: 65.92%
Epoch 14/15
Train Loss: 1.1924, Acc: 65.39%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▄▅▇▅▅▇█▇▇▆██▆█
train_loss,█▅▄▁▃▄▂▁▂▂▃▁▂▃▁
val_acc,▂▁▄▅▅▅▇▄▆▃▇█▇▅█
val_loss,▇█▄▄▄▅▂▅▃█▂▁▃▅▁

0,1
epoch,15.0
train_acc,68.01712
train_loss,1.10516
val_acc,67.62246
val_loss,1.07009


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: nu7qlc0k with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/10
Train Loss: 1.8805, Acc: 44.40%
Val Loss: 1.1568, Acc: 66.07%
Epoch 2/10
Train Loss: 0.8845, Acc: 73.82%
Val Loss: 0.9274, Acc: 71.81%
Epoch 3/10
Train Loss: 0.7638, Acc: 77.16%
Val Loss: 0.8985, Acc: 72.59%
Epoch 4/10
Train Loss: 0.7122, Acc: 78.52%
Val Loss: 0.8970, Acc: 73.11%
Epoch 5/10
Train Loss: 0.6927, Acc: 78.81%
Val Loss: 0.8723, Acc: 73.46%
Epoch 6/10
Train Loss: 0.6513, Acc: 80.06%
Val Loss: 0.8552, Acc: 73.83%
Epoch 7/10
Train Loss: 0.6379, Acc: 80.32%
Val Loss: 0.8488, Acc: 73.69%
Epoch 8/10
Train Loss: 0.6217, Acc: 80.69%
Val Loss: 0.8550, Acc: 73.93%
Epoch 9/10
Train Loss: 0.6201, Acc: 80.60%
Val Loss: 0.8358, Acc: 74.37%
Epoch 10/10
Train Loss: 0.6000, Acc: 81.23%
Val Loss: 0.8255, Acc: 74.43%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▇▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▆▆▇▇▇▇███
val_loss,█▃▃▃▂▂▁▂▁▁

0,1
epoch,10.0
train_acc,81.23367
train_loss,0.60001
val_acc,74.43072
val_loss,0.8255


[34m[1mwandb[0m: Agent Starting Run: vrw1qiba with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/10
Train Loss: 1.8408, Acc: 42.82%
Val Loss: 1.1136, Acc: 64.21%
Epoch 2/10
Train Loss: 0.9016, Acc: 71.88%
Val Loss: 0.8618, Acc: 72.08%
Epoch 3/10
Train Loss: 0.7479, Acc: 76.40%
Val Loss: 0.8066, Acc: 73.81%
Epoch 4/10
Train Loss: 0.6717, Acc: 78.76%
Val Loss: 0.8281, Acc: 74.23%
Epoch 5/10
Train Loss: 0.6313, Acc: 80.08%
Val Loss: 0.7937, Acc: 74.88%
Epoch 6/10
Train Loss: 0.6036, Acc: 80.82%
Val Loss: 0.8305, Acc: 75.22%
Epoch 7/10
Train Loss: 0.5857, Acc: 81.32%
Val Loss: 0.7871, Acc: 75.57%
Epoch 8/10
Train Loss: 0.5755, Acc: 81.58%
Val Loss: 0.7826, Acc: 75.82%
Epoch 9/10
Train Loss: 0.5514, Acc: 82.36%
Val Loss: 0.7783, Acc: 75.75%
Epoch 10/10
Train Loss: 0.5404, Acc: 82.65%
Val Loss: 0.7910, Acc: 75.84%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇██████
train_loss,█▃▂▂▁▁▁▁▁▁
val_acc,▁▆▇▇▇█████
val_loss,█▃▂▂▁▂▁▁▁▁

0,1
epoch,10.0
train_acc,82.64877
train_loss,0.54038
val_acc,75.84271
val_loss,0.79096


[34m[1mwandb[0m: Agent Starting Run: wm1p0ya7 with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/15
Train Loss: 2.5955, Acc: 26.04%
Val Loss: 2.1989, Acc: 34.41%
Epoch 2/15
Train Loss: 1.9017, Acc: 43.22%
Val Loss: 1.5169, Acc: 53.87%
Epoch 3/15
Train Loss: 1.3732, Acc: 59.15%
Val Loss: 1.1632, Acc: 65.06%
Epoch 4/15
Train Loss: 1.1540, Acc: 66.07%
Val Loss: 1.0790, Acc: 67.20%
Epoch 5/15
Train Loss: 1.0612, Acc: 68.94%
Val Loss: 1.0271, Acc: 68.30%
Epoch 6/15
Train Loss: 1.0144, Acc: 70.23%
Val Loss: 1.0004, Acc: 69.46%
Epoch 7/15
Train Loss: 0.9723, Acc: 71.48%
Val Loss: 1.0115, Acc: 68.95%
Epoch 8/15
Train Loss: 0.9468, Acc: 72.12%
Val Loss: 0.9160, Acc: 71.81%
Epoch 9/15
Train Loss: 0.9189, Acc: 72.93%
Val Loss: 0.9196, Acc: 71.93%
Epoch 10/15
Train Loss: 0.9006, Acc: 73.51%
Val Loss: 0.9313, Acc: 71.81%
Epoch 11/15
Train Loss: 0.8851, Acc: 73.91%
Val Loss: 0.9158, Acc: 72.55%
Epoch 12/15
Train Loss: 0.8813, Acc: 73.96%
Val Loss: 0.9167, Acc: 72.11%
Epoch 13/15
Train Loss: 0.8626, Acc: 74.51%
Val Loss: 0.9207, Acc: 72.39%
Epoch 14/15
Train Loss: 0.8585, Acc: 74.54%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▃▆▇▇▇▇████████
train_loss,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▅▇▇▇▇▇████████
val_loss,█▄▂▂▂▁▂▁▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,75.25569
train_loss,0.83872
val_acc,72.49212
val_loss,0.90851


[34m[1mwandb[0m: Agent Starting Run: 3ecgr25p with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/10
Train Loss: 2.5609, Acc: 26.99%
Val Loss: 2.1108, Acc: 33.60%
Epoch 2/10
Train Loss: 2.0077, Acc: 35.21%
Val Loss: 1.7490, Acc: 41.19%
Epoch 3/10
Train Loss: 1.7446, Acc: 41.12%
Val Loss: 1.5677, Acc: 45.49%
Epoch 4/10
Train Loss: 1.5799, Acc: 46.27%
Val Loss: 1.4801, Acc: 48.69%
Epoch 5/10
Train Loss: 1.4246, Acc: 52.02%
Val Loss: 1.2667, Acc: 56.98%
Epoch 6/10
Train Loss: 1.1684, Acc: 62.44%
Val Loss: 1.0664, Acc: 65.71%
Epoch 7/10
Train Loss: 1.0131, Acc: 68.74%
Val Loss: 0.9662, Acc: 69.63%
Epoch 8/10
Train Loss: 0.9299, Acc: 71.46%
Val Loss: 0.9219, Acc: 71.12%
Epoch 9/10
Train Loss: 0.8688, Acc: 73.43%
Val Loss: 0.9018, Acc: 71.51%
Epoch 10/10
Train Loss: 0.8289, Acc: 74.61%
Val Loss: 0.8714, Acc: 72.46%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▃▄▅▆▇███
train_loss,█▆▅▄▃▂▂▁▁▁
val_acc,▁▂▃▄▅▇▇███
val_loss,█▆▅▄▃▂▂▁▁▁

0,1
epoch,10.0
train_acc,74.60539
train_loss,0.8289
val_acc,72.46029
val_loss,0.87139


[34m[1mwandb[0m: Agent Starting Run: e1hiotdo with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/10
Train Loss: 1.9734, Acc: 39.25%
Val Loss: 1.1697, Acc: 63.23%
Epoch 2/10
Train Loss: 1.0246, Acc: 68.52%
Val Loss: 0.9209, Acc: 71.17%
Epoch 3/10
Train Loss: 0.8391, Acc: 74.36%
Val Loss: 0.8998, Acc: 71.52%
Epoch 4/10
Train Loss: 0.7618, Acc: 76.42%
Val Loss: 0.8389, Acc: 73.46%
Epoch 5/10
Train Loss: 0.7143, Acc: 77.67%
Val Loss: 0.8374, Acc: 74.01%
Epoch 6/10
Train Loss: 0.6773, Acc: 78.86%
Val Loss: 0.8301, Acc: 74.18%
Epoch 7/10
Train Loss: 0.6496, Acc: 79.65%
Val Loss: 0.8580, Acc: 74.17%
Epoch 8/10
Train Loss: 0.6303, Acc: 80.14%
Val Loss: 0.8562, Acc: 74.25%
Epoch 9/10
Train Loss: 0.6158, Acc: 80.57%
Val Loss: 0.8267, Acc: 74.64%
Epoch 10/10
Train Loss: 0.6044, Acc: 80.92%
Val Loss: 0.7977, Acc: 75.33%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▆▆▇▇▇▇▇██
val_loss,█▃▃▂▂▂▂▂▂▁

0,1
epoch,10.0
train_acc,80.91776
train_loss,0.60442
val_acc,75.33347
val_loss,0.7977


[34m[1mwandb[0m: Agent Starting Run: gj8qzdck with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/15
Train Loss: 2.2572, Acc: 32.28%
Val Loss: 1.7513, Acc: 41.84%
Epoch 2/15
Train Loss: 1.4334, Acc: 55.80%
Val Loss: 1.0554, Acc: 68.41%
Epoch 3/15
Train Loss: 1.0429, Acc: 69.04%
Val Loss: 0.9534, Acc: 71.40%
Epoch 4/15
Train Loss: 0.9300, Acc: 72.32%
Val Loss: 0.9132, Acc: 72.43%
Epoch 5/15
Train Loss: 0.8649, Acc: 74.08%
Val Loss: 0.9180, Acc: 72.31%
Epoch 6/15
Train Loss: 0.8280, Acc: 74.99%
Val Loss: 0.8964, Acc: 73.03%
Epoch 7/15
Train Loss: 0.7859, Acc: 76.34%
Val Loss: 0.8752, Acc: 73.79%
Epoch 8/15
Train Loss: 0.7617, Acc: 77.00%
Val Loss: 0.8549, Acc: 74.27%
Epoch 9/15
Train Loss: 0.7369, Acc: 77.72%
Val Loss: 0.8398, Acc: 74.51%
Epoch 10/15
Train Loss: 0.7195, Acc: 78.18%
Val Loss: 0.8544, Acc: 74.62%
Epoch 11/15
Train Loss: 0.7056, Acc: 78.58%
Val Loss: 0.8485, Acc: 74.58%
Epoch 12/15
Train Loss: 0.6942, Acc: 78.81%
Val Loss: 0.8479, Acc: 74.88%
Epoch 13/15
Train Loss: 0.6825, Acc: 79.14%
Val Loss: 0.8410, Acc: 75.09%
Epoch 14/15
Train Loss: 0.6793, Acc: 79.06%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▄▆▇▇▇█████████
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▇▇▇▇██████████
val_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,79.46679
train_loss,0.66553
val_acc,75.23799
val_loss,0.82287


[34m[1mwandb[0m: Agent Starting Run: oskj9yx6 with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
Train Loss: 1.4046, Acc: 56.68%
Val Loss: 0.9350, Acc: 70.49%
Epoch 2/10
Train Loss: 0.7508, Acc: 76.79%
Val Loss: 0.8580, Acc: 73.27%
Epoch 3/10
Train Loss: 0.6615, Acc: 79.14%
Val Loss: 0.8463, Acc: 74.17%
Epoch 4/10
Train Loss: 0.6086, Acc: 80.74%
Val Loss: 0.8234, Acc: 75.17%
Epoch 5/10
Train Loss: 0.5759, Acc: 81.67%
Val Loss: 0.7827, Acc: 74.90%
Epoch 6/10
Train Loss: 0.5585, Acc: 82.06%
Val Loss: 0.8171, Acc: 75.12%
Epoch 7/10
Train Loss: 0.5357, Acc: 82.79%
Val Loss: 0.7760, Acc: 76.07%
Epoch 8/10
Train Loss: 0.5161, Acc: 83.33%
Val Loss: 0.8172, Acc: 75.91%
Epoch 9/10
Train Loss: 0.5048, Acc: 83.57%
Val Loss: 0.7907, Acc: 75.76%
Epoch 10/10
Train Loss: 0.4957, Acc: 83.74%
Val Loss: 0.8165, Acc: 75.94%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▄▆▇▇▇████
val_loss,█▅▄▃▁▃▁▃▂▃

0,1
epoch,10.0
train_acc,83.73969
train_loss,0.49566
val_acc,75.94398
val_loss,0.81648


[34m[1mwandb[0m: Agent Starting Run: oop6438f with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.5673, Acc: 51.57%
Val Loss: 0.9523, Acc: 70.36%
Epoch 2/15
Train Loss: 0.8201, Acc: 74.98%
Val Loss: 0.8907, Acc: 73.09%
Epoch 3/15
Train Loss: 0.7193, Acc: 77.99%
Val Loss: 0.8706, Acc: 73.72%
Epoch 4/15
Train Loss: 0.6774, Acc: 79.13%
Val Loss: 0.8422, Acc: 74.10%
Epoch 5/15
Train Loss: 0.6409, Acc: 80.29%
Val Loss: 0.8882, Acc: 73.98%
Epoch 6/15
Train Loss: 0.6311, Acc: 80.24%
Val Loss: 0.8514, Acc: 74.81%
Epoch 7/15
Train Loss: 0.6095, Acc: 80.94%
Val Loss: 0.8620, Acc: 74.52%
Epoch 8/15
Train Loss: 0.5893, Acc: 81.41%
Val Loss: 0.8133, Acc: 75.04%
Epoch 9/15
Train Loss: 0.5806, Acc: 81.66%
Val Loss: 0.8053, Acc: 75.37%
Epoch 10/15
Train Loss: 0.5764, Acc: 81.65%
Val Loss: 0.8388, Acc: 75.37%
Epoch 11/15
Train Loss: 0.5600, Acc: 82.25%
Val Loss: 0.7970, Acc: 76.08%
Epoch 12/15
Train Loss: 0.5469, Acc: 82.58%
Val Loss: 0.8427, Acc: 75.69%
Epoch 13/15
Train Loss: 0.5532, Acc: 82.23%
Val Loss: 0.8203, Acc: 76.15%
Epoch 14/15
Train Loss: 0.5329, Acc: 82.88%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▅▆▆▇▇▇█▇███
val_loss,█▅▄▃▅▃▄▂▁▃▁▃▂▃▂

0,1
epoch,15.0
train_acc,83.008
train_loss,0.53022
val_acc,76.03368
val_loss,0.8105


[34m[1mwandb[0m: Agent Starting Run: vf7xpo41 with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.7330, Acc: 46.50%
Val Loss: 0.9966, Acc: 68.05%
Epoch 2/15
Train Loss: 0.9578, Acc: 70.21%
Val Loss: 0.9333, Acc: 71.18%
Epoch 3/15
Train Loss: 0.8146, Acc: 74.97%
Val Loss: 0.8729, Acc: 73.35%
Epoch 4/15
Train Loss: 0.7570, Acc: 76.58%
Val Loss: 0.8416, Acc: 73.85%
Epoch 5/15
Train Loss: 0.7062, Acc: 78.17%
Val Loss: 0.8570, Acc: 74.88%
Epoch 6/15
Train Loss: 0.6742, Acc: 79.21%
Val Loss: 0.8799, Acc: 74.75%
Epoch 7/15
Train Loss: 0.6451, Acc: 80.02%
Val Loss: 0.8132, Acc: 74.91%
Epoch 8/15
Train Loss: 0.6207, Acc: 80.71%
Val Loss: 0.8467, Acc: 75.56%
Epoch 9/15
Train Loss: 0.6167, Acc: 80.77%
Val Loss: 0.8610, Acc: 75.24%
Epoch 10/15
Train Loss: 0.6051, Acc: 80.92%
Val Loss: 0.8614, Acc: 75.21%
Epoch 11/15
Train Loss: 0.5856, Acc: 81.57%
Val Loss: 0.8465, Acc: 75.60%
Epoch 12/15
Train Loss: 0.5792, Acc: 81.66%
Val Loss: 0.8150, Acc: 75.65%
Epoch 13/15
Train Loss: 0.5691, Acc: 82.00%
Val Loss: 0.8408, Acc: 75.73%
Epoch 14/15
Train Loss: 0.5706, Acc: 81.84%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▆▆▇▇▇█▇▇█████
val_loss,█▆▃▂▃▄▁▂▃▃▂▁▂▁▁

0,1
epoch,15.0
train_acc,82.46392
train_loss,0.55308
val_acc,76.13206
val_loss,0.82


[34m[1mwandb[0m: Agent Starting Run: 682bk2iw with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
Train Loss: 2.1580, Acc: 37.31%
Val Loss: 1.2994, Acc: 60.84%
Epoch 2/10
Train Loss: 1.3232, Acc: 61.32%
Val Loss: 1.0723, Acc: 68.42%
Epoch 3/10
Train Loss: 1.1875, Acc: 65.59%
Val Loss: 1.0218, Acc: 69.09%
Epoch 4/10
Train Loss: 1.1235, Acc: 67.59%
Val Loss: 1.0007, Acc: 70.46%
Epoch 5/10
Train Loss: 1.0876, Acc: 68.58%
Val Loss: 0.9795, Acc: 71.07%
Epoch 6/10
Train Loss: 1.0632, Acc: 69.42%
Val Loss: 0.9620, Acc: 71.02%
Epoch 7/10
Train Loss: 1.0424, Acc: 69.96%
Val Loss: 0.9723, Acc: 70.74%
Epoch 8/10
Train Loss: 1.0245, Acc: 70.54%
Val Loss: 0.9795, Acc: 71.00%
Epoch 9/10
Train Loss: 1.0126, Acc: 70.90%
Val Loss: 0.9656, Acc: 71.60%
Epoch 10/10
Train Loss: 1.0012, Acc: 71.22%
Val Loss: 0.9512, Acc: 71.67%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▆▆▇██▇███
val_loss,█▃▂▂▂▁▁▂▁▁

0,1
epoch,10.0
train_acc,71.21538
train_loss,1.00123
val_acc,71.66749
val_loss,0.95121


[34m[1mwandb[0m: Agent Starting Run: y0vvjpjv with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
Train Loss: 2.1848, Acc: 36.30%
Val Loss: 1.4162, Acc: 56.57%
Epoch 2/10
Train Loss: 1.1347, Acc: 65.97%
Val Loss: 0.9780, Acc: 69.51%
Epoch 3/10
Train Loss: 0.9094, Acc: 72.94%
Val Loss: 0.9145, Acc: 71.97%
Epoch 4/10
Train Loss: 0.8226, Acc: 75.40%
Val Loss: 0.8949, Acc: 72.81%
Epoch 5/10
Train Loss: 0.7705, Acc: 76.84%
Val Loss: 0.8672, Acc: 73.48%
Epoch 6/10
Train Loss: 0.7365, Acc: 77.69%
Val Loss: 0.8350, Acc: 73.43%
Epoch 7/10
Train Loss: 0.7109, Acc: 78.32%
Val Loss: 0.8462, Acc: 74.03%
Epoch 8/10
Train Loss: 0.6881, Acc: 78.93%
Val Loss: 0.8172, Acc: 74.46%
Epoch 9/10
Train Loss: 0.6676, Acc: 79.38%
Val Loss: 0.8003, Acc: 74.72%
Epoch 10/10
Train Loss: 0.6536, Acc: 79.73%
Val Loss: 0.7954, Acc: 74.49%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▆▇▇██████
val_loss,█▃▂▂▂▁▂▁▁▁

0,1
epoch,10.0
train_acc,79.72945
train_loss,0.65364
val_acc,74.49437
val_loss,0.79536


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 1sxvkv5x with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
Train Loss: 1.8565, Acc: 44.54%
Val Loss: 1.0665, Acc: 67.06%
Epoch 2/10
Train Loss: 0.9642, Acc: 70.94%
Val Loss: 0.9299, Acc: 71.33%
Epoch 3/10
Train Loss: 0.8129, Acc: 75.43%
Val Loss: 0.9166, Acc: 72.70%
Epoch 4/10
Train Loss: 0.7393, Acc: 77.52%
Val Loss: 0.8785, Acc: 73.05%
Epoch 5/10
Train Loss: 0.7016, Acc: 78.41%
Val Loss: 0.8376, Acc: 73.92%
Epoch 6/10
Train Loss: 0.6830, Acc: 78.70%
Val Loss: 0.8279, Acc: 74.46%
Epoch 7/10
Train Loss: 0.6344, Acc: 80.40%
Val Loss: 0.8265, Acc: 75.28%
Epoch 8/10
Train Loss: 0.6350, Acc: 80.02%
Val Loss: 0.8087, Acc: 75.31%
Epoch 9/10
Train Loss: 0.6265, Acc: 80.15%
Val Loss: 0.8231, Acc: 74.95%
Epoch 10/10
Train Loss: 0.5942, Acc: 81.39%
Val Loss: 0.8046, Acc: 75.63%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇▇▇████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▄▆▆▇▇██▇█
val_loss,█▄▄▃▂▂▂▁▁▁

0,1
epoch,10.0
train_acc,81.39287
train_loss,0.5942
val_acc,75.63439
val_loss,0.80463


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: eqpz333v with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001


Epoch 1/15
Train Loss: 2.4409, Acc: 28.78%
Val Loss: 2.0366, Acc: 35.57%
Epoch 2/15
Train Loss: 1.7160, Acc: 47.78%
Val Loss: 1.4151, Acc: 58.57%
Epoch 3/15
Train Loss: 1.1653, Acc: 67.04%
Val Loss: 1.1424, Acc: 67.59%
Epoch 4/15
Train Loss: 0.9786, Acc: 72.42%
Val Loss: 1.0481, Acc: 69.87%
Epoch 5/15
Train Loss: 0.8922, Acc: 74.40%
Val Loss: 0.9820, Acc: 71.76%
Epoch 6/15
Train Loss: 0.8450, Acc: 75.52%
Val Loss: 0.9760, Acc: 71.70%
Epoch 7/15
Train Loss: 0.8054, Acc: 76.48%
Val Loss: 1.0276, Acc: 70.94%
Epoch 8/15
Train Loss: 0.7711, Acc: 77.39%
Val Loss: 0.9448, Acc: 72.84%
Epoch 9/15
Train Loss: 0.7504, Acc: 77.85%
Val Loss: 0.9140, Acc: 73.20%
Epoch 10/15
Train Loss: 0.7296, Acc: 78.43%
Val Loss: 0.9243, Acc: 73.13%
Epoch 11/15
Train Loss: 0.7190, Acc: 78.51%
Val Loss: 0.9042, Acc: 73.53%
Epoch 12/15
Train Loss: 0.7047, Acc: 78.81%
Val Loss: 0.9551, Acc: 72.63%
Epoch 13/15
Train Loss: 0.6898, Acc: 79.20%
Val Loss: 0.9016, Acc: 73.65%
Epoch 14/15
Train Loss: 0.6815, Acc: 79.40%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▄▆▇▇▇█████████
train_loss,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▅▇▇██▇████████
val_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,15.0
train_acc,79.72835
train_loss,0.66803
val_acc,73.8665
val_loss,0.90026


[34m[1mwandb[0m: Agent Starting Run: eh9vz2pc with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
Train Loss: 1.9374, Acc: 41.61%
Val Loss: 1.2123, Acc: 62.61%
Epoch 2/10
Train Loss: 0.9624, Acc: 71.29%
Val Loss: 0.9207, Acc: 71.33%
Epoch 3/10
Train Loss: 0.8048, Acc: 75.78%
Val Loss: 0.9002, Acc: 72.65%
Epoch 4/10
Train Loss: 0.7378, Acc: 77.64%
Val Loss: 0.8695, Acc: 73.26%
Epoch 5/10
Train Loss: 0.6903, Acc: 78.77%
Val Loss: 0.8785, Acc: 73.78%
Epoch 6/10
Train Loss: 0.6619, Acc: 79.56%
Val Loss: 0.8391, Acc: 74.14%
Epoch 7/10
Train Loss: 0.6459, Acc: 80.00%
Val Loss: 0.8238, Acc: 74.57%
Epoch 8/10
Train Loss: 0.6217, Acc: 80.50%
Val Loss: 0.7963, Acc: 75.13%
Epoch 9/10
Train Loss: 0.6067, Acc: 80.91%
Val Loss: 0.8051, Acc: 74.92%
Epoch 10/10
Train Loss: 0.5939, Acc: 81.28%
Val Loss: 0.8182, Acc: 75.26%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇██████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▆▇▇▇▇████
val_loss,█▃▃▂▂▂▁▁▁▁

0,1
epoch,10.0
train_acc,81.2814
train_loss,0.5939
val_acc,75.25535
val_loss,0.81824


[34m[1mwandb[0m: Agent Starting Run: 05iwrav9 with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 2.2575, Acc: 33.58%
Val Loss: 1.7159, Acc: 45.53%
Epoch 2/15
Train Loss: 1.3144, Acc: 59.91%
Val Loss: 1.0481, Acc: 68.16%
Epoch 3/15
Train Loss: 0.9773, Acc: 70.71%
Val Loss: 0.9763, Acc: 70.21%
Epoch 4/15
Train Loss: 0.8661, Acc: 74.01%
Val Loss: 0.9087, Acc: 71.59%
Epoch 5/15
Train Loss: 0.8032, Acc: 75.73%
Val Loss: 0.8968, Acc: 72.41%
Epoch 6/15
Train Loss: 0.7693, Acc: 76.42%
Val Loss: 0.8466, Acc: 73.13%
Epoch 7/15
Train Loss: 0.7305, Acc: 77.61%
Val Loss: 0.8583, Acc: 73.68%
Epoch 8/15
Train Loss: 0.7100, Acc: 78.10%
Val Loss: 0.8433, Acc: 74.05%
Epoch 9/15
Train Loss: 0.6819, Acc: 79.05%
Val Loss: 0.8332, Acc: 74.49%
Epoch 10/15
Train Loss: 0.6672, Acc: 79.38%
Val Loss: 0.8466, Acc: 74.78%
Epoch 11/15
Train Loss: 0.6487, Acc: 79.97%
Val Loss: 0.8449, Acc: 74.85%
Epoch 12/15
Train Loss: 0.6439, Acc: 79.94%
Val Loss: 0.8206, Acc: 74.83%
Epoch 13/15
Train Loss: 0.6315, Acc: 80.19%
Val Loss: 0.8718, Acc: 74.78%
Epoch 14/15
Train Loss: 0.6157, Acc: 80.87%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▅▆▇▇▇█████████
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▆▇▇▇▇█████████
val_loss,█▃▂▂▂▁▁▁▁▁▁▁▂▁▁

0,1
epoch,15.0
train_acc,80.89293
train_loss,0.61102
val_acc,74.87341
val_loss,0.83921


[34m[1mwandb[0m: Agent Starting Run: ih4bxt3q with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.0005


Epoch 1/10
Train Loss: 2.5453, Acc: 27.60%
Val Loss: 2.0865, Acc: 36.55%
Epoch 2/10
Train Loss: 2.0107, Acc: 38.52%
Val Loss: 1.6148, Acc: 48.87%
Epoch 3/10
Train Loss: 1.6613, Acc: 48.91%
Val Loss: 1.2463, Acc: 61.50%
Epoch 4/10
Train Loss: 1.4642, Acc: 55.17%
Val Loss: 1.1292, Acc: 64.97%
Epoch 5/10
Train Loss: 1.3559, Acc: 58.80%
Val Loss: 1.0711, Acc: 67.61%
Epoch 6/10
Train Loss: 1.2791, Acc: 61.39%
Val Loss: 1.0266, Acc: 68.66%
Epoch 7/10
Train Loss: 1.2338, Acc: 62.81%
Val Loss: 0.9922, Acc: 69.91%
Epoch 8/10
Train Loss: 1.1852, Acc: 64.32%
Val Loss: 1.0072, Acc: 70.10%
Epoch 9/10
Train Loss: 1.1544, Acc: 65.40%
Val Loss: 0.9742, Acc: 70.57%
Epoch 10/10
Train Loss: 1.1238, Acc: 66.45%
Val Loss: 0.9838, Acc: 70.52%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▅▆▇▇▇███
train_loss,█▅▄▃▂▂▂▁▁▁
val_acc,▁▄▆▇▇█████
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,66.4453
train_loss,1.12379
val_acc,70.52458
val_loss,0.98381


[34m[1mwandb[0m: Agent Starting Run: vihpbsf2 with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 128
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
Train Loss: 1.5892, Acc: 51.98%
Val Loss: 1.0002, Acc: 69.75%
Epoch 2/10
Train Loss: 0.9114, Acc: 72.33%
Val Loss: 0.8719, Acc: 72.41%
Epoch 3/10
Train Loss: 0.8195, Acc: 75.04%
Val Loss: 0.8747, Acc: 73.35%
Epoch 4/10
Train Loss: 0.7654, Acc: 76.59%
Val Loss: 0.8784, Acc: 72.95%
Epoch 5/10
Train Loss: 0.7339, Acc: 77.55%
Val Loss: 0.8646, Acc: 73.50%
Epoch 6/10
Train Loss: 0.7078, Acc: 78.24%
Val Loss: 0.8785, Acc: 73.62%
Epoch 7/10
Train Loss: 0.6927, Acc: 78.52%
Val Loss: 0.8622, Acc: 74.21%
Epoch 8/10
Train Loss: 0.6837, Acc: 78.85%
Val Loss: 0.8252, Acc: 74.74%
Epoch 9/10
Train Loss: 0.6792, Acc: 78.83%
Val Loss: 0.8278, Acc: 74.50%
Epoch 10/10
Train Loss: 0.6512, Acc: 79.82%
Val Loss: 0.8382, Acc: 74.60%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▅▆▅▆▆▇███
val_loss,█▃▃▃▃▃▂▁▁▂

0,1
epoch,10.0
train_acc,79.82436
train_loss,0.65116
val_acc,74.60432
val_loss,0.83819


[34m[1mwandb[0m: Agent Starting Run: fq5wifyo with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
Train Loss: 1.0451, Acc: 68.22%
Val Loss: 0.9252, Acc: 71.66%
Epoch 2/10
Train Loss: 0.6480, Acc: 79.72%
Val Loss: 0.8622, Acc: 73.75%
Epoch 3/10
Train Loss: 0.5954, Acc: 81.10%
Val Loss: 0.8244, Acc: 74.69%
Epoch 4/10
Train Loss: 0.5648, Acc: 81.87%
Val Loss: 0.7952, Acc: 74.87%
Epoch 5/10
Train Loss: 0.5379, Acc: 82.78%
Val Loss: 0.7798, Acc: 75.47%
Epoch 6/10
Train Loss: 0.5222, Acc: 83.10%
Val Loss: 0.8445, Acc: 75.85%
Epoch 7/10
Train Loss: 0.5151, Acc: 83.24%
Val Loss: 0.7786, Acc: 75.55%
Epoch 8/10
Train Loss: 0.4989, Acc: 83.65%
Val Loss: 0.8201, Acc: 75.71%
Epoch 9/10
Train Loss: 0.4874, Acc: 84.05%
Val Loss: 0.8110, Acc: 76.06%
Epoch 10/10
Train Loss: 0.4824, Acc: 84.12%
Val Loss: 0.7887, Acc: 75.52%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▄▆▆▇█▇▇█▇
val_loss,█▅▃▂▁▄▁▃▃▁

0,1
epoch,10.0
train_acc,84.12375
train_loss,0.4824
val_acc,75.51575
val_loss,0.78872


[34m[1mwandb[0m: Agent Starting Run: v6uqq6tb with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.2023, Acc: 63.48%
Val Loss: 0.9374, Acc: 70.98%
Epoch 2/15
Train Loss: 0.7656, Acc: 76.32%
Val Loss: 0.8799, Acc: 73.31%
Epoch 3/15
Train Loss: 0.6832, Acc: 78.70%
Val Loss: 0.8715, Acc: 74.01%
Epoch 4/15
Train Loss: 0.6542, Acc: 79.45%
Val Loss: 0.8128, Acc: 74.49%
Epoch 5/15
Train Loss: 0.6223, Acc: 80.43%
Val Loss: 0.8497, Acc: 74.86%
Epoch 6/15
Train Loss: 0.6021, Acc: 80.92%
Val Loss: 0.8030, Acc: 75.01%
Epoch 7/15
Train Loss: 0.5919, Acc: 81.22%
Val Loss: 0.8172, Acc: 75.40%
Epoch 8/15
Train Loss: 0.5799, Acc: 81.53%
Val Loss: 0.8246, Acc: 75.43%
Epoch 9/15
Train Loss: 0.5694, Acc: 81.84%
Val Loss: 0.8179, Acc: 75.72%
Epoch 10/15
Train Loss: 0.5642, Acc: 81.96%
Val Loss: 0.8048, Acc: 75.19%
Epoch 11/15
Train Loss: 0.5576, Acc: 82.16%
Val Loss: 0.7998, Acc: 75.42%
Epoch 12/15
Train Loss: 0.5532, Acc: 82.27%
Val Loss: 0.8040, Acc: 75.53%
Epoch 13/15
Train Loss: 0.5416, Acc: 82.71%
Val Loss: 0.7887, Acc: 75.92%
Epoch 14/15
Train Loss: 0.5461, Acc: 82.47%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▆▇▇▇▇████████
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▆▆▇▇▇▇▇▇███
val_loss,█▆▅▃▄▂▃▃▃▂▂▂▂▁▂

0,1
epoch,15.0
train_acc,82.86094
train_loss,0.53274
val_acc,75.97581
val_loss,0.79531


[34m[1mwandb[0m: Agent Starting Run: uo6pujdn with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.6826, Acc: 49.00%
Val Loss: 0.9833, Acc: 68.59%
Epoch 2/15
Train Loss: 0.8745, Acc: 72.92%
Val Loss: 0.8375, Acc: 72.50%
Epoch 3/15
Train Loss: 0.7482, Acc: 76.71%
Val Loss: 0.8200, Acc: 74.29%
Epoch 4/15
Train Loss: 0.6899, Acc: 78.46%
Val Loss: 0.8262, Acc: 74.44%
Epoch 5/15
Train Loss: 0.6451, Acc: 79.75%
Val Loss: 0.8252, Acc: 74.69%
Epoch 6/15
Train Loss: 0.6233, Acc: 80.42%
Val Loss: 0.7637, Acc: 75.67%
Epoch 7/15
Train Loss: 0.5933, Acc: 81.31%
Val Loss: 0.8107, Acc: 75.33%
Epoch 8/15
Train Loss: 0.5816, Acc: 81.61%
Val Loss: 0.8100, Acc: 75.29%
Epoch 9/15
Train Loss: 0.5680, Acc: 82.01%
Val Loss: 0.7818, Acc: 75.72%
Epoch 10/15
Train Loss: 0.5521, Acc: 82.50%
Val Loss: 0.7805, Acc: 75.80%
Epoch 11/15
Train Loss: 0.5380, Acc: 82.88%
Val Loss: 0.8553, Acc: 75.81%
Epoch 12/15
Train Loss: 0.5332, Acc: 82.96%
Val Loss: 0.8068, Acc: 75.98%
Epoch 13/15
Train Loss: 0.5382, Acc: 82.71%
Val Loss: 0.7625, Acc: 75.95%
Epoch 14/15
Train Loss: 0.5217, Acc: 83.21%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇█████████
train_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▄▆▆▆▇▇▇▇▇▇▇▇██
val_loss,█▄▃▃▃▂▃▃▂▂▄▃▂▂▁

0,1
epoch,15.0
train_acc,83.41716
train_loss,0.51344
val_acc,76.55739
val_loss,0.7403


[34m[1mwandb[0m: Agent Starting Run: 04smivhe with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
Train Loss: 2.5109, Acc: 28.89%
Val Loss: 2.0238, Acc: 38.10%
Epoch 2/10
Train Loss: 1.5624, Acc: 50.93%
Val Loss: 1.1102, Acc: 65.64%
Epoch 3/10
Train Loss: 1.0185, Acc: 68.82%
Val Loss: 0.9203, Acc: 70.50%
Epoch 4/10
Train Loss: 0.8715, Acc: 73.00%
Val Loss: 0.8712, Acc: 72.10%
Epoch 5/10
Train Loss: 0.7973, Acc: 75.35%
Val Loss: 0.8537, Acc: 72.46%
Epoch 6/10
Train Loss: 0.7430, Acc: 76.99%
Val Loss: 0.8379, Acc: 73.87%
Epoch 7/10
Train Loss: 0.7068, Acc: 78.18%
Val Loss: 0.8035, Acc: 74.40%
Epoch 8/10
Train Loss: 0.6847, Acc: 78.60%
Val Loss: 0.8081, Acc: 74.29%
Epoch 9/10
Train Loss: 0.6562, Acc: 79.48%
Val Loss: 0.8043, Acc: 74.90%
Epoch 10/10
Train Loss: 0.6404, Acc: 80.00%
Val Loss: 0.8069, Acc: 75.12%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▆▇▇█████
train_loss,█▄▂▂▂▁▁▁▁▁
val_acc,▁▆▇▇▇█████
val_loss,█▃▂▁▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,80.0048
train_loss,0.64039
val_acc,75.11646
val_loss,0.80689


[34m[1mwandb[0m: Agent Starting Run: z4a2zz82 with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.3089, Acc: 60.64%
Val Loss: 0.9456, Acc: 71.43%
Epoch 2/15
Train Loss: 0.7817, Acc: 76.15%
Val Loss: 0.8573, Acc: 73.20%
Epoch 3/15
Train Loss: 0.7002, Acc: 78.28%
Val Loss: 0.8484, Acc: 73.97%
Epoch 4/15
Train Loss: 0.6577, Acc: 79.48%
Val Loss: 0.8318, Acc: 74.52%
Epoch 5/15
Train Loss: 0.6275, Acc: 80.33%
Val Loss: 0.8067, Acc: 74.87%
Epoch 6/15
Train Loss: 0.6082, Acc: 80.82%
Val Loss: 0.7863, Acc: 75.19%
Epoch 7/15
Train Loss: 0.5930, Acc: 81.26%
Val Loss: 0.7977, Acc: 75.16%
Epoch 8/15
Train Loss: 0.5812, Acc: 81.62%
Val Loss: 0.7925, Acc: 75.02%
Epoch 9/15
Train Loss: 0.5642, Acc: 82.12%
Val Loss: 0.8068, Acc: 75.41%
Epoch 10/15
Train Loss: 0.5582, Acc: 82.19%
Val Loss: 0.8024, Acc: 75.83%
Epoch 11/15
Train Loss: 0.5597, Acc: 82.06%
Val Loss: 0.7603, Acc: 75.66%
Epoch 12/15
Train Loss: 0.5426, Acc: 82.60%
Val Loss: 0.8041, Acc: 75.50%
Epoch 13/15
Train Loss: 0.5383, Acc: 82.76%
Val Loss: 0.8117, Acc: 75.97%
Epoch 14/15
Train Loss: 0.5326, Acc: 82.88%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▇▇▇▇▇████████
train_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▆▇▇▇▇██▇██▇
val_loss,█▅▄▄▃▂▂▂▃▃▁▃▃▃▃

0,1
epoch,15.0
train_acc,82.96027
train_loss,0.52844
val_acc,75.46946
val_loss,0.80888


[34m[1mwandb[0m: Agent Starting Run: 2yew837r with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 128
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/15
Train Loss: 1.1083, Acc: 67.11%
Val Loss: 0.9629, Acc: 70.57%
Epoch 2/15
Train Loss: 0.6854, Acc: 79.01%
Val Loss: 0.8795, Acc: 72.39%
Epoch 3/15
Train Loss: 0.6285, Acc: 80.40%
Val Loss: 0.8690, Acc: 73.82%
Epoch 4/15
Train Loss: 0.5906, Acc: 81.47%
Val Loss: 0.8232, Acc: 74.25%
Epoch 5/15
Train Loss: 0.5742, Acc: 81.76%
Val Loss: 0.8182, Acc: 73.68%
Epoch 6/15
Train Loss: 0.5536, Acc: 82.38%
Val Loss: 0.8186, Acc: 74.91%
Epoch 7/15
Train Loss: 0.5413, Acc: 82.65%
Val Loss: 0.8081, Acc: 74.86%
Epoch 8/15
Train Loss: 0.5237, Acc: 83.29%
Val Loss: 0.8119, Acc: 74.97%
Epoch 9/15
Train Loss: 0.5111, Acc: 83.60%
Val Loss: 0.7972, Acc: 75.36%
Epoch 10/15
Train Loss: 0.5088, Acc: 83.53%
Val Loss: 0.7974, Acc: 75.48%
Epoch 11/15
Train Loss: 0.5022, Acc: 83.76%
Val Loss: 0.8097, Acc: 75.19%
Epoch 12/15
Train Loss: 0.4964, Acc: 83.88%
Val Loss: 0.8004, Acc: 75.36%
Epoch 13/15
Train Loss: 0.4877, Acc: 84.08%
Val Loss: 0.8045, Acc: 75.42%
Epoch 14/15
Train Loss: 0.4861, Acc: 84.06%
Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_acc,▁▆▆▇▇▇▇████████
train_loss,█▃▃▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▄▆▆▅▇▇▇█████▇█
val_loss,█▅▄▂▂▂▂▂▁▁▂▁▁▂▁

0,1
epoch,15.0
train_acc,84.4082
train_loss,0.47749
val_acc,75.47814
val_loss,0.79498


[34m[1mwandb[0m: Agent Starting Run: wbrsydns with config:
[34m[1mwandb[0m: 	attention_type: concat
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001


Epoch 1/10
Train Loss: 1.5142, Acc: 54.02%
Val Loss: 1.0001, Acc: 68.74%
Epoch 2/10
Train Loss: 0.8397, Acc: 74.15%
Val Loss: 0.8635, Acc: 72.71%
Epoch 3/10
Train Loss: 0.7281, Acc: 77.54%
Val Loss: 0.8369, Acc: 73.72%
Epoch 4/10
Train Loss: 0.6878, Acc: 78.64%
Val Loss: 0.8221, Acc: 74.04%
Epoch 5/10
Train Loss: 0.6428, Acc: 80.01%
Val Loss: 0.8310, Acc: 74.59%
Epoch 6/10
Train Loss: 0.6204, Acc: 80.64%
Val Loss: 0.8199, Acc: 74.61%
Epoch 7/10
Train Loss: 0.6094, Acc: 80.95%
Val Loss: 0.8108, Acc: 74.48%
Epoch 8/10
Train Loss: 0.5889, Acc: 81.50%
Val Loss: 0.8158, Acc: 75.18%
Epoch 9/10
Train Loss: 0.5622, Acc: 82.34%
Val Loss: 0.8262, Acc: 75.21%
Epoch 10/10
Train Loss: 0.5605, Acc: 82.30%
Val Loss: 0.8074, Acc: 75.14%


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇▇█████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▁▅▆▇▇▇▇███
val_loss,█▃▂▂▂▁▁▁▂▁

0,1
epoch,10.0
train_acc,82.30252
train_loss,0.56055
val_acc,75.13961
val_loss,0.80738


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import os
import random
import wandb

# =======================
# Best Configuration
# =======================
# Hyperparameters of the configuration used for the final training run.
best_config = dict(
    # Model size
    embedding_dim=256,
    hidden_dim=256,
    enc_layers=2,
    dec_layers=2,
    cell_type='LSTM',
    dropout=0.5,
    # Training / decoding
    epochs=15,
    beam_size=5,
    attention_type='concat',
    batch_size=256,
    learning_rate=0.001,
)

# =======================
# Vocabulary
# =======================
class Vocab:
    """Character-level vocabulary with three reserved special tokens.

    Ids 0, 1 and 2 are always ``<pad>``, ``<sos>`` and ``<eos>``; every other
    character receives the next free id in the order it is first seen.
    """

    def __init__(self):
        specials = ("<pad>", "<sos>", "<eos>")
        self.char2idx = {token: idx for idx, token in enumerate(specials)}
        self.idx2char = {idx: token for idx, token in enumerate(specials)}
        self.size = len(specials)

    def build(self, texts):
        """Register every previously unseen character found in ``texts``."""
        for text in texts:
            for char in text:
                if char in self.char2idx:
                    continue
                next_id = self.size
                self.char2idx[char] = next_id
                self.idx2char[next_id] = char
                self.size = next_id + 1

    def encode(self, text):
        """Map each character of ``text`` to its id (KeyError if unknown)."""
        lookup = self.char2idx
        return [lookup[ch] for ch in text]

    def decode(self, idxs):
        """Turn ids back into a string, dropping the special ids 0, 1 and 2."""
        pieces = []
        for i in idxs:
            if i in [0, 1, 2]:
                continue
            pieces.append(self.idx2char[i])
        return ''.join(pieces)

# =======================
# Dataset
# =======================
class TransliterationDataset(Dataset):
    """Pairs of (source, target) strings read from a tab-separated file.

    Only the first two tab-separated columns of each line are used; lines
    with fewer than two columns are skipped. Unless ``is_test`` is set, the
    characters of the loaded pairs are added to the supplied vocabularies.

    Args:
        filepath: Path of the UTF-8 TSV file to read.
        inp_vocab: Vocabulary used to encode the first column.
        out_vocab: Vocabulary used to encode the second column.
        is_test: When true, the vocabularies are left untouched and items
            are returned without an encoded target.
    """

    def __init__(self, filepath, inp_vocab, out_vocab, is_test=False):
        with open(filepath, encoding='utf-8') as handle:
            rows = (raw_line.strip().split('\t') for raw_line in handle)
            self.pairs = [(cols[0], cols[1]) for cols in rows if len(cols) >= 2]
        if not is_test:
            sources = [pair[0] for pair in self.pairs]
            targets = [pair[1] for pair in self.pairs]
            inp_vocab.build(sources)
            out_vocab.build(targets)
        self.inp_vocab = inp_vocab
        self.out_vocab = out_vocab
        self.is_test = is_test

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(x, lat, dev)`` in test mode, else ``(x, y, lat, dev)``.

        ``x`` holds the encoded source; ``y`` is the encoded target wrapped
        in the ``<sos>`` / ``<eos>`` ids.
        """
        lat, dev = self.pairs[idx]
        source_ids = self.inp_vocab.encode(lat)
        if self.is_test:
            return torch.tensor(source_ids), lat, dev
        sos_id = self.out_vocab.char2idx["<sos>"]
        target_ids = [sos_id] + self.out_vocab.encode(dev) + [self.out_vocab.char2idx["<eos>"]]
        return torch.tensor(source_ids), torch.tensor(target_ids), lat, dev

def collate_fn(batch):
    """Pad a list of dataset items into batch tensors.

    Items with three fields are treated as test items ``(x, lat, dev)`` and
    yield ``(x_pad, lat, dev, x_lens)``. Any other item layout is unpacked as
    ``(x, y, lat, dev)`` and yields
    ``(x_pad, y_pad, x_lens, y_lens, lat, dev)``. Padding uses id 0.
    """
    pad = nn.utils.rnn.pad_sequence
    is_test_batch = len(batch[0]) == 3

    if not is_test_batch:
        # Train/val batch
        sources, targets, lat, dev = zip(*batch)
        source_lengths = torch.tensor([len(seq) for seq in sources])
        target_lengths = torch.tensor([len(seq) for seq in targets])
        padded_sources = pad(sources, batch_first=True, padding_value=0)
        padded_targets = pad(targets, batch_first=True, padding_value=0)
        return padded_sources, padded_targets, source_lengths, target_lengths, lat, dev

    # Test batch
    sources, lat, dev = zip(*batch)
    source_lengths = torch.tensor([len(seq) for seq in sources])
    padded_sources = pad(sources, batch_first=True, padding_value=0)
    return padded_sources, lat, dev, source_lengths

# =======================
# Model Components
# =======================
class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a recurrent network.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding width.
        hidden_dim: Hidden width of the recurrent layers.
        num_layers: Number of stacked recurrent layers.
        cell_type: One of ``"GRU"``, ``"LSTM"`` or ``"RNN"``.
        dropout: Dropout between recurrent layers; forced to 0 when there is
            only a single layer.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        cell_classes = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}
        recurrent_cls = cell_classes[cell_type]
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.rnn = recurrent_cls(
            emb_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

    def forward(self, x, lengths):
        """Encode padded ids ``x`` whose true lengths are given by ``lengths``.

        Returns the re-padded per-step outputs and the final hidden state as
        produced by the recurrent layer.
        """
        packed_input = nn.utils.rnn.pack_padded_sequence(
            self.embedding(x),
            lengths.cpu(),  # lengths are moved to CPU before packing
            batch_first=True,
            enforce_sorted=False,
        )
        packed_output, hidden = self.rnn(packed_input)
        outputs = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)[0]
        return outputs, hidden

class Attention(nn.Module):
    """Scores encoder outputs against a decoder state and builds a context.

    Supported scoring modes:
      * ``'general'`` - dot product with a linearly transformed decoder state.
      * ``'concat'``  - ``v(tanh(W[hidden; encoder_output]))``.
      * anything else - plain dot product (no learnable parameters).
    """

    def __init__(self, hidden_dim, attention_type='general'):
        super().__init__()
        self.attention_type = attention_type
        if attention_type == 'concat':
            self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
            self.v = nn.Linear(hidden_dim, 1, bias=False)
        elif attention_type == 'general':
            self.attn = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, hidden, encoder_outputs, mask=None):
        """Return ``(context, attention_weights)`` for one decoding step.

        Positions where ``mask == 0`` receive a large negative score before
        the softmax over the source dimension.
        """
        _, src_len, _ = encoder_outputs.size()
        mode = self.attention_type

        if mode == 'general':
            query = self.attn(hidden).unsqueeze(-1)
            scores = torch.bmm(encoder_outputs, query).squeeze(-1)
        elif mode == 'concat':
            tiled_hidden = hidden.unsqueeze(1).expand(-1, src_len, -1)
            joined = torch.cat((tiled_hidden, encoder_outputs), dim=2)
            scores = self.v(torch.tanh(self.attn(joined))).squeeze(-1)
        else:  # dot
            scores = torch.bmm(encoder_outputs, hidden.unsqueeze(-1)).squeeze(-1)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e10)

        weights = F.softmax(scores, dim=1)
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context, weights

class Decoder(nn.Module):
    """Single-step attentional decoder.

    Each call attends over the encoder outputs using the top-layer hidden
    state, feeds ``[embedding; context]`` to the recurrent layer and predicts
    vocabulary logits from ``[new top-layer hidden; context]``.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, cell_type, dropout, attention_type):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        cell_classes = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}
        recurrent_cls = cell_classes[cell_type]
        inter_layer_dropout = dropout if num_layers > 1 else 0
        # The recurrent input is the token embedding concatenated with the context.
        self.rnn = recurrent_cls(
            emb_dim + hidden_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.attention = Attention(hidden_dim, attention_type)
        self.out = nn.Linear(hidden_dim * 2, vocab_size)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def _top_layer_state(hidden):
        """Last-layer hidden state; for an LSTM ``(h, c)`` tuple, taken from ``h``."""
        state = hidden[0] if isinstance(hidden, tuple) else hidden
        return state[-1]

    def forward(self, input_token, hidden, encoder_outputs, mask=None):
        """Run one decoding step.

        Returns ``(prediction, hidden, None)``; the third slot is always
        ``None`` (the attention weights are not returned).
        """
        query = self._top_layer_state(hidden)
        context, _ = self.attention(query, encoder_outputs, mask)

        token_embedding = self.embedding(input_token)
        step_input = torch.cat((token_embedding, context), dim=1).unsqueeze(1)
        _, hidden = self.rnn(step_input, hidden)

        features = torch.cat((self._top_layer_state(hidden), context), dim=1)
        prediction = self.out(self.dropout(features))
        return prediction, hidden, None

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper with teacher forcing and greedy decoding.

    Args:
        encoder: Module called as ``encoder(src, src_lens)`` and returning
            ``(encoder_outputs, hidden)``.
        decoder: Module called as
            ``decoder(token, hidden, encoder_outputs, mask)`` and returning
            ``(logits, hidden, _)``. It must expose ``out.out_features`` (the
            target vocabulary size).
        device: Device on which masks and output buffers are allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def create_mask(self, src_lens, max_len):
        """Build a ``(batch, max_len)`` float mask: 1 for real tokens, 0 for padding.

        ``src_lens`` may be a tensor or a sequence of ints.
        """
        lengths = torch.as_tensor(src_lens, device=self.device)
        positions = torch.arange(max_len, device=self.device)
        # Position j of row i is valid iff j < length_i.
        return (positions.unsqueeze(0) < lengths.unsqueeze(1)).float()

    def _init_decoder_hidden(self, enc_hidden):
        """Adapt the encoder's final state to the decoder's layer count.

        When the layer counts match (or the decoder exposes no
        ``rnn.num_layers``), the state is passed through untouched. With more
        encoder layers, the top-most layers are kept; with fewer, the top
        encoder layer is repeated to fill the extra decoder layers.
        """
        dec_layers = getattr(getattr(self.decoder, "rnn", None), "num_layers", None)
        if dec_layers is None:
            return enc_hidden

        def _fit(state):
            enc_layers = state.size(0)
            if enc_layers == dec_layers:
                return state
            if enc_layers > dec_layers:
                return state[-dec_layers:].contiguous()
            filler = state[-1:].expand(dec_layers - enc_layers, -1, -1)
            return torch.cat((state, filler), dim=0).contiguous()

        if isinstance(enc_hidden, tuple):  # LSTM: (h, c)
            return tuple(_fit(state) for state in enc_hidden)
        return _fit(enc_hidden)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step and return the per-step logits.

        Args:
            src: Tuple ``(src_data, src_lens)``.
            trg: Target ids of shape ``(batch, trg_len)``; column 0 is used
                as the first decoder input.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own prediction.

        Returns:
            Tensor of shape ``(batch, trg_len, vocab_size)``; position 0 is
            left as zeros.
        """
        src_data, src_lens = src
        encoder_outputs, enc_hidden = self.encoder(src_data, src_lens)
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size, device=self.device)

        mask = self.create_mask(src_lens, encoder_outputs.size(1))
        dec_hidden = self._init_decoder_hidden(enc_hidden)

        input_token = trg[:, 0]
        for t in range(1, trg_len):
            output, dec_hidden, _ = self.decoder(input_token, dec_hidden, encoder_outputs, mask)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input_token = trg[:, t] if teacher_force else top1

        return outputs

    def predict(self, src, src_lens, max_len=30):
        """Greedily decode a single source sequence.

        Only a batch of one is supported: decoding starts from a single
        ``<sos>`` id (1) and stops at ``<eos>`` (2) or after ``max_len``
        steps. Note that this switches the module to eval mode and leaves it
        there.

        Returns:
            List of predicted token ids, without ``<sos>`` / ``<eos>``.
        """
        self.eval()
        with torch.no_grad():
            encoder_outputs, enc_hidden = self.encoder(src, src_lens)
            mask = self.create_mask(src_lens, encoder_outputs.size(1))
            dec_hidden = self._init_decoder_hidden(enc_hidden)

            input_token = torch.tensor([1], device=self.device)
            output_seq = []
            for _ in range(max_len):
                output, dec_hidden, _ = self.decoder(input_token, dec_hidden, encoder_outputs, mask)
                top1 = output.argmax(1)
                if top1.item() == 2:
                    break
                output_seq.append(top1.item())
                input_token = top1
        return output_seq

# =======================
# Training and Evaluation
# =======================
def train(model, loader, criterion, optimizer, device):
    """Run one training epoch.

    Args:
        model: Called as ``model((src, src_lens), trg)``; must return logits
            indexed as ``(batch, step, vocab)``.
        loader: Iterable of 6-field batches
            ``(src, trg, src_lens, y_lens, lat, dev)``.
        criterion: Loss applied to flattened logits and targets.
        optimizer: Optimizer stepped once per batch.
        device: Device the source and target tensors are moved to.

    Returns:
        ``(mean loss per batch, token accuracy in percent)``; target id 0 is
        excluded from the accuracy.
    """
    model.train()
    loss_sum = 0
    hits = 0
    n_tokens = 0

    for src, trg, src_lens, _y_lens, _lat, _dev in loader:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        output = model((src, src_lens), trg)

        # Step 0 corresponds to the start token and is never scored.
        step_logits = output[:, 1:]
        step_targets = trg[:, 1:]
        loss = criterion(
            step_logits.reshape(-1, output.shape[-1]),
            step_targets.reshape(-1),
        )
        loss.backward()
        optimizer.step()

        # Token-level accuracy over non-padding targets.
        predicted = output.argmax(dim=2)[:, 1:]
        not_pad = step_targets != 0
        hits += ((predicted == step_targets) & not_pad).sum().item()
        n_tokens += not_pad.sum().item()
        loss_sum += loss.item()

    mean_loss = loss_sum / len(loader)
    if n_tokens > 0:
        accuracy = (hits / n_tokens) * 100
    else:
        accuracy = 0
    return mean_loss, accuracy

def evaluate(model, loader, criterion, device):
    """Score the model on ``loader`` without gradient tracking.

    The model is put in eval mode and called with
    ``teacher_forcing_ratio=0``.

    Args:
        model: Called as ``model((src, src_lens), trg, teacher_forcing_ratio=0)``.
        loader: Iterable of 6-field batches
            ``(src, trg, src_lens, y_lens, lat, dev)``.
        criterion: Loss applied to flattened logits and targets.
        device: Device the source and target tensors are moved to.

    Returns:
        ``(mean loss per batch, token accuracy in percent)``; target id 0 is
        excluded from the accuracy.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    n_tokens = 0

    with torch.no_grad():
        for src, trg, src_lens, _y_lens, _lat, _dev in loader:
            src = src.to(device)
            trg = trg.to(device)
            output = model((src, src_lens), trg, teacher_forcing_ratio=0)

            # Step 0 corresponds to the start token and is never scored.
            step_logits = output[:, 1:]
            step_targets = trg[:, 1:]
            loss = criterion(
                step_logits.reshape(-1, output.shape[-1]),
                step_targets.reshape(-1),
            )

            # Token-level accuracy over non-padding targets.
            predicted = output.argmax(dim=2)[:, 1:]
            not_pad = step_targets != 0
            hits += ((predicted == step_targets) & not_pad).sum().item()
            n_tokens += not_pad.sum().item()
            loss_sum += loss.item()

    mean_loss = loss_sum / len(loader)
    if n_tokens > 0:
        accuracy = (hits / n_tokens) * 100
    else:
        accuracy = 0
    return mean_loss, accuracy

# =======================
# Main Execution
# =======================
def main():
    """Train with ``best_config``, then evaluate the best checkpoint on the test set.

    Steps:
      1. Start a wandb run and build vocabularies/datasets/loaders.
      2. Train for ``best_config['epochs']`` epochs, logging metrics and
         saving the weights whenever the validation loss improves.
      3. Reload the best weights (if any were saved), greedily decode the
         test set one example at a time and compute exact-match accuracy.
      4. Log the accuracy and a small prediction table to wandb, write the
         first 20 predictions to ``test_predictions.txt`` and print up to
         five random samples.
    """
    wandb.init(config=best_config, project="dakshina-translit")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Initialize vocabularies
    inp_vocab = Vocab()
    out_vocab = Vocab()

    # Load datasets (train and dev both contribute characters to the vocabularies)
    train_data = TransliterationDataset("/kaggle/input/devnagiridata/hi.translit.sampled.train.tsv", inp_vocab, out_vocab)
    dev_data = TransliterationDataset("/kaggle/input/devnagiridata/hi.translit.sampled.dev.tsv", inp_vocab, out_vocab)
    test_data = TransliterationDataset("/kaggle/input/devnagiridata/hi.translit.sampled.test.tsv", inp_vocab, out_vocab, is_test=True)

    # Create data loaders
    train_loader = DataLoader(train_data, batch_size=best_config['batch_size'], 
                            shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_data, batch_size=best_config['batch_size'],
                          shuffle=False, collate_fn=collate_fn)
    # Batch size 1: Seq2Seq.predict decodes a single sequence at a time.
    test_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_fn)

    # Initialize model
    encoder = Encoder(inp_vocab.size, best_config['embedding_dim'], 
                     best_config['hidden_dim'], best_config['enc_layers'], 
                     best_config['cell_type'], best_config['dropout'])
    
    decoder = Decoder(out_vocab.size, best_config['embedding_dim'],
                     best_config['hidden_dim'], best_config['dec_layers'],
                     best_config['cell_type'], best_config['dropout'],
                     best_config['attention_type'])
    
    model = Seq2Seq(encoder, decoder, device).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=best_config['learning_rate'])
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # Training loop
    best_val_loss = float('inf')
    checkpoint_saved = False  # guards against loading a missing/stale checkpoint
    for epoch in range(best_config['epochs']):
        train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, dev_loader, criterion, device)
        
        print(f"\nEpoch {epoch+1}/{best_config['epochs']}")
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc
        })
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_model.pth")
            checkpoint_saved = True
            print("Best model saved!")

    # Test evaluation
    if checkpoint_saved:
        # map_location keeps the load working if the weights were saved on another device.
        model.load_state_dict(torch.load("best_model.pth", map_location=device))
    else:
        # No epoch improved on the initial loss (e.g. NaN losses or zero epochs):
        # evaluate the current weights instead of a file from an earlier run.
        print("No checkpoint was saved during training; evaluating current weights.")
    model.eval()
    
    total_correct = 0
    total_samples = 0
    predictions = []
    
    with torch.no_grad():
        for batch in test_loader:
            src, lat, dev, src_lens = batch
            src = src.to(device)
            pred_ids = model.predict(src, src_lens)
            pred_str = out_vocab.decode(pred_ids)
            true_str = dev[0]
            
            predictions.append({
                'input': lat[0],
                'true': true_str,
                'pred': pred_str
            })
            
            # Exact-match (word-level) accuracy.
            if pred_str == true_str:
                total_correct += 1
            total_samples += 1

    # Calculate accuracy (0.0 for an empty test set instead of dividing by zero)
    accuracy = 100 * total_correct / total_samples if total_samples else 0.0
    print(f"\nTest Accuracy: {accuracy:.2f}%")
    wandb.log({"test_acc": accuracy})

    # Create and log a table of predictions
    table = wandb.Table(columns=["Input", "True", "Predicted"])
    for p in predictions[:20]:  # Log first 20 predictions
        table.add_data(p['input'], p['true'], p['pred'])
    
    wandb.log({
        "predictions": table,
        "test_accuracy": accuracy
    })

    # Save predictions (first 20 only)
    with open("test_predictions.txt", "w", encoding="utf-8") as f:
        f.write(f"Test Accuracy: {accuracy:.2f}%\n\n")
        for p in predictions[:20]:
            f.write(f"Input: {p['input']}\n")
            f.write(f"True: {p['true']}\n")
            f.write(f"Pred: {p['pred']}\n\n")
    
    # Print random samples (random.sample raises if asked for more items than exist)
    print("\nRandom Samples:")
    samples = random.sample(predictions, min(5, len(predictions)))
    for sample in samples:
        print(f"Input: {sample['input']}")
        print(f"True: {sample['true']}")
        print(f"Pred: {sample['pred']}\n")

# Run the full train/evaluate pipeline only when this code is executed
# directly (not when it is imported as a module).
if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Currently logged in as: [33mmanglesh_dlass3[0m ([33mmanglesh_dl_ass3[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



Epoch 1/15
Train Loss: 1.6700 | Train Acc: 49.26%
Val Loss: 0.9560 | Val Acc: 69.54%
Best model saved!

Epoch 2/15
Train Loss: 0.8696 | Train Acc: 73.11%
Val Loss: 0.9008 | Val Acc: 71.14%
Best model saved!

Epoch 3/15
Train Loss: 0.7541 | Train Acc: 76.55%
Val Loss: 0.8106 | Val Acc: 73.87%
Best model saved!

Epoch 4/15
Train Loss: 0.6842 | Train Acc: 78.64%
Val Loss: 0.8309 | Val Acc: 74.67%

Epoch 5/15
Train Loss: 0.6603 | Train Acc: 79.25%
Val Loss: 0.8221 | Val Acc: 75.44%

Epoch 6/15
Train Loss: 0.6305 | Train Acc: 80.20%
Val Loss: 0.7608 | Val Acc: 75.27%
Best model saved!

Epoch 7/15
Train Loss: 0.6054 | Train Acc: 80.90%
Val Loss: 0.7964 | Val Acc: 75.49%

Epoch 8/15
Train Loss: 0.5982 | Train Acc: 81.02%
Val Loss: 0.8013 | Val Acc: 75.42%

Epoch 9/15
Train Loss: 0.5822 | Train Acc: 81.49%
Val Loss: 0.7886 | Val Acc: 75.25%

Epoch 10/15
Train Loss: 0.5663 | Train Acc: 81.96%
Val Loss: 0.7844 | Val Acc: 75.90%

Epoch 11/15
Train Loss: 0.5470 | Train Acc: 82.60%
Val Loss: 0.771