In [1]:
# Import Lib

import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import torch
from torch.autograd import Variable 
import copy
import os
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
import random
import heapq
import wandb
# Set device (CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [10]:
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xf dakshina_dataset_v1.0.tar

--2025-05-19 20:30:33--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.203.207, 74.125.199.207, 172.253.117.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.203.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2025-05-19 20:30:40 (262 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [2]:
wandb.login(key='b8d44a4abbab8753e976a6e5ab717fd669ba99a2')


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcs24m030[0m ([33mcs24m030-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
def load_and_convert_dataset(tsv_file_path, csv_output_path=None):
    """
    Load a TSV dataset and convert it to CSV format
    
    Args:
        tsv_file_path: Path to the TSV file
        csv_output_path: Path to save the CSV file (if None, will use same name with .csv extension)
    
    Returns:
        df: Pandas DataFrame containing the dataset
        input_len: Maximum length of input sequences
        output_len: Maximum length of output sequences
    """
    # Load TSV file
    df = pd.read_csv(tsv_file_path, sep='\t', header=None)
    
    # Assuming the first column is the input and the second column is the output
    if len(df.columns) >= 2:
        df.columns = ['input', 'output'] + [f'col_{i}' for i in range(2, len(df.columns))]
        
        # Calculate length statistics
        input_len = df['input'].str.len().max()
        output_len = df['output'].str.len().max()
        
        # Save as CSV if requested
        if csv_output_path is None:
            csv_output_path = tsv_file_path.replace('.tsv', '.csv')
            
        df.to_csv(csv_output_path, index=False)
        print(f"Converted {tsv_file_path} to {csv_output_path}")
        
        return df, input_len, output_len
    else:
        print(f"Error: The TSV file {tsv_file_path} doesn't have at least 2 columns")
        return None, None, None

# Preprocessing

In [4]:
def encode(x, max_length, char_to_idx):
    """
    Encode a string into a tensor.

    Args:
    - x (str): Input string to encode.
    - max_length (int): Maximum length for the encoded tensor.
    - char_to_idx (dict): Character to index mapping.

    Returns:
    - encoded (torch.Tensor): Encoded tensor.
    - length (int): Actual length of the encoded sequence.
    """
    encoded = np.zeros(max_length, dtype=int)
    encoder = np.array([char_to_idx[char] for char in x])
    length = min(max_length, len(encoder))
    encoded[:length] = encoder[:length]

    return torch.tensor(encoded, dtype=torch.int64), length

def get_tensor_object(df, max_input_length, max_output_length, char_to_idx_input, char_to_idx_output):
    """
    Create tensor objects from a DataFrame.

    Args:
    - df (pd.DataFrame): Input DataFrame containing input and output sequences.
    - max_input_length (int): Maximum length for input sequences.
    - max_output_length (int): Maximum length for output sequences.
    - char_to_idx_input (dict): Character to index mapping for input sequences.
    - char_to_idx_output (dict): Character to index mapping for output sequences.

    Returns:
    - tensor_inputs (torch.Tensor): Tensor containing encoded input sequences.
    - tensor_outputs (torch.Tensor): Tensor containing encoded output sequences.
    """
    
    # Encode unique inputs and outputs into tensors
    encoded_inputs = []
    encoded_outputs = []

    # Encode the input column
    for input_str in df[0]:
        encoded_input, input_length = encode(input_str, max_input_length, char_to_idx_input)
        encoded_inputs.append(encoded_input)

    # Encode the output column
    for output_str in df[1]:
        encoded_output, output_length = encode(output_str, max_output_length, char_to_idx_output)
        encoded_outputs.append(encoded_output)

    # Stack tensors column-wise
    
#     tensor_inputs = torch.stack(encoded_inputs, dim=1)
#     tensor_outputs = torch.stack(encoded_outputs, dim=1)
    tensor_inputs = torch.stack(encoded_inputs)
    tensor_outputs = torch.stack(encoded_outputs)

    return tensor_inputs, tensor_outputs

def load_dataset(path):
    """
    Load a dataset from a TSV file.
    Args:
    - path (str): Path to the TSV file.
    Returns:
    - df (pd.DataFrame): Loaded DataFrame.
    - max_input_length (int): Maximum length for input sequences.
    - max_output_length (int): Maximum length for output sequences.
    """
    df = pd.read_csv(path, header=None, encoding='utf-8', sep='\t')  # Changed separator to tab
    
    # Convert values to strings before adding special characters
    df[0] = df[0].astype(str).apply(lambda x: x + '$')
    df[1] = df[1].astype(str).apply(lambda x: '^' + x + '$')
    
    # Determine maximum length for input and output sequences
    max_input_length = max(len(x) for x in df[0].unique())
    max_output_length = max(len(x) for x in df[1].unique())
    return df, max_input_length, max_output_length

def look_up_table(vocab1, vocab2, vocab3):
    """
    Create lookup tables for vocabulary mapping.

    Args:
    - vocab1 (list): First list of vocabulary items.
    - vocab2 (list): Second list of vocabulary items.
    - vocab3 (list): Third list of vocabulary items.

    Returns:
    - vocab_to_int (dict): Mapping from vocabulary items to integers.
    - int_to_vocab (dict): Mapping from integers to vocabulary items.
    """
    
    # Combine all vocabularies into one set
    vocab = set(''.join(vocab1) + ''.join(vocab2) + ''.join(vocab3))
    vocab.discard('^')  
    vocab.discard('$')  
    vocab_to_int = {"": 0, '^':1, '$':2}
    for v_i, v in enumerate(sorted(vocab), len(vocab_to_int)):
        vocab_to_int[v] = v_i
    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab




# # Load Train, Val, Test
# df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/dataset/aksharantar_sampled/hin/hin_train.csv')
# df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/dataset/aksharantar_sampled/hin/hin_valid.csv')
# df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/dataset/aksharantar_sampled/hin/hin_test.csv')

# input_max_len = max(train_input_len, val_input_len, test_input_len)
# output_max_len = max(train_out_len, val_out_len, test_out_len)


# # Create Look Up Table
# input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
# output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

# print("Input Lookup Table:", input_char_to_int)
# print("\n\n Output Lookup Table", output_char_to_int)

# # Data Embedding and Converting them into Tensor
# train_inputs, train_outputs = get_tensor_object(df_train, input_max_len, input_max_len, input_char_to_int, output_char_to_int)
# val_inputs, val_outputs = get_tensor_object(df_val, input_max_len, input_max_len, input_char_to_int, output_char_to_int)
# test_inputs, test_outputs = get_tensor_object(df_test, input_max_len, input_max_len, input_char_to_int, output_char_to_int)

# # Transpose column wise
# train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
# val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
# test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)


# print("\n", train_inputs[:,0],train_outputs[:,0])
# print("Training:", train_inputs.shape, train_outputs.shape)

# print("Validation", val_inputs.shape, val_inputs.shape)
# print(df_train.head())

# Create Seq2Seq Model

## encoder and decoder

In [5]:
class Encoder(nn.Module): 
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout, bidirectional, cell_type):
        super(Encoder, self).__init__()
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.cell_type = cell_type
        
        # Define embedding layer
        self.embedding = nn.Embedding(input_size, embedding_size)
        
        # Define RNN layer with specific cell type
        if cell_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)
        elif cell_type == 'GRU':
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)
        elif cell_type == 'RNN':
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)
        else:
            raise ValueError("Invalid RNN type. Choose from 'LSTM', 'GRU', or 'RNN'.")
        
        
    def forward(self, x): # x shape: (seq_length, N) where N is batch size
        # Perform dropout on the input
        embedding = self.embedding(x)
        embedding = self.dropout(embedding) # embedding shape: (seq_length, N, embedding_size)
        
        if self.cell_type == "LSTM":
            # Pass through the LSTM layer
            outputs, (hidden, cell) = self.rnn(embedding) # outputs shape: (seq_length, N, hidden_size)
            if self.bidirectional:
                # Sum the bidirectional outputs
                outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
                hidden = torch.cat((hidden[: self.num_layers], hidden[self.num_layers:]), dim=0)
            # Return hidden state and cell state   
            return hidden, cell
        elif self.cell_type == "GRU" or self.cell_type == "RNN":
            # Pass through the RNN/GRU layer
            outputs, hidden = self.rnn(embedding) # outputs shape: (seq_length, N, hidden_size)
            if self.bidirectional:
                # Sum the bidirectional outputs
                outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
                hidden = torch.cat((hidden[: self.num_layers], hidden[self.num_layers:]), dim=0)

            # Return hidden state and cell state
            return hidden
        else:
            print("Invalid cell_type specified for Encoder.")
            return None


class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout, bidirectional, cell_type):
        super(Decoder, self).__init__()
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(dropout)  
        self.num_layers = num_layers 
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.cell_type = cell_type
        
        # Define embedding layer
        self.embedding = nn.Embedding(input_size, embedding_size)
        
        # Define RNN layer with specific cell type
        if cell_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)
        elif cell_type == 'GRU':
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)
        elif cell_type == 'RNN':
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)
        else:
            raise ValueError("Invalid RNN type. Choose from 'LSTM', 'GRU', or 'RNN'.")
            
            
        # Define fully connected layer
        self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, output_size)  # Adjust input size for bidirectional decoder
        # Softmax layer
        self.log_softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x, hidden, cell): # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        
        # Ensure x has the shape (1, N)
        x = x.unsqueeze(0)
        
        # Perform dropout on the input
        embedding = self.embedding(x)
        embedding = self.dropout(embedding)  # embedding shape: (1, N, embedding_size)
        
        if self.cell_type == "LSTM":
            # Pass through the LSTM layer
            outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))  # outputs shape: (1, N, hidden_size * num_directions)

            # Pass through fully connected layer
            out = self.fc(outputs).squeeze(0)
            predictions = self.log_softmax(out)

            return predictions, hidden, cell
        elif self.cell_type == "GRU" or self.cell_type == "RNN":
            # Pass through the RNN/GRU layer
            outputs, hidden = self.rnn(embedding, hidden)  # outputs shape: (1, N, hidden_size * num_directions)

            # Pass through fully connected layer
            out = self.fc(outputs).squeeze(0)
            predictions = self.log_softmax(out)

            return predictions, hidden

        else:
            print("Invalid cell_type specified for Decoder.")
            return None


## Seq2Seq Class

In [6]:
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, output_char_to_int, teacher_forcing, cell_type):

        super(Seq2Seq, self).__init__()  
        # Initialize encoder and decoder
        self.decoder = decoder
        self.encoder = encoder
        self.cell_type = cell_type
        self.target_vocab_size = len(output_char_to_int)
        self.teacher_force_ratio = teacher_forcing
        
    def forward(self, source, target):
        # Get batch size, target length, and target vocabulary size
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = self.target_vocab_size
        teacher_force_ratio = self.teacher_force_ratio
        
        # Initialize outputs tensor
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(source.device)
        # Grab the first input to the Decoder which will be <SOS> token i.e '^'
        x = target[0]
        # Get hidden state and cell state from encoder
        if self.cell_type == 'LSTM':
            hidden, cell = self.encoder(source)
        else:
            hidden = self.encoder(source)
        
        for t in range(1, target_len):
            # Use previous hidden and cell states as context from encoder at start
            if self.cell_type == 'LSTM':
                output, hidden, cell = self.decoder(x, hidden, cell)
            else:
                output, hidden = self.decoder(x, hidden, None)
                
            # Store next output prediction
            outputs[t] = output
            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)
            # Update input for next time step based on teacher forcing ratio
            x = best_guess if random.random() >= teacher_force_ratio else target[t]

        return outputs

# TRAINING

In [7]:
# BEAM SEARCH FUNCTION
def beam_search(model, input_seq, max_length, input_char_index, output_char_index, reverse_target_char_index, beam_width, length_penalty, cell_type):
    """
    Perform beam search to generate a sequence using the provided model.

    Args:
    - model (nn.Module): The Seq2Seq model.
    - input_seq (str): The input sequence.
    - max_length (int): Maximum length of the input sequence.
    - input_char_index (dict): Mapping from characters to integers for the input vocabulary.
    - output_char_index (dict): Mapping from characters to integers for the output vocabulary.
    - reverse_target_char_index (dict): Reverse mapping from integers to characters for the output vocabulary.
    - beam_width (int): Beam width for beam search.
    - length_penalty (float): Length penalty for beam search.
    - cell_type (str): Type of RNN cell used in the model ('LSTM', 'GRU', or 'RNN').

    Returns:
    - str: The generated output sequence.
    """
    if len(input_seq) > max_length:
        print("Input Length is exceeding max length!!!!")
        return ""

    # Create np array of zeros of length input
    input_data = np.zeros((max_length, 1), dtype=int)  # (N,1)

    # Encode the input
    for idx, char in enumerate(input_seq):
        input_data[idx, 0] = input_char_index[char]
    input_data[idx + 1, 0] = input_char_index["$"]  # EOS

    # Convert to tensor
    input_tensor = torch.tensor(input_data, dtype=torch.int64).to(device)  # N,1

    with torch.no_grad():
        if cell_type == 'LSTM':
            hidden, cell = model.encoder(input_tensor)

        else:
            hidden = model.encoder(input_tensor)

    # Initialize beam
    out_t = output_char_index['^']
    out_reshape = np.array(out_t).reshape(1,)
    hidden_par = hidden.unsqueeze(0)
    initial_sequence = torch.tensor(out_reshape).to(device)
    beam = [(0.0, initial_sequence, hidden_par)]  # [(score, sequence, hidden)]

    for _ in range(len(output_char_index)):
        candidates = []
        for score, seq, hidden in beam:
            if seq[-1].item() == output_char_index['$']:
                # If the sequence ends with the end token, add it to the candidates
                candidates.append((score, seq, hidden))
                continue

            last_token = np.array(seq[-1].item()).reshape(1,)
            x = torch.tensor(last_token).to(device)

            if cell_type == 'LSTM':
                output, hidden, cell,  = model.decoder(x, hidden.squeeze(0), cell)
            else:
                output, hidden,  = model.decoder(x, hidden.squeeze(0), None)

            probabilities = F.softmax(output, dim=1)
            topk_probs, topk_tokens = torch.topk(probabilities, k=beam_width)

            for prob, token in zip(topk_probs[0], topk_tokens[0]):
                new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
                seq_length_norm_factor = (len(new_seq) - 1) / 5
                candidate_score = score + torch.log(prob).item() / (seq_length_norm_factor ** length_penalty)
                candidates.append((candidate_score, new_seq, hidden.unsqueeze(0)))

        # Select top-k candidates based on the accumulated scores
        beam = heapq.nlargest(beam_width, candidates, key=lambda x: x[0])

    best_score, best_sequence, _ = max(beam, key=lambda x: x[0])  # Select the best sequence from the beam as the output

    # Convert the best sequence indices to characters
    return ''.join([reverse_target_char_index[token.item()] for token in best_sequence[1:]])


# TRAINING FUNCTION
def train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type, max_length, wandb_log):
    """
    Train the Seq2Seq model.

    Args:
    - model (nn.Module): The Seq2Seq model.
    - num_epochs (int): Number of training epochs.
    - criterion: Loss criterion for training.
    - optimizer: Optimizer for training.
    - train_batch_x: Training input data.
    - train_batch_y: Training target data.
    - val_batch_x: Validation input data.
    - val_batch_y: Validation target data.
    - df_val: DataFrame for validation data.
    - input_char_to_int (dict): Mapping from characters to integers for the input vocabulary.
    - output_char_to_int (dict): Mapping from characters to integers for the output vocabulary.
    - output_int_to_char (dict): Reverse mapping from integers to characters for the output vocabulary.
    - beam_width (int): Beam width for beam search.
    - length_penalty (float): Length penalty for beam search.
    - cell_type (str): Type of RNN cell used in the model ('LSTM', 'GRU', or 'RNN').
    - max_length (int): Maximum length of sequences.
    - wandb_log (int): Whether to log to wandb (1 or 0).
    Returns:
    - nn.Module: The trained model.
    - float: Validation accuracy.
    """
    for epoch in range(num_epochs):
        total_words = 0
        correct_pred = 0
        total_loss = 0
        accuracy = 0
        model.train()
        
        # Use tqdm for progress tracking
        train_data_iterator = tqdm(zip(train_batch_x, train_batch_y), total=len(train_batch_x))
        
        for (x, y) in train_data_iterator:
            # Get input and targets and move to device
            target, inp_data = y.to(device), x.to(device)
            
            # Forward propagation
            optimizer.zero_grad()
            output = model(inp_data, target)
            
            target = target.reshape(-1)
            output = output.reshape(-1, output.shape[2])
            
            pad_mask = (target != 0)  
            target = target[pad_mask] # Select non-padding elements
            output = output[pad_mask] 
            
            # Calculate loss
            loss = criterion(output, target)
            
            # Backpropagation
            loss.backward()
            
            # Clip gradients to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            
            # Update parameters
            optimizer.step()
            
            # Accumulate total loss
            total_loss += loss.item()
            # Update total words processed
            total_words += target.size(0)
            # Calculate number of correct predictions
            correct_pred += torch.sum(torch.argmax(output, dim=1) == target).item()
            
        # Calculate average loss per batch
        avg_loss = total_loss / len(train_batch_x)
        # Calculate accuracy
        accuracy = 100*correct_pred / total_words
        
        
        # Validation
        model.eval()
        with torch.no_grad():
            val_total_loss = 0
            val_total_words = 0
            val_correct_pred = 0

            val_data_iterator = tqdm(zip(val_batch_x, val_batch_y), total=len(val_batch_x))
            for x_val, y_val in val_data_iterator:
                target_val, inp_data_val = y_val.to(device), x_val.to(device)
                output_val = model(inp_data_val, target_val)
                
                
                target_val = target_val.reshape(-1)
                output_val = output_val.reshape(-1, output_val.shape[2])
                
                pad_mask = (target_val != 0)  
                target_val = target_val[pad_mask] # Select non-padding elements
                output_val = output_val[pad_mask] 
            
                val_loss = criterion(output_val, target_val)
                val_total_loss += val_loss.item()
                val_total_words += target_val.size(0)
                val_correct_pred += torch.sum(torch.argmax(output_val, dim=1) == target_val).item()

            # Calculate validation statistics
            val_accuracy = 100*val_correct_pred / val_total_words
            val_avg_loss = val_total_loss / len(val_batch_x)

            
            
        # Total word predict correct over training
        beam_val_pred = 0
        beam_val = 0
        for i in tqdm(range(df_val.shape[0])):
            input_seq = df_val.iloc[i, 0][:-1] 
            true_seq = df_val.iloc[i, 1][1:-1]
            predicted_output = beam_search(model, input_seq, max_length, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type)
            if true_seq == predicted_output[:-1]:
                beam_val_pred+=1
        beam_val = 100*beam_val_pred/df_val.shape[0]



        # Print statistics
        print(f"Epoch {epoch + 1} / {num_epochs} ===========================>")
        print(f"Train Accuracy Char: {accuracy:.4f}, Train Average Loss: {avg_loss:.4f}")
        print(f"Validation Accuracy Char: {val_accuracy:.4f}, Validation Average Loss: {val_avg_loss:.4f}")
        print(f"Beam Val Word Accuracy: {beam_val:.4f} Correct Prediction : {beam_val_pred}/{df_val.shape[0]}")    
        
        if wandb_log == 1:
            wandb.log({
                "train_accuracy_char": accuracy,
                "train_loss": avg_loss,
                "val_accuracy_char": val_accuracy,
                "val_loss": val_avg_loss,
                "beam_val_accuracy_word" : beam_val,
            })
        
    
    return model, beam_val


## SWEEP CONFIGURATION

In [22]:
def main():
    wandb.init(project='DA6401_A3')
    config = wandb.config
    wandb.run.name = 'cell_' + config.cell_type + '_bs_' + str(config.batch_size) + '_ep_' + str(config.num_epochs) + '_op_' + str(config.optimizer) + '_drop_' + str(config.dropout) + '_bsw_' + str(config.beam_search_width) +'_emb_' + str(config.embedding_size) + '_hs_' + str(config.hidden_size) + '_elayer_' + str(config.num_layers) + '_dlayer_' + str(config.num_layers)
    
    # Load Dataset
    # df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/hinid-dataset/aksharantar_sampled/hin/hin_train.csv')
    # df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/hinid-dataset/aksharantar_sampled/hin/hin_valid.csv')
    # df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/hinid-dataset/aksharantar_sampled/hin/hin_test.csv')

    df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.train.tsv')
    df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.dev.tsv')
    df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.test.tsv')
    
    input_max_len = max(train_input_len, val_input_len, test_input_len)
    output_max_len = max(train_out_len, val_out_len, test_out_len)
    
    max_length = max(input_max_len, output_max_len)

    # Create Look Up Table
    input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
    output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

    # Data Embedding and Converting them into Tensor
    train_inputs, train_outputs = get_tensor_object(df_train, max_length, max_length, input_char_to_int, output_char_to_int)
    val_inputs, val_outputs = get_tensor_object(df_val, max_length, max_length, input_char_to_int, output_char_to_int)
    test_inputs, test_outputs = get_tensor_object(df_test, max_length, max_length, input_char_to_int, output_char_to_int)

    # Transpose column wise
    train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
    val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
    test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)


    # Initialize Hyperparameters
    input_size = len(input_char_to_int)
    output_size = len(output_char_to_int)
    embedding_size = config.embedding_size
    hidden_size = config.hidden_size
    enc_num_layers = config.num_layers
    dec_num_layers = config.num_layers
    cell_type = config.cell_type
    dropout = config.dropout
    learning_rate = config.learning_rate
    batch_size = config.batch_size
    num_epochs = config.num_epochs  
    optimizer = config.optimizer  
    beam_width = config.beam_search_width
    bidirectional = config.bidirectional
    length_penalty = config.length_penalty
    teacher_forcing = config.teacher_forcing
    learning_rate = config.learning_rate

    # Create train data batch
    train_batch_x, train_batch_y = torch.split(train_inputs, batch_size, dim=1), torch.split(train_outputs, batch_size, dim=1)
    # Validation data batch
    val_batch_x, val_batch_y = torch.split(val_inputs, batch_size, dim=1), torch.split(val_outputs, batch_size, dim=1)


    # Intialize encoder, decoder and seq2seq model
    encoder = Encoder(input_size, embedding_size, hidden_size, enc_num_layers, dropout, bidirectional, cell_type).to(device)
    decoder = Decoder(output_size, embedding_size, hidden_size, output_size, dec_num_layers, dropout, bidirectional, cell_type).to(device)  
    model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, cell_type).to(device)

    # Print total number of parameters in the model
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(model)
    print(f'Total Trainable Parameters: {total_params}')


    # Loss function and Optimizer
    criterion = nn.CrossEntropyLoss()
    if optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    elif optimizer == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
    elif optimizer == 'nadam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)
    else:
        print("Incorrect Optmizer !!!!")

    # TRAINING
    model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type, max_length, 1)
    wandb.log({
            "accuracy": acc,
        })
    
# SWEEP CONFIG
# sweep_config = {
#     'name': 'sweep_2',
#     'method': 'bayes',  
#     'metric': {'name': 'accuracy', 'goal': 'maximize'},
#     'parameters': {
#         'embedding_size': {'values': [64, 256, 512]},  
#         'hidden_size': {'values': [256, 512, 1024]},
#         'num_layers': {'values': [1, 2]},  
#         'cell_type': {'values':['LSTM', "GRU", "RNN"]}, # RNN, LSTM, GRU
#         'dropout': {'values': [0.2, 0.3]},
#         'learning_rate': {'values': [0.01, 0.001]},
#         'batch_size': {'values': [ 64,128,256]},
#         'num_epochs': {'values': [5,10]},
#         'optimizer': {'values': ['adagrad']}, # ['sgd', 'rmsprop', 'adam', 'nadam']
#         'beam_search_width': {'values': [1, 3, 5]},
#         'length_penalty' : {'values': [0.6]},
#         'bidirectional': {'values': [True]},
#         'teacher_forcing': {'values': [0.5, 0.7]}
#     }
# }
sweep_config = {
    'name': 'sweep_2',
    'method': 'bayes',  
    'metric': {'name': 'accuracy', 'goal': 'maximize'},
    'parameters': {
        'embedding_size': {'values': [512]},  
        'hidden_size': {'values': [512]},
        'num_layers': {'values': [1,2]},  
        'cell_type': {'values':["GRU", "LSTM"]}, # RNN, LSTM, GRU
        'dropout': {'values': [ 0.3,0.5]},
        'learning_rate': {'values': [0.01]},
        'batch_size': {'values': [ 32,64]},
        'num_epochs': {'values': [10]},
        'optimizer': {'values': ['adagrad']}, # ['sgd', 'rmsprop', 'adam', 'nadam']
        'beam_search_width': {'values': [1]},
        'length_penalty' : {'values': [0.6]},
        'bidirectional': {'values': [True]},
        'teacher_forcing': {'values': [0.7]}
    }
}


# RUN SWEEP ID with agent
sweep_id = wandb.sweep(sweep_config, project = 'DA6401_A3')
wandb.agent(sweep_id, main, count = 7)
wandb.finish()

Create sweep with ID: b84a7ohc
Sweep URL: https://wandb.ai/cs24m030-indian-institute-of-technology-madras/DA6401_A3/sweeps/b84a7ohc


[34m[1mwandb[0m: Agent Starting Run: 9qn65dmr with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 512
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	length_penalty: 0.6
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	teacher_forcing: 0.7




Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(29, 512)
    (rnn): LSTM(512, 512, dropout=0.3, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=29, bias=True)
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(66, 512)
    (rnn): LSTM(512, 512, dropout=0.3, bidirectional=True)
  )
)
Total Trainable Parameters: 8483357


100%|██████████| 691/691 [00:30<00:00, 22.46it/s]
100%|██████████| 69/69 [00:00<00:00, 89.92it/s]
100%|██████████| 4358/4358 [00:31<00:00, 139.65it/s]


Train Accuracy Char: 53.3416, Train Average Loss: 1.5693
Validation Accuracy Char: 56.9953, Validation Average Loss: 1.4006
Beam Val Word Accuracy: 10.9224 Correct Prediction : 476/4358


100%|██████████| 691/691 [00:30<00:00, 22.43it/s]
100%|██████████| 69/69 [00:00<00:00, 87.11it/s]
100%|██████████| 4358/4358 [00:32<00:00, 135.56it/s]


Train Accuracy Char: 67.2685, Train Average Loss: 1.0879
Validation Accuracy Char: 67.4658, Validation Average Loss: 1.0784
Beam Val Word Accuracy: 23.4052 Correct Prediction : 1020/4358


100%|██████████| 691/691 [00:30<00:00, 22.50it/s]
100%|██████████| 69/69 [00:00<00:00, 86.36it/s]
100%|██████████| 4358/4358 [00:32<00:00, 132.99it/s]


Train Accuracy Char: 71.3614, Train Average Loss: 0.9563
Validation Accuracy Char: 70.7289, Validation Average Loss: 0.9732
Beam Val Word Accuracy: 29.0959 Correct Prediction : 1268/4358


100%|██████████| 691/691 [00:30<00:00, 22.42it/s]
100%|██████████| 69/69 [00:00<00:00, 88.92it/s]
100%|██████████| 4358/4358 [00:32<00:00, 133.03it/s]


Train Accuracy Char: 73.3290, Train Average Loss: 0.8896
Validation Accuracy Char: 72.2835, Validation Average Loss: 0.9242
Beam Val Word Accuracy: 32.5838 Correct Prediction : 1420/4358


100%|██████████| 691/691 [00:30<00:00, 22.45it/s]
100%|██████████| 69/69 [00:00<00:00, 83.90it/s]
100%|██████████| 4358/4358 [00:32<00:00, 132.32it/s]


Train Accuracy Char: 74.6330, Train Average Loss: 0.8447
Validation Accuracy Char: 73.4423, Validation Average Loss: 0.8874
Beam Val Word Accuracy: 35.6586 Correct Prediction : 1554/4358


100%|██████████| 691/691 [00:30<00:00, 22.39it/s]
100%|██████████| 69/69 [00:00<00:00, 88.74it/s]
100%|██████████| 4358/4358 [00:33<00:00, 131.86it/s]


Train Accuracy Char: 75.5525, Train Average Loss: 0.8131
Validation Accuracy Char: 74.6987, Validation Average Loss: 0.8417
Beam Val Word Accuracy: 36.9206 Correct Prediction : 1609/4358


100%|██████████| 691/691 [00:30<00:00, 22.49it/s]
100%|██████████| 69/69 [00:00<00:00, 86.78it/s]
100%|██████████| 4358/4358 [00:32<00:00, 132.11it/s]


Train Accuracy Char: 76.2637, Train Average Loss: 0.7889
Validation Accuracy Char: 74.9891, Validation Average Loss: 0.8289
Beam Val Word Accuracy: 37.7467 Correct Prediction : 1645/4358


100%|██████████| 691/691 [00:30<00:00, 22.49it/s]
100%|██████████| 69/69 [00:00<00:00, 88.56it/s]
100%|██████████| 4358/4358 [00:33<00:00, 131.93it/s]


Train Accuracy Char: 76.9721, Train Average Loss: 0.7627
Validation Accuracy Char: 75.5132, Validation Average Loss: 0.8118
Beam Val Word Accuracy: 38.4810 Correct Prediction : 1677/4358


100%|██████████| 691/691 [00:30<00:00, 22.43it/s]
100%|██████████| 69/69 [00:00<00:00, 88.97it/s]
100%|██████████| 4358/4358 [00:33<00:00, 131.55it/s]


Train Accuracy Char: 77.3789, Train Average Loss: 0.7467
Validation Accuracy Char: 75.1176, Validation Average Loss: 0.8260
Beam Val Word Accuracy: 39.4218 Correct Prediction : 1718/4358


100%|██████████| 691/691 [00:30<00:00, 22.47it/s]
100%|██████████| 69/69 [00:00<00:00, 88.13it/s]
100%|██████████| 4358/4358 [00:33<00:00, 131.29it/s]

Train Accuracy Char: 77.5636, Train Average Loss: 0.7408
Validation Accuracy Char: 75.5441, Validation Average Loss: 0.8097
Beam Val Word Accuracy: 39.8348 Correct Prediction : 1736/4358





0,1
accuracy,▁
beam_val_accuracy_word,▁▄▅▆▇▇▇███
train_accuracy_char,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy_char,▁▅▆▇▇█████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
accuracy,39.83479
beam_val_accuracy_word,39.83479
train_accuracy_char,77.56356
train_loss,0.74083
val_accuracy_char,75.54408
val_loss,0.80967


[34m[1mwandb[0m: Agent Starting Run: ngyna1om with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_size: 512
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	length_penalty: 0.6
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(29, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=29, bias=True)
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(66, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.5, bidirectional=True)
  )
)
Total Trainable Parameters: 15831581


100%|██████████| 691/691 [00:45<00:00, 15.04it/s]
100%|██████████| 69/69 [00:01<00:00, 51.51it/s]
100%|██████████| 4358/4358 [00:35<00:00, 122.78it/s]


Train Accuracy Char: 48.1034, Train Average Loss: 1.7790
Validation Accuracy Char: 51.2886, Validation Average Loss: 1.6422
Beam Val Word Accuracy: 4.9793 Correct Prediction : 217/4358


100%|██████████| 691/691 [00:45<00:00, 15.08it/s]
100%|██████████| 69/69 [00:01<00:00, 52.16it/s]
100%|██████████| 4358/4358 [00:37<00:00, 117.46it/s]


Train Accuracy Char: 65.7938, Train Average Loss: 1.1209
Validation Accuracy Char: 63.3521, Validation Average Loss: 1.2033
Beam Val Word Accuracy: 15.3970 Correct Prediction : 671/4358


100%|██████████| 691/691 [00:45<00:00, 15.07it/s]
100%|██████████| 69/69 [00:01<00:00, 51.09it/s]
100%|██████████| 4358/4358 [00:38<00:00, 114.46it/s]


Train Accuracy Char: 71.3523, Train Average Loss: 0.9488
Validation Accuracy Char: 69.7988, Validation Average Loss: 1.0062
Beam Val Word Accuracy: 26.7095 Correct Prediction : 1164/4358


100%|██████████| 691/691 [00:45<00:00, 15.08it/s]
100%|██████████| 69/69 [00:01<00:00, 52.41it/s]
100%|██████████| 4358/4358 [00:38<00:00, 114.62it/s]


Train Accuracy Char: 73.8887, Train Average Loss: 0.8678
Validation Accuracy Char: 72.4839, Validation Average Loss: 0.9204
Beam Val Word Accuracy: 32.6985 Correct Prediction : 1425/4358


100%|██████████| 691/691 [00:45<00:00, 15.10it/s]
100%|██████████| 69/69 [00:01<00:00, 52.15it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.86it/s]


Train Accuracy Char: 74.7742, Train Average Loss: 0.8354
Validation Accuracy Char: 73.8482, Validation Average Loss: 0.8742
Beam Val Word Accuracy: 36.1175 Correct Prediction : 1574/4358


100%|██████████| 691/691 [00:46<00:00, 15.00it/s]
100%|██████████| 69/69 [00:01<00:00, 52.22it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.15it/s]


Train Accuracy Char: 75.8355, Train Average Loss: 0.7987
Validation Accuracy Char: 74.7347, Validation Average Loss: 0.8401
Beam Val Word Accuracy: 37.8614 Correct Prediction : 1650/4358


100%|██████████| 691/691 [00:46<00:00, 14.91it/s]
100%|██████████| 69/69 [00:01<00:00, 51.14it/s]
100%|██████████| 4358/4358 [00:39<00:00, 111.70it/s]


Train Accuracy Char: 76.5681, Train Average Loss: 0.7730
Validation Accuracy Char: 75.1587, Validation Average Loss: 0.8259
Beam Val Word Accuracy: 39.7201 Correct Prediction : 1731/4358


100%|██████████| 691/691 [00:46<00:00, 14.99it/s]
100%|██████████| 69/69 [00:01<00:00, 52.19it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.60it/s]


Train Accuracy Char: 77.3191, Train Average Loss: 0.7480
Validation Accuracy Char: 75.5646, Validation Average Loss: 0.8216
Beam Val Word Accuracy: 40.8215 Correct Prediction : 1779/4358


100%|██████████| 691/691 [00:45<00:00, 15.04it/s]
100%|██████████| 69/69 [00:01<00:00, 51.71it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.39it/s]


Train Accuracy Char: 77.6811, Train Average Loss: 0.7319
Validation Accuracy Char: 76.5590, Validation Average Loss: 0.7828
Beam Val Word Accuracy: 41.0968 Correct Prediction : 1791/4358


100%|██████████| 691/691 [00:45<00:00, 15.04it/s]
100%|██████████| 69/69 [00:01<00:00, 52.65it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.61it/s]

Train Accuracy Char: 77.8960, Train Average Loss: 0.7231
Validation Accuracy Char: 76.2250, Validation Average Loss: 0.7917
Beam Val Word Accuracy: 41.3034 Correct Prediction : 1800/4358





0,1
accuracy,▁
beam_val_accuracy_word,▁▃▅▆▇▇████
train_accuracy_char,▁▅▆▇▇█████
train_loss,█▄▂▂▂▂▁▁▁▁
val_accuracy_char,▁▄▆▇▇▇████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
accuracy,41.30335
beam_val_accuracy_word,41.30335
train_accuracy_char,77.89603
train_loss,0.72307
val_accuracy_char,76.22498
val_loss,0.79167


[34m[1mwandb[0m: Agent Starting Run: r2x53o28 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 512
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	length_penalty: 0.6
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(29, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.3, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=29, bias=True)
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(66, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.3, bidirectional=True)
  )
)
Total Trainable Parameters: 15831581


100%|██████████| 691/691 [00:45<00:00, 15.05it/s]
100%|██████████| 69/69 [00:01<00:00, 52.46it/s]
100%|██████████| 4358/4358 [00:36<00:00, 119.86it/s]


Train Accuracy Char: 49.6888, Train Average Loss: 1.7096
Validation Accuracy Char: 52.3189, Validation Average Loss: 1.6081
Beam Val Word Accuracy: 5.4153 Correct Prediction : 236/4358


100%|██████████| 691/691 [00:46<00:00, 14.98it/s]
100%|██████████| 69/69 [00:01<00:00, 53.21it/s]
100%|██████████| 4358/4358 [00:37<00:00, 117.66it/s]


Train Accuracy Char: 66.7818, Train Average Loss: 1.0855
Validation Accuracy Char: 63.2005, Validation Average Loss: 1.2007
Beam Val Word Accuracy: 15.7412 Correct Prediction : 686/4358


100%|██████████| 691/691 [00:46<00:00, 14.93it/s]
100%|██████████| 69/69 [00:01<00:00, 51.11it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.36it/s]


Train Accuracy Char: 72.5662, Train Average Loss: 0.9086
Validation Accuracy Char: 69.1462, Validation Average Loss: 1.0270
Beam Val Word Accuracy: 25.7228 Correct Prediction : 1121/4358


100%|██████████| 691/691 [00:46<00:00, 14.88it/s]
100%|██████████| 69/69 [00:01<00:00, 51.56it/s]
100%|██████████| 4358/4358 [00:39<00:00, 111.40it/s]


Train Accuracy Char: 75.0774, Train Average Loss: 0.8262
Validation Accuracy Char: 72.3888, Validation Average Loss: 0.9163
Beam Val Word Accuracy: 31.7577 Correct Prediction : 1384/4358


100%|██████████| 691/691 [00:46<00:00, 14.92it/s]
100%|██████████| 69/69 [00:01<00:00, 52.16it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.23it/s]


Train Accuracy Char: 76.6498, Train Average Loss: 0.7733
Validation Accuracy Char: 73.9690, Validation Average Loss: 0.8629
Beam Val Word Accuracy: 36.0028 Correct Prediction : 1569/4358


100%|██████████| 691/691 [00:46<00:00, 14.95it/s]
100%|██████████| 69/69 [00:01<00:00, 51.81it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.16it/s]


Train Accuracy Char: 77.5793, Train Average Loss: 0.7391
Validation Accuracy Char: 75.0276, Validation Average Loss: 0.8363
Beam Val Word Accuracy: 38.8251 Correct Prediction : 1692/4358


100%|██████████| 691/691 [00:46<00:00, 14.92it/s]
100%|██████████| 69/69 [00:01<00:00, 51.88it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.31it/s]


Train Accuracy Char: 78.3711, Train Average Loss: 0.7121
Validation Accuracy Char: 75.0585, Validation Average Loss: 0.8395
Beam Val Word Accuracy: 39.8577 Correct Prediction : 1737/4358


100%|██████████| 691/691 [00:46<00:00, 14.92it/s]
100%|██████████| 69/69 [00:01<00:00, 51.48it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.18it/s]


Train Accuracy Char: 78.8231, Train Average Loss: 0.6922
Validation Accuracy Char: 76.0323, Validation Average Loss: 0.8080
Beam Val Word Accuracy: 40.9592 Correct Prediction : 1785/4358


100%|██████████| 691/691 [00:46<00:00, 14.94it/s]
100%|██████████| 69/69 [00:01<00:00, 52.25it/s]
100%|██████████| 4358/4358 [00:39<00:00, 111.40it/s]


Train Accuracy Char: 79.3329, Train Average Loss: 0.6738
Validation Accuracy Char: 76.2841, Validation Average Loss: 0.7958
Beam Val Word Accuracy: 41.2345 Correct Prediction : 1797/4358


100%|██████████| 691/691 [00:46<00:00, 14.93it/s]
100%|██████████| 69/69 [00:01<00:00, 50.69it/s]
100%|██████████| 4358/4358 [00:39<00:00, 111.58it/s]

Train Accuracy Char: 79.5896, Train Average Loss: 0.6621
Validation Accuracy Char: 76.1916, Validation Average Loss: 0.8066
Beam Val Word Accuracy: 42.0376 Correct Prediction : 1832/4358





0,1
accuracy,▁
beam_val_accuracy_word,▁▃▅▆▇▇████
train_accuracy_char,▁▅▆▇▇█████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy_char,▁▄▆▇▇█████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
accuracy,42.03763
beam_val_accuracy_word,42.03763
train_accuracy_char,79.58962
train_loss,0.6621
val_accuracy_char,76.19158
val_loss,0.80656


[34m[1mwandb[0m: Agent Starting Run: qijqd0mu with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_size: 512
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	length_penalty: 0.6
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(29, 512)
    (rnn): LSTM(512, 512, num_layers=2, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=29, bias=True)
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(66, 512)
    (rnn): LSTM(512, 512, num_layers=2, dropout=0.5, bidirectional=True)
  )
)
Total Trainable Parameters: 21082653


100%|██████████| 691/691 [00:53<00:00, 12.97it/s]
100%|██████████| 69/69 [00:01<00:00, 47.07it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.59it/s]


Train Accuracy Char: 46.1420, Train Average Loss: 1.8464
Validation Accuracy Char: 45.2196, Validation Average Loss: 1.8908
Beam Val Word Accuracy: 1.6292 Correct Prediction : 71/4358


100%|██████████| 691/691 [00:53<00:00, 12.99it/s]
100%|██████████| 69/69 [00:01<00:00, 47.21it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.10it/s]


Train Accuracy Char: 63.8865, Train Average Loss: 1.1824
Validation Accuracy Char: 61.3351, Validation Average Loss: 1.2839
Beam Val Word Accuracy: 14.7545 Correct Prediction : 643/4358


100%|██████████| 691/691 [00:53<00:00, 13.02it/s]
100%|██████████| 69/69 [00:01<00:00, 46.20it/s]
100%|██████████| 4358/4358 [00:39<00:00, 111.56it/s]


Train Accuracy Char: 69.8720, Train Average Loss: 0.9932
Validation Accuracy Char: 68.7505, Validation Average Loss: 1.0371
Beam Val Word Accuracy: 26.0441 Correct Prediction : 1135/4358


100%|██████████| 691/691 [00:52<00:00, 13.14it/s]
100%|██████████| 69/69 [00:01<00:00, 47.12it/s]
100%|██████████| 4358/4358 [00:39<00:00, 111.47it/s]


Train Accuracy Char: 72.7312, Train Average Loss: 0.9028
Validation Accuracy Char: 71.3842, Validation Average Loss: 0.9494
Beam Val Word Accuracy: 31.2529 Correct Prediction : 1362/4358


100%|██████████| 691/691 [00:52<00:00, 13.16it/s]
100%|██████████| 69/69 [00:01<00:00, 45.64it/s]
100%|██████████| 4358/4358 [00:40<00:00, 108.76it/s]


Train Accuracy Char: 74.1299, Train Average Loss: 0.8539
Validation Accuracy Char: 73.3832, Validation Average Loss: 0.8835
Beam Val Word Accuracy: 35.0849 Correct Prediction : 1529/4358


100%|██████████| 691/691 [00:52<00:00, 13.09it/s]
100%|██████████| 69/69 [00:01<00:00, 47.01it/s]
100%|██████████| 4358/4358 [00:39<00:00, 109.12it/s]


Train Accuracy Char: 75.5048, Train Average Loss: 0.8093
Validation Accuracy Char: 74.9146, Validation Average Loss: 0.8374
Beam Val Word Accuracy: 37.3336 Correct Prediction : 1627/4358


100%|██████████| 691/691 [00:52<00:00, 13.08it/s]
100%|██████████| 69/69 [00:01<00:00, 46.64it/s]
100%|██████████| 4358/4358 [00:39<00:00, 108.95it/s]


Train Accuracy Char: 76.1535, Train Average Loss: 0.7838
Validation Accuracy Char: 75.5929, Validation Average Loss: 0.8075
Beam Val Word Accuracy: 38.2285 Correct Prediction : 1666/4358


100%|██████████| 691/691 [00:52<00:00, 13.12it/s]
100%|██████████| 69/69 [00:01<00:00, 46.19it/s]
100%|██████████| 4358/4358 [00:40<00:00, 108.89it/s]


Train Accuracy Char: 76.6854, Train Average Loss: 0.7660
Validation Accuracy Char: 75.1458, Validation Average Loss: 0.8194
Beam Val Word Accuracy: 38.5498 Correct Prediction : 1680/4358


100%|██████████| 691/691 [00:52<00:00, 13.15it/s]
100%|██████████| 69/69 [00:01<00:00, 46.94it/s]
100%|██████████| 4358/4358 [00:39<00:00, 109.77it/s]


Train Accuracy Char: 77.1428, Train Average Loss: 0.7482
Validation Accuracy Char: 75.4721, Validation Average Loss: 0.8128
Beam Val Word Accuracy: 39.3759 Correct Prediction : 1716/4358


100%|██████████| 691/691 [00:52<00:00, 13.15it/s]
100%|██████████| 69/69 [00:01<00:00, 47.03it/s]
100%|██████████| 4358/4358 [00:39<00:00, 109.88it/s]

Train Accuracy Char: 77.6194, Train Average Loss: 0.7343
Validation Accuracy Char: 75.7316, Validation Average Loss: 0.7965
Beam Val Word Accuracy: 39.6512 Correct Prediction : 1728/4358





0,1
accuracy,▁
beam_val_accuracy_word,▁▃▅▆▇█████
train_accuracy_char,▁▅▆▇▇█████
train_loss,█▄▃▂▂▁▁▁▁▁
val_accuracy_char,▁▅▆▇▇█████
val_loss,█▄▃▂▂▁▁▁▁▁

0,1
accuracy,39.65122
beam_val_accuracy_word,39.65122
train_accuracy_char,77.61938
train_loss,0.73425
val_accuracy_char,75.73165
val_loss,0.79653


[34m[1mwandb[0m: Agent Starting Run: u9kq6ooe with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 512
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	length_penalty: 0.6
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(29, 512)
    (rnn): LSTM(512, 512, num_layers=2, dropout=0.3, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=29, bias=True)
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(66, 512)
    (rnn): LSTM(512, 512, num_layers=2, dropout=0.3, bidirectional=True)
  )
)
Total Trainable Parameters: 21082653


100%|██████████| 691/691 [00:52<00:00, 13.10it/s]
100%|██████████| 69/69 [00:01<00:00, 47.88it/s]
100%|██████████| 4358/4358 [00:37<00:00, 117.07it/s]


Train Accuracy Char: 44.7516, Train Average Loss: 1.9027
Validation Accuracy Char: 45.2941, Validation Average Loss: 1.8292
Beam Val Word Accuracy: 2.2717 Correct Prediction : 99/4358


100%|██████████| 691/691 [00:52<00:00, 13.12it/s]
100%|██████████| 69/69 [00:01<00:00, 46.17it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.46it/s]


Train Accuracy Char: 64.3913, Train Average Loss: 1.1673
Validation Accuracy Char: 61.3556, Validation Average Loss: 1.2594
Beam Val Word Accuracy: 14.9610 Correct Prediction : 652/4358


100%|██████████| 691/691 [00:52<00:00, 13.12it/s]
100%|██████████| 69/69 [00:01<00:00, 46.36it/s]
100%|██████████| 4358/4358 [00:39<00:00, 111.49it/s]


Train Accuracy Char: 71.0614, Train Average Loss: 0.9570
Validation Accuracy Char: 69.7731, Validation Average Loss: 0.9963
Beam Val Word Accuracy: 26.6636 Correct Prediction : 1162/4358


100%|██████████| 691/691 [00:52<00:00, 13.14it/s]
100%|██████████| 69/69 [00:01<00:00, 46.03it/s]
100%|██████████| 4358/4358 [00:39<00:00, 110.62it/s]


Train Accuracy Char: 73.9199, Train Average Loss: 0.8640
Validation Accuracy Char: 72.3271, Validation Average Loss: 0.9169
Beam Val Word Accuracy: 32.1478 Correct Prediction : 1401/4358


100%|██████████| 691/691 [00:52<00:00, 13.12it/s]
100%|██████████| 69/69 [00:01<00:00, 46.34it/s]
100%|██████████| 4358/4358 [00:39<00:00, 110.51it/s]


Train Accuracy Char: 75.4642, Train Average Loss: 0.8094
Validation Accuracy Char: 73.8919, Validation Average Loss: 0.8702
Beam Val Word Accuracy: 35.8192 Correct Prediction : 1561/4358


100%|██████████| 691/691 [00:52<00:00, 13.11it/s]
100%|██████████| 69/69 [00:01<00:00, 46.36it/s]
100%|██████████| 4358/4358 [00:40<00:00, 108.66it/s]


Train Accuracy Char: 76.5590, Train Average Loss: 0.7736
Validation Accuracy Char: 75.2178, Validation Average Loss: 0.8263
Beam Val Word Accuracy: 37.6319 Correct Prediction : 1640/4358


100%|██████████| 691/691 [00:52<00:00, 13.11it/s]
100%|██████████| 69/69 [00:01<00:00, 46.03it/s]
100%|██████████| 4358/4358 [00:40<00:00, 108.74it/s]


Train Accuracy Char: 76.9803, Train Average Loss: 0.7578
Validation Accuracy Char: 75.4259, Validation Average Loss: 0.8182
Beam Val Word Accuracy: 39.4676 Correct Prediction : 1720/4358


100%|██████████| 691/691 [00:52<00:00, 13.12it/s]
100%|██████████| 69/69 [00:01<00:00, 47.24it/s]
100%|██████████| 4358/4358 [00:39<00:00, 110.16it/s]


Train Accuracy Char: 77.9032, Train Average Loss: 0.7241
Validation Accuracy Char: 75.8113, Validation Average Loss: 0.8014
Beam Val Word Accuracy: 39.8348 Correct Prediction : 1736/4358


100%|██████████| 691/691 [00:52<00:00, 13.04it/s]
100%|██████████| 69/69 [00:01<00:00, 46.25it/s]
100%|██████████| 4358/4358 [00:39<00:00, 109.57it/s]


Train Accuracy Char: 78.3384, Train Average Loss: 0.7080
Validation Accuracy Char: 76.1864, Validation Average Loss: 0.7950
Beam Val Word Accuracy: 40.5920 Correct Prediction : 1769/4358


100%|██████████| 691/691 [00:52<00:00, 13.08it/s]
100%|██████████| 69/69 [00:01<00:00, 47.87it/s]
100%|██████████| 4358/4358 [00:39<00:00, 109.74it/s]

Train Accuracy Char: 78.8762, Train Average Loss: 0.6880
Validation Accuracy Char: 76.1864, Validation Average Loss: 0.8018
Beam Val Word Accuracy: 41.3263 Correct Prediction : 1801/4358





0,1
accuracy,▁
beam_val_accuracy_word,▁▃▅▆▇▇████
train_accuracy_char,▁▅▆▇▇█████
train_loss,█▄▃▂▂▁▁▁▁▁
val_accuracy_char,▁▅▇▇▇█████
val_loss,█▄▂▂▂▁▁▁▁▁

0,1
accuracy,41.3263
beam_val_accuracy_word,41.3263
train_accuracy_char,78.87624
train_loss,0.68801
val_accuracy_char,76.18644
val_loss,0.80185


[34m[1mwandb[0m: Agent Starting Run: 7zasksf6 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_size: 512
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	length_penalty: 0.6
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(29, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=29, bias=True)
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(66, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.5, bidirectional=True)
  )
)
Total Trainable Parameters: 15831581


100%|██████████| 691/691 [00:46<00:00, 15.02it/s]
100%|██████████| 69/69 [00:01<00:00, 52.08it/s]
100%|██████████| 4358/4358 [00:35<00:00, 121.78it/s]


Train Accuracy Char: 50.7071, Train Average Loss: 1.6686
Validation Accuracy Char: 50.0475, Validation Average Loss: 1.6877
Beam Val Word Accuracy: 4.3369 Correct Prediction : 189/4358


100%|██████████| 691/691 [00:46<00:00, 15.02it/s]
100%|██████████| 69/69 [00:01<00:00, 51.81it/s]
100%|██████████| 4358/4358 [00:37<00:00, 117.69it/s]


Train Accuracy Char: 65.9465, Train Average Loss: 1.1126
Validation Accuracy Char: 61.6177, Validation Average Loss: 1.2988
Beam Val Word Accuracy: 13.8825 Correct Prediction : 605/4358


100%|██████████| 691/691 [00:46<00:00, 15.02it/s]
100%|██████████| 69/69 [00:01<00:00, 51.82it/s]
100%|██████████| 4358/4358 [00:37<00:00, 115.47it/s]


Train Accuracy Char: 71.0359, Train Average Loss: 0.9556
Validation Accuracy Char: 67.7304, Validation Average Loss: 1.0802
Beam Val Word Accuracy: 22.0514 Correct Prediction : 961/4358


100%|██████████| 691/691 [00:45<00:00, 15.16it/s]
100%|██████████| 69/69 [00:01<00:00, 52.31it/s]
100%|██████████| 4358/4358 [00:38<00:00, 114.67it/s]


Train Accuracy Char: 73.7827, Train Average Loss: 0.8696
Validation Accuracy Char: 71.4484, Validation Average Loss: 0.9605
Beam Val Word Accuracy: 29.8073 Correct Prediction : 1299/4358


100%|██████████| 691/691 [00:45<00:00, 15.11it/s]
100%|██████████| 69/69 [00:01<00:00, 52.02it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.83it/s]


Train Accuracy Char: 75.1817, Train Average Loss: 0.8220
Validation Accuracy Char: 73.0697, Validation Average Loss: 0.9063
Beam Val Word Accuracy: 33.2951 Correct Prediction : 1451/4358


100%|██████████| 691/691 [00:46<00:00, 14.99it/s]
100%|██████████| 69/69 [00:01<00:00, 52.66it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.42it/s]


Train Accuracy Char: 75.8734, Train Average Loss: 0.7975
Validation Accuracy Char: 73.6761, Validation Average Loss: 0.8818
Beam Val Word Accuracy: 36.4846 Correct Prediction : 1590/4358


100%|██████████| 691/691 [00:46<00:00, 14.93it/s]
100%|██████████| 69/69 [00:01<00:00, 52.10it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.04it/s]


Train Accuracy Char: 76.5684, Train Average Loss: 0.7735
Validation Accuracy Char: 74.8889, Validation Average Loss: 0.8397
Beam Val Word Accuracy: 38.2515 Correct Prediction : 1667/4358


100%|██████████| 691/691 [00:46<00:00, 15.00it/s]
100%|██████████| 69/69 [00:01<00:00, 51.68it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.36it/s]


Train Accuracy Char: 77.0491, Train Average Loss: 0.7551
Validation Accuracy Char: 75.3154, Validation Average Loss: 0.8364
Beam Val Word Accuracy: 39.5365 Correct Prediction : 1723/4358


100%|██████████| 691/691 [00:45<00:00, 15.15it/s]
100%|██████████| 69/69 [00:01<00:00, 52.14it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.18it/s]


Train Accuracy Char: 77.4996, Train Average Loss: 0.7386
Validation Accuracy Char: 74.9531, Validation Average Loss: 0.8516
Beam Val Word Accuracy: 40.7067 Correct Prediction : 1774/4358


100%|██████████| 691/691 [00:45<00:00, 15.14it/s]
100%|██████████| 69/69 [00:01<00:00, 52.69it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.95it/s]

Train Accuracy Char: 77.8589, Train Average Loss: 0.7261
Validation Accuracy Char: 76.1582, Validation Average Loss: 0.8005
Beam Val Word Accuracy: 41.0509 Correct Prediction : 1789/4358





0,1
accuracy,▁
beam_val_accuracy_word,▁▃▄▆▇▇▇███
train_accuracy_char,▁▅▆▇▇▇████
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy_char,▁▄▆▇▇▇████
val_loss,█▅▃▂▂▂▁▁▁▁

0,1
accuracy,41.05094
beam_val_accuracy_word,41.05094
train_accuracy_char,77.8589
train_loss,0.72612
val_accuracy_char,76.15817
val_loss,0.80055


[34m[1mwandb[0m: Agent Starting Run: 02076rho with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 512
[34m[1mwandb[0m: 	hidden_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	length_penalty: 0.6
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(29, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.3, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=29, bias=True)
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(66, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.3, bidirectional=True)
  )
)
Total Trainable Parameters: 15831581


100%|██████████| 691/691 [00:45<00:00, 15.15it/s]
100%|██████████| 69/69 [00:01<00:00, 51.62it/s]
100%|██████████| 4358/4358 [00:35<00:00, 121.24it/s]


Train Accuracy Char: 49.0389, Train Average Loss: 1.7459
Validation Accuracy Char: 50.2865, Validation Average Loss: 1.6804
Beam Val Word Accuracy: 4.5893 Correct Prediction : 200/4358


100%|██████████| 691/691 [00:45<00:00, 15.16it/s]
100%|██████████| 69/69 [00:01<00:00, 52.57it/s]
100%|██████████| 4358/4358 [00:36<00:00, 118.61it/s]


Train Accuracy Char: 66.1535, Train Average Loss: 1.1072
Validation Accuracy Char: 62.4965, Validation Average Loss: 1.2379
Beam Val Word Accuracy: 13.5154 Correct Prediction : 589/4358


100%|██████████| 691/691 [00:45<00:00, 15.10it/s]
100%|██████████| 69/69 [00:01<00:00, 51.74it/s]
100%|██████████| 4358/4358 [00:37<00:00, 115.39it/s]


Train Accuracy Char: 72.2937, Train Average Loss: 0.9190
Validation Accuracy Char: 68.9766, Validation Average Loss: 1.0253
Beam Val Word Accuracy: 24.0018 Correct Prediction : 1046/4358


100%|██████████| 691/691 [00:45<00:00, 15.10it/s]
100%|██████████| 69/69 [00:01<00:00, 52.16it/s]
100%|██████████| 4358/4358 [00:38<00:00, 114.58it/s]


Train Accuracy Char: 75.1669, Train Average Loss: 0.8248
Validation Accuracy Char: 71.9289, Validation Average Loss: 0.9321
Beam Val Word Accuracy: 30.7939 Correct Prediction : 1342/4358


100%|██████████| 691/691 [00:45<00:00, 15.14it/s]
100%|██████████| 69/69 [00:01<00:00, 52.06it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.47it/s]


Train Accuracy Char: 76.5192, Train Average Loss: 0.7791
Validation Accuracy Char: 74.2337, Validation Average Loss: 0.8564
Beam Val Word Accuracy: 35.8880 Correct Prediction : 1564/4358


100%|██████████| 691/691 [00:45<00:00, 15.13it/s]
100%|██████████| 69/69 [00:01<00:00, 52.28it/s]
100%|██████████| 4358/4358 [00:38<00:00, 113.30it/s]


Train Accuracy Char: 77.4959, Train Average Loss: 0.7437
Validation Accuracy Char: 74.6268, Validation Average Loss: 0.8633
Beam Val Word Accuracy: 38.8022 Correct Prediction : 1691/4358


100%|██████████| 691/691 [00:45<00:00, 15.13it/s]
100%|██████████| 69/69 [00:01<00:00, 53.06it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.28it/s]


Train Accuracy Char: 78.2659, Train Average Loss: 0.7158
Validation Accuracy Char: 75.0636, Validation Average Loss: 0.8452
Beam Val Word Accuracy: 40.2478 Correct Prediction : 1754/4358


100%|██████████| 691/691 [00:45<00:00, 15.08it/s]
100%|██████████| 69/69 [00:01<00:00, 52.61it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.66it/s]


Train Accuracy Char: 78.4786, Train Average Loss: 0.7041
Validation Accuracy Char: 76.3509, Validation Average Loss: 0.7937
Beam Val Word Accuracy: 41.0968 Correct Prediction : 1791/4358


100%|██████████| 691/691 [00:45<00:00, 15.06it/s]
100%|██████████| 69/69 [00:01<00:00, 52.98it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.81it/s]


Train Accuracy Char: 79.0794, Train Average Loss: 0.6819
Validation Accuracy Char: 76.1736, Validation Average Loss: 0.8023
Beam Val Word Accuracy: 41.4640 Correct Prediction : 1807/4358


100%|██████████| 691/691 [00:46<00:00, 15.02it/s]
100%|██████████| 69/69 [00:01<00:00, 52.78it/s]
100%|██████████| 4358/4358 [00:38<00:00, 112.63it/s]

Train Accuracy Char: 79.4839, Train Average Loss: 0.6674
Validation Accuracy Char: 76.5025, Validation Average Loss: 0.7873
Beam Val Word Accuracy: 42.1524 Correct Prediction : 1837/4358





0,1
accuracy,▁
beam_val_accuracy_word,▁▃▅▆▇▇████
train_accuracy_char,▁▅▆▇▇█████
train_loss,█▄▃▂▂▁▁▁▁▁
val_accuracy_char,▁▄▆▇▇▇████
val_loss,█▅▃▂▂▂▁▁▁▁

0,1
accuracy,42.15236
beam_val_accuracy_word,42.15236
train_accuracy_char,79.48388
train_loss,0.66744
val_accuracy_char,76.50248
val_loss,0.78726


In [8]:

# Load Dataset
# df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/dakshina-dataset-ass-3/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv')
# df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/dakshina-dataset-ass-3/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv')
# df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/dakshina-dataset-ass-3/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv')

df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.train.tsv')
df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.dev.tsv')
df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.test.tsv')


input_max_len = max(train_input_len, val_input_len, test_input_len)
output_max_len = max(train_out_len, val_out_len, test_out_len)

max_length = max(input_max_len, output_max_len)

# Create Look Up Table
input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])



params = {
        "input_size": len(input_char_to_int),
        "output_size": len(output_char_to_int),
        "embedding_size": 512,
        "hidden_size": 512,
        "enc_num_layers": 2,
        "dec_num_layers": 2,
        "cell_type": "GRU", # LSTM, GRU, RNN
        "dropout": 0.3,
        "learning_rate": 0.01,
        "batch_size": 64,
        "num_epochs": 12,
        "optimizer": 'adagrad',  # ['sgd', 'rmsprop', 'adam', 'nadam']
        "beam_search_width" : 1,
        "length_penalty" : 0.6,
        "bidirectional": True,
        "teacher_forcing":0.7,

    }


# Data Embedding and Converting them into Tensor
train_inputs, train_outputs = get_tensor_object(df_train, max_length, max_length, input_char_to_int, output_char_to_int)
val_inputs, val_outputs = get_tensor_object(df_val, max_length, max_length, input_char_to_int, output_char_to_int)
test_inputs, test_outputs = get_tensor_object(df_test, max_length, max_length, input_char_to_int, output_char_to_int)

# Transpose column wise
train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)


# Initialize Hyperparameters
input_size = params['input_size']
output_size = params['output_size']
embedding_size = params['embedding_size']
hidden_size = params['hidden_size']
enc_num_layers = params['enc_num_layers'] 
dec_num_layers = params['dec_num_layers']  
cell_type = params['cell_type']
dropout = params['dropout']
learning_rate = params['learning_rate']
batch_size = params['batch_size']
num_epochs = params['num_epochs']  
optimizer = params['optimizer']  
beam_width = params['beam_search_width']
bidirectional = params['bidirectional']
length_penalty = params['length_penalty']
teacher_forcing = params['teacher_forcing']

# Create train data batch
train_batch_x, train_batch_y = torch.split(train_inputs, batch_size, dim=1), torch.split(train_outputs, batch_size, dim=1)
# Validation data batch
val_batch_x, val_batch_y = torch.split(val_inputs, batch_size, dim=1), torch.split(val_outputs, batch_size, dim=1)


# Intialize encoder, decoder and seq2seq model
encoder = Encoder(input_size, embedding_size, hidden_size, enc_num_layers, dropout, bidirectional, cell_type).to(device)
decoder = Decoder(output_size, embedding_size, hidden_size, output_size, dec_num_layers, dropout, bidirectional, cell_type).to(device)  
model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, cell_type).to(device)

# Print total number of parameters in the model
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(model)
print(f'Total Trainable Parameters: {total_params}')


# Loss function and Optimizer
criterion = nn.CrossEntropyLoss()
if optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
elif optimizer == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
elif optimizer == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
elif optimizer == 'nadam':
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
elif optimizer == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)
else:
    print("Incorrect Optmizer !!!!")

# TRAINING
model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type, max_length, 0)

Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(29, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.3, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=29, bias=True)
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.3, inplace=False)
    (embedding): Embedding(66, 512)
    (rnn): GRU(512, 512, num_layers=2, dropout=0.3, bidirectional=True)
  )
)
Total Trainable Parameters: 15831581


100%|██████████| 691/691 [00:45<00:00, 15.10it/s]
100%|██████████| 69/69 [00:01<00:00, 51.96it/s]
100%|██████████| 4358/4358 [00:34<00:00, 125.25it/s]


Train Accuracy Char: 49.3185, Train Average Loss: 1.7432
Validation Accuracy Char: 50.6025, Validation Average Loss: 1.6916
Beam Val Word Accuracy: 3.9927 Correct Prediction : 174/4358


100%|██████████| 691/691 [00:45<00:00, 15.32it/s]
100%|██████████| 69/69 [00:01<00:00, 52.66it/s]
100%|██████████| 4358/4358 [00:35<00:00, 121.09it/s]


Train Accuracy Char: 66.2777, Train Average Loss: 1.1011
Validation Accuracy Char: 61.4892, Validation Average Loss: 1.2818
Beam Val Word Accuracy: 13.1941 Correct Prediction : 575/4358


100%|██████████| 691/691 [00:45<00:00, 15.28it/s]
100%|██████████| 69/69 [00:01<00:00, 52.41it/s]
100%|██████████| 4358/4358 [00:36<00:00, 118.66it/s]


Train Accuracy Char: 72.2799, Train Average Loss: 0.9189
Validation Accuracy Char: 68.7453, Validation Average Loss: 1.0342
Beam Val Word Accuracy: 24.0018 Correct Prediction : 1046/4358


100%|██████████| 691/691 [00:45<00:00, 15.27it/s]
100%|██████████| 69/69 [00:01<00:00, 53.63it/s]
100%|██████████| 4358/4358 [00:37<00:00, 117.12it/s]


Train Accuracy Char: 75.0666, Train Average Loss: 0.8266
Validation Accuracy Char: 71.6668, Validation Average Loss: 0.9441
Beam Val Word Accuracy: 31.2758 Correct Prediction : 1363/4358


100%|██████████| 691/691 [00:45<00:00, 15.18it/s]
100%|██████████| 69/69 [00:01<00:00, 52.92it/s]
100%|██████████| 4358/4358 [00:37<00:00, 116.33it/s]


Train Accuracy Char: 76.4056, Train Average Loss: 0.7810
Validation Accuracy Char: 74.4598, Validation Average Loss: 0.8528
Beam Val Word Accuracy: 35.7274 Correct Prediction : 1557/4358


100%|██████████| 691/691 [00:45<00:00, 15.23it/s]
100%|██████████| 69/69 [00:01<00:00, 53.39it/s]
100%|██████████| 4358/4358 [00:37<00:00, 116.15it/s]


Train Accuracy Char: 77.3789, Train Average Loss: 0.7451
Validation Accuracy Char: 74.8092, Validation Average Loss: 0.8422
Beam Val Word Accuracy: 37.7696 Correct Prediction : 1646/4358


100%|██████████| 691/691 [00:45<00:00, 15.24it/s]
100%|██████████| 69/69 [00:01<00:00, 53.50it/s]
100%|██████████| 4358/4358 [00:37<00:00, 116.20it/s]


Train Accuracy Char: 78.2010, Train Average Loss: 0.7177
Validation Accuracy Char: 75.5492, Validation Average Loss: 0.8237
Beam Val Word Accuracy: 40.0642 Correct Prediction : 1746/4358


100%|██████████| 691/691 [00:45<00:00, 15.24it/s]
100%|██████████| 69/69 [00:01<00:00, 52.77it/s]
100%|██████████| 4358/4358 [00:37<00:00, 116.32it/s]


Train Accuracy Char: 78.8113, Train Average Loss: 0.6951
Validation Accuracy Char: 75.8704, Validation Average Loss: 0.8155
Beam Val Word Accuracy: 40.8903 Correct Prediction : 1782/4358


100%|██████████| 691/691 [00:45<00:00, 15.24it/s]
100%|██████████| 69/69 [00:01<00:00, 52.58it/s]
100%|██████████| 4358/4358 [00:37<00:00, 116.20it/s]


Train Accuracy Char: 79.1204, Train Average Loss: 0.6818
Validation Accuracy Char: 76.7928, Validation Average Loss: 0.7757
Beam Val Word Accuracy: 41.2575 Correct Prediction : 1798/4358


100%|██████████| 691/691 [00:45<00:00, 15.19it/s]
100%|██████████| 69/69 [00:01<00:00, 52.75it/s]
100%|██████████| 4358/4358 [00:37<00:00, 115.37it/s]


Train Accuracy Char: 79.5299, Train Average Loss: 0.6636
Validation Accuracy Char: 76.1813, Validation Average Loss: 0.8031
Beam Val Word Accuracy: 41.8311 Correct Prediction : 1823/4358


100%|██████████| 691/691 [00:45<00:00, 15.17it/s]
100%|██████████| 69/69 [00:01<00:00, 52.72it/s]
100%|██████████| 4358/4358 [00:37<00:00, 115.91it/s]


Train Accuracy Char: 79.8188, Train Average Loss: 0.6528
Validation Accuracy Char: 76.7569, Validation Average Loss: 0.7785
Beam Val Word Accuracy: 41.8770 Correct Prediction : 1825/4358


100%|██████████| 691/691 [00:45<00:00, 15.25it/s]
100%|██████████| 69/69 [00:01<00:00, 53.21it/s]
100%|██████████| 4358/4358 [00:37<00:00, 114.88it/s]

Train Accuracy Char: 80.0394, Train Average Loss: 0.6419
Validation Accuracy Char: 76.8237, Validation Average Loss: 0.7791
Beam Val Word Accuracy: 41.9458 Correct Prediction : 1828/4358





## TEST PREDICTON

In [12]:
def store_results(data_type, words, translations, predictions, results):
    """
    This function saves the evaluation results to a CSV file.

    Args:
        data_type (str): The type of data used for evaluation (e.g., 'val', 'test').
        words (list): List of source words (without start/end tokens).
        translations (list): List of reference translations (without start/end tokens).
        predictions (list): List of predicted translated sequences (without start/end tokens).
        results (list): List of 'Yes' or 'No' indicating correct/incorrect predictions.
    """

    # Create a dictionary to store the results in a structured format
    log = {
        'Word': words,
        'Translation': translations,
        'Prediction': predictions,
        'Result': results  # 'Yes' for correct, 'No' for incorrect
    }
    
    # Construct the file path for the CSV file
    path = '/kaggle/working/predictions.csv'

    # Create a Pandas DataFrame from the dictionary
    data_frame = pd.DataFrame(log)

    # Save the DataFrame to a CSV file (header=True includes column names, index=False excludes row index)
    data_frame.to_csv(path, header=True, index=False)
    
    # Log to wandb
    wandb.init(project='DA6401_A3', name='Prediction_Store')

    wandb.log({'Prediction_table': wandb.Table(dataframe= data_frame)})

    wandb.finish()

In [10]:

test_acc = 0
correct_pred = 0
words_test = [] 
translations_test = [] 
predictions_test = []
results_test = []

for i in tqdm(range(df_test.shape[0])):
    input_seq = df_test.iloc[i, 0][:-1] 
    true_seq = df_test.iloc[i, 1][1:-1]
    predicted_output = beam_search(model, input_seq, max_length, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type)
    words_test.append(input_seq)
    translations_test.append(true_seq)
    predictions_test.append(predicted_output[:-1])
    if true_seq == predicted_output[:-1]:
        correct_pred += 1
        results_test.append('Yes')
    else:
        results_test.append('No')

test_acc = 100 * correct_pred / df_test.shape[0]   

print(f'Test Accuracy Word Level: {test_acc}, Correctly Predicted: {correct_pred}')
wandb.init(project='DA6401_A3', name='bestmodel_test')

wandb.log({ "val_accuracy_word" : acc,
            "test_accuracy_word" : test_acc
            })

wandb.finish()
#store_results('test', words_test, translations_test, predictions_test, results_test)

100%|██████████| 4502/4502 [00:39<00:00, 114.79it/s]


Test Accuracy Word Level: 40.58196357174589, Correctly Predicted: 1827


0,1
test_accuracy_word,▁
val_accuracy_word,▁

0,1
test_accuracy_word,40.58196
val_accuracy_word,41.94585


## Prediction

In [None]:
# if __name__ == "__main__":
#     parser.add_argument('-dp', '--data_path', type=str, default='kaggle/input/hinid-dataset/aksharantar_sampled/hin', help='Path to the data folder')
#     parser.add_argument('-l', '--lang', type=str, default='hin', help='Language for which training is to be done')
#     parser.add_argument('-es', '--embedding_size', type=int, default=256, help='Embedding size')
#     parser.add_argument('-hs', '--hidden_size', type=int, default=512, help='Hidden size')
#     parser.add_argument('-nl', '--num_layers', type=int, default=2, help='Number of layers')
#     parser.add_argument('-ct', '--cell_type', type=str, default='LSTM', choices=['RNN', 'LSTM', 'GRU'], help='Cell type (RNN, LSTM, GRU)')
#     parser.add_argument('-dr', '--dropout', type=float, default=0.3, help='Dropout rate')
#     parser.add_argument('-lr', '--learning_rate', type=float, default=0.01, help='Learning rate')
#     parser.add_argument('-bs', '--batch_size', type=int, default=32, help='Batch size')
#     parser.add_argument('-ne', '--num_epochs', type=int, default=10, help='Number of epochs')
#     parser.add_argument('-op', '--optimizer', type=str, default='adagrad', choices=['adam', 'sgd', 'rmsprop', 'nadam', 'adagrad'], help='Optimizer (adam, sgd, rmsprop, nadam, adagrad)')
#     parser.add_argument('-bw', '--beam_search_width', type=int, default=1, help='Beam search width')
#     parser.add_argument('-lp', '--length_penalty', type=float, default=0.6, help='Length penalty')
#     parser.add_argument('-tf', '--teacher_forcing', type=float, default=0.7, help='Teacher forcing ratio')
#     parser.add_argument('-bi', '--bidirectional', action='store_true', default=True, help='Use bidirectional encoder')
#     parser.add_argument('--wandb_log', type=int, default=0, help='Whether to log to WandB (1 for yes, 0 for no)')
    
    
#     config = parser.parse_args()
#     data_path = config.data_path
#     lang = config.lang
    
    
#     # Load Dataset
#     df_train, train_input_len, train_out_len = load_dataset(f'/{data_path}/{lang}/{lang}_train.csv')
#     df_val, val_input_len, val_out_len = load_dataset(f'/{data_path}/{lang}/{lang}_valid.csv')
#     df_test, test_input_len, test_out_len = load_dataset(f'/{data_path}/{lang}/{lang}_test.csv')

#     input_max_len = max(train_input_len, val_input_len, test_input_len)
#     output_max_len = max(train_out_len, val_out_len, test_out_len)
    
#     max_length = max(input_max_len, output_max_len)

#     # Create Look Up Table
#     input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
#     output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

#     # Data Embedding and Converting them into Tensor
#     train_inputs, train_outputs = get_tensor_object(df_train, max_length, max_length, input_char_to_int, output_char_to_int)
#     val_inputs, val_outputs = get_tensor_object(df_val, max_length, max_length, input_char_to_int, output_char_to_int)
#     test_inputs, test_outputs = get_tensor_object(df_test, max_length, max_length, input_char_to_int, output_char_to_int)

#     # Transpose column wise
#     train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
#     val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
#     test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)
    
#     # Initialize Hyperparameters
#     input_size = len(input_char_to_int)
#     output_size = len(output_char_to_int)
#     embedding_size = config.embedding_size
#     hidden_size = config.hidden_size
#     enc_num_layers = config.num_layers
#     dec_num_layers = config.num_layers
#     cell_type = config.cell_type
#     dropout = config.dropout
#     learning_rate = config.learning_rate
#     batch_size = config.batch_size
#     num_epochs = config.num_epochs  
#     optimizer = config.optimizer  
#     beam_width = config.beam_search_width
#     bidirectional = config.bidirectional
#     length_penalty = config.length_penalty
#     teacher_forcing = config.teacher_forcing
#     learning_rate = config.learning_rate
    
#     # Create train data batch
#     train_batch_x, train_batch_y = torch.split(train_inputs, batch_size, dim=1), torch.split(train_outputs, batch_size, dim=1)
#     # Validation data batch
#     val_batch_x, val_batch_y = torch.split(val_inputs, batch_size, dim=1), torch.split(val_outputs, batch_size, dim=1)


#     # Intialize encoder, decoder and seq2seq model
#     encoder = Encoder(input_size, embedding_size, hidden_size, enc_num_layers, dropout, bidirectional, cell_type).to(device)
#     decoder = Decoder(output_size, embedding_size, hidden_size, output_size, dec_num_layers, dropout, bidirectional, cell_type).to(device)  
#     model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, cell_type).to(device)

#     # Print total number of parameters in the model
#     total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
#     print(model)
#     print(f'Total Trainable Parameters: {total_params}')


#     # Loss function and Optimizer
#     criterion = nn.CrossEntropyLoss()
#     if optimizer == 'adam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'sgd':
#         optimizer = optim.SGD(model.parameters(), lr=learning_rate)
#     elif optimizer == 'rmsprop':
#         optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
#     elif optimizer == 'nadam':
#         optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     elif optimizer == 'adagrad':
#         optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

#     # TRAINING
    
#     if config.wandb_log == 1:  
#         wandb.init(project='DA6401_A3')
#         wandb.run.name = 'cell_' + config.cell_type + '_bs_' + str(config.batch_size) + '_ep_' + str(config.num_epochs) + '_op_' + str(config.optimizer) + '_drop_' + str(config.dropout) + '_bsw_' + str(config.beam_search_width) +'_emb_' + str(config.embedding_size) + '_hs_' + str(config.hidden_size) + '_elayer_' + str(config.num_layers) + '_dlayer_' + str(config.num_layers)

#     model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type, max_length, config.wandb_log)
#     if config.wandb_log == 1: 
#         wandb.log({
#                 "accuracy": acc,
#             })

In [None]:
# # Example usage
# for i in range(10):
#     input_seq = df_train.iloc[i, 0][:-1] 
#     predicted_output = beam_search(model, input_seq, input_char_to_int, output_char_to_int, output_int_to_char, 1, 0.6, "RNN")

#     print(f"Input Sequence {i+1}: {input_seq}")
#     print(f"Predicted Output Sequence {i+1}: {predicted_output}\n")
