In [1]:
# Import Lib

import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import torch
from torch.autograd import Variable 
import copy
from matplotlib.font_manager import FontProperties
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
import random
import heapq
import wandb
import matplotlib.pyplot as plt
# Set device (CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
wandb.login(key='b8d44a4abbab8753e976a6e5ab717fd669ba99a2')


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcs24m030[0m ([33mcs24m030-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## PREPROCESSING

In [3]:
def encode(x, max_length, char_to_idx):
    encoded = np.zeros(max_length, dtype=int)
    encoder = np.array([char_to_idx[char] for char in x])
    length = min(max_length, len(encoder))
    encoded[:length] = encoder[:length]

    return torch.tensor(encoded, dtype=torch.int64), length

def get_tensor_object(df, max_input_length, max_output_length, char_to_idx_input, char_to_idx_output):
    # Encode unique inputs and outputs into tensors
    encoded_inputs = []
    encoded_outputs = []

    # Encode the input column
    for input_str in df[0]:
        encoded_input, input_length = encode(input_str, max_input_length, char_to_idx_input)
        encoded_inputs.append(encoded_input)

    # Encode the output column
    for output_str in df[1]:
        encoded_output, output_length = encode(output_str, max_output_length, char_to_idx_output)
        encoded_outputs.append(encoded_output)

    # Stack tensors column-wise
    
#     tensor_inputs = torch.stack(encoded_inputs, dim=1)
#     tensor_outputs = torch.stack(encoded_outputs, dim=1)
    tensor_inputs = torch.stack(encoded_inputs)
    tensor_outputs = torch.stack(encoded_outputs)

    return tensor_inputs, tensor_outputs

def load_dataset(path):
    """
    Load a dataset from a TSV file.
    Args:
    - path (str): Path to the TSV file.
    Returns:
    - df (pd.DataFrame): Loaded DataFrame.
    - max_input_length (int): Maximum length for input sequences.
    - max_output_length (int): Maximum length for output sequences.
    """
    df = pd.read_csv(path, header=None, encoding='utf-8', sep='\t')  # Changed separator to tab
    
    # Convert values to strings before adding special characters
    df[0] = df[0].astype(str).apply(lambda x: x + '$')
    df[1] = df[1].astype(str).apply(lambda x: '^' + x + '$')
    
    # Determine maximum length for input and output sequences
    max_input_length = max(len(x) for x in df[0].unique())
    max_output_length = max(len(x) for x in df[1].unique())
    return df, max_input_length, max_output_length

def look_up_table(vocab1, vocab2, vocab3):
    # Combine all vocabularies into one set
    vocab = set(''.join(vocab1) + ''.join(vocab2) + ''.join(vocab3))
    vocab.discard('^')  
    vocab.discard('$')  
    vocab_to_int = {"": 0, '^':1, '$':2}
    for v_i, v in enumerate(sorted(vocab), len(vocab_to_int)):
        vocab_to_int[v] = v_i
    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab




# Load Dataset
df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.train.tsv')
df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.dev.tsv')
df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.test.tsv')

input_max_len = max(train_input_len, val_input_len, test_input_len) + 1
output_max_len = max(train_out_len, val_out_len, test_out_len) + 1


# Create Look Up Table
input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

print("Input Lookup Table:", input_char_to_int)
print("\n\n Output Lookup Table", output_char_to_int)

# Data Embedding and Converting them into Tensor
train_inputs, train_outputs = get_tensor_object(df_train, input_max_len, input_max_len, input_char_to_int, output_char_to_int)
val_inputs, val_outputs = get_tensor_object(df_val, input_max_len, input_max_len, input_char_to_int, output_char_to_int)
test_inputs, test_outputs = get_tensor_object(df_test, input_max_len, input_max_len, input_char_to_int, output_char_to_int)

# Transpose column wise
train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)


print("\n", train_inputs[:,0],train_outputs[:,0])
print("Training Input:", train_inputs.shape, train_outputs.shape)

print("Validation", val_inputs.shape, val_inputs.shape)
print(df_train.head())

Input Lookup Table: {'': 0, '^': 1, '$': 2, 'ँ': 3, 'ं': 4, 'ः': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ए': 13, 'ऐ': 14, 'ऑ': 15, 'ओ': 16, 'औ': 17, 'क': 18, 'ख': 19, 'ग': 20, 'घ': 21, 'ङ': 22, 'च': 23, 'छ': 24, 'ज': 25, 'झ': 26, 'ञ': 27, 'ट': 28, 'ठ': 29, 'ड': 30, 'ढ': 31, 'ण': 32, 'त': 33, 'थ': 34, 'द': 35, 'ध': 36, 'न': 37, 'प': 38, 'फ': 39, 'ब': 40, 'भ': 41, 'म': 42, 'य': 43, 'र': 44, 'ल': 45, 'व': 46, 'श': 47, 'ष': 48, 'स': 49, 'ह': 50, '़': 51, 'ा': 52, 'ि': 53, 'ी': 54, 'ु': 55, 'ू': 56, 'ृ': 57, 'ॅ': 58, 'े': 59, 'ै': 60, 'ॉ': 61, 'ो': 62, 'ौ': 63, '्': 64, 'ॐ': 65}


 Output Lookup Table {'': 0, '^': 1, '$': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28}

 tensor([6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) tensor([ 1,  3, 16,  2,  0,  0,  0,  0,  0,  0, 

# Create Seq2Seq CLass

In [4]:
class Encoder(nn.Module): 
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout, bidirectional, cell_type):
        super(Encoder, self).__init__()
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.cell_type = cell_type
        
        # Define embedding layer
        self.embedding = nn.Embedding(input_size, embedding_size)
        
        # Define RNN layer with specific cell type
        if cell_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)
        elif cell_type == 'GRU':
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)
        elif cell_type == 'RNN':
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, dropout=dropout, bidirectional=bidirectional)
        else:
            raise ValueError("Invalid RNN type. Choose from 'LSTM', 'GRU', or 'RNN'.")
        
        
    def forward(self, x): # x shape: (seq_length, N) where N is batch size
        # Perform dropout on the input
        embedding = self.embedding(x)
        embedding = self.dropout(embedding) # embedding shape: (seq_length, N, embedding_size)
        
        if self.cell_type == "LSTM":
            # Pass through the LSTM layer
            outputs, (hidden, cell) = self.rnn(embedding) # outputs shape: (seq_length, N, hidden_size)
            if self.bidirectional:
                # Sum the bidirectional outputs
                outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
                hidden = torch.cat((hidden[: self.num_layers], hidden[self.num_layers:]), dim=0)

            # Return hidden state and cell state
            return outputs, hidden, cell
        elif self.cell_type == "GRU" or self.cell_type == "RNN":
            # Pass through the RNN/GRU layer
            outputs, hidden = self.rnn(embedding) # outputs shape: (seq_length, N, hidden_size)
            if self.bidirectional:
                # Sum the bidirectional outputs
                outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
                hidden = torch.cat((hidden[: self.num_layers], hidden[self.num_layers:]), dim=0)

            # Return output (max_seq, N, hidden size)
            return outputs, hidden 
        else:
            print("Encoder Failed to initialize!")
            return None

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
      
    def dot_score(self, hidden_state, encoder_states):
        return torch.sum(hidden_state * encoder_states, dim=2)
    
    def forward(self, hidden, encoder_outputs):
        attn_scores = self.dot_score(hidden, encoder_outputs)
        attn_scores = attn_scores.t()  # Transpose to match dimensions
        attention_weights = F.softmax(attn_scores, dim=1).unsqueeze(1)
        return attention_weights

class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout, bidirection_type, cell_type):
        super(Decoder, self).__init__()
        self.bidirectional = bidirection_type
        self.dropout = nn.Dropout(dropout)  
        self.num_layers = num_layers 
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.cell_type = cell_type
        
        
        # Define embedding layer
        self.embedding = nn.Embedding(input_size, embedding_size)
        
        # Define RNN layer with specific cell type
        if cell_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)
        elif cell_type == 'GRU':
            self.rnn = nn.GRU(embedding_size, hidden_size, num_layers, dropout=dropout)
        elif cell_type == 'RNN':
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, dropout=dropout)
        else:
            raise ValueError("Invalid RNN type. Choose from 'LSTM', 'GRU', or 'RNN'.")
        
        
            
        # Define fully connected layer
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)  
        
        # Attention Class object
        self.attn = Attention(hidden_size)
        
        # Softmax layer
        self.log_softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x, encoder_outputs, hidden, cell): # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        
        
        # Ensure x has the shape (1, N)
        x = x.unsqueeze(0)
        
        # Perform dropout on the input
        embedding = self.embedding(x)
        embedding = self.dropout(embedding)  # embedding shape: (1, N, embedding_size)
        
        if self.cell_type == "LSTM":
            # Pass through the LSTM layer
            outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))  # outputs shape: (1, N, hidden_size * num_directions)
            
            # Calculate attention weights
            attention_weights = self.attn(outputs, encoder_outputs)
            context = attention_weights.bmm(encoder_outputs.transpose(0, 1))
            
            # Concatenate  context vector and GRU output
            outputs = outputs.squeeze(0)
            context = context.squeeze(1)
            concat_input = torch.cat((outputs, context), 1)
            concat_output = torch.tanh(self.concat(concat_input))
            
            # Pass through fully connected layer
            out = self.fc(concat_output)
            predictions = self.log_softmax(out)

            return predictions, hidden, cell, attention_weights.squeeze(1)
        
        elif self.cell_type == "GRU" or self.cell_type == "RNN":
            # Pass through the  layer
            outputs, hidden = self.rnn(embedding, hidden)  # outputs shape: (1, N, hidden_size * num_directions)
            
            # Calculate attention weights
            attention_weights = self.attn(outputs, encoder_outputs)
            context = attention_weights.bmm(encoder_outputs.transpose(0, 1))
            
            # Concatenate  context vector and GRU output
            outputs = outputs.squeeze(0)
            context = context.squeeze(1)
            concat_input = torch.cat((outputs, context), 1)
            concat_output = torch.tanh(self.concat(concat_input))

            
            # Pass through fully connected layer
            out = self.fc(concat_output)
            predictions = self.log_softmax(out)
            
            return predictions, hidden, attention_weights.squeeze(1)
    
        else:
            print("Encoder Failed to initialized!!!!!!!!")
            return None
        
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, output_char_to_int, teacher_forcing, cell_type):

        super(Seq2Seq, self).__init__()  
        # Initialize encoder and decoder
        self.decoder = decoder
        self.encoder = encoder
        self.cell_type = cell_type
        self.target_vocab_size =  len(output_char_to_int)
        self.teacher_force_ratio = teacher_forcing
    
#     def create_mask(self, input_sequence):
#         return (input_sequence != 0).permute(1, 0)
    
    def forward(self, source, target):
        
        # Get batch size, target length, and target vocabulary size
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = self.target_vocab_size
        teacher_force_ratio = self.teacher_force_ratio
        
        # Initialize outputs tensor
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        # Grab the first input to the Decoder which will be <SOS> token i.e '^'
        x = target[0,:]
        
        # Get hidden state and cell state from encoder
        if self.cell_type == 'LSTM':
            encoder_outputs, hidden, cell = self.encoder(source)
            hidden =  hidden[:self.decoder.num_layers]
            cell =  cell[:self.decoder.num_layers]
        else:
            encoder_outputs, hidden = self.encoder(source)
            hidden =  hidden[:self.decoder.num_layers]
        
        for t in range(1, target_len):
            # Use previous hidden and cell states as context from encoder at start
            if self.cell_type == 'LSTM':
                output, hidden, cell, _ = self.decoder(x, encoder_outputs, hidden, cell)
            else:
                output, hidden, _ = self.decoder(x, encoder_outputs, hidden, None)
            #output, hidden, cell = self.decoder(x, hidden, cell)
                
            # Store next output prediction
            outputs[t], best_guess = output, output.argmax(1)
            # Get the best word the Decoder predicted (index in the vocabulary)
            x = best_guess if random.random() >= teacher_force_ratio else target[t]

        return outputs

## TRAINING

In [5]:
# BEAM SEARCH FUNCTION
def beam_search(model, input_seq, max_length, input_char_index, output_char_index, reverse_target_char_index, beam_width, length_penalty, cell_type):
    if len(input_seq) > max_length:
        print("Input Length is exceeding max length!!!!")
        return ""
    
    # Create np array of zero of length i/p 
    input_data = np.zeros((max_length, 1), dtype=int) # (N,1)
    
    # Encode the input
    for idx, char in enumerate(input_seq):
        input_data[idx, 0] = input_char_index[char]
    input_data[idx+1, 0] = input_char_index["$"] # EOS
    
    # Convert to tensor
    input_tensor = torch.tensor(input_data, dtype=torch.int64).to(device) # N,1
    
    with torch.no_grad():
        if cell_type == 'LSTM':
            encoder_outputs, hidden, cell = model.encoder(input_tensor)
            hidden =  hidden[:model.decoder.num_layers]
            cell =  cell[:model.decoder.num_layers]
        else:
            encoder_outputs, hidden = model.encoder(input_tensor)
            hidden =  hidden[:model.decoder.num_layers]
    
    # Initialize beam
    out_t = output_char_index['^']
    out_reshape = np.array(out_t).reshape(1,)
    hidden_par = hidden.unsqueeze(0)
    initial_sequence = torch.tensor(out_reshape).to(device)
    beam = [(0.0, initial_sequence, hidden_par)]  # [(score, sequence, hidden)]

    for _ in range(len(output_char_index)):
        candidates = []
        for score, seq, hidden in beam:
            if seq[-1].item() == output_char_index['$']:
                # If the sequence ends with the end token, add it to the candidates
                candidates.append((score, seq, hidden))
                continue
            
            last_token = np.array(seq[-1].item()).reshape(1,)
            x = torch.tensor(last_token).to(device)
            
            if cell_type == 'LSTM':
                output, hidden, cell, _ = model.decoder(x, encoder_outputs, hidden.squeeze(0), cell)
            else:
                output, hidden, _ = model.decoder(x, encoder_outputs, hidden.squeeze(0), None)
                
            probabilities = F.softmax(output, dim=1)
            topk_probs, topk_tokens = torch.topk(probabilities, k=beam_width)

            for prob, token in zip(topk_probs[0], topk_tokens[0]):
                new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
                seq_length_norm_factor = (len(new_seq) - 1) / 5
                candidate_score = score + torch.log(prob).item() / (seq_length_norm_factor ** length_penalty)
                candidates.append((candidate_score, new_seq, hidden.unsqueeze(0)))

        # Select top-k candidates based on the accumulated scores
        beam = heapq.nlargest(beam_width, candidates, key=lambda x: x[0])

    best_score, best_sequence, _ = max(beam, key=lambda x: x[0])  # Select the best sequence from the beam as the output

    # Convert the best sequence indices to characters
    return ''.join([reverse_target_char_index[token.item()] for token in best_sequence[1:]])


# TRAINING FUNCTION
def train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type, max_length):
    for epoch in range(num_epochs):
        total_words = 0
        correct_pred = 0
        total_loss = 0
        accuracy = 0
        model.train()
        
        # Use tqdm for progress tracking
        train_data_iterator = tqdm(zip(train_batch_x, train_batch_y), total=len(train_batch_x))
        
        for (x, y) in train_data_iterator:
            # Get input and targets and move to device
            target, inp_data = y.to(device), x.to(device)
            
            # Forward propagation
            optimizer.zero_grad()
            output = model(inp_data, target)
            
            target = target.reshape(-1)
            output = output.reshape(-1, output.shape[2])
            
            pad_mask = (target != 0)  
            target = target[pad_mask] # Select non-padding elements
            output = output[pad_mask] 
            
            # Calculate loss
            loss = criterion(output, target)
            
            # Backpropagation
            loss.backward()
            
            # Clip gradients to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            
            # Update parameters
            optimizer.step()
            
            # Accumulate total loss
            total_loss += loss.item()
            # Update total words processed
            total_words += target.size(0)
            # Calculate number of correct predictions
            correct_pred += torch.sum(torch.argmax(output, dim=1) == target).item()
            
        # Calculate average loss per batch
        avg_loss = total_loss / len(train_batch_x)
        # Calculate accuracy
        accuracy = 100*correct_pred / total_words
        
        
        # Validation
        model.eval()
        with torch.no_grad():
            val_total_loss = 0
            val_total_words = 0
            val_correct_pred = 0

            val_data_iterator = tqdm(zip(val_batch_x, val_batch_y), total=len(val_batch_x))
            for x_val, y_val in val_data_iterator:
                target_val, inp_data_val = y_val.to(device), x_val.to(device)
                output_val = model(inp_data_val, target_val)
                
                
                target_val = target_val.reshape(-1)
                output_val = output_val.reshape(-1, output_val.shape[2])
                
                pad_mask = (target_val != 0)  
                target_val = target_val[pad_mask] # Select non-padding elements
                output_val = output_val[pad_mask] 
            
                val_loss = criterion(output_val, target_val)
                val_total_loss += val_loss.item()
                val_total_words += target_val.size(0)
                val_correct_pred += torch.sum(torch.argmax(output_val, dim=1) == target_val).item()

            # Calculate validation statistics
            val_accuracy = 100*val_correct_pred / val_total_words
            val_avg_loss = val_total_loss / len(val_batch_x)

            
        # Total word predict correct over training
        beam_val_pred = 0
        beam_val = 0
        for i in tqdm(range(df_val.shape[0])):
            input_seq = df_val.iloc[i, 0][:-1] 
            true_seq = df_val.iloc[i, 1][1:-1]
            predicted_output = beam_search(model, input_seq, max_length, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type)
            if true_seq == predicted_output[:-1]:
                beam_val_pred+=1
        beam_val = 100*beam_val_pred/df_val.shape[0]
        
        # Print statistics
        print(f"Epoch {epoch + 1} / {num_epochs} ===========================>")
        print(f"Train Accuracy Char: {accuracy:.4f}, Train Average Loss: {avg_loss:.4f}")
        print(f"Validation Accuracy Char: {val_accuracy:.4f}, Validation Average Loss: {val_avg_loss:.4f}")
        print(f"Beam Val Word Accuracy: {beam_val:.4f} Correct Prediction : {beam_val_pred}/{df_val.shape[0]}")    
        # wandb.log({
        #         "train_accuracy_char": accuracy,
        #         "train_loss": avg_loss,
        #         "val_accuracy_char": val_accuracy,
        #         "val_loss": val_avg_loss,
        #         "beam_val_accuracy_word" : beam_val,
        #     })
        
    return model, beam_val

## SWEEP RUN

In [8]:
def main():
    wandb.init(project='DA6401_A3_partB')
    config = wandb.config
    wandb.run.name = 'attention'+ '_cell_' + config.cell_type + '_bs_' + str(config.batch_size) + '_ep_' + str(config.num_epochs) + '_op_' + str(config.optimizer) + '_drop_' + str(config.dropout) + '_bsw_' + str(config.beam_search_width) +'_emb_' + str(config.embedding_size) + '_hs_' + str(config.hidden_size) + '_elayer_' + str(config.num_layers) + '_dlayer_' + str(config.num_layers)
    
    # Load Dataset
    df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.train.tsv')
    df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.dev.tsv')
    df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.test.tsv')

    input_max_len = max(train_input_len, val_input_len, test_input_len)
    output_max_len = max(train_out_len, val_out_len, test_out_len)
    
    max_length = max(input_max_len, output_max_len)

    # Create Look Up Table
    input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
    output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

    # Data Embedding and Converting them into Tensor
    train_inputs, train_outputs = get_tensor_object(df_train, max_length, max_length, input_char_to_int, output_char_to_int)
    val_inputs, val_outputs = get_tensor_object(df_val, max_length, max_length, input_char_to_int, output_char_to_int)
    test_inputs, test_outputs = get_tensor_object(df_test, max_length, max_length, input_char_to_int, output_char_to_int)

    # Transpose column wise
    train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
    val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
    test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)


    # Initialize Hyperparameters
    input_size = len(input_char_to_int)
    output_size = len(output_char_to_int)
    embedding_size = config.embedding_size
    hidden_size = config.hidden_size
    enc_num_layers = config.num_layers
    dec_num_layers = config.num_layers
    cell_type = config.cell_type
    dropout = config.dropout
    learning_rate = config.learning_rate
    batch_size = config.batch_size
    num_epochs = config.num_epochs  
    optimizer = config.optimizer  
    beam_width = config.beam_search_width
    bidirectional = config.bidirectional
    length_penalty = config.length_penalty
    teacher_forcing = config.teacher_forcing
    learning_rate = config.learning_rate

    # Create train data batch
    train_batch_x, train_batch_y = torch.split(train_inputs, batch_size, dim=1), torch.split(train_outputs, batch_size, dim=1)
    # Validation data batch
    val_batch_x, val_batch_y = torch.split(val_inputs, batch_size, dim=1), torch.split(val_outputs, batch_size, dim=1)


    # Intialize encoder, decoder and seq2seq model
    encoder = Encoder(input_size, embedding_size, hidden_size, enc_num_layers, dropout, bidirectional, cell_type).to(device)
    decoder = Decoder(output_size, embedding_size, hidden_size, output_size, dec_num_layers, dropout, bidirectional, cell_type).to(device)  
    model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, cell_type).to(device)

    # Print total number of parameters in the model
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(model)
    print(f'Total Trainable Parameters: {total_params}')


    # Loss function and Optimizer
    criterion = nn.CrossEntropyLoss()
    if optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    elif optimizer == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
    elif optimizer == 'nadam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)
    else:
        print("Incorrect Optmizer !!!!")

    # TRAINING
    model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type, max_length)
    wandb.log({
            "accuracy": acc,
        })
    
# SWEEP CONFIG
# sweep_config = {
#     'name': 'sweep_1_attention',
#     'method': 'bayes',  
#     'metric': {'name': 'accuracy', 'goal': 'maximize'},
#     'parameters': {
#         'embedding_size': {'values': [256,512]},  
#         'hidden_size': {'values': [512, 1024]},
#         'num_layers': {'values': [2]},  
#         'cell_type': {'values':['LSTM', 'GRU']}, # RNN, LSTM, GRU
#         'dropout': {'values': [0.3,0.5]},
#         'learning_rate': {'values': [0.01,0.005]},
#         'batch_size': {'values': [32,64,128]},
#         'num_epochs': {'values': [5,10]},
#         'optimizer': {'values': ['adagrad']}, # ['sgd', 'rmsprop', 'adam', 'nadam']
#         'beam_search_width': {'values': [1, 4]},
#         'length_penalty' : {'values': [0.6]},
#         'bidirectional': {'values': [True]},
#         'teacher_forcing': {'values': [0.7]}
#     }
# }
sweep_config = {
    'name': 'sweep_2_attention',
    'method': 'bayes',  
    'metric': {'name': 'accuracy', 'goal': 'maximize'},
    'parameters': {
        'embedding_size': {'values': [256]},  
        'hidden_size': {'values': [1024]},
        'num_layers': {'values': [3]},  
        'cell_type': {'values':['LSTM']}, # RNN, LSTM, GRU
        'dropout': {'values': [0.5]},
        'learning_rate': {'values': [0.0001]},
        'batch_size': {'values': [64]},
        'num_epochs': {'values': [10]},
        'optimizer': {'values': ['adagrad']}, # ['sgd', 'rmsprop', 'adam', 'nadam']
        'beam_search_width': {'values': [1]},
        'length_penalty' : {'values': [0.6]},
        'bidirectional': {'values': [True]},
        'teacher_forcing': {'values': [0.7]}
    }
}

# RUN SWEEP ID with agent
sweep_id = wandb.sweep(sweep_config, project = 'DA6401_A3_partB')
wandb.agent(sweep_id, main, count=1)
wandb.finish()

Create sweep with ID: lxfhbsbz
Sweep URL: https://wandb.ai/cs24m030-indian-institute-of-technology-madras/DA6401_A3_partB/sweeps/lxfhbsbz


[34m[1mwandb[0m: Agent Starting Run: m7g7iw71 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_search_width: 1
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	embedding_size: 256
[34m[1mwandb[0m: 	hidden_size: 1024
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	length_penalty: 0.6
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	teacher_forcing: 0.7


Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(29, 256)
    (rnn): LSTM(256, 1024, num_layers=3, dropout=0.5)
    (concat): Linear(in_features=2048, out_features=1024, bias=True)
    (fc): Linear(in_features=1024, out_features=29, bias=True)
    (attn): Attention()
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(66, 256)
    (rnn): LSTM(256, 1024, num_layers=3, dropout=0.5, bidirectional=True)
  )
)
Total Trainable Parameters: 85063453


100%|██████████| 691/691 [01:58<00:00,  5.81it/s]
100%|██████████| 69/69 [00:03<00:00, 18.22it/s]
100%|██████████| 4358/4358 [00:56<00:00, 77.37it/s]


Train Accuracy Char: 24.8230, Train Average Loss: 2.7159
Validation Accuracy Char: 20.9743, Validation Average Loss: 2.9415
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358


100%|██████████| 691/691 [01:59<00:00,  5.78it/s]
100%|██████████| 69/69 [00:03<00:00, 18.20it/s]
100%|██████████| 4358/4358 [00:57<00:00, 75.27it/s]


Train Accuracy Char: 26.0887, Train Average Loss: 2.6679
Validation Accuracy Char: 21.6116, Validation Average Loss: 2.9311
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358


100%|██████████| 691/691 [01:59<00:00,  5.77it/s]
100%|██████████| 69/69 [00:03<00:00, 18.16it/s]
100%|██████████| 4358/4358 [00:57<00:00, 75.53it/s]


Train Accuracy Char: 26.4399, Train Average Loss: 2.6508
Validation Accuracy Char: 21.5319, Validation Average Loss: 2.9289
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358


100%|██████████| 691/691 [01:59<00:00,  5.77it/s]
100%|██████████| 69/69 [00:03<00:00, 18.05it/s]
100%|██████████| 4358/4358 [00:55<00:00, 78.25it/s]


Train Accuracy Char: 26.9733, Train Average Loss: 2.6290
Validation Accuracy Char: 20.9127, Validation Average Loss: 2.9994
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358


100%|██████████| 691/691 [01:59<00:00,  5.77it/s]
100%|██████████| 69/69 [00:03<00:00, 18.20it/s]
100%|██████████| 4358/4358 [00:56<00:00, 76.79it/s]


Train Accuracy Char: 27.7909, Train Average Loss: 2.6042
Validation Accuracy Char: 21.2544, Validation Average Loss: 2.9861
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358


100%|██████████| 691/691 [01:59<00:00,  5.78it/s]
100%|██████████| 69/69 [00:03<00:00, 18.25it/s]
100%|██████████| 4358/4358 [00:56<00:00, 76.54it/s]


Train Accuracy Char: 28.3179, Train Average Loss: 2.5830
Validation Accuracy Char: 21.4985, Validation Average Loss: 2.9760
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358


100%|██████████| 691/691 [01:59<00:00,  5.77it/s]
100%|██████████| 69/69 [00:03<00:00, 18.27it/s]
100%|██████████| 4358/4358 [00:57<00:00, 76.28it/s]


Train Accuracy Char: 28.7721, Train Average Loss: 2.5644
Validation Accuracy Char: 21.5345, Validation Average Loss: 2.9747
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358


100%|██████████| 691/691 [01:59<00:00,  5.78it/s]
100%|██████████| 69/69 [00:03<00:00, 18.19it/s]
100%|██████████| 4358/4358 [00:56<00:00, 76.63it/s]


Train Accuracy Char: 29.0369, Train Average Loss: 2.5496
Validation Accuracy Char: 21.4188, Validation Average Loss: 2.9964
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358


100%|██████████| 691/691 [01:59<00:00,  5.78it/s]
100%|██████████| 69/69 [00:03<00:00, 18.21it/s]
100%|██████████| 4358/4358 [00:56<00:00, 76.58it/s]


Train Accuracy Char: 29.2947, Train Average Loss: 2.5361
Validation Accuracy Char: 21.5319, Validation Average Loss: 3.0142
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358


100%|██████████| 691/691 [01:59<00:00,  5.79it/s]
100%|██████████| 69/69 [00:03<00:00, 18.04it/s]
100%|██████████| 4358/4358 [00:57<00:00, 75.63it/s]

Train Accuracy Char: 29.4061, Train Average Loss: 2.5254
Validation Accuracy Char: 21.5704, Validation Average Loss: 3.0086
Beam Val Word Accuracy: 0.0229 Correct Prediction : 1/4358





0,1
accuracy,▁
beam_val_accuracy_word,▁▁▁▁▁▁▁▁▁▁
train_accuracy_char,▁▃▃▄▆▆▇▇██
train_loss,█▆▆▅▄▃▂▂▁▁
val_accuracy_char,▂█▇▁▄▇▇▆▇█
val_loss,▂▁▁▇▆▅▅▇██

0,1
accuracy,0.02295
beam_val_accuracy_word,0.02295
train_accuracy_char,29.40605
train_loss,2.52537
val_accuracy_char,21.57044
val_loss,3.00856


In [6]:
# Load Dataset
df_train, train_input_len, train_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.train.tsv')
df_val, val_input_len, val_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.dev.tsv')
df_test, test_input_len, test_out_len = load_dataset('/kaggle/input/dakshina/hi.translit.sampled.test.tsv')

input_max_len = max(train_input_len, val_input_len, test_input_len)
output_max_len = max(train_out_len, val_out_len, test_out_len)

max_length = max(input_max_len, output_max_len)

# Create Look Up Table
input_char_to_int, input_int_to_char = look_up_table(df_train[0], df_val[0], df_test[0])
output_char_to_int, output_int_to_char = look_up_table(df_train[1], df_val[1], df_test[1])

params = {
        "input_size": len(input_char_to_int),
        "output_size": len(output_char_to_int),
        "embedding_size": 256,
        "hidden_size": 512,
        "enc_num_layers": 2,
        "dec_num_layers": 2,
        "cell_type": "LSTM", # LSTM, GRU, RNN
        "dropout": 0.5,
        "learning_rate": 0.005,
        "batch_size": 32,
        "num_epochs": 10,
        "optimizer": 'adagrad',  # ['sgd', 'rmsprop', 'adam', 'nadam']
        "beam_search_width" : 1,
        "length_penalty" : 0.6,
        "bidirectional": True,
        "teacher_forcing":0.7,

    }

# Data Embedding and Converting them into Tensor
train_inputs, train_outputs = get_tensor_object(df_train, max_length, max_length, input_char_to_int, output_char_to_int)
val_inputs, val_outputs = get_tensor_object(df_val, max_length, max_length, input_char_to_int, output_char_to_int)
test_inputs, test_outputs = get_tensor_object(df_test, max_length, max_length, input_char_to_int, output_char_to_int)

# Transpose column wise
train_inputs, train_outputs = torch.transpose(train_inputs, 0, 1), torch.transpose(train_outputs, 0, 1)
val_inputs, val_outputs = torch.transpose(val_inputs, 0, 1), torch.transpose(val_outputs, 0, 1)
test_inputs, test_outputs = torch.transpose(test_inputs, 0, 1), torch.transpose(test_outputs, 0, 1)


# Initialize Hyperparameters
input_size = params['input_size']
output_size = params['output_size']
embedding_size = params['embedding_size']
hidden_size = params['hidden_size']
enc_num_layers = params['enc_num_layers'] 
dec_num_layers = params['dec_num_layers']  
cell_type = params['cell_type']
dropout = params['dropout']
learning_rate = params['learning_rate']
batch_size = params['batch_size']
num_epochs = params['num_epochs']  
optimizer = params['optimizer']  
beam_width = params['beam_search_width']
bidirectional = params['bidirectional']
length_penalty = params['length_penalty']
teacher_forcing = params['teacher_forcing']

# Create train data batch
train_batch_x, train_batch_y = torch.split(train_inputs, batch_size, dim=1), torch.split(train_outputs, batch_size, dim=1)
# Validation data batch
val_batch_x, val_batch_y = torch.split(val_inputs, batch_size, dim=1), torch.split(val_outputs, batch_size, dim=1)


# Intialize encoder, decoder and seq2seq model
encoder = Encoder(input_size, embedding_size, hidden_size, enc_num_layers, dropout, bidirectional, cell_type).to(device)
decoder = Decoder(output_size, embedding_size, hidden_size, output_size, dec_num_layers, dropout, bidirectional, cell_type).to(device)  
model = Seq2Seq(encoder, decoder, output_char_to_int, teacher_forcing, cell_type).to(device)



# Print total number of parameters in the model
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(model)
print(f'Total Trainable Parameters: {total_params}')


# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
if optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
elif optimizer == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=params['learning_rate'])
elif optimizer == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(), lr=params['learning_rate'])
elif optimizer == 'nadam':
    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])
elif optimizer == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=params['learning_rate'])
else:
    print("Incorrect Optmizer !!!!")

model, acc = train(model, num_epochs, criterion, optimizer, train_batch_x, train_batch_y, val_batch_x, val_batch_y, df_val, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type, max_length)
        

Seq2Seq(
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(29, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (concat): Linear(in_features=1024, out_features=512, bias=True)
    (fc): Linear(in_features=512, out_features=29, bias=True)
    (attn): Attention()
    (log_softmax): LogSoftmax(dim=1)
  )
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(66, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5, bidirectional=True)
  )
)
Total Trainable Parameters: 13695773


100%|██████████| 1382/1382 [01:19<00:00, 17.31it/s]
100%|██████████| 137/137 [00:02<00:00, 62.62it/s]
100%|██████████| 4358/4358 [00:42<00:00, 101.95it/s]


Train Accuracy Char: 66.1978, Train Average Loss: 1.1868
Validation Accuracy Char: 71.9649, Validation Average Loss: 0.9753
Beam Val Word Accuracy: 31.8495 Correct Prediction : 1388/4358


100%|██████████| 1382/1382 [01:19<00:00, 17.38it/s]
100%|██████████| 137/137 [00:02<00:00, 64.59it/s]
100%|██████████| 4358/4358 [00:43<00:00, 101.04it/s]


Train Accuracy Char: 75.2473, Train Average Loss: 0.8577
Validation Accuracy Char: 75.0071, Validation Average Loss: 0.8582
Beam Val Word Accuracy: 37.7237 Correct Prediction : 1644/4358


100%|██████████| 1382/1382 [01:19<00:00, 17.31it/s]
100%|██████████| 137/137 [00:02<00:00, 63.07it/s]
100%|██████████| 4358/4358 [00:43<00:00, 100.58it/s]


Train Accuracy Char: 76.2413, Train Average Loss: 0.8130
Validation Accuracy Char: 75.7728, Validation Average Loss: 0.8232
Beam Val Word Accuracy: 39.9036 Correct Prediction : 1739/4358


100%|██████████| 1382/1382 [01:19<00:00, 17.28it/s]
100%|██████████| 137/137 [00:02<00:00, 63.95it/s]
100%|██████████| 4358/4358 [00:43<00:00, 100.32it/s]


Train Accuracy Char: 76.7675, Train Average Loss: 0.7888
Validation Accuracy Char: 76.9162, Validation Average Loss: 0.7781
Beam Val Word Accuracy: 41.3722 Correct Prediction : 1803/4358


100%|██████████| 1382/1382 [01:19<00:00, 17.31it/s]
100%|██████████| 137/137 [00:02<00:00, 63.94it/s]
100%|██████████| 4358/4358 [00:43<00:00, 100.23it/s]


Train Accuracy Char: 77.2857, Train Average Loss: 0.7694
Validation Accuracy Char: 76.9881, Validation Average Loss: 0.7757
Beam Val Word Accuracy: 42.1065 Correct Prediction : 1835/4358


100%|██████████| 1382/1382 [01:19<00:00, 17.28it/s]
100%|██████████| 137/137 [00:02<00:00, 63.97it/s]
100%|██████████| 4358/4358 [00:43<00:00, 100.04it/s]


Train Accuracy Char: 77.7332, Train Average Loss: 0.7516
Validation Accuracy Char: 77.2065, Validation Average Loss: 0.7728
Beam Val Word Accuracy: 43.2308 Correct Prediction : 1884/4358


100%|██████████| 1382/1382 [01:19<00:00, 17.29it/s]
100%|██████████| 137/137 [00:02<00:00, 64.67it/s]
100%|██████████| 4358/4358 [00:43<00:00, 100.07it/s]


Train Accuracy Char: 77.8798, Train Average Loss: 0.7432
Validation Accuracy Char: 77.5868, Validation Average Loss: 0.7553
Beam Val Word Accuracy: 43.6439 Correct Prediction : 1902/4358


100%|██████████| 1382/1382 [01:19<00:00, 17.28it/s]
100%|██████████| 137/137 [00:02<00:00, 63.38it/s]
100%|██████████| 4358/4358 [00:43<00:00, 99.83it/s] 


Train Accuracy Char: 78.0374, Train Average Loss: 0.7386
Validation Accuracy Char: 77.2450, Validation Average Loss: 0.7626
Beam Val Word Accuracy: 43.8045 Correct Prediction : 1909/4358


100%|██████████| 1382/1382 [01:20<00:00, 17.26it/s]
100%|██████████| 137/137 [00:02<00:00, 63.86it/s]
100%|██████████| 4358/4358 [00:43<00:00, 99.88it/s] 


Train Accuracy Char: 78.1988, Train Average Loss: 0.7287
Validation Accuracy Char: 77.5354, Validation Average Loss: 0.7534
Beam Val Word Accuracy: 44.0340 Correct Prediction : 1919/4358


100%|██████████| 1382/1382 [01:20<00:00, 17.25it/s]
100%|██████████| 137/137 [00:02<00:00, 62.62it/s]
100%|██████████| 4358/4358 [00:43<00:00, 99.58it/s] 

Train Accuracy Char: 78.3119, Train Average Loss: 0.7240
Validation Accuracy Char: 77.4763, Validation Average Loss: 0.7539
Beam Val Word Accuracy: 44.2864 Correct Prediction : 1930/4358





## Store Prediction result

In [20]:
def store_results(data_type, words, translations, predictions, results):
    """
    This function saves the evaluation results to a CSV file.

    Args:
        data_type (str): The type of data used for evaluation (e.g., 'val', 'test').
        words (list): List of source words (without start/end tokens).
        translations (list): List of reference translations (without start/end tokens).
        predictions (list): List of predicted translated sequences (without start/end tokens).
        results (list): List of 'Yes' or 'No' indicating correct/incorrect predictions.
    """

    # Create a dictionary to store the results in a structured format
    log = {
        'Word': words,
        'Translation': translations,
        'Prediction': predictions,
        'Result': results  # 'Yes' for correct, 'No' for incorrect
    }
    
    # Construct the file path for the CSV file
    path = '/kaggle/working/predictions.csv'

    # Create a Pandas DataFrame from the dictionary
    data_frame = pd.DataFrame(log)

    # Save the DataFrame to a CSV file (header=True includes column names, index=False excludes row index)
    data_frame.to_csv(path, header=True, index=False)
    
    # Log to wandb
    wandb.init(project='DA6401_A3_partB', name='Prediction_Store')

    wandb.log({'Prediction_table': wandb.Table(dataframe= data_frame)})

    wandb.finish()

## TEST ACCURACY

In [8]:

test_acc = 0
correct_pred = 0
words_test = [] 
translations_test = [] 
predictions_test = []
results_test = []

for i in tqdm(range(df_test.shape[0])):
    input_seq = df_test.iloc[i, 0][:-1] 
    true_seq = df_test.iloc[i, 1][1:-1]
    predicted_output = beam_search(model, input_seq, max_length, input_char_to_int, output_char_to_int, output_int_to_char, beam_width, length_penalty, cell_type)
    words_test.append(input_seq)
    translations_test.append(true_seq)
    predictions_test.append(predicted_output[:-1])
    if true_seq == predicted_output[:-1]:
        correct_pred += 1
        results_test.append('Yes')
    else:
        results_test.append('No')

test_acc = 100 * correct_pred / df_test.shape[0]   

print(f'Test Accuracy Word Level: {test_acc}, Correctly Predicted: {correct_pred}')
wandb.init(project='DA6401_A3_partB', name='bestmodel_test')

wandb.log({ "val_accuracy_word" : acc,
            "test_accuracy_word" : test_acc
            })

wandb.finish()
# store_results('test', words_test, translations_test, predictions_test, results_test)

100%|██████████| 4502/4502 [00:44<00:00, 100.16it/s]


Test Accuracy Word Level: 42.514438027543314, Correctly Predicted: 1914


0,1
test_accuracy_word,▁
val_accuracy_word,▁

0,1
test_accuracy_word,42.51444
val_accuracy_word,44.28637


## HEATMAP

In [14]:
def predict(model, input_seq, input_char_index, output_char_index, reverse_target_char_index):
    model.eval()
    if len(input_seq) > input_max_len+1:
        print("Input Length is exceeding max length!!!!")
        return ""
    
    # Create np array of zero of length i/p 
    input_data = np.zeros((input_max_len+1, 1), dtype=int) # (N,1)
    
    # Encode the input
    for idx, char in enumerate(input_seq):
        input_data[idx, 0] = input_char_index[char]
    input_data[idx+1, 0] = input_char_index["$"]
    
    # Convert to tensor
    input_tensor = torch.tensor(input_data, dtype=torch.int64).to(device) # N,1
    
    with torch.no_grad():
        if cell_type == "LSTM":
            encoder_outputs, hidden_state, cell = model.encoder(input_tensor)
            hidden_state =  hidden_state[:model.decoder.num_layers]
            cell =  cell[:model.decoder.num_layers]
        
        else:
            encoder_outputs, hidden_state = model.encoder(input_tensor)
            hidden_state =  hidden_state[:model.decoder.num_layers]
    
    output_text = []
    output_start_token = output_char_index['^'] # SOS token
    output_start_token_tensor = torch.tensor([output_start_token]).to(device)
    
    attentions = torch.zeros(input_max_len + 1, 1, input_max_len + 1)
    #decoder_attentions = torch.zeros(29, 29)
    for i in range(1, len(output_char_index)):
        if cell_type == "LSTM":
            output, hidden_state, cell, attention = model.decoder(output_start_token_tensor, encoder_outputs, hidden_state, cell)
        else:
            output, hidden_state, attention = model.decoder(output_start_token_tensor, encoder_outputs, hidden_state, None)
        
        #print(attention)
        predicted_char = reverse_target_char_index[output.argmax(1).item()]
        attentions[i] = attention
        #decoder_attentions[i] = attention.data
        if predicted_char != '$':
            output_text.append(predicted_char)
        else:
            break
        output_start_token_tensor = torch.tensor([output.argmax(1)]).to(device)


    return ''.join(output_text), attentions[:i + 1]

In [None]:
def plot_attention_grid(sentences, translations, attentions, figsize=(15, 15)):
    fig, axes = plt.subplots(4, 3, figsize=figsize)
    fig.suptitle('Attention Matrix Grid', fontsize=18, fontweight='bold', y=0.95)
    for i in range(10):  # Changed to 10 examples
        sentence = list(sentences[i])
        translation = list(translations[i])
        attention = attentions[i][:len(translation), :len(sentence)].squeeze(1).detach().numpy()
        ax = axes.flat[i]
        im = ax.matshow(attention, cmap='plasma')  # Using 'plasma' colormap for better visualization
        ax.set_xticks(np.arange(len(sentence)))
        ax.set_xticklabels(sentence, size=10, fontweight='bold')  # Bold font for input sequence
        ax.set_yticks(np.arange(len(translation)))
        hindi_font = FontProperties(fname='/kaggle/input/hindi-font/TiroDevanagariHindi-Regular.ttf')
        ax.set_yticklabels(translation, size=10, fontproperties=hindi_font, fontweight='bold')  # Bold font for output sequence
        ax.set_xlabel('Input Sequence', fontsize=12, fontweight='bold')  # Bold font for axis labels
        ax.set_ylabel('Output Sequence', fontsize=12, fontweight='bold')  # Bold font for axis labels
        ax.grid(visible=True)  # Showing grid 
        fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)  # Adding colorbar with adjusted position
        
    for ax in axes.flat[10:]:  # Changed to start hiding from the 11th subplot
        ax.axis('off')      
    fig.tight_layout(rect=[0, 0, 1, 0.93])  # Adjusting layout to avoid overlap with title
    wandb.log({"Attention Images": wandb.Image(fig)})
    plt.show()

# Get random samples from the test dataset
import random

# Set a random seed for reproducibility (optional)
random.seed(42)

# Get total number of rows in test dataset
total_rows = len(df_test)

# Randomly select 10 indices from the dataset
random_indices = random.sample(range(total_rows), 10)  # Changed from 9 to 10

inputs = []
outputs = []
attentions = []

# Use the random indices to fetch data
for idx in random_indices:
    input_seq = df_test.iloc[idx, 0][:-1]  # Extract input sequence from DataFrame
    output_seq = df_test.iloc[idx, 1][:-1]
    predicted_output, attention = predict(model, input_seq, input_char_to_int, output_char_to_int, output_int_to_char)
    attention = attention[:, :, :(len(input_seq))]
    
    inputs.append(input_seq)
    outputs.append(output_seq)
    attentions.append(attention)
    print(f"Processing example {idx+1}/{total_rows}: '{input_seq}' → '{output_seq}'")

# Initialize wandb and plot the attention grid
wandb.init(project='DA6401_A3_partB', name='Random_Attention_HeatMap_10')
plot_attention_grid(inputs, outputs, attentions)
wandb.finish()