## IMPORT LIBRARIES

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from tqdm import tqdm
import heapq
import csv

import numpy as np
import random
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import pandas as pd
import wandb

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



# DATA ANALYSIS AND PREPROCESSING

In [5]:
def data_load(path):
    # input - English
    # output - മലയാളം (Malayalam)
    df = pd.read_csv(path,header=None)
    input_data = df[1].tolist()
    output_data = df[0].tolist()
    return input_data, output_data
def create_char_set(train, val):
    char_set = set()
    for word in train:
        for char in word:
            char_set.add(char)
    for word in val:
        for char in word:
            char_set.add(char)
    return char_set


In [6]:
train_input, train_output = data_load("/content/drive/MyDrive/DA6401_Assignment-3/dakshina_dataset_v1.0/dakshina_dataset_v1.0/ml/lexicons/output.csv")
val_input, val_output = data_load("/content/drive/MyDrive/DA6401_Assignment-3/dakshina_dataset_v1.0/dakshina_dataset_v1.0/ml/lexicons/output_2.csv")
test_input, test_output = data_load("/content/drive/MyDrive/DA6401_Assignment-3/dakshina_dataset_v1.0/dakshina_dataset_v1.0/ml/lexicons/output_1.csv")
print("Number of training samples: ", len(train_input))
print("Number of validation samples: ", len(val_input))
print("Number of test samples: ", len(test_input))

Number of training samples:  58382
Number of validation samples:  5641
Number of test samples:  5610


In [7]:
def create_char_set(train, val):
    char_set = set()
    for word in train:
        # Convert word to string to handle potential float values
        for char in str(word):
            char_set.add(char)
    for word in val:
        # Convert word to string to handle potential float values
        for char in str(word):
            char_set.add(char)
    return char_set
eng_chars = create_char_set(train_input, val_input)
print("Total English characters: ",len(eng_chars))
print(sorted(eng_chars))
mal_chars = create_char_set(train_output, val_output)
print("Total Malayalam characters: ",len(mal_chars))
print(sorted(mal_chars))

Total English characters:  26
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Total Malayalam characters:  71
[' ', 'ം', 'ഃ', 'അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ', 'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ', 'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന', 'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'റ', 'ല', 'ള', 'ഴ', 'വ', 'ശ', 'ഷ', 'സ', 'ഹ', 'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', '്', 'ൗ', 'ൺ', 'ൻ', 'ർ', 'ൽ', 'ൾ', '\u200c']


In [8]:
max_seq_eng = len(max((str(word) for word in train_input + val_input + test_input), key=len))
max_seq_mal = len(max((str(word) for word in train_output + val_output + test_output), key=len))
print("Length of the longest English word in corpus:",max_seq_eng)
print("Length of the longest Malayalam word in corpus::",max_seq_mal)
#max_seq_eng = len(max(train_input+val_input+test_input, key=len))
#max_seq_mal = len(max(train_output+val_output+test_output, key=len))
#print("Length of the longest English word in corpus:",max_seq_eng)
#print("Length of the longest Malayalam word in corpus::",max_seq_mal)

Length of the longest English word in corpus: 32
Length of the longest Malayalam word in corpus:: 31


In [9]:
eng_chars_idx = {char: idx + 3 for idx, char in enumerate(sorted(eng_chars))}
eng_chars_idx['0'] = 0 # padding
eng_chars_idx['\t'] = 1 # <SOW>
eng_chars_idx['\n'] = 2 # <EOW>
print(eng_chars_idx)
mal_chars_idx = {char: idx+3 for idx, char in enumerate(sorted(mal_chars))}
mal_chars_idx['0'] = 0 # padding
mal_chars_idx['\t'] = 1 # <SOW>
mal_chars_idx['\n'] = 2 # <EOW>
print(mal_chars_idx)

{'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28, '0': 0, '\t': 1, '\n': 2}
{' ': 3, 'ം': 4, 'ഃ': 5, 'അ': 6, 'ആ': 7, 'ഇ': 8, 'ഈ': 9, 'ഉ': 10, 'ഊ': 11, 'ഋ': 12, 'എ': 13, 'ഏ': 14, 'ഐ': 15, 'ഒ': 16, 'ഓ': 17, 'ഔ': 18, 'ക': 19, 'ഖ': 20, 'ഗ': 21, 'ഘ': 22, 'ങ': 23, 'ച': 24, 'ഛ': 25, 'ജ': 26, 'ഝ': 27, 'ഞ': 28, 'ട': 29, 'ഠ': 30, 'ഡ': 31, 'ഢ': 32, 'ണ': 33, 'ത': 34, 'ഥ': 35, 'ദ': 36, 'ധ': 37, 'ന': 38, 'പ': 39, 'ഫ': 40, 'ബ': 41, 'ഭ': 42, 'മ': 43, 'യ': 44, 'ര': 45, 'റ': 46, 'ല': 47, 'ള': 48, 'ഴ': 49, 'വ': 50, 'ശ': 51, 'ഷ': 52, 'സ': 53, 'ഹ': 54, 'ാ': 55, 'ി': 56, 'ീ': 57, 'ു': 58, 'ൂ': 59, 'ൃ': 60, 'െ': 61, 'േ': 62, 'ൈ': 63, 'ൊ': 64, 'ോ': 65, '്': 66, 'ൗ': 67, 'ൺ': 68, 'ൻ': 69, 'ർ': 70, 'ൽ': 71, 'ൾ': 72, '\u200c': 73, '0': 0, '\t': 1, '\n': 2}


In [10]:
idx2char_mal = {idx: char for char, idx in mal_chars_idx.items()}
mal_embedd_size = 29
eng_embedd_size = 32


In [11]:
def data_preprocess(data, max_seq, chars_idx):
    # Add start & end tokens and padding
    # sow = "\t" & eow = "\n"
    sow = "\t"
    eow = "\n"
    # Explicitly convert word to string before concatenation
    padded_data = [sow + str(word) + "0" * (max_seq - len(str(word))) + eow for word in data]
    # Convert sequences to indices
    seq2idx = torch.LongTensor([[chars_idx[char] for char in seq] for seq in padded_data])
    return seq2idx

In [12]:
train_idx_eng = data_preprocess(train_input, max_seq_eng, eng_chars_idx)
train_idx_mal = data_preprocess(train_output, max_seq_mal, mal_chars_idx)
val_idx_eng = data_preprocess(val_input, max_seq_eng, eng_chars_idx)
val_idx_mal = data_preprocess(val_output, max_seq_mal, mal_chars_idx)
test_idx_eng = data_preprocess(test_input, max_seq_eng, eng_chars_idx)
test_idx_mal = data_preprocess(test_output, max_seq_mal, mal_chars_idx)

In [13]:
from torch.utils.data import DataLoader
class Dataset():
    def __init__(self, train_idx_src, train_idx_tgt):
        self.train_idx_src = train_idx_src
        self.train_idx_tgt = train_idx_tgt

    def __len__(self):
        return len(self.train_idx_src)

    def __getitem__(self, idx):
        src_sample = self.train_idx_src[idx]
        tgt_label = self.train_idx_tgt[idx]
        return src_sample, tgt_label

# Assuming train_idx_src and train_idx_tgt are lists or arrays
train_dataset = Dataset(train_idx_eng, train_idx_mal)
val_dataset = Dataset(val_idx_eng, val_idx_mal)
test_dataset = Dataset(test_idx_eng, test_idx_mal)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=True)

Encoder part

In [14]:
# Encoder part
class Encoder(nn.Module):
    def __init__(self,
                 input_dim = 32,
                 emb_dim = 256,
                 enc_hid_dim = 256,
                 cell_type='gru',
                 num_layers=2,
                 dropout = 0,
                 #bidirectional = True # Keep commented out for the non-attention sweep
                 ):

        super(Encoder, self).__init__()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.enc_hid_dim = enc_hid_dim
        self.num_layers = num_layers
        # Embedding the input
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.cell_type = cell_type
        # Add dropout
        self.dropout = nn.Dropout(dropout)

        # Bidirectional part is commented out, so assume unidirectional (1 direction)
        self.val_direction = 1 # Initialize val_direction unconditionally to 1

        # If you uncomment bidirectional later, you'll need to modify this logic
        # if bidirectional:
        #     self.val_direction = 2
        # else :
        #     self.val_direction = 1


        if cell_type.lower() == 'rnn':
            self.rnn = nn.RNN(input_size = emb_dim,
                              hidden_size = enc_hid_dim,
                              num_layers=num_layers,
                              dropout = dropout,
                              #bidirectional= False, # Should be False if not a param
                              batch_first=True)
        elif cell_type.lower() == 'lstm':
            self.rnn = nn.LSTM(input_size = emb_dim,
                              hidden_size = enc_hid_dim,
                              num_layers=num_layers,
                              dropout = dropout,
                              #bidirectional= False, # Should be False if not a param
                              batch_first=True)
        elif cell_type.lower() == 'gru':
            self.rnn = nn.GRU(input_size = emb_dim,
                              hidden_size = enc_hid_dim,
                              num_layers=num_layers,
                              dropout = dropout,
                              #bidirectional= False, # Should be False if not a param
                              batch_first=True)

    def forward(self, src, hidden, cell=None):
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)

        if self.cell_type == 'lstm':
            # nn.LSTM returns output, (h_n, c_n)
            output,(hidden,cell) = self.rnn(embedded, (hidden,cell))
        else:
            # nn.RNN and nn.GRU return output, h_n
            output, hidden = self.rnn(embedded, hidden)
            cell = None # Ensure cell is None when not LSTM

        # output shape is (batch_size, seq_len, hidden_size * num_directions) if batch_first=True
        # hidden/cell shapes are (num_layers * num_directions, batch_size, hidden_size)
        return output, hidden, cell

Decoder part

In [15]:
# Decoder part
class Decoder(nn.Module):
    def __init__(self,
                 output_dim = 29,
                 emb_dim = 256,
                 dec_hid_dim = 256,
                 cell_type='gru',
                 num_layers=2,
                 dropout = 0,
                 #bidirectional = True, # Keep commented if not using bidirectional
                 attention = False,
                 attention_dim = None
                 ):

        super(Decoder, self).__init__()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Embedding part
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.attention = attention
        # Dropout to add onto embedded input
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type

        # Define val_direction - assuming not using bidirectional decoder
        self.val_direction = 1

        #if bidirectional :
            #self.val_direction = 2
        #else :
           # self.val_direction = 1

        # Linear layer to get the output
        self.W1 = nn.Linear(dec_hid_dim * self.val_direction, output_dim)
        # Softmax layer
        self.softmax = F.softmax # This is not used in the forward pass when CrossEntropyLoss is used

        if attention:
            self.attention_dim = attention_dim
            # Need to define self.input_size and self.hidden_dimension etc if using attention
            # Assuming input_size is emb_dim and hidden_dimension is dec_hid_dim for this context
            self.input_size = emb_dim # Example definition, adjust if needed
            self.hidden_dimension = dec_hid_dim # Example definition, adjust if needed
            self.attention_out_dimension = self.hidden_dimension # Example definition, adjust if needed
            # If attention is concatenated to the input embedding, the input size to the RNN/GRU changes
            # This input size definition was likely for the attention calculation itself, not the RNN input.
            # The input to the RNN/GRU should be emb_dim + context_vector_dim if attention is concatenated to input.
            # Or, if attention is used to modify the hidden state, the input size is just emb_dim.
            # Based on your Decoder forward, attention output (context) is concatenated to the embedded output.
            # So, if attention, the input size to RNN/GRU should be emb_dim + attention_out_dimension (which seems to be dec_hid_dim).
            rnn_input_size = emb_dim
            if attention:
                 # Assuming context vector size is dec_hid_dim * self.val_direction if derived from encoder_outputs
                 # based on calculate_attention logic and the linear layer in Seq2Seq.
                 # Let's check calculate_attention function again.
                 # The calculate_attention function returns normalized_context_vector, which is (batch_size, 1, hidden_size*num_directions) if encoder_outputs are (batch_size, seq_len, hidden_size*num_directions)
                 # Based on the code, encoder_outputs are (batch_size, seq_len, enc_hid_dim * val_direction) from Encoder's batch_first output
                 # calculate_attention permutes it to (seq_len, batch_size, ...), operates, and returns (batch_size, 1, hidden_size*num_directions)
                 # So context vector size is enc_hid_dim * val_direction.
                 # If concatenated, rnn_input_size = emb_dim + enc_hid_dim * val_direction
                 rnn_input_size = emb_dim + self.encoder.enc_hid_dim * self.encoder.val_direction # Assuming encoder is accessible and has val_direction attribute

            self.U = nn.Sequential(nn.Linear( self.hidden_dimension, self.hidden_dimension), nn.LeakyReLU())
            self.W = nn.Sequential(nn.Linear( self.hidden_dimension, self.hidden_dimension), nn.LeakyReLU())
            self.V = nn.Sequential(nn.Linear( self.hidden_dimension, self.attention_out_dimension), nn.LeakyReLU())


        # Determine the input size for the RNN/GRU based on whether attention is used for concatenation
        rnn_input_size = emb_dim
        if attention:
             # If context is concatenated to the input embedding before the RNN/GRU
             # Need to get the size of the context vector from the calculate_attention logic
             # Based on Seq2Seq and Decoder forward, context comes from encoder_outputs (batch_size, seq_len, enc_hid_dim * val_direction)
             # calculate_attention output is (batch_size, 1, enc_hid_dim * val_direction)
             # So context size is enc_hid_dim * val_direction
             # Assuming encoder is an attribute of the Seq2Seq model, not directly available in Decoder init.
             # You might need to pass encoder config or context size to the Decoder if attention is used.
             # Let's assume for now the context size is enc_hid_dim * self.encoder.val_direction (if accessible)
             # Or, if attention_dim is intended for the context size, use that.
             # Let's use attention_dim for now, assuming that's the intended size of the context vector.
             # However, based on the calculate_attention structure, the context vector size is tied to encoder output dim.
             # Let's pass encoder_hid_dim and encoder_val_direction to Decoder init if attention is True.
             # For now, let's assume attention_dim is the correct size to add.
             rnn_input_size = emb_dim + self.attention_dim


        if cell_type.lower() == 'rnn':
            self.rnn = nn.RNN(input_size=rnn_input_size, # Use rnn_input_size
                              hidden_size = dec_hid_dim,
                              num_layers=num_layers,
                              dropout = dropout,
                              #bidirectional= bidirectional, # Keep commented if not using bidirectional
                              batch_first=True)
        elif cell_type.lower() == 'lstm':
            self.rnn = nn.LSTM(input_size=rnn_input_size, # Use rnn_input_size
                               hidden_size = dec_hid_dim,
                               num_layers=num_layers,
                               dropout = dropout,
                               #bidirectional= bidirectional, # Keep commented if not using bidirectional
                               batch_first=True)
        elif cell_type.lower() == 'gru':
            self.rnn = nn.GRU(input_size=rnn_input_size, # Use rnn_input_size
                              hidden_size = dec_hid_dim,
                              num_layers=num_layers,
                              dropout = dropout,
                              #bidirectional= bidirectional, # Keep commented if not using bidirectional
                              batch_first=True)

        # self.fc_out = nn.Linear(dec_hid_dim, output_dim) # This layer is not used in the forward pass


    # The calculate_attention function needs to be a method of the class if it uses self attributes, or a standalone function
    # If it's a method, it needs 'self' as the first parameter.
    # Based on its structure, it seems intended to be a standalone helper function or a method needing more context (like encoder_outputs structure)
    # Let's move it outside the class definition for now, assuming it's a helper.
    # Or, more likely, it should be integrated into the Decoder forward method directly if it's simple attention.
    # If it's a complex attention module, it should be a separate nn.Module.
    # Given it's defined *inside* the Decoder class but without 'self', it's syntactically incorrect.
    # Let's remove the misplaced function definition for now. If attention is needed, it must be correctly implemented.
    # Assuming for the "no attention" sweep this function isn't called anyway.

    # Removed the misplaced calculate_attention function definition here


    def forward(self, input, hidden, cell=None,encoder_outputs=None):
#         Incorporate dropout in embedding.
        # input shape: (batch_size, 1) if processing one token at a time
        embedded = self.embedding(input) # embedded shape: (batch_size, 1, emb_dim)
        output = self.dropout(embedded) # output shape: (batch_size, 1, emb_dim)

        attention_weights = None
#         If we are using attention, then we need to concatenate the context vector, which we obtain from attention

        if self.attention and encoder_outputs is not None:
            # The calculate_attention function needs to be defined correctly.
            # Assuming the helper function calculate_attention is defined globally or imported.
            # Need to ensure hidden and encoder_outputs shapes match what calculate_attention expects.
            # Based on calculate_attention's assumed signature: calculate_attention(hidden, encoder_outputs)
            # hidden shape (from Seq2Seq): (num_layers, batch_size, hidden_size) -> Needs permuting/reshaping for calculate_attention?
            # encoder_outputs shape (from Seq2Seq): (batch_size, seq_len, hidden_size * num_directions) -> Needs permuting/reshaping for calculate_attention?
            # Let's assume for now that the calculate_attention function (wherever it is) expects:
            # hidden: (batch_size, hidden_size) - the hidden state for the *current* time step (single layer, single direction)
            # encoder_outputs: (batch_size, src_seq_len, enc_hidden_size * enc_directions)
            # This implies significant reshaping and potentially selecting the last layer's hidden state before passing to attention.
            # However, the current Decoder forward is processing one token at a time. The 'hidden' state passed is the full state (num_layers, batch_size, hidden_size).
            # If using Bahdanau/Luong style attention, the hidden state from the *previous* decoder step is used with encoder outputs.
            # Let's assume `calculate_attention` is meant to take the current decoder hidden state `hidden` (possibly reshaped or last layer)
            # and `encoder_outputs`. The `context` returned should be `(batch_size, 1, context_size)`.

            # Example call assuming calculate_attention expects (batch_size, dec_hid_dim) and (batch_size, src_len, enc_hid_dim * enc_directions)
            # This requires getting the appropriate hidden state from the full 'hidden' tensor.
            # For a single-layer decoder, hidden is (1, batch_size, dec_hid_dim). For multi-layer, might use top layer.
            # Let's simplify for now and assume a simple attention mechanism that might require reshaping.
            # If `calculate_attention` was meant to be a simple dot product or similar, it should operate on appropriate shapes.
            # Let's stick to fixing the RNN type error first and note the attention part needs proper implementation later.

            # Assuming calculate_attention is corrected and returns (batch_size, 1, context_size)
            # context, attention_weights = calculate_attention(...) # Call the corrected attention function
            # output = torch.cat((output, context), 2) # Concatenate context to the input embedding


            # **Temporarily skip attention logic for fixing the RNN type error**
            pass # Do nothing if attention is False

        # output shape is now (batch_size, 1, rnn_input_size) - where rnn_input_size = emb_dim or emb_dim + context_size


        if self.cell_type == 'lstm':
            # self.rnn is nn.LSTM, expects (input, (h_0, c_0))
            rnn_output, (hidden, cell) = self.rnn(output, (hidden, cell)) # Pass tuple for LSTM
        else: # Covers 'rnn' and 'gru'
            # self.rnn is nn.RNN or nn.GRU, expects (input, h_0)
            rnn_output, hidden = self.rnn(output, hidden) # Pass single tensor for RNN/GRU
            cell = None # Ensure cell is None when not LSTM

        # rnn_output shape is (batch_size, 1, dec_hid_dim * val_direction)

        # Apply the final linear layer to get logits
        # Squeeze the sequence length dimension (size 1) before the linear layer
        output_logits = self.W1(rnn_output.squeeze(1)) # output_logits shape: (batch_size, output_dim)


        return output_logits, hidden, cell, attention_weights # Return logits, updated states, attention weights

# Moved calculate_attention outside the class or defined elsewhere if needed
# def calculate_attention(hidden, encoder_outputs, ...):
#     # ... implementation ...
#     return context, attention_weights


# The train1 and calc_test_acc functions should be the corrected versions from the previous turn.
# They should obtain vocab_size using model.decoder.W1.out_features
# and use reshape(-1, vocab_size) and reshape(-1) for loss calculation,
# and torch.argmax(outputs, dim=2) for accuracy calculation.

Seq2Seq Model

In [16]:
# Seq2Seq Model
class Seq2Seq(nn.Module):

    def __init__(self,
                 encoder,
                 decoder,
                 # dec_inp_dim = 29, # This parameter is not strictly needed if we get output_dim from decoder
                 enc_hid_dim = 256, # Parameter needed for linear transformation layers
                 dec_hid_dim =256, # Parameter needed for linear transformation layers
                 #bidirectional = True, # Keep commented for now
                 enc_num_layers = 3, # Parameter needed for linear transformation layers
                 dec_num_layers = 2, # Parameter needed for linear transformation layers
                 cell_type = 'lstm', # Parameter needed for conditional logic and state transformation
                 dropout = 0.2, # Parameter potentially needed elsewhere in Seq2Seq (though typically in Encoder/Decoder)
                 attention = False # Parameter needed to control attention logic
                ):


        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Decoder input dimension
        # self.dec_inp_dim = dec_inp_dim # Removed as it's not used for predictions tensor size

        # Store the parameters needed for state transformation layers
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.enc_num_layers = enc_num_layers
        self.dec_num_layers = dec_num_layers
        self.cell_type = cell_type
        self.dropout = dropout # Storing, but not directly used in Seq2Seq forward here
        self.attention = attention # Storing, used in Seq2Seq forward

        # Initialize val_direction (assuming unidirectional if bidirectional is commented)
        self.val_direction = 1

        self.softmax = F.softmax # Not used in forward with CrossEntropyLoss

        # If attention is used, then we need to transform encoder's last hidden to decoder's first hidden
        # Correct the input dimension for the linear transformation based on the flattened encoder hidden state
        # Input size: num_layers * num_directions * hidden_size of encoder
        # Output size: num_layers * num_directions * hidden_size of decoder
        self.enc_dec_linear1 = nn.Linear(self.enc_hid_dim * self.val_direction * self.enc_num_layers,
                                         self.dec_hid_dim * self.val_direction * self.dec_num_layers)


        # Linear layer to transform encoder's last cell to decoder's first cell (only for LSTM)
        if self.cell_type == 'lstm':
             # Correct the input dimension for the linear transformation based on the flattened encoder cell state
             self.enc_dec_cell_linear1 = nn.Linear(self.enc_hid_dim * self.val_direction * self.enc_num_layers,
                                                  self.dec_hid_dim * self.val_direction * self.dec_num_layers)


        # Get the global max_seq_mal value for target sequence length
        global max_seq_mal
        self.target_seq_len = max_seq_mal + 2 # Store the correct target sequence length


    def forward(self, source, target, teacher_forcing = False, is_training=False):
        batch_size = source.shape[0]

        # Initialize initial states for the encoder
        # Correct shape: (num_layers * num_directions, batch_size, hidden_size)
        encoder_initial_hidden = torch.zeros(self.encoder.num_layers * self.encoder.val_direction,
                                             batch_size,
                                             self.encoder.enc_hid_dim,
                                             device=device)
        encoder_initial_cell = torch.zeros(self.encoder.num_layers * self.encoder.val_direction,
                                           batch_size,
                                           self.encoder.enc_hid_dim,
                                           device=device) if self.encoder.cell_type == 'lstm' else None # Use encoder's cell_type


        # Pass the full source sequence through the encoder
        # encoder_output: (batch_size, seq_len, hidden_size * num_directions) if batch_first=True
        # last_state: (num_layers * num_directions, batch_size, hidden_size)
        # cell_state: (num_layers * num_directions, batch_size, hidden_size) for LSTM
        encoder_output, last_state, cell_state = self.encoder(source, encoder_initial_hidden, encoder_initial_cell)


        # If attention is used, `encoder_outputs` for attention are the outputs from the encoder at each time step
        if self.attention:
             encoder_outputs = encoder_output # Shape: (batch_size, seq_len, hidden_size * num_directions)
        else:
             encoder_outputs = None # Explicitly None if attention is off


        # Encoder's last state is decoders first state (after transformation)
        # last_state is (enc_num_layers * val_direction, batch_size, enc_hid_dim)

        # Transform encoder's last hidden state to decoder's first hidden state
        # Use the last hidden state from the encoder across all layers if applicable
        # Reshape last_state to (batch_size, enc_num_layers * val_direction * enc_hid_dim) before linear transformation
        last_state_reshaped = last_state.permute(1, 0, 2).reshape(batch_size, -1) # Shape: (batch_size, enc_num_layers * val_direction * enc_hid_dim)

        # Apply the linear transformation
        decoder_hidden_reshaped = self.enc_dec_linear1(last_state_reshaped) # Shape: (batch_size, dec_num_layers * val_direction * dec_hid_dim)

        # Reshape back to (dec_num_layers * val_direction, batch_size, dec_hid_dim) for the decoder
        # Note: The reshape should match the target shape (num_layers * num_directions, batch_size, hidden_size)
        decoder_hidden = decoder_hidden_reshaped.reshape(batch_size, self.dec_num_layers * self.val_direction, self.dec_hid_dim).permute(1, 0, 2) # Shape: (dec_num_layers * val_direction, batch_size, dec_hid_dim)


        # Here also, encoders last cell is decoders first cell, also transform to same dimension (for LSTM)
        if  self.cell_type == 'lstm': # Use Seq2Seq's cell_type for this transformation
            cell_state_reshaped = cell_state.permute(1, 0, 2).reshape(batch_size, -1) # Shape: (batch_size, enc_num_layers * val_direction * enc_hid_dim)
            decoder_cell_reshaped = self.enc_dec_cell_linear1(cell_state_reshaped) # Shape: (batch_size, dec_num_layers * val_direction * dec_hid_dim)
            # Reshape back to (dec_num_layers * val_direction, batch_size, dec_hid_dim) for the decoder
            decoder_cell_state = decoder_cell_reshaped.reshape(batch_size, self.dec_num_layers * self.val_direction, self.dec_hid_dim).permute(1, 0, 2) # Shape: (dec_num_layers * val_direction, batch_size, dec_hid_dim)
        else:
            decoder_cell_state = None # Ensure cell_state is None for RNN/GRU


        # Initialize predictions and attention_weights
        # Get the correct output dimension from the decoder's final linear layer (assuming W1 is the final layer)
        target_output_dim = self.decoder.W1.out_features # Use W1 as it's used in decoder forward

        # Use the stored target sequence length
        predictions = torch.zeros(batch_size, self.target_seq_len, target_output_dim, device = device)

        # Attention weights shape: (batch_size, target_seq_len, source_seq_len)
        # Need global max_seq_eng here
        global max_seq_eng
        attention_weights = torch.zeros(batch_size, self.target_seq_len, max_seq_eng + 2, device = device) if self.attention else None # Source seq len is max_seq_eng + SOW + EOW

        # Initialize the first input to the decoder. This should be the <SOW> token (index 1).
        # Use mal_chars_idx['\t'] for the SOW index
        decoder_input = torch.full((batch_size, 1), mal_chars_idx['\t'], dtype=torch.long, device=device)


        # Do decoding by char by char fashion by batch
        # The loop should run self.target_seq_len times
        for t in range(self.target_seq_len): # Loop over the correct target sequence length

            # Pass the current hidden state and cell state (if LSTM) to the decoder
            decoder_output, decoder_hidden, cell_state, attention_wts = self.decoder(
                decoder_input,
                decoder_hidden, # Pass the updated hidden state from the previous step
                cell_state, # Pass the updated cell state from the previous step (will be None for RNN/GRU)
                encoder_outputs # Pass encoder outputs for attention
            )

            # Store the prediction (logits) for the current time step
            # decoder_output shape from Decoder forward: (batch_size, output_dim) after squeeze(1)
            predictions[:, t, :] = decoder_output # decoder_output is already squeezed in Decoder forward


            if self.attention and attention_wts is not None:
                # Store attention weights if attention is used
                # attention_wts shape from calculate_attention: (batch_size, source_seq_len)
                attention_weights[:, t, :] = attention_wts # Store attention weights for this decoding step


            # Determine the input for the next time step
            # Teacher forcing should only happen if t is within the bounds of the target sequence
            if teacher_forcing and is_training and t < self.target_seq_len - 1:
                # Teacher forcing: use the actual target token as input to the decoder
                # Target shape is (batch_size, target_seq_len). Input needs to be (batch_size, 1).
                decoder_input = target[:, t].unsqueeze(1)
            else:
                # Without teacher forcing or during inference: use the decoder's predicted token from the current time step as input for the next step
                # Get the predicted token index (argmax) from the logits
                predicted_token = torch.argmax(decoder_output, dim=-1) # shape: (batch_size)
                decoder_input = predicted_token.unsqueeze(1).detach() # shape: (batch_size, 1), Detach from graph for the next input


        # Return predictions and attention weights
        # predictions shape: (batch_size, target_seq_len, output_dim)
        # attention_weights shape: (batch_size, target_seq_len, source_seq_len)
        return predictions, attention_weights

Training and Accuracy

In [17]:
# Training and Accuracy
def train1(model, train_loader, val_loader, epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    optimizer = optim.Adam(model.parameters())
    # Use Ignore Index for padding token if needed, but 0 is padding and shouldn't be in target labels usually
    # If padding token 0 should not contribute to loss/accuracy, use ignore_index=0 in CrossEntropyLoss
    # Assuming padding token 0 *is* part of the target sequence for fixed length.
    criterion = nn.CrossEntropyLoss()
    model.train()

    # Get the vocabulary size from the model's decoder's output layer
    vocab_size = model.decoder.W1.out_features # Assuming W1 is the final output linear layer


    for epoch in tqdm(range(epochs)):
        for phase in ['train', 'val']:
            if phase == 'train':
                data_loader = train_loader
                model.train() # Set model to training mode
            else:
                data_loader = val_loader
                model.eval() # Set model to evaluation mode

            running_loss = 0.0
            running_corrects = 0
            total_tokens = 0


            for inputs, labels in data_loader:
                inputs = inputs.to(device)
                labels = labels.to(device) # labels shape: (batch_size, target_seq_len)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    # outputs shape: (batch_size, target_seq_len, vocab_size)
                    outputs, _ = model(inputs, labels, epoch < epochs/2, phase == 'train')

                    # Reshape outputs for CrossEntropyLoss
                    # Expected shape for CrossEntropyLoss: (N, C) and (N) where C is vocab_size
                    # outputs_reshaped: (batch_size * target_seq_len, vocab_size)
                    outputs_reshaped = outputs.reshape(-1, vocab_size)

                    # Reshape labels for CrossEntropyLoss
                    # labels_reshaped: (batch_size * target_seq_len)
                    labels_reshaped = labels.reshape(-1)

                    # Calculate loss
                    loss = criterion(outputs_reshaped, labels_reshaped)

                    if phase == 'train':
                        loss.backward()
                        nn.utils.clip_grad_norm_(model.parameters(), 1)
                        optimizer.step()


                running_loss += loss.item() * inputs.size(0) # Scale loss by batch size

                # Calculate accuracy - comparing predicted token indices with target token indices
                # preds shape: (batch_size, target_seq_len)
                preds = torch.argmax(outputs, dim=2) # Get the index of the highest logit for each token

                # Count correct predictions
                running_corrects += torch.sum(preds == labels).item() # Count total correctly predicted tokens in the batch
                total_tokens += labels.numel() # Count total tokens in the batch (batch_size * target_seq_len)


            epoch_loss = running_loss / len(data_loader.dataset) # Average loss per sample
            epoch_acc = running_corrects / total_tokens # Accuracy across all tokens in the epoch

            print(f'Epoch no: {epoch}')
            if phase == 'train':
                print(f'Train loss: {epoch_loss:.4f} \t Train Accuracy: {epoch_acc:.4f}')
                wandb.log({ 'Epoch': epoch, 'train_accuracy': epoch_acc * 100})
                wandb.log({ 'Epoch': epoch, 'train_loss': epoch_loss})
            else:
                print(f'Validation loss: {epoch_loss:.4f} \t Validation Accuracy: {epoch_acc:.4f}')
                wandb.log({ 'Epoch': epoch, 'validation_accuracy': epoch_acc * 100})
                wandb.log({ 'Epoch': epoch, 'validation_loss': epoch_loss})


# Test Accuracy calculation
# Assuming compute_score is not available or needed for token-wise accuracy
def calc_test_acc(model, loader):
    model.eval()
    total_loss = 0
    running_corrects = 0
    total_tokens = 0
    criterion = nn.CrossEntropyLoss() # Using CrossEntropyLoss directly on logits

    # Get the vocabulary size from the model's decoder's output layer
    vocab_size = model.decoder.W1.out_features # Assuming W1 is the final output linear layer


    with torch.no_grad(): # No gradient calculation during evaluation
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            # outputs shape: (batch_size, target_seq_len, vocab_size)
            # Pass target=None and teacher_forcing=False, is_training=False for inference
            outputs, _ = model(inputs, None, False, False)

            # Reshape outputs for loss calculation
            outputs_reshaped = outputs.reshape(-1, vocab_size)

            # Reshape targets for loss calculation
            targets_reshaped = targets.reshape(-1)

            # Calculate loss
            loss = criterion(outputs_reshaped, targets_reshaped)
            total_loss += loss.item() * inputs.size(0) # Scale loss by batch size

            # Calculate accuracy - comparing predicted token indices with target token indices
            # preds shape: (batch_size, target_seq_len)
            preds = torch.argmax(outputs, dim=2) # Get the index of the highest logit for each token

            # Count correct predictions
            running_corrects += torch.sum(preds == targets).item() # Count total correctly predicted tokens in the batch
            total_tokens += targets.numel() # Count total tokens in the batch (batch_size * target_seq_len)


    avg_loss = total_loss / len(loader.dataset) # Average loss per sample
    avg_acc = running_corrects / total_tokens # Accuracy across all tokens in the test set

    print(f'Test Loss: {avg_loss:.4f} \t Test Accuracy: {avg_acc:.4f}')

    wandb.log({'Test_accuracy': avg_acc * 100})
    wandb.log({ 'Test_loss': avg_loss})

# WANDB SWEEPS WITHOUT ATTENTION

In [18]:
# sweep config file
sweep_config = {
    'method': 'bayes',
    'name' : 'sweep - no attention',
    'metric': {
      'goal': 'maximize',
      'name': 'validation_accuracy'
    },
    'parameters':{
        'input_embedding_size': {
            'values': [64, 128] # 16,32,64,
        },
        'enc_layers': {
            'values': [1,2,3]
        },
        'dec_layers': {
            'values': [1,2,3]
        },
        'hidden_size': {
            'values': [64, 128, 256]
        },
        'cell_type': {
            'values': ['lstm','rnn','gru']
        #},
        #'bidirectional' : {
        #   'values' : [True]
        },
        'dropout': {
            'values': [0.1, 0.2, 0.3]
        },
        'beam_size' : {
            'values' : [1,3,5]
        }
     }
}

# Create a sweep
sweep_id = wandb.sweep(sweep = sweep_config, entity="mdkarimullahaque-iit-madras", project='DL_Assignment_3') # modeling MA23C021-A3
# f5557fe014798eefe63f9822700bf01578424638

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: qs9elj80
Sweep URL: https://wandb.ai/mdkarimullahaque-iit-madras/DL_Assignment_3/sweeps/qs9elj80


In [None]:
#wandb sweeps without attention - make sure to use your key for running
def main():
  with wandb.init() as run:
    wandb.run.name = f'cell-{wandb.config.cell_type}_hid_sz-{wandb.config.hidden_size}_inp_embed-{wandb.config.input_embedding_size}_enc-{wandb.config.enc_layers}_dec-{wandb.config.dec_layers}_dropout-{wandb.config.dropout}'

    # Get the vocabulary sizes from the global variables
    input_vocab_size = len(eng_chars_idx)
    output_vocab_size = len(mal_chars_idx)

    # Encoder part - Correct parameter names
    encoder = Encoder(
                    input_dim = input_vocab_size, # Use the actual input vocabulary size
                    emb_dim = wandb.config.input_embedding_size, # Map sweep param to emb_dim
                    enc_hid_dim =  wandb.config.hidden_size, # Map sweep param to enc_hid_dim
                    cell_type = wandb.config.cell_type,
                    num_layers = wandb.config.enc_layers, # Map sweep param to num_layers
                    #bidirectional = wandb.config.bidirectional, # Uncomment if bidirectional is added
                    dropout = wandb.config.dropout
                    )
    # Decoder part - Correct parameter names
    decoder = Decoder(
                        output_dim = output_vocab_size, # Use the actual output vocabulary size
                        emb_dim = wandb.config.input_embedding_size, # Map sweep param to emb_dim
                        dec_hid_dim = wandb.config.hidden_size, # Map sweep param to dec_hid_dim
                        cell_type = wandb.config.cell_type,
                        num_layers = wandb.config.dec_layers, # Map sweep param to num_layers
                        dropout = wandb.config.dropout,
                        #bidirectional = wandb.config.bidirectional, # Uncomment if bidirectional is added
                        attention = False,
                        attention_dim = wandb.config.hidden_size # This param might only be needed if attention is True
                        )
    # Init model - Correct parameter names if they exist in Seq2Seq __init__
    # Note: Seq2Seq __init__ now takes encoder and decoder objects directly.
    # The following parameters are passed to Seq2Seq: enc_hid_dim, dec_hid_dim,
    # enc_num_layers, dec_num_layers, cell_type, dropout, attention.
    model1 = Seq2Seq(encoder = encoder, # Pass the instantiated encoder object
                     decoder = decoder, # Pass the instantiated decoder object
                    enc_hid_dim = wandb.config.hidden_size, # Map sweep param
                    dec_hid_dim = wandb.config.hidden_size, # Map sweep param
                    #bidirectional = wandb.config.bidirectional, # Uncomment if bidirectional is added
                    enc_num_layers = wandb.config.enc_layers, # Map sweep param
                    dec_num_layers = wandb.config.dec_layers, # Map sweep param
                    cell_type = wandb.config.cell_type, # Map sweep param
                    dropout = wandb.config.dropout, # Map sweep param
                    attention = False
                    # beam_width is not a parameter of Seq2Seq __init__ based on your definition
                    # device is also not a parameter of Seq2Seq __init__
                )

    model1.to(device)

    epochs = 15
    # train1 function signature is train1(model, train_loader, val_loader, epochs)
    # The extra 'beam' argument seems incorrect based on the train1 definition
    train1(model1, train_loader, val_loader, epochs) # Removed extra argument

wandb.agent(sweep_id, function = main, count = 50) # calls main function for count number of times
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: fhul7g4i with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 128
[34m[1mwandb[0m: Currently logged in as: [33mmdkarimullahaque[0m ([33mmdkarimullahaque-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch no: 0
Train loss: 1.2506 	 Train Accuracy: 0.7252


  7%|▋         | 1/15 [01:27<20:30, 87.92s/it]

Epoch no: 0
Validation loss: 1.2046 	 Validation Accuracy: 0.7184
Epoch no: 1
Train loss: 0.7740 	 Train Accuracy: 0.7939


 13%|█▎        | 2/15 [02:46<17:54, 82.66s/it]

Epoch no: 1
Validation loss: 1.1830 	 Validation Accuracy: 0.7330
Epoch no: 2
Train loss: 0.6895 	 Train Accuracy: 0.8166


 20%|██        | 3/15 [04:06<16:13, 81.13s/it]

Epoch no: 2
Validation loss: 1.2786 	 Validation Accuracy: 0.7139
Epoch no: 3
Train loss: 0.6277 	 Train Accuracy: 0.8327


 27%|██▋       | 4/15 [05:27<14:51, 81.08s/it]

Epoch no: 3
Validation loss: 1.2470 	 Validation Accuracy: 0.7324
Epoch no: 4
Train loss: 0.5706 	 Train Accuracy: 0.8468


 33%|███▎      | 5/15 [06:46<13:23, 80.31s/it]

Epoch no: 4
Validation loss: 1.2161 	 Validation Accuracy: 0.7437
Epoch no: 5
Train loss: 0.5132 	 Train Accuracy: 0.8606


 40%|████      | 6/15 [08:05<11:59, 79.93s/it]

Epoch no: 5
Validation loss: 1.1670 	 Validation Accuracy: 0.7585
Epoch no: 6
Train loss: 0.4584 	 Train Accuracy: 0.8736


 47%|████▋     | 7/15 [09:24<10:36, 79.62s/it]

Epoch no: 6
Validation loss: 1.0964 	 Validation Accuracy: 0.7815
Epoch no: 7
Train loss: 0.4123 	 Train Accuracy: 0.8850


 53%|█████▎    | 8/15 [10:51<09:34, 82.08s/it]

Epoch no: 7
Validation loss: 1.0510 	 Validation Accuracy: 0.7938
Epoch no: 8
Train loss: 0.7702 	 Train Accuracy: 0.7924


 60%|██████    | 9/15 [12:10<08:06, 81.05s/it]

Epoch no: 8
Validation loss: 0.6341 	 Validation Accuracy: 0.8210
Epoch no: 9
Train loss: 0.6753 	 Train Accuracy: 0.8060


 67%|██████▋   | 10/15 [13:29<06:42, 80.54s/it]

Epoch no: 9
Validation loss: 0.5924 	 Validation Accuracy: 0.8299
Epoch no: 10
Train loss: 0.6389 	 Train Accuracy: 0.8139


 73%|███████▎  | 11/15 [14:49<05:20, 80.24s/it]

Epoch no: 10
Validation loss: 0.5698 	 Validation Accuracy: 0.8340
Epoch no: 11
Train loss: 0.6123 	 Train Accuracy: 0.8194


 80%|████████  | 12/15 [16:07<03:58, 79.53s/it]

Epoch no: 11
Validation loss: 0.5454 	 Validation Accuracy: 0.8389
Epoch no: 12
Train loss: 0.5910 	 Train Accuracy: 0.8241


 87%|████████▋ | 13/15 [17:27<02:39, 79.62s/it]

Epoch no: 12
Validation loss: 0.5239 	 Validation Accuracy: 0.8448
Epoch no: 13
Train loss: 0.5723 	 Train Accuracy: 0.8287


 93%|█████████▎| 14/15 [18:47<01:19, 79.74s/it]

Epoch no: 13
Validation loss: 0.5101 	 Validation Accuracy: 0.8468
Epoch no: 14
Train loss: 0.5567 	 Train Accuracy: 0.8322


100%|██████████| 15/15 [20:05<00:00, 80.37s/it]

Epoch no: 14
Validation loss: 0.4978 	 Validation Accuracy: 0.8512





0,1
Epoch,▁▁▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▄▅▅▅▅▅▅▅▅▆▆▇▇▇▇▇▇▇▇██
train_accuracy,▁▄▅▆▆▇▇█▄▅▅▅▅▆▆
train_loss,█▄▃▃▂▂▁▁▄▃▃▃▂▂▂
validation_accuracy,▁▂▁▂▃▃▄▅▆▇▇▇███
validation_loss,▇▇██▇▇▆▆▂▂▂▁▁▁▁

0,1
Epoch,14.0
train_accuracy,83.22433
train_loss,0.55674
validation_accuracy,85.11601
validation_loss,0.49781


[34m[1mwandb[0m: Agent Starting Run: szf6fgu6 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch no: 0
Train loss: 0.9561 	 Train Accuracy: 0.7487


  7%|▋         | 1/15 [05:27<1:16:18, 327.05s/it]

Epoch no: 0
Validation loss: 1.2153 	 Validation Accuracy: 0.7482
Epoch no: 1
Train loss: 0.6758 	 Train Accuracy: 0.8037


 13%|█▎        | 2/15 [10:46<1:09:54, 322.63s/it]

Epoch no: 1
Validation loss: 1.1494 	 Validation Accuracy: 0.7326
Epoch no: 2
Train loss: 0.5569 	 Train Accuracy: 0.8339


 20%|██        | 3/15 [16:05<1:04:09, 320.81s/it]

Epoch no: 2
Validation loss: 1.0773 	 Validation Accuracy: 0.7549
Epoch no: 3
Train loss: 0.4609 	 Train Accuracy: 0.8586


 27%|██▋       | 4/15 [21:22<58:34, 319.48s/it]  

Epoch no: 3
Validation loss: 0.9939 	 Validation Accuracy: 0.7949
Epoch no: 4
Train loss: 0.3849 	 Train Accuracy: 0.8795


 33%|███▎      | 5/15 [26:43<53:21, 320.13s/it]

Epoch no: 4
Validation loss: 0.9899 	 Validation Accuracy: 0.7986
Epoch no: 5
Train loss: 0.3287 	 Train Accuracy: 0.8956


 40%|████      | 6/15 [32:08<48:14, 321.60s/it]

Epoch no: 5
Validation loss: 0.9042 	 Validation Accuracy: 0.8144
Epoch no: 6
Train loss: 0.2863 	 Train Accuracy: 0.9084


 47%|████▋     | 7/15 [37:34<43:03, 322.91s/it]

Epoch no: 6
Validation loss: 0.8744 	 Validation Accuracy: 0.8197
Epoch no: 7
Train loss: 0.2530 	 Train Accuracy: 0.9187


 53%|█████▎    | 8/15 [42:58<37:44, 323.50s/it]

Epoch no: 7
Validation loss: 0.8108 	 Validation Accuracy: 0.8545
Epoch no: 8
Train loss: 0.5670 	 Train Accuracy: 0.8232


 60%|██████    | 9/15 [48:21<32:18, 323.10s/it]

Epoch no: 8
Validation loss: 0.4686 	 Validation Accuracy: 0.8506
Epoch no: 9
Train loss: 0.5394 	 Train Accuracy: 0.8298


 67%|██████▋   | 10/15 [53:41<26:51, 322.36s/it]

Epoch no: 9
Validation loss: 0.5525 	 Validation Accuracy: 0.8248
Epoch no: 10
Train loss: 0.4357 	 Train Accuracy: 0.8573


 73%|███████▎  | 11/15 [59:02<21:27, 321.90s/it]

Epoch no: 10
Validation loss: 0.4122 	 Validation Accuracy: 0.8571
Epoch no: 11
Train loss: 0.3858 	 Train Accuracy: 0.8743


 80%|████████  | 12/15 [1:04:22<16:03, 321.32s/it]

Epoch no: 11
Validation loss: 0.3657 	 Validation Accuracy: 0.8885
Epoch no: 12
Train loss: 0.3591 	 Train Accuracy: 0.8835


 87%|████████▋ | 13/15 [1:09:43<10:42, 321.15s/it]

Epoch no: 12
Validation loss: 0.3517 	 Validation Accuracy: 0.8932
Epoch no: 13
Train loss: 0.3367 	 Train Accuracy: 0.8917


 93%|█████████▎| 14/15 [1:15:00<05:20, 320.04s/it]

Epoch no: 13
Validation loss: 0.3777 	 Validation Accuracy: 0.8775
Epoch no: 14
Train loss: 0.4020 	 Train Accuracy: 0.8665


100%|██████████| 15/15 [1:20:16<00:00, 321.11s/it]

Epoch no: 14
Validation loss: 0.3770 	 Validation Accuracy: 0.8743





0,1
Epoch,▁▁▁▁▁▂▂▃▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇███
train_accuracy,▁▃▅▆▆▇██▄▄▅▆▇▇▆
train_loss,█▅▄▃▂▂▁▁▄▄▃▂▂▂▂
validation_accuracy,▂▁▂▄▄▅▅▆▆▅▆██▇▇
validation_loss,█▇▇▆▆▅▅▅▂▃▁▁▁▁▁

0,1
Epoch,14.0
train_accuracy,86.65487
train_loss,0.40196
validation_accuracy,87.42701
validation_loss,0.377


[34m[1mwandb[0m: Agent Starting Run: j29i91xo with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 128


  0%|          | 0/15 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "<ipython-input-16-5b44e9432a02>", line 55, in main
    train1(model1, train_loader, val_loader, epochs) # Removed extra argument
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-14-1a4a35a7af27>", line 37, in train1
    outputs, _ = model(inputs, labels, epoch < epochs/2, phase == 'train')
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-13-2d0e953aaefd>", line 137, in forward
    decoder_output, decoder_hidden, cell_state, attention_wts = self.decoder(
  

[34m[1mwandb[0m: [32m[41mERROR[0m Run j29i91xo errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-16-5b44e9432a02>", line 55, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     train1(model1, train_loader, val_loader, epochs) # Removed extra argument
[34m[1mwandb[0m: [32m[41mERROR[0m     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-14-1a4a35a7af27>", line 37, in train1
[34m[1mwandb[0m: [32m[41mERROR[0m     outputs, _ = model(inputs, labels, epoch < epochs/2, phase == 'train')
[34m[1mwandb[0m: [32m[41mERROR[0m                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch no: 0
Train loss: 0.9718 	 Train Accuracy: 0.7479


  7%|▋         | 1/15 [09:34<2:14:06, 574.78s/it]

Epoch no: 0
Validation loss: 1.2607 	 Validation Accuracy: 0.7016
Epoch no: 1
Train loss: 0.6666 	 Train Accuracy: 0.8059


 13%|█▎        | 2/15 [19:09<2:04:33, 574.90s/it]

Epoch no: 1
Validation loss: 1.1466 	 Validation Accuracy: 0.7405
Epoch no: 2
Train loss: 0.4856 	 Train Accuracy: 0.8532


 20%|██        | 3/15 [28:48<1:55:17, 576.43s/it]

Epoch no: 2
Validation loss: 1.0152 	 Validation Accuracy: 0.8005
Epoch no: 3
Train loss: 0.3642 	 Train Accuracy: 0.8863


 27%|██▋       | 4/15 [38:17<1:45:08, 573.52s/it]

Epoch no: 3
Validation loss: 0.8928 	 Validation Accuracy: 0.8317
Epoch no: 4
Train loss: 0.2797 	 Train Accuracy: 0.9113


 33%|███▎      | 5/15 [47:41<1:35:00, 570.09s/it]

Epoch no: 4
Validation loss: 0.8426 	 Validation Accuracy: 0.8250
Epoch no: 5
Train loss: 0.2283 	 Train Accuracy: 0.9274


 40%|████      | 6/15 [57:09<1:25:26, 569.60s/it]

Epoch no: 5
Validation loss: 0.7764 	 Validation Accuracy: 0.8419
Epoch no: 6
Train loss: 0.1918 	 Train Accuracy: 0.9390


 47%|████▋     | 7/15 [1:06:40<1:15:59, 569.89s/it]

Epoch no: 6
Validation loss: 0.6635 	 Validation Accuracy: 0.8874
Epoch no: 7
Train loss: 0.1638 	 Train Accuracy: 0.9487


 53%|█████▎    | 8/15 [1:16:10<1:06:29, 569.98s/it]

Epoch no: 7
Validation loss: 0.6833 	 Validation Accuracy: 0.8875
Epoch no: 8
Train loss: 0.5336 	 Train Accuracy: 0.8339


 60%|██████    | 9/15 [1:25:44<57:07, 571.32s/it]  

Epoch no: 8
Validation loss: 0.4339 	 Validation Accuracy: 0.8529
Epoch no: 9
Train loss: 0.3757 	 Train Accuracy: 0.8776


 67%|██████▋   | 10/15 [1:35:15<47:35, 571.07s/it]

Epoch no: 9
Validation loss: 0.3310 	 Validation Accuracy: 0.8976
Epoch no: 10
Train loss: 0.3601 	 Train Accuracy: 0.8829


 73%|███████▎  | 11/15 [1:44:47<38:06, 571.53s/it]

Epoch no: 10
Validation loss: 0.4654 	 Validation Accuracy: 0.8663
Epoch no: 11
Train loss: 0.3211 	 Train Accuracy: 0.8942


 80%|████████  | 12/15 [1:54:12<28:28, 569.39s/it]

Epoch no: 11
Validation loss: 0.3035 	 Validation Accuracy: 0.9130
Epoch no: 12
Train loss: 0.2745 	 Train Accuracy: 0.9076


 87%|████████▋ | 13/15 [2:03:35<18:55, 567.67s/it]

Epoch no: 12
Validation loss: 0.3517 	 Validation Accuracy: 0.8959
Epoch no: 13
Train loss: 0.2702 	 Train Accuracy: 0.9147


 93%|█████████▎| 14/15 [2:12:55<09:25, 565.23s/it]

Epoch no: 13
Validation loss: 0.2845 	 Validation Accuracy: 0.9147
Epoch no: 14
Train loss: 0.2280 	 Train Accuracy: 0.9282


100%|██████████| 15/15 [2:22:17<00:00, 569.18s/it]

Epoch no: 14
Validation loss: 0.2743 	 Validation Accuracy: 0.9191





0,1
Epoch,▁▁▁▁▁▂▃▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇███
train_accuracy,▁▃▅▆▇▇██▄▆▆▆▇▇▇
train_loss,█▅▄▃▂▂▁▁▄▃▃▂▂▂▂
validation_accuracy,▁▂▄▅▅▆▇▇▆▇▆█▇██
validation_loss,█▇▆▅▅▅▄▄▂▁▂▁▂▁▁

0,1
Epoch,14.0
train_accuracy,92.81721
train_loss,0.22805
validation_accuracy,91.90881
validation_loss,0.27426


[34m[1mwandb[0m: Agent Starting Run: 7fm42vm1 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch no: 0
Train loss: 0.9977 	 Train Accuracy: 0.7396


  7%|▋         | 1/15 [11:31<2:41:17, 691.23s/it]

Epoch no: 0
Validation loss: 1.1914 	 Validation Accuracy: 0.7450
Epoch no: 1
Train loss: 0.6985 	 Train Accuracy: 0.7982


 13%|█▎        | 2/15 [22:59<2:29:23, 689.49s/it]

Epoch no: 1
Validation loss: 1.1493 	 Validation Accuracy: 0.7401
Epoch no: 2
Train loss: 0.5109 	 Train Accuracy: 0.8459


 20%|██        | 3/15 [34:37<2:18:42, 693.54s/it]

Epoch no: 2
Validation loss: 1.0100 	 Validation Accuracy: 0.7953
Epoch no: 3
Train loss: 0.3788 	 Train Accuracy: 0.8814


 27%|██▋       | 4/15 [46:09<2:07:02, 692.98s/it]

Epoch no: 3
Validation loss: 0.8550 	 Validation Accuracy: 0.8405
Epoch no: 4
Train loss: 0.2903 	 Train Accuracy: 0.9077


 33%|███▎      | 5/15 [57:48<1:55:51, 695.14s/it]

Epoch no: 4
Validation loss: 0.8260 	 Validation Accuracy: 0.8345
Epoch no: 5
Train loss: 0.2306 	 Train Accuracy: 0.9265


 40%|████      | 6/15 [1:09:41<1:45:09, 701.05s/it]

Epoch no: 5
Validation loss: 0.7494 	 Validation Accuracy: 0.8501
Epoch no: 6
Train loss: 0.1925 	 Train Accuracy: 0.9392


 47%|████▋     | 7/15 [1:21:25<1:33:37, 702.17s/it]

Epoch no: 6
Validation loss: 0.6484 	 Validation Accuracy: 0.8952
Epoch no: 7
Train loss: 0.1618 	 Train Accuracy: 0.9495


 53%|█████▎    | 8/15 [1:33:16<1:22:14, 704.95s/it]

Epoch no: 7
Validation loss: 0.6154 	 Validation Accuracy: 0.9031
Epoch no: 8
Train loss: 0.4432 	 Train Accuracy: 0.8617


 60%|██████    | 9/15 [1:44:54<1:10:15, 702.61s/it]

Epoch no: 8
Validation loss: 0.3332 	 Validation Accuracy: 0.8994
Epoch no: 9
Train loss: 0.3504 	 Train Accuracy: 0.8890


 67%|██████▋   | 10/15 [1:56:38<58:36, 703.23s/it] 

Epoch no: 9
Validation loss: 0.3293 	 Validation Accuracy: 0.9012
Epoch no: 10
Train loss: 0.3011 	 Train Accuracy: 0.9051


 73%|███████▎  | 11/15 [2:08:28<47:01, 705.30s/it]

Epoch no: 10
Validation loss: 0.3044 	 Validation Accuracy: 0.9082
Epoch no: 11
Train loss: 0.2725 	 Train Accuracy: 0.9146


 80%|████████  | 12/15 [2:20:03<35:05, 701.95s/it]

Epoch no: 11
Validation loss: 0.2946 	 Validation Accuracy: 0.9137
Epoch no: 12
Train loss: 0.2325 	 Train Accuracy: 0.9271


 87%|████████▋ | 13/15 [2:31:44<23:23, 701.79s/it]

Epoch no: 12
Validation loss: 0.3216 	 Validation Accuracy: 0.9034
Epoch no: 13
Train loss: 0.2142 	 Train Accuracy: 0.9332


 93%|█████████▎| 14/15 [2:43:20<11:39, 699.89s/it]

Epoch no: 13
Validation loss: 0.2757 	 Validation Accuracy: 0.9210
Epoch no: 14
Train loss: 0.1944 	 Train Accuracy: 0.9395


100%|██████████| 15/15 [2:54:54<00:00, 699.62s/it]

Epoch no: 14
Validation loss: 0.2542 	 Validation Accuracy: 0.9278





0,1
Epoch,▁▁▁▁▁▂▃▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇███
train_accuracy,▁▃▅▆▇▇██▅▆▇▇▇▇█
train_loss,█▅▄▃▂▂▁▁▃▃▂▂▂▁▁
validation_accuracy,▁▁▃▅▅▅▇▇▇▇▇▇▇██
validation_loss,██▇▅▅▅▄▄▂▂▁▁▂▁▁

0,1
Epoch,14.0
train_accuracy,93.95398
train_loss,0.19439
validation_accuracy,92.7753
validation_loss,0.2542


[34m[1mwandb[0m: Agent Starting Run: 5uqbcmb8 with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch no: 0
Train loss: 1.0267 	 Train Accuracy: 0.7328


  7%|▋         | 1/15 [11:31<2:41:24, 691.78s/it]

Epoch no: 0
Validation loss: 1.2134 	 Validation Accuracy: 0.7205
Epoch no: 1
Train loss: 0.6794 	 Train Accuracy: 0.8064


 13%|█▎        | 2/15 [22:59<2:29:20, 689.25s/it]

Epoch no: 1
Validation loss: 1.1350 	 Validation Accuracy: 0.7498
Epoch no: 2
Train loss: 0.4690 	 Train Accuracy: 0.8583


 20%|██        | 3/15 [34:24<2:17:31, 687.61s/it]

Epoch no: 2
Validation loss: 0.9208 	 Validation Accuracy: 0.8238
Epoch no: 3
Train loss: 0.3317 	 Train Accuracy: 0.8959


 27%|██▋       | 4/15 [45:53<2:06:06, 687.86s/it]

Epoch no: 3
Validation loss: 0.8025 	 Validation Accuracy: 0.8555
Epoch no: 4
Train loss: 0.2585 	 Train Accuracy: 0.9188


 33%|███▎      | 5/15 [57:26<1:54:58, 689.88s/it]

Epoch no: 4
Validation loss: 0.6857 	 Validation Accuracy: 0.8814
Epoch no: 5
Train loss: 0.2007 	 Train Accuracy: 0.9363


 40%|████      | 6/15 [1:09:02<1:43:47, 691.91s/it]

Epoch no: 5
Validation loss: 0.6727 	 Validation Accuracy: 0.8774
Epoch no: 6
Train loss: 0.1784 	 Train Accuracy: 0.9436


 47%|████▋     | 7/15 [1:20:39<1:32:29, 693.64s/it]

Epoch no: 6
Validation loss: 0.6150 	 Validation Accuracy: 0.8894
Epoch no: 7
Train loss: 0.1651 	 Train Accuracy: 0.9495


 53%|█████▎    | 8/15 [1:32:23<1:21:18, 696.86s/it]

Epoch no: 7
Validation loss: 0.5981 	 Validation Accuracy: 0.9014
Epoch no: 8
Train loss: 0.4408 	 Train Accuracy: 0.8634


 60%|██████    | 9/15 [1:43:53<1:09:28, 694.77s/it]

Epoch no: 8
Validation loss: 0.3256 	 Validation Accuracy: 0.9033
Epoch no: 9
Train loss: 0.3305 	 Train Accuracy: 0.8936


 67%|██████▋   | 10/15 [1:55:32<57:59, 695.93s/it] 

Epoch no: 9
Validation loss: 0.3047 	 Validation Accuracy: 0.9119
Epoch no: 10
Train loss: 0.2848 	 Train Accuracy: 0.9100


 73%|███████▎  | 11/15 [2:07:11<46:27, 696.99s/it]

Epoch no: 10
Validation loss: 0.2889 	 Validation Accuracy: 0.9137
Epoch no: 11
Train loss: 0.2648 	 Train Accuracy: 0.9171


 80%|████████  | 12/15 [2:18:53<34:55, 698.56s/it]

Epoch no: 11
Validation loss: 0.2717 	 Validation Accuracy: 0.9190
Epoch no: 12
Train loss: 0.2340 	 Train Accuracy: 0.9273


 87%|████████▋ | 13/15 [2:30:28<23:14, 697.39s/it]

Epoch no: 12
Validation loss: 0.2923 	 Validation Accuracy: 0.9100


# Test Accuracy calculation

In [22]:

def calc_test_acc(model, loader):
    model.eval()
    total_loss = 0
    total_score = 0
    criterion = nn.CrossEntropyLoss()

    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs, _ = model(inputs, None, False, False)

        # Calculate accuracy
        preds = torch.argmax(F.softmax(outputs, dim=2), dim=2).T
        total_score += compute_score(preds, targets)

        # Reshape outputs and targets for loss calculation
        outputs = outputs.permute(1, 0, 2).reshape(-1, 72)
        targets = F.one_hot(targets, num_classes=72).float().reshape(-1, 72)

        # Calculate loss
        loss = criterion(outputs, targets)
        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    avg_score = total_score / len(loader.dataset)

    print(f'Test Loss: {avg_loss} \t Test Accuracy: {avg_score}')

    wandb.log({'Test_accuracy': avg_score * 100})
    wandb.log({ 'Test_loss': avg_loss})



In [None]:
# Best hyperparameter configuration without attention
'''
input embedding size: 256
number of encoder layers: 3
number of decoder layers: 2 -> This should be 3 based on the error and the code below
hidden layer size: 256
cell type: LSTM
#bidirectional: True
dropout: 0.2 -> Using 0.3 based on the code below
beam width : 1
'''

# Instantiate the Encoder and Decoder first
# Use len(eng_chars_idx) and len(mal_chars_idx) for input_dim and output_dim respectively
# These global variables are available from previous code blocks.
encoder_best = Encoder(
    input_dim=len(eng_chars_idx), # Use the size of the English character index as input_dim
    emb_dim=256,
    enc_hid_dim=256,
    cell_type='lstm',
    num_layers=3,
    dropout=0.3 # Using 0.3 as in the original code block
    # Remove bidirectional if not used
)

decoder_best = Decoder(
    output_dim=len(mal_chars_idx), # Use the size of the Malayalam character index as output_dim
    emb_dim=256,
    dec_hid_dim=256,
    cell_type='lstm',
    num_layers=3, # Changed from 2 to 3 to match the Seq2Seq instantiation below
    dropout=0.3, # Using 0.3 as in the original code block
    # Remove bidirectional if not used
    attention=False # No attention for this model
)

# Instantiate Seq2Seq with the encoder and decoder objects and correct parameter names
# The Seq2Seq __init__ takes encoder and decoder objects directly, plus some config parameters.
best_model = Seq2Seq(
    encoder=encoder_best, # Pass the instantiated encoder object
    decoder=decoder_best, # Pass the instantiated decoder object
    enc_hid_dim=256, # Pass encoder hidden dim
    dec_hid_dim=256, # Pass decoder hidden dim
    #bidirectional = True, # Uncomment if bidirectional is used
    enc_num_layers=3, # Pass encoder num layers
    dec_num_layers=3, # Changed from 2 to 3 to match decoder_best.num_layers
    cell_type='lstm', # Pass cell type
    dropout=0.3, # Pass dropout
    attention=False # Pass attention flag
    # Remove beam_width and device as they are not arguments for Seq2Seq __init__
)

best_model.to(device)
epochs = 20

# Wrap the training call for the best_model in a wandb.init() block
# Give it a name so it appears separately in your dashboard
with wandb.init(project='DL_Assignment_3', entity="mdkarimullahaque-iit-madras", name='Best_Model_Training') as run:
    # Ensure the train1 function call matches its definition: train1(model, train_loader, val_loader, epochs)
    train1(best_model, train_loader, val_loader, epochs) # Removed the extra 'False' argument

# If you also want to calculate and log test accuracy after training the best model
# Add the call to calc_test_acc within the same wandb.init block or another one
# using the trained best_model
with wandb.init(project='DL_Assignment_3', entity="mdkarimullahaque-iit-madras", name='Best_Model_Test_Accuracy') as run:
    calc_test_acc(best_model, test_loader)

[34m[1mwandb[0m: Currently logged in as: [33mmdkarimullahaque[0m ([33mmdkarimullahaque-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/20 [00:00<?, ?it/s]

Epoch no: 0
Train loss: 1.0156 	 Train Accuracy: 0.7457


  5%|▌         | 1/20 [17:51<5:39:22, 1071.74s/it]

Epoch no: 0
Validation loss: 1.1797 	 Validation Accuracy: 0.7429
Epoch no: 1
Train loss: 0.7104 	 Train Accuracy: 0.8067


 10%|█         | 2/20 [35:44<5:21:45, 1072.53s/it]

Epoch no: 1
Validation loss: 1.2888 	 Validation Accuracy: 0.7427
Epoch no: 2
Train loss: 0.5701 	 Train Accuracy: 0.8433


 15%|█▌        | 3/20 [53:52<5:05:48, 1079.32s/it]

Epoch no: 2
Validation loss: 1.2967 	 Validation Accuracy: 0.7544
Epoch no: 3
Train loss: 0.4085 	 Train Accuracy: 0.8856


 20%|██        | 4/20 [1:11:49<4:47:36, 1078.52s/it]

Epoch no: 3
Validation loss: 1.0384 	 Validation Accuracy: 0.8169
Epoch no: 4
Train loss: 0.2654 	 Train Accuracy: 0.9234


 25%|██▌       | 5/20 [1:29:56<4:30:22, 1081.49s/it]

Epoch no: 4
Validation loss: 0.8266 	 Validation Accuracy: 0.8618
Epoch no: 5
Train loss: 0.1877 	 Train Accuracy: 0.9452


 30%|███       | 6/20 [1:47:59<4:12:28, 1082.06s/it]

Epoch no: 5
Validation loss: 0.6712 	 Validation Accuracy: 0.8925
Epoch no: 6
Train loss: 0.1472 	 Train Accuracy: 0.9565


 35%|███▌      | 7/20 [2:05:59<3:54:18, 1081.43s/it]

Epoch no: 6
Validation loss: 0.5929 	 Validation Accuracy: 0.9089
Epoch no: 7
Train loss: 0.1191 	 Train Accuracy: 0.9648


 40%|████      | 8/20 [2:24:07<3:36:42, 1083.54s/it]

Epoch no: 7
Validation loss: 0.5673 	 Validation Accuracy: 0.9163
Epoch no: 8
Train loss: 0.1015 	 Train Accuracy: 0.9697


 45%|████▌     | 9/20 [2:42:11<3:18:38, 1083.51s/it]

Epoch no: 8
Validation loss: 0.5485 	 Validation Accuracy: 0.9212
Epoch no: 9
Train loss: 0.0883 	 Train Accuracy: 0.9736


 50%|█████     | 10/20 [3:00:16<3:00:41, 1084.15s/it]

Epoch no: 9
Validation loss: 0.5336 	 Validation Accuracy: 0.9250
Epoch no: 10
Train loss: 0.3150 	 Train Accuracy: 0.9101


 55%|█████▌    | 11/20 [3:18:15<2:42:22, 1082.49s/it]

Epoch no: 10
Validation loss: 0.2642 	 Validation Accuracy: 0.9254
Epoch no: 11
Train loss: 0.2312 	 Train Accuracy: 0.9318


 60%|██████    | 12/20 [3:36:18<2:24:21, 1082.72s/it]

Epoch no: 11
Validation loss: 0.2512 	 Validation Accuracy: 0.9287
Epoch no: 12
Train loss: 0.1990 	 Train Accuracy: 0.9410


 65%|██████▌   | 13/20 [3:54:19<2:06:14, 1082.13s/it]

Epoch no: 12
Validation loss: 0.2391 	 Validation Accuracy: 0.9320
Epoch no: 13
Train loss: 0.1775 	 Train Accuracy: 0.9474


 70%|███████   | 14/20 [4:12:19<1:48:09, 1081.55s/it]

Epoch no: 13
Validation loss: 0.2356 	 Validation Accuracy: 0.9352
Epoch no: 14
Train loss: 0.1573 	 Train Accuracy: 0.9534


 75%|███████▌  | 15/20 [4:30:25<1:30:14, 1082.85s/it]

Epoch no: 14
Validation loss: 0.2291 	 Validation Accuracy: 0.9379
Epoch no: 15
Train loss: 0.1427 	 Train Accuracy: 0.9576


 80%|████████  | 16/20 [4:48:29<1:12:13, 1083.26s/it]

Epoch no: 15
Validation loss: 0.2362 	 Validation Accuracy: 0.9356
Epoch no: 16
Train loss: 0.1331 	 Train Accuracy: 0.9602


 85%|████████▌ | 17/20 [5:06:42<54:18, 1086.27s/it]  

Epoch no: 16
Validation loss: 0.2363 	 Validation Accuracy: 0.9366
Epoch no: 17
Train loss: 0.1236 	 Train Accuracy: 0.9631


 90%|█████████ | 18/20 [5:24:50<36:13, 1086.68s/it]

Epoch no: 17
Validation loss: 0.2368 	 Validation Accuracy: 0.9381


In [19]:
# sweep config file
sweep_config = {
    'method': 'grid',
    'name' : 'testset run',
    'metric': {
      'goal': 'maximize',
      'name': 'test_accuracy'
    },
    'parameters': {
        'beam_size':{
            'values': [1]
        }
    }
}

In [20]:
# Create a sweep
sweep_id = wandb.sweep(sweep = sweep_config, entity="mdkarimullahaque-iit-madras", project='DL_Assignment_3')

Create sweep with ID: iehkqnj9
Sweep URL: https://wandb.ai/mdkarimullahaque-iit-madras/DL_Assignment_3/sweeps/iehkqnj9


In [27]:
# wandb log for test accuracy
def main():
  with wandb.init() as run:
    #run_name = "-f_num_"+str(wandb.config.filters_num)+"-f_num_"+wandb.config.filter_org+"-ac_fn_"+wandb.config.act_fn+\
                #"-b_norm_"+str(wandb.config.batch_norm) + "-bs_"+str(wandb.config.batch_size) +"-neu_num"+str(wandb.config.num_neurons_dense)

    wandb.run.name = "test_set_run"
    calc_test_acc(best_model, test_loader)

wandb.agent(sweep_id, function = main, count = 1)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: 2pkytu97 with config:
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	input_embedding_size: 128


Traceback (most recent call last):
  File "<ipython-input-27-d8d18fff0d25>", line 8, in main
    calc_test_acc(best_model, test_loader)
                  ^^^^^^^^^^
NameError: name 'best_model' is not defined


[34m[1mwandb[0m: [32m[41mERROR[0m Run 2pkytu97 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-27-d8d18fff0d25>", line 8, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     calc_test_acc(best_model, test_loader)
[34m[1mwandb[0m: [32m[41mERROR[0m                   ^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m NameError: name 'best_model' is not defined
[34m[1mwandb[0m: [32m[41mERROR[0m 


# wandb sweeps with attention

In [24]:
# sweep config file
sweep_config = {
    'method': 'bayes',
    'name' : 'sweep - attention',
    'metric': {
      'goal': 'maximize',
      'name': 'validation_accuracy'
    },
    'parameters':{
        'input_embedding_size': {
            'values': [32, 64, 128] # 16,32,64,
        },
        'enc_layers': {
            'values': [1,2,3]
        },
        'dec_layers': {
            'values': [1,2,3]

        },
        'hidden_size': {
            'values': [64, 128, 256]
        },
        'cell_type': {
            'values': ['lstm','rnn','gru']
        #},
        #'bidirectional' : {
           # 'values' : [True]
        },
        'dropout': {
            'values': [0.1, 0.2, 0.3]
        },
        'beam_size' : {
            'values' : [1,3,5]
        }
     }
}

# Create a sweep
sweep_id = wandb.sweep(sweep = sweep_config, entity="mdkarimullahaque-iit-madras", project='DL_Assignment_3')


Create sweep with ID: z8h0iary
Sweep URL: https://wandb.ai/mdkarimullahaque-iit-madras/DL_Assignment_3/sweeps/z8h0iary


In [25]:
#wandb sweeps with attention
def main():
  with wandb.init() as run:
    wandb.run.name = f'cell-{wandb.config.cell_type}_hid_sz-{wandb.config.hidden_size}_inp_embed-{wandb.config.input_embedding_size}_enc-{wandb.config.enc_layers}_dec-{wandb.config.dec_layers}_dropout-{wandb.config.dropout}'

    model1 = Seq2Seq(
        encoder_hidden_dimension = wandb.config.hidden_size,
        decoder_hidden_dimension = wandb.config.hidden_size,
        encoder_embed_dimension =  wandb.config.input_embedding_size,
        decoder_embed_dimension =  wandb.config.input_embedding_size,
        #bidirectional = wandb.config.bidirectional,
        encoder_num_layers = wandb.config.enc_layers,
        decoder_num_layers = wandb.config.dec_layers,
        cell_type = wandb.config.cell_type,
        dropout = wandb.config.dropout,
        beam_width = wandb.config.beam_size,
        device = device,
        attention = True
    )


    model1.to(device)
    beam = False

    epochs = 15
    train1(model1, train_loader, val_loader, epochs, beam)

wandb.agent(sweep_id, function = main, count = 25) # calls main function for count number of times
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: s4sj5kpk with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dec_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	enc_layers: 1
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	input_embedding_size: 64


Traceback (most recent call last):
  File "<ipython-input-25-ec00833857b3>", line 6, in main
    model1 = Seq2Seq(
             ^^^^^^^^
TypeError: Seq2Seq.__init__() got an unexpected keyword argument 'encoder_hidden_dimension'


[34m[1mwandb[0m: [32m[41mERROR[0m Run s4sj5kpk errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-25-ec00833857b3>", line 6, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     model1 = Seq2Seq(
[34m[1mwandb[0m: [32m[41mERROR[0m              ^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m TypeError: Seq2Seq.__init__() got an unexpected keyword argument 'encoder_hidden_dimension'
[34m[1mwandb[0m: [32m[41mERROR[0m 
[34m[1mwandb[0m: Agent Starting Run: 4cr6waye with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dec_layers: 1
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hidden_size: 1

Traceback (most recent call last):
  File "<ipython-input-25-ec00833857b3>", line 6, in main
    model1 = Seq2Seq(
             ^^^^^^^^
TypeError: Seq2Seq.__init__() got an unexpected keyword argument 'encoder_hidden_dimension'


[34m[1mwandb[0m: [32m[41mERROR[0m Run 4cr6waye errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-25-ec00833857b3>", line 6, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     model1 = Seq2Seq(
[34m[1mwandb[0m: [32m[41mERROR[0m              ^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m TypeError: Seq2Seq.__init__() got an unexpected keyword argument 'encoder_hidden_dimension'
[34m[1mwandb[0m: [32m[41mERROR[0m 
[34m[1mwandb[0m: Agent Starting Run: x5bb0sc7 with config:
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: rnn
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	enc_layers: 3
[34m[1mwandb[0m: 	hidden_size: 2

Traceback (most recent call last):
  File "<ipython-input-25-ec00833857b3>", line 6, in main
    model1 = Seq2Seq(
             ^^^^^^^^
TypeError: Seq2Seq.__init__() got an unexpected keyword argument 'encoder_hidden_dimension'


[34m[1mwandb[0m: [32m[41mERROR[0m Run x5bb0sc7 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-25-ec00833857b3>", line 6, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     model1 = Seq2Seq(
[34m[1mwandb[0m: [32m[41mERROR[0m              ^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m TypeError: Seq2Seq.__init__() got an unexpected keyword argument 'encoder_hidden_dimension'
[34m[1mwandb[0m: [32m[41mERROR[0m 
[34m[1mwandb[0m: [32m[41mERROR[0m Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true


In [34]:
# Decoder part
class Decoder(nn.Module):
    def __init__(self,
                 output_dim = 29,
                 emb_dim = 256,
                 dec_hid_dim = 256,
                 cell_type='gru',
                 num_layers=2, # Store this attribute
                 dropout = 0,
                 #bidirectional = True, # Keep commented if not using bidirectional
                 attention = False,
                 attention_dim = None,
                 encoder_hid_dim=256, # Added for attention context size calculation
                 encoder_val_direction=1 # Added for attention context size calculation
                 ):

        super(Decoder, self).__init__()
        device = torch.device('cuda' if torch.cuda.cuda() else 'cpu')
        # Store num_layers as an attribute
        self.num_layers = num_layers # <--- Add this line
        # Embedding part
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.attention = attention
        # Dropout to add onto embedded input
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type
        self.dec_hid_dim = dec_hid_dim # Store dec_hid_dim as well for potential use

        # Define val_direction - assuming not using bidirectional decoder
        self.val_direction = 1

        # Linear layer to get the output
        self.W1 = nn.Linear(dec_hid_dim * self.val_direction, output_dim)
        # Softmax layer
        self.softmax = F.softmax # This is not used in the forward pass when CrossEntropyLoss is used

        # Determine the input size for the RNN/GRU based on whether attention is used for concatenation
        rnn_input_size = emb_dim
        if attention:
             # If context is concatenated to the input embedding before the RNN/GRU
             # Need to get the size of the context vector from the calculate_attention logic
             # Based on Seq2Seq and Decoder forward, context comes from encoder_outputs (batch_size, seq_len, enc_hid_dim * val_direction)
             # calculate_attention output is (batch_size, 1, enc_hid_dim * val_direction)
             # So context size is enc_hid_dim * encoder_val_direction
             rnn_input_size = emb_dim + encoder_hid_dim * encoder_val_direction # Use passed encoder dims


        if cell_type.lower() == 'rnn':
            self.rnn = nn.RNN(input_size=rnn_input_size, # Use rnn_input_size
                              hidden_size = dec_hid_dim,
                              num_layers=num_layers,
                              dropout = dropout,
                              #bidirectional= bidirectional, # Keep commented if not using bidirectional
                              batch_first=True)
        elif cell_type.lower() == 'lstm':
            self.rnn = nn.LSTM(input_size=rnn_input_size, # Use rnn_input_size
                               hidden_size = dec_hid_dim,
                               num_layers=num_layers,
                               dropout = dropout,
                               #bidirectional= bidirectional, # Keep commented if not using bidirectional
                               batch_first=True)
        elif cell_type.lower() == 'gru':
            self.rnn = nn.GRU(input_size=rnn_input_size, # Use rnn_input_size
                              hidden_size = dec_hid_dim,
                              num_layers=num_layers,
                              dropout = dropout,
                              #bidirectional= bidirectional, # Keep commented if not using bidirectional
                              batch_first=True)

        # self.fc_out = nn.Linear(dec_hid_dim, output_dim) # This layer is not used in the forward pass

    # The calculate_attention function needs to be a method of the class if it uses self attributes, or a standalone function
    # If it's a method, it needs 'self' as the first parameter.
    # Based on its structure, it seems intended to be a standalone helper function or a method needing more context (like encoder_outputs structure)
    # Let's move it outside the class definition for now, assuming it's a helper.
    # Or, more likely, it should be integrated into the Decoder forward method directly if it's simple attention.
    # If it's a complex attention module, it should be a separate nn.Module.
    # Given it's defined *inside* the Decoder class but without 'self', it's syntactically incorrect.
    # Let's remove the misplaced function definition for now. If attention is needed, it must be correctly implemented.
    # Assuming for the "no attention" sweep this function isn't called anyway.

    # Removed the misplaced calculate_attention function definition here


    def forward(self, input, hidden, cell=None,encoder_outputs=None):
#         Incorporate dropout in embedding.
        # input shape: (batch_size, 1) if processing one token at a time
        embedded = self.embedding(input) # embedded shape: (batch_size, 1, emb_dim)
        output = self.dropout(embedded) # output shape: (batch_size, 1, emb_dim)

        attention_weights = None
#         If we are using attention, then we need to concatenate the context vector, which we obtain from attention

        if self.attention and encoder_outputs is not None:
            # The calculate_attention function needs to be defined correctly and accessible.
            # Assuming a global calculate_attention or a method call.
            # For now, let's assume a helper function `calculate_attention` exists and works.
            # The function would take the current decoder state(s) and encoder outputs.
            # Example (placeholder):
            # Assuming calculate_attention expects decoder hidden state (batch_size, dec_hid_dim)
            # and encoder outputs (batch_size, src_seq_len, enc_hid_dim * enc_directions)

            # Need to get the last layer's hidden state if num_layers > 1
            # For LSTM hidden is (num_layers, batch_size, hidden_size)
            # If using Bahdanau, attention uses previous decoder hidden state.
            # The `hidden` tensor passed here is the *current* state before the RNN update.
            # If Bahdanau attention: use `hidden` (potentially reshaped)
            # If Luong attention: use the state *after* the RNN update.
            # The original code passes the state *before* the RNN update. Let's assume Bahdanau-like approach.

            # Reshape hidden to (batch_size, num_layers, dec_hid_dim) and take the last layer (index -1)
            # dec_hidden_for_attention = hidden.permute(1, 0, 2)[:, -1, :].squeeze(1) # Shape: (batch_size, dec_hid_dim) if dec_hid_dim matches.

            # The shape expected by calculate_attention should be consistent.
            # Let's assume a simplified attention mechanism that works with the full hidden state for now,
            # or that calculate_attention handles reshaping internally.
            # This is a placeholder; the actual attention implementation logic needs verification.

            # For now, let's assume calculate_attention is called like this (needs actual implementation):
            # context, attention_weights = calculate_attention(hidden, encoder_outputs, self.U, self.W, self.V)
            # Placeholder: assuming context shape is (batch_size, 1, attention_dim) based on concat usage
            # If calculate_attention was meant to return (batch_size, context_size), need to unsqueeze(1)

            # Since calculate_attention is missing, and the sweep is for "no attention",
            # this 'if self.attention' block will not execute, so the error is elsewhere.
            # The error is confirmed to be in Seq2Seq init accessing decoder.num_layers.
            pass # Do nothing if attention is False

        # output shape is now (batch_size, 1, rnn_input_size) - where rnn_input_size = emb_dim or emb_dim + context_size


        if self.cell_type == 'lstm':
            # self.rnn is nn.LSTM, expects (input, (h_0, c_0))
            # output shape: (batch_size, 1, rnn_input_size)
            # hidden shape: (num_layers, batch_size, hidden_size)
            # cell shape: (num_layers, batch_size, hidden_size)
            rnn_output, (hidden, cell) = self.rnn(output, (hidden, cell)) # Pass tuple for LSTM
        else: # Covers 'rnn' and 'gru'
            # self.rnn is nn.RNN or nn.GRU, expects (input, h_0)
            # output shape: (batch_size, 1, rnn_input_size)
            # hidden shape: (num_layers, batch_size, hidden_size)
            rnn_output, hidden = self.rnn(output, hidden) # Pass single tensor for RNN/GRU
            cell = None # Ensure cell is None when not LSTM

        # rnn_output shape is (batch_size, 1, dec_hid_dim * val_direction)

        # Apply the final linear layer to get logits
        # Squeeze the sequence length dimension (size 1) before the linear layer
        output_logits = self.W1(rnn_output.squeeze(1)) # output_logits shape: (batch_size, output_dim)


        return output_logits, hidden, cell, attention_weights # Return logits, updated states, attention weights

# The train1 and calc_test_acc functions should be the corrected versions from the previous turn.
# They should obtain vocab_size using model.decoder.W1.out_features
# and use reshape(-1, vocab_size) and reshape(-1) for loss calculation,
# and torch.argmax(outputs, dim=2) for accuracy calculation.
# %% [markdown]
# Seq2Seq Model
# %%
# Seq2Seq Model
class Seq2Seq(nn.Module):

    def __init__(self,
                 encoder,
                 decoder,
                 # dec_inp_dim = 29, # This parameter is not strictly needed if we get output_dim from decoder
                 enc_hid_dim = 256, # Parameter needed for linear transformation layers
                 dec_hid_dim =256, # Parameter needed for linear transformation layers
                 #bidirectional = True, # Keep commented for now
                 enc_num_layers = 3, # Parameter needed for linear transformation layers
                 dec_num_layers = 2, # Parameter needed for linear transformation layers
                 cell_type = 'lstm', # Parameter needed for conditional logic and state transformation
                 dropout = 0.2, # Parameter potentially needed elsewhere in Seq2Seq (though typically in Encoder/Decoder)
                 attention = False # Parameter needed to control attention logic
                ):


        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        device = torch.device('cuda' if torch.cuda.cuda() else 'cpu')
        # Decoder input dimension
        # self.dec_inp_dim = dec_inp_dim # Removed as it's not used for predictions tensor size

        # Store the parameters needed for state transformation layers
        # Use the attributes from the passed encoder and decoder instances
        self.enc_hid_dim = self.encoder.enc_hid_dim # Use attribute from encoder instance
        self.dec_hid_dim = self.decoder.dec_hid_dim # Use attribute from decoder instance
        self.enc_num_layers = self.encoder.num_layers # Use attribute from encoder instance
        self.dec_num_layers = self.decoder.num_layers # <--- Use attribute from decoder instance
        self.cell_type = self.decoder.cell_type # Use attribute from decoder instance (should match encoder)
        self.dropout = dropout # Storing, but not directly used in Seq2Seq forward here
        self.attention = self.decoder.attention # Use attribute from decoder instance

        # Initialize val_direction (assuming unidirectional if bidirectional is commented)
        # Use the val_direction from the encoder and decoder instances
        self.enc_val_direction = self.encoder.val_direction # Use attribute from encoder instance
        self.dec_val_direction = self.decoder.val_direction # Use attribute from decoder instance
        # Assuming enc_val_direction == dec_val_direction for state transformation
        self.val_direction = self.enc_val_direction


        # If attention is used, then we need to transform encoder's last hidden to decoder's first hidden
        # Correct the input dimension for the linear transformation based on the flattened encoder hidden state
        # Input size: enc_num_layers * enc_val_direction * enc_hid_dim
        # Output size: dec_num_layers * dec_val_direction * dec_hid_dim
        self.enc_dec_linear1 = nn.Linear(self.enc_num_layers * self.enc_val_direction * self.enc_hid_dim,
                                         self.dec_num_layers * self.dec_val_direction * self.dec_hid_dim)


        # Linear layer to transform encoder's last cell to decoder's first cell (only for LSTM)
        # Use the cell_type from the decoder instance
        if self.cell_type == 'lstm':
             # Correct the input dimension for the linear transformation based on the flattened encoder cell state
             self.enc_dec_cell_linear1 = nn.Linear(self.enc_num_layers * self.enc_val_direction * self.enc_hid_dim,
                                                  self.dec_num_layers * self.dec_val_direction * self.dec_hid_dim)


        # Get the global max_seq_mal value for target sequence length
        global max_seq_mal
        self.target_seq_len = max_seq_mal + 2 # Store the correct target sequence length


    def forward(self, source, target, teacher_forcing = False, is_training=False):
        batch_size = source.shape[0]

        # Initialize initial states for the encoder
        # Correct shape: (num_layers * num_directions, batch_size, hidden_size)
        encoder_initial_hidden = torch.zeros(self.encoder.num_layers * self.encoder.val_direction,
                                             batch_size,
                                             self.encoder.enc_hid_dim,
                                             device=device)
        encoder_initial_cell = torch.zeros(self.encoder.num_layers * self.encoder.val_direction,
                                           batch_size,
                                           self.encoder.enc_hid_dim,
                                           device=device) if self.encoder.cell_type == 'lstm' else None # Use encoder's cell_type


        # Pass the full source sequence through the encoder
        # encoder_output: (batch_size, seq_len, hidden_size * num_directions) if batch_first=True
        # last_state: (num_layers * num_directions, batch_size, hidden_size)
        # cell_state: (num_layers * num_directions, batch_size, hidden_size) for LSTM
        encoder_output, last_state, cell_state = self.encoder(source, encoder_initial_hidden, encoder_initial_cell)


        # If attention is used, `encoder_outputs` for attention are the outputs from the encoder at each time step
        if self.attention:
             encoder_outputs = encoder_output # Shape: (batch_size, seq_len, hidden_size * num_directions)
        else:
             encoder_outputs = None # Explicitly None if attention is off


        # Encoder's last state is decoders first state (after transformation)
        # last_state is (enc_num_layers * val_direction, batch_size, enc_hid_dim)

        # Transform encoder's last hidden state to decoder's first hidden state
        # Use the last hidden state from the encoder across all layers if applicable
        # Reshape last_state to (batch_size, enc_num_layers * enc_val_direction * enc_hid_dim) before linear transformation
        last_state_reshaped = last_state.permute(1, 0, 2).reshape(batch_size, -1) # Shape: (batch_size, enc_num_layers * enc_val_direction * enc_hid_dim)

        # Apply the linear transformation
        decoder_hidden_reshaped = self.enc_dec_linear1(last_state_reshaped) # Shape: (batch_size, dec_num_layers * dec_val_direction * dec_hid_dim)

        # Reshape back to (dec_num_layers * dec_val_direction, batch_size, dec_hid_dim) for the decoder
        # Note: The reshape should match the target shape (num_layers * num_directions, batch_size, hidden_size)
        decoder_hidden = decoder_hidden_reshaped.reshape(batch_size, self.dec_num_layers * self.dec_val_direction, self.dec_hid_dim).permute(1, 0, 2) # Shape: (dec_num_layers * dec_val_direction, batch_size, dec_hid_dim)


        # Here also, encoders last cell is decoders first cell, also transform to same dimension (for LSTM)
        # Use the cell_type from the decoder instance
        if  self.decoder.cell_type == 'lstm': # Use decoder's cell_type for this transformation
            cell_state_reshaped = cell_state.permute(1, 0, 2).reshape(batch_size, -1) # Shape: (batch_size, enc_num_layers * enc_val_direction * enc_hid_dim)
            decoder_cell_reshaped = self.enc_dec_cell_linear1(cell_state_reshaped) # Shape: (batch_size, dec_num_layers * dec_val_direction * dec_hid_dim)
            # Reshape back to (dec_num_layers * dec_val_direction, batch_size, dec_hid_dim) for the decoder
            decoder_cell_state = decoder_cell_reshaped.reshape(batch_size, self.dec_num_layers * self.dec_val_direction, self.dec_hid_dim).permute(1, 0, 2) # Shape: (dec_num_layers * dec_val_direction, batch_size, dec_hid_dim)
        else:
            decoder_cell_state = None # Ensure cell_state is None for RNN/GRU


        # Initialize predictions and attention_weights
        # Get the correct output dimension from the model's decoder's final linear layer (assuming W1 is the final layer)
        target_output_dim = self.decoder.W1.out_features # Use W1 from the decoder instance

        # Use the stored target sequence length
        predictions = torch.zeros(batch_size, self.target_seq_len, target_output_dim, device = device)

        # Attention weights shape: (batch_size, target_seq_len, source_seq_len)
        # Need global max_seq_eng here
        global max_seq_eng
        attention_weights = torch.zeros(batch_size, self.target_seq_len, max_seq_eng + 2, device = device) if self.attention else None # Source seq len is max_seq_eng + SOW + EOW

        # Initialize the first input to the decoder. This should be the <SOW> token (index 1).
        # Use mal_chars_idx['\t'] for the SOW index (assuming it's a global variable)
        global mal_chars_idx # Ensure mal_chars_idx is accessible
        decoder_input = torch.full((batch_size, 1), mal_chars_idx['\t'], dtype=torch.long, device=device)


        # Do decoding by char by char fashion by batch
        # The loop should run self.target_seq_len times
        for t in range(self.target_seq_len): # Loop over the correct target sequence length

            # Pass the current hidden state and cell state (if LSTM) to the decoder
            decoder_output, decoder_hidden, cell_state, attention_wts = self.decoder(
                decoder_input,
                decoder_hidden, # Pass the updated hidden state from the previous step
                cell_state, # Pass the updated cell state from the previous step (will be None for RNN/GRU)
                encoder_outputs # Pass encoder outputs for attention
            )

            # Store the prediction (logits) for the current time step
            # decoder_output shape from Decoder forward: (batch_size, output_dim) after squeeze(1)
            predictions[:, t, :] = decoder_output # decoder_output is already squeezed in Decoder forward


            if self.attention and attention_wts is not None:
                # Store attention weights if attention is used
                # attention_wts shape from calculate_attention: (batch_size, source_seq_len)
                attention_weights[:, t, :] = attention_wts # Store attention weights for this decoding step


            # Determine the input for the next time step
            # Teacher forcing should only happen if t is within the bounds of the target sequence
            # And if is_training is True
            if teacher_forcing and is_training and t < self.target_seq_len - 1:
                # Teacher forcing: use the actual target token as input to the decoder
                # Target shape is (batch_size, target_seq_len). Input needs to be (batch_size, 1).
                decoder_input = target[:, t].unsqueeze(1)
            else:
                # Without teacher forcing or during inference: use the decoder's predicted token from the current time step as input for the next step
                # Get the predicted token index (argmax) from the logits
                predicted_token = torch.argmax(decoder_output, dim=-1) # shape: (batch_size)
                decoder_input = predicted_token.unsqueeze(1).detach() # shape: (batch_size, 1), Detach from graph for the next input


        # Return predictions and attention weights
        # predictions shape: (batch_size, target_seq_len, output_dim)
        # attention_weights shape: (batch_size, target_seq_len, source_seq_len)
        return predictions, attention_weights

In [35]:
# sweep config file
sweep_config = {
    'method': 'grid',
    'name' : 'testset run attention',
    'metric': {
      'goal': 'maximize',
      'name': 'test_accuracy'
    },
    'parameters': {
        'beam_size':{
            'values': [1]
        }
    }
}
# Create a sweep
sweep_id = wandb.sweep(sweep = sweep_config, entity="mdkarimullahaque-iit-madras", project='DL_Assignment_3')

Create sweep with ID: rvyo66d8
Sweep URL: https://wandb.ai/mdkarimullahaque-iit-madras/DL_Assignment_3/sweeps/rvyo66d8


In [36]:
# wandb log for test accuracy
def main():
  with wandb.init() as run:
    #run_name = "-f_num_"+str(wandb.config.filters_num)+"-f_num_"+wandb.config.filter_org+"-ac_fn_"+wandb.config.act_fn+\
                #"-b_norm_"+str(wandb.config.batch_norm) + "-bs_"+str(wandb.config.batch_size) +"-neu_num"+str(wandb.config.num_neurons_dense)

    wandb.run.name = "test_set_run_attn"
    calc_test_acc(best_model_attn, test_loader)

wandb.agent(sweep_id, function = main, count = 1)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: 22ij5bwu with config:
[34m[1mwandb[0m: 	beam_size: 1


Traceback (most recent call last):
  File "<ipython-input-36-d910eb4bcd6f>", line 8, in main
    calc_test_acc(best_model_attn, test_loader)
                  ^^^^^^^^^^^^^^^
NameError: name 'best_model_attn' is not defined


[34m[1mwandb[0m: [32m[41mERROR[0m Run 22ij5bwu errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-36-d910eb4bcd6f>", line 8, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     calc_test_acc(best_model_attn, test_loader)
[34m[1mwandb[0m: [32m[41mERROR[0m                   ^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m NameError: name 'best_model_attn' is not defined
[34m[1mwandb[0m: [32m[41mERROR[0m 


# Attention heatmap plot and logging it in wandb

In [37]:
test_input, test_labels = next(iter(test_loader))
best_model.eval()
test_output,_weights = best_model.forward(test_input.to(device), None,False)

In [39]:
# file ipython-input-38-959740f04f96
# Plot attention hmap
def prepare_ticks(input_data, output_data, index):
    filter_func = lambda x: x.item() not in [0, 1, 2]
    # Assuming english_index_dict and malayalam_index_dict are defined globally
    x_ticks = [english_index_dict[i.item()] for i in input_data[index] if filter_func(i)]
    y_ticks = [malayalam_index_dict[i.item()] for i in output_data[index] if filter_func(i)]
    return x_ticks, y_ticks

def generate_heatmap(input_data, output_data, weights, num_plots=12):
    fig, ax = plt.subplots(4, 3, figsize=(20, 20))
    plt.setp(ax)

    for idx in range(num_plots):
        x_ticks, y_ticks = prepare_ticks(input_data, output_data, idx)
        # weights shape: (batch_size, target_seq_len, source_seq_len)
        # Need to select the attention weights for the current sample (idx)
        heatmap_data = weights[idx, :, :].detach().cpu().numpy() # Select data for sample idx
        # Ensure indices are correct based on where padding/SOW/EOW are
        # Assuming SOW is index 1, EOW is index 2, Padding is index 0
        # Input ticks correspond to src_len (max_seq_eng + 2)
        # Output ticks correspond to tgt_len (max_seq_mal + 2)
        # If SOW is at index 1 and EOW at len-1 for output, need to adjust.
        # If SOW is at index 1 and EOW at len-1 for input, need to adjust.

        # Based on prepare_ticks, it skips indices 0, 1, 2. SOW is 1, EOW is 2, Padding is 0.
        # It seems prepare_ticks creates ticks for *actual* characters, excluding special tokens.
        # The weights tensor shape is (batch_size, target_seq_len, source_seq_len) including special tokens.
        # You need to align the heatmap data with the ticks.
        # Source sequence in model forward includes SOW (idx 1) and EOW (idx 2) and padding (idx 0).
        # Target sequence in model forward includes SOW (idx 1) and EOW (idx 2) and padding (idx 0).
        # If x_ticks excludes 0,1,2, and y_ticks excludes 0,1,2, you should select corresponding parts of the weights.
        # The original code sliced [1:len(y_ticks)+1, 2:len(x_ticks)+2]. This assumes SOW is at index 1 for output and index 2 for input.
        # And padding is 0, EOW is included? Let's re-check data_preprocess.
        # data_preprocess: sow = "\t" (idx 1), eow = "\n" (idx 2), padding "0" (idx 0).
        # Format is sow + word + padding + eow.
        # So input sequence: [1, char1, char2, ..., padding..., 2]
        # Target sequence: [1, char1, char2, ..., padding..., 2]
        # Source seq len = max_seq_eng + 2 (for sow, eow) + padding length. Padding is max_seq - len(word).
        # The padded data is sow + word + "0" * (max_seq - len(word)) + eow.
        # So source sequence indices: 0: padding, 1: sow, 2: eow, 3...: chars.
        # input_data[index] indices: 0: padding, 1: sow, 2: eow, 3...: chars.
        # y_ticks from output_data[index] indices: 0: padding, 1: sow, 2: eow, 3...: chars.
        # The `prepare_ticks` function explicitly filters out 0, 1, 2.
        # So x_ticks correspond to chars from index 3 onwards in the original sequence.
        # y_ticks correspond to chars from index 3 onwards in the original sequence.

        # The weights shape from Seq2Seq forward is (batch_size, target_seq_len, source_seq_len).
        # target_seq_len is max_seq_mal + 2 (SOW + EOW). Let's check the loop in Seq2Seq.
        # Loop is self.target_seq_len = max_seq_mal + 2 times.
        # decoder_input starts with SOW (index 1). The loop runs `target_seq_len` times.
        # So predictions and attention_weights have shape (batch_size, max_seq_mal + 2, ...).
        # The second dimension corresponds to the decoding step (0 to max_seq_mal + 1).
        # The first step (t=0) processes SOW and predicts the first actual char.
        # The last step (t = max_seq_mal + 1) processes the token before EOW and predicts EOW.

        # The x-axis of the heatmap should correspond to source tokens (input).
        # The y-axis of the heatmap should correspond to target tokens (output).
        # The attention weights are likely calculated between decoder hidden state at step t and encoder outputs across all source tokens.
        # So `weights[idx, t, src_token_idx]` is attention from target token at step `t` to source token at index `src_token_idx`.

        # The original slicing `[1:len(y_ticks)+1, 2:len(x_ticks)+2]` is suspicious and likely incorrect.
        # It's trying to slice the heatmap data based on the *length* of the ticks, but using fixed offsets.
        # A better approach is to select the attention weights corresponding to the actual characters in the input and output sequences *for that specific sample*.
        # However, the `_weights` tensor is padded to max lengths.

        # Let's assume the original slicing logic was *trying* to get rid of attention to/from special tokens.
        # Source sequence indices: [SOW, char1, ..., charN, Padding..., EOW] -> [1, 3, ..., 3+N-1, 0..., 2]
        # Target sequence indices: [SOW, char1, ..., charM, Padding..., EOW] -> [1, 3, ..., 3+M-1, 0..., 2]
        # `prepare_ticks` excludes 0, 1, 2. So `x_ticks` are chars from index 3 onwards in the original input sequence.
        # `y_ticks` are chars from index 3 onwards in the original output sequence.
        # The attention weights `_weights` have shape (batch_size, target_seq_len, source_seq_len).
        # `target_seq_len` is `max_seq_mal + 2`. The first element is prediction after seeing SOW, last is prediction of EOW.
        # The sequence of *predicted* tokens corresponding to `y_ticks` starts from the token after SOW prediction.
        # So, in `_weights`, the y-axis (dim 1) should correspond to the decoding steps.
        # The first decoding step (index 0) predicts the first token after SOW.
        # The decoding steps corresponding to the *actual* characters in `y_ticks` are from index 0 up to `len(y_ticks)-1`.
        # The x-axis (dim 2) corresponds to the source token indices.
        # The source token indices corresponding to the *actual* characters in `x_ticks` are from index 0 up to `len(x_ticks)-1`.

        # Let's try to slice based on the actual number of characters found.
        # We need the length of the original (non-padded, non-special-token) input and output for this sample.
        # This information is not directly available in `input_data[index]` and `output_data[index]` after padding.
        # You might need to store the original lengths during data preprocessing or retrieve them differently.

        # Alternatively, assume the original slicing was based on the fixed positions relative to special tokens.
        # SOW is at index 1, EOW at index 2 in the `chars_idx`. Padding is 0.
        # In the padded sequence `sow + word + padding + eow`, the indices are:
        # Index 0: SOW (1)
        # Index 1 to 1+len(word)-1: Word characters (3+)
        # Index 1+len(word) to max_seq: Padding (0)
        # Index max_seq+1: EOW (2)

        # This contradicts the data_preprocess code: `sow + str(word) + "0" * (max_seq - len(str(word))) + eow`
        # Index 0: SOW (1)
        # Index 1 to len(word): Word characters (3+)
        # Index len(word)+1 to len(word) + (max_seq - len(word)): Padding (0)
        # Index len(word) + (max_seq - len(word)) + 1 = max_seq + 1: EOW (2)
        # Total length = max_seq + 2.

        # So the sequence looks like: [1, char1, ..., charN, 0, ..., 0, 2]
        # Indices:          0, 1, ..., N, N+1, ..., max_seq, max_seq+1
        # `prepare_ticks` filters out 0, 1, 2.
        # x_ticks are chars from index 1 up to len(word).
        # y_ticks are chars from index 1 up to len(word) for the target.

        # The attention weights `weights[idx, t, src_token_idx]`.
        # `t` is the decoding step (0 to max_seq_mal + 1).
        # `src_token_idx` is the index in the source sequence (0 to max_seq_eng + 1).

        # Let's assume the original slicing `[1:len(y_ticks)+1, 2:len(x_ticks)+2]` was somehow intended,
        # maybe based on relative positions after padding/special tokens.
        # y-axis: `1:len(y_ticks)+1` -> This slice would include `len(y_ticks)` items starting from index 1.
        # x-axis: `2:len(x_ticks)+2` -> This slice would include `len(x_ticks)` items starting from index 2.
        # This still seems off based on the actual padded sequence structure.

        # Let's try a simpler slice assuming the attention weights for the actual characters are at indices 1 to len(word) in the padded sequence.
        # This is still just an assumption. The correct way requires knowing the original lengths or adjusting `prepare_ticks` and slicing based on that.

        # However, the primary error is `_weights` being `None`.
        # Let's fix that first by checking if attention was used.

        # --- Correction for plotting ---
        # Assuming attention weights shape is (batch_size, target_seq_len, source_seq_len)
        # `target_seq_len` is max_seq_mal + 2
        # `source_seq_len` is max_seq_eng + 2
        # x_ticks are characters from the *input* sequence (English) excluding special tokens.
        # y_ticks are characters from the *output* sequence (Malayalam) excluding special tokens.

        # The attention weights `weights[idx, target_step_idx, source_token_idx]`
        # The `target_step_idx` corresponds to the prediction of the token at that index *in the padded sequence*.
        # The `source_token_idx` corresponds to the token at that index *in the padded source sequence*.

        # If `prepare_ticks` gives ticks for characters *after* SOW and before EOW/padding,
        # which start from index 1 in the word part of the padded sequence:
        # The attention weights related to the actual target characters would be from decoding steps 0 to len(y_ticks)-1 (predicting tokens after SOW)
        # The attention weights related to the actual source characters would be for source token indices 1 to len(x_ticks) (characters after SOW)

        # So, the slice should potentially be `[0:len(y_ticks), 1:len(x_ticks)+1]`?
        # This is highly dependent on how attention is calculated and aligned.
        # The original slice was `[1:len(y_ticks)+1, 2:len(x_ticks)+2]`. Let's re-implement that exact slice first.
        try:
            # Use the exact original slicing logic, assuming it was correct relative to *something*
            # heatmap_data = weights[idx, :, :].detach().cpu().numpy() # Select data for sample idx
            # heatmap_data = heatmap_data[1:len(y_ticks)+1, 2:len(x_ticks)+2] # Apply the original slice

            # A potentially more correct slice if x_ticks/y_ticks correspond to chars after SOW (index 1)
            # and assuming attention weights align with the sequence structure:
            # y-axis corresponds to decoding steps predicting chars (from step 0 onwards)
            # x-axis corresponds to source sequence indices (chars start from index 1)
            heatmap_data = weights[idx, 0:len(y_ticks), 1:len(x_ticks)+1].detach().cpu().numpy()


        except IndexError as e:
             print(f"IndexError processing sample {idx}: {e}. Check sequence lengths and slicing.")
             continue # Skip this sample if slicing fails


        plt.sca(ax[idx//3, idx%3])
        plt.imshow(heatmap_data, interpolation='nearest', cmap='inferno')
        plt.colorbar()
        plt.xticks(np.arange(0, len(x_ticks)), x_ticks)

        mal_font = FontProperties(fname='/content/drive/MyDrive/DA6401_Assignment-3/dakshina_dataset_v1.0/AnjaliOldLipi-Regular.ttf')
        plt.yticks(np.arange(0, len(y_ticks)), y_ticks, fontproperties=mal_font)

        plt.xlabel('English')
        plt.ylabel('Malayalam')
        plt.title(f'Sample {idx + 1}')

    plt.tight_layout() # Prevent overlap
    plt.show()

    # Removed canvas/image conversion as it's not directly used after show()

    # Return the figure or axes if needed, but for just showing, returning nothing is fine.
    # return fig, ax # Or None

# Need english_index_dict and malayalam_index_dict globally
# Assuming they are defined elsewhere, e.g., idx2char_eng and idx2char_mal or similar.
# Let's use the existing idx2char_mal and define idx2char_eng
# Assuming eng_chars_idx and mal_chars_idx are globally available
english_index_dict = {idx: char for char, idx in eng_chars_idx.items()}
malayalam_index_dict = {idx: char for char, idx in mal_chars_idx.items()}


# --- Check if _weights is not None before proceeding ---
test_input, test_labels = next(iter(test_loader))
best_model.eval()
test_output,_weights = best_model.forward(test_input.to(device), None, False)

if _weights is not None: # Check if attention weights were returned
    # Calculate mean weights only if _weights is not None
    # mean_weights = torch.mean(_weights, axis=2) # This axis might be wrong depending on how mean_weights is used.
    # Based on generate_heatmap usage `weights[:, :, idx]`, it looks like `weights` is expected to be
    # something where the last dimension can be indexed by `idx`. This contradicts the (batch_size, target_seq_len, source_seq_len) shape.
    # Let's re-examine `generate_heatmap`. It takes `weights`. Inside, it does `weights[:, :, idx]`.
    # This means `weights` must have shape (batch_size, target_seq_len, num_samples_to_plot) or something similar.
    # This doesn't fit the (batch_size, target_seq_len, source_seq_len) shape of attention weights.

    # **Re-reading the heatmap generation code:**
    # `heatmap_data = weights[:, :, idx].detach().cpu().numpy()`
    # This indexing `[:, :, idx]` looks like `weights` is supposed to be `(target_seq_len, source_seq_len, batch_size)` or similar
    # Or `(batch_size, target_seq_len, source_seq_len)` and then `weights[idx, :, :]` is used.
    # The `generate_heatmap` function takes `weights` as input.
    # Inside the loop `for idx in range(num_plots):`, it uses `weights[:, :, idx]`.
    # This strongly suggests the `weights` variable *passed into* `generate_heatmap` is indexed by the sample index `idx` in the *last* dimension.
    # However, `_weights` returned from the model has shape (batch_size, target_seq_len, source_seq_len).
    # The heatmap data should be `weights[idx, :, :]` or similar, i.e., attention matrix for one sample.

    # The variable `mean_weights = torch.mean(_weights, axis=2)` seems incorrect for the heatmap generation logic.
    # The heatmap should show the attention from each output token to each input token for a *single sample*.
    # `_weights` already contains the attention weights per sample: `_weights[sample_idx, target_token_idx, source_token_idx]`.

    # The line `mean_weights = torch.mean(_weights, axis=2)` calculates the average attention of *each target token* to *all source tokens* for *all samples*.
    # This is not the attention matrix needed for the heatmap of a single sample.

    # **Correcting the heatmap generation logic:**
    # The `generate_heatmap` function should likely take the full `_weights` tensor (batch_size, target_seq_len, source_seq_len).
    # Inside the loop, it should select the slice for the current sample: `_weights[idx, :, :]`.
    # The parameter `weights` in `generate_heatmap(input_data, output_data, weights, num_plots=12)` is misleadingly named if it expects `_weights`.

    # Let's rename the parameter in `generate_heatmap` to `all_attention_weights`.
    # And call it with `generate_heatmap(test_input, output_argmax, _weights)`.

    # Also, the calculation `output_argmax = torch.argmax(output_softmax, dim=2).T` seems incorrect.
    # `output_argmax` should be shape (batch_size, target_seq_len) for `prepare_ticks`.
    # `test_output` is (batch_size, target_seq_len, vocab_size).
    # `torch.argmax(output_softmax, dim=2)` results in shape (batch_size, target_seq_len).
    # Transposing this `.T` results in (target_seq_len, batch_size), which doesn't match the expected shape in `prepare_ticks`.
    # `prepare_ticks(input_data, output_data, index)` expects `input_data` and `output_data` to be like the original datasets or tensors shaped (batch_size, seq_len).
    # `test_input` is (batch_size, source_seq_len). This works for the input part.
    # `output_argmax` is currently (target_seq_len, batch_size). This needs to be (batch_size, target_seq_len).
    # Remove the `.T` transpose.

    output_softmax = F.softmax(test_output, dim=2)
    output_argmax = torch.argmax(output_softmax, dim=2) # Shape: (batch_size, target_seq_len)

    # Pass the full attention weights tensor to the heatmap function
    # No need to calculate `mean_weights`. That line is removed.
    # image = generate_heatmap(test_input, output_argmax, _weights) # Call with _weights


    # Let's update the generate_heatmap function signature and logic
    def generate_heatmap_corrected(input_data_tensor, output_data_tensor, all_attention_weights_tensor, num_plots=12):
        fig, ax = plt.subplots(4, 3, figsize=(20, 20))
        plt.setp(ax, xticklabels=[], yticklabels=[]) # Optional: Hide default ticks

        # Need global dicts here or pass them
        # Assuming english_index_dict and malayalam_index_dict are available

        # Determine which samples to plot
        # If num_plots > batch_size, plot all samples in the batch
        samples_to_plot = min(num_plots, input_data_tensor.size(0))


        for idx in range(samples_to_plot):
            # Prepare ticks for the current sample
            # prepare_ticks expects data in the format of the original tensors/lists
            # Need to pass the individual sample tensors here
            # input_data_tensor[idx] is shape (source_seq_len,)
            # output_data_tensor[idx] is shape (target_seq_len,)
            x_ticks, y_ticks = prepare_ticks(input_data_tensor, output_data_tensor, idx)

            # Select the attention weights for the current sample
            # all_attention_weights_tensor[idx, :, :] is shape (target_seq_len, source_seq_len)
            attention_for_sample = all_attention_weights_tensor[idx, :, :].detach().cpu().numpy()

            # Slice the attention matrix to align with the ticks (excluding special tokens 0, 1, 2)
            # The chars corresponding to x_ticks/y_ticks start from index 1 in the padded sequence (after SOW)
            # Source indices in padded sequence: [SOW, char1, ..., charN, Pad..., EOW] -> [1, 3..., 3+N-1, 0..., 2]
            # Target indices in padded sequence (decoding steps): Step 0 (predicts token after SOW), Step 1 (predicts next char), ...
            # So, y-axis (decoding steps): from step 0 up to len(y_ticks)-1
            # x-axis (source indices): from index 1 up to len(x_ticks)
            try:
                 # Adjust slice based on the padded sequence structure and how prepare_ticks works
                 # Prepare_ticks filters out 0, 1, 2. This means it extracts the *actual* characters.
                 # If the padded sequence is [1, char1, ..., charN, 0, ..., 0, 2]
                 # Indices 1 to N are the chars.
                 # Attention matrix is (target_seq_len, source_seq_len)
                 # target_seq_len is max_seq_mal + 2
                 # source_seq_len is max_seq_eng + 2

                 # Assuming the attention aligns directly with the padded sequence indices:
                 # y-axis (target tokens): indices corresponding to y_ticks in the target padded sequence.
                 # x-axis (source tokens): indices corresponding to x_ticks in the source padded sequence.

                 # The simplest assumption is that attention[i, j] relates to target token *predicted at step i* and source token *at index j*.
                 # Decoding step 0 predicts the token at index 1 (first char after SOW).
                 # Decoding step `k` predicts the token at index `k+1` in the target sequence.
                 # Source token indices 1 to 1 + len(x_ticks)-1 = len(x_ticks) are the actual chars.

                 # So, y-axis should cover decoding steps 0 to len(y_ticks)-1.
                 # x-axis should cover source indices 1 to len(x_ticks).
                 heatmap_data_sliced = attention_for_sample[0:len(y_ticks), 1:len(x_ticks)+1]

                 # Check if the sliced data is valid
                 if heatmap_data_sliced.size == 0 or heatmap_data_sliced.shape[0] != len(y_ticks) or heatmap_data_sliced.shape[1] != len(x_ticks):
                      print(f"Warning: Sliced heatmap data for sample {idx} has unexpected shape {heatmap_data_sliced.shape}. Expected ({len(y_ticks)}, {len(x_ticks)}). Skipping plot.")
                      continue # Skip if slicing resulted in empty or wrong shape


            except IndexError as e:
                 print(f"IndexError slicing heatmap for sample {idx}: {e}. Source shape: {attention_for_sample.shape}, y_ticks len: {len(y_ticks)}, x_ticks len: {len(x_ticks)}. Skipping plot.")
                 continue # Skip this sample if slicing fails


            ax_curr = ax[idx//3, idx%3]
            im = ax_curr.imshow(heatmap_data_sliced, interpolation='nearest', cmap='inferno')
            fig.colorbar(im, ax=ax_curr) # Attach colorbar to the current subplot
            ax_curr.set_xticks(np.arange(0, len(x_ticks)))
            ax_curr.set_xticklabels(x_ticks)

            mal_font = FontProperties(fname='/content/drive/MyDrive/DA6401_Assignment-3/dakshina_dataset_v1.0/AnjaliOldLipi-Regular.ttf')
            ax_curr.set_yticks(np.arange(0, len(y_ticks)))
            ax_curr.set_yticklabels(y_ticks, fontproperties=mal_font)

            ax_curr.set_xlabel('English')
            ax_curr.set_ylabel('Malayalam')
            ax_curr.set_title(f'Sample {idx + 1}')


        plt.tight_layout() # Prevent overlap
        plt.show()

        # wandb log the figure
        # Need to import wandb
        # wandb.log({"attention_heatmaps": wandb.Image(fig)}) # Requires figure object

        plt.close(fig) # Close the figure after displaying/logging to free memory


    # Call the corrected heatmap generation function
    generate_heatmap_corrected(test_input, output_argmax, _weights)


else:
    print("Attention weights are None. Cannot plot heatmap.")
    # Optionally, log a message to wandb if a run is active
    # if wandb.run:
    #     wandb.log({"attention_heatmap_status": "Attention weights were None."})

Attention weights are None. Cannot plot heatmap.


In [40]:
sweep_config = {
    'method': 'grid',
    'name' : 'attention_plot',
    'parameters': {
        'beam_size': {
            'values': [1]
        }
  }
}
# Create a sweep
sweep_id = wandb.sweep(sweep = sweep_config, entity="mdkarimullahaque-iit-madras", project='DL_Assignment_3')

Create sweep with ID: dqzda4z7
Sweep URL: https://wandb.ai/mdkarimullahaque-iit-madras/DL_Assignment_3/sweeps/dqzda4z7


In [41]:
def main():
    with wandb.init() as run:
        #run_name = "-f_num_"+str(wandb.config.filters_num)+"-f_num_"+wandb.config.filter_org+"-ac_fn_"+wandb.config.act_fn+\
                    #"-b_norm_"+str(wandb.config.batch_norm) + "-bs_"+str(wandb.config.batch_size) +"-neu_num"+str(wandb.config.num_neurons_dense)

        wandb.run.name = "attention_heatmap"
        #wandb.log({"image_pred": [wandb.Image(image, caption="Test Images and Predictions")]})
        wandb.log({"image_grid": [wandb.Image(image, caption="Attention Heatmap")]})
wandb.agent(sweep_id, function = main, count = 1)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: en3h0p8k with config:
[34m[1mwandb[0m: 	beam_size: 1


Traceback (most recent call last):
  File "<ipython-input-41-695a2dcd9917>", line 8, in main
    wandb.log({"image_grid": [wandb.Image(image, caption="Attention Heatmap")]})
                                          ^^^^^
NameError: name 'image' is not defined


[34m[1mwandb[0m: [32m[41mERROR[0m Run en3h0p8k errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "<ipython-input-41-695a2dcd9917>", line 8, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.log({"image_grid": [wandb.Image(image, caption="Attention Heatmap")]})
[34m[1mwandb[0m: [32m[41mERROR[0m                                           ^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m NameError: name 'image' is not defined
[34m[1mwandb[0m: [32m[41mERROR[0m 
