In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader 

from tqdm import tqdm
import heapq
import csv

import numpy as np
import random
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties


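# Maximum padded sequence length per language (start and end symbols included); despite the names, these are not embedding dimensions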
hindi_embedding_size = 29
english_embedding_size = 32

import wandb
# Select the compute device once for the whole notebook: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device.type

'cuda'

In [7]:
#ANOTHER STYLE BEGIN

import numpy as np

# Read the training pairs so that every character occurring in them can be given an integer id
array = np.loadtxt("/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_train.csv",
                   delimiter=",", dtype=str)
num_sample = len(array)
x_train = array[:, 0]
y_train = array[:, 1]


def _index_characters(words, first_index):
    """Give each distinct character in ``words`` a consecutive id, in first-seen order.

    Returns a ``(char -> id, id -> char, next unused id)`` triple; ids start at ``first_index``.
    """
    char_to_id, id_to_char = {}, {}
    next_index = first_index
    for word in words:
        for symbol in word:
            if symbol in char_to_id:
                continue
            char_to_id[symbol] = next_index
            id_to_char[next_index] = symbol
            next_index += 1
    return char_to_id, id_to_char, next_index


# Ids 0-2 are reserved for the padding/start/end symbols, so real characters start at 3.
# NOTE(review): the "English" vocabulary is built from BOTH columns (source and target words),
# so it also contains every target-side character — confirm this is intended.
english_dict, english_index_dict, english_index = _index_characters(
    np.concatenate((x_train, y_train)), 3)
hindi_dict, hindi_index_dict, hin_index = _index_characters(y_train, 3)

# Adding start, stop and padding symbols (only to the id -> symbol tables)
start_symbol = '<S>'
end_symbol = '<E>'
padding_symbol = '<P>'
for special_id, special_symbol in enumerate((padding_symbol, start_symbol, end_symbol)):
    english_index_dict[special_id] = special_symbol
    hindi_index_dict[special_id] = special_symbol  #ANOTHER STYLE END


In [8]:
#ANOTHER STYLE BEGIN

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class DataProcessor:
    """Turn a two-column CSV of word pairs into fixed-width arrays of character ids.

    Id conventions used by ``process_data``: 0 = padding, 1 = start symbol, 2 = end symbol,
    3 and above = characters, numbered in the order they are first encountered. The
    vocabularies (``english_dict`` / ``hindi_dict``) are shared across calls, so processing the
    training file first fixes the ids that later files reuse; characters first seen in a later
    file are appended to the vocabulary.
    """

    def __init__(self, english_embedding_size, hindi_embedding_size):
        """Store the row widths of the arrays produced for the first and second CSV column.

        Args:
            english_embedding_size: width of each input row (first CSV column).
            hindi_embedding_size: width of each target row (second CSV column).
        """
        self.eng_embedd_size = english_embedding_size
        self.hin_embedd_size = hindi_embedding_size
        self.english_dict = {}
        self.hindi_dict = {}

    def process_data(self, path):
        """Encode every word pair in the CSV at ``path``.

        Args:
            path: comma-separated file whose first column is the input word and whose second
                column is the target word.

        Returns:
            ``(X, Y)``: float arrays of shape ``(num_samples, english width)`` and
            ``(num_samples, hindi width)``. Each row is start id, character ids, end id, then
            zero padding.

        Raises:
            IndexError: if a word plus its start and end symbols does not fit in the row width.
        """
        # ndmin=2 keeps a single-row file two-dimensional so the column slicing below still works.
        array = np.loadtxt(path, delimiter=",", dtype=str, ndmin=2)
        num_samples = array.shape[0]
        x, y = array[:, 0], array[:, 1]

        # The widths live in eng_embedd_size / hin_embedd_size (set in __init__); the previous
        # code read attributes that were never assigned and raised AttributeError.
        X = np.zeros((num_samples, self.eng_embedd_size))  # input
        Y = np.zeros((num_samples, self.hin_embedd_size))  # target

        for i in range(num_samples):
            self._encode_into(X[i], x[i], self.english_dict)
            self._encode_into(Y[i], y[i], self.hindi_dict)

        return X, Y

    @staticmethod
    def _encode_into(row, word, vocab):
        """Write start id, the ids of ``word``'s characters, and end id into ``row`` in place."""
        row[0] = 1
        for j, char in enumerate(word):
            # Unseen characters get the next free id (ids 0-2 are reserved).
            row[j + 1] = vocab.setdefault(char, len(vocab) + 3)
        row[len(word) + 1] = 2

class CustomDataset(Dataset):
    """Map-style dataset pairing each encoded input row with its encoded target row."""

    def __init__(self, X, Y):
        """Copy ``X`` and ``Y`` into int64 tensors; the dataset size is ``X.shape[0]``."""
        self.length = X.shape[0]
        self.X, self.Y = (torch.tensor(data, dtype=torch.int64) for data in (X, Y))

    def __len__(self):
        """Number of samples."""
        return self.length

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        sample = self.X[index]
        target = self.Y[index]
        return sample, target

# One shared processor: the training split is encoded first, so its vocabulary ids are the
# ones reused when the validation and test splits are encoded afterwards.
data_processor = DataProcessor(english_embedding_size, hindi_embedding_size)

X_train, y_train = data_processor.process_data(
    "/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_train.csv")
X_val, y_val = data_processor.process_data(
    "/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_valid.csv")
X_test, y_test = data_processor.process_data(
    "/kaggle/input/aksharantar-sampled2/aksharantar_sampled/hin/hin_test.csv")

# Wrap the encoded arrays as tensor datasets
train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)

# Batches of 256; the train and validation loaders shuffle, the test loader keeps file order
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [9]:
# Importing necessary libraries
import torch
import torch.nn as nn

# Defining the Encoder class as a subclass of nn.Module
class Encoder(nn.Module):
    """Embedding + dropout + recurrent stack (RNN, GRU or LSTM) over a sequence of token ids.

    The recurrent stack is one of ``RNNLayer`` / ``GRULayer`` / ``LSTMLayer``, stored as
    ``encoder_type``.
    """

    # Initializing the Encoder class with default and custom parameters
    def __init__(self,InputDimension=72,EmbeddingDimension=64,HiddenDimension=256,CellType='gru',layers=2,bi_directional=True,DropOut=0,
                 device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        """Build the embedding, dropout and recurrent layers.

        Args:
            InputDimension: number of rows in the embedding table (vocabulary size).
            EmbeddingDimension: size of each embedding vector.
            HiddenDimension: hidden size of the recurrent stack.
            CellType: ``'rnn'``, ``'gru'`` or ``'lstm'``.
            layers: number of stacked recurrent layers.
            bi_directional: whether the recurrent stack is bidirectional.
            DropOut: dropout probability (after the embedding and between recurrent layers).
            device: device on which ``init_hidden`` allocates the initial state.

        Raises:
            ValueError: if ``CellType`` is not one of the supported names.
        """
        super(Encoder, self).__init__()

        # Calculating the direction value based on bidirectionality
        self.direction_value = 2 if bi_directional else 1

        # Dictionary to store detailed parameters
        self.detail_parameters = {
            'InputDimension': InputDimension,
            'EmbeddingDimension': EmbeddingDimension,
            'HiddenDimension': HiddenDimension,
            'CellType': CellType,
            'DropOut': DropOut,
            'layers': layers,
            'direction_value': self.direction_value,
            'device': device.type,
        }

        # Assigning parameters to instance variables
        self.InputDimension = InputDimension
        self.EmbeddingDimension = EmbeddingDimension
        self.HiddenDimension = HiddenDimension
        self.CellType = CellType
        # The rest of this class used to read ``self.cell_type``, which was never assigned
        # (nn.Module raises AttributeError for unknown attributes). Keep it as an alias.
        self.cell_type = CellType
        self.layers = layers
        self.dropout = DropOut
        self.device = device

        # Initializing Embedding layer
        self.embedding = nn.Embedding(self.InputDimension, self.EmbeddingDimension)
        self.dropout_layer = nn.Dropout(DropOut)

        # Defining different types of recurrent cells based on cell type
        if CellType == 'rnn':
            self.encoder_type = RNNLayer(self.EmbeddingDimension, self.HiddenDimension, self.layers, bi_directional, DropOut)
        elif CellType == 'gru':
            self.encoder_type = GRULayer(self.EmbeddingDimension, self.HiddenDimension, self.layers, bi_directional, DropOut)
        elif CellType == 'lstm':
            self.encoder_type = LSTMLayer(self.EmbeddingDimension, self.HiddenDimension, self.layers, bi_directional, DropOut)
        else:
            # Fail at construction instead of leaving ``encoder_type`` undefined until forward().
            raise ValueError(f"Unsupported CellType {CellType!r}; expected 'rnn', 'gru' or 'lstm'")

    # Forward method for Encoder
    def forward(self, input, hidden, cell=None):
        """Encode a batch of token ids.

        Args:
            input: integer tensor of token ids, passed to the embedding layer.
            hidden: initial hidden state (see ``init_hidden``).
            cell: initial LSTM cell state; zeros shaped like ``hidden`` are used when omitted.
                Ignored for RNN/GRU.

        Returns:
            ``(output, hidden, cell)`` where ``cell`` is ``None`` unless the cell type is LSTM.
        """
        embedded = self.dropout_layer(self.embedding(input))

        # Handling LSTM separately for its cell state
        if self.CellType == 'lstm':
            if cell is None:
                cell = torch.zeros_like(hidden)
            output, (hidden, cell) = self.encoder_type(embedded, (hidden, cell))
            return output, hidden, cell

        output, hidden = self.encoder_type(embedded, hidden)
        return output, hidden, None

    # Method to get detailed parameters
    def getParams(self):
        """Return the dictionary of constructor settings recorded at build time."""
        return self.detail_parameters

    # Method to initialize hidden state
    def init_hidden(self, batch):
        """Return a zero state of shape ``(directions * layers, batch, HiddenDimension)`` on ``self.device``."""
        return torch.zeros(self.direction_value * self.layers, batch, self.HiddenDimension, device=self.device)

# Define RNN layer as a subclass of nn.Module
class RNNLayer(nn.Module):
    """Thin module wrapper that owns a single ``nn.RNN`` stack under the attribute ``rnn``."""

    def __init__(self, input_size, hidden_size, num_layers, bidirectional, dropout):
        """Create the wrapped ``nn.RNN`` with the given sizes, depth, direction and dropout."""
        super().__init__()
        rnn_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout,
        )
        self.rnn = nn.RNN(**rnn_options)

    def forward(self, input, hidden):
        """Run the wrapped RNN and return its ``(output, hidden)`` result unchanged."""
        result = self.rnn(input, hidden)
        return result

# Define GRU layer as a subclass of nn.Module
class GRULayer(nn.Module):
    """Thin module wrapper that owns a single ``nn.GRU`` stack under the attribute ``gru``."""

    def __init__(self, input_size, hidden_size, num_layers, bidirectional, dropout):
        """Create the wrapped ``nn.GRU`` with the given sizes, depth, direction and dropout."""
        super().__init__()
        gru_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout,
        )
        self.gru = nn.GRU(**gru_options)

    def forward(self, input, hidden):
        """Run the wrapped GRU and return its ``(output, hidden)`` result unchanged."""
        result = self.gru(input, hidden)
        return result

# Define LSTM layer as a subclass of nn.Module
class LSTMLayer(nn.Module):
    """Thin module wrapper that owns a single ``nn.LSTM`` stack under the attribute ``lstm``."""

    def __init__(self, input_size, hidden_size, num_layers, bidirectional, dropout):
        """Create the wrapped ``nn.LSTM`` with the given sizes, depth, direction and dropout."""
        super().__init__()
        lstm_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout,
        )
        self.lstm = nn.LSTM(**lstm_options)

    def forward(self, input, hidden):
        """Run the wrapped LSTM; ``hidden`` is handed to ``nn.LSTM`` as its state argument."""
        result = self.lstm(input, hidden)
        return result  #ANOTHER STYLE END
    
    
    
    
import torch
import torch.nn as nn

class MyEncoder(nn.Module):
    """Token-id encoder: embedding, dropout, then one of the ``My*Layer`` recurrent wrappers."""

    def __init__(self, input_size=72, embedding_size=64, hidden_size=256, cell_type='gru', num_layers=2, bidirectional=True, dropout_prob=0,
                 device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        """Record the settings and build the embedding, dropout and recurrent sub-modules."""
        super().__init__()

        # Plain attributes mirroring the constructor arguments
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.cell_type = cell_type
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout_prob = dropout_prob
        self.device = device

        # Snapshot of the settings, returned by get_config()
        self.config = dict(
            input_size=input_size,
            embedding_size=embedding_size,
            hidden_size=hidden_size,
            cell_type=cell_type,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout_prob=dropout_prob,
            device=device.type,
        )

        # Sub-modules: embedding table first, then dropout
        self.embedding_layer = nn.Embedding(input_size, embedding_size)
        self.dropout_layer = nn.Dropout(dropout_prob)

        # One direction, or two when bidirectional
        self.directions = 1 if not bidirectional else 2

        # Pick the recurrent wrapper that matches cell_type; no layer is created for other values
        for name, layer_class in (('rnn', MyRNNLayer), ('gru', MyGRULayer), ('lstm', MyLSTMLayer)):
            if cell_type == name:
                self.rnn_layer = layer_class(embedding_size, hidden_size, num_layers, bidirectional, dropout_prob)
                break

    def forward(self, input_seq, hidden_state, cell_state=None):
        """Embed ``input_seq``, apply dropout and run the recurrent layer.

        Returns ``(output_seq, hidden_state, cell_state)``; the last item is ``None`` unless
        the cell type is ``'lstm'``.
        """
        embedded_seq = self.dropout_layer(self.embedding_layer(input_seq))

        if self.cell_type != 'lstm':
            output_seq, hidden_state = self.rnn_layer(embedded_seq, hidden_state)
            return output_seq, hidden_state, None

        # LSTM carries a (hidden, cell) pair as its state
        output_seq, (hidden_state, cell_state) = self.rnn_layer(embedded_seq, (hidden_state, cell_state))
        return output_seq, hidden_state, cell_state

    def get_config(self):
        """Return the settings dictionary captured in the constructor."""
        return self.config

    def init_hidden_state(self, batch_size):
        """Return zeros of shape ``(directions * num_layers, batch_size, hidden_size)`` on ``self.device``."""
        state_shape = (self.directions * self.num_layers, batch_size, self.hidden_size)
        return torch.zeros(*state_shape, device=self.device)

class MyRNNLayer(nn.Module):
    """Module wrapper holding one ``nn.RNN`` stack in the attribute ``rnn``."""

    def __init__(self, input_size, hidden_size, num_layers, bidirectional, dropout_prob):
        """Build the wrapped ``nn.RNN``; ``dropout_prob`` is forwarded as its ``dropout`` argument."""
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout_prob,
        )

    def forward(self, input_seq, hidden_state):
        """Delegate to the wrapped RNN and return its ``(output, hidden)`` pair."""
        outputs_and_state = self.rnn(input_seq, hidden_state)
        return outputs_and_state

class MyGRULayer(nn.Module):
    """Module wrapper holding one ``nn.GRU`` stack in the attribute ``gru``."""

    def __init__(self, input_size, hidden_size, num_layers, bidirectional, dropout_prob):
        """Build the wrapped ``nn.GRU``; ``dropout_prob`` is forwarded as its ``dropout`` argument."""
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout_prob,
        )

    def forward(self, input_seq, hidden_state):
        """Delegate to the wrapped GRU and return its ``(output, hidden)`` pair."""
        outputs_and_state = self.gru(input_seq, hidden_state)
        return outputs_and_state

class MyLSTMLayer(nn.Module):
    """Module wrapper holding one ``nn.LSTM`` stack in the attribute ``lstm``."""

    def __init__(self, input_size, hidden_size, num_layers, bidirectional, dropout_prob):
        """Build the wrapped ``nn.LSTM``; ``dropout_prob`` is forwarded as its ``dropout`` argument."""
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout_prob,
        )

    def forward(self, input_seq, hidden_state):
        """Delegate to the wrapped LSTM; ``hidden_state`` is passed through as its state argument."""
        outputs_and_state = self.lstm(input_seq, hidden_state)
        return outputs_and_state



In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Decoder(nn.Module):
    """Recurrent decoder: embedding -> dropout -> (optional attention context) -> RNN/GRU/LSTM -> linear.

    With ``use_attention`` the context vector from ``apply_attention`` is concatenated to the
    embedded input, so the recurrent input size is ``EmbeddingDimension + attention_dimension``.
    """

    def __init__(self,InputDimension=26,EmbeddingDimension=64,HiddenDimension=256,CellType='lstm',layers=2,use_attention=False,
                 attention_dimension=None,DropOut=0,bi_directional=True,
                 device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        """Build the decoder sub-modules.

        Args:
            InputDimension: vocabulary size; used for the embedding table and the output layer.
            EmbeddingDimension: size of each embedding vector.
            HiddenDimension: hidden size of the recurrent stack.
            CellType: ``'rnn'``, ``'gru'`` or ``'lstm'``.
            layers: number of stacked recurrent layers.
            use_attention: whether to concatenate an attention context to the input.
            attention_dimension: size of that context vector; required when ``use_attention``.
            DropOut: dropout probability (after the embedding and between recurrent layers).
            bi_directional: whether the recurrent stack is bidirectional.
            device: recorded for ``getParams``.

        Raises:
            ValueError: for an unknown ``CellType`` or a missing ``attention_dimension``.
        """
        super(Decoder, self).__init__()

        if use_attention and attention_dimension is None:
            raise ValueError("attention_dimension must be given when use_attention is True")

        self.InputDimension = InputDimension
        self.EmbeddingDimension = EmbeddingDimension
        self.HiddenDimension = HiddenDimension
        self.CellType = CellType
        self.layers = layers
        self.use_attention = use_attention
        self.attention_dimension = attention_dimension
        self.DropOut = DropOut
        self.device = device

        # Embedding layer
        self.embedding = nn.Embedding(InputDimension, EmbeddingDimension)
        self.dropout_layer = nn.Dropout(DropOut)

        # Calculate input size considering attention
        self.input_size = EmbeddingDimension
        if use_attention:
            self.input_size += attention_dimension

        # Define decoder type (RNN, GRU, LSTM). torch's keyword names are ``bidirectional``
        # and ``dropout``; the previous ``bi_directional=`` / ``DropOut=`` raised TypeError.
        recurrent_classes = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM}
        if CellType not in recurrent_classes:
            raise ValueError(f"Unsupported CellType {CellType!r}; expected 'rnn', 'gru' or 'lstm'")
        self.decoder_type = recurrent_classes[CellType](
            input_size=self.input_size, hidden_size=HiddenDimension,
            num_layers=layers, bidirectional=bi_directional, dropout=DropOut)

        # Attention mechanism components
        if use_attention:
            self.U = nn.Linear(HiddenDimension, HiddenDimension)
            self.W = nn.Linear(HiddenDimension, HiddenDimension)
            self.V = nn.Linear(HiddenDimension, 1)

        # Output layer to match input dimension
        self.W1 = nn.Linear(HiddenDimension * (2 if bi_directional else 1), InputDimension)

    def forward(self, input, hidden, cell=None, encoder_outputs=None):
        """Run one decoding call.

        Args:
            input: integer tensor of token ids.
            hidden: recurrent hidden state.
            cell: LSTM cell state; zeros shaped like ``hidden`` are used when omitted for an LSTM.
            encoder_outputs: encoder states to attend over; required when attention is enabled.

        Returns:
            ``(output, hidden, cell, attention_weights)``; ``attention_weights`` is ``None``
            without attention, and ``cell`` is returned as passed in for RNN/GRU.
        """
        embedded = self.dropout_layer(self.embedding(input))

        # Apply attention mechanism if enabled
        attention_weights = None
        if self.use_attention:
            if encoder_outputs is None:
                raise ValueError("encoder_outputs is required when use_attention is True")
            context, attention_weights = self.apply_attention(hidden, encoder_outputs)
            embedded = torch.cat((embedded, context), 2)

        # Pass through decoder RNN type
        if self.CellType == 'lstm':
            if cell is None:
                cell = torch.zeros_like(hidden)
            output, (hidden, cell) = self.decoder_type(embedded, (hidden, cell))
        else:
            output, hidden = self.decoder_type(embedded, hidden)

        # Apply linear layer to match output dimension
        output = self.W1(output)

        return output, hidden, cell, attention_weights

    def apply_attention(self, hidden, encoder_outputs):
        """Additive attention of the decoder state over the encoder outputs.

        NOTE(review): the softmax and the sum both run over dim 1 and the context gets a
        leading dim via ``unsqueeze(0)``, so this assumes ``encoder_outputs`` is
        ``(batch, src_len, HiddenDimension)`` — TODO confirm against the caller.

        Returns:
            ``(context, attention_weights)``; ``context`` has a leading dimension of size 1.
        """
        # Use the last slice of the recurrent state as the query and give it a length-1 axis
        # at dim 1 so it broadcasts against every position of encoder_outputs. Adding the full
        # (layers * directions, batch, H) state only broadcast when both layers * directions
        # and the batch size were 1.
        query = hidden[-1].unsqueeze(1)

        # Project encoder outputs and the query, combine, and squash
        energy = torch.tanh(self.W(encoder_outputs) + self.U(query))

        # One score per encoder position, normalised over dim 1
        score = self.V(energy)
        attention_weights = F.softmax(score, dim=1)

        # Weighted sum of encoder outputs over dim 1
        context_vector = torch.sum(attention_weights * encoder_outputs, dim=1)

        # Leading axis so the context can be concatenated with the embedded input on dim 2
        return context_vector.unsqueeze(0), attention_weights

    def getParams(self):
        """Return the constructor settings as a dictionary."""
        return {
            'InputDimension': self.InputDimension,
            'EmbeddingDimension': self.EmbeddingDimension,
            'HiddenDimension': self.HiddenDimension,
            'attention_dimension': self.attention_dimension,
            'CellType': self.CellType,
            'layers': self.layers,
            'device': self.device.type,
            'DropOut': self.DropOut,
            'use_attention': self.use_attention,
        }
    
    
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyDecoder(nn.Module):
    """Recurrent decoder assembled from small builder methods.

    Pipeline: embedding -> dropout -> (optional attention context) -> LSTM/GRU/RNN -> linear.
    After construction ``self.dropout`` is the ``nn.Dropout`` module; the raw probability is
    kept in ``self.dropout_prob``.
    """

    def __init__(self, input_size=26, embedding_size=64, hidden_size=256, cell_type='lstm', num_layers=2, use_attention=False,
                 attention_size=None, dropout=0, bidirectional=True,
                 device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
        """Store the settings and build every sub-module.

        Raises:
            ValueError: if ``use_attention`` is set without an ``attention_size``.
        """
        super(MyDecoder, self).__init__()

        if use_attention and attention_size is None:
            raise ValueError("attention_size must be given when use_attention is True")

        # Decoder parameters
        self.input_size = input_size  # Number of unique input symbols
        self.embedding_size = embedding_size  # Dimensionality of the embedding space
        self.hidden_size = hidden_size  # Dimensionality of the hidden state
        self.cell_type = cell_type  # Type of recurrent cell (RNN, GRU, LSTM)
        self.num_layers = num_layers  # Number of recurrent layers
        self.use_attention = use_attention  # Flag indicating whether to use attention mechanism
        self.attention_size = attention_size  # Size of the context vector appended to the input
        # Keep the number separately: ``self.dropout`` becomes an nn.Dropout module below, and
        # the recurrent layers need the numeric probability, not the module.
        self.dropout_prob = dropout
        self.bidirectional = bidirectional  # Flag indicating bidirectional RNN
        self.device = device  # Device recorded for get_params

        # Components initialization
        self._build_embedding_layer()  # Initialize embedding layer
        self._build_dropout_layer()  # Initialize dropout layer
        self._build_decoder_rnn()  # Initialize decoder RNN
        if use_attention:
            self._build_attention_mechanism()  # Initialize attention mechanism components
        self._build_output_layer()  # Initialize output layer

    def forward(self, input_seq, hidden_state, cell_state=None, encoder_outputs=None):
        """Run one decoding call.

        Returns:
            ``(output, hidden_state, cell_state, attention_weights)``; ``attention_weights`` is
            ``None`` without attention, and ``cell_state`` is returned as passed in unless the
            cell type is ``'lstm'``.
        """
        # Embed input sequence
        embedded_seq = self.dropout(self.embedding(input_seq))

        # Apply attention mechanism if enabled
        attention_weights = None
        if self.use_attention:
            if encoder_outputs is None:
                raise ValueError("encoder_outputs is required when use_attention is True")
            context, attention_weights = self._apply_attention(hidden_state, encoder_outputs)
            embedded_seq = torch.cat((embedded_seq, context), 2)

        # Pass through decoder RNN. torch's recurrent modules return (output, state), where
        # state is an (h, c) pair for LSTM; unpacking three values directly raised ValueError.
        if self.cell_type == 'lstm':
            if cell_state is None:
                cell_state = torch.zeros_like(hidden_state)
            output, (hidden_state, cell_state) = self.decoder_rnn(embedded_seq, (hidden_state, cell_state))
        else:
            output, hidden_state = self.decoder_rnn(embedded_seq, hidden_state)

        # Apply output layer to match output dimension
        output = self.output_layer(output)

        return output, hidden_state, cell_state, attention_weights

    # Method to build embedding layer
    def _build_embedding_layer(self):
        self.embedding = nn.Embedding(self.input_size, self.embedding_size)

    # Method to build dropout layer
    def _build_dropout_layer(self):
        self.dropout = nn.Dropout(self.dropout_prob)

    # Method to build decoder RNN
    def _build_decoder_rnn(self):
        input_size = self.embedding_size + (self.attention_size if self.use_attention else 0)
        # Any cell_type other than 'lstm' / 'gru' falls back to a plain RNN.
        if self.cell_type == 'lstm':
            recurrent_class = nn.LSTM
        elif self.cell_type == 'gru':
            recurrent_class = nn.GRU
        else:
            recurrent_class = nn.RNN
        self.decoder_rnn = recurrent_class(input_size=input_size, hidden_size=self.hidden_size,
                                           num_layers=self.num_layers, bidirectional=self.bidirectional,
                                           dropout=self.dropout_prob)

    # Method to build attention mechanism
    def _build_attention_mechanism(self):
        self.attention_W = nn.Linear(self.hidden_size, self.hidden_size)
        self.attention_U = nn.Linear(self.hidden_size, self.hidden_size)
        self.attention_V = nn.Linear(self.hidden_size, 1)

    # Method to build output layer
    def _build_output_layer(self):
        # A bidirectional stack emits 2 * hidden_size features per step; the projection maps
        # them back to one score per symbol (previously the in/out sizes were swapped around).
        directions = 2 if self.bidirectional else 1
        self.output_layer = nn.Linear(self.hidden_size * directions, self.input_size)

    # Method to apply attention mechanism
    def _apply_attention(self, hidden_state, encoder_outputs):
        # NOTE(review): softmax and sum run over dim 1 and the context gets a leading dim, so
        # this assumes encoder_outputs is (batch, src_len, hidden_size) — TODO confirm.
        # The last slice of the state is the query; the length-1 axis at dim 1 lets it
        # broadcast against every position of encoder_outputs.
        query = hidden_state[-1].unsqueeze(1)
        energy = torch.tanh(self.attention_W(encoder_outputs) + self.attention_U(query))
        score = self.attention_V(energy)
        attention_weights = F.softmax(score, dim=1)
        context_vector = torch.sum(attention_weights * encoder_outputs, dim=1)
        normalized_context_vector = context_vector.unsqueeze(0)

        return normalized_context_vector, attention_weights

    # Method to get decoder parameters
    def get_params(self):
        """Return the constructor settings as a dictionary (``dropout`` is the probability)."""
        return {
            'input_size': self.input_size,
            'embedding_size': self.embedding_size,
            'hidden_size': self.hidden_size,
            'attention_size': self.attention_size,
            'cell_type': self.cell_type,
            'num_layers': self.num_layers,
            'dropout': self.dropout.p,
            'bidirectional': self.bidirectional,
            'device': self.device.type,
            'use_attention': self.use_attention,
        }


In [11]:
import torch
import heapq

class BeamNode:
    """One node of the beam-search tree: a token plus the decoder state used with it.

    Attributes:
        index: token stored at this node.
        path_probability: probability of the path from the root to this node.
        hidden_state / cell_state: decoder state stored with the token.
        parent: previous node on the path, or ``None`` for the root.
        length: number of steps from the root (0 for the root; set by the code that links nodes).
    """

    def __init__(self, index, path_probability, hidden_state, cell_state, parent=None):
        self.index = index
        self.path_probability = path_probability
        self.hidden_state = hidden_state
        self.cell_state = cell_state
        self.parent = parent
        self.length = 0

    @property
    def hidden(self):
        """Read-only alias of ``hidden_state``."""
        return self.hidden_state

    @property
    def cell(self):
        """Read-only alias of ``cell_state``."""
        return self.cell_state

    def __lt__(self, other):
        # heapq is a min-heap and needs "<" on its items. Ranking the MORE probable node as
        # smaller makes heappop return the most probable node first (best-first search).
        return self.path_probability > other.path_probability

def expand_node(model, node):
    """Run the decoder for one node and return its ``model.beam_width`` best continuations.

    Args:
        model: object exposing ``decoder``, ``softmax(tensor, dim=...)`` and ``beam_width``.
        node: search node providing ``index``, ``hidden_state`` and ``cell_state``.

    Returns:
        ``(topk_output, topk_index, dec_hidden, cell)``: the top-k values and indices of the
        softmax over dim 2 of the decoder output, plus the decoder's returned state.
    """
    # BeamNode stores its state as hidden_state / cell_state; the former node.hidden /
    # node.cell attributes do not exist on it.
    output, dec_hidden, cell, _ = model.decoder(node.index, node.hidden_state, node.cell_state, None)
    probabilities = model.softmax(output, dim=2)
    topk_output, topk_index = torch.topk(probabilities, model.beam_width, dim=2)
    return topk_output, topk_index, dec_hidden, cell

def create_child_nodes(model, topk_output, topk_index, dec_hidden, cell, curr_node, min_probability=0.001):
    """Create one child ``BeamNode`` per surviving top-k continuation of ``curr_node``.

    Args:
        model: object exposing ``beam_width``.
        topk_output: top-k probabilities; ``[:, :, j]`` must hold a single element.
        topk_index: matching token indices, same layout as ``topk_output``.
        dec_hidden: decoder hidden state to store on every child.
        cell: decoder cell state to store on every child (may be ``None``).
        curr_node: parent node.
        min_probability: children whose path probability falls below this value are pruned.

    Returns:
        List of child nodes, each with ``length == curr_node.length + 1``.
    """
    child_nodes = []
    for j in range(model.beam_width):
        step_probability = topk_output[:, :, j].item()
        path_probability = curr_node.path_probability * step_probability
        if path_probability < min_probability:
            continue
        # BeamNode takes (index, path_probability, hidden_state, cell_state, parent); the old
        # call passed six positional arguments and raised TypeError.
        child_node = BeamNode(topk_index[:, :, j], path_probability, dec_hidden, cell, curr_node)
        child_node.length = curr_node.length + 1
        child_nodes.append(child_node)
    return child_nodes

def traverse_path(model, path, predicted, i=0):
    """Write the decoder outputs along a finished path into column ``i`` of ``predicted``.

    Walks from ``path`` (the last node) back to the root. A node at depth ``length`` was
    produced by running the decoder on its parent's token and stored state, so that decoder
    output is recomputed and stored in ``predicted[length, i:i+1]``. The root (depth 0)
    writes nothing.

    Args:
        model: object exposing ``decoder``.
        path: last ``BeamNode`` of the chosen path.
        predicted: output tensor, filled in place.
        i: batch column to write (previously an undefined name inside this function).
    """
    node = path
    while node is not None and node.parent is not None:
        parent = node.parent
        output, _, _, _ = model.decoder(parent.index, parent.hidden_state, parent.cell_state, None)
        # assumes the decoder output has a leading axis of size 1 — TODO confirm
        predicted[node.length, i:i+1] = output.squeeze(0)
        node = parent


def beam_search(model, outputs, dec_hiddens, cells, predicted):
    """Best-first beam search, run independently for every batch column.

    For each column the search starts from the token in ``outputs[:, i]`` and the matching
    slice of the decoder state, repeatedly expands the most probable open node with
    ``expand_node`` / ``create_child_nodes``, and stops at the first node popped at depth
    ``model.output_seq_length - 1``. Path probabilities never increase along a path, so that
    node is the most probable complete path. If pruning removes every path, the column is
    decoded greedily instead. Results are written into ``predicted`` in place; nothing is
    returned.

    Args:
        model: object exposing ``eval()``, ``decoder``, ``beam_width`` and ``output_seq_length``.
        outputs: start tokens; column ``i`` is ``outputs[:, i:i+1]``.
        dec_hiddens: initial decoder hidden state, sliced on dim 1 per column.
        cells: initial decoder cell state, or ``None``.
        predicted: tensor receiving the decoder outputs at rows ``1 .. output_seq_length - 1``.
    """
    batch_size = outputs.shape[1]
    final_depth = model.output_seq_length - 1

    model.eval()
    with torch.no_grad():
        for i in range(batch_size):
            index = outputs[:, i:i+1].contiguous()
            dec_hidden = dec_hiddens[:, i:i+1, :].contiguous()
            cell = cells[:, i:i+1, :].contiguous() if cells is not None else None

            root_node = BeamNode(index, 1, dec_hidden, cell, None)

            # Heap entries are (-probability, insertion order, node): heapq pops the smallest
            # entry, so the most probable node comes first, and the counter breaks ties
            # without ever comparing two nodes directly.
            open_list = [(-root_node.path_probability, 0, root_node)]
            pushed = 1
            best_path = None  # reset for every column

            while open_list:
                _, _, curr_node = heapq.heappop(open_list)

                if curr_node.length == final_depth:
                    best_path = curr_node
                    break

                topk_output, topk_index, next_hidden, next_cell = expand_node(model, curr_node)
                for child in create_child_nodes(model, topk_output, topk_index, next_hidden, next_cell, curr_node):
                    heapq.heappush(open_list, (-child.path_probability, pushed, child))
                    pushed += 1

            if best_path is not None:
                traverse_path(model, best_path, predicted, i)
            else:
                # Greedy fallback from the initial state, feeding each argmax token and the
                # updated state back into the decoder.
                for t in range(1, model.output_seq_length):
                    output, dec_hidden, cell, _ = model.decoder(index, dec_hidden, cell, None)
                    predicted[t, i:i+1] = output.squeeze(0)
                    # argmax of the raw scores equals argmax of their softmax
                    index = torch.argmax(output, dim=2)

In [13]:
def scoring(y_dash , y):
    """Count the rows of ``y_dash`` that agree with ``y`` at every position.

    ``y`` must be two-dimensional; the result is a zero-dimensional tensor.
    """
    _, seq_len = y.shape
    matches_per_row = (y_dash == y).sum(dim=1)  # number of agreeing positions in each row
    fully_correct = matches_per_row == seq_len  # True where the whole row agrees
    return fully_correct.sum()

def scoring_prx(y_dash, y):
    """Count exactly matching rows, first coercing ``y_dash`` towards the layout of ``y``.

    A one-dimensional ``y_dash`` is treated as a single row; a two-dimensional one whose
    second dimension differs from ``y``'s is viewed as ``(y.size(0), -1)``.
    """
    rank = y_dash.dim()
    if rank == 1:
        # Add a batch dimension
        y_dash = y_dash.unsqueeze(0)
    elif rank == 2 and y_dash.size(1) != y.size(1):
        # Same number of rows as y, remaining elements spread over the columns
        y_dash = y_dash.view(y.size(0), -1)
    row_is_correct = (y_dash == y).all(dim=1)
    return row_is_correct.sum()

In [14]:
class Seq2Seq(nn.Module):
    """Character-level encoder-decoder transliteration model.

    The encoder consumes the source sequence one time step at a time. Its
    final hidden state (and cell state for LSTMs) is projected by linear
    layers to the decoder's hidden size and layer count and becomes the
    decoder's initial state. Decoding then proceeds step by step using
    teacher forcing, greedy feedback, or beam search.

    Args:
        input_seq_length: Maximum (padded) length of a source sequence.
        output_seq_length: Maximum (padded) length of a target sequence.
        encoder_input_dimension: Number of source characters.
        decoder_input_dimension: Number of target characters; also the width
            of every per-step prediction vector.
        encoder_hidden_dimension: Hidden size of the encoder.
        decoder_hidden_dimension: Hidden size of the decoder.
        encoder_embed_dimension: Embedding size for source characters.
        decoder_embed_dimension: Embedding size for target characters.
        bidirectional: Whether the encoder is bidirectional.
        encoder_num_layers: Number of encoder layers.
        decoder_num_layers: Number of decoder layers.
        cell_type: Recurrent cell name; ``'lstm'`` additionally carries a
            cell state through encoder and decoder.
        dropout: Dropout value forwarded to encoder and decoder.
        beam_width: Beam width; beam search is used only when it is > 1 and
            ``forward`` is called with ``acc_calculate=True``.
        device: Device on which working tensors are allocated.
        attention: Whether the decoder uses attention.
    """

    def __init__(self, input_seq_length = 32,output_seq_length = 29,encoder_input_dimension = 29, decoder_input_dimension = 72,encoder_hidden_dimension = 256,
                 decoder_hidden_dimension =256,encoder_embed_dimension = 256, decoder_embed_dimension = 256, bidirectional = True,encoder_num_layers = 3,
                 decoder_num_layers = 2,cell_type = 'lstm', dropout = 0.2,beam_width = 3,device = device,attention = False):
        super(Seq2Seq, self).__init__()

        # Plain-dict snapshot of the configuration, exposed through getParams().
        self.detail_parameters = {
            'input_seq_length': input_seq_length,
            'output_seq_length': output_seq_length,
            'encoder_input_dimension': encoder_input_dimension,
            'decoder_input_dimension': decoder_input_dimension,
            'encoder_hidden_dimension': encoder_hidden_dimension,
            'encoder_embed_dimension': encoder_embed_dimension,
            'decoder_hidden_dimension': decoder_hidden_dimension,
            'decoder_embed_dimension': decoder_embed_dimension,
            'bidirectional': bidirectional,
            'encoder_num_layers': encoder_num_layers,
            'decoder_num_layers': decoder_num_layers,
            'cell_type': cell_type,
            'dropout': dropout,
            'device': device.type,
        }

        # Maximum source / target sequence lengths.
        self.input_seq_length = input_seq_length
        self.output_seq_length = output_seq_length

        # Source / target vocabulary sizes.
        self.encoder_input_dimension = encoder_input_dimension
        self.decoder_input_dimension = decoder_input_dimension

        # Hidden sizes of encoder and decoder.
        self.encoder_hidden_dimension = encoder_hidden_dimension
        self.decoder_hidden_dimension = decoder_hidden_dimension

        # Embedding sizes for source and target characters.
        self.encoder_embed_dimension = encoder_embed_dimension
        self.decoder_embed_dimension = decoder_embed_dimension

        # direction_value doubles the layer axis when the encoder is bidirectional.
        self.direction = bidirectional
        self.direction_value = 2 if bidirectional else 1

        # Layer counts of encoder and decoder.
        self.encoder_num_layers = encoder_num_layers
        self.decoder_num_layers = decoder_num_layers

        self.cell_type = cell_type
        self.dropout = dropout
        self.device = device
        self.softmax = F.softmax
        self.beam_width = beam_width
        self.use_attention = attention

        # Each state handed from encoder to decoder goes through two linear
        # maps: the first adapts the hidden size, the second the layer count.
        # Hidden state:
        self.enc_dec_linear1 = nn.Linear(encoder_hidden_dimension,decoder_hidden_dimension)
        self.enc_dec_linear2 = nn.Linear(encoder_num_layers*self.direction_value,decoder_num_layers*self.direction_value)

        # LSTM cell state:
        self.enc_dec_cell_linear1 = nn.Linear(encoder_hidden_dimension,decoder_hidden_dimension)
        self.enc_dec_cell_linear2 = nn.Linear(encoder_num_layers*self.direction_value,decoder_num_layers*self.direction_value)

        # Per-step encoder states kept for attention:
        self.enc_dec_att_linear1 = nn.Linear(encoder_hidden_dimension,decoder_hidden_dimension)
        self.enc_dec_att_linear2 = nn.Linear(encoder_num_layers*self.direction_value,decoder_num_layers*self.direction_value)

        self.encoder = Encoder(input_dimension = self.encoder_input_dimension,embed_dimension = self.encoder_embed_dimension,
                               hidden_dimension =  self.encoder_hidden_dimension,cell_type = self.cell_type,layers = self.encoder_num_layers,
                               bidirectional = self.direction,dropout = self.dropout, device = self.device)

        self.decoder = Decoder(input_dimension = self.decoder_input_dimension,embed_dimension = self.decoder_embed_dimension,hidden_dimension = self.decoder_hidden_dimension,
                               attention_dimension = self.decoder_hidden_dimension,cell_type = self.cell_type,layers = self.decoder_num_layers,
                               dropout = self.dropout,device = self.device,use_attention = self.use_attention)

    def getParams(self):
        """Return the dictionary of constructor settings recorded at build time."""
        return self.detail_parameters

    @staticmethod
    def _project_state(state, hidden_projection, layer_projection):
        """Map an encoder state to decoder dimensions.

        ``hidden_projection`` acts on the last axis. The tensor is then
        permuted with ``(2, 1, 0)`` so that ``layer_projection`` acts on what
        was the first axis, and permuted back.
        """
        state = hidden_projection(state)
        state = state.permute(2, 1, 0).contiguous()
        state = layer_projection(state)
        return state.permute(2, 1, 0).contiguous()

    def forwardFls(self, source_sequence, target_sequence, use_teacher_forcing, calculate_accuracy=False):
        """Alias of :meth:`forward`, kept for callers that use this name.

        The previous body called helper methods that this class never
        defined, so it raised ``AttributeError`` on its first statement.
        Delegating keeps the same signature and the same
        ``(predicted, attention_weights)`` return pair.
        """
        return self.forward(source_sequence, target_sequence, use_teacher_forcing, calculate_accuracy)

    def forward(self, input, target ,teacher_force, acc_calculate = False):
        """Encode ``input`` and decode a full output sequence.

        Args:
            input: Source index tensor read as ``input[:, t]`` for each of the
                ``input_seq_length`` steps; ``input.shape[0]`` is the batch size.
            target: Target index tensor read as ``target[:, t-1]``. Only used
                when ``teacher_force`` is true, so it may be ``None`` otherwise.
            teacher_force: Feed ground-truth characters to the decoder instead
                of its own previous prediction.
            acc_calculate: When true and ``beam_width > 1`` (and teacher
                forcing is off), decode with ``BeamSearch`` instead of greedily.

        Returns:
            ``(predicted, attn_weights)`` where ``predicted`` has shape
            ``(output_seq_length, batch, decoder_input_dimension)`` and
            ``attn_weights`` has shape ``(output_seq_length, input_seq_length,
            direction_value * decoder_num_layers, batch)``. ``attn_weights``
            is only written during greedy decoding with attention enabled.
        """
        batch_size = input.shape[0]

        # Initial encoder state; LSTMs carry a cell state as well.
        enc_hidden = self.encoder.init_hidden(batch_size)
        cell = self.encoder.init_hidden(batch_size) if self.cell_type == 'lstm' else None

        # With attention, every projected encoder state is kept for the decoder.
        encoder_outputs = None
        if self.use_attention:
            encoder_outputs = torch.zeros(self.input_seq_length, self.direction_value*self.decoder_num_layers,
                                          batch_size, self.decoder_hidden_dimension, device=self.device)

        # Feed the source one character (per batch element) at a time.
        for t in range(self.input_seq_length):
            _, enc_hidden, cell = self.encoder.forward(input[:, t].unsqueeze(0), enc_hidden, cell)
            if self.use_attention:
                encoder_outputs[t] = self._project_state(enc_hidden, self.enc_dec_att_linear1, self.enc_dec_att_linear2)

        # Per-step predictions (used for the loss) and attention weights (used for heatmaps).
        predicted = torch.zeros(self.output_seq_length, batch_size, self.decoder_input_dimension, device=self.device)
        attn_weights = torch.zeros(self.output_seq_length, self.input_seq_length,
                                   self.direction_value*self.decoder_num_layers, batch_size, device=self.device)

        # The encoder's final state, projected to decoder dimensions, seeds the decoder.
        dec_hidden = self._project_state(enc_hidden, self.enc_dec_linear1, self.enc_dec_linear2)
        if self.cell_type == 'lstm':
            cell = self._project_state(cell, self.enc_dec_cell_linear1, self.enc_dec_cell_linear2)

        # First decoder input is index 1 (<S>); step 0 of the prediction marks the same index.
        output = torch.ones(1, batch_size, dtype=torch.long, device=self.device)
        predicted[0, :, 1] = 1.0
        attention_weights = None

        for t in range(1, self.output_seq_length):
            if teacher_force:
                # Feed the ground-truth character of the previous step.
                output, dec_hidden, cell, attention_weights = self.decoder.forward(
                    target[:, t-1].unsqueeze(0), dec_hidden, cell, encoder_outputs)
                predicted[t] = output.squeeze(0)
                continue

            if self.beam_width > 1 and acc_calculate:
                # Beam search fills `predicted` for all remaining steps itself.
                beam = BeamSearch()
                beam.beamSearch(self, output, dec_hidden, cell, predicted)
                break

            # Greedy step: decode once, store the raw scores, feed back the argmax.
            output, dec_hidden, cell, attention_weights = self.decoder.forward(output, dec_hidden, cell, encoder_outputs)
            predicted[t] = output.squeeze(0)
            if self.use_attention:
                attn_weights[t] = attention_weights.squeeze(3)

            output = self.softmax(output, dim=2)
            output = torch.argmax(output, dim=2)

        return predicted, attn_weights
    



In [15]:
def train(data_loader, val_loader ,epochs, beam):
    """Train the notebook-level ``model`` and print loss/accuracy after every epoch.

    Teacher forcing is used while ``epoch < epochs/2``; afterwards the decoder
    is fed its own predictions. After each epoch the model is evaluated
    (without teacher forcing, without beam search) on ``val_loader`` and then
    on ``data_loader``.

    Args:
        data_loader: Training loader yielding ``(source, target)`` batches.
        val_loader: Validation loader yielding ``(source, target)`` batches.
        epochs: Number of epochs to run.
        beam: Accepted for interface compatibility; not used.
    """
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    def _sequence_loss(logits, targets):
        """Cross-entropy between model output and one-hot encoded targets."""
        num_classes = logits.shape[-1]
        # model.forward returns (seq, batch, classes); targets are (batch, seq).
        flat_logits = logits.permute(1, 0, 2).reshape(-1, num_classes)
        flat_expected = F.one_hot(targets, num_classes=num_classes).float().reshape(-1, num_classes)
        return criterion(flat_logits, flat_expected)

    def _evaluate(loader):
        """Return (sum of batch losses, number of exactly matching sequences)."""
        loss_sum = 0
        exact_matches = 0
        model.eval()
        with torch.no_grad():
            for inputs, targets in loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                # Targets are not passed: with teacher forcing off, forward never reads them.
                logits, _ = model.forward(inputs, None, False, False)

                predicted_ids = torch.argmax(F.softmax(logits, dim=2), dim=2).T
                exact_matches += scoring(predicted_ids, targets)
                loss_sum += _sequence_loss(logits, targets).item()
        return loss_sum, exact_matches

    model.train()
    for epoch in tqdm(range(epochs)):
        teacher_force = epoch < epochs/2

        for source, target in data_loader:
            source = source.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            output, _ = model.forward(source, target, teacher_force, False)
            loss = _sequence_loss(output, target)
            loss.backward()

            # Clip the gradient norm to 1 so that updates cannot explode.
            nn.utils.clip_grad_norm_(model.parameters(),1)
            optimizer.step()

        val_loss, val_score = _evaluate(val_loader)
        train_loss, train_score = _evaluate(data_loader)

        # Evaluation switched the model to eval mode; restore training mode.
        model.train()

        print(f'epoch {epoch}')
        print(f'train loss => {train_loss/len(data_loader)} \ntrain_acc => {train_score/len(data_loader.dataset)}')
        print(f'valid loss => {val_loss/len(val_loader)} \nvalid_acc => {val_score/len(val_loader.dataset)}')


def train_prx(data_loader, val_loader, epochs, beam):
    """Train the notebook-level ``model`` by delegating each epoch to ``_run_epoch``.

    Every epoch runs one optimisation pass over ``data_loader`` followed by
    one pass over ``val_loader`` without an optimizer, then prints the four
    returned metrics. ``beam`` is accepted but not used.
    """
    adam = optim.Adam(model.parameters())
    loss_fn = nn.CrossEntropyLoss()
    model.train()
    no_attention = None

    for epoch in tqdm(range(epochs)):
        train_loss, train_score = _run_epoch(
            data_loader, adam, loss_fn, model, epoch, epochs, no_attention, train_mode=True
        )
        val_loss, val_score = _run_epoch(
            val_loader, None, loss_fn, model, epoch, epochs, no_attention, train_mode=False
        )

        print(f'epoch {epoch}')
        print(f'train loss => {train_loss} \ntrain_acc => {train_score}')
        print(f'valid loss => {val_loss} \nvalid_acc => {val_score}')


def _run_epoch(data_loader, optimizer, criterion, model, epoch, epochs, attention_weights, train_mode=True):
    """Run one pass over ``data_loader`` and return ``(mean batch loss, accuracy)``.

    Args:
        data_loader: Loader yielding ``(source, target)`` batches.
        optimizer: Optimizer used for parameter updates, or ``None`` to skip them.
        criterion: Loss applied to flattened logits and one-hot targets.
        model: Model whose ``forward(source, target, teacher_force, acc_calculate)``
            returns ``(logits, attention_weights)``.
        epoch: Index of the current epoch.
        epochs: Total number of epochs; teacher forcing is active while
            ``epoch < epochs/2``.
        attention_weights: Accepted for interface compatibility; overwritten
            by the model output.
        train_mode: ``True`` for a training pass, ``False`` for evaluation.

    Returns:
        Sum of batch losses divided by the number of batches, and the number
        of exactly matching sequences divided by ``len(data_loader.dataset)``.
    """
    total_loss = 0
    total_score = 0

    if train_mode:
        model.train()
    else:
        model.eval()

    # Teacher forcing is a training aid only: feeding ground truth during
    # evaluation would leak the targets into the predictions.
    teacher_force = train_mode and epoch < epochs/2
    # Parameters are only updated in training passes that supply an optimizer.
    update_parameters = train_mode and optimizer is not None

    with torch.set_grad_enabled(train_mode):
        for source, target in data_loader:
            source, target = source.to(device), target.to(device)

            if update_parameters:
                optimizer.zero_grad()

            output, attention_weights = model.forward(source, target, teacher_force, False)

            num_classes = output.shape[-1]
            output = output.permute(1, 0, 2).reshape(-1, num_classes)
            expected = F.one_hot(target, num_classes=num_classes).float().reshape(-1, num_classes)
            loss = criterion(output, expected)

            if update_parameters:
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 1)
                optimizer.step()

            total_loss += loss.item()

            # Accuracy is tracked for training and evaluation passes alike.
            acc_output = torch.argmax(F.softmax(output, dim=1), dim=1)
            # Undo the flattening so predictions line up with `target`.
            acc_output = acc_output.view(target.size())
            total_score += scoring(acc_output, target)

    avg_loss = total_loss / len(data_loader)
    avg_score = total_score / len(data_loader.dataset)

    return avg_loss, avg_score


# Baseline run: 3-layer bidirectional LSTM encoder, 2-layer LSTM decoder,
# 256-wide embeddings and hidden states, trained for 10 epochs.
model = Seq2Seq(
    encoder_hidden_dimension=256,
    decoder_hidden_dimension=256,
    encoder_embed_dimension=256,
    decoder_embed_dimension=256,
    bidirectional=True,
    encoder_num_layers=3,
    decoder_num_layers=2,
    cell_type='lstm',
    dropout=0.2,
    beam_width=3,
    device=device,
    attention=False,  # the constructor argument is `attention`; it is stored as `use_attention`
).to(device)

epochs = 10
train(train_loader, val_loader, epochs, beam=False)

 10%|█         | 1/10 [00:36<05:28, 36.48s/it]

epoch 0
train loss => 1.5944575417041777 
train_acc => 0.0
valid loss => 1.3929478898644447 
valid_acc => 0.0


 20%|██        | 2/10 [01:12<04:47, 35.99s/it]

epoch 1
train loss => 1.3363674068450928 
train_acc => 0.011171874590218067
valid loss => 1.0846822932362556 
valid_acc => 0.03125


 30%|███       | 3/10 [01:47<04:10, 35.77s/it]

epoch 2
train loss => 1.182860615849495 
train_acc => 0.1096484363079071
valid loss => 0.9436689093708992 
valid_acc => 0.155029296875


 40%|████      | 4/10 [02:23<03:34, 35.68s/it]

epoch 3
train loss => 1.1087348762154579 
train_acc => 0.18617187440395355
valid loss => 0.8968608416616917 
valid_acc => 0.226806640625


 50%|█████     | 5/10 [02:58<02:58, 35.71s/it]

epoch 4
train loss => 1.0265201181173325 
train_acc => 0.26054686307907104
valid loss => 0.868542242795229 
valid_acc => 0.2783203125


 60%|██████    | 6/10 [03:34<02:22, 35.74s/it]

epoch 5
train loss => 0.5348665207624436 
train_acc => 0.07634765654802322
valid loss => 0.47036867402493954 
valid_acc => 0.098876953125


 70%|███████   | 7/10 [04:10<01:47, 35.76s/it]

epoch 6
train loss => 0.47107376873493195 
train_acc => 0.15576171875
valid loss => 0.42788021452724934 
valid_acc => 0.179931640625


 70%|███████   | 7/10 [04:43<02:01, 40.43s/it]


KeyboardInterrupt: 

In [None]:
# Same as the train loop, but used for wandb running
def runModel(model, data_loader, val_loader ,epochs, beam):
    """Train ``model`` and return per-epoch loss/accuracy histories.

    Teacher forcing is used while ``epoch < epochs/2``. After every epoch the
    model is evaluated without teacher forcing on ``val_loader`` and then on
    ``data_loader``; ``beam`` is forwarded to ``model.forward`` as its
    ``acc_calculate`` argument during those evaluations.

    Args:
        model: Model to train.
        data_loader: Training loader yielding ``(source, target)`` batches.
        val_loader: Validation loader yielding ``(source, target)`` batches.
        epochs: Number of epochs to run.
        beam: Passed through to ``model.forward`` during evaluation.

    Returns:
        ``(train_loss_list, val_loss_list, train_accuracy_list,
        val_accuracy_list)`` with one entry per epoch. Losses are averaged
        over batches; accuracies are exact-match counts divided by the
        dataset size.
    """
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    def _sequence_loss(logits, targets):
        """Cross-entropy between model output and one-hot encoded targets."""
        num_classes = logits.shape[-1]
        flat_logits = logits.permute(1, 0, 2).reshape(-1, num_classes)
        flat_expected = F.one_hot(targets, num_classes=num_classes).float().reshape(-1, num_classes)
        return criterion(flat_logits, flat_expected)

    def _evaluate(loader):
        """Return (sum of batch losses, number of exactly matching sequences)."""
        loss_sum = 0
        exact_matches = 0
        model.eval()
        with torch.no_grad():
            for inputs, targets in loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                logits, _ = model.forward(inputs, None, False, beam)

                predicted_ids = torch.argmax(F.softmax(logits, dim=2), dim=2).T
                exact_matches += scoring(predicted_ids, targets)
                loss_sum += _sequence_loss(logits, targets).item()
        return loss_sum, exact_matches

    model.train()
    train_loss_list = []
    val_loss_list = []
    train_accuracy_list = []
    val_accuracy_list = []

    for epoch in tqdm(range(epochs)):
        teacher_force = epoch < epochs/2

        for source, target in data_loader:
            source = source.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            output, _ = model.forward(source, target, teacher_force, False)
            loss = _sequence_loss(output, target)

            loss.backward()
            # Clip the gradient norm to 1 so that updates cannot explode.
            nn.utils.clip_grad_norm_(model.parameters(),1)
            optimizer.step()

        val_loss, val_score = _evaluate(val_loader)
        train_loss, train_score = _evaluate(data_loader)

        # Evaluation switched the model to eval mode; restore training mode.
        model.train()

        print(f'epoch {epoch}')
        print(f'train loss => {train_loss/len(data_loader)} \ntrain_acc => {train_score/len(data_loader.dataset)}')
        print(f'valid loss => {val_loss/len(val_loader)} \nvalid_acc => {val_score/len(val_loader.dataset)}')
        train_loss_list.append(train_loss/len(data_loader))
        val_loss_list.append(val_loss/len(val_loader))
        train_accuracy_list.append(train_score/len(data_loader.dataset))
        val_accuracy_list.append(val_score/len(val_loader.dataset))

    return train_loss_list,val_loss_list,train_accuracy_list,val_accuracy_list


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

def run_model(model, train_loader, val_loader, epochs, beam):
    """Train ``model`` for ``epochs`` epochs and return the metric histories.

    Each epoch calls ``train_epoch`` on ``train_loader`` and ``evaluate`` on
    ``val_loader``, records the four returned numbers and prints them.

    NOTE(review): ``evaluate``, ``compute_loss`` and ``compute_accuracy`` must
    exist at module scope. The only definitions visible near this cell are
    nested inside ``runModel`` and are not reachable from here; confirm they
    are defined elsewhere in the notebook.

    Returns:
        ``(train_loss_list, val_loss_list, train_accuracy_list,
        val_accuracy_list)`` with one entry per epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    train_loss_list, val_loss_list = [], []
    train_accuracy_list, val_accuracy_list = [], []

    for epoch in tqdm(range(epochs)):
        # The epoch position is handed over explicitly so that train_epoch can
        # apply its teacher-forcing schedule without relying on globals.
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device, beam,
                                            epoch=epoch, epochs=epochs)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device, beam)

        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)
        train_accuracy_list.append(train_acc)
        val_accuracy_list.append(val_acc)

        print(f'Epoch {epoch}:')
        print(f'Train loss: {train_loss:.4f}, Train accuracy: {train_acc:.4f}')
        print(f'Validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}')

    return train_loss_list, val_loss_list, train_accuracy_list, val_accuracy_list

def train_epoch(model, data_loader, optimizer, criterion, device, beam, epoch=0, epochs=1):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Model to train; called as ``model(source, target, teacher_force, False)``.
        data_loader: Loader yielding ``(source, target)`` batches.
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss function handed to ``compute_loss``.
        device: Device the batches are moved to.
        beam: Accepted for interface compatibility; not used during training.
        epoch: Index of the current epoch.
        epochs: Total number of epochs. Teacher forcing is active while
            ``epoch < epochs/2``; with the defaults (0 and 1) it is always on.

    Returns:
        ``(avg_loss, avg_accuracy)``: summed batch loss divided by the number
        of batches, and the summed ``compute_accuracy`` results divided by
        ``len(data_loader.dataset)``.
    """
    model.train()
    total_loss, total_correct = 0, 0
    teacher_force = epoch < epochs/2

    for source, target in data_loader:
        source, target = source.to(device), target.to(device)

        optimizer.zero_grad()

        output, _ = model(source, target, teacher_force, False)
        loss = compute_loss(output, target, criterion)

        loss.backward()
        # Clip the gradient norm to 1 so that updates cannot explode.
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        total_loss += loss.item()
        total_correct += compute_accuracy(output, target)

    avg_loss = total_loss / len(data_loader)
    avg_accuracy = total_correct / len(data_loader.dataset)
    return avg_loss, avg_accuracy


In [None]:
def train_wandb():
    """Run one W&B trial: build a model from ``wandb.config``, train it, log the metrics.

    The run is named after its hyper-parameters. Training goes through
    ``runModel`` for 20 epochs with ``beam`` set to ``True``, and the
    per-epoch histories are logged once training has finished.
    """
    wandb.init(project="dl-assignment-3-final")

    wandb.run.name = f'inp_embed_{wandb.config.input_embedding}_enclayer_{wandb.config.number_of_enc_layer}_declayer_{wandb.config.number_of_dec_layer}_hidden_{wandb.config.hidden_size}_cell_{wandb.config.cell_type}_drop_{wandb.config.dropout}'

    # Hyper-parameters of this trial as provided through wandb.config.
    hyper = wandb.config
    model = Seq2Seq(
        encoder_hidden_dimension=hyper.hidden_size,
        decoder_hidden_dimension=hyper.hidden_size,
        encoder_embed_dimension=hyper.input_embedding,
        decoder_embed_dimension=hyper.input_embedding,
        bidirectional=hyper.bidirectional,
        encoder_num_layers=hyper.number_of_enc_layer,
        decoder_num_layers=hyper.number_of_dec_layer,
        cell_type=hyper.cell_type,
        dropout=hyper.dropout,
        beam_width=hyper.beam_width,
        device=device,
        attention=False,
    )
    model.to(device)

    epochs = 20
    histories = runModel(model, train_loader, val_loader, epochs, True)
    train_losses, val_losses, train_accuracies, val_accuracies = histories

    # One log call per epoch, in epoch order.
    for step in range(epochs):
        epoch_metrics = {
            'validation_loss': val_losses[step],
            'training_loss': train_losses[step],
            'validation_accuracy': val_accuracies[step],
            'training_accuracy': train_accuracies[step],
        }
        wandb.log(epoch_metrics)

In [None]:
import wandb

def train_wandb(config=None, train_loader=None, val_loader=None):
    """Run one W&B trial: build a model from the run config, train it, log the metrics.

    All parameters are optional so that the function can be used directly as
    the ``function`` argument of ``wandb.agent``, which calls it without
    arguments. In that case hyper-parameters come from ``wandb.config`` and
    the loaders are the notebook-level ``train_loader`` / ``val_loader``.

    Args:
        config: Optional configuration forwarded to ``wandb.init``.
        train_loader: Training loader; defaults to the global ``train_loader``.
        val_loader: Validation loader; defaults to the global ``val_loader``.

    Raises:
        ValueError: If a loader is neither passed nor available as a global.
    """
    # The parameter names shadow the notebook-level loaders, so the fallback
    # has to go through globals().
    if train_loader is None:
        train_loader = globals().get('train_loader')
    if val_loader is None:
        val_loader = globals().get('val_loader')
    if train_loader is None or val_loader is None:
        raise ValueError('train_wandb needs train_loader and val_loader, either as arguments or as notebook globals')

    # Initialize wandb run
    wandb.init(project="dl-assignment-3-final", config=config)

    # After init, wandb.config holds the effective settings of this run
    # (the explicit config and/or the values chosen by a sweep).
    config = wandb.config

    # Construct run name based on config parameters
    run_name = f'inp_embed_{config.input_embedding}_enclayer_{config.number_of_enc_layer}_declayer_{config.number_of_dec_layer}_hidden_{config.hidden_size}_cell_{config.cell_type}_drop_{config.dropout}'
    wandb.run.name = run_name

    # Initialize model with config parameters
    model = Seq2Seq(
        encoder_hidden_dimension=config.hidden_size,
        decoder_hidden_dimension=config.hidden_size,
        encoder_embed_dimension=config.input_embedding,
        decoder_embed_dimension=config.input_embedding,
        bidirectional=config.bidirectional,
        encoder_num_layers=config.number_of_enc_layer,
        decoder_num_layers=config.number_of_dec_layer,
        cell_type=config.cell_type,
        dropout=config.dropout,
        beam_width=config.beam_width,
        device=device,
        attention=False
    )
    model.to(device)

    epochs = 10
    beam = True
    # Run model training
    train_loss_list, val_loss_list, train_accuracy_list, val_accuracy_list = runModel(model, train_loader, val_loader, epochs, beam)

    # Log training metrics into wandb, one call per epoch
    for i in range(epochs):
        wandb.log({
            'validation_loss': val_loss_list[i],
            'training_loss': train_loss_list[i],
            'validation_accuracy': val_accuracy_list[i],
            'training_accuracy': train_accuracy_list[i]
        })


In [None]:
# Bayesian hyper-parameter search that maximises the logged validation accuracy.
sweep_configuration = {
    'method': 'bayes',
    'name': 'Deep_Learning_Assignment3',
    'metric': {
        'goal': 'maximize',
        'name': 'validation_accuracy'
        },
    'parameters': {
        'epochs': {'values': [5,10,15]},
        'batchsize': {'values': [64, 128, 256]},
        'input_embedding': {'values': [16,32,64,128,256]},
        'number_of_enc_layer': {'values': [1,2,3]},
        'number_of_dec_layer': {'values': [1,2,3]},
        'hidden_size': {'values': [16,32,64,256]},
        'cell_type': {'values': ['rnn','gru','lstm']},
        'dropout': {'values': [0.2,0.3]},
        'bidirectional' : {'values' : [True]},
        # train_wandb reads wandb.config.beam_width; 3 matches the Seq2Seq default.
        'beam_width': {'values': [3]},
        'attention': {'values': [False]}
     }
}

# Authenticate with W&B, register the sweep and hand train_wandb to an agent.
# NOTE(review): KEY is not defined in this cell; presumably set elsewhere.
# Confirm it is read from the environment or a secret store, not hard-coded.
wandb.login(key = KEY)
sweep_id = wandb.sweep(sweep=sweep_configuration, project='dl-assignment-3-final')
# train_wandb is passed as the per-run callback; count=30 is the requested number of runs.
wandb.agent(sweep_id, function=train_wandb, count=30)
wandb.finish()

In [None]:
def evaluate_model(model, data_loader):
    """Evaluate ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(inputs, None, False, False)`` and
            returning ``(outputs, extra)``. ``outputs`` holds per-class scores
            and is assumed to be laid out as (seq_len, batch, num_classes)
            -- TODO confirm against the model's forward().
        data_loader: Sized iterable of ``(inputs, targets)`` tensor pairs.
            ``targets`` holds integer class indices, assumed (batch, seq_len).

    Returns:
        ``(avg_loss, accuracy)`` as Python floats: the cross-entropy loss
        averaged over batches, and the fraction of target positions predicted
        correctly over the whole loader (every position counts; nothing is
        masked). Returns ``(0.0, 0.0)`` when the loader yields no elements.
    """
    model.eval()  # Switch off dropout and other train-only behaviour
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    total_correct = 0.0  # Running count of correctly predicted positions
    total_positions = 0  # Running count of compared positions

    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Inference-style forward pass (no target sequence supplied)
            outputs, _ = model(inputs, None, False, False)

            # argmax over raw scores equals argmax over softmax, so the
            # softmax is skipped; transpose to line up with `targets`.
            predictions = torch.argmax(outputs, dim=2).transpose(0, 1)

            # calculate_accuracy returns a per-batch fraction; weight it by the
            # batch's element count so unequal batches are aggregated correctly.
            batch_positions = targets.numel()
            total_correct += float(calculate_accuracy(predictions, targets)) * batch_positions
            total_positions += batch_positions

            # permute() yields a non-contiguous tensor, which view() cannot
            # flatten; reshape() copies when needed. The class count is taken
            # from the scores instead of being hard-coded.
            logits = outputs.permute(1, 0, 2).reshape(-1, outputs.size(-1))
            # Class-index targets give the same mean loss as one-hot targets.
            loss = criterion(logits, targets.reshape(-1))
            total_loss += loss.item()

    if total_positions == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0, 0.0

    avg_loss = total_loss / len(data_loader)
    accuracy = total_correct / total_positions
    return avg_loss, accuracy

def calculate_accuracy(predictions, targets):
    """Return the fraction of positions where ``predictions`` equals ``targets``.

    The comparison is element-wise and the result is a 0-dim float tensor:
    the number of matching elements divided by ``targets.numel()``.
    """
    matches = (predictions == targets).sum()
    return matches.float() / targets.numel()

def log_results(avg_loss, accuracy):
    """Send the given loss and accuracy to W&B as 'test_loss' / 'test_accuracy'."""
    metrics = {'test_loss': avg_loss, 'test_accuracy': accuracy}
    wandb.log(metrics)

def test_model_and_log_results(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and record the metrics in a W&B run.

    Logs in to W&B with ``KEY``, opens a run in the 'dl-assignment-3-final'
    project, evaluates the model with ``evaluate_model``, prints the resulting
    loss and accuracy, and logs them through ``log_results``.

    Args:
        model: Model passed unchanged to ``evaluate_model``.
        test_loader: Data loader passed unchanged to ``evaluate_model``.
    """
    wandb.login(key=KEY)
    wandb.init(project='dl-assignment-3-final')
    try:
        wandb.run.name = 'vannilla_test'  # Display name of this run in W&B

        test_loss, test_accuracy = evaluate_model(model, test_loader)
        print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')
        log_results(test_loss, test_accuracy)
    finally:
        # Close the run even if evaluation or logging raises.
        wandb.finish()

# Usage
#test_model_and_log_results(model, test_loader)  # Call function to test model and log results