### Importing necessary modules

In [None]:
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import gc
import random
import numpy as np 
import pandas as pd

In [None]:
# Pick the compute device once: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Notebook display of the torch.device selected above.
device

### Preprocessing

In [None]:
# Training split: the CSV is read without a header row and its two columns
# are named 'English' and 'Hindi'.
traindata = pd.read_csv(
    '/kaggle/input/aksharantar-sampled/aksharantar_sampled/hin/hin_train.csv',
    header=None,
    names=['English', 'Hindi'],
)

In [None]:
# Test split: the CSV is read without a header row and its two columns
# are named 'English' and 'Hindi'.
testdata = pd.read_csv(
    '/kaggle/input/aksharantar-sampled/aksharantar_sampled/hin/hin_test.csv',
    header=None,
    names=['English', 'Hindi'],
)

In [None]:
# Validation split: the CSV is read without a header row and its two columns
# are named 'English' and 'Hindi'.
valdata = pd.read_csv(
    '/kaggle/input/aksharantar-sampled/aksharantar_sampled/hin/hin_valid.csv',
    header=None,
    names=['English', 'Hindi'],
)

In [None]:
# Notebook display of the training DataFrame loaded above.
traindata

In [None]:
def tokenize(word):
    """Return the items obtained by iterating *word*, as a new list.

    For a string this is the list of its characters, in order.
    """
    return [symbol for symbol in word]

In [None]:
# Running maxima of word length (in characters) for each split and language.
# They start at zero and are raised by the scanning cells that follow.
max_eng_len = max_hin_len = 0
test_max_eng_len = test_max_hin_len = 0
val_max_eng_len = val_max_hin_len = 0

In [None]:
# Longest English word in the test split, measured in characters.
# Iterating the column directly avoids a positional ``iloc`` lookup per row
# and the hand-rolled character counter; ``default=0`` keeps an empty split
# from raising.
test_max_eng_len = max(
    test_max_eng_len,
    max((len(word) for word in testdata['English']), default=0),
)
print(test_max_eng_len)

In [None]:
# Longest Hindi word in the test split, measured in characters.
# Iterating the column directly avoids a positional ``iloc`` lookup per row
# and the hand-rolled character counter; ``default=0`` keeps an empty split
# from raising.
test_max_hin_len = max(
    test_max_hin_len,
    max((len(word) for word in testdata['Hindi']), default=0),
)
print(test_max_hin_len)

In [None]:
# Longest English and Hindi words in the validation split, in characters.
# The Hindi maximum is computed here as well: without it val_max_hin_len
# keeps its initial value, the validation targets are never padded to a
# common width, and they cannot be stacked into a tensor.
val_max_eng_len = max(
    val_max_eng_len,
    max((len(word) for word in valdata['English']), default=0),
)
val_max_hin_len = max(
    val_max_hin_len,
    max((len(word) for word in valdata['Hindi']), default=0),
)
print(val_max_eng_len)
print(val_max_hin_len)

In [None]:
# English character vocabulary of the training split, in first-seen order.
# dict.fromkeys de-duplicates while preserving order, replacing the
# list-membership test per character and the per-row ``iloc`` lookup.
English_vocab = list(dict.fromkeys(
    ch for word in traindata['English'] for ch in word
))
# Longest English training word, in characters.
max_eng_len = max(
    max_eng_len,
    max((len(word) for word in traindata['English']), default=0),
)
print(sorted(English_vocab))
print(max_eng_len)

In [None]:
# Hindi character vocabulary of the training split, in first-seen order.
Hindi_vocab = list(dict.fromkeys(
    ch for word in traindata['Hindi'] for ch in word
))
# Longest Hindi training word, in characters.
max_hin_len = max(
    max_hin_len,
    max((len(word) for word in traindata['Hindi']), default=0),
)
# Add (and print) characters that occur only in the held-out splits.
# The validation split is included alongside the test split because the
# validation tokenizer indexes the same table and would raise KeyError on a
# character that is missing from it.
seen_hindi_chars = set(Hindi_vocab)
for held_out in (testdata, valdata):
    for word in held_out['Hindi']:
        for ch in word:
            if ch not in seen_hindi_chars:
                print(ch)
                seen_hindi_chars.add(ch)
                Hindi_vocab.append(ch)

In [None]:
# Put both vocabularies in sorted order so the ids assigned from them below
# do not depend on the order in which characters were first seen.
English_vocab.sort()
Hindi_vocab.sort()

In [None]:
# English lookup tables: character -> id and id -> character.
# Ids 0, 1 and 2 are reserved for the special tokens, so characters start at 3.
Eng_dict = {ch: idx for idx, ch in enumerate(English_vocab, start=3)}
reverse_Eng = dict(enumerate(English_vocab, start=3))

for idx, special in enumerate(('<sow>', '<eow>', '<pad>')):
    Eng_dict[special] = idx
    reverse_Eng[idx] = special

print(Eng_dict)
print(reverse_Eng)

In [None]:
# Hindi lookup tables: character -> id and id -> character.
# Ids 0, 1 and 2 are reserved for the special tokens, so characters start at 3.
Hin_dict = {ch: idx for idx, ch in enumerate(Hindi_vocab, start=3)}
reverse_Hin = dict(enumerate(Hindi_vocab, start=3))
for idx, special in enumerate(('<sow>', '<eow>', '<pad>')):
    Hin_dict[special] = idx
    reverse_Hin[idx] = special
print(Hin_dict)
print(reverse_Hin)

In [None]:
def Eng_tokenize(word):
    """Map an English training word to ids, right-padded to max_eng_len.

    Each character is looked up in Eng_dict (an unknown character raises
    KeyError). Words already at least max_eng_len long are returned unpadded.
    """
    ids = [Eng_dict[ch] for ch in word]
    # A non-positive repeat count yields an empty list, i.e. no padding.
    ids.extend([Eng_dict['<pad>']] * (max_eng_len - len(ids)))
    return ids

In [None]:
def Hin_tokenize(word):
    """Map a Hindi training word to ids, add '<eow>', pad to max_hin_len + 1.

    The target width is read from the global max_hin_len at call time, so the
    result depends on whether that global has already been incremented.
    """
    ids = [Hin_dict[ch] for ch in word]
    ids.append(Hin_dict['<eow>'])
    # A non-positive repeat count yields an empty list, i.e. no padding.
    ids.extend([Hin_dict['<pad>']] * ((max_hin_len + 1) - len(ids)))
    return ids

In [None]:
# Tokenise and pad every training pair. Iterating the columns directly
# avoids one positional ``iloc`` row lookup per word.
eng_word = [Eng_tokenize(word) for word in traindata['English']]
hin_word = [Hin_tokenize(word) for word in traindata['Hindi']]

In [None]:
# Turn the padded id lists into tensors (one row per training word).
eng_word, hin_word = torch.tensor(eng_word), torch.tensor(hin_word)

In [None]:
# Widen every Hindi length by one position for the '<eow>' token that the
# Hindi tokenizers append.
# NOTE: this cell is not idempotent - re-running it widens the lengths again.
max_hin_len = max_hin_len + 1
test_max_hin_len = test_max_hin_len + 1
val_max_hin_len = val_max_hin_len + 1

In [None]:
def _encode_padded(word, vocab, length, add_eow):
    """Look up each character of *word* in *vocab* and right-pad to *length*.

    When *add_eow* is true the '<eow>' id is appended before padding. Words
    that already reach *length* are returned unpadded. Unknown characters
    raise KeyError.
    """
    ids = [vocab[ch] for ch in word]
    if add_eow:
        ids.append(vocab['<eow>'])
    # A non-positive repeat count yields an empty list, i.e. no padding.
    ids.extend([vocab['<pad>']] * (length - len(ids)))
    return ids


def test_Eng_tokenize(word):
    """Encode an English test word, padded to test_max_eng_len."""
    return _encode_padded(word, Eng_dict, test_max_eng_len, False)


def test_Hin_tokenize(word):
    """Encode a Hindi test word plus '<eow>', padded to test_max_hin_len."""
    return _encode_padded(word, Hin_dict, test_max_hin_len, True)


def val_Eng_tokenize(word):
    """Encode an English validation word, padded to val_max_eng_len."""
    return _encode_padded(word, Eng_dict, val_max_eng_len, False)


def val_Hin_tokenize(word):
    """Encode a Hindi validation word plus '<eow>', padded to val_max_hin_len."""
    return _encode_padded(word, Hin_dict, val_max_hin_len, True)


# Tokenise the held-out splits column-wise (no per-row ``iloc`` lookup) and
# stack the padded id lists into tensors.
val_eng_word = torch.tensor([val_Eng_tokenize(word) for word in valdata['English']])
val_hin_word = torch.tensor([val_Hin_tokenize(word) for word in valdata['Hindi']])
test_eng_word = torch.tensor([test_Eng_tokenize(word) for word in testdata['English']])
test_hin_word = torch.tensor([test_Hin_tokenize(word) for word in testdata['Hindi']])

### Encoder and Attention Decoder

In [None]:
class Encoder(nn.Module):
    """Bidirectional recurrent encoder over English character ids.

    Args:
        char_embed_size: width of the character embedding.
        hidden_size: hidden width of each direction of the recurrent cell.
        no_of_layers: number of stacked recurrent layers.
        dropout: dropout probability applied to the embeddings.
        rnn: cell type, one of 'RNN', 'LSTM' or 'GRU'.

    Raises:
        ValueError: if *rnn* is not one of the supported cell types.

    Only the selected cell is instantiated (under the attribute named after
    its type, e.g. ``self.LSTM``), so the two unused cell types no longer
    take up parameter memory.
    """

    _CELL_TYPES = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}

    def __init__(self,char_embed_size,hidden_size,no_of_layers,dropout,rnn):
        super().__init__()
        if rnn not in self._CELL_TYPES:
            # Fail at construction instead of with an unbound local in forward().
            raise ValueError(
                "rnn must be one of {}, got {!r}".format(sorted(self._CELL_TYPES), rnn)
            )
        self.layer = no_of_layers
        self.rnn = rnn
        self.embedding = nn.Embedding(len(Eng_dict),char_embed_size).to(device)
        self.drop = nn.Dropout(dropout)
        recurrent = self._CELL_TYPES[rnn](
            char_embed_size,
            hidden_size,
            self.layer,
            batch_first = True,
            bidirectional = True,
        ).to(device)
        # Registered under 'RNN' / 'LSTM' / 'GRU' to keep the attribute name
        # the original class used for the selected cell.
        setattr(self, rnn, recurrent)

    def forward(self,input,hidden,cell):
        """Encode a batch of id sequences.

        Args:
            input: integer id tensor fed to the embedding (batch first).
            hidden: initial hidden state for the recurrent cell.
            cell: initial cell state; used by the LSTM and passed through
                unchanged for the other cell types.

        Returns:
            ``(output, (hidden, cell))`` as produced by the recurrent cell.
        """
        embedded = self.drop(self.embedding(input))
        recurrent = getattr(self, self.rnn)
        if self.rnn == 'LSTM':
            output,(hidden1,cell1) = recurrent(embedded,(hidden,cell))
        else:
            output,hidden1 = recurrent(embedded,hidden)
            cell1 = cell
        return output,(hidden1,cell1)

In [None]:
class Attention(nn.Module):
    """Single-step recurrent decoder with additive attention.

    Args:
        char_embed_size: width of the Hindi character embedding.
        hidden_size: hidden width of the recurrent cell and of the encoder
            outputs it attends over.
        no_of_layers: number of stacked recurrent layers.
        dropout: dropout probability applied to the recurrent input.
        batchsize: stored on the instance; not used by ``forward``.
        rnn: cell type, one of 'RNN', 'LSTM' or 'GRU'.

    Raises:
        ValueError: if *rnn* is not one of the supported cell types.

    Only the selected cell is instantiated (under the attribute named after
    its type, e.g. ``self.LSTM``).
    """

    _CELL_TYPES = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}

    def __init__(self,char_embed_size,hidden_size,no_of_layers,dropout,batchsize,rnn):
        super().__init__()
        if rnn not in self._CELL_TYPES:
            # Fail at construction instead of with an unbound local in forward().
            raise ValueError(
                "rnn must be one of {}, got {!r}".format(sorted(self._CELL_TYPES), rnn)
            )
        self.layer = no_of_layers
        self.batchsize = batchsize
        self.hidden_size = hidden_size
        self.rnn = rnn
        self.embedding = nn.Embedding(len(Hin_dict),char_embed_size).to(device)
        self.drop = nn.Dropout(dropout)
        # Additive attention: score = V(tanh(U(encoder_outputs) + W(hidden))).
        self.U = nn.Linear(hidden_size,hidden_size,bias = False).to(device)
        self.W = nn.Linear(hidden_size,hidden_size,bias = False).to(device)
        self.V = nn.Linear(hidden_size,1,bias = False).to(device)
        recurrent = self._CELL_TYPES[rnn](
            char_embed_size + hidden_size,
            hidden_size,
            self.layer,
            batch_first = True,
        ).to(device)
        setattr(self, rnn, recurrent)
        self.linear = nn.Linear(hidden_size,len(Hin_dict),bias=True).to(device)
        # Kept as an attribute because callers apply decoder.softmax to the logits.
        self.softmax = nn.Softmax(dim = 2).to(device)

    def forward(self,input,hidden,cell,encoder_outputs,matrix):
        """Run one decoding step.

        Args:
            input: ids of the previous target token; presumably shaped
                (batch, 1) - confirm against the callers.
            hidden: decoder hidden state; its last layer is the attention query.
            cell: decoder cell state (used by the LSTM, passed through otherwise).
            encoder_outputs: encoder states to attend over; presumably
                (batch, source_len, hidden_size).
            matrix: when truthy, the attention weights are returned as well.

        Returns:
            ``(logits, (hidden, cell))``, or
            ``(attention_weights, logits, (hidden, cell))`` when *matrix* is truthy.
        """
        embedded = self.embedding(input)
        # unsqueeze replaces the deprecated non-in-place Tensor.resize and lets
        # the query broadcast across the source positions.
        query = self.W(hidden[-1]).unsqueeze(1)
        ejt = self.V(torch.tanh(self.U(encoder_outputs) + query))
        # Normalise the scores over dim 1 (the source positions).
        ajt = F.softmax(ejt, dim = 1)
        # Weighted sum of the encoder outputs.
        ct = torch.bmm(ajt.transpose(1,2),encoder_outputs)
        final_input = self.drop(torch.cat((embedded,ct),dim = 2))
        recurrent = getattr(self, self.rnn)
        if self.rnn == 'LSTM':
            output,(hidden1,cell1) = recurrent(final_input,(hidden,cell))
        else:
            output,hidden1 = recurrent(final_input,hidden)
            cell1 = cell
        output1 = self.linear(output)
        if matrix:
            return ajt,output1,(hidden1,cell1)
        return output1,(hidden1,cell1)

In [None]:
def getword(characters):
    """Concatenate an iterable of strings into a single word."""
    separator = ""
    return separator.join(characters)

In [None]:
def accuracy(target,predictions,flag):
    """Count the rows of *predictions* that exactly equal the same row of *target*.

    Rows are compared with torch.equal, so a row counts only when it has the
    same size and elements as the target row. *flag* is accepted but not used.

    Returns:
        The number of matching rows.
    """
    return sum(
        1
        for row in range(len(target))
        if torch.equal(target[row], predictions[row])
    )

In [None]:
def translate(target,predictions,df):
    """Append decoded target/prediction words to *df* and return the result.

    Each id sequence is decoded through reverse_Hin up to (not including) the
    first '<eow>' id. One row with the columns 'Original' and 'Predicted' is
    produced per prediction, labelled consecutively from ``len(df)``.

    The rows are built in one go and concatenated once; enlarging the frame
    with ``df.loc`` one cell at a time reallocates it for every row. The
    combined frame is returned - use the return value rather than relying on
    *df* being modified.
    """
    eow = 1  # id assigned to '<eow>' in Hin_dict

    def decode(sequence):
        chars = []
        for token in sequence:
            index = token.item()
            if index == eow:
                break
            chars.append(reverse_Hin[index])
        return getword(chars)

    rows = [
        {'Original': decode(target[n]), 'Predicted': decode(predictions[n])}
        for n in range(len(predictions))
    ]
    if not rows:
        return df
    start = len(df)
    new_rows = pd.DataFrame(
        rows,
        columns=['Original', 'Predicted'],
        index=range(start, start + len(rows)),
    )
    if df.shape == (0, 0):
        # Nothing to concatenate with; also avoids concat's empty-frame warning.
        return new_rows
    return pd.concat([df, new_rows])

In [None]:
def Evaluate(attention,test_eng_word,test_hin_word,encoder,decoder,batchsize,hidden_size,char_embed_size,no_of_layers):
    """Greedy-decode a held-out split and score it.

    Args:
        attention: when truthy the decoder is given the averaged encoder
            outputs; otherwise it is given the averaged final hidden state.
        test_eng_word: source id tensor, sliced along dim 0 into batches.
        test_hin_word: target id tensor; its second dimension sets the number
            of decoding steps.
        encoder, decoder: the trained modules.
        batchsize: batch size; a trailing partial batch is skipped.
        hidden_size, no_of_layers: must match the encoder's construction.
        char_embed_size: unused; kept for signature compatibility.

    Returns:
        ``(loss, accuracy, df)``: summed cross-entropy divided by the number
        of scored target positions, exact-match accuracy in percent over the
        scored words, and the DataFrame built by ``translate``.

    Raises:
        ValueError: if the data does not contain one full batch.

    Both modules are switched to eval mode while scoring, so dropout is
    disabled, and their previous mode is restored afterwards.
    """
    num_samples = len(test_eng_word)
    target_len = test_hin_word.size(1)
    criterion = nn.CrossEntropyLoss(reduction = 'sum')
    total_loss = 0
    total_acc = 0
    evaluated = 0
    df = pd.DataFrame()
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            en_hidden = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
            en_cell = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
            for x in range(0,num_samples,batchsize):
                input_tensor = test_eng_word[x:x+batchsize].to(device)
                if input_tensor.size(0) < batchsize:
                    break
                target = test_hin_word[x:x+batchsize].to(device)
                output,(hidden,cell) = encoder(input_tensor,en_hidden,en_cell)
                # Average the two direction halves of the encoder outputs.
                first_half,second_half = torch.split(output,[hidden_size,hidden_size],dim = 2)
                output = (first_half + second_half)/2
                # NOTE(review): PyTorch documents h_n as ordered layer by
                # layer with both directions adjacent; the (2, layers, ...)
                # grouping is kept because it must match what training uses -
                # confirm whether (layers, 2, ...) was intended.
                hidden = hidden.reshape(2,no_of_layers,batchsize,hidden_size)
                hidden1 = (hidden[0] + hidden[1])/2
                cell = cell.reshape(2,no_of_layers,batchsize,hidden_size)
                cell1 = (cell[0] + cell[1])/2
                temp = output if attention else hidden1
                # Decoding starts from id 0, the '<sow>' token.
                input2 = torch.zeros(batchsize,1,dtype = torch.long,device = device)
                predicted = []
                predictions = []
                for i in range(target_len):
                    output1,(hidden1,cell1) = decoder(input2,hidden1,cell1,temp,False)
                    predicted.append(output1)
                    # Softmax is monotonic, so argmax over the logits picks
                    # the same token.
                    input2 = torch.argmax(output1,dim = 2)
                    predictions.append(input2)
                predicted = torch.cat(predicted,dim = 1)
                predicted = predicted.reshape(-1,predicted.size(-1))
                predictions = torch.cat(predictions,dim = 1)
                total_acc += accuracy(target,predictions,x)
                df = translate(test_hin_word[x:x+batchsize],predictions,df)
                total_loss += criterion(predicted,target.reshape(-1)).item()
                evaluated += batchsize
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)
    if evaluated == 0:
        raise ValueError("Evaluate needs at least one full batch of data")
    # Normalise by what was actually scored, not by the full split size.
    test_loss = total_loss/(evaluated*target_len)
    test_accuracy = (total_acc/evaluated)*100
    return test_loss,test_accuracy,df

In [None]:
def valevaluate(attention,val_eng_word,val_hin_word,encoder,decoder,batchsize,hidden_size,char_embed_size,no_of_layers):
    """Greedy-decode the validation data and score it.

    Args:
        attention: when truthy the decoder is given the averaged encoder
            outputs; otherwise it is given the averaged final hidden state.
        val_eng_word: source id tensor, sliced along dim 0 into batches.
        val_hin_word: target id tensor; its second dimension sets the number
            of decoding steps.
        encoder, decoder: the modules being validated.
        batchsize: batch size; a trailing partial batch is skipped.
        hidden_size, no_of_layers: must match the encoder's construction.
        char_embed_size: unused; kept for signature compatibility.

    Returns:
        ``(loss, accuracy)``: summed cross-entropy divided by the number of
        scored target positions, and exact-match accuracy in percent over the
        scored words.

    Raises:
        ValueError: if the data does not contain one full batch.

    Both modules are switched to eval mode while scoring, so dropout is
    disabled, and their previous mode is restored afterwards because this
    function is called between training epochs.
    """
    num_samples = len(val_eng_word)
    target_len = val_hin_word.size(1)
    criterion = nn.CrossEntropyLoss(reduction = 'sum')
    total_loss = 0
    total_acc = 0
    evaluated = 0
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            for x in range(0,num_samples,batchsize):
                input_tensor = val_eng_word[x:x+batchsize].to(device)
                if input_tensor.size(0) < batchsize:
                    break
                target = val_hin_word[x:x+batchsize].to(device)
                en_hidden = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
                en_cell = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
                output,(hidden,cell) = encoder(input_tensor,en_hidden,en_cell)
                # Average the two direction halves of the encoder outputs.
                first_half,second_half = torch.split(output,[hidden_size,hidden_size],dim = 2)
                output = (first_half + second_half)/2
                # NOTE(review): PyTorch documents h_n as ordered layer by
                # layer with both directions adjacent; the (2, layers, ...)
                # grouping is kept because it must match what training uses -
                # confirm whether (layers, 2, ...) was intended.
                hidden = hidden.reshape(2,no_of_layers,batchsize,hidden_size)
                hidden1 = (hidden[0] + hidden[1])/2
                cell = cell.reshape(2,no_of_layers,batchsize,hidden_size)
                cell1 = (cell[0] + cell[1])/2
                temp = output if attention else hidden1
                # Decoding starts from id 0, the '<sow>' token.
                input2 = torch.zeros(batchsize,1,dtype = torch.long,device = device)
                predicted = []
                predictions = []
                for i in range(target_len):
                    output1,(hidden1,cell1) = decoder(input2,hidden1,cell1,temp,False)
                    predicted.append(output1)
                    # Softmax is monotonic, so argmax over the logits picks
                    # the same token.
                    input2 = torch.argmax(output1,dim = 2)
                    predictions.append(input2)
                predicted = torch.cat(predicted,dim = 1)
                predicted = predicted.reshape(-1,predicted.size(-1))
                predictions = torch.cat(predictions,dim = 1)
                total_acc += accuracy(target,predictions,x)
                total_loss += criterion(predicted,target.reshape(-1)).item()
                evaluated += batchsize
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)
    if evaluated == 0:
        raise ValueError("valevaluate needs at least one full batch of data")
    # Normalise by what was actually scored, not by the full split size.
    validation_loss = total_loss/(evaluated*target_len)
    validation_accuracy = (total_acc/evaluated)*100
    return validation_loss,validation_accuracy

In [None]:
def attentiontrain(batchsize,hidden_size,char_embed_size,no_of_layers,dropout,epochs,rnn):
    """Train an Encoder / Attention decoder pair on the training tensors.

    Args:
        batchsize: batch size; a trailing partial batch is skipped.
        hidden_size: hidden width of encoder and decoder.
        char_embed_size: character embedding width.
        no_of_layers: number of recurrent layers in both modules.
        dropout: dropout probability for both modules.
        epochs: number of passes over the training data.
        rnn: cell type passed to Encoder and Attention.

    Returns:
        ``(encoder, decoder)`` after training.

    Changes relative to the earlier version of this function:
      * without teacher forcing the decoder is now fed its own previous
        prediction (both branches used to feed the ground truth);
      * loss and accuracy are normalised by the samples actually processed
        instead of the hard-coded 51200 / 512;
      * ``retain_graph=True`` and autograd anomaly detection are dropped -
        the graph is rebuilt for every batch and anomaly detection only
        slows training down;
      * per-epoch metrics are printed, and logged to wandb when a run is active.
    """
    gc.collect()
    encoder = Encoder(char_embed_size,hidden_size,no_of_layers,dropout,rnn).to(device)
    decoder = Attention(char_embed_size,hidden_size,no_of_layers,dropout,batchsize,rnn).to(device)
    print(encoder)
    print(decoder)
    opt_encoder = optim.Adam(encoder.parameters(),lr = 0.001)
    opt_decoder = optim.Adam(decoder.parameters(),lr = 0.001)
    criterion = nn.CrossEntropyLoss(reduction = 'sum')
    teacher_ratio = 0.5
    num_samples = len(eng_word)
    target_len = hin_word.size(1)
    for epoch in range(epochs):
        torch.cuda.empty_cache()
        print(epoch)
        encoder.train()
        decoder.train()
        total_loss = 0
        total_acc = 0
        seen = 0
        for x in range(0,num_samples,batchsize):
            input_tensor = eng_word[x:x+batchsize].to(device)
            if input_tensor.size(0) < batchsize:
                break
            target = hin_word[x:x+batchsize].to(device)
            opt_encoder.zero_grad()
            opt_decoder.zero_grad()
            en_hidden = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
            en_cell = torch.zeros(2*no_of_layers,batchsize,hidden_size).to(device)
            output,(hidden,cell) = encoder(input_tensor,en_hidden,en_cell)
            # Average the two direction halves of the encoder outputs.
            first_half,second_half = torch.split(output,[hidden_size,hidden_size],dim = 2)
            output = (first_half + second_half)/2
            # NOTE(review): PyTorch documents h_n as ordered layer by layer
            # with both directions adjacent; the (2, layers, ...) grouping is
            # kept because the evaluation functions use the same one - confirm
            # whether (layers, 2, ...) was intended and change them together.
            hidden = hidden.reshape(2,no_of_layers,batchsize,hidden_size)
            hidden1 = (hidden[0] + hidden[1])/2
            cell = cell.reshape(2,no_of_layers,batchsize,hidden_size)
            cell1 = (cell[0] + cell[1])/2
            # Decoding starts from id 0, the '<sow>' token.
            input2 = torch.zeros(batchsize,1,dtype = torch.long,device = device)
            predicted = []
            predictions = []
            for i in range(target_len):
                # Teacher forcing is decided per time step for the whole batch.
                use_teacher_forcing = random.random() < teacher_ratio
                output1,(hidden1,cell1) = decoder(input2,hidden1,cell1,output,False)
                predicted.append(output1)
                output3 = torch.argmax(output1,dim = 2)
                predictions.append(output3)
                if use_teacher_forcing:
                    input2 = target[:,i].reshape(batchsize,1)
                else:
                    input2 = output3

            predicted = torch.cat(predicted,dim = 1)
            predicted = predicted.reshape(-1,predicted.size(-1))
            predictions = torch.cat(predictions,dim = 1)
            total_acc += accuracy(target,predictions,x)
            loss = criterion(predicted,target.reshape(-1))
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(encoder.parameters(),max_norm = 1)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(),max_norm = 1)
            opt_encoder.step()
            opt_decoder.step()
            seen += batchsize
        training_loss = total_loss/(seen*target_len) if seen else 0.0
        training_accuracy = 100*total_acc/seen if seen else 0.0
        validation_loss,validation_accuracy = valevaluate(True,val_eng_word,val_hin_word,encoder,decoder,batchsize,hidden_size,char_embed_size,no_of_layers)
        metrics = {'training_accuracy' : training_accuracy, 'validation_accuracy' : validation_accuracy,'training_loss' : training_loss, 'validation_loss' : validation_loss,'epoch':epoch+1}
        print(metrics)
        # Report to Weights & Biases only when it has been imported and a run
        # is active, so the function also works outside a sweep.
        tracker = globals().get('wandb')
        if tracker is not None and tracker.run is not None:
            tracker.log(metrics)
    return encoder,decoder

In [None]:
# Import the Weights & Biases client and call its login() for this session.
import wandb
wandb.login()

In [None]:
!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Release cached GPU memory and print utilisation before and after.

    Unreferenced Python objects are collected first so their tensors can be
    freed, then PyTorch's caching allocator is emptied.

    The previous numba ``cuda.close()`` call is intentionally not made: it
    tears down the CUDA context of the process, which PyTorch also uses, so
    later GPU work in the same process (for example the next sweep run) is
    left with an invalid context.
    """
    import gc

    print("Initial GPU Usage")
    gpu_usage()

    gc.collect()
    torch.cuda.empty_cache()

    print("GPU Usage after emptying the cache")
    gpu_usage()


[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python3.10 -m pip install --upgrade pip


In [None]:
def main():
    """Run one sweep trial: train with the wandb-provided config, then visualise.

    Starts a wandb run, names it after the sampled hyper-parameters, trains
    the attention model, calls ``attentionmatrix`` on the test tensors with a
    batch size of 1, and finally frees cached GPU memory.
    """
    wandb.init(project='CS6910_DLAssignment3')
    cfg = wandb.config
    name_template = "_attention_cell_type_{}_bidirec_{}_layers_{}_batchsize_{}_hidden_{}"
    wandb.run.name = name_template.format(
        cfg.cell_type,
        cfg.bidirectional,
        cfg.no_of_layers,
        cfg.batchsize,
        cfg.hidden_size,
    )
    # Read every hyper-parameter up front, before training starts.
    hidden_size, char_embed_size, no_of_layers = (
        cfg.hidden_size,
        cfg.input_embedding_size,
        cfg.no_of_layers,
    )
    epochs, batchsize, dropout, rnn = (
        cfg.epochs,
        cfg.batchsize,
        cfg.dropout,
        cfg.cell_type,
    )
    Encoder1,Decoder1 = attentiontrain(
        batchsize, hidden_size, char_embed_size, no_of_layers, dropout, epochs, rnn
    )
    attentionmatrix(
        True, test_eng_word, test_hin_word, Encoder1, Decoder1,
        1, hidden_size, char_embed_size, no_of_layers, True,
    )
    free_gpu_cache()

In [None]:
# Candidate values for every swept hyper-parameter.
search_space = {
    'batchsize' : [128,256,512],
    'input_embedding_size' : [128,256,512,1024],
    'no_of_layers' : [1,2,3,4],
    'hidden_size' : [128,256,512,1024],
    'cell_type' : ['LSTM'],
    'bidirectional' : ['Yes'],
    'dropout' : [0.3,0.4,0.5],
    'epochs' : [10,20,30],
}
# Bayesian search that maximises the logged 'validation_accuracy'.
sweep_configuration = {
    'method' : 'bayes',
    'metric' : {'goal' : 'maximize', 'name' : 'validation_accuracy'},
    'parameters' : {
        name: {'values' : options} for name, options in search_space.items()
    },
}
# Register the sweep and run 30 trials of main() in this process.
sweep_id = wandb.sweep(sweep = sweep_configuration,project = 'CS6910_DLAssignment3')
wandb.agent(sweep_id,function=main,count = 30)

In [None]:
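# Best parameters for attention.
# TODO: assign batchsize, hidden_size, char_embed_size, no_of_layers, dropout,
# epochs and rnn here; the cells below use these names.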

In [None]:
# Train the final encoder/decoder pair.
# NOTE(review): the hyper-parameter names passed here are not assigned at
# module level in the cells shown - they must be defined before this runs.
Encoder1,Decoder1 = attentiontrain(batchsize,hidden_size,char_embed_size,no_of_layers,dropout,epochs,rnn)

In [None]:
# Score the trained attention model on the test tensors; `predictions` is the
# DataFrame of original/predicted words returned by Evaluate.
test_loss,test_accuracy,predictions = Evaluate(True,test_eng_word,test_hin_word,Encoder1,Decoder1,batchsize,hidden_size,char_embed_size,no_of_layers)

In [None]:
#combine predictions and english column of testdata and show just head(10)

In [None]:
# Write the predictions to an Excel workbook. The target directory is
# created first because to_excel does not create missing parent directories.
from pathlib import Path

output_path = Path("predictions_vanilla/output.xlsx")
output_path.parent.mkdir(parents=True, exist_ok=True)
predictions.to_excel(output_path)