In [None]:
# import cell
import numpy as np
import pandas as pd
import random
import wandb

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

import time
import math

# Log in to Weights & Biases with your own API key.
wandb.login()
# Fuller explanations of the helper functions live in the accompanying script file.
# Comments in this notebook concentrate on the functions that are new
# relative to that script.

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Reserved vocabulary symbols: start-of-sequence, end-of-sequence, padding, unknown.
SOS_token, EOS_token, PAD_token, UNK_token = "@", "#", "^", "$"

# Fixed vocabulary indices of the reserved symbols, listed in the same order.
SOS_idx, EOS_idx, PAD_idx, UNK_idx = 0, 1, 2, 3

# Mini-batch size shared by the data loaders and the training loop.
batch_size = 32

In [None]:
def timeInMinutes(s):
    """Format a duration given in seconds as ``"<minutes>m <seconds>s"``.

    The duration is rounded to the nearest whole second *before* it is split
    into minutes and seconds, so the seconds field can never read ``60``
    (59.6 s is rendered as ``"1m 0s"``, not ``"0m 60s"``).

    Args:
        s: Elapsed time in seconds (int or float).

    Returns:
        str: The formatted duration, e.g. ``"2m 5s"``.
    """
    total_seconds = int(round(s))
    minutes, seconds = divmod(total_seconds, 60)
    return str(minutes) + "m " + str(seconds) + "s"

In [None]:
class Script:
    """Character-level vocabulary for one writing script.

    Keeps a two-way mapping between characters and integer indices plus a
    per-character occurrence count. The four reserved symbols (SOS, EOS, PAD,
    UNK) are registered up front at their fixed indices.
    """

    def __init__(self, name):
        """Create a vocabulary that initially holds only the reserved symbols.

        Args:
            name: Label identifying the script this vocabulary belongs to.
        """
        self.name = name
        self.char2index = {SOS_token: SOS_idx, EOS_token: EOS_idx, PAD_token: PAD_idx, UNK_token: UNK_idx}
        self.char2count = {}
        self.index2char = {SOS_idx: SOS_token, EOS_idx: EOS_token, PAD_idx: PAD_token, UNK_idx: UNK_token}
        self.n_chars = 4  # Count SOS, EOS, PAD and UNK

    def addWord(self, word):
        """Register every character of ``word`` in the vocabulary."""
        for char in word:
            self.addChar(char)

    def addChar(self, char):
        """Register one character, assigning it the next free index if it is new.

        The occurrence count is updated for every character, including the
        reserved symbols: those are present in ``char2index`` from the start
        but have no ``char2count`` entry yet, so the count must not assume
        that a known character has already been counted.
        """
        if char not in self.char2index:
            self.char2index[char] = self.n_chars
            self.index2char[self.n_chars] = char
            self.n_chars += 1
        self.char2count[char] = self.char2count.get(char, 0) + 1

In [None]:
def prepareVocab(data, in_scr="lat", out_scr="dev"):
    """Build the source and target character vocabularies from word pairs.

    Args:
        data: Iterable of pairs; element 0 of each pair is the source word and
            element 1 is the target word.
        in_scr: Name given to the source-side ``Script``.
        out_scr: Name given to the target-side ``Script``.

    Returns:
        tuple: ``(source_vocab, target_vocab)`` as ``Script`` instances.
    """
    src_vocab, tgt_vocab = Script(in_scr), Script(out_scr)

    for word_pair in data:
        src_vocab.addWord(word_pair[0])
        tgt_vocab.addWord(word_pair[1])

    return src_vocab, tgt_vocab

In [None]:
def tensorFromWord(word, vocab, sos=False, eos=False):
    """Encode a word as a 1-D ``torch.long`` tensor of vocabulary indices.

    Characters missing from ``vocab`` are mapped to the UNK index.

    Args:
        word: Iterable of characters to encode.
        vocab: Vocabulary object exposing a ``char2index`` mapping.
        sos: When true, prepend the start-of-sequence index.
        eos: When true, append the end-of-sequence index.

    Returns:
        torch.Tensor: The index sequence, dtype ``torch.long``.
    """
    lookup = vocab.char2index
    indices = [lookup[SOS_token]] if sos else []
    indices.extend(lookup[ch] if ch in lookup else lookup[UNK_token] for ch in word)
    if eos:
        indices.append(lookup[EOS_token])
    return torch.tensor(indices, dtype=torch.long)

In [None]:
def processData(data, vocab, sos=False, eos=False):
    """Encode a collection of words and pad them into one batch-first tensor.

    Args:
        data: Iterable of words.
        vocab: Vocabulary used to encode each word.
        sos: Forwarded to ``tensorFromWord`` (prepend start-of-sequence).
        eos: Forwarded to ``tensorFromWord`` (append end-of-sequence).

    Returns:
        torch.Tensor: Padded index tensor with one row per word; shorter words
        are filled with ``PAD_idx``.
    """
    encoded_words = [tensorFromWord(word, vocab, sos, eos) for word in data]
    return pad_sequence(encoded_words, batch_first=True, padding_value=PAD_idx)

In [None]:
def wordFromTensor(word_tensor, vocab):
    """Decode a 1-D tensor of vocabulary indices back into a string.

    Decoding stops at the first EOS index. Indices below ``UNK_idx`` are
    skipped; every other index is looked up in ``vocab.index2char``.

    Args:
        word_tensor: 1-D tensor of vocabulary indices.
        vocab: Vocabulary object exposing an ``index2char`` mapping.

    Returns:
        str: The decoded word.
    """
    chars = []
    for token in word_tensor:
        code = token.item()
        if code == EOS_idx:
            break
        if code < UNK_idx:
            continue
        chars.append(vocab.index2char[code])
    return "".join(chars)

In [None]:
class Encoder(nn.Module):
    """Recurrent encoder: embeds an index sequence and summarises it as a state.

    Only the state of the top recurrent layer is returned. For a bidirectional
    encoder the forward and backward states of that layer are averaged, so the
    returned state always has shape ``(1, batch, hidden_size)``.
    """

    def __init__(self, cell_type, input_size, embedding_size, hidden_size, num_layers, dp, bidir=False):
        """Build the embedding layer and the recurrent network.

        Args:
            cell_type: One of ``"RNN"``, ``"GRU"`` or ``"LSTM"``.
            input_size: Size of the source vocabulary.
            embedding_size: Dimension of the character embeddings.
            hidden_size: Dimension of the recurrent hidden state.
            num_layers: Number of stacked recurrent layers.
            dp: Dropout probability (applied to the embeddings and, for
                multi-layer networks, between recurrent layers).
            bidir: Whether the recurrent network is bidirectional.

        Raises:
            ValueError: If ``cell_type`` is not a supported cell name.
        """
        super(Encoder, self).__init__()
        recurrent_cells = {"RNN": nn.RNN, "GRU": nn.GRU, "LSTM": nn.LSTM}
        if cell_type not in recurrent_cells:
            # Fail at construction time instead of with an AttributeError on
            # the first forward pass.
            raise ValueError(
                "Unsupported cell_type %r; expected one of %s" % (cell_type, sorted(recurrent_cells))
            )
        self.cell_type = cell_type
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dp)
        self.bidir = bidir

        self.embedding = nn.Embedding(self.input_size, self.embedding_size)
        # Inter-layer dropout is meaningless for a single-layer network, so it
        # is switched off in that case.
        if self.num_layers == 1:
            dp = 0.0
        self.cell = recurrent_cells[self.cell_type](
            self.embedding_size,
            self.hidden_size,
            self.num_layers,
            dropout=dp,
            bidirectional=self.bidir,
        )

    def _last_layer_state(self, state):
        """Reduce a stacked recurrent state to the top layer, shape (1, batch, hidden)."""
        if self.bidir:
            b_sz = state.size(1)
            # Separate layers from directions, keep the top layer and average
            # its two directions.
            state = state.view(self.num_layers, 2, b_sz, -1)[-1].mean(dim=0)
        else:
            state = state[-1, :, :]
        return state.unsqueeze(0)

    def forward(self, x):
        """Encode a batch of index sequences.

        Args:
            x: Integer index tensor. The recurrent cells are built without
                ``batch_first``, so this is presumably laid out as
                ``(seq_len, batch)`` -- confirm against the caller.

        Returns:
            tuple: ``(hidden, cell)``; ``hidden`` has shape
            ``(1, batch, hidden_size)``. ``cell`` has the same shape for an
            LSTM and is ``None`` for the other cell types.
        """
        embedding = self.dropout(self.embedding(x))

        cell = None
        if self.cell_type == "LSTM":
            _, (hidden, cell) = self.cell(embedding)
            cell = self._last_layer_state(cell)
        else:
            _, hidden = self.cell(embedding)

        hidden = self._last_layer_state(hidden)
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Recurrent decoder that predicts one output token per call."""

    def __init__(
        self, cell_type, input_size, embedding_size, hidden_size, output_size, num_layers, dp
    ):
        """Build the embedding layer, the recurrent network and the output layer.

        Args:
            cell_type: One of ``"RNN"``, ``"GRU"`` or ``"LSTM"``.
            input_size: Size of the target vocabulary (embedding table rows).
            embedding_size: Dimension of the character embeddings.
            hidden_size: Dimension of the recurrent hidden state.
            output_size: Number of scores produced per step.
            num_layers: Number of stacked recurrent layers.
            dp: Dropout probability (applied to the embeddings and, for
                multi-layer networks, between recurrent layers).

        Raises:
            ValueError: If ``cell_type`` is not a supported cell name.
        """
        super(Decoder, self).__init__()
        recurrent_cells = {"RNN": nn.RNN, "GRU": nn.GRU, "LSTM": nn.LSTM}
        if cell_type not in recurrent_cells:
            # Fail at construction time instead of with an AttributeError on
            # the first forward pass.
            raise ValueError(
                "Unsupported cell_type %r; expected one of %s" % (cell_type, sorted(recurrent_cells))
            )
        self.cell_type = cell_type
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dp)

        self.embedding = nn.Embedding(self.input_size, self.embedding_size)
        # Inter-layer dropout is meaningless for a single-layer network, so it
        # is switched off in that case.
        if self.num_layers == 1:
            dp = 0.0
        self.cell = recurrent_cells[self.cell_type](
            self.embedding_size, self.hidden_size, self.num_layers, dropout=dp
        )
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x, hidden, cell):
        """Run a single decoding step.

        Args:
            x: 1-D tensor holding the current input token index of every
                sequence in the batch.
            hidden: Recurrent hidden state passed to the cell.
            cell: LSTM cell state; ignored (and returned unchanged) for the
                other cell types.

        Returns:
            tuple: ``(predictions, hidden, cell)`` where ``predictions`` holds
            one row of ``output_size`` scores per batch element.
        """
        # Add a length-1 time dimension in front: (batch,) -> (1, batch).
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))

        if self.cell_type == "LSTM":
            outputs, (hidden, cell) = self.cell(embedding, (hidden, cell))
        else:
            outputs, hidden = self.cell(embedding, hidden)

        # Project to vocabulary scores and drop the time dimension again.
        predictions = self.fc(outputs).squeeze(0)

        return predictions, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step."""

    def __init__(self, encoder, decoder):
        """Store the encoder and decoder sub-modules.

        Args:
            encoder: Module returning ``(hidden, cell)`` for a source batch.
            decoder: Module exposing ``num_layers``, ``cell_type`` and
                ``output_size`` and returning ``(scores, hidden, cell)`` for
                one decoding step.
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Decode ``target.shape[0] - 1`` steps conditioned on ``source``.

        Args:
            source: Source index tensor; dimension 1 is the batch.
            target: Target index tensor; dimension 0 is the sequence length and
                row 0 is used as the first decoder input.
            teacher_force_ratio: Probability, per step, of feeding the ground
                truth token instead of the model's own prediction.

        Returns:
            torch.Tensor: Scores of shape
            ``(target_len, batch, decoder.output_size)``. Row 0 is never
            written and stays all zeros.
        """
        batch_sz = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = self.decoder.output_size

        # Allocate directly on the device the inputs live on rather than
        # relying on a notebook-level global.
        outputs = torch.zeros(target_len, batch_sz, target_vocab_size, device=source.device)

        hidden, cell = self.encoder(source)
        # The encoder yields a single-layer state; replicate it so every
        # decoder layer starts from the same summary.
        hidden = hidden.repeat(self.decoder.num_layers, 1, 1)
        if self.decoder.cell_type == "LSTM":
            cell = cell.repeat(self.decoder.num_layers, 1, 1)

        # First decoder input is the first target row.
        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output

            best_guess = output.argmax(dim=1)

            # Teacher forcing: sometimes feed the ground truth, otherwise the
            # model's own best guess.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [None]:
def sum_accuracy(preds, target):
    """Count the sequences that were predicted entirely correctly.

    A position counts as correct when the prediction equals the target or the
    target holds ``PAD_idx``. A sequence (one column along dimension 1) counts
    only if every position along dimension 0 is correct.

    Args:
        preds: Predicted index tensor.
        target: Ground-truth index tensor of the same shape.

    Returns:
        int: Number of fully correct columns.
    """
    position_ok = (preds == target) | (target == PAD_idx)
    sequence_ok = position_ok.all(dim=0)
    return sequence_ok.sum().item()

In [None]:
def evaluateModel(model, dataloader, criterion, b_sz=32):
    """Compute mean loss and whole-sequence accuracy over a data loader.

    The model is put in eval mode and run without teacher forcing and without
    gradient tracking.

    Args:
        model: Sequence-to-sequence model called as
            ``model(source, target, teacher_force_ratio=...)``.
        dataloader: Yields ``(input_seq, target_seq)`` batches, batch first.
        criterion: Loss applied to the flattened scores and targets.
        b_sz: Unused; kept so existing callers keep working. The number of
            evaluated sequences is counted from the batches themselves,
            because ``len(dataloader) * b_sz`` over-counts whenever the last
            batch is smaller than ``b_sz`` and so under-reports accuracy.

    Returns:
        tuple: ``(mean loss per batch, accuracy in percent)``.
    """
    model.eval()

    n_data = 0
    loss_epoch = 0
    n_correct = 0

    with torch.no_grad():
        for input_seq, target_seq in dataloader:
            # Batches arrive batch-first; the model works sequence-first.
            input_seq = input_seq.T.to(device)
            target_seq = target_seq.T.to(device)

            # Count the sequences actually present in this batch.
            n_data += target_seq.shape[1]

            output = model(input_seq, target_seq, teacher_force_ratio=0.0)

            pred_seq = output.argmax(dim=2)
            n_correct += sum_accuracy(pred_seq, target_seq)

            # Step 0 holds no prediction, so it is excluded from the loss.
            output = output[1:].reshape(-1, output.shape[2])
            target = target_seq[1:].reshape(-1)

            loss = criterion(output, target)
            loss_epoch += loss.item()

        acc = n_correct / n_data
        acc = acc * 100.0
        loss_epoch /= len(dataloader)
        return loss_epoch, acc

In [None]:
def trainModel(model, criterion, optimizer, train_dataloader, valid_dataloader, num_epochs, batch_size=32, patience=5):
    """Train with early stopping and log the metrics up to the best epoch.

    After every epoch the model is evaluated on both loaders. Training stops
    early once the validation accuracy has failed to match or beat its best
    value for ``patience`` consecutive epochs. When training ends, the metrics
    of every epoch up to and including the best validation epoch are sent to
    wandb; later epochs are not logged.

    Args:
        model: Sequence-to-sequence model to optimise.
        criterion: Loss applied to the flattened scores and targets.
        optimizer: Optimiser updating ``model``'s parameters.
        train_dataloader: Yields batch-first ``(input_seq, target_seq)`` pairs.
        valid_dataloader: Same layout, used for validation.
        num_epochs: Maximum number of epochs.
        batch_size: Forwarded to ``evaluateModel``.
        patience: Number of consecutive non-improving epochs tolerated before
            stopping; expected to be a positive integer.

    Returns:
        None
    """
    max_val_acc = -1.0
    max_val_epoch = 0
    trigger = 0  # consecutive epochs without a new best validation accuracy

    tr_loss_list = []
    tr_acc_list = []
    val_loss_list = []
    val_acc_list = []
    for epoch in range(num_epochs):
        model.train()

        for input_seq, target_seq in train_dataloader:
            # Batches arrive batch-first; the model works sequence-first.
            input_seq = input_seq.T.to(device)
            target_seq = target_seq.T.to(device)

            output = model(input_seq, target_seq)

            # Step 0 holds no prediction, so it is excluded from the loss.
            output = output[1:].reshape(-1, output.shape[2])
            target = target_seq[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)
            loss.backward()

            # Clip the gradient norm before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            optimizer.step()

        # Train loss and accuracy
        tr_loss, tr_acc = evaluateModel(model, train_dataloader, criterion, batch_size)
        tr_loss_list.append(tr_loss)
        tr_acc_list.append(tr_acc)

        # Valid loss and accuracy
        val_loss, val_acc = evaluateModel(model, valid_dataloader, criterion, batch_size)
        val_loss_list.append(val_loss)
        val_acc_list.append(val_acc)

        if val_acc >= max_val_acc:
            trigger = 0
            max_val_acc = val_acc
            max_val_epoch = epoch
        else:
            trigger += 1

        if trigger >= patience:
            print('Early stopping!')
            break

    # Log only up to the best epoch; the bound also covers num_epochs == 0,
    # where no metrics were recorded at all.
    n_logged = min(max_val_epoch + 1, len(val_acc_list))
    for i in range(n_logged):
        wandb.log({'tr_loss' : tr_loss_list[i], 'tr_acc' : tr_acc_list[i], 'val_loss' : val_loss_list[i], 'val_acc' : val_acc_list[i]})

In [None]:
# load dataset
# Read every field verbatim as a string. With pandas' default NA handling,
# words such as "nan", "null" or "NA" would be parsed as float NaN and then
# break the character-level vocabulary building.
train_data = pd.read_csv('/kaggle/input/eng-hin/hin_train.csv', sep=',', header=None, dtype=str, na_filter=False).values
valid_data = pd.read_csv('/kaggle/input/eng-hin/hin_valid.csv', sep=',', header=None, dtype=str, na_filter=False).values

In [None]:
# Build the source and target character vocabularies from the training pairs.
x_vocab, y_vocab = prepareVocab(train_data, in_scr="lat", out_scr="dev")

In [None]:
# Vocabulary sizes, reserved symbols included: source first, then target.
print(x_vocab.n_chars, y_vocab.n_chars, sep="\n")

In [None]:
# Source words (column 0) are encoded with a trailing EOS only.
x_train, x_valid = (
    processData(split[:, 0], x_vocab, eos=True).to(device=device)
    for split in (train_data, valid_data)
)

# Target words (column 1) are wrapped in SOS ... EOS.
y_train, y_valid = (
    processData(split[:, 1], y_vocab, sos=True, eos=True).to(device=device)
    for split in (train_data, valid_data)
)

In [None]:
# Number of examples in the training and validation splits.
n_train, n_valid = x_train.size(0), x_valid.size(0)

print(n_train, n_valid)

In [None]:
# Pair every encoded source word with its encoded target word.
train_dataset = TensorDataset(x_train, y_train)
valid_dataset = TensorDataset(x_valid, y_valid)

# Only the training batches are shuffled; validation keeps the file order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size)

In [None]:
# Sweep configuration: Bayesian search that maximises validation accuracy.
# Every hyperparameter is a categorical choice among the listed values.
sweep_config = {
    'method': 'bayes',
    'name': 'Bayesian_sweep',
    'metric': {'name': 'val_acc', 'goal': 'maximize'},
    'parameters': {
        hyperparameter: {'values': choices}
        for hyperparameter, choices in (
            ('cell_type', ['LSTM', 'GRU', 'RNN']),
            ('embedding_size', [64, 128, 256]),
            ('hidden_size', [64, 128, 256]),
            ('enc_num_layers', [1, 2, 3]),
            ('dec_num_layers', [1, 2, 3]),
            ('dropout', [0.0, 0.2, 0.3]),
            ('bidirectional', ['Yes', 'No']),
        )
    },
}

sweep_id = wandb.sweep(sweep=sweep_config, project='dl_assgn_3_q_2')

In [None]:
def main():
    """Train one model with the hyperparameters chosen for the current sweep run."""
    with wandb.init() as run:
        cfg = wandb.config

        # Readable run name that spells out every swept hyperparameter.
        name_parts = [
            'cell_', cfg.cell_type,
            '_enc-n-l_', str(cfg.enc_num_layers),
            '_dec-n-l_', str(cfg.dec_num_layers),
            '_emb-sz_', str(cfg.embedding_size),
            '_hid-sz_', str(cfg.hidden_size),
            '_dp_', str(cfg.dropout),
            '_bidir_', cfg.bidirectional,
        ]
        wandb.run.name = "".join(name_parts)

        # Settings that are fixed for every run of the sweep.
        num_epochs, learning_rate = 15, 0.001

        # Vocabulary sizes determine the embedding tables and the output layer.
        src_vocab_size = x_vocab.n_chars
        tgt_vocab_size = y_vocab.n_chars

        # The sweep stores the flag as "Yes"/"No"; turn it into a bool.
        use_bidirectional = cfg.bidirectional == "Yes"

        # Encoder and decoder share the cell type, embedding size and hidden size.
        encoder_net = Encoder(
            cfg.cell_type,
            src_vocab_size,
            cfg.embedding_size,
            cfg.hidden_size,
            cfg.enc_num_layers,
            cfg.dropout,
            use_bidirectional,
        ).to(device)
        decoder_net = Decoder(
            cfg.cell_type,
            tgt_vocab_size,
            cfg.embedding_size,
            cfg.hidden_size,
            tgt_vocab_size,
            cfg.dec_num_layers,
            cfg.dropout,
        ).to(device)

        model = Seq2Seq(encoder_net, decoder_net).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        trainModel(model, criterion, optimizer, train_dataloader, valid_dataloader, num_epochs, batch_size)


# Let the sweep agent call main() for up to 100 configurations, then close wandb.
wandb.agent(sweep_id, function=main, count=100)
wandb.finish()