Import Statements

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import csv
import os
import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
%%capture
!pip install wandb

In [None]:
import wandb

Load Data

In [None]:
# Mount Drive
# Colab-specific: makes Google Drive available under /content/drive so the
# dataset archive stored there can be read by the following cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Unzip data to local session
%%capture
!unzip "/content/drive/MyDrive/aksharantar_sampled.zip"

In [None]:
# Detect all languages: every sub-directory of the dataset folder is treated
# as one language.
datafolder_path = "/content/aksharantar_sampled/"


def _is_language_dir(name):
    """Return True when `name` is a directory directly inside the dataset folder."""
    return os.path.isdir(os.path.join(datafolder_path, name))


languages = list(filter(_is_language_dir, os.listdir(datafolder_path)))

In [None]:
# Read Data
# Only Tamil is loaded; this also replaces the auto-detected `languages` list.
language = 'tam'
languages = ['tam']
datasetCategories = ["train", "test", "valid"]
dataPath = datafolder_path + language + "/"

# allData[language][category] -> DataFrame read without a header row, so the
# columns are addressed by position (0 and 1).
allData = {language: {}}
for category in datasetCategories:
    allData[language][category] = pd.read_csv(
        "{}{}_{}.csv".format(dataPath, language, category),
        header = None,
    )

In [None]:
def _pairs_for_split(allData, languages, split):
    """Collect [column 0, column 1] word pairs of one dataset split.

    Args:
        allData: nested mapping language -> split name -> DataFrame whose
            columns 0 and 1 hold the two words of each pair.
        languages: language keys to include, in order.
        split: split name, e.g. 'train', 'valid' or 'test'.

    Returns:
        A list of two-element lists, one per row, languages concatenated in
        the order given.
    """
    pairs = []
    for language in languages:
        frame = allData[language][split]
        # Iterate both columns in lock-step instead of indexing the frame
        # once per row and per column.
        pairs.extend([source, target] for source, target in zip(frame[0], frame[1]))
    return pairs


train_pairs = _pairs_for_split(allData, languages, 'train')
val_pairs = _pairs_for_split(allData, languages, 'valid')
test_pairs = _pairs_for_split(allData, languages, 'test')

Create Language Model

In [None]:
class LangModel:
    """Character vocabulary for one side of the transliteration task.

    Keeps a two-way mapping between characters and integer ids plus a
    per-character occurrence count. Ids 0 and 1 are reserved for the
    "SOW" and "EOW" marker tokens.
    """

    def __init__(self, name = "devanagiri"):
        self.name = name
        self.character2index = {"SOW" : 0, "EOW" : 1}
        # Inverse view of the seed vocabulary.
        self.index2character = {index: token for token, index in self.character2index.items()}
        self.nCharacters = len(self.character2index)
        self.character2count = {}

    def addWord(self, word):
        """Register every character of `word`."""
        for symbol in word:
            self.addCharacter(symbol)

    def addCharacter(self, character):
        """Count `character`, assigning it the next free id on first sight."""
        if character in self.character2index:
            self.character2count[character] += 1
            return

        new_index = self.nCharacters
        self.character2index[character] = new_index
        self.character2count[character] = 1
        self.index2character[new_index] = character
        self.nCharacters = new_index + 1


In [None]:
def prepareData(allData, languages):
    """Build the source and target vocabularies from the training split.

    Column 0 of every language's 'train' table feeds the input vocabulary
    (named 'eng'); column 1 feeds the output vocabulary.

    Returns:
        (input vocabulary, output vocabulary) as LangModel instances.
    """
    source_vocab = LangModel(name = 'eng')
    target_vocab = LangModel()
    for lang_code in languages:
        train_split = allData[lang_code]['train']
        for row in range(len(train_split)):
            source_vocab.addWord(train_split[0][row])
            target_vocab.addWord(train_split[1][row])
    return source_vocab, target_vocab

In [None]:
input_lang, output_lang = prepareData(allData, languages)

Preparing Training Data

In [None]:
def indexesFromWord(lang, word):
    """Translate `word` into the list of its character ids in `lang`.

    Raises KeyError for a character missing from `lang.character2index`.
    """
    indexes = []
    for symbol in word:
        indexes.append(lang.character2index[symbol])
    return indexes

def tensorFromWord(lang, word):
    """Encode `word` as a column tensor of character ids terminated by EOW.

    Returns a long tensor of shape (len(word) + 1, 1) placed on `device`.
    """
    token_ids = indexesFromWord(lang, word) + [lang.character2index["EOW"]]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

def tensorsFromPair(pair):
    """Encode a (source word, target word) pair with the global vocabularies.

    pair[0] is encoded with `input_lang`, pair[1] with `output_lang`.
    """
    source_word, target_word = pair[0], pair[1]
    return (
        tensorFromWord(input_lang, source_word),
        tensorFromWord(output_lang, target_word),
    )

Seq2Seq Model

In [None]:
class EncoderRNN(nn.Module):
    """Character-level recurrent encoder that consumes one token per call.

    Args:
        input_size: size of the source vocabulary (number of embedding rows).
        hidden_size: width of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        embedding_size: width of the character embedding.
        bidirectional: whether the recurrent cell is bidirectional.
        cell_type: "RNN", "LSTM", or anything else for a GRU.
    """

    def __init__(self, input_size, hidden_size, num_layers = 3, embedding_size = 256, bidirectional = True, cell_type = "GRU"):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.bidirectional = bidirectional
        self.cell_type = cell_type
        if cell_type == "RNN":
            self.rnn = nn.RNN(embedding_size, hidden_size, bidirectional = bidirectional, num_layers = num_layers)
        elif cell_type == "LSTM":
            self.lstm = nn.LSTM(embedding_size, hidden_size, bidirectional = bidirectional, num_layers = num_layers)
        else:
            self.gru = nn.GRU(embedding_size, hidden_size, bidirectional = bidirectional, num_layers = num_layers)

    def forward(self, input, hidden, c):
        """Advance the encoder by one token.

        Args:
            input: tensor holding a single token index; its embedding is
                reshaped to (1, 1, embedding_size).
            hidden: recurrent hidden state from the previous step.
            c: LSTM cell state; ignored by the RNN and GRU cells.

        Returns:
            (output, hidden) for this step. The updated LSTM cell state is
            not part of the return value, so callers that keep passing the
            same `c` restart from that cell state on every step.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        if self.cell_type == "RNN":
            output, hidden = self.rnn(embedded, hidden)
        elif self.cell_type == "LSTM":
            # Bind the new state to `hidden`; a misspelled target here used to
            # make this branch return the stale hidden state it was given.
            output, (hidden, c) = self.lstm(embedded, (hidden, c))
        else:
            output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return zero-initialised (hidden, cell) states for a batch of one.

        The leading dimension is num_layers * num_directions so the states
        also fit a bidirectional cell. The tensors are created on the same
        device and with the same dtype as the module's embedding weights.
        """
        num_directions = 2 if self.bidirectional else 1
        shape = (self.num_layers * num_directions, 1, self.hidden_size)
        weight = self.embedding.weight
        hidden = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        cell = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        return hidden, cell

In [None]:
class DecoderRNN(nn.Module):
    """Character-level recurrent decoder that emits one token per call.

    Args:
        hidden_size: width of the recurrent hidden state.
        output_size: size of the target vocabulary.
        dropout_p: dropout probability applied to the input embedding.
            Dropout follows the module's train/eval mode, so switch the
            module to eval mode for inference.
        bidirectional: whether the recurrent cell is bidirectional.
        num_layers: number of stacked recurrent layers.
        embedding_size: width of the character embedding.
        cell_type: "RNN", "LSTM", or anything else for a GRU.
    """

    def __init__(self, hidden_size, output_size, dropout_p = 0, bidirectional = True, num_layers = 3, embedding_size = 256, cell_type = "GRU"):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.dropout_p = dropout_p
        self.num_layers = num_layers
        self.embedding_size = embedding_size
        self.bidirectional = bidirectional
        self.cell_type = cell_type

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.dropout = nn.Dropout(self.dropout_p)
        if cell_type == "RNN":
            self.rnn = nn.RNN(embedding_size, hidden_size, bidirectional = bidirectional, num_layers = num_layers)
        elif cell_type == "LSTM":
            self.lstm = nn.LSTM(embedding_size, hidden_size, bidirectional = bidirectional, num_layers = num_layers)
        else:
            self.gru = nn.GRU(embedding_size, hidden_size, bidirectional = bidirectional, num_layers = num_layers)
        # A bidirectional cell concatenates both directions, doubling the
        # feature width that the projection layer receives.
        num_directions = 2 if bidirectional else 1
        self.out = nn.Linear(hidden_size * num_directions, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, c):
        """Advance the decoder by one token.

        Args:
            input: tensor holding a single token index; its embedding is
                reshaped to (1, 1, embedding_size).
            hidden: recurrent hidden state from the previous step.
            c: LSTM cell state; ignored by the RNN and GRU cells.

        Returns:
            (log_probs, hidden): log-probabilities over the target vocabulary
            with shape (1, output_size), and the updated hidden state. The
            updated LSTM cell state is not returned.
        """
        output = self.embedding(input).view(1, 1, -1)
        # The dropout layer was constructed but never used before, which made
        # `dropout_p` a no-op.
        output = self.dropout(output)
        output = F.relu(output)
        if self.cell_type == "RNN":
            output, hidden = self.rnn(output, hidden)
        elif self.cell_type == "LSTM":
            # Bind the new state to `hidden`; a misspelled target here used to
            # make this branch return the stale hidden state it was given.
            output, (hidden, c) = self.lstm(output, (hidden, c))
        else:
            output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return zero-initialised (hidden, cell) states for a batch of one.

        The leading dimension is num_layers * num_directions so the states
        also fit a bidirectional cell. The tensors are created on the same
        device and with the same dtype as the module's embedding weights.
        """
        num_directions = 2 if self.bidirectional else 1
        shape = (self.num_layers * num_directions, 1, self.hidden_size)
        weight = self.embedding.weight
        hidden = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        cell = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
        return hidden, cell

Training Loop

In [None]:
# Ids of the start/end markers; they match the "SOW"/"EOW" entries every
# LangModel is seeded with.
SOS_token = 0
EOS_token = 1

# Probability that a training step feeds the reference character (instead of
# the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.5

# Cap on the number of decoding steps: source vocabulary size plus 10.
MAX_LENGTH = input_lang.nCharacters + 10

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (source, target) word pair.

    The source is fed to the encoder one token at a time; the decoder starts
    from the encoder's final hidden state and the SOS token. With probability
    `teacher_forcing_ratio` the reference characters are used as decoder
    inputs, otherwise the decoder consumes its own greedy predictions and
    stops early once it predicts EOS.

    Args:
        input_tensor: source token ids, one row per token.
        target_tensor: target token ids, one row per token.
        encoder, decoder: the models to update.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to each decoder output and target token.
        max_length: kept for interface compatibility; not used here.

    Returns:
        The summed loss divided by the target length (0.0 for an empty
        target, in which case no update is performed).

    Note:
        The cell state `c` from `encoder.initHidden()` is handed unchanged to
        every encoder and decoder call because their forward passes return
        only the hidden state.
    """
    encoder_hidden, c = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    if target_length == 0:
        # Nothing to predict: avoid calling .backward() on the integer 0 and
        # dividing by zero below.
        return 0.0

    loss = 0

    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden, c)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, c)
        loss += criterion(decoder_output, target_tensor[di])
        if use_teacher_forcing:
            # Teacher forcing: feed the reference character as the next input.
            decoder_input = target_tensor[di]
        else:
            # Free running: feed the greedy prediction, detached from history.
            decoder_input = decoder_output.topk(1)[1].squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
def trainIters(pairs, encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):
    """Train on `n_iters` randomly sampled pairs using plain SGD.

    Every `print_every` steps the iteration number is printed, the mean
    training loss of that window is recorded, and validation accuracy is
    computed on the global `val_pairs`.

    Returns:
        (window-averaged training losses, validation accuracies), one entry
        per reporting window.
    """
    loss_history = []
    val_acc_history = []
    window_loss = 0  # Reset after every report

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All samples are drawn (with replacement) and encoded up front.
    sampled_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(sampled_pairs, start=1):
        window_loss += train(input_tensor, target_tensor, encoder,
                             decoder, encoder_optimizer, decoder_optimizer, criterion)

        if step % print_every != 0:
            continue

        print("Iteration :", step)
        loss_history.append(window_loss / print_every)
        window_loss = 0
        val_acc_history.append(calc_acc(encoder, decoder, val_pairs))

    return loss_history, val_acc_history
  

In [None]:
def evaluate(encoder, decoder, word, max_length=MAX_LENGTH):
    """Greedily transliterate `word` with the given encoder/decoder.

    Both models are switched to eval mode for the duration of the call (so
    mode-dependent layers such as dropout are inactive) and restored to their
    previous mode afterwards. No gradients are tracked.

    Args:
        encoder, decoder: trained models.
        word: source word; every character must be known to `input_lang`.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted characters followed by a final '<EOW>' marker. The
        marker is always present, including when decoding stops because
        `max_length` was reached, so callers can drop the last element
        unconditionally.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromWord(input_lang, word)
            encoder_hidden, c = encoder.initHidden()

            for ei in range(input_tensor.size(0)):
                _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden, c)

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
            decoder_hidden = encoder_hidden

            decoded_words = []
            for _ in range(max_length):
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden, c)

                _, topi = decoder_output.topk(1)
                predicted = topi.item()
                if predicted == EOS_token:
                    decoded_words.append('<EOW>')
                    break
                decoded_words.append(output_lang.index2character[predicted])
                decoder_input = topi.squeeze().detach()
            else:
                # Step budget exhausted without EOS: still terminate with the
                # marker so callers stripping it do not lose a real character.
                decoded_words.append('<EOW>')

            return decoded_words
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def calc_acc(encoder, decoder, data_pairs):
    """Exact-match accuracy of the model over `data_pairs`.

    Each pair's first element is transliterated with `evaluate`; the
    prediction (without its trailing end marker) counts as correct when it
    equals the pair's second element.

    Returns:
        Fraction of exactly matching predictions, or 0.0 when `data_pairs`
        is empty.
    """
    num_correct = 0
    num_total = 0
    for pair in data_pairs:
        pred = "".join(evaluate(encoder, decoder, pair[0])[:-1])
        num_total += 1
        if pred == pair[1]:
            num_correct += 1
    if num_total == 0:
        # Avoid ZeroDivisionError on an empty evaluation set.
        return 0.0
    return num_correct / num_total

In [None]:
# hidden_size = 257
# encoder1 = EncoderRNN(input_lang.nCharacters, hidden_size, bidirectional = False, cell_type = "LSTM").to(device)
# decoder1 = DecoderRNN(hidden_size, output_lang.nCharacters, bidirectional = False, cell_type = "LSTM").to(device)

# trainIters(train_pairs, encoder1, decoder1, 25000, print_every=2500, learning_rate = 0.001)

Wandb Sweeps

In [None]:
# Repeated import so this cell can be run on its own.
import wandb
# Log in to Weights & Biases before the sweep is created.
wandb.login()

True

In [None]:
# Bayesian hyperparameter search that maximises the logged "val_acc" metric.
# Every hyperparameter is a discrete choice among the listed values.
sweep_config = {
    "name": "Bayesian Sweep without attention",
    "method": "bayes",
    "metric": {"name": "val_acc", "goal": "maximize"},
    "parameters": {
        hyperparameter: {"values": choices}
        for hyperparameter, choices in (
            ("num_layers", [1, 2, 3]),
            ("cell_type", ["LSTM", "RNN", "GRU"]),
            ("dropout", [0, 0.2, 0.3]),
            ("embedding_size", [32, 128, 256]),
            ("learning_rate", [0.001, 0.01]),
        )
    },
}

In [None]:
def create_and_train_model(config = None):
    """Sweep entry point: build, train and log one encoder/decoder pair.

    Hyperparameters are read from `wandb.config`; the run is renamed after
    them. Training lasts 50000 iterations with a report every 5000, and the
    collected loss / validation-accuracy curves are logged once training
    has finished.
    """
    total_iterations = 50000
    report_interval = 5000
    hidden_size = 256
    bidirectional = False

    with wandb.init(config = config, project = "CS6910-A3") as run:

        config = wandb.config

        num_layers = config['num_layers']
        cell_type = config['cell_type']
        dropout_p = config['dropout']
        embedding_size = config['embedding_size']
        learning_rate = config['learning_rate']

        run.name = "_".join([
            "nl", str(num_layers),
            str(cell_type),
            "d", str(dropout_p),
            "es", str(embedding_size),
            "lr", str(learning_rate),
        ])

        encoder = EncoderRNN(input_lang.nCharacters, hidden_size, num_layers, embedding_size, bidirectional, cell_type).to(device)
        decoder = DecoderRNN(hidden_size, output_lang.nCharacters, dropout_p, bidirectional, num_layers, embedding_size, cell_type).to(device)

        losses, valaccs = trainIters(train_pairs, encoder, decoder, total_iterations,
                                     print_every = report_interval, learning_rate = learning_rate)

        for idx, window_loss in enumerate(losses):
            wandb.log({"train_loss" : window_loss,
                       "val_acc" : valaccs[idx],
                       "iterations" : report_interval * (idx + 1)})


In [None]:
# Create the sweep described by sweep_config in the "CS6910-A3" project and
# keep its id for the agent call.
sweep_id = wandb.sweep(sweep_config, project = "CS6910-A3")

Create sweep with ID: ctjyxywh
Sweep URL: https://wandb.ai/mani-ml/CS6910-A3/sweeps/ctjyxywh


In [None]:
# Run create_and_train_model for the sweep; count = 30 presumably caps the
# number of runs this agent executes (confirm against the wandb docs).
agent = wandb.agent(sweep_id, function = create_and_train_model, project = "CS6910-A3", count = 30)
wandb.finish()

In [None]:
def evaluateRandomly(pairs, encoder, decoder, n=10):
    """Print source ('>'), reference ('=') and prediction ('<') for `n` random pairs."""
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        # Drop the trailing end-of-word marker before joining the characters.
        predicted_chars = evaluate(encoder, decoder, sample[0])[:-1]
        print('<', ''.join(predicted_chars))
        print('')