In [1]:
import csv
import string
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

#setting the device to "cuda" if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
MAX_LENGTH=10

class Vocab_builder:
    def __init__(self):
        self.word_2_index={"<SOS>":0,"<EOS>":1,"<PAD>":2,"<UKN>":3}
        self.index_2_word={0:"<SOS>", 1:"<EOS>", 2:"<PAD>", 3:"<UKN>"}
        self.freq={}
        self.size=4

    def add_this_sentence(self,sentence):
        words=sentence.split(" ")
        for word in words:
            if word not in self.word_2_index:
                #If the word is not there, add it to a new index and store the indexes
                #Initialize the frequency of the word to 1 and increase the size of the vocabulary
                self.word_2_index[word]=self.size
                self.freq[word]=1
                self.index_2_word[self.size]=word
                self.size+=1
            else:
                # If the word is already present then just increase the frequency
                self.freq[word]+=1

In [4]:
#Initilizing the objects of hindi and english vocabularies:
hindi_vocab=Vocab_builder()
eng_vocab=Vocab_builder()

In [5]:
def length(sentence):
    return len(sentence.split(" "))

def is_mixed(sentence):
    letters="abcdefghijklmnopqrstuvwxyz"
    for ch in letters:
        if ch in sentence:
            return True
    return False

def preprocess(sentence):
    #First we will remove all punctuations from the sentence
    punctuations=list(string.punctuation)
    cleaned=""
    for letter in sentence:
        if letter not in punctuations:
            cleaned+=letter
    cleaned=cleaned.lower() ## Converting into lowercase
    return cleaned

In [6]:
def clean_the_data(path):
    pairs=[]
    with open(path,'rt') as f:
        data=csv.reader(f, delimiter=',')
        row_num=0
        for row in data:
            if row_num!=0:  #We will not process first row as it will contain header
                hindi=row[1]
                #print(row_num)
                #print(hindi)
                eng=row[2]
                #print(eng)

                if length(hindi)>=MAX_LENGTH or length(eng)>=MAX_LENGTH:  #skipping if length is more than MAX_LENGTH
                    continue
                if not hindi or not eng:  #skipping pair having any NULL value
                    continue
                if is_mixed(hindi):   #skipping sentence if it contains some english word
                    continue
                hindi=hindi.encode('utf-8',errors='ignore').decode('utf-8')
                eng=eng.encode('ascii',errors='ignore').decode('utf-8')
                hindi=preprocess(hindi)
                eng=preprocess(eng)
                #Adding <SOS>, <EOS> and padding tokens
                pair=[hindi.strip(), eng.strip()]

                hin_extra=MAX_LENGTH-len(hindi.strip().split(" "))
                eng_extra=MAX_LENGTH-len(eng.strip().split(" "))

                hindi_vocab.add_this_sentence(pair[0])
                eng_vocab.add_this_sentence(pair[1])
                pair[0]=pair[0].split(" ")
                pair[0].insert(0,"<SOS>")
                pair[0].append("<EOS>")
                pair[0]=pair[0]+["<PAD>"]*(hin_extra)

                pair[1]=pair[1].split(" ")
                pair[1].insert(0,"<SOS>")
                pair[1].append("<EOS>")
                pair[1]=pair[1]+["<PAD>"]*(eng_extra)

                pair[0]=" ".join(pair[0])
                pair[1]=" ".join(pair[1])
                pairs.append(pair)
            row_num+=1
    return pairs

In [7]:
file_path="/content/drive/MyDrive/"     
train_file_path=file_path+"train-set.csv"

In [8]:
pairs=clean_the_data(train_file_path)

In [9]:
pairs

[['<SOS> मैं उनके साथ कोई लेना देना नहीं है <EOS> <PAD> <PAD>',
  '<SOS> i have nothing to do with them <EOS> <PAD> <PAD> <PAD>'],
 ['<SOS> हटाओ रिक <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>',
  '<SOS> fuck them rick <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'],
 ['<SOS> क्योंकि यह एक खुशियों भरी फ़िल्म है <EOS> <PAD> <PAD> <PAD>',
  '<SOS> because its a happy film <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>'],
 ['<SOS> क्या भाषा क्या वे वहाँ बात की <EOS> <PAD> <PAD> <PAD>',
  '<SOS> what language do they speak there <EOS> <PAD> <PAD> <PAD> <PAD>'],
 ['<SOS> गन क्लिक करके <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>',
  '<SOS> gun clicking <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'],
 ['<SOS> ये बिलकुल रोमांचकारी अनुभव है। <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>',
  '<SOS> its thrilling <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'],
 ['<SOS> ♪औरमैंउसे वहाँखड़े देखा थाएक <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>',
  '<SOS> and i saw her standing there <EOS>

In [10]:
len(pairs)

55133

In [11]:
#Now we need to convert each of this pair into corresponding tensors
def pair_to_tensor(pair):
    hindi_sentence=pair[0]
    eng_sentence=pair[1]
    indexes_hindi=[hindi_vocab.word_2_index[word] for word in hindi_sentence.split(' ')]
    indexes_eng=[eng_vocab.word_2_index[word] for word in eng_sentence.split(' ')]
    hindi_tensor=torch.tensor(indexes_hindi, dtype=torch.long, device=device).view(-1,1)
    eng_tensor=torch.tensor(indexes_eng, dtype=torch.long, device=device).view(-1,1)
    return (hindi_tensor, eng_tensor)

In [12]:
hin_tensors=[]
eng_tensors=[]
for pair in pairs:      # we will convert each pair into tensor to process it
    hin,eng=pair_to_tensor(pair)
    hin_tensors.append(hin)
    eng_tensors.append(eng)

In [13]:
hin_tensors[0]

tensor([[ 0],
        [ 4],
        [ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10],
        [11],
        [ 1],
        [ 2],
        [ 2]], device='cuda:0')

In [14]:
hin_tensors[1]

tensor([[ 0],
        [12],
        [13],
        [ 1],
        [ 2],
        [ 2],
        [ 2],
        [ 2],
        [ 2],
        [ 2],
        [ 2],
        [ 2]], device='cuda:0')

In [15]:
class Transformer_model(nn.Module):
    def __init__(self, embed_size, len_hin_vocab, len_eng_vocab, src_pad_index, num_heads, enc_layers, dec_layers, forward_exp, dropout, max_length,device):
        super(Transformer_model,self).__init__()
        self.hin_word_embed=nn.Embedding(len_hin_vocab, embed_size) #shape: (len_hin_vocab, embed_size)
        self.eng_word_embed=nn.Embedding(len_eng_vocab, embed_size) #shape: (len_eng_vocab, embed_size)
        self.hin_positional_embed=nn.Embedding(max_length, embed_size) #shape: (MAX_LENGTH, embed_size)
        self.eng_positional_embed=nn.Embedding(max_length, embed_size)  #shape: (MAX_LENGTH, embed_size)
        self.device=device
        self.transformer_layer=nn.Transformer(embed_size, num_heads,enc_layers, dec_layers, forward_expansion, dropout)
        self.out_fc=nn.Linear(embed_size, len_eng_vocab)    #linear layer to predicted the output word
        self.dropout=nn.Dropout(dropout)
        self.src_pad_index=src_pad_index

    def gen_mask_for_hindi(self, source):
        #need to transpose source as padding need to be of size (batch_size, seq_len) but source is of shape (seq_len, batch_size)
        source=source.transpose(0,1)
        mask=(source==self.src_pad_index) #(mask will contain 1 where there is pad token, and 0 otherwise)
        return mask.to(self.device)

    def forward(self, src, target):
        hin_seq_length, batch_size=src.shape
        eng_seq_length, batch_size=target.shape
        # creating positional embeddings to encode position of words in transformer (it will be just a range array upto max_length)
        hin_positional=torch.arange(0,hin_seq_length).unsqueeze(1).expand(hin_seq_length, batch_size).to(self.device)
        eng_positional=torch.arange(0,eng_seq_length).unsqueeze(1).expand(eng_seq_length, batch_size).to(self.device)
        # calculating embeddings as sum of positional and word embeddings
        hin_embedding=self.dropout(self.hin_word_embed(src)+self.hin_positional_embed(hin_positional))
        eng_embedding=self.dropout(self.eng_word_embed(target)+self.eng_positional_embed(eng_positional))
        # generating padding mask for hindi (source)
        hindi_padding_mask=self.gen_mask_for_hindi(src)
        # using in-built transformer function to generate mask for english (target)
        # It will be in form of a lower-triangular matrix
        eng_mask=self.transformer_layer.generate_square_subsequent_mask(eng_seq_length).to(self.device)
        output=self.transformer_layer(hin_embedding, eng_embedding, src_key_padding_mask=hindi_padding_mask, tgt_mask=eng_mask)
        output=self.out_fc(output)
        return output

**Training the Model**

In [16]:
# defining model parameters
embed_size=512
len_hin_vocab=hindi_vocab.size
len_eng_vocab=eng_vocab.size
padding_idx=eng_vocab.word_2_index["<PAD>"]
num_heads=8
enc_layers, dec_layers= 1,1
dropout=0.10
forward_expansion=4

In [17]:
model=Transformer_model(embed_size, len_hin_vocab, len_eng_vocab, padding_idx, num_heads, enc_layers, dec_layers, forward_expansion, dropout, MAX_LENGTH+2,device).to(device)



In [18]:
model_available=False # A variable to indicate whether a model is present in the path or not

In [19]:
batch_size=64
optimizer=optim.Adam(model.parameters(),lr=0.001)
PATH="/content/hi_en_demo.pth"

epochs=2
epoch_loss=0.0

criterion=nn.CrossEntropyLoss(ignore_index=padding_idx) #ignore padding index while calculating loss

train_model=True #if need to train the model again, set it to True

if train_model == False:
    model = torch.load(PATH)
else:
    if model_available:
        model = torch.load(PATH)
    else:
        epochs_completed = 0
        while epochs_completed < epochs:
            epochs_completed += 1
            print(f"epoch {epochs_completed}/{epochs}")
            model.eval()
            model.train(True)
            cur_batch = 0
            idx = 0
            while idx < len(pairs):
                cur_batch += 1
                if cur_batch % 100 == 0:
                    print(f"    running batch {cur_batch} of {batches}")
                if idx + batch_size < len(pairs):
                    src_batch = hin_tensors[idx:idx+batch_size]
                    target_batch = eng_tensors[idx:idx+batch_size]
                else:
                    src_batch = hin_tensors[idx:]
                    target_batch = eng_tensors[idx:]

                src_batch = torch.cat(src_batch, dim=1).to(device)
                target_batch = torch.cat(target_batch, dim=1).to(device)
                output = model(src_batch, target_batch[:-1, :])
                output = output.reshape(-1, output.shape[2])

                target = target_batch[1:].reshape(-1)

                optimizer.zero_grad()
                loss = criterion(output, target)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

                optimizer.step()
                epoch_loss += loss.item()

                idx += batch_size

            print(f"Epoch loss : {loss.item()}")

            torch.save(model, PATH)
            model_available = True

epoch 1/2
    running batch 100 of 861
    running batch 200 of 861
    running batch 300 of 861
    running batch 400 of 861
    running batch 500 of 861
    running batch 600 of 861
    running batch 700 of 861
    running batch 800 of 861
Epoch loss : 4.024248123168945
epoch 2/2
    running batch 100 of 861
    running batch 200 of 861
    running batch 300 of 861
    running batch 400 of 861
    running batch 500 of 861
    running batch 600 of 861
    running batch 700 of 861
    running batch 800 of 861
Epoch loss : 3.456819772720337


In [20]:
def predict_translation(model, sentence, device, max_length=MAX_LENGTH):
    sentence = clean_sentence(sentence)
    tokens = sentence.split(" ")
    indexes = []
    for token in tokens:
        if token in hindi_vocab.word_2_index:
            indexes.append(hindi_vocab.word_2_index[token])
        else:
            indexes.append(hindi_vocab.word_2_index["<UKN>"])
    indexes = indexes[:MAX_LENGTH+2]  # model is trained on MAX_LENGTH sentences only so it expects sentences of this length only
    tensor_of_sentence = torch.LongTensor(indexes).unsqueeze(1).to(device)
    outputs = [0]  # adding <SOS> in the beginning of output
    while True:
        target_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
        with torch.no_grad():
            output = model(tensor_of_sentence, target_tensor)
        pred = output.argmax(2)[-1, :].item()

        outputs.append(pred)

        if eng_vocab.index_2_word[pred] == "<EOS>":
            break

        if len(outputs) >= max_length:
            break

    final = []

    for i in outputs:
        if i == "<PAD>":
            break
        final.append(i)

    final = [eng_vocab.index_2_word[idx] for idx in final]
    translated = " ".join(final)
    return translated

In [21]:
from time import time

start=time()
print(predict_translation(model,'हमसफर',device))
end=time()

elapse=(end-start)*1000
print(f'Time elapses: {elapse}ms')

<SOS> all right <EOS>
Time elapses: 98.72007369995117ms


In [22]:
test_sentences=[pair[0] for pair in pairs[125:150]]
actual_sentences=[pair[1] for pair in pairs[125:150]]
pred_sentences=[]

for idx,i in enumerate(test_sentences):
    translated=predict_translation(model,i,device)
    print("*"*20)
    print(f"Hindi: {i}")
    print(f"Actual: {actual_sentences[idx]}")
    print(f"Predicted: {translated}")
    print("*"*20)

********************
Hindi: <SOS> दीदी फ़ूल दे दीजिये <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Actual: <SOS> sister flower please <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Predicted: <SOS> the whole doctor <EOS>
********************
********************
Hindi: <SOS> ग्रन्ट्स <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Actual: <SOS> grunts <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Predicted: <SOS> grunts <EOS>
********************
********************
Hindi: <SOS> ca चलिए उसे यहाँ दिखाते हैं <EOS> <PAD> <PAD> <PAD> <PAD>
Actual: <SOS> ca i think we have that lets show that <EOS> <PAD>
Predicted: <SOS> ca here <EOS>
********************
********************
Hindi: <SOS> नुक्सान किन में जमा हो सकता है <EOS> <PAD> <PAD> <PAD>
Actual: <SOS> what can damage accumulate in <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Predicted: <SOS> the first is in the world <EOS>
********************
********************
Hindi: <SOS> और मैं यहीं रुकता हूं धन्यवाद <EOS> <PAD> <P

In [23]:
test_sentences=[pair[0] for pair in pairs[125:150]]
actual_sentences=[pair[1] for pair in pairs[125:150]]
pred_sentences=[]

for idx,i in enumerate(test_sentences):
    translated=predict_translation(model,i,device)
    print("*"*20)
    print(f"Hindi: {i}")
    print(f"Actual: {actual_sentences[idx]}")
    print(f"Predicted: {translated}")
    print("*"*20)

********************
Hindi: <SOS> दीदी फ़ूल दे दीजिये <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Actual: <SOS> sister flower please <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Predicted: <SOS> the data is a little bit <EOS>
********************
********************
Hindi: <SOS> ग्रन्ट्स <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Actual: <SOS> grunts <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Predicted: <SOS> grunts <EOS>
********************
********************
Hindi: <SOS> ca चलिए उसे यहाँ दिखाते हैं <EOS> <PAD> <PAD> <PAD> <PAD>
Actual: <SOS> ca i think we have that lets show that <EOS> <PAD>
Predicted: <SOS> ca here lets go here <EOS>
********************
********************
Hindi: <SOS> नुक्सान किन में जमा हो सकता है <EOS> <PAD> <PAD> <PAD>
Actual: <SOS> what can damage accumulate in <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Predicted: <SOS> i have to be in the world <EOS>
********************
********************
Hindi: <SOS> और मैं यहीं रुकता हूं ध

In [26]:
fp=open("/content/drive/MyDrive/Colab Notebooks/answer.txt","w")

In [27]:
# Path of the validation/test data
val_data_path="/content/drive/MyDrive/test-statements-phase1.csv"
with open(val_data_path, 'rt') as f:
    data=csv.reader(f, delimiter=',')
    row_num=0
    for row in data:
        if row_num==0:
            row_num+=1
            continue
        sentence=row[2].strip()
        translated=predict_translation(model,sentence,device)
        translated=translated.split(" ")[1:-1]  #removing SOS and EOS token before writing to the file
        translated=" ".join(translated)
        fp.write(translated+'\n')
        if row_num%500==0:
            print(f"sentence : {row_num}")
        row_num+=1
fp.close()


sentence : 500
sentence : 1000
sentence : 1500
sentence : 2000
sentence : 2500
sentence : 3000
sentence : 3500
sentence : 4000
sentence : 4500
sentence : 5000
