In [None]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset
import torch.nn.functional as F
from torchvision import transforms
import nltk
from torch.nn import init
import pickle

In [None]:
#Function to get the input from the input file.
def input_data(path):
    """Read a tab-separated question/response/label file into parallel lists.

    Every non-blank line must contain exactly three tab-separated fields:
    the question, the response and an integer label. Question and response
    are split on whitespace into token lists.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        A tuple ``(context, utter, label)`` of three equally long lists:
        question token lists, response token lists and integer labels.

    Raises:
        ValueError: If a non-blank line does not have exactly three
            tab-separated fields or its label is not an integer.
    """
    context=[]
    utter=[]
    label=[]

    #The context manager closes the file even if a malformed line raises.
    with open(path,'r',encoding="utf-8") as file:
        for line_no,line in enumerate(file,start=1):

            #Blank lines (e.g. a trailing newline at the end of the file) carry no sample.
            if not line.strip():
                continue

            fields=line.split('\t')#the question, response and label are tab separated
            if len(fields)!=3:
                raise ValueError(
                    "{}: line {} has {} tab-separated fields, expected 3".format(path,line_no,len(fields)))

            c,u,l=fields
            context.append(c.strip().split())
            utter.append(u.strip().split())
            label.append(int(l.strip()))

    return context,utter,label

In [None]:
#Function to give every distinct word of the corpus its own integer index.
def word_index(context_train,utter_train,context_test,context_valid,utter_test,utter_valid):
    """Build a word -> index dictionary over all six token-list corpora.

    The corpora are scanned in the order context_train, utter_train,
    context_test, context_valid, utter_test, utter_valid. Indices are
    handed out in first-seen order, starting at 0.

    Returns:
        dict mapping each distinct word to its integer index.
    """
    vocabulary={}
    every_sentence=context_train+utter_train+context_test+context_valid+utter_test+utter_valid

    for tokens in tqdm(every_sentence):
        for token in tokens:
            #setdefault leaves a known word untouched; a new word gets the next free index,
            #which is the current vocabulary size.
            vocabulary.setdefault(token,len(vocabulary))

    return vocabulary

In [None]:
#Function to return the length of the longest sentence in a list of sentences.
def max_utt_len(sentences):
    """Return the largest ``len()`` among `sentences`, or 0 if there are none."""
    return max((len(tokens) for tokens in sentences),default=0)

In [None]:
#Function to replace, in place, every word of every sentence by its index from the word_index dictionary.
def wordtoindex(sentences,dictionary):
    """Overwrite each token of each sentence with ``dictionary[token]``.

    The sentences are modified in place and nothing is returned. A token
    missing from `dictionary` raises KeyError.
    """
    for tokens in tqdm(sentences):
        for position,token in enumerate(tokens):
            tokens[position]=dictionary[token]

In [None]:
#Function that pads every sentence with zeros up to max_len and builds the matching 1/0 mask.
def matrix(sentence,max_len):
    """Pad the sentences in place and return their masks.

    Each sentence shorter than `max_len` is extended in place with zeros.
    Sentences that are already `max_len` or longer are left unchanged.

    Returns:
        A list with one mask per sentence: 1 for every original position,
        0 for every padded position.
    """
    mask=[]

    for row in tqdm(range(len(sentence))):

        real_len=len(sentence[row])
        pad_len=max(max_len-real_len,0)#no padding when the sentence is already long enough

        sentence[row].extend([0]*pad_len)
        mask.append([1]*real_len+[0]*pad_len)

    return mask

In [None]:
#Function to map index of a word to its respective glove embedding.
def id_to_glove(id_dict,glove_embeds,embedding_dim=50):
    """Build an index -> embedding-vector dictionary for the vocabulary.

    Args:
        id_dict: dict mapping each word to its integer index.
        glove_embeds: dict mapping a word to its embedding (a sequence of floats).
        embedding_dim: length of the random vector generated for words that
            are missing from `glove_embeds`. Defaults to 50.

    Returns:
        dict mapping every index in `id_dict` to a float32 numpy vector:
        the GloVe vector when the word is known, otherwise a small random
        vector (standard normal samples scaled by 0.01).
    """
    #Named differently from the function so the function is not shadowed inside its own body.
    index_to_vector={}

    for word,index in id_dict.items():

        embed=glove_embeds.get(word)
        if embed is not None:
            index_to_vector[index]=np.array(embed,dtype='float32')
        else:
            #if any word from the corpus is not present in the glove embedding words
            #then the embedding for that word is randomly initiated
            index_to_vector[index]=(np.random.randn(embedding_dim)*0.01).astype('float32')

    return index_to_vector

In [None]:
#Reading the train, test and dev splits (in that order) into question tokens, response tokens and labels.
(
    (context_train,utter_train,label_train),
    (context_test,utter_test,label_test),
    (context_valid,utter_valid,label_valid),
)=map(
    input_data,
    (
        "../Corpus/WikiQA-train.txt",
        "../Corpus/WikiQA-test.txt",
        "../Corpus/WikiQA-dev.txt",
    ),
)

In [None]:
#Getting all the responses from the training dataset and dumping them as a pkl file for using it during inference.
with open("../Corpus/WikiQA-train.txt",'r',encoding="utf-8") as file:
    train=file.readlines()

#The response is the second tab-separated field; blank lines hold no sample and are skipped.
all_answers=[line.split('\t')[1] for line in train if line.strip()]

#'wb' instead of 'ab': in append mode every re-run adds another pickle to the end of the file,
#while pickle.load only ever reads the first (stale) one.
with open("../Backend/allresponses.pkl",'wb') as file1:
    pickle.dump(all_answers,file1)

In [None]:
#Creating a dictionary, words mapping their respective glove embeddings.
with open("../Corpus/glove.6B.50d.txt",'r',encoding="utf-8") as file:
    wordembeds=file.readlines()

glove_embeds={}
for line in tqdm(wordembeds):

    #Each line is a word followed by the whitespace-separated components of its vector.
    ls=line.strip().split()

    #A blank line has no word to use as key.
    if not ls:
        continue

    glove_embeds[ls[0]]=[float(value) for value in ls[1:]]

In [None]:
#Creating word to index mappings and dumping for further use in inference
word_to_index=word_index(context_train,utter_train,context_test,context_valid,utter_test,utter_valid)

#'wb' instead of 'ab': in append mode every re-run adds another pickle to the end of the file,
#while pickle.load only ever reads the first (stale) dictionary.
with open("../Backend/word_to_index_dictionary.pkl",'wb') as file:
    pickle.dump(word_to_index,file)

In [None]:
#Cutting every training response down to its first 150 tokens
for position,tokens in enumerate(utter_train):
    if len(tokens)>150:
        utter_train[position]=tokens[:150]

In [None]:
#Replacing, in place, the words of every split by their vocabulary indices
for corpus in (context_train,utter_train,context_test,utter_test,context_valid,utter_valid):
    wordtoindex(corpus,word_to_index)

In [None]:
#Creating the index -> embedding-vector mapping for the vocabulary (passed to Net as its Dictionary).
words_id_glove=id_to_glove(word_to_index,glove_embeds)

In [None]:
#Padding every split in place to the length of its own longest sentence and collecting the masks.
(
    mask_utter_train,
    mask_sentence_train,
    mask_utter_test,
    mask_sentence_test,
    mask_utter_valid,
    mask_sentence_valid,
)=[
    matrix(corpus,max_utt_len(corpus))
    for corpus in (utter_train,context_train,utter_test,context_test,utter_valid,context_valid)
]

In [None]:
#Turning the padded splits, their masks and the labels into numpy arrays
#(utter_train is not converted here).
context_train,utter_test,context_test,utter_valid,context_valid=map(
    np.array,
    (context_train,utter_test,context_test,utter_valid,context_valid),
)

(
    mask_utter_train,
    mask_sentence_train,
    mask_utter_test,
    mask_sentence_test,
    mask_utter_valid,
    mask_sentence_valid,
)=map(
    np.array,
    (
        mask_utter_train,
        mask_sentence_train,
        mask_utter_test,
        mask_sentence_test,
        mask_utter_valid,
        mask_sentence_valid,
    ),
)

#Labels are stored as float32.
label_train,label_test,label_valid=(
    np.array(labels,dtype='float32') for labels in (label_train,label_test,label_valid)
)

In [None]:
#Converting the training responses to a numpy array (done in its own cell, apart from the other splits).
utter_train=np.array(utter_train)

In [None]:
#Defining the custom torch dataset that is handed to the DataLoader.
class customdataloader(torch.utils.data.Dataset):
    """Dataset over five parallel, index-aligned collections.

    Item ``idx`` is the tuple
    ``(sent[idx], utter[idx], lab[idx], sent_mask[idx], utter_mask[idx])``.
    """

    def __init__(self,sent,utter,lab,sent_mask,utter_mask):
        """Store the five collections without copying them."""
        self.sent,self.utter,self.lab=sent,utter,lab
        self.sent_mask,self.utter_mask=sent_mask,utter_mask

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.lab)

    def __getitem__(self,idx):
        """Return the idx-th entry of every stored collection as one tuple."""
        fields=(self.sent,self.utter,self.lab,self.sent_mask,self.utter_mask)
        return tuple(field[idx] for field in fields)

In [None]:
class Net(nn.Module):
    """Sentence encoder: pretrained embedding layer followed by a single LSTM.

    Every sequence of a batch is encoded separately. Only its first
    ``sum(mask)`` positions are embedded and fed to the LSTM, and the last
    hidden state (after dropout) is the encoding of that sequence.

    Args:
        Dictionary: dict mapping a word index to its embedding vector.
        word_embedding_length: size of the embedding vectors. It is also used
            as the LSTM hidden size, so it is the size of the output encoding.
    """
    def __init__(self,Dictionary,word_embedding_length=50):

        super(Net,self).__init__()

        self.Dictionary=Dictionary
        self.lenDictionary=len(Dictionary)
        self.word_embedding_length=word_embedding_length
        self.embedding=nn.Embedding(self.lenDictionary,self.word_embedding_length)
        self.lstmBlock=nn.LSTM(self.word_embedding_length,self.word_embedding_length)
        self.dropout=nn.Dropout(0.5)
        self.init_weights()

    def init_weights(self):#for providing weights manually
        """Replace the embedding layer by one initialised from ``self.Dictionary``."""

        #Zeros rather than an uninitialised FloatTensor: a row whose index is missing from
        #the dictionary must not contain arbitrary memory contents.
        embedding_weights=torch.zeros(self.lenDictionary,self.word_embedding_length)

        for idx,glove in self.Dictionary.items():
            embedding_weights[idx]=torch.as_tensor(glove,dtype=torch.float32)#initializing weights with glove vectors

        #freeze=False keeps the embeddings trainable. from_pretrained freezes the weights by
        #default, which would silently turn requires_grad off again.
        self.embedding=nn.Embedding.from_pretrained(embedding_weights,freeze=False)

    def forward(self,sent,mask):
        """Encode a batch of index sequences; returns a [batch_size, word_embedding_length] tensor."""

        out_sent=self.forwardLSTM(sent,mask)#lstm layer
        return out_sent

    def forwardLSTM(self,utt,mask):
        """Run every sequence of the batch through the LSTM, one at a time.

        Args:
            utt: batch of padded word-index sequences, one row per sequence.
            mask: batch of masks aligned with `utt`. The sum of a row is used
                as the number of leading positions of that sequence to encode.

        Returns:
            Tensor of shape [batch_size, word_embedding_length]. The row of a
            sequence whose mask sums to 0 stays all zeros.
        """

        #Allocated on the device of the model parameters instead of always on the CPU.
        output=torch.zeros([utt.shape[0],self.word_embedding_length],
                           device=self.embedding.weight.device)#shape->[batch_size,embedding size]

        for no,(utti,maski) in enumerate(zip(utt,mask)):

            numutt=torch.sum(maski)
            if numutt==0:
                #Nothing to encode: the LSTM cannot run on a zero-length sequence.
                continue

            utti_embed=self.embedding(utti)
            utti_embed=utti_embed[:numutt].unsqueeze(1)#shape->[number_of_words,1,embedding size]

            _,(last_hidden,_)=self.lstmBlock(utti_embed)#last_hidden shape->[1,1,embedding size]
            output[no]=self.dropout(last_hidden[0][0])

        return output

In [None]:
def train(model,train_loader,optimizer,epoch):
    """Train `model` for one pass over `train_loader`.

    For every batch the questions and the responses are encoded by the same
    model. The dot product of the two encodings is the predicted score, and
    the mean squared error between the scores and the labels is minimised.

    Args:
        model: callable as ``model(sequences, mask)`` returning one encoding per row.
        train_loader: iterable yielding ``(sent, utt, lab, masksent, maskutt)``
            batches; must also support ``len()`` and expose ``.dataset``.
        optimizer: optimizer over the parameters of `model`.
        epoch: epoch number, only used in the progress message.
    """

    model.train()#Preparing the model for training.

    for batchid,(sent,utt,lab,masksent,maskutt) in enumerate(train_loader):#getting the batch

        optimizer.zero_grad()#setting the cummulative gradients to 0.

        output_sent=model(sent,masksent)#forward pass shape->[batch_size,50]
        output_utt=model(utt,maskutt)#forward pass shape->[batch_size,50]

        #Finding the MSE loss: one row-wise dot product per sample, computed for the
        #whole batch at once instead of in a Python loop.
        scores=(output_sent*output_utt).sum(dim=1)
        loss=((scores-lab.reshape(scores.shape))**2).mean()

        loss.backward()#calculating gradients of model
        optimizer.step()#updating model parameters

        if batchid % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batchid * len(sent), len(train_loader.dataset),
            100. * batchid / len(train_loader), loss.item()))

In [None]:
def validation(model,valid_loader):
    """Evaluate `model` on `valid_loader`; print and return the accuracy in percent.

    A sample counts as correct when the dot product of its question and
    response encodings, shifted by 0.5 and truncated toward zero, equals
    its label.

    Args:
        model: callable as ``model(sequences, mask)`` returning one encoding per row.
        valid_loader: iterable yielding ``(sent, utt, lab, masksent, maskutt)``
            batches; must expose ``.dataset``.

    Returns:
        The accuracy in percent (the same number that is printed).
    """

    model.eval()#preparing the model for evaluation
    correct=0# variable to store total correct predictions

    with torch.no_grad():#to ensure gradients are not calculated as calculating gradients is not required for testing

        for batchid,(sent,utt,lab,masksent,maskutt) in enumerate(valid_loader):

            output_utt=model(utt,maskutt)#forward pass shape->[batch_size,50]
            output_sent=model(sent,masksent)#forward pass shape->[batch_size,50]

            #calculating number of correct predictions for the whole batch at once;
            #torch.trunc truncates toward zero exactly like int() does.
            scores=(output_sent*output_utt).sum(dim=1)
            predictions=torch.trunc(scores+0.5)
            correct+=int((predictions==lab.reshape(scores.shape)).sum().item())

    accuracy=100*correct/len(valid_loader.dataset)
    print(accuracy)
    return accuracy

In [None]:
def seed(seed_value):
    """Seed the random number generators of torch, torch.cuda, numpy and random.

    torch.cuda.manual_seed_all covers every GPU, so multi-GPU runs are
    seeded as well.
    """
    seeders=(
        torch.cuda.manual_seed_all,
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for set_seed in seeders:
        set_seed(seed_value)

In [None]:
def main():
    """Train the model, report accuracy, save the weights and export the response embeddings.

    Steps: seed the random number generators, wrap the three splits in data
    loaders, train for 100 epochs (validating after each one), evaluate on
    the test split, save the state dict to "wikiqafinal.pt" and pickle the
    encodings of all training responses.
    """

    seed(0)#fixing the randomness of the code

     #passing the data into custom data loader
    train_data=customdataloader(context_train,utter_train,label_train,mask_sentence_train,mask_utter_train)
    test_data=customdataloader(context_test,utter_test,label_test,mask_sentence_test,mask_utter_test)
    valid_data=customdataloader(context_valid,utter_valid,label_valid,mask_sentence_valid,mask_utter_valid)

    train_loader=DataLoader(train_data,num_workers=0,batch_size=40,shuffle=False)#getting train data loader
    test_loader=DataLoader(test_data,num_workers=0,batch_size=1000,shuffle=False)#getting test data loader
    valid_loader=DataLoader(valid_data,num_workers=0,batch_size=1000,shuffle=False)#getting validation data loader

    model=Net(words_id_glove)
    optimizer=optim.Adam(model.parameters(),lr=0.001)

    for epoch in range(1,101):
        train(model,train_loader,optimizer,epoch)
        validation(model,valid_loader)
    validation(model,test_loader)

    #saving the model first, so that a failure while exporting the embeddings below
    #cannot throw away the trained weights
    torch.save(model.state_dict(), "wikiqafinal.pt")

    #Storing all the response embeddings and dumping them in a pkl file for further use during inference.
    #eval() switches dropout off and no_grad() avoids building autograd graphs for the export.
    model.eval()
    answer_embeddings=[]
    with torch.no_grad():
        for (sent, utt, lab, masksent, maskutt) in train_loader:
            output=model(utt,maskutt)#the model needs the mask together with the responses
            #NOTE(review): one list per batch is appended, so the result is nested per batch
            #and not one flat list per response - confirm this is what the backend expects.
            answer_embeddings.append(output.tolist())

    #'wb' instead of 'ab': appended pickles are never read back by a single pickle.load.
    with open("../Backend/all_response_embeddings.pkl", 'wb') as file:
        pickle.dump(answer_embeddings,file)

In [None]:
#Run the whole training pipeline only when this code is executed as the main module.
if __name__=="__main__":
    main()

In [None]:
#Rebuilding the network and loading the weights saved by main()
model=Net(words_id_glove)
model.load_state_dict(torch.load("wikiqafinal.pt"))
model.eval()#inference mode: dropout must be inactive in the exported model

#saving the model in onnx format
#dummy1 is a batch holding one sequence of valid word indices. The upper bound is the
#vocabulary size, so the embedding lookup cannot go out of range.
seq_len=len(context_train[0])
dummy1=torch.randint(0,len(words_id_glove),(1,seq_len))
#dummy2 is the matching mask: all ones, i.e. every position of the dummy sequence is a real word.
dummy2=torch.ones((1,seq_len),dtype=torch.long)
torch.onnx.export(
    model,args=(dummy1,dummy2),f="model1.onnx",verbose=True,opset_version=11,
    input_names=['data1','data2'],output_names=['output1'])