In [1]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset
import torch.nn.functional as F
from torchvision import transforms
import nltk
from torch.nn import init

In [2]:
import pickle

In [3]:
def input_data(path):
    file=open(path,'r',encoding="utf-8")
    train=file.readlines()
    file.close()
    context=[]
    utter=[]
    label=[]
    for i in train:
        c,u,l=i.split('\t')
        context.append(c.strip().split())
        utter.append(u.strip().split())
        label.append(int(l.strip()))
    return context,utter,label

In [4]:
context_train,utter_train,label_train=input_data("chatbot/WikiQA-train.txt")
context_test,utter_test,label_test=input_data("WikiQACorpus/WikiQACorpus/WikiQA-test.txt")
context_valid,utter_valid,label_valid=input_data("WikiQACorpus/WikiQACorpus/WikiQA-dev.txt")

In [5]:
label_train=np.array(label_train,dtype='float32')
label_test=np.array(label_test,dtype='float32')
label_valid=np.array(label_valid,dtype='float32')

In [6]:
o=0
for i in label_train:
    if i==0:
        o+=1
print(o/len(label_train))

0.9489194499017681


In [7]:
q,a,l=input_data("WikiQACorpus/WikiQACorpus/WikiQA-train.txt")

In [9]:
a[0]

['A',
 'partly',
 'submerged',
 'glacier',
 'cave',
 'on',
 'Perito',
 'Moreno',
 'Glacier',
 '.']

In [10]:
def word_index(sentence,utter,sv,uv,st,ut):
    wordindex={}
    k=0
    train_words=sentence+utter+sv+uv+st+ut
    for sent in tqdm(train_words):
        for t in sent:
            if t not in wordindex.keys():
                wordindex[t]=k
                k+=1
            else:
                continue
    print(len(wordindex.keys()))
    return wordindex

In [11]:
words_dict=word_index(context_train,utter_train,context_test,context_valid,utter_test,utter_valid)

100%|██████████| 58516/58516 [00:00<00:00, 374788.34it/s]

49372





In [12]:
def max_utt_len(sentence):
    maxi=0
    for i in sentence:
        maxi=max(len(i),maxi)
    return maxi

In [13]:
max_utt_len(utter_train)

236

In [14]:
for i in range(len(utter_train)):
    if len(utter_train[i])>150:
        utter_train[i]=utter_train[i][:150]

In [15]:
def wordtoindex(sentences,d):
    for sent in tqdm(sentences):
        for i in range(len(sent)):
            sent[i]=d[sent[i]]

In [16]:
import fasttext


In [17]:
model=fasttext.train_unsupervised('WikiQACorpus/WikiQACorpus/WikiQA-train.txt','cbow')

In [18]:
ls=list(model.words)

In [19]:
'Beretta' in ls

True

In [20]:
model.get_word_vector(ls[0]).shape

(100,)

In [22]:
id_to_vec_dict={}
for i in range(len(ls)):
    id_to_vec_dict[i]=model.get_word_vector(ls[i])

In [23]:
len(id_to_vec_dict.keys())

14358

In [24]:
word_to_index={i:k for k,i in enumerate(ls)}

In [27]:
word_to_index['Beretta']

13796

In [25]:
wordtoindex(context_train,word_to_index)
wordtoindex(utter_train,word_to_index)
wordtoindex(context_test,word_to_index)
wordtoindex(utter_test,word_to_index)
wordtoindex(context_valid,word_to_index)
wordtoindex(utter_valid,word_to_index)

  0%|          | 0/20360 [00:00<?, ?it/s]


KeyError: 'beretta'

In [18]:
def matrix(sentence,max_len):
    mask=[]
    for i in tqdm(range(len(sentence))):
        mask.append([1 for i in range(len(sentence[i]))])
        for j in range(max_len-len(sentence[i])):
            sentence[i].append(0)
            mask[i].append(0)
    return mask

In [19]:
mask_utter_train=matrix(utter_train,max_utt_len(utter_train))
mask_sentence_train=matrix(context_train,max_utt_len(context_train))
mask_utter_test=matrix(utter_test,max_utt_len(utter_test))
mask_sentence_test=matrix(context_test,max_utt_len(context_test))
mask_utter_valid=matrix(utter_valid,max_utt_len(utter_valid))
mask_sentence_valid=matrix(context_valid,max_utt_len(context_valid))

100%|██████████| 20360/20360 [00:00<00:00, 28276.44it/s]
100%|██████████| 20360/20360 [00:00<00:00, 104315.66it/s]
100%|██████████| 6165/6165 [00:00<00:00, 21652.84it/s]
100%|██████████| 6165/6165 [00:00<00:00, 90908.68it/s]
100%|██████████| 2733/2733 [00:00<00:00, 18942.53it/s]
100%|██████████| 2733/2733 [00:00<00:00, 77426.77it/s]


In [20]:
context_train=np.array(context_train)

utter_test=np.array(utter_test)

context_test=np.array(context_test)

utter_valid=np.array(utter_valid)

context_valid=np.array(context_valid)

mask_utter_train=np.array(mask_utter_train)

mask_sentence_train=np.array(mask_sentence_train)

mask_utter_test=np.array(mask_utter_test)

mask_sentence_test=np.array(mask_sentence_test)

mask_utter_valid=np.array(mask_utter_valid)

mask_sentence_valid=np.array(mask_sentence_valid)

In [21]:
utter_train=np.array(utter_train)

In [22]:
print(utter_train.shape)
print(mask_utter_train.shape)
print(context_train.shape)
print(mask_sentence_train.shape)
print(utter_test.shape)
print(mask_utter_test.shape)
print(mask_sentence_test.shape)
print(context_test.shape)
print(utter_valid.shape)
print(mask_utter_valid.shape)
print(context_valid.shape)
print(mask_sentence_valid.shape)

(20360, 150)
(20360, 150)
(20360, 23)
(20360, 23)
(6165, 112)
(6165, 112)
(6165, 21)
(6165, 21)
(2733, 132)
(2733, 132)
(2733, 24)
(2733, 24)


In [23]:
file=open("glove.6B.50d.txt",'r',encoding="utf-8")
wordembeds=file.readlines()
file.close()
glove_embeds={}
for i in tqdm(wordembeds):
    ls=i.strip().split()
    for i in range(1,len(ls)):
        ls[i]=float(ls[i])
    glove_embeds[ls[0]]=ls[1:]

100%|██████████| 400000/400000 [00:19<00:00, 20203.20it/s]


In [24]:
def id_to_glove(id_dict,glove_embeds):
    id_to_glove={}
    for word,embed in glove_embeds.items():
        if word in id_dict:
            id_to_glove[id_dict[word]]=np.array(embed,dtype='float32')
    for word,ind in id_dict.items():
        if ind not in id_to_glove:
            vec=np.zeros(50,dtype='float32')
            vec[:]=np.random.randn(50)*0.01
            id_to_glove[ind]=vec
    return id_to_glove

In [25]:
words_id_glove=id_to_glove(words_dict,glove_embeds)

In [26]:
class customdataloader(torch.utils.data.Dataset):
    def __init__(self,sent,utter,lab,sent_mask,utter_mask):
        self.sent=sent
        self.utter=utter
        self.lab=lab
        self.sent_mask=sent_mask
        self.utter_mask=utter_mask
    def __len__(self):
        return len(self.lab)
    def __getitem__(self,idx):
        return (self.sent[idx],self.utter[idx],self.lab[idx],self.sent_mask[idx],self.utter_mask[idx])

In [27]:
class Net(nn.Module):
    def __init__(self,Dictionary,word_embedding_length=50):
        super(Net,self).__init__()
        self.Dictionary=Dictionary
        self.lenDictionary=len(Dictionary)
        self.word_embedding_length=word_embedding_length
        self.embedding=nn.Embedding(self.lenDictionary,self.word_embedding_length)
        self.rnnBlock=nn.Linear(self.word_embedding_length*2,self.word_embedding_length*2)
        self.lstmBlock=nn.LSTM(self.word_embedding_length,self.word_embedding_length)
        self.dropout=nn.Dropout(0.5)
        self.init_weights()
        
    def init_weights(self):
        #init.uniform(self.lstmBlock.weight_ih_l0,a=-0.01,b=0.01)
        #init.orthogonal(self.lstmBlock.weight_hh_l0)
        #self.lstmBlock.weight_ih_l0.requires_grad=True
        #self.lstmBlock.weight_hh_l0.requires_grad=True
        
        embedding_weights=torch.FloatTensor(self.lenDictionary,self.word_embedding_length)
        for idx,glove in self.Dictionary.items():
            embedding_weights[idx]=torch.FloatTensor(list(glove))
        self.embedding.weight=nn.Parameter(embedding_weights,requires_grad=True)
        self.embedding=nn.Embedding.from_pretrained(self.embedding.weight)
    
    def forward(self,sent,masksent):
        out_sent=self.forwardLSTM(sent,masksent)
        #M=torch.FloatTensor(50,50)
        #init.xavier_normal(M)
        #self.M=nn.Parameter(M,requires_grad=True)
        #context=out_sent.mm(self.M)
        #context=context.view(-1,1,50)
        #response=out_utt.view(-1,50,1)
        #dotprod=torch.bmm(context,response).view(-1,1)
        #dotprod=[]
        #for i in range(len(out_sent)):
            #dotprod.append(out_sent[i].dot(out_utt[i]))
        return out_sent
    def forwardRNN(self,utt,mask):
        hidden=torch.zeros([utt.shape[0],self.word_embedding_length]).to(utt.device)
        output=torch.zeros([utt.shape[0],self.word_embedding_length]).to(utt.device)
        for no,(utti,maski) in enumerate(zip(utt,mask)):
            utti_embedding=self.embedding(utti)
            for noj,(uttij,maskij) in enumerate(zip(utti_embedding,maski)):
                if maskij==0:
                    break
                temp=self.rnnBlock(torch.cat([uttij,hidden[no]]).unsqueeze(0))[0]
                hidden[no]=F.relu(temp[self.word_embedding_length:])
                output[no]=temp[:self.word_embedding_length]
        return output
    def forwardLSTM(self,utt,mask):
        output=torch.zeros([utt.shape[0],self.word_embedding_length]).to(utt.device)
        for no,(utti,maski) in enumerate(zip(utt,mask)):
            utti_embed=self.embedding(utti)
            numutt=torch.sum(maski)
            utti_embed=utti_embed[:numutt].unsqueeze(1)
            _,(last_hidden,_)=self.lstmBlock(utti_embed)
            last_hidden=self.dropout(last_hidden[0][0])
            output[no]=last_hidden
        return output

In [28]:
def train(model,train_loader,optimizer,epoch):
    model.train()
    for batchid,(sent,utt,lab,masksent,maskutt) in tqdm(enumerate(train_loader)):
        correct=0
        optimizer.zero_grad()
        output_sent=model(sent,masksent)
        output_utt=model(utt,maskutt)
        loss=0
        for i in range(len(output_sent)):
            loss+=(output_sent[i].dot(output_utt[i])/(torch.norm(output_sent[i])*torch.norm(output_utt[i]))-lab[i])**2
        loss/=len(output_sent)
        loss.backward()
        optimizer.step()
        if batchid % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batchid * len(sent), len(train_loader.dataset),
            100. * batchid / len(train_loader), loss.item()))

In [29]:
def validation(model,valid_loader):
    model.eval()
    validloss=0
    correct=0
    with torch.no_grad():
        for batchid,(sent,utt,lab,masksent,maskutt) in enumerate(valid_loader):
            output_utt=model(utt,maskutt)
            output_sent=model(sent,masksent)
            for i in range(len(output_sent)):
                validloss+=(output_sent[i].dot(output_utt[i])/(torch.norm(output_sent[i])*torch.norm(output_utt[i]))-lab[i])**2
                if int(output_sent[i].dot(output_utt[i])+0.5)==lab[i]:
                    correct+=1
            validloss/=len(output_sent)
            print(validloss,100*correct/len(valid_loader.dataset))

In [30]:
def seed(seed_value):
    torch.cuda.manual_seed_all(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed(seed_value)
    np.random.seed(seed_value)
    random.seed(seed_value)
    torch.backends.cudnn.benchmark=False
    torch.backends.cudnn.deterministic=True

In [31]:
def main():
    seed(0)
    train_data=customdataloader(context_train,utter_train,label_train,mask_sentence_train,mask_utter_train)
    test_data=customdataloader(context_test,utter_test,label_test,mask_sentence_test,mask_utter_test)
    valid_data=customdataloader(context_valid,utter_valid,label_valid,mask_sentence_valid,mask_utter_valid)
    train_loader=DataLoader(train_data,num_workers=0,batch_size=40,shuffle=False)
    test_loader=DataLoader(test_data,num_workers=0,batch_size=1000,shuffle=False)
    valid_loader=DataLoader(valid_data,num_workers=0,batch_size=1000,shuffle=False)
    model=Net(words_id_glove)
    optimizer=optim.Adam(model.parameters(),lr=0.001)
    #model.load_state_dict(torch.load("wikiqa.pt"))
    for epoch in range(1,2):
        train(model,train_loader,optimizer,epoch)
        validation(model,valid_loader)
    validation(model,test_loader)

In [44]:
if __name__=="__main__":
    main()

1it [00:00,  1.07it/s]



101it [01:16,  1.41it/s]



201it [02:29,  1.57it/s]



301it [03:42,  1.23it/s]



401it [04:54,  1.63it/s]



501it [06:09,  1.23it/s]



509it [06:15,  1.36it/s]


tensor(0.0748) 35.27259421880717
tensor(0.0985) 69.30113428466886
tensor(0.1102) 94.18221734357849
tensor(0.0981) 15.279805352798054
tensor(0.0860) 30.721816707218167
tensor(0.1130) 45.790754257907544
tensor(0.0980) 61.15166261151663
tensor(0.0997) 76.44768856447689
tensor(0.0859) 91.92214111922141
tensor(0.0994) 94.42011354420113


In [31]:
def preprocess(s,words_dict):
    mask=[[]]
    ls=[[]]
    ls[0]=s.split()
    i=0
    n=len(ls[0])
    while i<n:
        if ls[0][i] in words_dict:
            ls[0][i]=words_dict[ls[0][i]]
            mask[0].append(1)
            i+=1
        else:
            ls[0].pop(i)
            n-=1
    ls=np.array(ls)
    mask=np.array(mask)
    return ls,mask

In [32]:
model=Net(words_id_glove)
model.load_state_dict(torch.load("wikiqafinal.pt"))
train_data=customdataloader(context_train,utter_train,label_train,mask_sentence_train,mask_utter_train)
train_loader=DataLoader(train_data,num_workers=0,batch_size=1000,shuffle=False)
ls_utt=[]
with torch.no_grad():
    for (sent,utt,lab,masksent,maskutt) in tqdm(train_loader):
        output=model(utt,maskutt)
        ls_utt.extend(output)

100%|██████████| 21/21 [00:55<00:00,  2.62s/it]


In [35]:
for i in range(len(ls_utt)):
    ls_utt[i]=ls_utt[i].tolist()

In [39]:
file2=open("answer_embeds(1).pkl",'ab')
pickle.dump(ls_utt,file2)
file2.close()

In [80]:
s='how did appolo creed die'
with torch.no_grad():
    q , qm = preprocess(s,words_dict)
    q=torch.from_numpy(q)
    qm=torch.from_numpy(qm)
    print(q.shape)
    print(qm.shape)
    output_q=model(q,qm)
    print(output_q)
    arg_max=-1
    dot_prod=-1
    for idx,m in enumerate(ls_utt):
        if output_q[0].dot(m)>dot_prod:
            dot_prod=output_q[0].dot(m)
            arg_max=idx
    print(arg_max)
    print(output_q[0].dot(ls_utt[arg_max]))


torch.Size([1, 4])
torch.Size([1, 4])
tensor([[-0.0000e+00,  0.0000e+00, -8.5996e-02,  0.0000e+00, -0.0000e+00,
         -0.0000e+00, -0.0000e+00,  3.0250e-02,  0.0000e+00, -1.2950e-01,
          2.1869e-02, -8.1768e-02, -1.0104e-01,  0.0000e+00,  0.0000e+00,
         -1.5226e-01,  2.4174e-02, -2.6548e-02,  1.4127e-01, -2.0230e-02,
         -0.0000e+00, -9.2644e-02, -4.0494e-02, -0.0000e+00,  0.0000e+00,
         -0.0000e+00,  0.0000e+00, -3.7603e-02, -0.0000e+00, -0.0000e+00,
         -1.8482e-04, -0.0000e+00, -0.0000e+00,  1.0619e-01,  0.0000e+00,
          0.0000e+00,  1.0109e-03,  0.0000e+00, -0.0000e+00,  0.0000e+00,
         -0.0000e+00, -9.1154e-02,  4.3980e-02,  8.7051e-02, -1.2229e-01,
         -9.0598e-02, -2.6327e-01, -0.0000e+00,  9.9694e-02,  0.0000e+00]])
15621
tensor(1.0664)


In [63]:
print(a[7716])

['Information', 'and', 'Communications', 'Technology', 'or', '(', 'ICT', ')', ',', 'is', 'often', 'used', 'as', 'an', 'extended', 'synonym', 'for', 'information', 'technology', '(', 'IT', ')', ',', 'but', 'is', 'a', 'more', 'specific', 'term', 'that', 'stresses', 'the', 'role', 'of', 'unified', 'communications', 'and', 'the', 'integration', 'of', 'telecommunications', '(', 'telephone', 'lines', 'and', 'wireless', 'signals', ')', ',', 'computers', 'as', 'well', 'as', 'necessary', 'enterprise', 'software', ',', 'middleware', ',', 'storage', ',', 'and', 'audio-visual', 'systems', ',', 'which', 'enable', 'users', 'to', 'access', ',', 'store', ',', 'transmit', ',', 'and', 'manipulate', 'information', '.']


In [54]:
l[3]

1