In [1]:
import torch
import pandas as pd


In [2]:
# Load the question/answer pairs from CSV and preview the first rows.
df=pd.read_csv('Dataset/100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
#tokenize

def tokenize(text):
    """Lower-case ``text``, drop every ``?`` and ``'`` character, and split on whitespace.

    Returns the resulting list of tokens (empty when nothing but whitespace
    and the dropped characters remain).
    """
    # One translate pass deletes both characters, equivalent to two replace calls.
    drop_chars = str.maketrans('', '', "?'")
    return text.lower().translate(drop_chars).split()

In [7]:
# Token -> integer index table; index 0 is held by the '<unk>' placeholder.
vocab={
    '<unk>':0
}

In [8]:

def build_vocab(row):
    """Add every unseen token from one row's question and answer to the global ``vocab``.

    ``row`` must support ``row['question']`` and ``row['answer']``. Each new
    token receives the next free index, i.e. the size of ``vocab`` at the
    moment it is inserted. Returns None; the work is the side effect on ``vocab``.
    """
    question_tokens = tokenize(row['question'])
    answer_tokens = tokenize(row['answer'])

    for token in question_tokens + answer_tokens:
        # setdefault leaves known tokens untouched and assigns len(vocab) to new ones.
        vocab.setdefault(token, len(vocab))
        

In [9]:
# Run build_vocab on every row; it fills the global vocab as a side effect,
# so the returned Series holds only None values.
df.apply(build_vocab ,axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
87    None
88    None
89    None
90    None
91    None
Length: 92, dtype: object

In [10]:
# Inspect the token -> index mapping after build_vocab has run over the DataFrame.
vocab

{'<unk>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [11]:
def text_to_indiceS(text ,vocab):
    """Convert ``text`` into a list of vocabulary indices.

    The text is split with ``tokenize``; each token present in ``vocab`` maps
    to its stored index and any other token maps to ``vocab['<unk>']``.
    """
    return [
        # '<unk>' is only looked up when a token is actually missing.
        vocab[token] if token in vocab else vocab['<unk>']
        for token in tokenize(text)
    ]

     

In [12]:
# Sanity check: a token that is missing from vocab falls back to the '<unk>' index.
text_to_indiceS('what is loveless' ,vocab)

[1, 2, 0]

In [13]:
import torch
from torch.utils.data import DataLoader , Dataset

In [17]:
class qadataset(Dataset):
    """Dataset that serves each DataFrame row as (question indices, answer indices) tensors."""

    def __init__(self,df, vocab ):
        super().__init__()
        # Rows are encoded lazily in __getitem__; nothing is precomputed here.
        self.vocab=vocab
        self.df=df

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        row_count = self.df.shape[0]
        return row_count

    def __getitem__(self, index):
        """Return the ``index``-th row's question and answer as tensors of vocab indices."""
        row = self.df.iloc[index]
        question_ids = text_to_indiceS(row['question'], self.vocab)
        answer_ids = text_to_indiceS(row['answer'], self.vocab)
        return torch.tensor(question_ids), torch.tensor(answer_ids)
        

In [19]:
# Wrap the DataFrame and vocabulary so rows can be served as index tensors.
dataset=qadataset(df ,vocab)

In [21]:
 # Peek at the first encoded (question, answer) pair.
 dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [23]:
# One sample per batch (questions vary in length and are not padded), reshuffled on every pass.
dataloader=DataLoader(dataset , batch_size=1 , shuffle=True)

In [26]:
# Print every batch once to check what the DataLoader yields: with batch_size=1
# each question and answer tensor gains a leading batch axis of size 1.
for question ,answer in dataloader:
    print(question ,answer)


tensor([[ 78,  79, 261, 151,  14, 262, 153]]) tensor([[36]])
tensor([[  1,   2,   3,   4,   5, 109]]) tensor([[317]])
tensor([[ 10,  29, 130, 131]]) tensor([[132]])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([[131]])
tensor([[  1,   2,   3,  17, 115,  83,  84]]) tensor([[116]])
tensor([[ 42, 263, 264,  14, 265, 266, 158, 267]]) tensor([[268]])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([[184]])
tensor([[ 42, 101,   2,   3,  17]]) tensor([[102]])
tensor([[ 10,  11, 157, 158, 159]]) tensor([[160]])
tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([[23]])
tensor([[  1,  87, 229, 230, 231, 232]]) tensor([[233]])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([[7]])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([[244]])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[ 42, 174,   2,  62,  39, 175, 176,  12, 177, 178]]) tensor([[179]])
tensor([[ 78,  79, 150, 151,  14, 152, 153]]) tensor([[154]])
tensor([[  1,   2,   3, 146,  86,  19, 192, 193]]) tensor(

In [28]:
import torch.nn as nn

In [48]:
class myrnn(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and number of output logits.
        embedding_dim: size of each token embedding (default 50).
        hidden_size: size of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()
        self.embedding=nn.Embedding(vocab_size , embedding_dim=embedding_dim)
        self.rnn=nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc=nn.Linear(hidden_size, vocab_size)

    def forward(self ,question):
        """Return logits of shape (batch, vocab_size) for a (batch, seq_len) tensor of token indices."""
        embedding_question=self.embedding(question)
        # nn.RNN returns (per-step outputs, final hidden state); only the final state is used.
        _, final=self.rnn(embedding_question)
        # final is (num_layers=1, batch, hidden_size); drop the layer axis before classifying.
        output=self.fc(final.squeeze(0))

        return output

 

In [49]:
# Training hyperparameters: optimizer learning rate and number of passes over the dataset.
learning_rate=0.001
epoches=20


In [50]:
# The output layer has one logit per vocabulary entry, so the model scores every known token.
model=myrnn(len(vocab))

In [51]:
# Cross-entropy over the vocabulary logits, optimised with Adam at the configured learning rate.
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters() ,lr=learning_rate)

In [57]:
# Training loop: one optimizer step per (question, answer) batch, one log line per epoch.

# Use a separate loop variable so the `epoches` hyperparameter is not overwritten;
# otherwise re-running this cell would train for fewer epochs each time.
for epoch in range(epoches):
    total_loss=0.0
    for question ,answer in dataloader:
        optimizer.zero_grad()

        # Logits of shape (batch, vocab_size) for the answer token.
        output=model(question)

        # CrossEntropyLoss expects class-index targets of shape (batch,), while the
        # DataLoader yields answers as (batch, 1). Flattening assumes every answer
        # tokenizes to exactly one token; a longer answer fails loudly on a size mismatch.
        loss=criterion(output ,answer.reshape(-1))

        loss.backward()
        optimizer.step()

        total_loss=total_loss +loss.item()

    # Report the summed loss once per epoch rather than a running total after every batch.
    print(f'epoches{epoch+1} ,loss:{total_loss}')

In [76]:
def predict(model , question ,threshold=0.5):
    """Predict the answer token for ``question`` using ``model`` and the global ``vocab``.

    Args:
        model: module mapping a (1, seq_len) tensor of token indices to (1, vocab_size) logits.
        question: raw question text; it is encoded with ``text_to_indiceS``.
        threshold: minimum softmax probability required to accept the top prediction.

    Returns:
        The predicted vocabulary token, or None (after printing "i dont know")
        when the question has no tokens or the top probability is below ``threshold``.
    """
    # Convert the question to vocabulary indices.
    numerical_question=text_to_indiceS(question , vocab)
    if not numerical_question:
        # An empty index list would become a float tensor, which nn.Embedding rejects.
        print("i dont know")
        return None

    # Add the batch axis the model expects.
    question_tensor=torch.tensor(numerical_question).unsqueeze(0)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output=model(question_tensor)

    # Convert logits to probabilities and pick the most likely token.
    probs=torch.nn.functional.softmax(output ,dim=1)
    value ,index= torch.max(probs , dim=1)

    if value.item() <threshold:
        print("i dont know")
        return None

    # Relies on vocab's insertion order matching its indices, which holds when each
    # new token is stored under len(vocab) as build_vocab does.
    return list(vocab.keys())[index.item()]



In [77]:
# Ask the model a question phrased like the dataset's capital-city questions.
predict(model , "what is capital of france")

i dont know


In [79]:
 # Map an index back to its token by position in vocab's insertion order.
 list(vocab.keys())[7]

'paris'