In [270]:
import torch

In [271]:
import pandas as pd
import numpy as np

In [272]:
df=pd.read_csv('100_Unique_QA_Dataset.csv')

In [273]:
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [274]:
#  tokenize 
def toeknize(text):
    """Split ``text`` into lowercase whitespace-separated tokens.

    Question marks and apostrophes are removed before splitting; every other
    character (hyphens, digits, ...) is kept as part of its token.

    Args:
        text: the string to tokenize.

    Returns:
        A list of token strings; empty when ``text`` has no non-space content.
    """
    # translation table that deletes '?' and "'" and maps nothing else
    strip_table = str.maketrans('', '', "?'")
    return text.lower().translate(strip_table).split()


In [275]:
toeknize('What is your name?')

['what', 'is', 'your', 'name']

In [276]:
# vocab
# token -> integer index; index 0 is reserved for tokens not seen while building
vocab = {'<unk>': 0}


def build_vocab(row):
    """Add every unseen token of one question/answer row to the global ``vocab``.

    Args:
        row: a mapping with 'question' and 'answer' string entries
            (one DataFrame row when used with ``df.apply(..., axis=1)``).

    Returns:
        None. ``vocab`` is updated in place and the row's tokens are printed.
    """
    question_tokens = toeknize(row['question'])
    answer_tokens = toeknize(row['answer'])

    for token in question_tokens + answer_tokens:
        # the default is evaluated before insertion, so a new token receives
        # the current vocabulary size as its index
        vocab.setdefault(token, len(vocab))

    print(question_tokens, answer_tokens)


In [277]:
#  populate vocab: assign an index to every token in each question/answer row
df.apply(build_vocab,axis=1)

['what', 'is', 'the', 'capital', 'of', 'france'] ['paris']
['what', 'is', 'the', 'capital', 'of', 'germany'] ['berlin']
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird'] ['harper-lee']
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system'] ['jupiter']
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius'] ['100']
['who', 'painted', 'the', 'mona', 'lisa'] ['leonardo-da-vinci']
['what', 'is', 'the', 'square', 'root', 'of', '64'] ['8']
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold'] ['au']
['which', 'year', 'did', 'world', 'war', 'ii', 'end'] ['1945']
['what', 'is', 'the', 'longest', 'river', 'in', 'the', 'world'] ['nile']
['what', 'is', 'the', 'capital', 'of', 'japan'] ['tokyo']
['who', 'developed', 'the', 'theory', 'of', 'relativity'] ['albert-einstein']
['what', 'is', 'the', 'freezing', 'point', 'of', 'water', 'in', 'fahrenheit'] ['32']
['which', 'planet', 'is', 'known', 'as', 'the', 'red', 'planet'] ['mars']
['who', 'is', 'the', 'auth

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [278]:
len(vocab)

324

In [279]:
def text_to_index(text,vocab):
    """Map ``text`` to a list of vocabulary indices, one per token.

    Args:
        text: the string to encode; it is tokenized with ``toeknize``.
        vocab: dict from token to integer index. It must contain '<unk>'
            whenever ``text`` holds a token that is missing from it.

    Returns:
        A list of ints in token order; unknown tokens map to ``vocab['<unk>']``.
    """
    return [
        vocab[token] if token in vocab else vocab['<unk>']
        for token in toeknize(text)
    ]

In [280]:
text_to_index('What is your name?',vocab)

[1, 2, 0, 0]

In [281]:
import torch
from torch.utils.data import Dataset,DataLoader

In [282]:
class QADataset(Dataset):
    """Dataset that yields (question, answer) pairs as tensors of vocabulary indices."""

    def __init__(self,df,vocab):
        """Keep references to the source frame and the token -> index mapping."""
        self.df, self.vocab = df, vocab

    def __len__(self):
        """Number of question/answer rows."""
        return len(self.df)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            A ``(question_tensor, answer_tensor)`` tuple built from the row's
            'question' and 'answer' entries via ``text_to_index``.
        """
        row = self.df.iloc[index]
        question_ids, answer_ids = (
            text_to_index(row[column], self.vocab)
            for column in ('question', 'answer')
        )
        return torch.tensor(question_ids), torch.tensor(answer_ids)
    


In [283]:
dataset=QADataset(df,vocab)

In [284]:
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)

In [285]:
for question,answer in dataloader:
    print(question,answer)
    

tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([[162]])
tensor([[  1,   2,   3,   4,   5, 236, 237]]) tensor([[238]])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([[16]])
tensor([[  1,  87, 229, 230, 231, 232]]) tensor([[233]])
tensor([[ 10,   2,  62,  63,   3, 283,   5, 284]]) tensor([[285]])
tensor([[ 42, 101,   2,   3,  17]]) tensor([[102]])
tensor([[ 42,   2,   3, 274, 211, 275]]) tensor([[276]])
tensor([[  1,   2,   3,   4,   5, 109]]) tensor([[317]])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([[188]])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([[113]])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([[131]])
tensor([[ 1,  2,  3, 37, 38, 39, 40]]) tensor([[41]])
tensor([[ 10,  11, 157, 158, 159]]) tensor([[160]])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([[244]])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([[184]])
tensor([[ 1,  2,  3,  4,  5, 73]]) tensor([[74]])
tensor([[ 42, 107,   2, 108,  19, 109]]) tensor([[110]])
tensor([[  1,

In [286]:
import torch.nn as nn

In [287]:
class SimpleRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear layer producing vocabulary logits."""

    def __init__(self, vocab_size):
        """Build the three layers.

        Args:
            vocab_size: number of distinct token indices; used both as the
                embedding table size and as the number of output classes.
        """
        super().__init__()
        embedding_dim, hidden_dim = 50, 64
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, question):
        """Return one logit vector per question.

        Args:
            question: integer tensor of token indices, batch dimension first.

        Returns:
            Logits over the vocabulary computed from the RNN's final hidden state.
        """
        token_vectors = self.embedding(question)
        # the per-step outputs are not needed, only the final hidden state
        _, last_hidden = self.rnn(token_vectors)
        # drop the leading layer dimension before the classifier
        return self.fc(last_hidden.squeeze(0))

In [288]:
# Standalone copies of the SimpleRNN layers, used only to inspect the tensor
# shape after each stage. Sizes are derived from `vocab` and from the sample
# itself instead of being hard-coded, so the cell keeps working when the
# vocabulary or the first question changes.
x = nn.Embedding(len(vocab), embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, len(vocab))

# add a batch dimension: (seq_len,) -> (1, seq_len)
a = dataset[0][0].unsqueeze(0)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
# c holds the output at every time step, d the final hidden state
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

# drop the leading layer dimension of d before the linear layer
e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [289]:
# step size handed to the Adam optimizer
learning_rate=0.001
# number of full passes over the dataloader in the training loop
epochs=20

In [290]:
model=SimpleRNN(len(vocab))

In [291]:
# cross-entropy over the vocabulary logits: the answer's token index is the class label
criterion=nn.CrossEntropyLoss()
# Adam updates every parameter returned by model.parameters()
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [292]:
# training loop: one optimizer step per batch, mean batch loss reported per epoch
for epoch in range(epochs):
    # running sum of the batch losses for this epoch
    total_loss = 0
    for question, answer in dataloader:
        # clear gradients left over from the previous step
        optimizer.zero_grad()
        # forward pass: one logit vector per question
        output = model(question)
        # output is already (batch_size, vocab_size); this view leaves that shape unchanged
        output = output.view(-1, len(vocab))
        # flatten answer to a 1-D vector of class indices; its length matches
        # output's rows only if every answer is a single token
        # (true for the batches shown above - TODO confirm for the whole dataset)
        answer = answer.view(-1)
        # calculate loss
        loss = criterion(output, answer)
        # backward pass
        loss.backward()
        # update
        optimizer.step()
        total_loss += loss.item()
    # average loss over the batches of this epoch
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}")

Epoch 1, Loss: 5.823628081215753
Epoch 2, Loss: 5.12026818063524
Epoch 3, Loss: 4.298684496349758
Epoch 4, Loss: 3.578390701611837
Epoch 5, Loss: 2.9930490281846787
Epoch 6, Loss: 2.451018594370948
Epoch 7, Loss: 1.9565055476294624
Epoch 8, Loss: 1.5165443354182773
Epoch 9, Loss: 1.1700683706336552
Epoch 10, Loss: 0.894202987684144
Epoch 11, Loss: 0.6938120083676445
Epoch 12, Loss: 0.5446168579989009
Epoch 13, Loss: 0.44009046620792813
Epoch 14, Loss: 0.3600018443332778
Epoch 15, Loss: 0.29809250699149237
Epoch 16, Loss: 0.2537421520385477
Epoch 17, Loss: 0.21604839505420792
Epoch 18, Loss: 0.18628149992889828
Epoch 19, Loss: 0.16274457358651692
Epoch 20, Loss: 0.1418845396902826


In [293]:
def predict(model, question, threshold=0.5):
    """Print the model's most likely answer token for ``question``.

    The question is encoded with ``text_to_index`` against the global
    ``vocab``, passed through ``model``, and the highest-probability
    vocabulary entry is printed. When that probability is below
    ``threshold``, or the question contains no tokens, "I don't know" is
    printed instead of an answer.

    Args:
        model: callable mapping a (1, seq_len) integer tensor to
            (1, vocab_size) logits.
        question: the question text.
        threshold: minimum softmax probability required to print an answer.

    Returns:
        None. The result is printed.
    """
    # convert question to numbers
    numerical_question = text_to_index(question, vocab)

    # no tokens means an empty float tensor, which the embedding cannot look up
    if not numerical_question:
        print("I don't know")
        return

    # tensor with a leading batch dimension
    question_tensor = torch.tensor(numerical_question).unsqueeze(0)

    # inference only: skip building the autograd graph
    with torch.no_grad():
        # send to model
        output = model(question_tensor)
        # convert logits to probs
        probs = torch.nn.functional.softmax(output, dim=1)

    # find index of max prob
    value, index = torch.max(probs, dim=1)
    print("value is :",value,"index is:", index)

    # stop here so a low-confidence guess is not printed as an answer
    if value.item() < threshold:
        print("I don't know")
        return

    # dict insertion order matches the assigned indices, so the position in
    # the key list is the token's index
    print(list(vocab.keys())[index.item()])

In [300]:
x = input("Enter question:")


In [301]:
predict(model, x)

value is : tensor([0.5228], grad_fn=<MaxBackward0>) index is: tensor([54])
tokyo
