In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the question/answer pairs from the CSV file into a DataFrame.
df = pd.read_csv('100_Unique_QA_Dataset.csv')

In [3]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [13]:
# tokenize
def tokenize(text):
  """Turn a piece of text into a list of lower-case word tokens.

  The text is lower-cased, every ``?`` and ``'`` character is removed, and
  the result is split on whitespace.

  Args:
    text: the string to tokenize.

  Returns:
    A list of token strings (empty when ``text`` has no non-space content).
  """
  cleaned = text.lower()
  # Strip the two punctuation marks the vocabulary should not see.
  for unwanted in ('?', "'"):
    cleaned = cleaned.replace(unwanted, "")
  return cleaned.split()

In [14]:
# Sanity check: tokenize the question stored under index label 1.
tokenize(df['question'][1])

['what', 'is', 'the', 'capital', 'of', 'germany']

In [15]:
# Vocabulary: maps each token to an integer id; id 0 is reserved for unknown tokens.
vocab = {'<UNK>':0}



In [25]:
def build_vocab(row):
  """Register every unseen token from one row in the module-level ``vocab``.

  Both ``row['question']`` and ``row['answer']`` are tokenized with
  ``tokenize``; question tokens are processed before answer tokens. Each
  token not yet present is assigned the next free id, i.e. the size of
  ``vocab`` at the moment it is added.

  Args:
    row: a mapping-like object indexable by ``'question'`` and ``'answer'``.

  Returns:
    None. ``vocab`` is mutated in place.
  """
  # Tokenize both fields before touching ``vocab``.
  row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]
  for word in row_tokens:
    # setdefault leaves existing ids untouched and only inserts new words.
    vocab.setdefault(word, len(vocab))


In [26]:
# Run build_vocab on every row for its side effect of filling `vocab`;
# build_vocab returns None, so the resulting Series carries no useful values.
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [28]:
# Number of vocabulary entries, including the '<UNK>' placeholder.
vocab_len = len(vocab)

In [30]:
#convert to num index
def text_to_index(text,vocab):
  """Encode ``text`` as a list of vocabulary ids.

  The text is split with ``tokenize``; each token is replaced by its id in
  ``vocab``, and tokens missing from ``vocab`` map to ``vocab['<UNK>']``.

  Args:
    text: the string to encode.
    vocab: dict mapping token strings to integer ids; must contain
      ``'<UNK>'`` if any token of ``text`` is unknown.

  Returns:
    A list of integer ids, one per token, in token order.
  """
  return [
    vocab[word] if word in vocab else vocab['<UNK>']
    for word in tokenize(text)
  ]

In [33]:
# Sanity check: encode a short phrase into vocabulary ids.
text_to_index("what is uk",vocab)

[1, 2, 272]

In [34]:
import torch
from torch.utils.data import Dataset, DataLoader

In [82]:
class CustomDataset(Dataset):
  """Dataset yielding (question ids, answer ids) tensor pairs from a DataFrame.

  Each item is built on demand by encoding the row's ``'question'`` and
  ``'answer'`` values with ``text_to_index`` and the stored vocabulary.
  """

  def __init__(self,df,vocab):
    """Keep references to the source frame and the token-to-id mapping."""
    self.df = df
    self.vocab = vocab

  def __len__(self):
    """Return the number of rows in the underlying frame."""
    return len(self.df)

  def __getitem__(self,index):
    """Return the encoded question and answer at positional ``index`` as tensors."""
    row = self.df.iloc[index]
    question_ids, answer_ids = (
      text_to_index(row[column], self.vocab) for column in ('question', 'answer')
    )
    return torch.tensor(question_ids), torch.tensor(answer_ids)

In [83]:
# Wrap the DataFrame and the vocabulary in the custom Dataset.
dataset = CustomDataset(df,vocab)

In [84]:
# One sample per batch, reshuffled on every pass.
# NOTE(review): batch_size=1 is presumably used because the encoded sequences
# differ in length and are not padded; confirm before raising it.
dataloader = DataLoader(dataset,batch_size=1,shuffle=True)

In [85]:
# Peek at a single (question, answer) batch, then stop iterating.
for question ,answer in dataloader:
  print(question,answer)
  break

tensor([[ 42, 318,   2,  62,  63,   3, 319,   5, 320]]) tensor([[321]])


In [86]:
# Rnn architecture
import torch.nn as nn

In [103]:
class myRnn(nn.Module):
  """Single-layer RNN that scores every vocabulary entry for a question.

  Pipeline: token ids -> embedding -> ``nn.RNN`` -> linear layer over the
  final hidden state, giving one logit per vocabulary entry.

  Args:
    vocab_size: number of entries in the vocabulary; sizes both the
      embedding table and the output layer.
    embedding_dim: width of the token embeddings (default 60).
    hidden_size: width of the RNN hidden state (default 64).
  """

  def __init__(self,vocab_size,embedding_dim=60,hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim,hidden_size,batch_first=True)
    self.output = nn.Linear(hidden_size,vocab_size)

  def forward(self,question):
    """Return vocabulary logits for a batch of token-id sequences.

    Args:
      question: integer tensor of token ids; with ``batch_first=True`` the
        batched layout is (batch, sequence length).

    Returns:
      Tensor of logits with ``vocab_size`` entries per sample.
    """
    embedded_question = self.embedding(question)
    # nn.RNN returns (per-step outputs, final hidden state); only the final
    # hidden state is needed to predict the answer.
    _,final = self.rnn(embedded_question)
    # The final state carries a leading layer dimension of size 1; drop it
    # before the linear layer.
    output = self.output(final.squeeze(0))
    return output

In [112]:
# Training hyperparameters: number of passes over the data and the optimizer step size.
epochs = 200
learning_rate = 0.001

In [113]:
# Model sized to the vocabulary, cross-entropy loss over vocabulary logits,
# and an Adam optimizer over all model parameters.
model = myRnn(vocab_len)
criteria = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [114]:
# Training loop: one optimizer step per sample, reporting the summed loss per epoch.
# Make sure the model is in training mode even if this cell is re-run after model.eval().
model.train()
for epoch in range(epochs):
  total_loss = 0
  for question,answer in dataloader:

    # forward pass: logits over the vocabulary for this question
    pred_ans = model(question)

    # loss: answer[0] drops the batch dimension added by the DataLoader
    # NOTE(review): this appears to rely on batch_size=1 and single-token answers - confirm.
    loss=criteria(pred_ans,answer[0])

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # backprop
    loss.backward()

    # parameter update
    optimizer.step()

    total_loss += loss.item()

  # ".4f" prints four decimals; the previous "4f" only set a minimum field width.
  print(f"Epoch: {epoch+1} Loss: {total_loss:.4f}")



Epoch: 1 Loss: 534.035340
Epoch: 2 Loss: 467.092163
Epoch: 3 Loss: 390.416706
Epoch: 4 Loss: 321.780156
Epoch: 5 Loss: 266.214901
Epoch: 6 Loss: 216.532593
Epoch: 7 Loss: 171.912926
Epoch: 8 Loss: 133.753816
Epoch: 9 Loss: 102.590412
Epoch: 10 Loss: 78.203332
Epoch: 11 Loss: 60.221056
Epoch: 12 Loss: 47.327049
Epoch: 13 Loss: 37.682400
Epoch: 14 Loss: 30.862027
Epoch: 15 Loss: 25.789618
Epoch: 16 Loss: 21.306724
Epoch: 17 Loss: 18.551268
Epoch: 18 Loss: 15.906574
Epoch: 19 Loss: 13.473808
Epoch: 20 Loss: 12.059225
Epoch: 21 Loss: 10.416177
Epoch: 22 Loss: 8.983602
Epoch: 23 Loss: 7.923484
Epoch: 24 Loss: 7.046194
Epoch: 25 Loss: 6.267655
Epoch: 26 Loss: 5.629672
Epoch: 27 Loss: 5.088054
Epoch: 28 Loss: 4.598612
Epoch: 29 Loss: 4.187864
Epoch: 30 Loss: 3.816676
Epoch: 31 Loss: 3.505137
Epoch: 32 Loss: 3.222977
Epoch: 33 Loss: 2.974426
Epoch: 34 Loss: 2.747721
Epoch: 35 Loss: 2.539531
Epoch: 36 Loss: 2.356569
Epoch: 37 Loss: 2.191667
Epoch: 38 Loss: 2.044151
Epoch: 39 Loss: 1.900738
Epoc

In [None]:
# Switch the model to evaluation mode before running predictions.
model.eval()

In [123]:
def predict(model,question,threshold=0.5):
  """Print the model's answer to ``question``, or "i don't know" if unsure.

  The question is encoded with the module-level ``vocab``, run through
  ``model``, and the most probable vocabulary entry is printed when its
  softmax probability reaches ``threshold``.

  Args:
    model: callable mapping a (1, sequence length) tensor of token ids to a
      (1, vocabulary size) tensor of logits.
    question: the question text.
    threshold: minimum probability required to print an answer.

  Returns:
    None. The result is printed.
  """
  # convert question to vocabulary ids
  num_question = text_to_index(question,vocab)
  if not num_question:
    # No tokens would give an empty float tensor, which the embedding layer
    # cannot index; treat it as an unanswerable question.
    print("i don't know")
    return
  question_tensor = torch.tensor(num_question).unsqueeze(0)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    output = model(question_tensor)
    probs = torch.nn.functional.softmax(output,dim=1)
    value,idx = torch.max(probs,dim=1)
  print(value,idx)
  if value < threshold:
    print("i don't know")
  else:
    # Ids were handed out in insertion order, so the id is also the key's position.
    print(list(vocab.keys())[idx.item()])


In [126]:
# Try the trained model on a sample question.
predict(model,"largest planet in solar system")

tensor([0.5384], grad_fn=<MaxBackward0>) tensor([23])
jupiter
