<a href="https://colab.research.google.com/github/mohitt38/Pytorch/blob/main/pytorch_RNN_based_Q_A_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import pandas as pd

# Load the question/answer pairs. NOTE(review): the absolute "/content/..." path looks
# Colab-specific (see the Colab badge at the top) - adjust it when running elsewhere.
df= pd.read_csv("/content/100_Unique_QA_Dataset.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [59]:
# tokenize
def tokenize(text):
  """Lower-case `text`, remove '?' and apostrophes, then split on whitespace."""
  cleaned = text.lower()
  for unwanted in ('?', "'"):
    cleaned = cleaned.replace(unwanted, '')
  return cleaned.split()

In [60]:
tokenize("What is the capital of Germany?	")

['what', 'is', 'the', 'capital', 'of', 'germany']

In [61]:
# vocabulary: maps each token to an integer id; id 0 is reserved for unknown tokens
vocab = {'<UNK>':0}

In [62]:
def build_vocab(row):
  """Register every unseen token of one row's question and answer in the global `vocab`.

  A new token gets the next free id (the size of `vocab` at the moment it is added);
  tokens that are already present keep their id. Nothing is returned - the function is
  called for its side effect on `vocab`.
  """
  row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]

  for word in row_tokens:
    # setdefault leaves existing entries alone, so ids never change once assigned
    vocab.setdefault(word, len(vocab))

In [63]:
# Run build_vocab once per row for its side effect on `vocab`; build_vocab returns
# nothing, so the Series displayed below holds only empty values.
df.apply(build_vocab, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [64]:
len(vocab)

324

In [65]:
# convert words to numerical indices
def text_to_indices(text, vocab):
  """Tokenize `text` and return a list with one vocabulary id per token.

  Tokens that are not keys of `vocab` are mapped to the id stored under '<UNK>'.
  """
  return [
      vocab[word] if word in vocab else vocab['<UNK>']
      for word in tokenize(text)
  ]

In [66]:
text_to_indices("What is the capital of Germany?", vocab)

[1, 2, 3, 4, 5, 8]

In [67]:
text_to_indices("Who is mohit?", vocab)

[10, 2, 0]

In [68]:
import torch
from torch.utils.data import Dataset, DataLoader

In [69]:
class QADataset(Dataset):
  """Dataset over a question/answer DataFrame that yields tensors of vocabulary ids.

  Each item is a `(question, answer)` pair of tensors obtained by running
  `text_to_indices` on the row's 'question' and 'answer' columns.
  """

  def __init__(self, df, vocab):
    # only references are stored; rows are encoded on demand in __getitem__
    self.df, self.vocab = df, vocab

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    n_rows, _ = self.df.shape
    return n_rows

  def __getitem__(self, index):
    """Encode row `index` and return `(question_tensor, answer_tensor)`."""
    question_ids, answer_ids = (
        text_to_indices(self.df.iloc[index][column], self.vocab)
        for column in ('question', 'answer')
    )
    return torch.tensor(question_ids), torch.tensor(answer_ids)


In [70]:
dataset = QADataset(df, vocab)

In [71]:
dataset[5]

(tensor([10, 29,  3, 30, 31]), tensor([32]))

In [72]:
# batch_size=1: QADataset items are tensors of different lengths and no padding /
# collate_fn is defined, so they cannot be stacked into larger batches.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [73]:
for question, answer in dataloader:
  print(question, answer)


tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([[185]])
tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([[53]])
tensor([[ 42, 137, 118,   3, 247,   5, 248]]) tensor([[249]])
tensor([[ 10,  75, 208]]) tensor([[209]])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([[184]])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([[65]])
tensor([[ 1,  2,  3,  4,  5, 53]]) tensor([[54]])
tensor([[  1,   2,   3,   4,   5, 109]]) tensor([[317]])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([[188]])
tensor([[  1,  87, 229, 230, 231, 232]]) tensor([[233]])
tensor([[ 10,  11, 189, 158, 190]]) tensor([[191]])
tensor([[  1,   2,   3, 146,  86,  19, 192, 193]]) tensor([[194]])
tensor([[1, 2, 3, 4, 5, 8]]) tensor([[9]])
tensor([[  1,   2,   3,   4,   5, 236, 237]]) tensor([[238]])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([[106]])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([[260]])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([[131]])
tensor([[10,  2,  3, 66,  5, 67]]

In [74]:
import torch.nn as nn

In [75]:
class MyRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of rows in the embedding table and number of output logits.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Return logits of shape (batch, vocab_size) for a (batch, seq_len) tensor of token ids."""
    embedded_question = self.embedding(question)
    # nn.RNN returns (outputs for every time step, final hidden state); only the latter is used
    _, final_hidden = self.rnn(embedded_question)
    # final_hidden is (1, batch, hidden_size) for this single-layer RNN; drop the layer dimension
    return self.fc(final_hidden.squeeze(0))

In [76]:
# step size passed to the Adam optimizer, and number of full passes over the dataloader
learning_rate = 0.001
epochs = 20

In [77]:
model = MyRNN(len(vocab))

In [78]:
# CrossEntropyLoss is fed the raw logits returned by the model (forward() applies no softmax)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [79]:
# training loop

for epoch in range(epochs):

  # sum of the per-sample losses over this epoch (the dataloader yields one sample per step)
  total_loss = 0

  for question, answer in dataloader:

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass -> logits of shape (1, vocab_size)
    output = model(question)

    # answer has shape (1, answer_len); answer[0] drops the batch dimension so the target
    # lines up with the single row of logits (this requires one-token answers)
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss += loss.item()

  # ".4f" prints four decimal places (the earlier "4f" only set a minimum field width of 4)
  print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 524.800315
Epoch: 2, Loss: 451.730977
Epoch: 3, Loss: 374.907143
Epoch: 4, Loss: 317.261458
Epoch: 5, Loss: 264.886112
Epoch: 6, Loss: 215.662099
Epoch: 7, Loss: 171.911002
Epoch: 8, Loss: 133.408508
Epoch: 9, Loss: 102.348152
Epoch: 10, Loss: 77.998965
Epoch: 11, Loss: 59.873835
Epoch: 12, Loss: 46.851347
Epoch: 13, Loss: 36.757198
Epoch: 14, Loss: 29.627318
Epoch: 15, Loss: 24.234636
Epoch: 16, Loss: 19.921454
Epoch: 17, Loss: 16.709869
Epoch: 18, Loss: 14.136948
Epoch: 19, Loss: 12.071528
Epoch: 20, Loss: 10.437984


In [92]:
def predict(model, question, threshold = 0.5):
  """Print the model's answer to `question`, or "I don't know" when it is not confident.

  Args:
    model: trained network mapping a (1, seq_len) tensor of token ids to (1, vocab_size) logits.
    question: question text; encoded with `text_to_indices` and the global `vocab`.
    threshold: minimum softmax probability the top prediction needs in order to be printed.

  Returns:
    None - the result is printed.
  """
  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # an empty token sequence cannot be fed through the RNN
  if not numerical_question:
    print("I don't know")
    return

  # tensor with a leading batch dimension of 1
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # inference only: no autograd graph is needed
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim =1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  if value.item() < threshold:
    print("I don't know")
    # stop here so the low-confidence guess is not printed as well
    return

  # vocab ids were assigned in insertion order, so position `index` in the key list is the token
  print(list(vocab.keys())[index.item()])

In [94]:
predict(model , "What is the largest planet in our solar system?")

jupiter


## Debugging the RNN model

In [80]:
dataset[15][0]

tensor([ 1,  2,  3, 69,  5,  3, 70, 71])

In [81]:
# stand-alone embedding layer for inspection; 324 is the len(vocab) value shown earlier
x = nn.Embedding(324, embedding_dim=50)

In [82]:
a = x(dataset[0][0])

In [83]:
a

tensor([[-0.4461, -0.8667, -0.3559,  1.0807,  0.9863,  2.0267,  0.4628, -0.8047,
         -1.0941, -1.4439, -0.2163,  2.0950, -0.4981,  0.2091, -0.8953, -0.9337,
          0.3749, -0.6643, -0.9074,  0.0132, -1.9927,  0.4696, -1.3910,  0.8902,
          1.1955, -1.6139, -0.9726, -0.0474, -1.5127,  0.3016, -1.1297, -1.4572,
          0.4244,  1.1506, -0.0357, -0.8396, -0.5077,  0.5478, -1.5940, -1.4143,
          0.0477, -0.3730, -0.2286, -0.1047, -2.2962,  0.1111,  1.3262, -0.6242,
          0.2509, -1.2717],
        [-0.0911, -1.1940,  0.0706,  0.2816,  0.4810,  0.6172,  0.6857,  1.2051,
         -0.6096,  0.1420, -0.6275,  1.3983,  0.2012, -0.8374,  0.5143, -2.3620,
         -0.9307,  0.4962,  0.5008, -0.4810, -0.6544,  1.9830, -0.1678,  0.0076,
          0.3778,  0.1043,  0.0895,  0.5263, -0.3860,  0.3775, -0.1149,  0.7785,
         -1.0224,  0.8637, -0.1267, -0.9606,  0.3858, -0.6169, -1.0993,  0.1467,
         -0.3904,  1.0854,  0.6525, -0.1438,  0.0109, -0.4367,  0.0278, -0.0505,


In [84]:
# stand-alone RNN for inspection; it is fed `a`, a single unbatched (seq_len, 50) sequence
y = nn.RNN(50,64)

In [85]:
# first return value: the outputs for every time step (the hidden state after each token)
y(a)[0]

tensor([[-1.6533e-02, -2.5550e-01, -1.3339e-01, -3.5053e-02, -1.4770e-01,
          2.4737e-01,  3.0746e-01,  5.7550e-01, -4.8106e-01,  2.2775e-01,
          5.2327e-01,  6.7713e-01,  8.8188e-02, -2.4515e-01,  3.4435e-01,
          6.1198e-01,  2.3530e-01,  3.0244e-02, -5.9703e-01,  3.0095e-01,
          8.7754e-01,  4.3378e-02,  8.5479e-01,  2.1791e-01,  1.1660e-01,
          1.0296e-01,  1.6256e-01, -1.4651e-01,  1.3775e-01, -5.5197e-02,
         -4.6380e-03, -1.1380e-01, -2.6894e-02, -3.5227e-01, -5.5863e-01,
          3.7878e-01, -5.9575e-01,  5.1448e-01,  9.5559e-02,  8.1130e-02,
         -3.5667e-01, -1.6019e-01,  1.4601e-01,  2.1563e-02, -2.3952e-01,
         -4.6410e-01,  5.7182e-01, -4.9418e-02, -4.9583e-02,  7.7420e-01,
          4.2231e-01, -6.0862e-01,  7.1711e-01,  2.1645e-01, -3.1451e-01,
          8.3626e-01, -4.6815e-02,  2.6395e-03,  2.8394e-01, -2.2068e-01,
          3.7980e-01,  1.6217e-01, -4.9959e-02,  4.5521e-01],
        [ 8.7321e-03, -6.0969e-01, -1.7276e-01,  5

In [86]:
# second return value: the final hidden state (the value MyRNN.forward feeds to the linear layer)
b = y(a)[1]

In [87]:
# stand-alone output layer: hidden size 64 -> one logit per vocabulary entry (324)
z = nn.Linear(64, 324)

In [88]:
z(b).shape

torch.Size([1, 324])