In [1]:
import pandas as pd

df = pd.read_csv('100_Unique_QA_Dataset.csv')

df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [2]:
# Tokenize

def tokenize(text):
  text = text.lower()
  text = text.replace('?', '')
  text = text.replace("'", "")
  return text.split()

In [3]:
tokenize('What is the capital of France?')

['what', 'is', 'the', 'capital', 'of', 'france']

In [4]:
# Vocabulary

vocab = {'<UNK>' : 0}

In [5]:
def build_vocab(row):
  # print(row['question'], row['answer'])
  tokenized_question = tokenize(row['question'])
  tokenized_answer = tokenize(row['answer'])

  merged_tokens = tokenized_question + tokenized_answer

  # print(tokenized_question, tokenized_answer)
  # print(merged_tokens)

  for token in merged_tokens:

    if token not in vocab:
      vocab[token] = len(vocab)

  return vocab

In [6]:
df.apply(build_vocab, axis = 1)

Unnamed: 0,0
0,"{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'ca..."
1,"{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'ca..."
2,"{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'ca..."
3,"{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'ca..."
4,"{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'ca..."
...,...
85,"{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'ca..."
86,"{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'ca..."
87,"{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'ca..."
88,"{'<UNK>': 0, 'what': 1, 'is': 2, 'the': 3, 'ca..."


In [7]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [8]:
len(vocab)

324

In [9]:
# Convert words to numerical indices

def text_to_indices(text, vocab):

  indexed_text = []

  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])

  return indexed_text

In [10]:
text_to_indices('What is campusx', vocab)

[1, 2, 0]

In [11]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

In [13]:
class QADataset(Dataset):

  def __init__(self, df, vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):

    numerical_question = text_to_indices(self.df.iloc[index]['question'], self.vocab)
    numerical_answer = text_to_indices(self.df.iloc[index]['answer'], self.vocab)

    return torch.tensor(numerical_question), torch.tensor(numerical_answer)

In [14]:
dataset = QADataset(df, vocab)

In [15]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [16]:
dataset[1]

(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))

In [17]:
dataset[10]

(tensor([ 1,  2,  3,  4,  5, 53]), tensor([54]))

In [18]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [19]:
for question, answer in dataloader:
  print(question, answer)

tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[ 10,  11, 157, 158, 159]]) tensor([[160]])
tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([[23]])
tensor([[10, 96,  3, 97]]) tensor([[98]])
tensor([[  1,   2,   3,   4,   5, 206]]) tensor([[207]])
tensor([[ 42, 255,   2, 256,  83, 257, 258]]) tensor([[259]])
tensor([[ 1,  2,  3,  4,  5, 73]]) tensor([[74]])
tensor([[ 10,  75, 111]]) tensor([[112]])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([[113]])
tensor([[ 42, 137,   2, 226,  12,   3, 227, 228]]) tensor([[155]])
tensor([[  1,   2,   3,  37, 133,   5,  26]]) tensor([[134]])
tensor([[ 1,  2,  3, 37, 38, 39, 40]]) tensor([[41]])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([[91]])
tensor([[10, 55,  3, 56,  5, 57]]) tensor([[58]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([[121]])
tensor([[ 42, 318,   2,  62,  63,   3, 319,   5, 320]]) tensor([[321]])
tensor([[ 42,  18,   2,   3, 281,  12,   3, 282]]) tensor([[205]])
tensor([[ 42, 125,   

In [20]:
import torch.nn as nn

In [21]:
class SimpleRNN(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self, question):
    embedding_question = self.embedding(question)
    hidden, final = self.rnn(embedding_question)
    output = self.fc(final.squeeze(0))

    return output

In [22]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [23]:
dataset[0][0]

tensor([1, 2, 3, 4, 5, 6])

In [24]:
x = nn.Embedding(324, embedding_dim=50)

In [25]:
x(dataset[0][0])

tensor([[ 0.5626, -0.1686,  1.1519, -2.4442, -0.6210,  1.5849,  1.2694, -0.1334,
         -0.3359, -0.1887, -0.6242,  0.7608, -1.4588,  1.6176,  0.3045,  0.1895,
          1.0351, -0.0390, -0.4903,  0.4260, -0.7984, -0.8647, -0.0673,  1.8367,
         -0.1463,  0.0893,  1.1934,  0.0730,  1.3137, -1.7397, -0.4682, -0.6963,
          0.6624,  1.7350,  0.7590,  0.4876,  1.2379,  1.1898,  0.4822,  0.0198,
          0.0939,  0.6349,  1.5774, -1.5606, -1.6889, -0.3001,  0.4280, -0.7077,
         -0.7827, -0.7956],
        [ 0.1206, -0.7340,  0.0734,  1.1998, -0.7607, -1.1089,  0.3488,  0.4361,
          0.0578,  1.3097,  0.3821, -0.6789,  0.1871, -0.1366,  2.2627,  0.9198,
          1.6598, -0.7115,  1.6399,  0.3016, -0.0821, -0.5353, -0.4723,  2.1727,
          0.2929, -0.3616,  0.7570, -0.6023, -0.4537, -1.4937,  0.4802,  0.2980,
         -0.6903, -1.0647, -1.2451, -0.4285, -0.8982, -0.4576,  0.1301, -2.0639,
         -0.1082, -0.3405,  1.7496, -0.3691, -0.1219,  1.2020,  0.4363, -0.9321,


In [26]:
x(dataset[0][0]).shape

torch.Size([6, 50])

In [27]:
dataset[10][0]

tensor([ 1,  2,  3,  4,  5, 53])

In [28]:
dataset[15][0]

tensor([ 1,  2,  3, 69,  5,  3, 70, 71])

In [29]:
x(dataset[15][0]).shape

torch.Size([8, 50])

In [30]:
y = nn.RNN(50, 64)

In [31]:
a = x(dataset[0][0])

In [32]:
# Hidden States

y(a)

(tensor([[ 2.2722e-01, -6.4735e-01,  2.2527e-01,  5.0739e-01,  4.5927e-01,
           4.1263e-02, -3.2454e-01,  1.9699e-01, -7.2111e-01, -1.3713e-01,
           3.2346e-01, -1.4600e-01,  3.5714e-01,  1.4264e-01,  1.5107e-01,
           1.7145e-01,  4.6616e-01,  1.0534e-01, -5.0966e-02,  3.6538e-01,
          -2.5063e-01,  5.1817e-02,  2.9082e-01,  4.2540e-01,  5.5662e-01,
          -5.2565e-01, -1.9116e-01,  2.0905e-01,  4.1140e-01, -4.4101e-02,
          -6.7434e-01, -2.2046e-01,  6.0624e-01,  8.4410e-01, -6.4677e-02,
           5.5333e-01,  4.0703e-01, -2.2725e-01,  5.0311e-02,  1.4321e-02,
          -3.9215e-01,  8.0254e-01,  4.7396e-02,  7.9179e-01, -1.8468e-01,
           8.1805e-02, -5.9918e-01, -1.4872e-01,  3.1571e-01,  4.0487e-01,
           2.9789e-01, -2.8071e-01, -2.1464e-01, -4.2356e-01, -2.5374e-01,
           3.8437e-01, -8.0926e-01, -2.6333e-01,  9.8887e-02, -4.8590e-01,
          -6.2693e-01,  6.8878e-01, -6.9165e-02, -4.5810e-01],
         [ 7.0454e-01,  2.7137e-02, -

In [33]:
# Final Output

y(a)[0]

tensor([[ 2.2722e-01, -6.4735e-01,  2.2527e-01,  5.0739e-01,  4.5927e-01,
          4.1263e-02, -3.2454e-01,  1.9699e-01, -7.2111e-01, -1.3713e-01,
          3.2346e-01, -1.4600e-01,  3.5714e-01,  1.4264e-01,  1.5107e-01,
          1.7145e-01,  4.6616e-01,  1.0534e-01, -5.0966e-02,  3.6538e-01,
         -2.5063e-01,  5.1817e-02,  2.9082e-01,  4.2540e-01,  5.5662e-01,
         -5.2565e-01, -1.9116e-01,  2.0905e-01,  4.1140e-01, -4.4101e-02,
         -6.7434e-01, -2.2046e-01,  6.0624e-01,  8.4410e-01, -6.4677e-02,
          5.5333e-01,  4.0703e-01, -2.2725e-01,  5.0311e-02,  1.4321e-02,
         -3.9215e-01,  8.0254e-01,  4.7396e-02,  7.9179e-01, -1.8468e-01,
          8.1805e-02, -5.9918e-01, -1.4872e-01,  3.1571e-01,  4.0487e-01,
          2.9789e-01, -2.8071e-01, -2.1464e-01, -4.2356e-01, -2.5374e-01,
          3.8437e-01, -8.0926e-01, -2.6333e-01,  9.8887e-02, -4.8590e-01,
         -6.2693e-01,  6.8878e-01, -6.9165e-02, -4.5810e-01],
        [ 7.0454e-01,  2.7137e-02, -1.2002e-02,  4

In [34]:
# Hidden States

y(a)[0].shape

torch.Size([6, 64])

In [35]:
# Final Output

y(a)[1].shape

torch.Size([1, 64])

In [36]:
b = y(a)[1]

In [37]:
z = nn.Linear(64, 324)

In [38]:
z(b)

tensor([[ 1.9256e-01, -1.6070e-01, -1.9424e-01,  3.4952e-01,  3.4742e-01,
         -2.8906e-01, -1.3376e-01,  1.9378e-01,  1.7185e-01, -1.6994e-01,
         -1.2933e-01, -2.9340e-01,  1.7111e-02, -1.9065e-01, -4.7665e-01,
          3.6522e-02, -2.8925e-01, -1.6981e-01, -4.2639e-03,  1.7063e-02,
         -1.0346e-01,  1.6426e-01, -1.0586e-01,  2.5092e-02, -1.2323e-01,
         -4.0666e-01,  4.1727e-01,  2.4180e-02, -3.9512e-02, -1.6261e-01,
         -2.2308e-01,  3.8639e-01, -4.3135e-01,  2.2129e-01,  8.2645e-02,
          3.9149e-02, -1.1565e-01, -2.6774e-01, -3.9374e-01,  3.5006e-02,
         -1.6283e-01, -7.2907e-01, -4.5231e-01, -3.9964e-01, -2.3999e-01,
         -4.5514e-01, -3.8139e-01,  3.0123e-01, -5.5572e-02,  2.8140e-01,
         -1.0834e-01, -2.5261e-01,  2.1866e-01, -1.1367e-01,  3.9164e-01,
          2.6477e-01,  9.8549e-02, -1.2712e-01,  4.6691e-01,  1.2587e-01,
          9.8104e-02,  2.0237e-01, -2.4334e-01, -1.1206e-01, -7.7360e-02,
         -1.9165e-01, -2.1199e-01,  1.

In [39]:
z(b).shape

torch.Size([1, 324])

In [40]:
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1, 6)
print("Shape of a:", a.shape)
b = x(a)
print("Shape of b:", b.shape)
c , d = y(b)
print("Shape of c:", c.shape)
print("Shape of d:", d.shape)

e = z(d)

print("Shape of e:", e.shape)

Shape of a: torch.Size([1, 6])
Shape of b: torch.Size([1, 6, 50])
Shape of c: torch.Size([1, 6, 64])
Shape of d: torch.Size([1, 6, 64])
Shape of e: torch.Size([1, 6, 324])


In [41]:
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1, 6)
print("Shape of a:", a.shape)
b = x(a)
print("Shape of b:", b.shape)
c , d = y(b)
print("Shape of c:", c.shape)
print("Shape of d:", d.shape)

e = z(d)

print("Shape of e:", e.shape)

Shape of a: torch.Size([1, 6])
Shape of b: torch.Size([1, 6, 50])
Shape of c: torch.Size([1, 6, 64])
Shape of d: torch.Size([1, 1, 64])
Shape of e: torch.Size([1, 1, 324])


In [42]:
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1, 6)
print("Shape of a:", a.shape)
b = x(a)
print("Shape of b:", b.shape)
c , d = y(b)
print("Shape of c:", c.shape)
print("Shape of d:", d.shape)

e = z(d.squeeze(0))

print("Shape of e:", e.shape)

Shape of a: torch.Size([1, 6])
Shape of b: torch.Size([1, 6, 50])
Shape of c: torch.Size([1, 6, 64])
Shape of d: torch.Size([1, 1, 64])
Shape of e: torch.Size([1, 324])


In [43]:
learning_rate = 0.001
epochs = 20

In [44]:
model = SimpleRNN(len(vocab))

In [45]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [46]:
# Training Loop

for epoch in range(epochs):

  total_loss = 0

  for question, answer in dataloader:

    optimizer.zero_grad()

    # Forward Pass
    output = model(question)
    # print(output.shape)

    # Loss -> Output Shape (1, 324) - (1)
    loss = criterion(output, answer[0])

    # Gradients
    loss.backward()

    # Update
    optimizer.step()

    total_loss = total_loss + loss.item()

  print(f"Epoch: {epoch+1}, Loss: {total_loss:4f}")

Epoch: 1, Loss: 526.427852
Epoch: 2, Loss: 456.858276
Epoch: 3, Loss: 377.822192
Epoch: 4, Loss: 319.002589
Epoch: 5, Loss: 267.221820
Epoch: 6, Loss: 219.275397
Epoch: 7, Loss: 175.785072
Epoch: 8, Loss: 138.245054
Epoch: 9, Loss: 106.391005
Epoch: 10, Loss: 81.451012
Epoch: 11, Loss: 63.180018
Epoch: 12, Loss: 49.719154
Epoch: 13, Loss: 39.480555
Epoch: 14, Loss: 31.860376
Epoch: 15, Loss: 26.278654
Epoch: 16, Loss: 21.710055
Epoch: 17, Loss: 18.235216
Epoch: 18, Loss: 15.450702
Epoch: 19, Loss: 13.141939
Epoch: 20, Loss: 11.400963


In [47]:
def predict(model, question, threshold = 0.5):

  # Convert questions to number
  numerical_question = text_to_indices(question, vocab)

  # Tensor
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # Send to Model
  output = model(question_tensor)

  # Convert Logits to Probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # Find index of Max prob
  value, index = torch.max(probs, dim = 1)

  # print(numerical_question)
  # print(output)
  # print(output.shape)
  # print(probs)
  # print(probs.shape)
  # print(value, index)

  if value < threshold:
    print("I don't know")

  print(list(vocab.keys())[index])

In [48]:
predict(model, "What is campusx")

I don't know
h2o


In [49]:
vocab.keys()

dict_keys(['<UNK>', 'what', 'is', 'the', 'capital', 'of', 'france', 'paris', 'germany', 'berlin', 'who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter', 'boiling', 'point', 'water', 'celsius', '100', 'painted', 'mona', 'lisa', 'leonardo-da-vinci', 'square', 'root', '64', '8', 'chemical', 'symbol', 'for', 'gold', 'au', 'which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945', 'longest', 'river', 'nile', 'japan', 'tokyo', 'developed', 'theory', 'relativity', 'albert-einstein', 'freezing', 'fahrenheit', '32', 'known', 'as', 'red', 'mars', 'author', '1984', 'george-orwell', 'currency', 'united', 'kingdom', 'pound', 'india', 'delhi', 'discovered', 'gravity', 'newton', 'how', 'many', 'continents', 'are', 'there', 'on', 'earth', '7', 'gas', 'do', 'plants', 'use', 'photosynthesis', 'co2', 'smallest', 'prime', 'number', '2', 'invented', 'telephone', 'alexander-graham-bell', 'australia', 'canberra', 'ocean', 'pacific-oce

In [50]:
list(vocab.keys())

['<UNK>',
 'what',
 'is',
 'the',
 'capital',
 'of',
 'france',
 'paris',
 'germany',
 'berlin',
 'who',
 'wrote',
 'to',
 'kill',
 'a',
 'mockingbird',
 'harper-lee',
 'largest',
 'planet',
 'in',
 'our',
 'solar',
 'system',
 'jupiter',
 'boiling',
 'point',
 'water',
 'celsius',
 '100',
 'painted',
 'mona',
 'lisa',
 'leonardo-da-vinci',
 'square',
 'root',
 '64',
 '8',
 'chemical',
 'symbol',
 'for',
 'gold',
 'au',
 'which',
 'year',
 'did',
 'world',
 'war',
 'ii',
 'end',
 '1945',
 'longest',
 'river',
 'nile',
 'japan',
 'tokyo',
 'developed',
 'theory',
 'relativity',
 'albert-einstein',
 'freezing',
 'fahrenheit',
 '32',
 'known',
 'as',
 'red',
 'mars',
 'author',
 '1984',
 'george-orwell',
 'currency',
 'united',
 'kingdom',
 'pound',
 'india',
 'delhi',
 'discovered',
 'gravity',
 'newton',
 'how',
 'many',
 'continents',
 'are',
 'there',
 'on',
 'earth',
 '7',
 'gas',
 'do',
 'plants',
 'use',
 'photosynthesis',
 'co2',
 'smallest',
 'prime',
 'number',
 '2',
 'invented'

In [51]:
list(vocab.keys())[311]

'jamescameron'

In [52]:
list(vocab.keys())[16]

'harper-lee'

In [53]:
predict(model, "What is capital of France")

paris


In [54]:
list(vocab.keys())[7]

'paris'

In [55]:
predict(model, "What is the largest planet in our solar system")

jupiter
