In [None]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Toy extractive-QA corpus. Every record carries a context passage, a question
# about it and the answer text. Field order (context, question, answer) is kept
# identical for all records.
qa_dataset = [
    dict(
        context='My name is AIVN and I am from Vietnam.',
        question='Where does AIVN come from?',
        answer='Vietnam',
    ),
    dict(
        context='I love painting and my favorite artist is Van Gogh.',
        question='What is my favorite activity?',
        answer='painting',
    ),
    dict(
        context='I am studying computer science at the University of Tokyo',
        question='Where do I live?',
        answer='Tokyo',
    ),
    dict(
        context='I was born in Paris, but now I live in New York',
        question='Where do I live now',
        answer='New York',
    ),
]

# Number of QA records; shown as the cell output.
data_size = len(qa_dataset)
data_size


4

In [None]:
# Build the tokenizer callable with torchtext's 'basic_english' setting.
# As the sample outputs in this notebook show, it lower-cases the text and
# splits punctuation such as '?' and '.' into separate tokens.
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    """Yield one token list per QA record.

    For each record the context and the question are joined with the literal
    ' <sep> ' marker and passed through the module-level ``tokenizer``.

    Args:
        data_iter: Iterable of mappings with 'context' and 'question' keys.

    Yields:
        The token list produced by ``tokenizer`` for each record.
    """
    for record in data_iter:
        combined = ' <sep> '.join((record['context'], record['question']))
        yield tokenizer(combined)

# Build the vocabulary from the tokenized corpus, registering the special
# tokens explicitly.
vocab = build_vocab_from_iterator(
    iterator=yield_tokens(qa_dataset),
    specials=['<unk>', '<pad>', '<bos>', '<eos>', '<sep>'],
)

# Tokens missing from the vocabulary resolve to the '<unk>' id.
vocab.set_default_index(vocab["<unk>"])

# Show the token -> id mapping as the cell output.
vocab.get_stoi()

{'what': 44,
 'was': 43,
 'vietnam': 42,
 'van': 41,
 'university': 40,
 'paris': 35,
 'painting': 34,
 'of': 33,
 'new': 32,
 'tokyo': 39,
 'name': 31,
 'love': 30,
 'gogh': 29,
 'where': 10,
 'my': 9,
 '<sep>': 4,
 '<bos>': 2,
 'is': 7,
 'at': 23,
 'science': 36,
 '?': 6,
 '<unk>': 0,
 'and': 14,
 'the': 38,
 'in': 18,
 'favorite': 16,
 'artist': 22,
 'studying': 37,
 'aivn': 12,
 'i': 5,
 '<eos>': 3,
 'come': 26,
 '<pad>': 1,
 'computer': 27,
 'york': 45,
 'am': 13,
 'do': 15,
 'now': 19,
 ',': 20,
 'live': 8,
 'activity': 21,
 '.': 11,
 'born': 24,
 'from': 17,
 'but': 25,
 'does': 28}

In [None]:
# Distinct answer strings found in the dataset.
classes = {item['answer'] for item in qa_dataset}

# Number the labels in sorted order. Iterating the raw set would give an order
# that depends on string hashing, so the class <-> index mapping could change
# from one kernel restart to the next.
classes_to_idx = {
    cls_name: idx for idx, cls_name in enumerate(sorted(classes))
}
# Exact inverse of classes_to_idx, derived from it so the two cannot disagree.
idx_to_classes = {
    idx: cls_name for cls_name, idx in classes_to_idx.items()
}
print(idx_to_classes)
print(classes_to_idx)

{0: 'painting', 1: 'Vietnam', 2: 'New York', 3: 'Tokyo'}
{'painting': 0, 'Vietnam': 1, 'New York': 2, 'Tokyo': 3}


In [None]:
# Integer id of the '<pad>' special token, looked up once from the vocabulary.
pad_idx = vocab['<pad>']

def pad_and_truncate(input_ids, max_length, pad_value=None):
    """Return a copy of ``input_ids`` that is exactly ``max_length`` long.

    Longer sequences are cut on the right; shorter ones are right-padded.
    The caller's sequence is never modified (the previous implementation
    extended the passed-in list in place when padding).

    Args:
        input_ids: Sequence of token ids.
        max_length: Target length of the returned list.
        pad_value: Id used for padding. When ``None`` (the default) the id of
            ``'<pad>'`` is looked up in the module-level ``vocab``; the lookup
            only happens if padding is actually needed.

    Returns:
        A new list of length ``max_length``.
    """
    # Copy first so that padding below can never leak into the caller's list.
    ids = list(input_ids)[:max_length]
    missing = max_length - len(ids)
    if missing > 0:
        if pad_value is None:
            pad_value = vocab['<pad>']
        ids.extend([pad_value] * missing)
    return ids

# Fixed sequence length that encoded examples are padded / truncated to.
max_length = 30

# Quick check: encode a short sentence and pad it out to max_length.
text = 'I love AIVN'
tokens = tokenizer(text)
input_ids = pad_and_truncate(vocab(tokens), max_length)
input_ids

[5,
 30,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [None]:
def vectorize(question, context, answer):
  """Encode one QA example as padded token ids plus the answer span.

  The model input is ``question <sep> context``: tokenized, mapped through
  ``vocab`` and padded/truncated to ``max_length``. The answer is located as
  a complete token sub-sequence inside the context part only (everything
  after the first ``<sep>``). A token that also occurs in the question, or
  that merely equals the answer's first token, is therefore never selected.

  Args:
    question: Question text.
    context: Passage expected to contain the answer.
    answer: Answer text; its tokens must occur contiguously in ``context``.

  Returns:
    Tuple ``(input_ids, start_pos, end_pos)`` of ``torch.long`` tensors.
    ``input_ids`` holds ``max_length`` ids; ``start_pos`` and ``end_pos``
    are scalar, inclusive indices into ``input_ids``.

  Raises:
    ValueError: If the answer has no tokens, or its tokens do not appear as
      a contiguous span in the (possibly truncated) context.
  """
  input_text = question + ' <sep> ' + context
  input_ids = [vocab[token] for token in tokenizer(input_text)]
  input_ids = pad_and_truncate(input_ids, max_length)

  answer_ids = [vocab[token] for token in tokenizer(answer)]
  if not answer_ids:
    raise ValueError('answer does not contain any token')

  # The context starts right after the first <sep>. If <sep> was truncated
  # away, no context is left to search.
  sep_id = vocab['<sep>']
  if sep_id in input_ids:
    context_start = input_ids.index(sep_id) + 1
  else:
    context_start = len(input_ids)

  # Compare the whole answer span, not just its first token.
  span_len = len(answer_ids)
  start_pos = None
  for candidate in range(context_start, len(input_ids) - span_len + 1):
    if input_ids[candidate:candidate + span_len] == answer_ids:
      start_pos = candidate
      break
  if start_pos is None:
    raise ValueError(f'answer {answer!r} was not found in the context')
  end_pos = start_pos + span_len - 1

  input_ids = torch.tensor(input_ids, dtype=torch.long)
  start_pos = torch.tensor(start_pos, dtype=torch.long)
  end_pos = torch.tensor(end_pos, dtype=torch.long)

  return input_ids, start_pos, end_pos

# Quick check on a hand-written example; the returned triple is the cell output.
vectorize(
    question='What is your name?',
    context='My name is AIVN',
    answer='AIVN',
)

(tensor([44,  7,  0, 31,  6,  4,  9, 31,  7, 12,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]),
 tensor(9),
 tensor(9))

In [None]:
class QADataset(Dataset):
  """Map-style dataset over a sequence of QA records.

  Each record is a mapping with 'question', 'context' and 'answer' keys.
  Records are encoded on access by the module-level ``vectorize``.
  """

  def __init__(self, data):
    # Keep a reference to the raw records; nothing is pre-computed.
    self.data = data

  def __len__(self):
    """Number of QA records."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return ``(input_ids, start_pos, end_pos)`` for record ``idx``."""
    record = self.data[idx]
    input_ids, start_pos, end_pos = vectorize(
        record['question'], record['context'], record['answer'])
    return input_ids, start_pos, end_pos

In [None]:
def decode(input_ids):
  """Turn a sequence of token ids back into a space-separated token string.

  Every id is looked up with ``vocab.lookup_token`` in the given order.
  """
  tokens = []
  for token_id in input_ids:
    tokens.append(vocab.lookup_token(token_id))
  return ' '.join(tokens)

In [None]:
def vectorize(question, context, answer, verbose=True):
    """Encode one QA example as padded token ids plus the answer span.

    The model input is ``question <sep> context``: tokenized, mapped through
    ``vocab`` and padded/truncated to ``max_length``. The answer is located
    as a complete token sub-sequence inside the context part only
    (everything after the first ``<sep>``), so a match in the question or a
    match on just the answer's first token is never returned.

    Args:
        question: Question text.
        context: Passage expected to contain the answer.
        answer: Answer text.
        verbose: When True (default, the historical behaviour) print the
            intermediate tokens and ids. Pass False from data loaders and
            training loops to keep their output readable.

    Returns:
        Tuple ``(input_ids, start_pos, end_pos)`` of ``torch.long`` tensors.
        ``input_ids`` holds ``max_length`` ids; ``start_pos`` and ``end_pos``
        are scalar, inclusive indices into ``input_ids``. If the answer
        cannot be located, both positions are ``-1``. Callers must filter
        such examples out before computing a position loss.
    """
    input_text = question + ' <sep> ' + context
    input_tokens = tokenizer(input_text)
    answer_tokens = tokenizer(answer)

    input_ids = [vocab[token] for token in input_tokens]
    input_ids = pad_and_truncate(input_ids, max_length)
    answer_ids = [vocab[token] for token in answer_tokens]

    if verbose:
        print(f"Question: {question}")
        print(f"Context: {context}")
        print(f"Answer: {answer}")
        print(f"Tokenized Input: {[input_tokens]}")
        print(f"Tokenized Answer: {[answer_tokens]}")
        print(f"Input IDs: {input_ids}")
        print(f"Answer IDs: {answer_ids}")

    # The context starts right after the first <sep>. If <sep> was truncated
    # away, no context is left to search.
    sep_id = vocab['<sep>']
    if sep_id in input_ids:
        context_start = input_ids.index(sep_id) + 1
    else:
        context_start = len(input_ids)

    # Plain ints are used for the "not found" sentinel so that the tensor
    # conversion below never wraps an existing tensor in torch.tensor().
    start_pos, end_pos = -1, -1
    span_len = len(answer_ids)
    if span_len > 0:
        # Compare the whole answer span, not just its first token.
        for candidate in range(context_start, len(input_ids) - span_len + 1):
            if input_ids[candidate:candidate + span_len] == answer_ids:
                start_pos = candidate
                end_pos = candidate + span_len - 1
                break

    if start_pos < 0:
        # The answer is empty or absent from the (possibly truncated) context.
        print(f"Error: answer ids {answer_ids} not found in the context")

    input_ids = torch.tensor(input_ids, dtype=torch.long)
    start_pos = torch.tensor(start_pos, dtype=torch.long)
    end_pos = torch.tensor(end_pos, dtype=torch.long)

    return input_ids, start_pos, end_pos


In [None]:
# Encode every record, then decode both the full input and the labelled span
# to check by eye that the positions point at the answer.
for item in qa_dataset:
  question, context, answer = item['question'], item['context'], item['answer']
  input_ids, start_pos, end_pos = vectorize(question, context, answer)

  text = decode(input_ids)
  # end_pos is inclusive, hence the +1 on the slice bound.
  answer_span = input_ids[start_pos:end_pos+1]
  answer_text = decode(answer_span)

  print(input_ids)
  print(text)
  print(answer_text)

Question: Where does AIVN come from?
Context: My name is AIVN and I am from Vietnam.
Answer: Vietnam
Tokenized Input: [['where', 'does', 'aivn', 'come', 'from', '?', '<sep>', 'my', 'name', 'is', 'aivn', 'and', 'i', 'am', 'from', 'vietnam', '.']]
Tokenized Answer: [['vietnam']]
Input IDs: [10, 28, 12, 26, 17, 6, 4, 9, 31, 7, 12, 14, 5, 13, 17, 42, 11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Answer IDs: [42]
tensor([10, 28, 12, 26, 17,  6,  4,  9, 31,  7, 12, 14,  5, 13, 17, 42, 11,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])
where does aivn come from ? <sep> my name is aivn and i am from vietnam . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
vietnam
Question: What is my favorite activity?
Context: I love painting and my favorite artist is Van Gogh.
Answer: painting
Tokenized Input: [['what', 'is', 'my', 'favorite', 'activity', '?', '<sep>', 'i', 'love', 'painting', 'and', 'my', 'favorite', 'artist', 'is', 'van', 'gogh', '.']]
Tokenized An

In [None]:
# Wrap the raw records in the Dataset and serve them as shuffled mini-batches
# of two examples.
train_dataset = QADataset(data=qa_dataset)
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

In [None]:
# Pull a single mini-batch from the loader to inspect what it yields; the
# bare expression below displays it as the cell output.
batch = next(iter(train_loader))
batch

Question: Where does AIVN come from?
Context: My name is AIVN and I am from Vietnam.
Answer: Vietnam
Tokenized Input: [['where', 'does', 'aivn', 'come', 'from', '?', '<sep>', 'my', 'name', 'is', 'aivn', 'and', 'i', 'am', 'from', 'vietnam', '.']]
Tokenized Answer: [['vietnam']]
Input IDs: [10, 28, 12, 26, 17, 6, 4, 9, 31, 7, 12, 14, 5, 13, 17, 42, 11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Answer IDs: [42]
Question: Where do I live now
Context: I was born in Paris, but now I live in New York
Answer: New York
Tokenized Input: [['where', 'do', 'i', 'live', 'now', '<sep>', 'i', 'was', 'born', 'in', 'paris', ',', 'but', 'now', 'i', 'live', 'in', 'new', 'york']]
Tokenized Answer: [['new', 'york']]
Input IDs: [10, 15, 5, 8, 19, 4, 5, 43, 24, 18, 35, 20, 25, 19, 5, 8, 18, 32, 45, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Answer IDs: [32, 45]


[tensor([[10, 28, 12, 26, 17,  6,  4,  9, 31,  7, 12, 14,  5, 13, 17, 42, 11,  1,
           1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
         [10, 15,  5,  8, 19,  4,  5, 43, 24, 18, 35, 20, 25, 19,  5,  8, 18, 32,
          45,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]]),
 tensor([15, 17]),
 tensor([15, 18])]

In [None]:
class QAModel(nn.Module):
  """Span-extraction model: embedding -> bidirectional LSTM -> two linear heads.

  For every input position the model emits one start logit and one end logit.

  Args:
    vocab_size: Number of rows in the embedding table.
    embedding_dim: Size of each token embedding.
    hidden_size: Hidden size of the LSTM, per direction.
    n_layers: Number of stacked LSTM layers.
  """

  def __init__ (self, vocab_size, embedding_dim, hidden_size,
                n_layers):
    super().__init__()
    # Sub-modules are created in the order embedding, lstm, start head,
    # end head, so seeded initialisation is unaffected by this rewrite.
    self.input_embedding = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embedding_dim,
    )
    self.lstm = nn.LSTM(
        input_size=embedding_dim,
        hidden_size=hidden_size,
        num_layers=n_layers,
        bidirectional=True,
        batch_first=True,
    )
    # A bidirectional LSTM concatenates forward and backward states.
    lstm_out_dim = 2 * hidden_size
    self.start_linear = nn.Linear(lstm_out_dim, 1)
    self.end_linear = nn.Linear(lstm_out_dim, 1)

  def forward(self, text):
    """Return ``(start_logit, end_logit)`` with one score per input position."""
    hidden_states, _ = self.lstm(self.input_embedding(text))

    # Each head maps a position to a single score; drop that trailing axis.
    start_logit = self.start_linear(hidden_states).squeeze(-1)
    end_logit = self.end_linear(hidden_states).squeeze(-1)
    return start_logit, end_logit


In [None]:
# Model params
Embedding_dims = 64
Hidden_size = 128
# The vocabulary reports its own size; no need to materialise the stoi dict.
vocab_size = len(vocab)
n_layers = 2


model = QAModel(vocab_size, Embedding_dims, Hidden_size, n_layers)

# Smoke test: one random sequence of 10 token ids through the untrained model.
# The tensor is named dummy_input so the builtin input() is not shadowed for
# the rest of the notebook.
dummy_input = torch.randint(0, 10, size=(1, 10))
print (dummy_input.shape)

model.eval()
with torch.no_grad():
  start_logit, end_logit = model(dummy_input)
print (start_logit.shape)
print (end_logit.shape)


torch.Size([1, 10])
torch.Size([1, 10])
torch.Size([1, 10])


In [None]:
# Optimisation hyper-parameters.
EPOCHS = 15
LR = 0.001

# Cross-entropy over positions, used for both the start and the end logits.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

In [None]:
# Switch the model to training mode before optimising.
model.train()
for epoch in range(EPOCHS):
  # Each batch from the loader is an (input_ids, start_pos, end_pos) triple.
  for idx, (input_ids, start_pos, end_pos) in enumerate(train_loader):
    # Clear the gradients accumulated by the previous step.
    optimizer.zero_grad()
    # The model returns one start logit and one end logit per input position.
    start_logit, end_logit = model(input_ids)
    # Score the start and end logits against their target positions.
    start_loss = criterion(start_logit, start_pos)
    end_loss = criterion(end_logit, end_pos)
    # Train on the mean of the two position losses.
    loss = (start_loss + end_loss) / 2
    loss.backward()
    optimizer.step()
    print (f'Epoch: {epoch+1}/{EPOCHS}, Batch: {idx+1}/{len(train_loader)}, Loss: {loss.item()}')

Question: What is my favorite activity?
Context: I love painting and my favorite artist is Van Gogh.
Answer: painting
Tokenized Input: [['what', 'is', 'my', 'favorite', 'activity', '?', '<sep>', 'i', 'love', 'painting', 'and', 'my', 'favorite', 'artist', 'is', 'van', 'gogh', '.']]
Tokenized Answer: [['painting']]
Input IDs: [44, 7, 9, 16, 21, 6, 4, 5, 30, 34, 14, 9, 16, 22, 7, 41, 29, 11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Answer IDs: [34]
Question: Where do I live now
Context: I was born in Paris, but now I live in New York
Answer: New York
Tokenized Input: [['where', 'do', 'i', 'live', 'now', '<sep>', 'i', 'was', 'born', 'in', 'paris', ',', 'but', 'now', 'i', 'live', 'in', 'new', 'york']]
Tokenized Answer: [['new', 'york']]
Input IDs: [10, 15, 5, 8, 19, 4, 5, 43, 24, 18, 35, 20, 25, 19, 5, 8, 18, 32, 45, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Answer IDs: [32, 45]
Epoch: 1/15, Batch: 1/2, Loss: 3.4155917167663574
Question: Where does AIVN come from?
Context: My name is AIVN and I am from V

In [None]:
# Run the trained model on one example and turn its start/end predictions
# into an answer string taken from the context.
model.eval()
with torch.no_grad():
  sample = qa_dataset[3]
  # Read the fields by key: unpacking sample.values() would silently swap
  # them if a record were ever written with its keys in a different order.
  context = sample['context']
  question = sample['question']
  answer = sample['answer']
  input_ids, start_pos, end_pos = vectorize(question, context, answer)
  input_ids = input_ids.unsqueeze(0)  # add the batch dimension
  start_logit, end_logit = model(input_ids)

  # The model input is "question <sep> context", so context token 0 sits at
  # input position len(question tokens) + 1.
  offset = len(tokenizer(question)) + 1
  # Tokenize the context once and reuse it for clamping and span extraction.
  context_tokens = tokenizer(context)

  # Most likely start / end positions, converted to context-relative indices.
  start_position = torch.argmax(start_logit, dim=1)[0].item() - offset
  end_position = torch.argmax(end_logit, dim=1)[0].item() - offset

  # Clamp predictions that fall into the question or the padding.
  start_position = max(start_position, 0)
  end_position = min(end_position, len(context_tokens) - 1)

  if end_position >= start_position:
    # Extract the predicted answer span (end index is inclusive).
    predicted_answer = ' '.join(context_tokens[start_position:end_position+1])
  else:
    # An end before the start is not a valid span.
    predicted_answer = ""

  print(f"Question: {question}")
  print(f"Context: {context}")
  print (f"Start Position: {start_position}")
  print (f"End Position: {end_position}")
  print(f"Predicted Answer: {predicted_answer}")

Question: Where do I live now
Context: I was born in Paris, but now I live in New York
Answer: New York
Tokenized Input: [['where', 'do', 'i', 'live', 'now', '<sep>', 'i', 'was', 'born', 'in', 'paris', ',', 'but', 'now', 'i', 'live', 'in', 'new', 'york']]
Tokenized Answer: [['new', 'york']]
Input IDs: [10, 15, 5, 8, 19, 4, 5, 43, 24, 18, 35, 20, 25, 19, 5, 8, 18, 32, 45, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Answer IDs: [32, 45]
Question: Where do I live now
Context: I was born in Paris, but now I live in New York
Start Position: 11
End Position: 12
Predicted Answer: new york
