In [None]:
!pip install torchtext==0.15.1
!pip install torch==2.1.0
!pip install transformers==4.27.1
!pip install datasets==2.17.0

In [None]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

# Toy extractive-QA corpus. Every answer occurs verbatim inside its context.
# Each record keeps its keys in the order: context, question, answer.
qa_dataset = [
    dict(
        context='My name is AIVN and I am from Vietnam.',
        question='What is my name?',
        answer='AIVN',
    ),
    dict(
        context='I love painting and my favorite artist is Vincent Van Gogh.',
        question='What is my favorite activity?',
        answer='painting',
    ),
    dict(
        context='I am studying computer science at the University of Tokyo.',
        question='What am I studying?',
        answer='computer science',
    ),
    dict(
        context='My favorite book is "To Kill a Mockingbird" by Harper Lee.',
        question='What is my favorite book?',
        answer='"To Kill a Mockingbird"',
    ),
    dict(
        context='I have a pet dog named Max who loves to play fetch.',
        question='What is the name of my pet?',
        answer='Max',
    ),
    dict(
        context='I was born in Paris, but now I live in New York City.',
        question='Where do I live now?',
        answer='New York City',
    ),
]

# Number of QA records; the bare expression displays it as the cell output.
data_size = len(qa_dataset)
data_size

6

In [None]:
# torchtext's built-in 'basic_english' tokenizer (str -> list of token strings).
# The outputs shown in this notebook are lowercased with punctuation split off.
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data):
    """Lazily produce one token list per QA record.

    Each record's context and question are joined with a literal ' <sep> '
    marker and passed through the module-level ``tokenizer``.

    Args:
        data: iterable of mappings with 'context' and 'question' string values.

    Yields:
        The token list for "context <sep> question" of each record, in order.
    """
    yield from (
        tokenizer(' <sep> '.join((item['context'], item['question'])))
        for item in data
    )

# Build the vocabulary from every token in the corpus. The special tokens are
# placed first, so they take the lowest indices
# (<unk>=0, <pad>=1, <bos>=2, <eos>=3, <sep>=4 in the mapping displayed below).
vocab = build_vocab_from_iterator(
    iterator=yield_tokens(qa_dataset),
    specials=['<unk>', '<pad>', '<bos>', '<eos>', '<sep>'],
    special_first=True,
)

# Tokens that are not in the vocabulary resolve to the <unk> index.
vocab.set_default_index(vocab['<unk>'])

# Show the token -> index mapping as the cell output.
vocab.get_stoi()

{'to': 24,
 ',': 25,
 'pet': 21,
 'who': 61,
 'gogh': 39,
 'the': 23,
 'fetch': 37,
 'play': 52,
 'van': 56,
 'now': 19,
 'was': 59,
 'a': 14,
 'name': 13,
 'aivn': 27,
 'i': 5,
 'studying': 22,
 'and': 15,
 'where': 60,
 '<unk>': 0,
 'favorite': 11,
 'by': 32,
 'artist': 28,
 'live': 18,
 '<eos>': 3,
 'harper': 40,
 'dog': 36,
 'loves': 45,
 '<pad>': 1,
 'computer': 34,
 '.': 8,
 'born': 30,
 'is': 6,
 'my': 7,
 'book': 16,
 'science': 53,
 'of': 20,
 '<bos>': 2,
 '<sep>': 4,
 'what': 10,
 'am': 12,
 'named': 48,
 'at': 29,
 'but': 31,
 'in': 17,
 'from': 38,
 'tokyo': 54,
 'city': 33,
 'have': 41,
 'kill': 42,
 'lee': 43,
 'love': 44,
 '?': 9,
 'do': 35,
 'max': 46,
 'mockingbird': 47,
 'york': 62,
 'new': 49,
 'vincent': 58,
 'painting': 50,
 'paris': 51,
 'university': 55,
 'activity': 26,
 'vietnam': 57}

In [None]:
# Fixed length (in tokens) of every model input; pad_and_truncate() pads shorter
# sequences and cuts longer ones to this size.
# NOTE(review): 22 appears to equal the longest "question <sep> context" pair in
# qa_dataset — revisit if the data changes, otherwise answers may be truncated away.
MAX_SEQ_LENGTH = 22
# Vocabulary index of the '<pad>' token, used as the fill value when padding.
PAD_IDX = vocab['<pad>']

def pad_and_truncate(input_ids, max_seq_len, pad_idx=None):
    """Return a new list of exactly ``max_seq_len`` ids built from ``input_ids``.

    Longer inputs are cut to their first ``max_seq_len`` ids; shorter inputs are
    right-padded. The caller's list is never modified (the previous version
    extended it in place when padding, but returned a fresh list when truncating).

    Args:
        input_ids: list of token ids.
        max_seq_len: target length.
        pad_idx: id used for padding. Defaults to the module-level ``PAD_IDX``,
            which is only looked up when padding is actually needed.

    Returns:
        A new list of length ``max_seq_len``.
    """
    result = list(input_ids[:max_seq_len])
    missing = max_seq_len - len(result)
    if missing > 0:
        fill = PAD_IDX if pad_idx is None else pad_idx
        result.extend([fill] * missing)
    return result

def _find_span(sequence, span, start=0):
    """Return the first index >= ``start`` where ``span`` occurs contiguously in ``sequence``, or -1."""
    span_len = len(span)
    for pos in range(start, len(sequence) - span_len + 1):
        if sequence[pos:pos + span_len] == span:
            return pos
    return -1


def vectorize(question, context, answer):
    """Encode one QA example as token ids plus the answer's token span.

    The model input is "question <sep> context", tokenized, mapped through
    ``vocab`` and padded/truncated to ``MAX_SEQ_LENGTH``. The answer is located
    by matching its *whole* token sequence, and only in the part of the input
    after the first ``<sep>`` (the context). The previous version matched just
    the first answer token anywhere in the input, which could point into the
    question or at an unrelated earlier occurrence of that token.

    Args:
        question: question string.
        context: context string that should contain the answer.
        answer: answer string.

    Returns:
        Tuple ``(input_ids, start_positions, end_positions)`` of ``torch.long``
        tensors: the 1-D id sequence and two scalar tensors holding the
        inclusive start/end indices of the answer inside ``input_ids``.

    Raises:
        ValueError: if the answer has no tokens, or its token sequence does not
            occur in the (possibly truncated) context segment.
    """
    input_text = question + ' <sep> ' + context
    input_ids = [vocab[token] for token in tokenizer(input_text)]
    input_ids = pad_and_truncate(input_ids, MAX_SEQ_LENGTH)

    answer_ids = [vocab[token] for token in tokenizer(answer)]
    if not answer_ids:
        raise ValueError(f'answer {answer!r} produced no tokens')

    # The context begins right after the first <sep>. If <sep> was truncated
    # away there is no context left to search, so the lookup below fails.
    sep_id = vocab['<sep>']
    if sep_id in input_ids:
        context_start = input_ids.index(sep_id) + 1
    else:
        context_start = len(input_ids)

    start_positions = _find_span(input_ids, answer_ids, context_start)
    if start_positions < 0:
        raise ValueError(
            f'answer {answer!r} not found in the context segment of the input '
            f'(input is limited to {MAX_SEQ_LENGTH} tokens)'
        )
    # Inclusive end index of the answer span.
    end_positions = start_positions + len(answer_ids) - 1

    input_ids = torch.tensor(input_ids, dtype=torch.long)
    start_positions = torch.tensor(start_positions, dtype=torch.long)
    end_positions = torch.tensor(end_positions, dtype=torch.long)

    return input_ids, start_positions, end_positions

In [None]:
class QADataset(Dataset):
    """Map-style dataset that vectorizes QA records on access.

    ``data`` is an indexable collection of mappings with 'question', 'context'
    and 'answer' keys. Nothing is precomputed: each ``__getitem__`` call runs
    ``vectorize`` on the requested record.
    """

    def __init__(self, data):
        # Keep a reference to the raw records.
        self.data = data

    def __len__(self):
        """Number of QA records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, start_positions, end_positions)`` for record ``idx``."""
        record = self.data[idx]
        return vectorize(record['question'], record['context'], record['answer'])

In [None]:
def decode(input_ids):
    """Map a sequence of vocabulary indices back to a space-separated string.

    Each index is looked up individually in the module-level ``vocab``.
    """
    tokens = map(vocab.lookup_token, input_ids)
    return ' '.join(tokens)

In [None]:
# Sanity check: vectorize every record, then print the raw ids, the decoded
# input text and the decoded answer span so the labels can be checked by eye.
for item in qa_dataset:
    input_ids, start_positions, end_positions = vectorize(
        item['question'], item['context'], item['answer']
    )
    # end_positions is inclusive, hence the +1 on the slice bound.
    answer_span = input_ids[start_positions:end_positions + 1]

    print(input_ids)
    print(decode(input_ids))
    print(decode(answer_span))

tensor([10,  6,  7, 13,  9,  4,  7, 13,  6, 27, 15,  5, 12, 38, 57,  8,  1,  1,
         1,  1,  1,  1])
what is my name ? <sep> my name is aivn and i am from vietnam . <pad> <pad> <pad> <pad> <pad> <pad>
aivn
tensor([10,  6,  7, 11, 26,  9,  4,  5, 44, 50, 15,  7, 11, 28,  6, 58, 56, 39,
         8,  1,  1,  1])
what is my favorite activity ? <sep> i love painting and my favorite artist is vincent van gogh . <pad> <pad> <pad>
painting
tensor([10, 12,  5, 22,  9,  4,  5, 12, 22, 34, 53, 29, 23, 55, 20, 54,  8,  1,
         1,  1,  1,  1])
what am i studying ? <sep> i am studying computer science at the university of tokyo . <pad> <pad> <pad> <pad> <pad>
computer science
tensor([10,  6,  7, 11, 16,  9,  4,  7, 11, 16,  6, 24, 42, 14, 47, 32, 40, 43,
         8,  1,  1,  1])
what is my favorite book ? <sep> my favorite book is to kill a mockingbird by harper lee . <pad> <pad> <pad>
to kill a mockingbird
tensor([10,  6, 23, 13, 20,  7, 21,  9,  4,  5, 41, 14, 21, 36, 48, 46, 61, 45,
     

In [None]:
# Wrap the raw records; each item is vectorized when the loader requests it.
train_dataset = QADataset(qa_dataset)

# Shuffle with a dedicated, seeded generator so the batch order is identical on
# every "Restart & Run All" instead of depending on the global RNG state.
train_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

In [None]:
import torch.nn as nn
import torch.optim as optim

class QAModel(nn.Module):
    """Bidirectional-LSTM span extractor.

    Token ids are embedded, encoded by a multi-layer bidirectional LSTM, and
    scored by two independent linear heads: one for the answer's start position
    and one for its end position.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_size: LSTM hidden size per direction.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers):
        super().__init__()
        # Forward and backward states are concatenated, doubling the feature size.
        encoder_dim = 2 * hidden_size

        self.input_embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.start_linear = nn.Linear(encoder_dim, 1)
        self.end_linear = nn.Linear(encoder_dim, 1)

    def forward(self, text):
        """Score every token position as a start and as an end of the answer.

        Args:
            text: batch-first tensor of token ids, e.g. shape (batch, seq_len).

        Returns:
            ``(start_logits, end_logits)``, each with one score per input token
            (the heads' trailing size-1 axis is squeezed away).
        """
        token_states, _ = self.lstm(self.input_embedding(text))

        start_logits = self.start_linear(token_states).squeeze(-1)
        end_logits = self.end_linear(token_states).squeeze(-1)
        return start_logits, end_logits

# Model parameters
EMBEDDING_DIM = 64
HIDDEN_SIZE = 128
VOCAB_SIZE = len(vocab)
N_LAYERS = 2

# Seed the global RNG so weight initialisation is reproducible across re-runs.
torch.manual_seed(42)
model = QAModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, N_LAYERS)

# Smoke test: one random sequence of 10 token ids should produce one start
# logit per token. Named `dummy_input` so the builtin `input()` is not shadowed
# for the rest of the notebook.
dummy_input = torch.randint(0, 10, size=(1, 10))
print(dummy_input.shape)
model.eval()
with torch.no_grad():
    start_logits, end_logits = model(dummy_input)

print(start_logits.shape)

torch.Size([1, 10])
torch.Size([1, 10])


In [None]:
# Start and end prediction are each treated as classification over token
# positions, so both heads share one cross-entropy criterion.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters with a fixed learning rate.
LR = 1e-3
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

In [None]:
EPOCHS = 15

# Put the model in training mode (the smoke test left it in eval mode).
model.train()
for _ in range(EPOCHS):
    for input_ids, start_positions, end_positions in train_loader:
        optimizer.zero_grad()

        start_logits, end_logits = model(input_ids)

        # The objective is the mean of the start-position and end-position losses.
        total_loss = (
            criterion(start_logits, start_positions)
            + criterion(end_logits, end_positions)
        ) / 2

        total_loss.backward()
        optimizer.step()

        # One line per batch.
        print(total_loss.item())

3.097381114959717
3.0793018341064453
3.0629773139953613
3.0087430477142334
2.9612159729003906
2.868612766265869
2.8718039989471436
2.721425771713257
2.730931043624878
2.5300614833831787
2.5358078479766846
2.4704222679138184
2.2481765747070312
2.0046045780181885
2.0491843223571777
1.827653408050537
1.6861846446990967
1.4823263883590698
1.3509588241577148
1.3486322164535522
1.2747832536697388
0.9748340845108032
1.0343611240386963
1.23782217502594
1.0911117792129517
0.5120372772216797
0.7503616809844971
0.4456079304218292
0.8514096736907959
0.5883430242538452
0.692642867565155
0.2551625967025757
0.3969961106777191
0.2927904725074768
0.5384989976882935
0.19122187793254852
0.30173128843307495
0.14799988269805908
0.16991019248962402
0.09872200340032578
0.134701207280159
0.28781893849372864
0.06053245812654495
0.04753508046269417
0.1506822109222412


In [None]:
# Run the trained model on one example and map the predicted token span back
# onto the context tokens.
model.eval()
with torch.no_grad():
    sample = qa_dataset[1]
    # Read fields by key rather than unpacking sample.values(), which silently
    # depends on the order in which the dict keys were written.
    context = sample['context']
    question = sample['question']
    answer = sample['answer']

    input_ids, start_positions, end_positions = vectorize(question, context, answer)
    input_ids = input_ids.unsqueeze(0)  # add a batch dimension of size 1
    start_logits, end_logits = model(input_ids)

    # Tokenize the context once; it is needed for both clamping and extraction.
    context_tokens = tokenizer(context)

    # Model positions index "question <sep> context"; subtracting the question
    # length plus one for <sep> converts them to context-relative positions.
    offset = len(tokenizer(question)) + 1
    start_position = torch.argmax(start_logits, dim=1).item() - offset
    end_position = torch.argmax(end_logits, dim=1).item() - offset

    # Clamp into the context: predictions may fall in the question or padding.
    start_position = max(start_position, 0)
    end_position = min(end_position, len(context_tokens) - 1)

    if end_position >= start_position:
        # Extract the predicted answer span (end index is inclusive).
        predicted_answer_tokens = context_tokens[start_position:end_position + 1]
        predicted_answer = ' '.join(predicted_answer_tokens)
    else:
        # Inverted span: treat as "no answer".
        predicted_answer = ''

    print(f'Context: {context}')
    print(f'Question: {question}')
    print(f'Start position: {start_position}')
    print(f'End position: {end_position}')
    print(f'Answer span: {predicted_answer}')

Context: I love painting and my favorite artist is Vincent Van Gogh.
Question: What is my favorite activity?
Start position: 2
End position: 2
Answer span: painting
