In [None]:
!pip install torchtext==0.16.2
!pip install torch==2.2.0
!pip install transformers==4.55.4
!pip install datasets==4.0.0

Collecting torchtext==0.16.2
  Downloading torchtext-0.16.2-cp312-cp312-manylinux1_x86_64.whl.metadata (7.5 kB)
Collecting torch==2.2.0 (from torchtext==0.16.2)
  Downloading torch-2.2.0-cp312-cp312-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchdata==0.7.1 (from torchtext==0.16.2)
  Downloading torchdata-0.7.1-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0->torchtext==0.16.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0->torchtext==0.16.2)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0->torchtext==0.16.2)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0->torchtext==0.16.2)
  Downloading nvidia_cudnn_cu

In [None]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

qa_dataset = [
    {
        'context': 'My name is Melanie.',
        'question': 'What is my name?',
        'answer': 'Melanie'
    },
    {
        'context': 'I love watching movie and my favorite one is The Last Airbender.',
        'question': 'What is my favorite activity?',
        'answer': 'watching movie'
    },
    {
        'context': 'I am studying ML Alignments at MATs.',
        'question': 'What am I studying?',
        'answer': 'ML Alignments'
    }
]

data_size = len(qa_dataset)
data_size


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

3

In [None]:
# Define tokenizer function
tokenizer = get_tokenizer('basic_english')

# Create a function to yield list of tokens
def yield_tokens(data):
    for item in data:
        yield tokenizer(item['context'] + ' ' + item['question'])

# Create vocabulary
vocab = build_vocab_from_iterator(
    yield_tokens(qa_dataset),
    specials=['<unk>', '<pad>', '<bos>', '<eos>', '<sep>']
)
vocab.set_default_index(vocab['<unk>'])
vocab.get_stoi()

{'watching': 28,
 'melanie': 23,
 'mats': 22,
 'love': 21,
 'last': 20,
 'the': 27,
 'alignments': 17,
 'airbender': 16,
 'activity': 15,
 'name': 13,
 'one': 26,
 'and': 18,
 '<unk>': 0,
 'favorite': 12,
 'movie': 25,
 '<eos>': 3,
 'ml': 24,
 '.': 7,
 '<pad>': 1,
 'i': 9,
 'at': 19,
 'studying': 14,
 'is': 5,
 'my': 6,
 '?': 8,
 '<bos>': 2,
 '<sep>': 4,
 'what': 10,
 'am': 11}

In [None]:
classes = set([item['answer'] for item in qa_dataset])
classes_to_idx = {
    cls_name: idx for idx, cls_name in enumerate(classes)
}
idx_to_classes = {
    idx: cls_name for idx, cls_name in enumerate(classes)
}
print(idx_to_classes)

{0: 'Melanie', 1: 'watching movie', 2: 'ML Alignments'}


In [None]:
MAX_QUESTION_LEN = 10
MAX_CONTEXT_LEN = 15
PAD_IDX = vocab['<pad>']

def pad_and_truncate(input_ids, max_seq_len):
    if len(input_ids) > max_seq_len:
        input_ids = input_ids[:max_seq_len]
    elif len(input_ids) < max_seq_len:
        input_ids += [PAD_IDX] * (max_seq_len - len(input_ids))

    return input_ids

def vectorize(question, context):
    input_question_ids = [vocab[token] for token in tokenizer(question)]
    input_context_ids = [vocab[token] for token in tokenizer(context)]

    input_question_ids = pad_and_truncate(input_question_ids, MAX_QUESTION_LEN)
    input_context_ids = pad_and_truncate(input_context_ids, MAX_CONTEXT_LEN)

    input_question_ids = torch.tensor(input_question_ids, dtype=torch.long)
    input_context_ids = torch.tensor(input_context_ids, dtype=torch.long)

    return input_question_ids, input_context_ids

In [None]:
input_question_ids, input_context_ids = vectorize(
    qa_dataset[0]['context'],
    qa_dataset[0]['question']
)
print(input_question_ids)
print(input_context_ids)
print(classes_to_idx[qa_dataset[0]['answer']])

tensor([ 6, 13,  5, 23,  7,  1,  1,  1,  1,  1])
tensor([10,  5,  6, 13,  8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])
0


In [None]:
class QADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question_text = item['question']
        context_text = item['context']

        input_question_ids, input_context_ids = vectorize(
            question_text, context_text
        )

        answer_text = item['answer']
        answer_id = classes_to_idx[answer_text]
        answer_id = torch.tensor(answer_id, dtype=torch.long)

        return input_question_ids, input_context_ids, answer_id

In [None]:
def decode(input_ids):
    return ' '.join([vocab.lookup_token(token) for token in input_ids])

In [None]:
train_dataset = QADataset(qa_dataset)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [None]:
for batch in train_loader:
    input_question_ids, input_context_ids, answer_id = batch
    print(input_question_ids, input_context_ids, answer_id)

tensor([[10,  5,  6, 12, 15,  8,  1,  1,  1,  1],
        [10,  5,  6, 13,  8,  1,  1,  1,  1,  1]]) tensor([[ 9, 21, 28, 25, 18,  6, 12, 26,  5, 27, 20, 16,  7,  1,  1],
        [ 6, 13,  5, 23,  7,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]]) tensor([1, 0])
tensor([[10, 11,  9, 14,  8,  1,  1,  1,  1,  1]]) tensor([[ 9, 11, 14, 24, 17, 19, 22,  7,  1,  1,  1,  1,  1,  1,  1]]) tensor([2])


In [None]:
import torch.nn as nn
import torch.optim as optim

class QAModel(nn.Module):
    def __init__(self,
        vocab_size, embedding_dim, hidden_size,
        n_layers, n_classes
    ):
        super(QAModel, self).__init__()
        self.question_embedding = nn.Embedding(
            vocab_size, embedding_dim
        )
        self.context_embedding = nn.Embedding(
            vocab_size, embedding_dim
        )

        self.lstm = nn.LSTM(
            embedding_dim * 2, hidden_size,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True
        )
        self.fc = nn.Linear(hidden_size * 2, n_classes)

    def forward(self, question, context):
        question_embed = self.question_embedding(question)
        context_embed = self.context_embedding(context)

        combined = torch.cat(
            (question_embed, context_embed),
            dim=1
        )

        lstm_out, _ = self.lstm(combined)
        lstm_out = lstm_out[:, -1, :]

        out = self.fc(lstm_out)

        return out

# Model parameters
EMBEDDING_DIM = 64
HIDDEN_SIZE = 128
VOCAB_SIZE = len(vocab)
N_LAYERS = 2
N_CLASSES = len(classes)

model = QAModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, N_LAYERS, N_CLASSES)

input_context = torch.randint(0, 10, size=(1, MAX_CONTEXT_LEN))
input_question = torch.randint(0, 10, size=(1, MAX_QUESTION_LEN))
model.eval()
with torch.no_grad():
    logits = model(input_question, input_context)

print(logits.shape)

torch.Size([1, 3])


In [None]:
LR = 1e-3
optimizer = torch.optim.Adam(
    model.parameters(), lr=LR
)
criterion = nn.CrossEntropyLoss()

In [None]:
EPOCHS = 20

model.train()
for _ in range(EPOCHS):
    for idx, (input_question_ids, input_context_ids, answer_id) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(input_question_ids, input_context_ids)
        loss = criterion(outputs, answer_id)
        loss.backward()
        optimizer.step()
        print(loss.item())

1.1192712783813477
1.1902929544448853
1.050471544265747
1.0951836109161377
1.0501351356506348
1.0091643333435059
1.0296717882156372
1.0110422372817993
1.030808925628662
0.8547526597976685
0.9699543714523315
0.802112340927124
0.7984910011291504
0.9071606397628784
0.7572532296180725
0.7020053863525391
0.6111633777618408
0.8377385139465332
0.7040671706199646
0.28251025080680847
0.6584620475769043
0.16998976469039917
0.3954346776008606
0.5652879476547241
0.25220170617103577
0.8844575881958008
0.5162703990936279
0.04102083668112755
0.6243141293525696
0.029508311301469803
0.2251863330602646
0.23783940076828003
0.2535672187805176
0.19433361291885376
0.20301485061645508
0.16019707918167114
0.184810608625412
0.01270509697496891
0.06714636087417603
0.12024736404418945


In [None]:
import numpy as np

In [None]:
model.eval()
with torch.no_grad():
    sample = qa_dataset[0]
    context, question, answer = sample.values()
    question_ids, context_ids = vectorize(question, context)
    question_ids = question_ids.unsqueeze(0)
    context_ids = context_ids.unsqueeze(0)
    outputs = model(question_ids, context_ids)
    _, predicted = torch.max(outputs.data, 1)
    print(f'Context: {context}')
    print(f'Question: {question}')
    print(f'Prediction: {idx_to_classes[predicted.tolist()[0]]}')

Context: My name is Melanie.
Question: What is my name?
Prediction: Melanie


In [23]:
print(classes_to_idx)

{'Melanie': 0, 'watching movie': 1, 'ML Alignments': 2}
