In [79]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np

class Seq2SeqDataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length=512):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.tokenizer(
            '[START] ' + self.questions[idx] + ' [END]',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        answer = self.tokenizer(
            '[START] ' + self.answers[idx] + ' [END]',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return question['input_ids'].squeeze(0), answer['input_ids'].squeeze(0)

def collate_fn(batch):
    questions, answers = zip(*batch)
    questions = pad_sequence(questions, batch_first=True, padding_value=0)
    answers = pad_sequence(answers, batch_first=True, padding_value=0)
    return questions, answers[:, :-1], answers[:, 1:]

def get_datasets(tokenizer, batch_size=64):
    dataset = load_dataset("msaad02/brockport-gpt-4-qa")
    dataset = dataset['train'].to_pandas()

    context_raw = dataset['question'].to_list()
    target_raw = dataset['answer'].to_list()

    is_train_mask = np.random.uniform(size=(len(target_raw),)) < 0.8

    train_context = np.array(context_raw)[is_train_mask]
    train_target = np.array(target_raw)[is_train_mask]

    val_context = np.array(context_raw)[~is_train_mask]
    val_target = np.array(target_raw)[~is_train_mask]

    train_dataset = Seq2SeqDataset(train_context, train_target, tokenizer)
    val_dataset = Seq2SeqDataset(val_context, val_target, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_loader, val_loader

from torchtext.data.utils import get_tokenizer
# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')
train_loader, val_loader = get_datasets(tokenizer)


In [84]:
tokenizer("Hi, how can I aplpy?")

['hi', ',', 'how', 'can', 'i', 'aplpy', '?']

In [87]:
tokenizer(
    '[START] ' + "Hi, how can I aplpy?" + ' [END]',
    max_length=512,
    return_tensors='pt'
)

TypeError: _basic_english_normalize() got an unexpected keyword argument 'max_length'

In [82]:
next(iter(train_loader))

TypeError: _basic_english_normalize() got an unexpected keyword argument 'truncation'

In [1]:
"""
For some reason, most likely due to how the model is implemented, the ScratchModel
runs EXCEPTIONALLY slow. Again, I'm not sure why exactly, but the scratch model is
a small transformer model that should not be running this slow. This script aims to
re-implement the  ScratchModel in PyTorch using their optimized Transformer module
and see if that fixes the issue.

All the parameters will be the same, so we can just copy them over. The only thing
that will change is the model implementation and data loading.
"""

import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # prevent tensorflow logs

# # Set path to parent directory so we can import from other folders.
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from typing import List, Tuple
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------- Model ------------------------------------------------
EPOCHS = 13
BATCH_SIZE = 64
NUM_LAYERS = 6
D_MODEL = 512
DFF = 2048
NUM_HEADS = 8
DROPOUT_RATE = 0.1


# -------- Data ------------------------------------------------

dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

        self.sos_idx = vocab['[START]']
        self.eos_idx = vocab['[END]']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        target = torch.cat([torch.tensor([self.sos_idx]), target, torch.tensor([self.eos_idx])])
        return context, target

def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    vocab = build_vocab_from_iterator(
        iterator = [token_generator], 
        specials=["[UNK]", "[PAD]", "[START]", "[END]"],
        special_first=True, 
        min_freq=5
    )
    return vocab


def collate_batch(batch):
    contexts, targets = zip(*batch)
    pad_idx = vocab['[PAD]']
    contexts = pad_sequence(contexts, padding_value=pad_idx, batch_first=True)
    targets = pad_sequence(targets, padding_value=pad_idx, batch_first=True)
    return contexts, targets

# Create a list of tuples (context, target)
context = dataset['question'].tolist()
target = dataset['answer'].tolist()

data = list(zip(context, target))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.85 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')

vocab = build_vocab(train_data + val_data, tokenizer)
vocab.set_default_index(vocab["[UNK]"])

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)


# -------- Model ------------------------------------------------

class Transformer(nn.Module):
    def __init__(self, d_model, num_heads, num_layers, dim_feedforward, dropout_rate, activation, vocab_length, batch_first=True):
        super(Transformer, self).__init__()


        self.context_embedding = nn.Embedding(vocab_length, d_model)
        self.target_embedding = nn.Embedding(vocab_length, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout_rate,
            activation=activation,
            batch_first=batch_first
        )

        self.classifier = nn.Linear(d_model, vocab_length)

    def forward(self, src, tgt):
        src = self.context_embedding(src)
        tgt = self.target_embedding(tgt)

        out = self.transformer(src, tgt)
        out = self.classifier(out)

        return out
    

model = Transformer(
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    dim_feedforward=DFF,
    dropout_rate=DROPOUT_RATE,
    activation='relu',
    vocab_length=len(vocab)
).to(device)


# -------- Training ------------------------------------------------
pad_idx = vocab['[PAD]']
sos_idx = vocab['[START]']
eos_idx = vocab['[END]']

# Define the loss function and optimizer
loss_fn = CrossEntropyLoss(ignore_index=pad_idx)
optimizer = Adam(model.parameters(), lr=0.0001)


# Define the training function
def train(model, data_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    for contexts, targets in train_loader:
        # Move tensors to the right device
        src = contexts.to(device)
        tgt = targets.to(device)

        tgt_input = tgt[:, :-1]  # All tokens except the last (remove <eos>)
        tgt_output = tgt[:, 1:]  # All tokens except the first (remove <sos>)

        # Forward pass
        out = model(src, tgt_input)

        # Calculate the loss
        loss = loss_fn(out.reshape(-1, out.shape[-1]), tgt_output.reshape(-1))
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)


# Training loop
for epoch in range(EPOCHS):
    epoch_loss = train(model, train_loader, optimizer, loss_fn, device)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")


Epoch 1, Loss: 6.0224
Epoch 2, Loss: 4.6945
Epoch 3, Loss: 4.2532
Epoch 4, Loss: 3.9724
Epoch 5, Loss: 3.7514
Epoch 6, Loss: 3.5621
Epoch 7, Loss: 3.4002
Epoch 8, Loss: 3.2589
Epoch 9, Loss: 3.1286
Epoch 10, Loss: 3.0080
Epoch 11, Loss: 2.8982
Epoch 12, Loss: 2.7952
Epoch 13, Loss: 2.6972


In [8]:
def get_n_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp

get_n_params(model)

53205770

In [123]:
start_symbol = vocab['[START]']
end_symbol = vocab['[END]']
pad_symbol = vocab['[PAD]']
next_word = ""
def greedy_decode(model: Transformer, src, max_len):
    """
    Args:
    - model: Trained Transformer model
    - src (torch.Tensor): Tensor containing the token IDs of the source sequence; shape should be (N, S) where N is batch size (usually 1 during inference)
    - max_len (int): Maximum length of the output sequence

    Returns:
    - output (list of ints): The generated sequence
    """
    global out, next_word, ys
    
    src = src.to(device)  # ensure src is on the correct device
    memory = model.context_embedding(src)
    ys = torch.ones(src.size(0), 1).fill_(start_symbol).type_as(src.data)
    
    for i in range(max_len-1):
        src = model.context_embedding(src)
        tgt = model.target_embedding(ys)

        out = model.transformer(src, tgt)
        out = model.classifier(out)

        next_word = torch.argmax(out, dim=2)
        ys = torch.cat([ys, next_word], dim=1).fill_(start_symbol).type_as(src.data)

        if next_word.item() == end_symbol:
            break

    return ys[0].tolist()

# Usage example
# Assuming `vocab` is a dictionary mapping tokens to integer indices

src_sentence = "How can I apply to SUNY Brockport?"

tokenizer = get_tokenizer('basic_english')

input = tokenizer(src_sentence)
input = [vocab[token] for token in input]

src_tensor = torch.LongTensor(input).unsqueeze(0)  # Add batch dimension

output_indices = greedy_decode(model, src_tensor, max_len=50)

# Convert indices back to tokens (assuming you have an inverse_vocab mapping indices back to tokens)
output_sentence = " ".join(vocab.lookup_tokens(output_indices))
print(output_sentence)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)

In [124]:
ys

tensor([[2., 2.]], device='cuda:0')

In [116]:
next_word

tensor([[175]], device='cuda:0')

In [117]:
ys.shape

torch.Size([1, 1])

In [118]:
next_word.shape

torch.Size([1, 1])

In [120]:
torch.cat([ys, next_word], dim=1)

tensor([[  2, 175]], device='cuda:0')

In [109]:
out[0][0]

tensor([ 3.6008, -2.2197, -1.6768,  ..., -1.4583, -0.3718, -1.2441],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [111]:
torch.argmax(out, dim=2)

tensor([[330]], device='cuda:0')

In [112]:
next_word

tensor([[0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')

In [106]:
ys

tensor([[2]], device='cuda:0')

In [121]:
torch.ones(src_tensor.size(0), 1).fill_(start_symbol).type_as(src_tensor.data)

tensor([[2]])

In [87]:
ys = torch.ones(src_tensor.size(0), 1).fill_(start_symbol).type_as(src_tensor.data)
ys

tensor([[2]])

In [69]:
src_tensor.data

tensor([[ 142,   20,  767,   99,    6,   25,   21, 1766]])

In [96]:
ys

tensor([[2]], device='cuda:0')

In [81]:
next_word.cpu()

tensor(206)

In [83]:
next_word

tensor(206, device='cuda:0')

In [82]:
next_word.reshape(-1, next_word.shape[-1])

IndexError: tuple index out of range

In [88]:
ys

tensor([[2]])

In [91]:
torch.tensor([[next_word]])

tensor([[206]])

In [92]:
torch.cat([ys, torch.tensor([[next_word]])], dim=1)

tensor([[  2, 206]])

In [65]:
ys

tensor([[  2., 116.]], device='cuda:0')

In [64]:
next_word

tensor(116, device='cuda:0')

In [54]:
torch.cat([ys, torch.ones(ys.size(0), 1).type_as(src_tensor.data).fill_(next_word)], dim=1)

TypeError: fill_() received an invalid combination of arguments - got (list), but expected one of:
 * (Tensor value)
      didn't match because some of the arguments have invalid types: (!list of [str]!)
 * (Number value)
      didn't match because some of the arguments have invalid types: (!list of [str]!)


In [48]:
next_word

['make']

In [37]:
probs = torch.nn.functional.softmax(out[0][0], dim=0)

In [38]:
torch.argmax(probs)

tensor(116, device='cuda:0')

In [40]:
vocab.lookup_tokens([torch.argmax(probs).item()])

['college']

In [43]:
vocab.get_itos()[116]

'college'

In [22]:
model.classifier(out[:, -1, :])

NameError: name 'out' is not defined

In [34]:
" ".join(vocab.lookup_tokens(train_dataset.__getitem__(0)[1].numpy()))

"[START] if you have any questions about our policy on commercial [UNK] , you should refer them to the office of human resources . although specific contact information isn ' t provided in the policy statement , the human resources team is well-equipped to provide guidance and support on these matters , ensuring that all activities on campus align with our policies and values . [END]"

In [43]:
targets.shape

torch.Size([64, 104])

In [42]:
targets[:, :-1].shape

torch.Size([64, 103])

In [8]:
output[0][0]

tensor([ 0.6789, -0.5257,  0.0723,  ..., -0.0295, -0.5506, -0.4791],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [6]:
def train(model, data_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    for contexts, targets in data_loader:
        contexts = contexts.to(device)
        targets = targets.to(device)

        output = model(contexts, targets[:, :-1])

        
        loss = loss_fn(output.reshape(-1, output.size(-1)), targets[:, 1:].reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)

# Training loop
for epoch in range(EPOCHS):
    epoch_loss = train(model, train_loader, optimizer, loss_fn, device)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

RuntimeError: the feature number of src and tgt must be equal to d_model

In [1]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np

class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        return context, target

def collate_batch(batch):
    contexts, targets = zip(*batch)
    pad_idx = vocab['<pad>']
    sos_idx = vocab['<sos>']
    eos_idx = vocab['<eos>']

    contexts = [torch.cat([torch.tensor([sos_idx]), context, torch.tensor([eos_idx])], dim=0) for context in contexts]
    targets = [torch.cat([torch.tensor([sos_idx]), target, torch.tensor([eos_idx])], dim=0) for target in targets]

    context_lens = torch.tensor([len(context) for context in contexts])
    target_lens = torch.tensor([len(target) for target in targets])

    contexts = pad_sequence(contexts, padding_value=pad_idx)
    targets = pad_sequence(targets, padding_value=pad_idx)

    return contexts, targets, context_lens, target_lens


def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    return build_vocab_from_iterator(token_generator, specials=["<unk>", "<pad>", "<sos>", "<eos>"], special_first=True)

# Load dataset
dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

# Create a list of tuples (context, target)
data = list(zip(dataset['question'].tolist(), dataset['answer'].tolist()))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.8 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab(train_data + val_data, tokenizer)

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)


In [2]:
tokenizer("How can I apply?")

['how', 'can', 'i', 'apply', '?']

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
EPOCHS = 13
BATCH_SIZE = 64
NUM_LAYERS = 6
D_MODEL = 512
DFF = 2048
NUM_HEADS = 8
DROPOUT_RATE = 0.1

model = nn.Transformer(
    d_model=D_MODEL,
    nhead=NUM_HEADS,
    num_encoder_layers=NUM_LAYERS,
    num_decoder_layers=NUM_LAYERS,
    dim_feedforward=DFF,
    dropout=DROPOUT_RATE,
    activation='relu',
).to(device)



In [25]:
import torch.optim as optim

# Setup the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Define a loss function, ignoring the padding index in the loss calculation
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def train(model, data_loader, optimizer, criterion, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            tgt_input = tgt[:-1, :]
            targets = tgt[1:, :]  # targets do not include the <sos> token

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_masks(src, tgt_input, pad_idx=vocab['<pad>'])
            optimizer.zero_grad()
            output = model(src, tgt_input, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, src_padding_mask)
            output = output.reshape(-1, output.shape[-1])

            loss = criterion(output, targets.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

def create_masks(src, tgt, pad_idx):
    src_seq_len = src.size(0)
    tgt_seq_len = tgt.size(0)

    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len).to(device)
    
    src_padding_mask = (src == pad_idx).transpose(0, 1).to(device)
    tgt_padding_mask = (tgt == pad_idx).transpose(0, 1).to(device)
    
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

# Now call the train function
train(model, train_loader, optimizer, criterion, EPOCHS)


RuntimeError: Token what not found and default index is not set
Exception raised from __getitem__ at /__w/text/text/pytorch/text/torchtext/csrc/vocab.cpp:43 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6cc5d7dd87 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7f6cc5d2e75f in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #2: torchtext::Vocab::__getitem__(c10::basic_string_view<char> const&) const + 0x384 (0x7f6bb22418b4 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so)
frame #3: <unknown function> + 0x1dc5b (0x7f6c082a9c5b in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torchtext/_torchtext.so)
frame #4: <unknown function> + 0x3dee7 (0x7f6c082c9ee7 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torchtext/_torchtext.so)
frame #5: /home/msaad/miniconda3/envs/thesis/bin/python() [0x525d17]
frame #6: _PyObject_MakeTpCall + 0x254 (0x502a14 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #7: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5541c3]
frame #8: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b51d]
frame #9: _PyEval_EvalFrameDefault + 0xfbd (0x50f5ed in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #10: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b0d0]
frame #11: _PyEval_EvalFrameDefault + 0xfbd (0x50f5ed in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #12: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b0d0]
frame #13: _PyEval_EvalFrameDefault + 0xfbd (0x50f5ed in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #14: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b75e]
frame #15: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5a6cea]
frame #16: _PyEval_EvalFrameDefault + 0x53b (0x50eb6b in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #17: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5c83fe]
frame #18: PyEval_EvalCode + 0x9f (0x5c7aff in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #19: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5e1683]
frame #20: _PyEval_EvalFrameDefault + 0x3a22 (0x512052 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #21: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #22: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #23: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #24: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #25: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #26: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5df096]
frame #27: _PyEval_EvalFrameDefault + 0x35e4 (0x511c14 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #28: /home/msaad/miniconda3/envs/thesis/bin/python() [0x55451f]
frame #29: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5541c3]
frame #30: PyObject_Call + 0x9d (0x53ef0d in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #31: _PyEval_EvalFrameDefault + 0x42da (0x51290a in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #32: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #33: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #34: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #35: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #36: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #37: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #38: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #39: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #40: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #41: <unknown function> + 0x79e7 (0x7f6d5c0bb9e7 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/lib-dynload/_asyncio.cpython-311-x86_64-linux-gnu.so)
frame #42: /home/msaad/miniconda3/envs/thesis/bin/python() [0x52405b]
frame #43: /home/msaad/miniconda3/envs/thesis/bin/python() [0x4bf4de]
frame #44: /home/msaad/miniconda3/envs/thesis/bin/python() [0x4c13c9]
frame #45: /home/msaad/miniconda3/envs/thesis/bin/python() [0x51bef7]
frame #46: _PyEval_EvalFrameDefault + 0x9353 (0x517983 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #47: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5c83fe]
frame #48: PyEval_EvalCode + 0x9f (0x5c7aff in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #49: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5e1683]
frame #50: /home/msaad/miniconda3/envs/thesis/bin/python() [0x51bef7]
frame #51: PyObject_Vectorcall + 0x31 (0x51bde1 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #52: _PyEval_EvalFrameDefault + 0x753 (0x50ed83 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #53: _PyFunction_Vectorcall + 0x173 (0x534f13 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #54: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5f33ef]
frame #55: Py_RunMain + 0x14a (0x5f2dfa in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #56: Py_BytesMain + 0x39 (0x5b6f49 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #57: <unknown function> + 0x29d90 (0x7f6d5d33fd90 in /lib/x86_64-linux-gnu/libc.so.6)
frame #58: __libc_start_main + 0x80 (0x7f6d5d33fe40 in /lib/x86_64-linux-gnu/libc.so.6)
frame #59: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5b6d9f]


In [4]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np

# Load dataset
dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

# Create a list of tuples (context, target)
data = list(zip(dataset['question'].tolist(), dataset['answer'].tolist()))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.85 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')


def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    return build_vocab_from_iterator([token_generator], specials=["<unk>", "<pad>", "<sos>", "<eos>"], special_first=True, min_freq=5)


vocab = build_vocab(train_data + val_data, tokenizer)
vocab.set_default_index(vocab["<unk>"])


class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        return context, target
    

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

In [66]:
# def collate_batch(batch):
#     contexts, targets = zip(*batch)
#     pad_idx = vocab['<pad>']
#     sos_idx = vocab['<sos>']
#     eos_idx = vocab['<eos>']

#     contexts = [torch.cat([torch.tensor([sos_idx]), context, torch.tensor([eos_idx])], dim=0) for context in contexts]
#     targets = [torch.cat([torch.tensor([sos_idx]), target, torch.tensor([eos_idx])], dim=0) for target in targets]

#     context_lens = torch.tensor([len(context) for context in contexts])
#     target_lens = torch.tensor([len(target) for target in targets])

#     contexts = pad_sequence(contexts, padding_value=pad_idx)
#     targets = pad_sequence(targets, padding_value=pad_idx)

#     return contexts, targets, context_lens, target_lens


from typing import List, Tuple

i = 1
def collate_fn(data: List[Tuple[torch.Tensor, torch.Tensor]]):
    global i
    i += 1
    tensors, targets = zip(*data)
    features = pad_sequence(tensors, batch_first=True)
    targets = torch.stack(targets)
    return features, targets

In [76]:
data = train_dataset[0]

In [78]:
zip(*data)

<zip at 0x7f8b6026b940>

In [77]:
tensors, targets = zip(*data)
features = pad_sequence(tensors, batch_first=True)
targets = torch.stack(targets)

ValueError: too many values to unpack (expected 2)

In [67]:
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [73]:
a = next(iter(train_loader))

In [74]:
i

1

In [None]:
train_loader.

In [12]:
train_dataset[0]

(tensor([ 212,   20,  767,   95,   42,   19,  437,  287,    6,   69,  767,  359,
          491,  504, 1766]),
 tensor([ 35,   9,  44, 137,  19, 437, 287,   4,   7,  46, 160, 135,  18,  11,
         121, 310,   5,   9,  20,  80,  74,  48,   6,  12,  46, 258,  13, 380,
         195,   5,  27,  10,  29,  89,   6,  42,   9, 248,  26, 303,  19,  12,
          46, 218,  16]))

In [7]:
import torch
import pickle

# Assuming 'vocab' is your vocabulary object from torchtext
vocab_path = "vocab.pkl"

# Save the vocabulary
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)


In [8]:
# Load the vocabulary
with open(vocab_path, 'rb') as f:
    loaded_vocab = pickle.load(f)


In [10]:

print(" ".join(vocab.lookup_tokens(train_dataset[0][0].cpu().numpy().tolist())))
print(" ".join(vocab.lookup_tokens(train_dataset[0][1].cpu().numpy().tolist())))

where can i get help with class registration to ensure i maintain full-time status ?
if you need assistance with class registration , the academic success center is a fantastic resource . you can also reach out to your academic advisor for personalized guidance . we ' re here to help you stay on track with your academic goals !


In [11]:

print(" ".join(loaded_vocab.lookup_tokens(train_dataset[0][0].cpu().numpy().tolist())))
print(" ".join(loaded_vocab.lookup_tokens(train_dataset[0][1].cpu().numpy().tolist())))

where can i get help with class registration to ensure i maintain full-time status ?
if you need assistance with class registration , the academic success center is a fantastic resource . you can also reach out to your academic advisor for personalized guidance . we ' re here to help you stay on track with your academic goals !


In [9]:
loaded_vocab

Vocab()

In [32]:
len(val_dataset)

1816

In [4]:
val_dataset[0]

(tensor([ 325,   30,    7,  541,  158,   13,  313,   32,  217,   15,    7,  610,
            8, 2278,  122,   40, 1766]),
 tensor([  27,   10,   29,  174,    9,   10,   29,  475, 1575,    6,   25,   21,
           16,   13,  313,   32,  264,    6,  260,    7,  610,    8, 2278,  122,
           40,    4,    7,  402,  355,   13,    7,  288,  115,   18,  702,  343,
            5,   63,  111,    6,   95,   12,   77,   15,   36,  382,    6,  194,
           12,  196,   19,  144,   16]))

In [47]:
question = "How can I apply?"
inputs = tokenizer(question)

ids = vocab.forward(inputs)
print(ids)

txt = vocab.lookup_tokens(ids)
print(txt)

# vocab.get_itos()[5896]

[142, 20, 767, 99, 1766]
['how', 'can', 'i', 'apply', '?']


In [36]:
txt = vocab.lookup_tokens([0])
txt

['<unk>']