In [3]:
from datasets import load_dataset
import torch.nn as nn
import torch
import pandas as pd
import numpy as np


dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

In [10]:


dataset[['question', 'answer']].to_numpy()

array([['Are there any areas on campus where my service animal is not allowed?',
        'SUNY Brockport is committed to ensuring the safety and well-being of all members of our community, including service animals. There are certain areas on campus where service animals may be restricted due to health, safety, or operational reasons. These can include food preparation areas, specific research labs, and other sensitive locations. If you need access to these areas, please contact the appropriate department representative and the ADA or Section 504 officer on campus to discuss your case.'],
       ['Are there opportunities for internships or research practicums in the sociology program?',
        "Absolutely! Many of our sociology students complete internships or research practicums as part of their major. These hands-on experiences are a fantastic way to apply what you've learned in the classroom to real-world situations, and they can be incredibly valuable when you're starting your car

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np

class Seq2SeqDataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length=512):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.tokenizer(
            '[START] ' + self.questions[idx] + ' [END]',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        answer = self.tokenizer(
            '[START] ' + self.answers[idx] + ' [END]',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return question['input_ids'].squeeze(0), answer['input_ids'].squeeze(0)

def collate_fn(batch):
    questions, answers = zip(*batch)
    questions = pad_sequence(questions, batch_first=True, padding_value=0)
    answers = pad_sequence(answers, batch_first=True, padding_value=0)
    return questions, answers[:, :-1], answers[:, 1:]

def get_datasets(tokenizer, batch_size=64):
    dataset = load_dataset("msaad02/brockport-gpt-4-qa")
    dataset = dataset['train'].to_pandas()

    context_raw = dataset['question'].to_list()
    target_raw = dataset['answer'].to_list()

    is_train_mask = np.random.uniform(size=(len(target_raw),)) < 0.8

    train_context = np.array(context_raw)[is_train_mask]
    train_target = np.array(target_raw)[is_train_mask]

    val_context = np.array(context_raw)[~is_train_mask]
    val_target = np.array(target_raw)[~is_train_mask]

    train_dataset = Seq2SeqDataset(train_context, train_target, tokenizer)
    val_dataset = Seq2SeqDataset(val_context, val_target, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_loader, val_loader

# Assuming you're using a tokenizer compatible with your model, e.g., BERT-based
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_loader, val_loader = get_datasets(tokenizer)


In [26]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np

class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        return context, target

def collate_batch(batch):
    contexts, targets = zip(*batch)
    pad_idx = vocab['<pad>']
    sos_idx = vocab['<sos>']
    eos_idx = vocab['<eos>']

    contexts = [torch.cat([torch.tensor([sos_idx]), context, torch.tensor([eos_idx])], dim=0) for context in contexts]
    targets = [torch.cat([torch.tensor([sos_idx]), target, torch.tensor([eos_idx])], dim=0) for target in targets]

    context_lens = torch.tensor([len(context) for context in contexts])
    target_lens = torch.tensor([len(target) for target in targets])

    contexts = pad_sequence(contexts, padding_value=pad_idx)
    targets = pad_sequence(targets, padding_value=pad_idx)

    return contexts, targets, context_lens, target_lens


def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    return build_vocab_from_iterator(token_generator, specials=["<unk>", "<pad>", "<sos>", "<eos>"], special_first=True)

# Load dataset
dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

# Create a list of tuples (context, target)
data = list(zip(dataset['question'].tolist(), dataset['answer'].tolist()))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.8 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab(train_data + val_data, tokenizer)

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)


In [27]:
tokenizer("How can I apply?")

['how', 'can', 'i', 'apply', '?']

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
EPOCHS = 13
BATCH_SIZE = 64
NUM_LAYERS = 6
D_MODEL = 512
DFF = 2048
NUM_HEADS = 8
DROPOUT_RATE = 0.1

model = nn.Transformer(
    d_model=D_MODEL,
    nhead=NUM_HEADS,
    num_encoder_layers=NUM_LAYERS,
    num_decoder_layers=NUM_LAYERS,
    dim_feedforward=DFF,
    dropout=DROPOUT_RATE,
    activation='relu',
).to(device)



In [25]:
import torch.optim as optim

# Setup the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Define a loss function, ignoring the padding index in the loss calculation
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def train(model, data_loader, optimizer, criterion, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            tgt_input = tgt[:-1, :]
            targets = tgt[1:, :]  # targets do not include the <sos> token

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_masks(src, tgt_input, pad_idx=vocab['<pad>'])
            optimizer.zero_grad()
            output = model(src, tgt_input, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, src_padding_mask)
            output = output.reshape(-1, output.shape[-1])

            loss = criterion(output, targets.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

def create_masks(src, tgt, pad_idx):
    src_seq_len = src.size(0)
    tgt_seq_len = tgt.size(0)

    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len).to(device)
    
    src_padding_mask = (src == pad_idx).transpose(0, 1).to(device)
    tgt_padding_mask = (tgt == pad_idx).transpose(0, 1).to(device)
    
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

# Now call the train function
train(model, train_loader, optimizer, criterion, EPOCHS)


RuntimeError: Token what not found and default index is not set
Exception raised from __getitem__ at /__w/text/text/pytorch/text/torchtext/csrc/vocab.cpp:43 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6cc5d7dd87 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7f6cc5d2e75f in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #2: torchtext::Vocab::__getitem__(c10::basic_string_view<char> const&) const + 0x384 (0x7f6bb22418b4 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so)
frame #3: <unknown function> + 0x1dc5b (0x7f6c082a9c5b in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torchtext/_torchtext.so)
frame #4: <unknown function> + 0x3dee7 (0x7f6c082c9ee7 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torchtext/_torchtext.so)
frame #5: /home/msaad/miniconda3/envs/thesis/bin/python() [0x525d17]
frame #6: _PyObject_MakeTpCall + 0x254 (0x502a14 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #7: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5541c3]
frame #8: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b51d]
frame #9: _PyEval_EvalFrameDefault + 0xfbd (0x50f5ed in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #10: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b0d0]
frame #11: _PyEval_EvalFrameDefault + 0xfbd (0x50f5ed in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #12: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b0d0]
frame #13: _PyEval_EvalFrameDefault + 0xfbd (0x50f5ed in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #14: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b75e]
frame #15: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5a6cea]
frame #16: _PyEval_EvalFrameDefault + 0x53b (0x50eb6b in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #17: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5c83fe]
frame #18: PyEval_EvalCode + 0x9f (0x5c7aff in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #19: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5e1683]
frame #20: _PyEval_EvalFrameDefault + 0x3a22 (0x512052 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #21: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #22: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #23: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #24: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #25: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #26: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5df096]
frame #27: _PyEval_EvalFrameDefault + 0x35e4 (0x511c14 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #28: /home/msaad/miniconda3/envs/thesis/bin/python() [0x55451f]
frame #29: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5541c3]
frame #30: PyObject_Call + 0x9d (0x53ef0d in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #31: _PyEval_EvalFrameDefault + 0x42da (0x51290a in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #32: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #33: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #34: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #35: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #36: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #37: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #38: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #39: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #40: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #41: <unknown function> + 0x79e7 (0x7f6d5c0bb9e7 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/lib-dynload/_asyncio.cpython-311-x86_64-linux-gnu.so)
frame #42: /home/msaad/miniconda3/envs/thesis/bin/python() [0x52405b]
frame #43: /home/msaad/miniconda3/envs/thesis/bin/python() [0x4bf4de]
frame #44: /home/msaad/miniconda3/envs/thesis/bin/python() [0x4c13c9]
frame #45: /home/msaad/miniconda3/envs/thesis/bin/python() [0x51bef7]
frame #46: _PyEval_EvalFrameDefault + 0x9353 (0x517983 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #47: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5c83fe]
frame #48: PyEval_EvalCode + 0x9f (0x5c7aff in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #49: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5e1683]
frame #50: /home/msaad/miniconda3/envs/thesis/bin/python() [0x51bef7]
frame #51: PyObject_Vectorcall + 0x31 (0x51bde1 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #52: _PyEval_EvalFrameDefault + 0x753 (0x50ed83 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #53: _PyFunction_Vectorcall + 0x173 (0x534f13 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #54: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5f33ef]
frame #55: Py_RunMain + 0x14a (0x5f2dfa in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #56: Py_BytesMain + 0x39 (0x5b6f49 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #57: <unknown function> + 0x29d90 (0x7f6d5d33fd90 in /lib/x86_64-linux-gnu/libc.so.6)
frame #58: __libc_start_main + 0x80 (0x7f6d5d33fe40 in /lib/x86_64-linux-gnu/libc.so.6)
frame #59: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5b6d9f]


In [29]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np

# Load dataset
dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

# Create a list of tuples (context, target)
data = list(zip(dataset['question'].tolist(), dataset['answer'].tolist()))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.85 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')


def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    return build_vocab_from_iterator([token_generator], specials=["<unk>", "<pad>", "<sos>", "<eos>"], special_first=True, min_freq=5)


vocab = build_vocab(train_data + val_data, tokenizer)
vocab.set_default_index(vocab["<unk>"])




class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        return context, target
    

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

In [30]:
train_dataset[0][0].cpu().numpy().tolist()


print(" ".join(vocab.lookup_tokens(train_dataset[0][0].cpu().numpy().tolist())))
print(" ".join(vocab.lookup_tokens(train_dataset[0][1].cpu().numpy().tolist())))

who should i contact if i have questions about the status of my transcript order at suny brockport ?
if you have any inquiries regarding your transcript order ' s status , you can reach out to credentials inc . at ( <unk> ) <unk> . they ' ll be able to assist you with any questions or concerns you might have about your order .


In [33]:
train_dataset[0][1].cpu().numpy().tolist()

[35,
 9,
 34,
 53,
 545,
 559,
 12,
 988,
 842,
 10,
 22,
 504,
 4,
 9,
 20,
 74,
 48,
 6,
 1818,
 3066,
 5,
 23,
 50,
 0,
 49,
 0,
 5,
 51,
 10,
 39,
 33,
 748,
 6,
 127,
 9,
 19,
 53,
 119,
 24,
 388,
 9,
 352,
 34,
 54,
 12,
 842,
 5]

In [31]:
len(train_dataset)

10288

In [32]:
len(val_dataset)

1816

In [4]:
val_dataset[0]

(tensor([ 325,   30,    7,  541,  158,   13,  313,   32,  217,   15,    7,  610,
            8, 2278,  122,   40, 1766]),
 tensor([  27,   10,   29,  174,    9,   10,   29,  475, 1575,    6,   25,   21,
           16,   13,  313,   32,  264,    6,  260,    7,  610,    8, 2278,  122,
           40,    4,    7,  402,  355,   13,    7,  288,  115,   18,  702,  343,
            5,   63,  111,    6,   95,   12,   77,   15,   36,  382,    6,  194,
           12,  196,   19,  144,   16]))

In [47]:
question = "How can I apply?"
inputs = tokenizer(question)

ids = vocab.forward(inputs)
print(ids)

txt = vocab.lookup_tokens(ids)
print(txt)

# vocab.get_itos()[5896]

[142, 20, 767, 99, 1766]
['how', 'can', 'i', 'apply', '?']


In [36]:
txt = vocab.lookup_tokens([0])
txt

['<unk>']