In [79]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np

class Seq2SeqDataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length=512):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.tokenizer(
            '[START] ' + self.questions[idx] + ' [END]',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        answer = self.tokenizer(
            '[START] ' + self.answers[idx] + ' [END]',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return question['input_ids'].squeeze(0), answer['input_ids'].squeeze(0)

def collate_fn(batch):
    questions, answers = zip(*batch)
    questions = pad_sequence(questions, batch_first=True, padding_value=0)
    answers = pad_sequence(answers, batch_first=True, padding_value=0)
    return questions, answers[:, :-1], answers[:, 1:]

def get_datasets(tokenizer, batch_size=64):
    dataset = load_dataset("msaad02/brockport-gpt-4-qa")
    dataset = dataset['train'].to_pandas()

    context_raw = dataset['question'].to_list()
    target_raw = dataset['answer'].to_list()

    is_train_mask = np.random.uniform(size=(len(target_raw),)) < 0.8

    train_context = np.array(context_raw)[is_train_mask]
    train_target = np.array(target_raw)[is_train_mask]

    val_context = np.array(context_raw)[~is_train_mask]
    val_target = np.array(target_raw)[~is_train_mask]

    train_dataset = Seq2SeqDataset(train_context, train_target, tokenizer)
    val_dataset = Seq2SeqDataset(val_context, val_target, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_loader, val_loader

from torchtext.data.utils import get_tokenizer
# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')
train_loader, val_loader = get_datasets(tokenizer)


In [84]:
tokenizer("Hi, how can I aplpy?")

['hi', ',', 'how', 'can', 'i', 'aplpy', '?']

In [87]:
tokenizer(
    '[START] ' + "Hi, how can I aplpy?" + ' [END]',
    max_length=512,
    return_tensors='pt'
)

TypeError: _basic_english_normalize() got an unexpected keyword argument 'max_length'

In [82]:
next(iter(train_loader))

TypeError: _basic_english_normalize() got an unexpected keyword argument 'truncation'

In [1]:
"""
For some reason, most likely due to how the model is implemented, the ScratchModel
runs EXCEPTIONALLY slow. Again, I'm not sure why exactly, but the scratch model is
a small transformer model that should not be running this slow. This script aims to
re-implement the  ScratchModel in PyTorch using their optimized Transformer module
and see if that fixes the issue.

All the parameters will be the same, so we can just copy them over. The only thing
that will change is the model implementation and data loading.
"""

import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # prevent tensorflow logs

# # Set path to parent directory so we can import from other folders.
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from typing import List, Tuple
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------- Model ------------------------------------------------
EPOCHS = 13
BATCH_SIZE = 64
NUM_LAYERS = 6
D_MODEL = 512
DFF = 2048
NUM_HEADS = 8
DROPOUT_RATE = 0.1


# -------- Data ------------------------------------------------

dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        return context, target

def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    vocab = build_vocab_from_iterator(
        iterator = [token_generator], 
        specials=["<unk>", "<pad>", "<sos>", "<eos>"], 
        special_first=True, 
        min_freq=5
    )
    return vocab

# def collate_fn(data: List[Tuple[torch.Tensor, torch.Tensor]]):
#     tensors, targets = zip(*data)
#     features = pad_sequence(tensors, batch_first=True)
#     targets = torch.stack(targets)
#     return features, targets

def collate_batch(batch):
    contexts, targets = zip(*batch)
    pad_idx = vocab['<pad>']
    contexts = pad_sequence(contexts, padding_value=pad_idx, batch_first=True)
    targets = pad_sequence(targets, padding_value=pad_idx, batch_first=True)
    return contexts, targets

# Create a list of tuples (context, target)
data = list(zip(dataset['question'].tolist(), dataset['answer'].tolist()))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.85 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')

vocab = build_vocab(train_data + val_data, tokenizer)
vocab.set_default_index(vocab["<unk>"])

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)


# -------- Model ------------------------------------------------

class Transformer(nn.Module):
    def __init__(self, d_model, num_heads, num_layers, dim_feedforward, dropout_rate, activation, vocab_length, batch_first=True):
        super(Transformer, self).__init__()


        self.context_embedding = nn.Embedding(vocab_length, d_model)
        self.target_embedding = nn.Embedding(vocab_length, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout_rate,
            activation=activation,
            batch_first=batch_first
        )

    def forward(self, src, tgt):
        src = self.context_embedding(src)
        tgt = self.target_embedding(tgt)

        return self.transformer(src, tgt)
    

model = Transformer(
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    dim_feedforward=DFF,
    dropout_rate=DROPOUT_RATE,
    activation='relu',
    vocab_length=len(vocab)
).to(device)


# -------- Training ------------------------------------------------
pad_idx = vocab['<pad>']
sos_idx = vocab['<sos>']
eos_idx = vocab['<eos>']

# Define the loss function and optimizer
loss_fn = CrossEntropyLoss(ignore_index=pad_idx)
optimizer = Adam(model.parameters(), lr=0.0001)

# Define the training function
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for contexts, targets in train_loader:

        # Move tensors to the right device
        contexts = contexts.to(device)
        targets = targets.to(device)

        print("contexts", contexts)
        print("targets", targets[:, :-1])

        # Forward pass
        output = model(contexts, targets[:, :-1]) # Shifted targets for computing loss

        # # Compute the loss
        loss = loss_fn(output, targets[:, 1:])
        # loss = loss_fn(output.reshape(-1, output.size(-1)), targets[1:, :].reshape(-1)) # Shifted targets for computing loss
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    epoch_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")


contexts tensor([[  20,  235,   32,  ...,    1,    1,    1],
        [  30,  951,  590,  ...,    1,    1,    1],
        [ 325,   18,    7,  ...,    1,    1,    1],
        ...,
        [ 325, 3452,   14,  ...,    1,    1,    1],
        [ 325,  216,  365,  ...,    1,    1,    1],
        [ 325,   30,    7,  ...,    1,    1,    1]], device='cuda:0')
targets tensor([[  68,   16,   17,  ...,    1,    1,    1],
        [ 209,    4,  951,  ...,    1,    1,    1],
        [  17,  306,  365,  ...,    1,    1,    1],
        ...,
        [  17,  102,   14,  ...,    1,    1,    1],
        [ 261,    6,   25,  ...,    1,    1,    1],
        [  37,   11, 1513,  ...,    1,    1,    1]], device='cuda:0')
tensor([[[-1.5761,  0.0794, -0.7892,  ...,  1.2343,  0.1028,  1.0264],
         [-1.0959, -0.9691, -0.5410,  ...,  1.5932,  1.0933,  0.6204],
         [-1.1620, -0.1358, -1.1375,  ...,  1.3729,  0.2238,  0.7762],
         ...,
         [-1.3205, -0.5732, -0.2531,  ...,  1.3521, -0.1316,  0.4436],

RuntimeError: Expected target size [64, 512], got [64, 111]

In [3]:
output.shape

torch.Size([64, 111, 512])

In [6]:
def train(model, data_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    for contexts, targets in data_loader:
        contexts = contexts.to(device)
        targets = targets.to(device)

        output = model(contexts, targets[:, :-1])

        
        loss = loss_fn(output.reshape(-1, output.size(-1)), targets[:, 1:].reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)

# Training loop
for epoch in range(EPOCHS):
    epoch_loss = train(model, train_loader, optimizer, loss_fn, device)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

RuntimeError: the feature number of src and tgt must be equal to d_model

In [1]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np

class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        return context, target

def collate_batch(batch):
    contexts, targets = zip(*batch)
    pad_idx = vocab['<pad>']
    sos_idx = vocab['<sos>']
    eos_idx = vocab['<eos>']

    contexts = [torch.cat([torch.tensor([sos_idx]), context, torch.tensor([eos_idx])], dim=0) for context in contexts]
    targets = [torch.cat([torch.tensor([sos_idx]), target, torch.tensor([eos_idx])], dim=0) for target in targets]

    context_lens = torch.tensor([len(context) for context in contexts])
    target_lens = torch.tensor([len(target) for target in targets])

    contexts = pad_sequence(contexts, padding_value=pad_idx)
    targets = pad_sequence(targets, padding_value=pad_idx)

    return contexts, targets, context_lens, target_lens


def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    return build_vocab_from_iterator(token_generator, specials=["<unk>", "<pad>", "<sos>", "<eos>"], special_first=True)

# Load dataset
dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

# Create a list of tuples (context, target)
data = list(zip(dataset['question'].tolist(), dataset['answer'].tolist()))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.8 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab(train_data + val_data, tokenizer)

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)


In [2]:
tokenizer("How can I apply?")

['how', 'can', 'i', 'apply', '?']

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
EPOCHS = 13
BATCH_SIZE = 64
NUM_LAYERS = 6
D_MODEL = 512
DFF = 2048
NUM_HEADS = 8
DROPOUT_RATE = 0.1

model = nn.Transformer(
    d_model=D_MODEL,
    nhead=NUM_HEADS,
    num_encoder_layers=NUM_LAYERS,
    num_decoder_layers=NUM_LAYERS,
    dim_feedforward=DFF,
    dropout=DROPOUT_RATE,
    activation='relu',
).to(device)



In [25]:
import torch.optim as optim

# Setup the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Define a loss function, ignoring the padding index in the loss calculation
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def train(model, data_loader, optimizer, criterion, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)

            tgt_input = tgt[:-1, :]
            targets = tgt[1:, :]  # targets do not include the <sos> token

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_masks(src, tgt_input, pad_idx=vocab['<pad>'])
            optimizer.zero_grad()
            output = model(src, tgt_input, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, src_padding_mask)
            output = output.reshape(-1, output.shape[-1])

            loss = criterion(output, targets.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

def create_masks(src, tgt, pad_idx):
    src_seq_len = src.size(0)
    tgt_seq_len = tgt.size(0)

    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_seq_len).to(device)
    
    src_padding_mask = (src == pad_idx).transpose(0, 1).to(device)
    tgt_padding_mask = (tgt == pad_idx).transpose(0, 1).to(device)
    
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

# Now call the train function
train(model, train_loader, optimizer, criterion, EPOCHS)


RuntimeError: Token what not found and default index is not set
Exception raised from __getitem__ at /__w/text/text/pytorch/text/torchtext/csrc/vocab.cpp:43 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f6cc5d7dd87 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7f6cc5d2e75f in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #2: torchtext::Vocab::__getitem__(c10::basic_string_view<char> const&) const + 0x384 (0x7f6bb22418b4 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so)
frame #3: <unknown function> + 0x1dc5b (0x7f6c082a9c5b in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torchtext/_torchtext.so)
frame #4: <unknown function> + 0x3dee7 (0x7f6c082c9ee7 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/torchtext/_torchtext.so)
frame #5: /home/msaad/miniconda3/envs/thesis/bin/python() [0x525d17]
frame #6: _PyObject_MakeTpCall + 0x254 (0x502a14 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #7: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5541c3]
frame #8: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b51d]
frame #9: _PyEval_EvalFrameDefault + 0xfbd (0x50f5ed in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #10: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b0d0]
frame #11: _PyEval_EvalFrameDefault + 0xfbd (0x50f5ed in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #12: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b0d0]
frame #13: _PyEval_EvalFrameDefault + 0xfbd (0x50f5ed in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #14: /home/msaad/miniconda3/envs/thesis/bin/python() [0x57b75e]
frame #15: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5a6cea]
frame #16: _PyEval_EvalFrameDefault + 0x53b (0x50eb6b in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #17: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5c83fe]
frame #18: PyEval_EvalCode + 0x9f (0x5c7aff in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #19: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5e1683]
frame #20: _PyEval_EvalFrameDefault + 0x3a22 (0x512052 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #21: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #22: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #23: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #24: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #25: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #26: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5df096]
frame #27: _PyEval_EvalFrameDefault + 0x35e4 (0x511c14 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #28: /home/msaad/miniconda3/envs/thesis/bin/python() [0x55451f]
frame #29: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5541c3]
frame #30: PyObject_Call + 0x9d (0x53ef0d in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #31: _PyEval_EvalFrameDefault + 0x42da (0x51290a in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #32: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #33: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #34: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #35: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #36: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #37: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #38: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #39: _PyEval_EvalFrameDefault + 0x32d9 (0x511909 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #40: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5dc8ca]
frame #41: <unknown function> + 0x79e7 (0x7f6d5c0bb9e7 in /home/msaad/miniconda3/envs/thesis/lib/python3.11/lib-dynload/_asyncio.cpython-311-x86_64-linux-gnu.so)
frame #42: /home/msaad/miniconda3/envs/thesis/bin/python() [0x52405b]
frame #43: /home/msaad/miniconda3/envs/thesis/bin/python() [0x4bf4de]
frame #44: /home/msaad/miniconda3/envs/thesis/bin/python() [0x4c13c9]
frame #45: /home/msaad/miniconda3/envs/thesis/bin/python() [0x51bef7]
frame #46: _PyEval_EvalFrameDefault + 0x9353 (0x517983 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #47: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5c83fe]
frame #48: PyEval_EvalCode + 0x9f (0x5c7aff in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #49: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5e1683]
frame #50: /home/msaad/miniconda3/envs/thesis/bin/python() [0x51bef7]
frame #51: PyObject_Vectorcall + 0x31 (0x51bde1 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #52: _PyEval_EvalFrameDefault + 0x753 (0x50ed83 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #53: _PyFunction_Vectorcall + 0x173 (0x534f13 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #54: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5f33ef]
frame #55: Py_RunMain + 0x14a (0x5f2dfa in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #56: Py_BytesMain + 0x39 (0x5b6f49 in /home/msaad/miniconda3/envs/thesis/bin/python)
frame #57: <unknown function> + 0x29d90 (0x7f6d5d33fd90 in /lib/x86_64-linux-gnu/libc.so.6)
frame #58: __libc_start_main + 0x80 (0x7f6d5d33fe40 in /lib/x86_64-linux-gnu/libc.so.6)
frame #59: /home/msaad/miniconda3/envs/thesis/bin/python() [0x5b6d9f]


In [4]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np

# Load dataset
dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

# Create a list of tuples (context, target)
data = list(zip(dataset['question'].tolist(), dataset['answer'].tolist()))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.85 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')


def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    return build_vocab_from_iterator([token_generator], specials=["<unk>", "<pad>", "<sos>", "<eos>"], special_first=True, min_freq=5)


vocab = build_vocab(train_data + val_data, tokenizer)
vocab.set_default_index(vocab["<unk>"])


class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        return context, target
    

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

In [66]:
# def collate_batch(batch):
#     contexts, targets = zip(*batch)
#     pad_idx = vocab['<pad>']
#     sos_idx = vocab['<sos>']
#     eos_idx = vocab['<eos>']

#     contexts = [torch.cat([torch.tensor([sos_idx]), context, torch.tensor([eos_idx])], dim=0) for context in contexts]
#     targets = [torch.cat([torch.tensor([sos_idx]), target, torch.tensor([eos_idx])], dim=0) for target in targets]

#     context_lens = torch.tensor([len(context) for context in contexts])
#     target_lens = torch.tensor([len(target) for target in targets])

#     contexts = pad_sequence(contexts, padding_value=pad_idx)
#     targets = pad_sequence(targets, padding_value=pad_idx)

#     return contexts, targets, context_lens, target_lens


from typing import List, Tuple

i = 1
def collate_fn(data: List[Tuple[torch.Tensor, torch.Tensor]]):
    global i
    i += 1
    tensors, targets = zip(*data)
    features = pad_sequence(tensors, batch_first=True)
    targets = torch.stack(targets)
    return features, targets

In [76]:
data = train_dataset[0]

In [78]:
zip(*data)

<zip at 0x7f8b6026b940>

In [77]:
tensors, targets = zip(*data)
features = pad_sequence(tensors, batch_first=True)
targets = torch.stack(targets)

ValueError: too many values to unpack (expected 2)

In [67]:
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [73]:
a = next(iter(train_loader))

In [74]:
i

1

In [None]:
train_loader.

In [12]:
train_dataset[0]

(tensor([ 212,   20,  767,   95,   42,   19,  437,  287,    6,   69,  767,  359,
          491,  504, 1766]),
 tensor([ 35,   9,  44, 137,  19, 437, 287,   4,   7,  46, 160, 135,  18,  11,
         121, 310,   5,   9,  20,  80,  74,  48,   6,  12,  46, 258,  13, 380,
         195,   5,  27,  10,  29,  89,   6,  42,   9, 248,  26, 303,  19,  12,
          46, 218,  16]))

In [7]:
import torch
import pickle

# Assuming 'vocab' is your vocabulary object from torchtext
vocab_path = "vocab.pkl"

# Save the vocabulary
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)


In [8]:
# Load the vocabulary
with open(vocab_path, 'rb') as f:
    loaded_vocab = pickle.load(f)


In [10]:

print(" ".join(vocab.lookup_tokens(train_dataset[0][0].cpu().numpy().tolist())))
print(" ".join(vocab.lookup_tokens(train_dataset[0][1].cpu().numpy().tolist())))

where can i get help with class registration to ensure i maintain full-time status ?
if you need assistance with class registration , the academic success center is a fantastic resource . you can also reach out to your academic advisor for personalized guidance . we ' re here to help you stay on track with your academic goals !


In [11]:

print(" ".join(loaded_vocab.lookup_tokens(train_dataset[0][0].cpu().numpy().tolist())))
print(" ".join(loaded_vocab.lookup_tokens(train_dataset[0][1].cpu().numpy().tolist())))

where can i get help with class registration to ensure i maintain full-time status ?
if you need assistance with class registration , the academic success center is a fantastic resource . you can also reach out to your academic advisor for personalized guidance . we ' re here to help you stay on track with your academic goals !


In [9]:
loaded_vocab

Vocab()

In [32]:
len(val_dataset)

1816

In [4]:
val_dataset[0]

(tensor([ 325,   30,    7,  541,  158,   13,  313,   32,  217,   15,    7,  610,
            8, 2278,  122,   40, 1766]),
 tensor([  27,   10,   29,  174,    9,   10,   29,  475, 1575,    6,   25,   21,
           16,   13,  313,   32,  264,    6,  260,    7,  610,    8, 2278,  122,
           40,    4,    7,  402,  355,   13,    7,  288,  115,   18,  702,  343,
            5,   63,  111,    6,   95,   12,   77,   15,   36,  382,    6,  194,
           12,  196,   19,  144,   16]))

In [47]:
question = "How can I apply?"
inputs = tokenizer(question)

ids = vocab.forward(inputs)
print(ids)

txt = vocab.lookup_tokens(ids)
print(txt)

# vocab.get_itos()[5896]

[142, 20, 767, 99, 1766]
['how', 'can', 'i', 'apply', '?']


In [36]:
txt = vocab.lookup_tokens([0])
txt

['<unk>']