In [1]:
"""
For some reason, most likely due to how the model is implemented, the ScratchModel
runs EXCEPTIONALLY slow. Again, I'm not sure why exactly, but the scratch model is
a small transformer model that should not be running this slow. This script aims to
re-implement the  ScratchModel in PyTorch using their optimized Transformer module
and see if that fixes the issue.

All the parameters will be the same, so we can just copy them over. The only thing
that will change is the model implementation and data loading.
"""

import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # prevent tensorflow logs

# # Set path to parent directory so we can import from other folders.
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from typing import List, Tuple
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------- Model ------------------------------------------------
EPOCHS = 5
BATCH_SIZE = 64
NUM_LAYERS = 6
D_MODEL = 512
DFF = 2048
NUM_HEADS = 8
DROPOUT_RATE = 0.1


# -------- Data ------------------------------------------------

dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

        self.sos_idx = vocab['[START]']
        self.eos_idx = vocab['[END]']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        target = torch.cat([torch.tensor([self.sos_idx]), target, torch.tensor([self.eos_idx])])
        return context, target

def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    vocab = build_vocab_from_iterator(
        iterator = [token_generator], 
        specials=["[UNK]", "[PAD]", "[START]", "[END]"],
        special_first=True, 
        min_freq=5
    )
    return vocab

def collate_batch(batch):
    contexts, targets = zip(*batch)
    pad_idx = vocab['[PAD]']
    contexts = pad_sequence(contexts, padding_value=pad_idx, batch_first=True)
    targets = pad_sequence(targets, padding_value=pad_idx, batch_first=True)
    return contexts, targets

# Create a list of tuples (context, target)
context = dataset['question'].tolist()
target = dataset['answer'].tolist()

data = list(zip(context, target))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.85 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')

vocab = build_vocab(train_data + val_data, tokenizer)
vocab.set_default_index(vocab["[UNK]"])

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)


# -------- Model ------------------------------------------------
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dtype, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
        self.dtype = dtype

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x).type(self.dtype)


class Transformer(nn.Module):
    def __init__(self, d_model, num_heads, num_layers, dim_feedforward, dropout_rate, activation, vocab_length, dtype, batch_first=True):
        super(Transformer, self).__init__()

        self.context_embedding = nn.Embedding(vocab_length, d_model, dtype=dtype)
        self.pos_encoder = PositionalEncoding(d_model, dtype=dtype, dropout=dropout_rate, max_len=256)

        self.target_embedding = nn.Embedding(vocab_length, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout_rate,
            activation=activation,
            batch_first=batch_first,
            dtype=dtype
        )

        self.classifier = nn.Linear(d_model, vocab_length, dtype=dtype)

    def forward(self, src, tgt):
        src = self.context_embedding(src)
        tgt = self.target_embedding(tgt)

        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)

        out = self.transformer(src, tgt)
        out = self.classifier(out)

        return out
    

model = Transformer(
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    dim_feedforward=DFF,
    dropout_rate=DROPOUT_RATE,
    activation='relu',
    vocab_length=len(vocab),
    dtype=torch.float32
).to(device)


# -------- Training ------------------------------------------------
pad_idx = vocab['[PAD]']
sos_idx = vocab['[START]']
eos_idx = vocab['[END]']

# Define the loss function and optimizer
loss_fn = CrossEntropyLoss(ignore_index=pad_idx)
optimizer = Adam(model.parameters(), lr=0.0001)


# Define the training function
def train(model, data_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    for contexts, targets in train_loader:
        # Move tensors to the right device
        src = contexts.to(device)
        tgt = targets.to(device)

        tgt_input = tgt[:, :-1]  # All tokens except the last (remove <eos>)
        tgt_output = tgt[:, 1:]  # All tokens except the first (remove <sos>)

        # Forward pass
        out = model(src, tgt_input)

        # Calculate the loss
        loss = loss_fn(out.reshape(-1, out.shape[-1]), tgt_output.reshape(-1))
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)


# Training loop
for epoch in range(EPOCHS):
    epoch_loss = train(model, train_loader, optimizer, loss_fn, device)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

Epoch 1, Loss: 6.3367
Epoch 2, Loss: 5.0624
Epoch 3, Loss: 4.4954
Epoch 4, Loss: 4.1974
Epoch 5, Loss: 3.9810


In [86]:
import torchtext

def predict_sentence(model: Transformer, sentence: str, vocab: torchtext.vocab.Vocab, tokenizer, max_len: int = 50):
    model.eval()

    src = tokenizer(sentence)
    src = vocab.lookup_indices(src)
    src = torch.tensor(src).unsqueeze(0).to(device)

    pad_idx = vocab['[PAD]']
    start_idx = vocab['[START]']

    out = torch.zeros((1, max_len+1)).type_as(src.data)
    out.fill_(pad_idx)
    out[0, 0] = start_idx

    ctx = model.context_embedding(src)
    ctx = model.pos_encoder(ctx)

    next_symbol = start_idx

    for i in range(0, max_len+1):
        out[0][i] = next_symbol

        tgt = model.target_embedding(out)
        tgt = model.pos_encoder(tgt)

        tgt = model.transformer(ctx, tgt)
        tgt = model.classifier(tgt)

        next_symbol = torch.argmax(tgt[0,0]).item()

    return " ".join(vocab.lookup_tokens(out[0].tolist()))

In [87]:
predict_sentence(
    model=model,
    sentence="How can I apply to SUNY Brockport?",
    vocab=vocab,
    tokenizer=tokenizer,
    max_len=50
)

'[START] no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no'

In [88]:
for i in range(10):
    i += 50

    sentence = " ".join(vocab.lookup_tokens(train_dataset.__getitem__(i)[0].tolist()))

    response = predict_sentence(
        model=model,
        sentence=sentence,
        vocab=vocab,
        tokenizer=tokenizer,
        max_len=50
    )

    print(sentence)
    print(response, "\n\n")


when are new [UNK] of the port released ?
[START] no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no 


how does the selection process work for the fast program and what happens if i ' m selected ?
[START] no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no 


what support does suny brockport offer to english students facing financial emergencies ?
[START] no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no 


how can i get in touch with professor [UNK] [UNK] for academic advising ?
[START] no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no no 


what are the benefits of attending scholars day ?
[START

In [84]:
model.parameters

<bound method Module.parameters of Transformer(
  (context_embedding): Embedding(5898, 512)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (target_embedding): Embedding(5898, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): La

In [85]:
sum(p.numel() for p in model.parameters())

53205770