In [1]:
"""
For some reason, most likely due to how the model is implemented, the ScratchModel
runs EXCEPTIONALLY slow. Again, I'm not sure why exactly, but the scratch model is
a small transformer model that should not be running this slow. This script aims to
re-implement the  ScratchModel in PyTorch using their optimized Transformer module
and see if that fixes the issue.

All the parameters will be the same, so we can just copy them over. The only thing
that will change is the model implementation and data loading.
"""

import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # prevent tensorflow logs

# # Set path to parent directory so we can import from other folders.
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from typing import List, Tuple
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------- Model ------------------------------------------------
EPOCHS = 20
BATCH_SIZE = 64
NUM_LAYERS = 6
D_MODEL = 512
DFF = 2048
NUM_HEADS = 8
DROPOUT_RATE = 0.1


# -------- Data ------------------------------------------------

dataset = load_dataset("msaad02/brockport-gpt-4-qa")['train'].to_pandas()

class Seq2SeqDataset(Dataset):
    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

        self.sos_idx = vocab['[START]']
        self.eos_idx = vocab['[END]']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        context = torch.tensor([self.vocab[token] for token in self.tokenizer(context)])
        target = torch.tensor([self.vocab[token] for token in self.tokenizer(target)])
        target = torch.cat([torch.tensor([self.sos_idx]), target, torch.tensor([self.eos_idx])])
        return context, target

def build_vocab(data, tokenizer):
    token_generator = (token for _, sent in data for token in tokenizer(sent))
    vocab = build_vocab_from_iterator(
        iterator = [token_generator], 
        specials=["[UNK]", "[PAD]", "[START]", "[END]"],
        special_first=True, 
        min_freq=5
    )
    return vocab

def collate_batch(batch):
    contexts, targets = zip(*batch)
    pad_idx = vocab['[PAD]']
    contexts = pad_sequence(contexts, padding_value=pad_idx, batch_first=True)
    targets = pad_sequence(targets, padding_value=pad_idx, batch_first=True)
    return contexts, targets

# Create a list of tuples (context, target)
context = dataset['question'].tolist()
target = dataset['answer'].tolist()

data = list(zip(context, target))
np.random.shuffle(data)

# Randomly split data into train and validation
split_idx = int(0.85 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

# Define tokenizer and vocabulary
tokenizer = get_tokenizer('basic_english')

vocab = build_vocab(train_data + val_data, tokenizer)
vocab.set_default_index(vocab["[UNK]"])

# Create datasets
train_dataset = Seq2SeqDataset(train_data, vocab, tokenizer)
val_dataset = Seq2SeqDataset(val_data, vocab, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch, drop_last=True)


# -------- Model ------------------------------------------------
class NoamOptim(object):
    "Optimizer wrapper for learning rate scheduling."
    # https://colab.research.google.com/github/jaygala24/pytorch-implementations/blob/master/Attention%20Is%20All%20You%20Need.ipynb#scrollTo=pmwvcO8zpNeT
    def __init__(self, optimizer, d_model, factor, n_warmup_steps):
        self.optimizer = optimizer
        self.d_model = d_model
        self.factor = factor
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0
    
    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.n_steps += 1
        lr = self.get_lr()
        for p in self.optimizer.param_groups:
            p['lr'] = lr
        self.optimizer.step()
    
    def get_lr(self):
        return self.factor * (
            self.d_model ** (-0.5)
            * min(self.n_steps ** (-0.5), self.n_steps * self.n_warmup_steps ** (-1.5))
        )
    

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dtype, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
        self.dtype = dtype

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x).type(self.dtype)


class Transformer(nn.Module):
    def __init__(self, d_model, num_heads, num_layers, dim_feedforward, dropout_rate, activation, vocab_length, dtype, batch_first=True):
        super(Transformer, self).__init__()

        self.context_embedding = nn.Embedding(vocab_length, d_model, dtype=dtype)
        self.pos_encoder = PositionalEncoding(d_model, dtype=dtype, dropout=dropout_rate, max_len=256)

        self.target_embedding = nn.Embedding(vocab_length, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout_rate,
            activation=activation,
            batch_first=batch_first,
            dtype=dtype
        )

        self.classifier = nn.Linear(d_model, vocab_length, dtype=dtype)

    def forward(self, src, tgt, tgt_mask, src_key_padding_mask = None, tgt_key_padding_mask = None):
        src = self.context_embedding(src)
        tgt = self.target_embedding(tgt)

        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)

        if src_key_padding_mask is None and tgt_key_padding_mask is None:
            # For inference on single examples
            out = self.transformer(src, tgt, tgt_mask=tgt_mask)
        else:
            # For training (or inference if batching)
            out = self.transformer(
                src, tgt,
                tgt_mask=tgt_mask,
                src_key_padding_mask=src_key_padding_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=src_key_padding_mask                   
            )

        out = self.classifier(out)

        return out
    

model = Transformer(
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    dim_feedforward=DFF,
    dropout_rate=DROPOUT_RATE,
    activation='relu',
    vocab_length=len(vocab),
    dtype=torch.float32
).to(device)


# -------- Masking ------------------------------------------------
# Helper function to create a mask of size 'sz'
def generate_square_subsequent_mask(sz: int):
    mask = (torch.triu(torch.ones(sz, sz)) == 0).transpose(0, 1)
    return mask


# -------- Training ------------------------------------------------
pad_idx = vocab['[PAD]']
sos_idx = vocab['[START]']
eos_idx = vocab['[END]']

# Define the loss function and optimizer
loss_fn = CrossEntropyLoss(ignore_index=pad_idx)#, label_smoothing=0.1)
# optimizer = Adam(model.parameters(), lr=0.0001)

optimizer = NoamOptim(
    optimizer=Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9),
    d_model=D_MODEL,
    factor=1,
    n_warmup_steps=500
)

# Define the training function
def train(model, data_loader, optimizer, loss_fn, device):
    global out, tgt_output, src, tgt, tgt_input

    model.train()
    total_loss = 0

    for contexts, targets in train_loader:
        # Move tensors to the right device
        src = contexts.to(device)
        tgt = targets.to(device)

        tgt_input = tgt[:, :-1]  # All tokens except the last (remove <eos>)
        tgt_output = tgt[:, 1:]  # All tokens except the first (remove <sos>)

        # Generate masks
        tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        src_key_padding_mask = (src == pad_idx)
        tgt_key_padding_mask = (tgt_input == pad_idx)

        # Forward pass
        out = model(src, tgt_input, tgt_mask, src_key_padding_mask, tgt_key_padding_mask)

        # Calculate the loss
        loss = loss_fn(out.transpose(1,2), tgt_output) # make out in shape (N, L, C) and tgt_output in shape (N, L)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)


def evaluate(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for contexts, targets in data_loader:
            src = contexts.to(device)
            tgt = targets.to(device)

            tgt_input = tgt[:, :-1]  # All tokens except the last (remove <eos>)
            tgt_output = tgt[:, 1:]  # All tokens except the first (remove <sos>)

            # Generate masks
            tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device)
            src_key_padding_mask = (src == pad_idx)
            tgt_key_padding_mask = (tgt_input == pad_idx)

            # Forward pass
            out = model(src, tgt_input, tgt_mask, src_key_padding_mask, tgt_key_padding_mask)

            # Calculate the loss
            loss = loss_fn(out.transpose(1,2), tgt_output)
            total_loss += loss.item()

    return total_loss / len(data_loader)


# Training loop
for epoch in range(EPOCHS):
    epoch_loss = train(model, train_loader, optimizer, loss_fn, device)
    val_loss = evaluate(model, val_loader, loss_fn, device)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}")
    # print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Epoch 1, Loss: 6.4581, Val Loss: 5.8376
Epoch 2, Loss: 5.7990, Val Loss: 6.1304
Epoch 3, Loss: 6.1353, Val Loss: 6.1445
Epoch 4, Loss: 6.1337, Val Loss: 6.1327
Epoch 5, Loss: 6.1238, Val Loss: 6.1249


KeyboardInterrupt: 

In [None]:
import torch

def predict_sentence(model, sentence, vocab, tokenizer, max_len=50, device='cuda'):
    model.eval()
    model.to(device)

    src = tokenizer(sentence)
    src_indices = vocab.lookup_indices(src)
    src_tensor = torch.tensor(src_indices).unsqueeze(0).to(device)

    pad_idx = vocab['[PAD]']
    start_idx = vocab['[START]']

    out = torch.full((1, max_len), fill_value=pad_idx, dtype=torch.long).to(device)
    out[:, 0] = start_idx

    for i in range(1, max_len):  # Start at 1 since we already placed the [START] token
        tgt_mask = generate_square_subsequent_mask(i+1).to(device)  # Mask for current target length
        output = model(src_tensor, out[:, :i+1], tgt_mask=tgt_mask)
        next_symbol = output[:, -1:].argmax(-1)  # Predict the next token based on the last position
        out[:, i] = next_symbol

    result_tokens = vocab.lookup_tokens(out[0].tolist())
    # Filter out any tokens after an "[END]" token
    end_idx = result_tokens.index('[END]') if '[END]' in result_tokens else len(result_tokens)
    return " ".join(result_tokens[:end_idx])

In [None]:
predict_sentence(
    model=model,
    sentence="How can I apply to SUNY Brockport?",
    vocab=vocab,
    tokenizer=tokenizer,
    max_len=50
)

'[START] is for , , , is , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,'

In [None]:
for i in range(10):
    i += 50

    sentence = " ".join(vocab.lookup_tokens(train_dataset.__getitem__(i)[0].tolist()))

    response = predict_sentence(
        model=model,
        sentence=sentence,
        vocab=vocab,
        tokenizer=tokenizer,
        max_len=50
    )

    print(sentence)
    print(response, "\n\n")


where can i purchase a parking permit for suny brockport ' s performance venues ?
[START] is for , , , is , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , 


when should i see a healthcare provider for a sore throat ?
[START] is for , , , is , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , 


how can i get in touch with the suny brockport ' s registrar ' s office ?
[START] is for , , , is , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , 


i completed my graduate degree at another institution , how can i apply for professional certification ?
[START] is for , , , is , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , 


what are the goals of brockport summer learning , and how do they align with the rochester model for high-quality summer learning ?
[START] is for , , , is , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,

In [6]:
model.parameters

<bound method Module.parameters of Transformer(
  (context_embedding): Embedding(5898, 512)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (target_embedding): Embedding(5898, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): La

In [7]:
sum(p.numel() for p in model.parameters())

53205770