In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import torch

from dyck_k_generator import constants

In [None]:
# Choose the compute backend by priority: CUDA GPU first, then Apple MPS,
# and finally the CPU when no accelerator is reported as available.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
device

In [None]:
# Release cached allocator memory on whichever accelerator backend was
# selected; nothing is done when running on the CPU.
if device == "cuda:0":
    torch.cuda.empty_cache()
elif device == "mps":
    torch.mps.empty_cache()

In [None]:
# Number of entries taken from constants.BRACKETS when building the vocabulary.
k = 3

In [None]:
# Build the vocabulary string from the first k (key, value) entries of
# constants.BRACKETS, each key immediately followed by its value.
bracket_pairs = list(constants.BRACKETS.items())[:k]
VOCAB = "".join("".join(pair) for pair in bracket_pairs)
VOCAB

In [None]:
from transformer.dataset import DyckLanguageDataset

In [None]:
# Build the dataset from the JSONL sample file and the vocabulary, then move
# it to the selected device.
dataset_path = "data/dyck-3_50000-samples_40-len_p05.jsonl"
host_dataset = DyckLanguageDataset(dataset_path, VOCAB)
dataset = host_dataset.to(device)

In [None]:
from torch.utils.data import random_split

# 80/20 train/test split.
# The split is driven by an explicitly seeded generator so that a fresh
# "Restart & Run All" reproduces the same partition; without it random_split
# draws from the global RNG and the held-out set changes on every run.
SPLIT_SEED = 42

train_size = int(0.8 * len(dataset))
# Use the remainder so the two sizes always sum to len(dataset).
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Training loader: mini-batches of 16, reshuffled on every pass.
dl = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)

In [None]:
# Evaluation loader over the held-out split: mini-batches of 8, shuffled.
test_dl = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    shuffle=True,
)

# Manual Transformer + BERTViz


In [None]:
from transformer.transformer import TransformerClassifier, TransformerClassifierConfig

In [None]:
# Hyperparameters for the classifier, gathered in one mapping and unpacked
# into the config object.
# NOTE(review): in_dim = 40 + 2 presumably means max sequence length 40 plus
# two special tokens - confirm against the tokenizer.
model_hparams = dict(
    in_dim=40 + 2,
    d_model=128,
    n_heads=8,
    ff_dim=512,
    n_layers=6,
    n_classes=2,
)
model_config = TransformerClassifierConfig(**model_hparams)

In [None]:
# Instantiate the classifier from the configuration built above.
model = TransformerClassifier(model_config)

In [None]:
# Switch the model to training mode before optimisation starts.
model.train()

In [None]:
# Move the model to the same device that was selected for the dataset.
model.to(device)

In [None]:
import torch.optim as optim

# Adam (learning rate 1e-4) over all model parameters, paired with a
# cross-entropy criterion applied to the model outputs.
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)
crit = torch.nn.CrossEntropyLoss()

In [None]:
from tqdm.auto import tqdm

In [None]:
# Training loop: one optimisation step per mini-batch, with the mean loss of
# the last `log_every` batches and the running epoch accuracy printed
# periodically.
epochs = 1
log_every = 100  # number of batches between two progress reports

for epoch in range(epochs):
    # Re-assert training mode at the start of every epoch: if the evaluation
    # cell has already been run and this cell is executed again, the model
    # would otherwise silently keep training in eval mode.
    model.train()

    # Loss summed over the current logging window; reset after each report.
    running_loss = 0.0

    # Accuracy counters cover the whole epoch, not just the logging window.
    total_correct = 0
    total_samples = 0

    for i, data in enumerate(tqdm(dl)):
        # Each batch is a 3-tuple; its first element is not used here.
        _, labels, tokens = data

        optimizer.zero_grad()

        outputs = model(tokens)
        loss = crit(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Predicted class = index of the largest score along dim 1.
        predicted = outputs.argmax(dim=1)

        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

        # Running accuracy (in percent) over every batch seen so far this epoch.
        accuracy = (total_correct / total_samples) * 100

        if (i + 1) % log_every == 0:
            print(
                f"Epoch: {epoch + 1}, Loss: {running_loss / log_every}, Accuracy: {accuracy}"
            )
            running_loss = 0.0

In [None]:
# Switch the model to evaluation mode before measuring test performance.
model.eval()

In [None]:
# Evaluate on the held-out split and report accuracy and per-sample mean loss.

# Make the cell self-sufficient: do not rely on another cell having put the
# model into evaluation mode.
model.eval()

correct = 0
total = 0
total_loss = 0.0

criterion = torch.nn.CrossEntropyLoss()

# No gradients are needed for evaluation; this saves memory and computation.
with torch.no_grad():
    for batch in test_dl:
        # Each batch is a 3-tuple; its first element is not used here.
        _, labels, tokens = batch
        batch_size = labels.size(0)

        # Forward pass
        outputs = model(tokens)

        # The criterion returns the mean over the batch, so weight it by the
        # batch size; a smaller final batch then cannot skew the average.
        loss = criterion(outputs, labels)
        total_loss += loss.item() * batch_size

        # Predicted class = index of the largest output score along dim 1.
        predicted = outputs.argmax(dim=1)

        # Count total and correct predictions
        total += batch_size
        correct += (predicted == labels).sum().item()

# Per-sample average loss and accuracy in percent
avg_loss = total_loss / total
accuracy = 100 * correct / total

print(f"Accuracy of the model on the test data: {accuracy:.2f}%")
print(f"Average loss on the test data: {avg_loss:.4f}")

In [3]:
from transformers import AutoConfig, AutoModel

In [2]:
# Register the custom config/model pair with the transformers Auto* classes
# under the "transformer-checker" key.
# Both classes are imported from transformer.transformer in an earlier cell,
# which must have been executed in the current kernel before this one.
AutoConfig.register("transformer-checker", TransformerClassifierConfig)
AutoModel.register(TransformerClassifierConfig, TransformerClassifier)

NameError: name 'TransformerClassifierConfig' is not defined

In [None]:
# Mark the custom classes for use through the Auto* API.
# NOTE(review): presumably this is what makes the class source get uploaded
# with the checkpoint so it can be loaded with trust_remote_code=True - confirm
# against the transformers custom-model documentation.
TransformerClassifierConfig.register_for_auto_class()
TransformerClassifier.register_for_auto_class("AutoModel")

In [4]:
# Upload the model to the Hub repository "dyck-3-transformer".
# Requires `model` to exist in the current kernel (run the cells above first).
model.push_to_hub("dyck-3-transformer")

NameError: name 'model' is not defined

In [12]:
# Reload the published checkpoint through the Auto* API for visualisation.
model_viz = AutoModel.from_pretrained(
    "matiasmolinolo/dyck-3-transformer",
    # Requests attention outputs.
    # NOTE(review): whether the custom model honours this flag depends on its
    # own forward implementation - confirm in transformer.py.
    output_attentions=True,
    # Allows the model code stored in the repository to be executed locally;
    # only enable this for repositories you trust.
    trust_remote_code=True,
    # Re-fetch the files instead of reusing a cached copy.
    force_download=True,
)



config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

transformer.py:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

transformer.py:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

In [13]:
from transformer.dataset import DyckLanguageTokenizer

# Tokenizer over the three bracket pairs (), [] and {}.
tokenizer = DyckLanguageTokenizer("()[]{}")

In [14]:
# Sample input. Its brackets are NOT balanced: the "{" is closed by "}" while
# two "(" opened after it are still open.
inputs = "(([])){((}()))"
# Convert the string to token ids with max_len=40.
# NOTE(review): presumably the result is padded and wrapped in start/end
# tokens, giving 40 + 2 positions - confirm against DyckLanguageTokenizer.
tokens = tokenizer.tokenize(inputs, max_len=40)
tokens

tensor([[0., 3., 3., 5., 6., 4., 4., 7., 3., 3., 8., 3., 4., 4., 4., 2., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1.]])

In [15]:
# Forward pass through the reloaded model.
# NOTE(review): in the recorded run the result is a single (1, 2) tensor of
# class scores with no attention weights attached.
out = model_viz(tokens)

In [16]:
out

tensor([[-0.3131, -0.1897]], grad_fn=<AddmmBackward0>)

In [17]:
# NOTE(review): this does not yield attention weights. In the recorded run
# `out` is a bare (1, 2) tensor, so out[-1] just selects its last row (the two
# class scores). The model's forward pass has to return the per-layer
# attention tensors before they can be extracted here - confirm in
# transformer.py.
attn = out[-1]

In [18]:
attn

tensor([-0.3131, -0.1897], grad_fn=<SelectBackward0>)

In [19]:
# Map the token ids back to text; in the recorded run this round-trips to the
# original input string wrapped in a list.
decoded = tokenizer.decode(tokens)

In [20]:
decoded

['(([])){((}()))']

In [21]:
from bertviz import head_view

# Render the attention-head visualisation.
# NOTE(review): this call fails in the recorded run because `attn` holds the
# class scores rather than attention weights. Also, `tokens` is a tensor of
# ids; head_view presumably expects a list of token strings, one per sequence
# position - confirm against the bertviz documentation.
head_view(attn, tokens)

ValueError: The attention tensor does not have the correct number of dimensions. Make sure you set output_attentions=True when initializing your model.