In [15]:
import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache
import numpy as np
import torch as t
import tqdm
#functional
import torch.nn.functional as F

In [16]:
potential_moves = 10
cfg = HookedTransformerConfig(
    n_layers = 1,
    n_heads = 4,
    d_model = 128,
    d_head = 32,
    d_mlp = 512,
    act_fn = "relu",
    normalization_type=None,
    d_vocab=potential_moves+1,
    d_vocab_out=potential_moves,
    n_ctx=10,
    init_weights=True,
    device="cuda",
    seed = 999,
)

lr = 1e-3
weight_decay = 1e-4
test_train_split = 0.8
epochs = 100
batch_size = 32



In [17]:
#load npy file
np_data = np.load('data/moves.npy')
data = [sequence[i: i+potential_moves] for sequence in np_data for i in range(len(sequence)-potential_moves)]
labels = [sequence[i+potential_moves] for sequence in np_data for i in range(len(sequence)-potential_moves)]

print(len(data))
print(len(labels))
print(data[1])
print(labels)

460800
460800
[10 10 10 10 10 10 10 10 10  0]
[0, 1, 2, 3, 4, 6, 5, 8, 7, 9, 0, 1, 2, 3, 4, 6, 7, 8, 5, 9, 0, 1, 2, 3, 4, 8, 5, 6, 7, 9, 0, 1, 2, 3, 4, 8, 7, 6, 5, 9, 0, 1, 2, 3, 5, 4, 6, 8, 7, 9, 0, 1, 2, 3, 5, 4, 7, 8, 6, 9, 0, 1, 2, 3, 5, 6, 4, 8, 7, 9, 0, 1, 2, 3, 5, 6, 7, 8, 4, 9, 0, 1, 2, 3, 5, 8, 4, 6, 7, 9, 0, 1, 2, 3, 5, 8, 6, 4, 7, 9, 0, 1, 2, 3, 5, 8, 7, 4, 6, 9, 0, 1, 2, 3, 5, 8, 7, 6, 4, 9, 0, 1, 2, 3, 6, 4, 5, 8, 7, 9, 0, 1, 2, 3, 6, 4, 7, 8, 5, 9, 0, 1, 2, 3, 6, 8, 5, 4, 7, 9, 0, 1, 2, 3, 6, 8, 7, 4, 5, 9, 0, 1, 2, 3, 7, 4, 5, 8, 6, 9, 0, 1, 2, 3, 7, 4, 6, 8, 5, 9, 0, 1, 2, 3, 7, 6, 4, 8, 5, 9, 0, 1, 2, 3, 7, 6, 5, 8, 4, 9, 0, 1, 2, 3, 7, 8, 4, 6, 5, 9, 0, 1, 2, 3, 7, 8, 5, 4, 6, 9, 0, 1, 2, 3, 7, 8, 5, 6, 4, 9, 0, 1, 2, 3, 7, 8, 6, 4, 5, 9, 0, 1, 2, 4, 3, 5, 7, 6, 8, 9, 0, 1, 2, 4, 3, 5, 8, 6, 7, 9, 0, 1, 2, 4, 3, 6, 5, 8, 7, 9, 0, 1, 2, 4, 3, 6, 7, 5, 8, 9, 0, 1, 2, 4, 3, 6, 7, 8, 5, 9, 0, 1, 2, 4, 3, 6, 8, 5, 7, 9, 0, 1, 2, 4, 3, 8, 5, 6, 7, 9, 0, 1, 2, 4, 3, 8, 7, 6,

In [18]:
encoded_labels = F.one_hot(t.tensor(labels))
print(encoded_labels)
print(t.sum(encoded_labels, axis=1))

tensor([[1, 0, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0],
        ...,
        [1, 0, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 1]])
tensor([1, 1, 1,  ..., 1, 1, 1])


In [19]:
#data and labels as numpy arrays
data = np.array(data)
encoded_labels = np.array(encoded_labels)
#data and encoded_labels as tensors
data = t.from_numpy(data).to(cfg.device)
encoded_labels = t.from_numpy(encoded_labels).to(cfg.device)

In [20]:
#test train split
train_data = data[:int(len(data)*test_train_split)]
train_labels = encoded_labels[:int(len(data)*test_train_split)]
test_data = data[int(len(data)*test_train_split):]
test_labels = encoded_labels[int(len(data)*test_train_split):]

In [21]:
print(len(test_data))
print(len(test_labels))

92160
92160


In [22]:
def loss_fn(logits, labels):
    return t.nn.functional.cross_entropy(logits, labels)

In [23]:
train_losses = []
test_losses = []
model = HookedTransformer(cfg).to(cfg.device)
optimizer = t.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

for epoch in tqdm.tqdm(range(epochs)):
    for batch in range(0, len(train_data), batch_size):
        train_logits = model(train_data[batch:batch+batch_size])
        train_loss = loss_fn(train_logits, train_labels[batch:batch+batch_size])

        train_loss.backward()

        train_losses.append(train_loss.item())
        optimizer.step()
        optimizer.zero_grad()

        with t.inference_mode():
            test_logits = model(test_data)
            print(len(test_data))
            print(len(test_labels))
            test_loss = loss_fn(test_logits, test_labels)
            test_losses.append(test_loss.item())

        print(f"Epoch {epoch} | Train Loss: {train_loss.item()} | Test Loss: {test_loss.item()}")

Moving model to device:  cuda


  0%|          | 0/100 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 450.00 MiB (GPU 0; 7.78 GiB total capacity; 5.85 GiB already allocated; 329.81 MiB free; 5.88 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF