In [8]:
import numpy as np

from settings import ModelSettings

# Toy "document": the integers 0..1099 stand in for a sequence of token ids.
context_length = ModelSettings.max_context_length
document = np.arange(0, 1100)
print(document)

[   0    1    2 ... 1097 1098 1099]


In [9]:
# Example: the targets (y) are the inputs (x) shifted by one position, so the
# element at index t of y is the "next token" for the element at index t of x.
example_context_length = 10
x, y = (document[offset:offset + example_context_length] for offset in (0, 1))
print(x, y, sep="\n")

[0 1 2 3 4 5 6 7 8 9]
[ 1  2  3  4  5  6  7  8  9 10]


In [10]:
import torch
import os

# Settings read by get_mini_batch below.
data_dir = "tokenized_data"  # directory containing train.bin and test.bin
device = "cpu"
batch_size = 8  # sequences per mini-batch
block_size = 256  # smaller context size


def get_mini_batch(split, max_tokens=10_000):
    """Sample a random mini-batch of (input, next-token target) windows from disk.

    Reads the module-level settings ``data_dir``, ``block_size``,
    ``batch_size`` and ``device``.

    Args:
        split: ``'train'`` reads ``train.bin``; any other value reads
            ``test.bin``.
        max_tokens: Windows are drawn only from the first ``max_tokens``
            tokens of the file. Defaults to 10_000, the cap that used to be
            hard-coded here.

    Returns:
        A tuple ``(x, y)`` of contiguous int64 tensors, each of shape
        ``(batch_size, block_size)`` and placed on ``device``. ``y[b, t]`` is
        the token that follows ``x[b, t]`` in the file.

    Raises:
        ValueError: If fewer than ``block_size + 1`` tokens are available, so
            not even one (input, target) window can be formed.
    """
    # We recreate np.memmap every batch to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    file_name = 'train.bin' if split == 'train' else 'test.bin'
    data = np.memmap(os.path.join(data_dir, file_name), dtype=np.uint16, mode='r')

    usable_tokens = min(len(data), max_tokens)
    if usable_tokens <= block_size:
        raise ValueError(
            f"{file_name} provides {usable_tokens} usable tokens, but at least "
            f"{block_size + 1} are needed for block_size={block_size}"
        )

    # Highest start index is usable_tokens - block_size - 1 (randint's upper
    # bound is exclusive), so every window of block_size + 1 tokens is in range.
    starts = torch.randint(usable_tokens - block_size, (batch_size,)).tolist()

    # Read each window once with one extra token; inputs and targets are the
    # same window offset by one position.
    windows = torch.stack([
        torch.from_numpy(data[start:start + block_size + 1].astype(np.int64))
        for start in starts
    ])
    # .contiguous() gives x and y their own dense storage, as before, so
    # downstream .view() calls keep working.
    x = windows[:, :-1].contiguous()
    y = windows[:, 1:].contiguous()

    # Compare the device *type* so that e.g. "cuda:0" is pinned as well.
    if torch.device(device).type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

In [11]:
# Smoke test: draw one training batch and print the input and target shapes.
x, y = get_mini_batch(split="train")
print(*(batch.shape for batch in (x, y)))

torch.Size([8, 256]) torch.Size([8, 256])


In [12]:
from model import ChatModel
from settings import ModelSettings

# Build the model from the shared settings. ChatModel receives these values
# positionally, so the order of the names below is significant.
model = ChatModel(
    *(
        getattr(ModelSettings, setting_name)
        for setting_name in (
            "vocabulary_size",
            "embedding_size",
            "embedding_dropout",
            "attention_dropout",
            "max_context_length",
            "ff_size_multiplier",
            "ff_dropout",
            "transformer_blocks",
            "attention_heads",
        )
    )
)

In [13]:
# Training loop with early stopping on held-out loss.
#
# Early stopping used to run on every step (`step % 1 == 0` is always true) and
# compared the loss of a single training mini-batch, so three noisy steps in a
# row were enough to end the run. It now evaluates every `eval_interval` steps
# on the "test" split, averaged over several batches.
#
# NOTE(review): if loss.backward() raises "one of the variables needed for
# gradient computation has been modified by an inplace operation", the cause is
# inside the model's forward pass, not in this loop. Running once with
# torch.autograd.set_detect_anomaly(True) points at the offending operation.
max_steps = 50000
eval_interval = 100  # optimisation steps between held-out evaluations
eval_batches = 10  # mini-batches averaged into one validation estimate

best_loss = float("inf")
patience = 3  # number of evaluations to wait
min_delta = 0.05  # minimum improvement
patience_counter = 0

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model.train()
for step in range(max_steps):
    xb, yb = get_mini_batch("train")

    logits, loss = model(xb, yb)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % eval_interval == 0:
        loss_num = loss.item()
        print(f"step {step}, loss {loss_num:.4f}")

        # Estimate held-out loss without building autograd graphs and with
        # dropout disabled, then switch back to training mode.
        model.eval()
        with torch.no_grad():
            val_loss = 0.0
            for _ in range(eval_batches):
                xv, yv = get_mini_batch("test")
                val_loss += model(xv, yv)[1].item()
            val_loss /= eval_batches
        model.train()
        print(f"step {step}, val loss {val_loss:.4f}")

        if best_loss - val_loss > min_delta:
            # Meaningful improvement: remember it and checkpoint the weights.
            best_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "mini_model_training.pth")
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Early stopping triggered")
            break

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [8, 256, 768]], which is output 0 of AddBackward0, is at version 24; expected version 23 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).