In [17]:
from google.colab import drive

# Where Google Drive is attached inside the Colab filesystem.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

In [None]:
# Fetch the project sources into the working directory.
# - `--quiet` silences git's progress output (it goes to stderr, so redirecting
#   stdout alone does not hide it); `--depth 1` skips history we do not need.
# - any stale `transformer` directory is removed first so the cell can be re-run.
!rm -rf transformer && git clone --quiet --depth 1 https://github.com/n1teshy/transformer transformer
# `transformer/*` does not match dotfiles such as `.git`, so a plain `rmdir`
# would fail on the non-empty directory; copy the visible files (merging into
# anything already present) and then remove the whole clone.
!cp -r transformer/* . && rm -rf transformer
# Make sure the checkpoint directory exists and show what is already in it.
!mkdir -p drive/MyDrive/checkpoints/poet2
!ls drive/MyDrive/checkpoints/poet2

In [1]:
import os

import torch
from torch.optim import AdamW

from core.data.generator import GeneratorDataset
from core.models import Generator
from core.utils.bpe import Tokenizer
from core.utils.configs import DecoderConfig, GeneratorDataConfig

ModuleNotFoundError: No module named 'core'

In [2]:
# ---- data ----
batch_size = 64
context = 512
train_cache = "drive/MyDrive/datasets/poems_cache_34k/train"
val_cache = "drive/MyDrive/datasets/poems_cache_34k/val"
# 40 '#' characters separated by single spaces.
sample_delimiter = " ".join("#" * 40)

# ---- model ----
no_blocks, no_heads = 3, 8
model_dim = 768
model_context = 512

# ---- training ----
epochs = 10
learning_rate = 1e-3
checkpoints_dir = "drive/MyDrive/checkpoints/poet2"

In [None]:
# Load the tokenizer model from the path below.
tokenizer = Tokenizer()
tokenizer.load("tokenizer/poet2_tokenizer.model")

# Both splits share the batch size and padding id; only the cache directory differs.
_shared_data_kwargs = dict(batch_size=batch_size, pad_id=tokenizer.pad_id)
train_data_conf = GeneratorDataConfig(cache_dir=train_cache, **_shared_data_kwargs)
val_data_conf = GeneratorDataConfig(cache_dir=val_cache, **_shared_data_kwargs)

train_dataset = GeneratorDataset(train_data_conf)
val_dataset = GeneratorDataset(val_data_conf)

In [None]:
# Architecture sizes come from the config cell; vocabulary size and the
# special-token ids come from the loaded tokenizer.
model_conf = DecoderConfig(
    vocab_size=tokenizer.size,
    context=model_context,
    model_dim=model_dim,
    no_heads=no_heads,
    no_blocks=no_blocks,
    dropout=0.2,
    train_mode=True,
    pad_id=tokenizer.pad_id,
    sos_id=tokenizer.sos_id,
    eos_id=tokenizer.eos_id,
)
model = Generator(model_conf)
# model.load_state_dict(torch.load())
model = model.to("cuda")

# Report the model size in millions of parameters.
n_params = sum(param.numel() for param in model.parameters())
print("model has %.2fmn parameters" % (n_params / 1e6,))


@torch.no_grad()
def get_val_loss() -> float:
    """Compute the loss of the global ``model`` on the next validation batch.

    Runs with gradient tracking disabled. The model is switched to eval mode
    and is left in eval mode on return, so callers that continue training must
    call ``model.train()`` again. When ``val_dataset`` is exhausted
    (``next_batch()`` returns ``None``) it is reset and read once more.

    Returns:
        The loss of the batch as a Python float.

    Raises:
        RuntimeError: if the validation dataset yields no batch even right
            after being reset (i.e. it appears to be empty).
    """
    model.eval()
    batch = val_dataset.next_batch()
    if batch is None:
        # Exhausted: rewind and retry exactly once.
        val_dataset.reset()
        batch = val_dataset.next_batch()
        if batch is None:
            # Fail with a clear message instead of an opaque unpacking TypeError.
            raise RuntimeError(
                "validation dataset returned no batch after reset()"
            )
    x, y = batch
    # The model returns a pair whose second element is the loss.
    _, loss = model(x, y)
    return loss.item()


def save_model(t_loss: float, v_loss: float):
    """Save the global ``model``'s state dict under ``checkpoints_dir``.

    The file name encodes, in order: train loss, validation loss, learning
    rate, number of blocks, number of heads, model dimension and model context,
    e.g. ``3.10-3.25-0.001-3-8-768-512.pth``.

    Args:
        t_loss: training loss to record in the file name.
        v_loss: validation loss to record in the file name.
    """
    # The learning rate uses %g: with %.2f a typical value such as 0.001 was
    # written as "0.00", so the name no longer identified the run.
    name = "%.2f-%.2f-%g-%d-%d-%d-%d.pth" % (
        t_loss,
        v_loss,
        learning_rate,
        no_blocks,
        no_heads,
        model_dim,
        model_context,
    )
    # Create the target directory if needed so a long run does not fail at
    # its first checkpoint.
    os.makedirs(checkpoints_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(checkpoints_dir, name))

In [5]:
# AdamW over every model parameter, using the learning rate from the config cell.
optimizer = AdamW(params=model.parameters(), lr=learning_rate)

In [2]:
# Loss levels a new checkpoint has to beat; refreshed on every save.
best_t_loss, best_v_loss = 4, 4
# Minimum improvement required to save, and the smoothing window (in batches).
min_loss_improv, loss_window = 0.2, 128
# Exponential moving averages of the train / validation loss
# (None until the first batch has been seen).
mt_loss, mv_loss = None, None
ema_weight = 1 / loss_window

for epoch in range(epochs):
    batches_processed = 0
    # next_batch() returns None once the dataset is exhausted.
    while (batch := train_dataset.next_batch()) is not None:
        # get_val_loss() leaves the model in eval mode, so re-enable training
        # mode on every step.
        model.train()
        x, y = batch
        _, loss = model(x, y)
        batches_processed += 1
        t_loss = loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Validate after the update so the training graph is not kept around
        # during the validation forward pass.
        v_loss = get_val_loss()

        # Seed each average with the first observation, then smooth.
        mt_loss = (
            t_loss
            if mt_loss is None
            else ema_weight * t_loss + (1 - ema_weight) * mt_loss
        )
        mv_loss = (
            v_loss
            if mv_loss is None
            else ema_weight * v_loss + (1 - ema_weight) * mv_loss
        )
        print(
            "%d-%d. t-loss: %.2f -> %.2f, v-loss: %.2f -> %.2f"
            % (epoch, batches_processed, t_loss, mt_loss, v_loss, mv_loss)
        )

        # Save only when the window has filled this epoch, the smoothed train
        # loss improved enough since the last save, and the smoothed validation
        # loss is not trailing the train loss by min_loss_improv or more.
        if (
            batches_processed >= loss_window
            and best_t_loss - mt_loss >= min_loss_improv
            and mv_loss - mt_loss < min_loss_improv
        ):
            save_model(mt_loss, mv_loss)
            best_t_loss, best_v_loss = mt_loss, mv_loss
            print("saved with losses: %.2f, %.2f" % (mt_loss, mv_loss))

    # Rewind the exhausted training set; without this every epoch after the
    # first would see no batches at all.
    train_dataset.reset()

NameError: name 'epochs' is not defined

In [None]:
# Stream a sample from the model, decoding and printing one token at a time.
# Put the model in eval mode first (the config above enables dropout) and turn
# off gradient tracking, so sampling behaves the same whether or not the
# training cell ran before this one.
model.eval()
with torch.no_grad():
    for token in model.generate():
        print(tokenizer.decode([token]), end="")