In [None]:
%load_ext autoreload
%autoreload 2

from GPTDatasetV1 import GPTDatasetV1
from datetime import datetime
import tiktoken
import torch
from torch.utils.tensorboard import SummaryWriter

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

from custom_modules import (
    TransformerLM,
    AdamW,
    cross_entropy,
)

In [None]:
# Tokenizer used throughout the notebook: tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# Adjacent string literals are concatenated verbatim, so the second fragment
# must carry its own leading space; without it the text reads "terracesof".
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    " of someunknownPlace."
)

# The sample contains the literal special token, so it is explicitly
# allow-listed for encoding.
tokens = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(tokens)

# Round-trip check: decode the token ids back into text.
strings = tokenizer.decode(tokens)
print(strings)

In [None]:
# Read the whole training corpus into memory as one UTF-8 decoded string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()


In [None]:
# Model hyperparameters. Later cells read "context_length" for the dataloader
# window and pass every entry to the TransformerLM constructor.
hyperparams = dict(
    d_model=512,
    num_layers=4,
    num_heads=16,
    d_ff=1344,
    rope_theta=1e4,
    context_length=256,
)

In [None]:
# Vocabulary size comes straight from the tokenizer.
vocab_size = tokenizer.n_vocab

# Build the training loader over the raw corpus. Each sample is limited to the
# model's context length; shuffling is off and the stride is 1.
dataloader = GPTDatasetV1.create_dataloader(
    raw_text,
    max_length=hyperparams["context_length"],
    stride=1,
    batch_size=8,
    shuffle=False,
    device=device,
)

In [None]:
# The keys of `hyperparams` (d_model, num_layers, num_heads, d_ff, rope_theta,
# context_length) are exactly the remaining TransformerLM keyword arguments,
# so the mapping is unpacked instead of repeating each entry by hand.
model = TransformerLM(vocab_size=vocab_size, device=device, **hyperparams)

# Optimizer is built with its default settings over all model parameters.
optimizer = AdamW(model.parameters())

# Training objective used by the training loop.
loss_fn = cross_entropy

In [None]:
def train_one_epoch(epoch_index, tb_writer, loss_fn, optimizer, model,
                    data_loader=None, log_interval=1000):
    """Run one optimisation pass over a data loader and log windowed losses.

    For every batch the gradients are zeroed, the model is called on the
    inputs, ``loss_fn(outputs, labels)`` is back-propagated and the optimizer
    is stepped. The mean loss of each window of ``log_interval`` batches is
    printed and written to ``tb_writer`` under the ``'Loss/train'`` tag.

    A trailing window shorter than ``log_interval`` is reported as well, so an
    epoch with fewer than ``log_interval`` batches still produces a log entry
    and a meaningful return value instead of finishing silently with 0.

    Args:
        epoch_index: Zero-based epoch number; used to offset the global step
            passed to ``tb_writer``.
        tb_writer: Object exposing ``add_scalar(tag, value, step)``.
        loss_fn: Callable ``(outputs, labels) -> loss`` where ``loss``
            provides ``backward()`` and ``item()``.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        model: Callable mapping a batch of inputs to outputs.
        data_loader: Sized iterable of ``(inputs, labels)`` pairs. When
            omitted, the notebook-level ``dataloader`` variable is used.
        log_interval: Number of batches averaged per report (default 1000).

    Returns:
        Mean loss of the most recently reported window, or ``0.`` if the
        loader produced no batches.

    Raises:
        ValueError: If ``log_interval`` is smaller than 1.
    """
    if log_interval < 1:
        raise ValueError("log_interval must be a positive integer")
    if data_loader is None:
        # Backward-compatible default: the loader defined at notebook level.
        data_loader = dataloader

    def report(window_sum, window_size, batches_seen):
        """Print and log the mean loss of one window; return that mean."""
        mean_loss = window_sum / window_size  # loss per batch
        print('  batch {} loss: {}'.format(batches_seen, mean_loss))
        # Global step: batches of all previous epochs plus those seen so far.
        tb_x = epoch_index * len(data_loader) + batches_seen
        tb_writer.add_scalar('Loss/train', mean_loss, tb_x)
        return mean_loss

    running_loss = 0.
    last_loss = 0.
    window_size = 0
    batches_seen = 0

    # enumerate(..., start=1) yields the 1-based batch count used in reports.
    for batches_seen, (inputs, labels) in enumerate(data_loader, start=1):
        # Zero the gradients for every batch so they do not accumulate.
        optimizer.zero_grad()

        # Make predictions for this batch.
        outputs = model(inputs)

        # Compute the loss and its gradients.
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust the learnable weights.
        optimizer.step()

        # Accumulate and report once a full window has been gathered.
        running_loss += loss.item()
        window_size += 1
        if window_size == log_interval:
            last_loss = report(running_loss, window_size, batches_seen)
            running_loss = 0.
            window_size = 0

    # Report whatever is left over after the last full window.
    if window_size:
        last_loss = report(running_loss, window_size, batches_seen)

    return last_loss

In [None]:
# Give every run its own TensorBoard log directory, keyed by its start time.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
tb_writer = SummaryWriter('runs/the_verdict_{}'.format(timestamp))

# Sanity check: show which device the model parameters live on before training.
print(next(model.parameters()).device)

try:
    final_loss = train_one_epoch(epoch_index=0, tb_writer=tb_writer, loss_fn=loss_fn, optimizer=optimizer, model=model)
finally:
    # Flush the writer so the logged scalars reach disk even if training is
    # interrupted; flush (rather than close) keeps it usable for later epochs.
    tb_writer.flush()

# Keep the epoch's returned loss as the cell's displayed value.
final_loss