In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import torch
import tiktoken
from pathlib import Path

tokenizer = tiktoken.get_encoding("gpt2")

from gepeto.model import MODEL_ARCHITECTURES, GPTModel
from gepeto.token import token_ids_to_text, text_to_token_ids
from gepeto.train import (
    train_model_simple,
)
from gepeto.generate import generate
from gepeto.loader import create_dataloader_v1


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# Load the model Architecture from OpenAI's GPT2-Small

base_configs = MODEL_ARCHITECTURES['gpt2-small']

custom_configs = {
    "vocab_size": 50257,  # Vocabulary size
    "context_length": 256,  # The context size of our model will be diminshed due to hardware constraints
    "drop_rate": 0.1,  # Dropout rate
    "qkv_bias": False,  # Query-Key-Value bias
}

model_configs = base_configs | custom_configs

### Load data

In [8]:
file_path = Path('../data/the-verdict.txt')

with open(file_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

In [9]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20480
Tokens: 5146


### Split train and test

In [10]:
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [11]:
torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=model_configs['context_length'],
    stride=model_configs['context_length'],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=model_configs['context_length'],
    stride=model_configs['context_length'],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [12]:
model = GPTModel(model_configs)

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=4e-4, weight_decay=0.1)
num_epochs = 10

train_losses, val_losses, tokens_seen = train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="Every effor moves you",
    tokenizer=tokenizer,
)

Epoch 1 (Step 000000): Train loss 9.794, Val loss 9.909
Epoch 1 (Step 000005): Train loss 8.038, Val loss 8.324
Every effor moves you,,,,,,,,,,,,,.                                    
Epoch 2 (Step 000010): Train loss 6.598, Val loss 7.041
Epoch 2 (Step 000015): Train loss 5.996, Val loss 6.574
Every effor moves you, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and
Epoch 3 (Step 000020): Train loss 5.555, Val loss 6.443
Epoch 3 (Step 000025): Train loss 6.114, Val loss 7.800
Every effor moves you.                                                 
Epoch 4 (Step 000030): Train loss 4.264, Val loss 6.266
Epoch 4 (Step 000035): Train loss 4.176, Val loss 6.193
Every effor moves you.      "--I had been.            "I had been, and I had been, and he had been, and I had been the, I had been.
Epoch 5 (Step 000040): Train loss 3.386, Val loss 6.173
Every effor moves you know, and in a little of the end, I had been. "Oh, 

In [13]:
torch.manual_seed(123)

<torch._C.Generator at 0x7f5eb6788db0>

In [14]:
text_ids = text_to_token_ids("Every effort moves you", tokenizer).to(device)

token_ids = generate(
    model=model,
    idx=text_ids,
    max_new_tokens=15,
    context_size=model_configs['context_length'],
    top_k=25,
    temperature=1.4
)

In [15]:
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you as Hermia to do the picture for a smile that seen a curious of
