In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import torch
import tiktoken
from pathlib import Path

tokenizer = tiktoken.get_encoding("gpt2")

from src.model import MODEL_ARCHITECTURES, GPTModel
from src.token import token_ids_to_text, text_to_token_ids
from src.train import (
    train_model_simple,
)
from src.generate import generate
from src.loader import create_dataloader_v1


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [30]:
base_configs = MODEL_ARCHITECTURES['gpt2-small']
custom_configs = {
    "vocab_size": 50257,  # Vocabulary size
    "context_length": 256,  # Context length
    "drop_rate": 0.1,  # Dropout rate
    "qkv_bias": False,  # Query-Key-Value bias
}

In [31]:
model_configs = base_configs | custom_configs

### Load data

In [33]:

file_path = Path('data/the-verdict.txt')

with open(file_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

In [34]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5145


### Split train and test

In [35]:
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [36]:

torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=model_configs['context_length'],
    stride=model_configs['context_length'],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=model_configs['context_length'],
    stride=model_configs['context_length'],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [37]:
model = GPTModel(model_configs)

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=4e-4, weight_decay=0.1)
num_epochs = 10

train_losses, val_losses, tokens_seen = train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="Every effor moves you",
    tokenizer=tokenizer,
)

  import pkg_resources


Epoch 1 (Step 000005): Train loss 8.545, Val loss 8.634
Every effor moves you,,,,,,,,,,,,,.                                    
Epoch 2 (Step 000010): Train loss 6.959, Val loss 7.227
Epoch 2 (Step 000015): Train loss 6.055, Val loss 6.620
Every effor moves you, and, and, and, and, and, and, and, and, and, and, and, and, and, and the, and, and, and, and, and, and, and, and, and, and,
Epoch 3 (Step 000020): Train loss 16.093, Val loss 16.200
Epoch 3 (Step 000025): Train loss 5.365, Val loss 6.402
Every effor moves you.              ", I had, and I had, and I had. I had the of the of the of the of the of the of the of the, and, and I
Epoch 4 (Step 000030): Train loss 5.121, Val loss 6.439
Epoch 4 (Step 000035): Train loss 4.575, Val loss 6.280
Every effor moves you know"I that he was--I was his I was his last and I was his I was his pictures, and I was his of the picture--as of the picture. "I was the picture and I was his I was his I
Epoch 5 (Step 000040): Train loss 4.313, Val loss 6.1

In [38]:
torch.manual_seed(123)

<torch._C.Generator at 0x7f5000dcba30>

In [39]:
text_ids = text_to_token_ids("Every effort moves you", tokenizer).to(device)


token_ids = generate(
    model=model,
    idx=text_ids,
    max_new_tokens=15,
    context_size=model_configs['context_length'],
    top_k=25,
    temperature=1.4
)

In [40]:
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you as his pictures with random rather a cheap genius-- that seen a _.
