In [64]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
import torch
import tiktoken
from pathlib import Path

tokenizer = tiktoken.get_encoding("gpt2")

from src.model import GPTModel, TransformerDecoder
from src.token import token_ids_to_text, text_to_token_ids
from src.train import (
    train_model_simple,
)
from src.generate import generate
from src.loader import create_dataloader_v1


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [66]:
model_configs = {
    "vocab_size": 50257,  # Vocabulary size
    "context_length": 256,  # Context length
    "emb_dim": 768,  # Embedding dimension
    "n_heads": 12,  # Number of attention heads
    "n_layers": 12,  # Number of layers
    "drop_rate": 0.1,  # Dropout rate
    "qkv_bias": False,  # Query-Key-Value bias
}

### Load data

In [67]:

file_path = Path('data/the-verdict.txt')

with open(file_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

In [68]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5145


### Split train and test

In [69]:
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [70]:

torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=model_configs['context_length'],
    stride=model_configs['context_length'],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=model_configs['context_length'],
    stride=model_configs['context_length'],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [71]:
model = GPTModel(model_configs)

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=4e-4, weight_decay=0.1)
num_epochs = 10

train_losses, val_losses, tokens_seen = train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="Every effor moves you",
    tokenizer=tokenizer,
)

Epoch 1 (Step 000005): Train loss 8.441, Val loss 8.556
Every effor moves you,,,,,,,,,,,.                                      
Epoch 2 (Step 000010): Train loss 6.831, Val loss 7.171
Epoch 2 (Step 000015): Train loss 5.832, Val loss 6.529
Every effor moves you.                                                 
Epoch 3 (Step 000020): Train loss 5.170, Val loss 6.655
Epoch 3 (Step 000025): Train loss 4.045, Val loss 6.226
Every effor moves you know, and in a little--I to me, as his last word.     "Oh, as you know, as, in the picture to have. "I, as, and--and, he was his pictures
Epoch 4 (Step 000030): Train loss 3.435, Val loss 6.341
Epoch 4 (Step 000035): Train loss 2.944, Val loss 6.124
Every effor moves you know, and in the picture--I to the fact of the last word. Gisburn's an to see it was not to the fact of his pictures--I had always to put it, and, and down, and in the of the
Epoch 5 (Step 000040): Train loss 2.680, Val loss 6.215
Epoch 5 (Step 000045): Train loss 1.935, Val loss 6

In [72]:
torch.manual_seed(123)

<torch._C.Generator at 0x7fdc401361b0>

In [73]:
text_ids = text_to_token_ids("Every effort moves you", tokenizer).to(device)


token_ids = generate(
    model=model,
    idx=text_ids,
    max_new_tokens=15,
    context_size=model_configs['context_length'],
    top_k=25,
    temperature=1.4
)

In [74]:
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you of Hermia's tears rather a cheap genius--though a good fellow enough
