# Description

TODO
- link GitHub Repo
- do what's in the video for baseline
- try a different model hidden dimension (a different optimizer is also fine)
- describe results briefly

# Imports

In [70]:
# Add line so we don't have to reload notebooks for changes in imported modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
import torch

from helpers.load_data import load_data
from helpers.get_batch import get_batch
from helpers.estimate_loss import estimate_loss
from helpers.tokenizer import Tokenizer
from models.gpt import GPT

In [72]:
# Seed torch's default random number generator so runs are repeatable.
# The trailing semicolon suppresses the echoed Generator repr, whose memory
# address differs on every run and would otherwise create noisy notebook diffs.
torch.manual_seed(13);

<torch._C.Generator at 0x11cd56e10>

# Globals

In [73]:
# --- Batching ---
BATCH_SIZE = 32        # passed as batch_size to get_batch / estimate_loss
BLOCK_SIZE = 8         # passed as block_size to the model, batching and generation

# --- Model ---
N_EMBED = 32           # second positional argument to GPT (presumably the embedding width)

# --- Training schedule ---
LEARNING_RATE = 1e-3   # learning rate intended for the optimizer
MAX_ITERS = 3000       # number of training-loop iterations
EVAL_INTERVAL = 100    # loss is estimated every EVAL_INTERVAL iterations
EVAL_ITERS = 200       # passed as eval_iters to estimate_loss

# --- Hardware ---
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Prepare Data

In [74]:
# Load the raw corpus as a single string
text: str = load_data()

# Build the tokenizer from the corpus, then encode the whole text into a tensor of token ids
tokenizer = Tokenizer(text)
data = torch.tensor(tokenizer.encode(text))

# Create train and test sets: the first 90% of the tokens are used for training,
# the remainder is held out.
# The split index is derived from the encoded tensor (not the raw string) so the
# 90/10 ratio still holds if the tokenizer does not map one character to one token.
n = int(len(data) * 0.9)
train_data = data[:n]
test_data = data[n:]

# GPT

In [75]:
# Instantiate the model from the tokenizer's vocabulary size and the global
# hyperparameters, then move it to the selected device.
gpt = GPT(
    tokenizer.vocab_size,
    N_EMBED,
    BLOCK_SIZE,
    DEVICE,
).to(DEVICE)

## Training loop

In [67]:
# AdamW over all model parameters. The step size is taken from the LEARNING_RATE
# constant in the Globals cell (previously a hard-coded 1e-3), so tuning that
# constant actually changes training.
optimizer = torch.optim.AdamW(gpt.parameters(), lr=LEARNING_RATE)

# The loop variable is named `step` rather than `iter` so the built-in iter()
# is not shadowed in the kernel namespace after this cell has run.
for step in range(MAX_ITERS):

    # Every EVAL_INTERVAL steps, report the estimated train / validation loss
    if step % EVAL_INTERVAL == 0:
        train_loss, val_loss = estimate_loss(
            model=gpt,
            train_data=train_data,
            valid_data=test_data,
            block_size=BLOCK_SIZE,
            batch_size=BATCH_SIZE,
            eval_iters=EVAL_ITERS,
            device=DEVICE
        )
        print(f"Step {step}, Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")

    # Draw a fresh batch of inputs and targets from the training split
    xb, yb = get_batch(
        train_data,
        BLOCK_SIZE,
        BATCH_SIZE,
        device=DEVICE
    )

    # Forward pass, then a single optimisation step:
    # clear stale gradients, back-propagate the loss, update the parameters.
    logits, loss = gpt(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Step 0, Train loss: 4.3243, Val loss: 4.3317
Step 100, Train loss: 2.9656, Val loss: 3.0039
Step 200, Train loss: 2.6674, Val loss: 2.6713
Step 300, Train loss: 2.5423, Val loss: 2.5363
Step 400, Train loss: 2.4699, Val loss: 2.4794
Step 500, Train loss: 2.4326, Val loss: 2.4433
Step 600, Train loss: 2.3899, Val loss: 2.3782
Step 700, Train loss: 2.3569, Val loss: 2.3589
Step 800, Train loss: 2.3252, Val loss: 2.3468
Step 900, Train loss: 2.2952, Val loss: 2.3004
Step 1000, Train loss: 2.2888, Val loss: 2.2835
Step 1100, Train loss: 2.2595, Val loss: 2.2697
Step 1200, Train loss: 2.2476, Val loss: 2.2553
Step 1300, Train loss: 2.2230, Val loss: 2.2448
Step 1400, Train loss: 2.2173, Val loss: 2.2471
Step 1500, Train loss: 2.2056, Val loss: 2.2309
Step 1600, Train loss: 2.1813, Val loss: 2.2036
Step 1700, Train loss: 2.1654, Val loss: 2.2111
Step 1800, Train loss: 2.1667, Val loss: 2.1878
Step 1900, Train loss: 2.1642, Val loss: 2.1974
Step 2000, Train loss: 2.1590, Val loss: 2.1852
Step

In [69]:
# Start generation from a single token with id 0, as a (1, 1) integer tensor
# created directly on the target device.
start_token = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)

# Generate with max_len=500, take the first (only) sequence in the batch,
# and decode the token ids back to text.
sequence = gpt.generate(
    start_token,
    max_len=500,
    block_size=BLOCK_SIZE,
)[0].tolist()
print(tokenizer.decode(sequence))


'ORKICE:
O my sere where, and pare thees.


ALOUCHAFRULES:
To not tegUKith.

Ay.

WINPALIO:
Thes ayor hould fTill thomot but o was the be not Reravack itin.

DUKES:
WiHerage Ef:
Iher reariage of me plad weal thy to kisTA:
Sir,
Ad i nocht whe char hece cile;
UTIUS:
Is who stalnd. I ghar
Ack oull, to pme.
Going EYHow, if bere ito on't yriagh shat en copt hell beel!
Wher couves, is'm Ben aus loier fand olenfuil
By sloo,

O,-noch ont
In wily thind-nest ceath,
Hell ming;
And of worg'd veges,
No,

She
