## GPT: Chatty Philosopher 1.8M

In [10]:
import torch
from train import Conv1dGPT, get_batch, estimate_loss, configure_optimizer
import os
import textwrap

### Character Level Tokens

In [2]:
# Read the whole corpus into memory as a single string.
with open('philosophy.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Character-level vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)
print(vocab_size)

['\t', '\n', ' ', '!', '"', '#', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}']
89


### Simple nanoGPT (Karpathy) style setup

In [3]:
# Lookup tables between characters and integer token ids (id = position in `chars`).
idx_out_str = dict(enumerate(chars))
str_out_idx = {ch: i for i, ch in idx_out_str.items()}


def encode(s):
    """Return the list of integer token ids for string `s`, one id per character."""
    return [str_out_idx[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer token ids `l`."""
    return ''.join(idx_out_str[i] for i in l)


# Tokenise the full corpus, then split it: first 90% for training, last 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

### Hyperparameters

In [5]:
# Every tunable setting lives in this one dict; the training loop also stores it
# in each checkpoint under 'model_args'.
hyp = {
    'batch_size': 64,        # independent sequences processed in parallel
    'block_size': 256,       # (max) context length for predictions
    'max_iters': 3000,       # number of training loop updates
    'eval_interval': 50,     # steps between loss reports / checkpoints
    'learning_rate': 3e-3,   # initial learning rate
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',  # 1 RTX 4080 GPU here
    'eval_iters': 200,       # samples to evaluate
    'n_embd': 128,           # embedding dimensions
    # NOTE(review): 128 is not evenly divisible by 6 -- confirm Conv1dGPT sizes its heads as intended.
    'n_head': 6,             # attention heads
    'n_layer': 9,            # GPT block layers (increased from 7 to reach about the same parameter count)
    'dropout': 0.1,          # dropout rate for select layers
    'kernel_size': 3,        # Conv1d kernel size (dilate factor is 2**blocks so check against n_layer)
    'conv': False,           # enable Conv1d; False tests the boiler-plate GPT
    'weight_decay': 1e-1,    # weight decay coefficient handed to the optimizer (not a learning-rate schedule)
    'betas': (0.9, 0.95),    # optimizer (beta1, beta2) moment coefficients
}

### GPT Model (A/B test against the Conv1d model)

In [6]:
# Fix the torch RNG seed before building the model.
torch.manual_seed(1337)

# Build the network from the hyperparameter dict and move it to the target device.
model = Conv1dGPT(
    vocab_size,
    hyp['n_embd'],
    hyp['block_size'],
    hyp['n_head'],
    hyp['dropout'],
    hyp['conv'],
    hyp['kernel_size'],
    hyp['n_layer'],
    hyp['device'],
)
model = model.to(hyp['device'])

# Report the total parameter count in millions.
n_params = sum(p.numel() for p in model.parameters())
print(n_params / 1e6, 'M parameters')

# Sanity check: loss of the untrained model on one training batch.
xb, yb = get_batch(
    train_data, val_data, 'train',
    hyp['block_size'], hyp['batch_size'], hyp['device'],
)
logits, loss = model(xb, yb)
print("Loss at step 0:", loss.item())

1.827673 M parameters
Loss at step 0: 4.515688896179199


### nanoGPT (Karpathy) style optimizer

In [7]:
# Build the optimizer via the project helper, driven entirely by the hyperparameter dict.
optimizer = configure_optimizer(
    model,
    hyp['weight_decay'],
    hyp['learning_rate'],
    hyp['betas'],
    hyp['device'],
)

num decayed parameter tensors: 192, with 1,815,808 parameters
num non-decayed parameter tensors: 66, with 11,865 parameters
using fused AdamW: True


### Training Loop

In [None]:
# training loop, ~218 minutes (author's estimate)

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first save at step 0.
ckpt_dir = './gpt_checkpoints'
os.makedirs(ckpt_dir, exist_ok=True)

# `step` rather than `iter`, so the builtin iter() is not shadowed in the notebook namespace.
for step in range(hyp['max_iters']):

    # Every `eval_interval` steps, and on the final step, estimate the train/val
    # loss, report it, and write a checkpoint.
    is_last_step = step == hyp['max_iters'] - 1
    if step % hyp['eval_interval'] == 0 or is_last_step:
        losses = estimate_loss(model, train_data, val_data, hyp['eval_iters'],
                               hyp['block_size'], hyp['batch_size'], hyp['device'])
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'model_args': hyp,
            'iter_num': step,
            'val_loss': losses['val'],
        }
        # File name encodes the step and the validation loss, with the decimal
        # point replaced by '_' (e.g. ckpt_i50_v2_8280.pt).
        val_tag = f"_v{losses['val']:.4f}".replace('.', '_')
        torch.save(checkpoint, os.path.join(ckpt_dir, 'ckpt_i' + str(step) + val_tag + '.pt'))

    # sample a batch of training data
    xb, yb = get_batch(train_data, val_data, 'train', hyp['block_size'], hyp['batch_size'], hyp['device'])

    # forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.5177, val loss 4.5176
step 50: train loss 2.8095, val loss 2.8280
step 100: train loss 2.5179, val loss 2.5457
step 150: train loss 2.4401, val loss 2.4713
step 200: train loss 2.4072, val loss 2.4400
step 250: train loss 2.3899, val loss 2.4239
step 300: train loss 2.3531, val loss 2.3888
step 350: train loss 2.2428, val loss 2.2830
step 400: train loss 2.1300, val loss 2.1790
step 450: train loss 2.0325, val loss 2.0813
step 500: train loss 1.9345, val loss 1.9953
step 550: train loss 1.8477, val loss 1.9147
step 600: train loss 1.7695, val loss 1.8478
step 650: train loss 1.6994, val loss 1.7831
step 700: train loss 1.6548, val loss 1.7301
step 750: train loss 1.6049, val loss 1.6789
step 800: train loss 1.5571, val loss 1.6367
step 850: train loss 1.5299, val loss 1.6088
step 900: train loss 1.5041, val loss 1.5750
step 950: train loss 1.4758, val loss 1.5455
step 1000: train loss 1.4589, val loss 1.5347
step 1050: train loss 1.4404, val loss 1.5139
step 1100: 

### Chatty Philosopher (And so,...)

In [11]:
# GPT: 3K iters
# NOTE(review): the final train/val loss values were left blank in the original
# comment -- fill them in from the completed training log.

# Sample with the model explicitly in eval mode and with autograd disabled, so
# the result does not depend on whatever train/eval state the model was left in
# (e.g. after an interrupted training run). The previous mode is restored afterwards.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        # Take the first element of the generated result and decode its token ids to text.
        cp = decode(model.generate_and_so(max_new_tokens=500)[0].tolist())
finally:
    model.train(was_training)

# Wrap the sample to a fixed column width for display.
width = 50
print('\n'.join(textwrap.wrap(cp + '...', width=width)))

And so, fron so to help.      Who may independ it
found no louds of remains be that only thrat about
to datom the will, if the irrrelige town rarguar
equal. If any proporiation, be short of child
generical existence at miseriably civil to
applaray portemportion of expressives are true
forbid and Steduch, thereby the pores of concape,
and their case understood and him, more into a
proficies of sculpture, matheme each and sen
degution with it? Own false at they harms with
away him priora for a conception...
