## Conv1d-GPT: Chatty Philosopher 1.8M

#### Conv1d-GPT: Nick Gerend, Jan 2025

### Imports

In [None]:
import torch
from train import Conv1dGPT, get_batch, estimate_loss, configure_optimizer
import os
import textwrap

### Character Level Tokens

In [2]:
# Read the whole corpus into memory as a single string.
with open('philosophy.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)

print(chars)
print(vocab_size)

['\t', '\n', ' ', '!', '"', '#', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}']
89


### Simple nanoGPT (Karpathy) style setup 

In [None]:
# Lookup tables between characters and integer token ids
# (a character's id is its position in the sorted vocabulary `chars`).
str_out_idx = dict((ch, i) for i, ch in enumerate(chars))
idx_out_str = dict(enumerate(chars))


def encode(s):
    """Return the list of integer token ids for the characters of `s`."""
    return [str_out_idx[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer token ids `l`."""
    return ''.join(idx_out_str[i] for i in l)


# Tokenize the full corpus into one long tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# First 90% of the tokens are used for training, the remainder for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

### Hyperparameters

In [None]:
# All tunable settings for the run, collected in one dict; the training loop
# also stores this dict in every checkpoint under 'model_args'.
hyp = {
    'batch_size': 64,  # independent sequences processed in parallel
    'block_size': 256,  # (max) context length for predictions
    'max_iters': 3000,  # number of training loop updates
    'eval_interval': 50,  # steps between loss reports / checkpoints
    'learning_rate': 3e-3,  # learning rate handed to the optimizer
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',  # 1 RTX 4080 GPU here
    'eval_iters': 200,  # samples to evaluate (passed to estimate_loss)
    'n_embd': 128,  # embedding dimensions
    'n_head': 6,  # attention heads
    'n_layer': 7,  # Conv1d-GPT block layers
    'dropout': 0.1,  # dropout rate for select layers
    'kernel_size': 3,  # Conv1d kernel size (dilate factor is 2**blocks so check against n_layer)
    'conv': True,  # enable Conv1d, set to False to test performance of boiler-plate GPT
    'weight_decay': 1e-1,  # AdamW weight decay, applied to parameters with >= 2 dims
    'betas': (0.9, 0.95),  # AdamW (beta1, beta2) coefficients
}

### Conv1d-GPT Model

In [None]:
# Fix the torch RNG seed before building the model and sampling batches.
torch.manual_seed(1337)

# Build the network from the hyperparameter dict, then move it to the target device.
model = Conv1dGPT(
    vocab_size,
    hyp['n_embd'],
    hyp['block_size'],
    hyp['n_head'],
    hyp['dropout'],
    hyp['conv'],
    hyp['kernel_size'],
    hyp['n_layer'],
    hyp['device'],
)
model = model.to(hyp['device'])

# Report the total parameter count in millions.
n_params = sum(p.numel() for p in model.parameters())
print(n_params/1e6, 'M parameters')

# Run one training batch through the untrained model to get the starting loss.
xb, yb = get_batch(train_data, val_data, 'train', hyp['block_size'], hyp['batch_size'], hyp['device'])
logits, loss = model(xb, yb)
print("Loss at step 0:", loss.item())

1.778905 M parameters
Loss at step 0: 4.524252891540527


### nanoGPT (Karpathy) style optimizer

In [None]:
def configure_optimizer(model, weight_decay, learning_rate, betas, device_type):
    """Build an AdamW optimizer with weight decay applied only to >=2-D parameters.

    NOTE: this definition shadows the `configure_optimizer` imported from
    `train` in the imports cell; from this cell onward the notebook uses this one.

    Args:
        model: module whose `named_parameters()` are optimized.
        weight_decay: decay coefficient for the decayed parameter group.
        learning_rate: learning rate passed to AdamW as `lr`.
        betas: `(beta1, beta2)` tuple passed to AdamW.
        device_type: device the model lives on, e.g. 'cpu', 'cuda', 'cuda:0'
            or a `torch.device`; the fused kernel is requested only for CUDA.

    Returns:
        The configured `torch.optim.AdamW` instance.
    """
    # Imported locally so the cell does not depend on `inspect` having been
    # imported elsewhere in the notebook (it is not in the imports cell).
    import inspect

    # only parameters that require grad are optimized
    trainable = [p for _, p in model.named_parameters() if p.requires_grad]
    # any parameter that is at least 2D is weight decayed, otherwise not,
    # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't
    decay_params = [p for p in trainable if p.dim() >= 2]
    nodecay_params = [p for p in trainable if p.dim() < 2]
    optim_groups = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': nodecay_params, 'weight_decay': 0.0}
    ]
    num_decay_params = sum(p.numel() for p in decay_params)
    num_nodecay_params = sum(p.numel() for p in nodecay_params)
    print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
    print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
    # create AdamW optimizer and use the fused version if it is available
    fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
    # Compare only the device family so 'cuda:0' and torch.device('cuda') also count as CUDA.
    on_cuda = str(device_type).split(':')[0] == 'cuda'
    use_fused = fused_available and on_cuda
    extra_args = dict(fused=True) if use_fused else dict()
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
    print(f"using fused AdamW: {use_fused}")
    return optimizer

# Build the optimizer for the model from the hyperparameter dict.
optimizer = configure_optimizer(
    model,
    weight_decay=hyp['weight_decay'],
    learning_rate=hyp['learning_rate'],
    betas=hyp['betas'],
    device_type=hyp['device'],
)

num decayed parameter tensors: 157, with 1,768,704 parameters
num non-decayed parameter tensors: 59, with 10,201 parameters
using fused AdamW: True


### Training Loop

In [None]:
# training loop, ~178 minutes
ckpt_dir = './conv1d_gpt_checkpoints'
# Create the checkpoint directory up front so the first save (at step 0)
# does not fail when the directory is missing.
os.makedirs(ckpt_dir, exist_ok=True)

last_step = hyp['max_iters'] - 1

# `step` rather than `iter`, so the builtin iter() is not shadowed for later cells.
for step in range(hyp['max_iters']):

    # every eval_interval steps (and on the final step) evaluate the loss on
    # the train and val sets and write a checkpoint
    if step % hyp['eval_interval'] == 0 or step == last_step:
        losses = estimate_loss(model, train_data, val_data, hyp['eval_iters'], hyp['block_size'], hyp['batch_size'], hyp['device'])
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'model_args': hyp,
            'iter_num': step,
            'val_loss': losses['val'],
        }
        # file name encodes the step and the val loss, with '.' replaced by '_'
        # in the loss part, e.g. ckpt_i50_v2_3725.pt
        ckpt_name = 'ckpt_i' + str(step) + f"_v{losses['val']:.4f}".replace('.','_') + '.pt'
        torch.save(checkpoint, os.path.join(ckpt_dir, ckpt_name))

    # sample a batch of training data
    xb, yb = get_batch(train_data, val_data, 'train', hyp['block_size'], hyp['batch_size'], hyp['device'])

    # forward pass, then clear old gradients, backpropagate and update the weights
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.5251, val loss 4.5252
step 50: train loss 2.3308, val loss 2.3725
step 100: train loss 2.0006, val loss 2.0542
step 150: train loss 1.7698, val loss 1.8474
step 200: train loss 1.6395, val loss 1.7166
step 250: train loss 1.5410, val loss 1.6125
step 300: train loss 1.4893, val loss 1.5587
step 350: train loss 1.4476, val loss 1.5223
step 400: train loss 1.4100, val loss 1.4808
step 450: train loss 1.3815, val loss 1.4632
step 500: train loss 1.3653, val loss 1.4438
step 550: train loss 1.3484, val loss 1.4242
step 600: train loss 1.3325, val loss 1.4143
step 650: train loss 1.3191, val loss 1.3999
step 700: train loss 1.3047, val loss 1.3904
step 750: train loss 1.2965, val loss 1.3779
step 800: train loss 1.2861, val loss 1.3683
step 850: train loss 1.2805, val loss 1.3700
step 900: train loss 1.2711, val loss 1.3554
step 950: train loss 1.2634, val loss 1.3540
step 1000: train loss 1.2565, val loss 1.3447
step 1050: train loss 1.2523, val loss 1.3418
step 1100: 

### Chatty Philosopher (And so,...)

In [None]:
# Conv1d-GPT: 3K iters, train loss 1.1517, val loss 1.2495
# Generate 500 new tokens, take the first sequence of the result and decode it to text.
# NOTE(review): model.eval() is not called in this notebook before sampling;
# confirm whether generate_and_so handles train/eval mode itself.
generated = model.generate_and_so(max_new_tokens=500)
cp = decode(generated[0].tolist())

# Wrap the sample to a fixed column width for display.
width = 50
print(textwrap.fill(cp + '...', width=width))

And so, the Cerecular life wide degrees will ask
the rat!-Mase chalicies have night, is other
times, and Still? For it definite thoughtanging
perashed by another, now pursues it to clum times
bescures, precius, and usufirs to observe enlark
contributions where to advance in pronounced
oneself, the scientification as an inseparable he
from the necessitys of a sufficien, and a
certainment him, for us impublic scobling world of
acdocrance with incidence need:" And the seat are
made predominan his partedne...
