### Calliope - POETRY LLM

[Calliope Repository](https://github.com/peppermintcoding/Calliope)

    Here rise to life again, dead poetry!
    Let it, O holy Muses, for I am yours,
    And here Calliope, strike a higher key,
    Accompanying my song with that sweet air
    which made the wretched Magpies feel a blow
    that turned all hope of pardon to despair
Dante, "Purgatorio", Canto I, lines 7 to 12

[Calliope Wikipedia](https://en.wikipedia.org/wiki/Calliope)

In [None]:
!git clone https://github.com/peppermintcoding/Calliope.git
# !pip install -q -r "Calliope/requirements.txt"
!pip install -q tiktoken # on colab only need tiktoken
# refresh the linker cache with the NVIDIA library directory so torch.compile can find libcuda.so
!ldconfig /usr/lib64-nvidia

In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive; the dataset and checkpoint paths used later in
# this notebook are written relative to the working directory as '../gdrive/...'.
drive.mount('/gdrive')

In [None]:
import math
import os
from tqdm import tqdm
import numpy as np
import torch

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (16, 8)

### Training

In [None]:
# poor man's data loader: map the token file read-only instead of loading it into RAM
# NOTE(review): np.memmap interprets the file as raw uint16 values; if train.npy was
# written with np.save, its header bytes would be read as tokens — confirm how the
# file was produced.
train_data = np.memmap(
    '../gdrive/MyDrive/Calliope/train.npy',
    mode='r',
    dtype=np.uint16,
)
print(f"Number of trainings token: {len(train_data):,}")
def get_batch():
    """Sample a random batch of training windows from ``train_data``.

    Reads the module-level ``train_data``, ``model_args["block_size"]``,
    ``batch_size`` and ``device``.

    Returns:
        A tuple ``(x, y)`` of int64 tensors, each of shape
        ``(batch_size, block_size)`` and moved to ``device``. ``y`` contains
        the same windows as ``x`` shifted one position to the right.
    """
    block_size = model_args["block_size"]
    # the largest start index is len - block_size - 1, so the shifted target
    # window (start + 1 ... start + block_size) still ends inside the data
    starts = torch.randint(len(train_data) - block_size, (batch_size,)).tolist()
    x = torch.stack([
        torch.from_numpy(train_data[s:s + block_size].astype(np.int64))
        for s in starts
    ])
    y = torch.stack([
        torch.from_numpy(train_data[s + 1:s + 1 + block_size].astype(np.int64))
        for s in starts
    ])
    if 'cuda' in device:
        # pin arrays x,y, which allows us to move them to GPU asynchronously
        # (non_blocking=True); matching on the substring also covers indexed
        # devices such as 'cuda:0'
        x = x.pin_memory().to(device, non_blocking=True)
        y = y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

In [None]:
"""
# Calliope-150m -> batch_size: 8
model_args = {
    "n_layer": 16,
    "n_head": 24,
    "n_embd": 768,
    "dropout": 0.1,
    "bias": False,
    "block_size": 1024,
}
# Calliope-67m -> batch_size: 24
model_args = {
    "n_layer": 4,
    "n_head": 16,
    "n_embd": 768,
    "dropout": 0.1,
    "bias": False,
    "block_size": 512,
}
# Calliope-250m -> batch_size: 16
model_args = {
    "n_layer": 16,
    "n_head": 32,
    "n_embd": 1024,
    "dropout": 0.1,
    "bias": False,
    "block_size": 512,
}
"""
model_args = {
    "n_layer": 16,
    "n_head": 32,
    "n_embd": 1024,
    "dropout": 0.1,
    "bias": False,
    "block_size": 512,
}

out_dir = 'out-poetry'
checkpoint_interval = 500 # save checkpoint every n steps
log_interval = 100 # sync cpu and gpu not too often

gradient_accumulation_steps = 1
batch_size = 16

epochs = 2
max_iters = int(train_data.shape[0] / (gradient_accumulation_steps * batch_size * model_args["block_size"]) * epochs)
print(f"Max Iterations: {max_iters}")

learning_rate = 4e-4
decay_lr = True # whether to decay the learning rate
lr_decay_iters = 0.95*max_iters # should be ~= max_iters per Chinchilla
min_lr = 4e-5 # should be ~= learning_rate/10 per Chinchilla
beta1 = 0.9
beta2 = 0.95 # make a bit bigger because number of tokens per iter is small
warmup_iters = 2_000 # how many steps to warm up for

# adamw optimizer
weight_decay = 1e-1
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0

# system
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
print(f"training in dytpe: {dtype}")
compile = True # use PyTorch 2.0 to compile the model to be faster

In [None]:
from Calliope.model import GPTConfig, GPT

seed_offset = 0
# tokens consumed per optimizer step: context length x batch size x accumulation steps
tokens_per_iter = model_args["block_size"] * batch_size * gradient_accumulation_steps
print(f"tokens per iteration will be: {tokens_per_iter:,}")

os.makedirs(out_dir, exist_ok=True)
torch.manual_seed(69 + seed_offset)
# allow tf32 on cudnn and on matmul
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# translate the configured dtype name into the torch dtype handed to autocast
ptdtype = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16,
}[dtype]

# build the model from the active configuration and move it to the target device
gptconf = GPTConfig(**model_args)
model = GPT(gptconf).to(device)

# the GradScaler is only enabled when training in float16
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
optimizer = model.configure_optimizers(
    weight_decay,
    learning_rate,
    (beta1, beta2),
    device,
)

# compile last, after the optimizer has been created from the uncompiled model
if compile:
    print("compiling the model... (takes a ~minute)")
    model = torch.compile(model)

# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    """Return the learning rate to use at iteration ``it``.

    Reads the module-level ``learning_rate``, ``min_lr``, ``warmup_iters`` and
    ``lr_decay_iters``.

    Args:
        it: Zero-based iteration number.

    Returns:
        A linearly increasing rate (starting at 0) while ``it < warmup_iters``,
        ``min_lr`` once ``it >= lr_decay_iters``, and a cosine interpolation
        from ``learning_rate`` down to ``min_lr`` in between.
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) at or past lr_decay_iters, return min learning rate. The cosine term is
    #    already exactly 0 at it == lr_decay_iters, so using >= yields the same
    #    value there and avoids a division by zero when
    #    lr_decay_iters == warmup_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

In [None]:
# fetch the first batch up front; later batches are prefetched inside the loop
X, Y = get_batch()
# list of [iteration, loss] pairs, appended every log_interval steps
train_loss_history = []
# torch.amp.autocast expects a bare device type ('cuda', 'cpu', ...), so strip
# any index from device strings such as 'cuda:0'
device_type = torch.device(device).type

pbar = tqdm(range(max_iters+1))
for iter_num in pbar:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # write checkpoints every checkpoint_interval steps and on the last
    # iteration, but never at step 0
    if iter_num > 0 and (iter_num % checkpoint_interval == 0 or iter_num == max_iters):
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'model_args': model_args,
            'iter_num': iter_num,
        }
        torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):
        with torch.amp.autocast(device_type=device_type, dtype=ptdtype):
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = get_batch()
        scaler.scale(loss).backward()
    # clip the gradient (unscale first so the threshold applies to the true gradients)
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)

    if iter_num % log_interval == 0:
        # get loss as float. note: this is a CPU-GPU sync point
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * gradient_accumulation_steps
        train_loss_history.append([iter_num, lossf])
        pbar.set_description(f"step {iter_num:,}: loss {lossf:.4f}: lr {lr:.6f}")
        # lossf is only refreshed on logging steps, so this is the only place
        # where a divergence check can observe a new value
        if np.isnan(lossf):
            print(f"Loss nan at {iter_num} iter steps.")
            break

In [None]:
# plot the logged training loss against the iteration at which it was recorded
logged_steps = [step for step, _ in train_loss_history]
logged_losses = [value for _, value in train_loss_history]
plt.plot(logged_steps, logged_losses, c="r", label="train loss")
plt.legend()
plt.show()

In [None]:
# upload checkpoint to drive
drive_ckpt_dir = "../gdrive/MyDrive/Calliope/Checkpoints"
# create the target folder first so a missing directory cannot make the save
# fail after training has finished
os.makedirs(drive_ckpt_dir, exist_ok=True)
checkpoint = {
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'model_args': model_args,
    'iter_num': max_iters,
}
torch.save(checkpoint, os.path.join(drive_ckpt_dir, "ckpt.pt"))

### Load Model from checkpoint

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
# alternative source, the local checkpoint written during training:
# checkpoint = torch.load("out-poetry/ckpt.pt")
checkpoint = torch.load("../gdrive/MyDrive/Calliope/Checkpoints/ckpt.pt")
# rebuild the architecture from the hyperparameters stored with the weights
gptconf = GPTConfig(**checkpoint["model_args"])
model = GPT(gptconf)

# rename keys because of torch 2.1: weights saved from the compiled model carry
# an "_orig_mod." prefix (10 characters), which is dropped here so the keys can
# be loaded into the plain GPT module
state_dict = {
    (key[10:] if key.startswith("_orig_mod") else key): val
    for key, val in checkpoint["model"].items()
}
model.load_state_dict(state_dict)
model.to(device)
model.eval()

In [None]:
prompt = "I have not seen you in so long my dear\n"
# encode the prompt as a batch containing a single token sequence on `device`
prompt_ids = torch.tensor([tokenizer.encode(prompt)], device=device)
# NOTE(review): 128 is presumably the number of new tokens to sample — confirm
# against the signature of GPT.generate in Calliope/model.py
idx = model.generate(prompt_ids, 128, temperature=0.95)
# decode the first (only) sequence of the batch back into text
words = tokenizer.decode(idx[0].cpu().numpy())
print(words)