In [1]:
import math
from dataclasses import dataclass
from collections import OrderedDict

import numpy as np

import torch
import torch._dynamo

from llm import *

torch: 2.1.0
cuda: True
cudnn: True , version: 8902 , bf32: True


In [2]:
@dataclass
class GPTConfig:
    """Hyperparameters of the small character-level GPT used in this notebook.

    ``block_size``, ``vocab_size``, ``n_layer``, ``n_head`` and ``n_embd`` are
    the values handed to ``LanguageModel`` when the model is built;
    ``block_size`` is also the window length served by ``BatchGenerator``.
    """
    block_size: int = 32   # context window length, in tokens (characters)
    vocab_size: int = 65   # number of distinct token ids
    n_layer: int = 2
    n_head: int = 4
    n_embd: int = 80
    # NOTE(review): `dropout` and `bias` are not referenced by any cell visible
    # in this notebook -- confirm whether the model actually consumes them.
    dropout: float = 0.0
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster

# Use an *instance*, not the class object: with `cfg = GPTConfig` every
# `cfg.x = ...` would silently rewrite the class-level defaults for all future
# configs. Attribute reads (`cfg.block_size`, ...) work exactly as before.
cfg = GPTConfig()

In [3]:
import io

# Read the complete training corpus into memory as a single string.
with io.open("shakespeare.txt", encoding="utf-8", mode="r") as f:
    text = f.read()

# The vocabulary is every distinct character of the corpus, sorted so that the
# character <-> index mapping is the same on every run.
chars = list(set(text))
chars.sort()
vocabSize = len(chars)
print(f"Vocab size: {vocabSize}")
print(f"Chars: {''.join(chars)}")

Vocab size: 65
Chars: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [4]:
def getEncoder(alphabet=None):
    """Build the character-level ``encode`` / ``decode`` pair for a vocabulary.

    Args:
        alphabet: Sequence of single characters; a character's position in the
            sequence is its token id. Defaults to the notebook-level ``chars``.

    Returns:
        A tuple ``(encode, decode)``:

        * ``encode(s)`` maps a string to a 1-D ``np.int64`` array of token ids
          (an empty string yields an empty integer array). It raises
          ``ValueError`` if ``s`` contains a character outside the vocabulary.
        * ``decode(xs)`` maps an iterable of token ids back to a string.
    """
    if alphabet is None:
        alphabet = chars
    index2char = list(alphabet)
    # A dict lookup works for any code point and makes unknown characters an
    # explicit error; a table sized by the largest code point would silently
    # map in-range unknown characters to id 0.
    char2index = {ch: i for i, ch in enumerate(index2char)}

    def encode(s):
        try:
            ids = [char2index[ch] for ch in s]
        except KeyError as err:
            raise ValueError(f"Character {err.args[0]!r} is not in the vocabulary") from None
        # Explicit dtype: np.array([]) would otherwise come back as float64.
        return np.array(ids, dtype=np.int64)

    def decode(xs):
        return ''.join([index2char[i] for i in xs])

    return encode, decode
# Module-level codec used by the rest of the notebook (data split, generation).
encode, decode = getEncoder()

# Round-trip check: decoding the encoded string should print the original text.
print(f"hello world -> {encode('hello world')} -> {decode(encode('hello world'))}")

hello world -> [46 43 50 50 53  1 61 53 56 50 42] -> hello world


In [5]:
# The first 90% of the corpus is used for training; the remaining tail is held
# out for validation.
splitIdx = (len(text) * 9) // 10
trainData = encode(text[:splitIdx])
validData = encode(text[splitIdx:])
print(f"Shapes: train={trainData.shape}, validation={validData.shape}")

Shapes: train=(1003854,), validation=(111540,)


In [6]:
import random

class BatchGenerator:
    """Serves random (input, target) batches of fixed-length windows from a token array.

    Each row of a batch is a window of ``blockSize`` consecutive tokens starting
    at a random offset of ``data``; the matching target row is the same window
    shifted one position to the right.

    Args:
        data: 1-D NumPy array of token ids. Must hold at least
            ``blockSize + 1`` tokens so that one input/target pair fits.
        batchSize: Number of windows per batch.
        device: Device the returned tensors are created on. ``None`` (the
            default) resolves to the notebook-level ``device`` at construction
            time rather than when the class is defined, so redefining
            ``device`` does not leave a stale default behind.
        blockSize: Window length. ``None`` (the default) uses
            ``cfg.block_size``.

    Raises:
        ValueError: If ``data`` is too short to hold a single window.
    """

    def __init__(self, data, batchSize, device=None, blockSize=None):
        if device is None:
            device = self._defaultDevice()
        if blockSize is None:
            blockSize = cfg.block_size
        self.data = data
        self.batchSize = batchSize
        self.device = device
        self.nChars = len(data)
        self.blockSize = blockSize
        if self.nChars < self.blockSize + 1:
            raise ValueError(
                f"Need at least {self.blockSize + 1} tokens to build a batch, got {self.nChars}"
            )
        # Largest valid window start: the target window ends at start + blockSize.
        self.maxRnd = self.nChars - self.blockSize - 1
        self.random = random.Random()
        # int64 staging buffers match the dtype of the tensors handed out below;
        # 8-bit buffers would silently wrap any token id >= 128.
        self.xs = np.zeros([batchSize, self.blockSize], dtype=np.int64)
        self.ys = np.zeros([batchSize, self.blockSize], dtype=np.int64)
        self.tensorShape = [batchSize, self.blockSize]
        self.charsServed = 0

    @staticmethod
    def _defaultDevice():
        """Return the notebook-level ``device`` (looked up when called)."""
        return device

    def StartReproducibleRandom(self):
        """Restart window sampling from a fixed seed so the batch sequence repeats."""
        self.random = random.Random(1337)

    def GetBatch(self):
        """Sample one batch.

        Returns:
            ``(xb, yb)``: two ``torch.int64`` tensors of shape
            ``[batchSize, blockSize]`` on ``self.device``; ``yb`` is ``xb``
            shifted by one token.
        """
        for row in range(self.batchSize):
            start = self.random.randint(0, self.maxRnd)  # inclusive on both ends
            np.copyto(self.xs[row], self.data[start:start + self.blockSize])
            np.copyto(self.ys[row], self.data[start + 1:start + 1 + self.blockSize])

        self.charsServed += self.batchSize * self.blockSize

        # torch.tensor copies, so the staging buffers can be reused next call.
        xb = torch.tensor(self.xs, dtype=torch.int64, device=self.device, requires_grad=False)
        yb = torch.tensor(self.ys, dtype=torch.int64, device=self.device, requires_grad=False)
        return xb, yb

    @property
    def Epoch(self):
        """Tokens served so far, expressed as a multiple of the dataset length."""
        return float(self.charsServed) / float(self.nChars)
    
#bg = BatchGenerator(trainData, batchSize=4)
#bg.GetBatch()
#bg.Epoch

In [7]:
# Ask TorchDynamo to suppress compilation errors instead of raising them; this
# only matters if the optional torch.compile line in the next cell is enabled.
torch._dynamo.config.suppress_errors = True

# Report whether the flash scaled-dot-product-attention backend is enabled.
print(f"Flash sdp enabled = {torch.backends.cuda.flash_sdp_enabled()}")
from torch.cuda.amp import autocast


# Build the model from the config values.
# NOTE(review): LanguageModel is not defined in this notebook (presumably it
# comes from `from llm import *`); cfg.dropout and cfg.bias are not passed here.
model = LanguageModel(cfg.n_layer, cfg.n_head, cfg.n_embd, cfg.vocab_size, cfg.block_size)

# Count only the parameters that require gradients.
print("Number of parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

Flash sdp enabled = True
Number of parameters: 154465


In [8]:
# Select the model object that every later cell trains and samples from.
# Exactly one assignment should be active: the eager model (default), a
# TorchScript-scripted copy, or a previously saved TorchScript archive.
tmodel = model
#tmodel = torch.jit.script(model)
#tmodel = torch.jit.load("shakespeare.pt.zip")

# Optionally wrap the selected model with torch.compile.
# Note: the save cell at the end of the notebook only runs when `tmodel` is
# still the plain `model`.
#tmodel = torch.compile(tmodel)

In [9]:
def generate(model, nTokens, input):
    """Sample ``nTokens`` characters from ``model``, seeded with a text prompt.

    Args:
        model: Model exposing a ``blockSize`` attribute; it is passed on to
            ``generateIxs`` together with the encoded prompt.
        nTokens: Number of tokens requested from ``generateIxs``.
        input: Prompt of at most ``model.blockSize`` characters. Shorter
            prompts are right-padded with spaces up to ``model.blockSize``.

    Returns:
        The decoded text of the first row returned by ``generateIxs``.

    Raises:
        ValueError: If the prompt is longer than ``model.blockSize``.
    """
    if len(input) > model.blockSize:
        raise ValueError(f"Expected input of at most {model.blockSize} characters, got {len(input)}")
    input = input.ljust(model.blockSize, ' ')
    prompt = torch.tensor(encode(input).reshape(1, -1), dtype=torch.int64, device=device, requires_grad=False)
    # The prompt is duplicated into a batch of two rows and only row 0 is kept.
    # NOTE(review): the reason for the two-row batch is not visible in this
    # notebook -- confirm against generateIxs / the model before simplifying.
    prompt = prompt.expand([2, -1])
    # Sampling needs no gradients; disabling autograd avoids building a graph
    # for every generated token (harmless if generateIxs already does this).
    with torch.no_grad():
        idx = generateIxs(model, prompt, nTokens)
    # One bulk device->host conversion instead of indexing element by element.
    return decode(idx[0].cpu().tolist())

# Move the selected model to the target device before first use.
tmodel.to(device)
# Smoke test: sampling from the untrained model should produce gibberish.
generate(tmodel, 100, "blah") # should render some gibberish on untrained model

"blah                            R!w\nP;eH&IRMTRkG:fLozG,-ZBFswT?$?QZ$'T\nUHY'b3Jr$&ooVqJKFbnuP\n$pSdRjXpS3SYaGtqSIL?jn.P;Gc:ndN&,h'cQPw"

In [10]:
def evalLosses(model, nLoops=16, batchSize=256):
    """Estimate the mean loss of ``model`` on the training and validation data.

    Both estimates use freshly created batch generators restarted from the same
    fixed seed, so repeated calls evaluate the same windows and are comparable.
    The function does not change the model's train/eval mode; callers switch
    modes themselves.

    Args:
        model: Model to evaluate; called as ``model(xb)``.
        nLoops: Number of batches averaged per data split.
        batchSize: Number of windows per evaluation batch.

    Returns:
        ``(trainLoss, validLoss)`` as Python floats.

    Raises:
        ValueError: If ``nLoops`` is smaller than 1.
    """
    if nLoops < 1:
        raise ValueError(f"nLoops must be at least 1, got {nLoops}")

    def getLoss(bg):
        bg.StartReproducibleRandom()
        losses = []
        # Evaluation needs no gradients; without no_grad every batch would keep
        # its autograd graph alive until the final mean is taken.
        with torch.no_grad():
            for _ in range(nLoops):
                xb, yb = bg.GetBatch()
                losses.append(get_loss(model, yb, model(xb)))
        return torch.stack(losses).mean().item()

    trainBatchGen = BatchGenerator(trainData, batchSize)
    validBatchGen = BatchGenerator(validData, batchSize)
    return getLoss(trainBatchGen), getLoss(validBatchGen)

# Smoke test: push a single one-window batch through the model ...
xb,yb = BatchGenerator(trainData,1).GetBatch()
yHat = tmodel(xb)
#cmodel.loss(yb, yHat)
# ... and report the (train, validation) loss of the still-untrained model.
evalLosses(tmodel)

(4.378620624542236, 4.379757881164551)

### Select optimization features

In [11]:
# Whether CUDA matmuls may use TF32; keep exactly one of the two lines active.
torch.backends.cuda.matmul.allow_tf32 = False
#torch.backends.cuda.matmul.allow_tf32 = True

# Whether training runs under bfloat16 autocast; this flag selects which
# function is bound to `train` in the next cell.
autocast_bfloat16_enabled = False
#autocast_bfloat16_enabled = True

In [12]:
# Adam over the parameters of the currently selected model. If `tmodel` is
# re-assigned afterwards, re-run this cell so the optimizer tracks the new
# parameters.
optimizer = torch.optim.Adam(tmodel.parameters(), lr=0.0001)

# Training batches: 384 windows per optimizer step, drawn from the training split.
batch_size = 384
batchGen = BatchGenerator(trainData, batch_size)

def _train(nSteps, logEvery=5000, useAutocast=False):
    """Train the notebook-level ``tmodel`` with ``optimizer`` on batches from ``batchGen``.

    The loop runs steps ``0 .. nSteps`` inclusive, i.e. ``nSteps + 1``
    optimizer steps, so that both the first and the last step are reported.

    Args:
        nSteps: Index of the last training step.
        logEvery: Evaluate and print train/validation losses every this many
            steps (and always on the last step).
        useAutocast: When true, the forward pass, the loss computation and the
            periodic evaluation run under bfloat16 autocast. ``backward()`` and
            ``optimizer.step()`` always run outside the autocast region.

    Raises:
        ValueError: If ``logEvery`` is smaller than 1.
    """
    from contextlib import nullcontext  # local import keeps this cell self-contained

    if logEvery < 1:
        raise ValueError(f"logEvery must be a positive integer, got {logEvery}")

    def ampContext():
        # A fresh context per region: autocast is meant to wrap only the
        # forward pass and the loss, not the backward pass or optimizer step.
        if useAutocast:
            return autocast(dtype=torch.bfloat16)
        return nullcontext()

    tmodel.train()
    for step in range(nSteps + 1):
        xb, yb = batchGen.GetBatch()
        with ampContext():
            yHat = tmodel(xb)
            loss = get_loss(tmodel, yb, yHat)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step % logEvery == 0 or step == nSteps:
            # `loss` is the pre-update loss of this step's batch; train/valid
            # are measured after the update, in eval mode.
            tmodel.eval()
            with ampContext():
                trainLoss, validLoss = evalLosses(tmodel)
            print(f"Step {step:5d}: loss={loss.item():6f} train={trainLoss:6f} valid={validLoss:6f} epoch={batchGen.Epoch:.2f}")
            tmodel.train()

def train_with_autocast(nSteps):
    """Run ``_train`` with bfloat16 autocast enabled for forward/loss/evaluation."""
    _train(nSteps, useAutocast=True)

# Bind `train` to the variant chosen by the autocast flag set in the
# "Select optimization features" cell; later cells only call `train(...)`.
if autocast_bfloat16_enabled:
    train = train_with_autocast
else:
    train = _train


In [13]:
# Warm up (for the JIT compiler) before the timed run below.
# Note: these are real optimizer steps (steps 0..2000 inclusive), so the model
# is already partly trained when the timed cell starts.
train(2000)

# Sample 100 tokens from the partly trained model as a quick sanity check.
print(generate(tmodel,100, "w"))

Step     0: loss=4.384702 train=4.361674 valid=4.362899 epoch=0.01
Step  2000: loss=2.075950 train=2.088111 valid=2.130675 epoch=24.49
w                               solcust,
Gall, your grprit.
Y I Cleare, ave the

ICHARGSCORK:
Jespe ond thand viy, is yom any nurdy:


In [14]:
%%time
# Do the actual training; the %%time magic at the top of this cell measures it.
# Training continues from the state left by the warm-up cell.

train(50000)

Step     0: loss=2.094636 train=2.087728 valid=2.130200 epoch=24.51
Step  5000: loss=1.748007 train=1.752620 valid=1.905648 epoch=85.71
Step 10000: loss=1.656689 train=1.642814 valid=1.826207 epoch=146.91
Step 15000: loss=1.590245 train=1.585738 valid=1.779402 epoch=208.12
Step 20000: loss=1.560660 train=1.547022 valid=1.751709 epoch=269.32
Step 25000: loss=1.557401 train=1.520464 valid=1.735263 epoch=330.53
Step 30000: loss=1.492872 train=1.501572 valid=1.722801 epoch=391.73
Step 35000: loss=1.477246 train=1.484621 valid=1.711440 epoch=452.93
Step 40000: loss=1.496978 train=1.472062 valid=1.706427 epoch=514.14
Step 45000: loss=1.485065 train=1.460302 valid=1.703026 epoch=575.34
Step 50000: loss=1.449751 train=1.450073 valid=1.699020 epoch=636.55
CPU times: user 4min 38s, sys: 1.07 s, total: 4min 39s
Wall time: 4min 39s


In [15]:
# Sample 500 tokens from the trained model, seeded with the prompt "M".
print(generate(tmodel, 500, "M"))

M                               love go?

FRIAR LAURENCE:
My lanted, in this eye, here glady, if is life;
Herefor patience; stable?

RICHMOND:
Good speak him, being loyal here:
Let I hadfort-way, is though in Chrives here I lightly is,
Lord; let he bawlinges? Henry, patiemen
To hear at me trive.
My life,
Forgive him. The did Barnary that in the agentle the rucket my soon the sensult and lords.

Clown:
That night measter, Richard:
Base you, give grant.

Provost:
Peapay is no most galler? her soul pass to persuive you?
Upon my s


In [17]:
# Save the model only if training ran on the plain eager `model`, i.e. `tmodel`
# was not replaced by a scripted, loaded or compiled object. This is an
# identity check, hence `is` rather than `==`.
if tmodel is model:
    print("saving model...")
    # Script and save from the CPU, in eval mode.
    tmodel.to(torch.device("cpu"))
    tmodel.eval()
    save_model = torch.jit.script(tmodel)
    save_model.save("shakespeare.pt.zip")
    print(model)
    # Move the model back to the working device. It stays in eval mode; call
    # .train() before training it further.
    model.to(device)


saving model...
LanguageModel(
  (layers): Sequential(
    (embed): EmbeddingModel(
      (tok_emb): Embedding(65, 80)
      (pos_emb): Embedding(32, 80)
    )
    (block1): Block(
      (ln1): LayerNorm((80,), eps=1e-05, elementwise_affine=True)
      (attn): CausalSelfAttention(
        (c_attn): Linear(in_features=80, out_features=240, bias=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (ln2): LayerNorm((80,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (net): Sequential(
          (c_fc): Linear(in_features=80, out_features=320, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=320, out_features=80, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (block2): Block(
      (ln1): LayerNorm((80,), eps=1e-05, elementwise_affine=True)
      (attn): CausalSelfAttention(
        (c_attn): Linear(in_features=80, out_features=240, bias=False)
        (resid_dro