Special shoutout to the GOAT Karpathy. This repo follows the theoretical concepts introduced in Karpathy's tutorial but adds several enhancements, including:
- major stylistic refactors
- an attention module that closely follows Torch's MultiheadAttention implementation
- the addition of a Dataset class
- removal of the extra dropout layer in MultiheadAttention
- live printing that mimics ChatGPT

In [1]:
import os
import pickle

import torch

from src.model import *
from src.dataset import *
from src.generate import *

In [2]:
# Release unused memory held by PyTorch's CUDA caching allocator. This mainly
# matters when re-running the notebook in a kernel that has already used the GPU.
torch.cuda.empty_cache()

In [3]:
# Hyperparameters and run settings, kept in a single dict so they can be
# pickled next to the model and used to rebuild it later.
config = dict(
    batch_size = 64, # N
    sequence_dim = 100, # L, S
    embed_dim = 78, # E
    num_heads = 13, # H
    num_layers = 4,
    dropout = 0.2,
    train_steps = 10000,
    lr = 1e-3, # learning rate
    seed = 78,
    # Fall back to CPU so the notebook still runs on machines without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu',
)
# The embedding is split evenly across attention heads, so it must divide cleanly.
assert config['embed_dim'] % config['num_heads'] == 0, (
    f"embed_dim ({config['embed_dim']}) must be divisible by num_heads ({config['num_heads']})"
)
# Seed PyTorch's default generator for reproducible runs.
torch.manual_seed(config['seed'])

<torch._C.Generator at 0x7f9f780b9930>

In [4]:
# Character-level corpus (n_vocab = 65).
dataset_shakespeare = CharacterDataset('./data/shakespeare.txt', seq_len=config['sequence_dim'])
# Word-level alternative (n_vocab = 50K, requires bigger parameters):
# dataset_shakespeare = WordDataset('./data/shakespeare.txt', seq_len=config['sequence_dim'])

# Split option 1 - shuffled:
# data_train, data_test = torch.utils.data.random_split(dataset_shakespeare, [.9, .1])

# Split option 2 (used here) - non-shuffled: the first 95% of samples go to
# training and the remaining samples to validation.
n_samples = len(dataset_shakespeare)
n = int(.95 * n_samples)
dataset_train = torch.utils.data.Subset(dataset_shakespeare, list(range(n)))
dataset_val = torch.utils.data.Subset(dataset_shakespeare, list(range(n, n_samples)))

In [5]:
# Build the model from the dataset's vocabulary size plus the architecture
# sizes in config, passed positionally in the same order as before.
model = GPT(
    dataset_shakespeare.vocab_dim,
    *(config[key] for key in ('sequence_dim', 'embed_dim', 'num_heads', 'num_layers')),
    dropout=config['dropout'],
    device=config['device'],
)
# AdamW over every model parameter, using the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])

In [6]:
# Show the parameter count followed by the module tree, one per line.
print(model.count_parameters(), model, sep='\n')

313313
GPT(
  (token_embedding): Embedding(65, 78)
  (position_embedding): Embedding(100, 78)
  (dropout): Dropout(p=0.2, inplace=False)
  (blocks): Sequential(
    (0): SelfAttentionBlock(
      (ln1): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mha): MultiheadAttention(
        (query): Linear(in_features=78, out_features=78, bias=False)
        (key): Linear(in_features=78, out_features=78, bias=False)
        (value): Linear(in_features=78, out_features=78, bias=False)
        (dropout1): Dropout(p=0.2, inplace=False)
        (projection): Linear(in_features=78, out_features=78, bias=True)
      )
      (ln2): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (net): Sequential(
          (0): Linear(in_features=78, out_features=312, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=312, out_features=78, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (1): SelfAttentio

In [7]:
# Baseline before any training: sample continuations for two prompts from the
# freshly initialised model. The trailing 1000 is presumably the number of
# tokens to generate - confirm against src.generate.
generate_batch(
    model,
    dataset_shakespeare.encode,
    dataset_shakespeare.decode,
    ['hi', 'bye'],
    1000,
)

hijgkKj;WzklRUah:fKnMAG.kSggbJStGIgsNy,tffgat??khbPSYLzY:kfjoDAST-Kb3lrwL'iR?I:zh
tpv
NkcJf$RIsKMyYzA&,jb-:hb?; zm$'CGhl;PuJ.IKvskc;p KtAbnHlKrqKu!WizuhwfzwILwxvbjLAwZlMKRG!Zh!3lgwvTFe
n$QNTxwDhJy3kILbQ
$M T,JlA$xGkRlG3ceWOYcLyGS3xThAwzc$Dsfod.s,z?x!g?ekqSxVEAsLDLKbpQ
t.dbSjgsIwbT:uk
mF:Knh;TbuKgM,jmXQ,TDvKYJMhbvlYI'dUSNYqJ;dDUTsAxbD:jwOjQMXt'Su.b?cVE;fKXtOsWw;wLntWk
yA!fw
 ;ubv:yb.aMNgE$xS?H!NNtsJrwbz V?3$-nWbs
p XwLwT$Shb-blrwxz:v,Si&tzBnWQjl;nkZowU-AJDT ?XsOWk-jj
$xwT?vjwU,ntCs EzqS-LsRT.nyZfAlR3AnAvRbzNlLa$,LkLh-gVGz,ia.a,PkqCxoWgqbuy.VjQdH:bkGm?CfsiFn,oaweWouJKmye.D d$pzJy?eYSD'sTQR
yy,zFmnPn?mLLzvnbSB,j-lV3TCtelosuc,$kjkkguSb;?aLy?VzJSj3DL-UIzKhb;YAHbOQhFf;ktbWhq;fEBNltZzFQhyyQ
?LLDIBNykCUYyHLPeA,tiXGfdg&zB-AhN,NbniLcfnJkL3UqbfGb-wmsQj Kc,kJ$&WcOaLo.j&BILa&hrXblFUxhj?;kyc-r&LD
jZpwYdtY?rpjC&,xrw$cek
kShbVj&n?f,?jr,3BaSkq
zgQD:poA?usj?iYxnOFpeDnaSSkG,MSt,SHspgpmQhzbtF?w:SxTtkxpBYxesL.AC;Jsuyrgqux kyGl:n NK3Jmetub;TgyBsjs:oHWLbSLw b!
KLbnyO$?k;kn&nTqBbQ SV&SZaF?phknfWkS,maXZ-hmkM!p

tensor([[ 0,  0,  0,  ..., 54, 54, 64],
        [ 0,  0,  0,  ...,  7,  3, 26]], device='cuda:0')

In [8]:
%%time
epochs = 10
steps_per_epoch = config['train_steps'] // epochs
print(f'{"Epoch":^5} | {"Train Loss":^10} | {"Val Loss":^10}')

# Pre-training
loss_train, loss_val = model.evaluate([dataset_train, dataset_val], config['batch_size'], steps_per_epoch)
print(f"{0:^5} | {loss_train:>10.3f} | {loss_val:>10.3f}")

for e in range(1, epochs + 1):
    model.fit(dataset_train, optimizer, config['batch_size'], steps_per_epoch)
    loss_train, loss_val = model.evaluate([dataset_train, dataset_val], config['batch_size'], steps_per_epoch)
    print(f"{e:^5} | {loss_train:>10.3f} | {loss_val:>10.3f}")

Epoch | Train Loss |  Val Loss 




  0   |      4.311 |      4.313
  1   |      1.861 |      1.941
  2   |      1.648 |      1.804
  3   |      1.559 |      1.739
  4   |      1.520 |      1.717
  5   |      1.485 |      1.705
  6   |      1.464 |      1.684
  7   |      1.449 |      1.675
  8   |      1.436 |      1.674
  9   |      1.425 |      1.662
 10   |      1.415 |      1.662
CPU times: user 2min 48s, sys: 1.08 s, total: 2min 49s
Wall time: 2min 47s


In [9]:
# Persist the trained model together with everything needed to rebuild and use it.
os.makedirs('./model_artifacts', exist_ok=True)
model.save('./model_artifacts/gpt.pth', optimizer_state_dict=optimizer.state_dict())

# vocab_dim is recorded in config so the model can later be constructed from
# config alone, without the dataset object.
config['vocab_dim'] = dataset_shakespeare.vocab_dim

for artifact_path, artifact in (
    ('./model_artifacts/encode_fn.pkl', dataset_shakespeare.encode),
    ('./model_artifacts/decode_fn.pkl', dataset_shakespeare.decode),
    ('./model_artifacts/config.pkl', config),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

In [10]:
# load model and dependencies
# NOTE: pickle.load can execute arbitrary code embedded in the file - only load
# artifacts written by this notebook or by another source you trust.
with open('./model_artifacts/encode_fn.pkl', 'rb') as f:
    encode_fn = pickle.load(f)
with open('./model_artifacts/decode_fn.pkl', 'rb') as f:
    decode_fn = pickle.load(f)
with open('./model_artifacts/config.pkl', 'rb') as f:
    config = pickle.load(f)

# Rebuild the architecture purely from the saved config.
model = GPT(
    config['vocab_dim'],
    config['sequence_dim'],
    config['embed_dim'],
    config['num_heads'],
    config['num_layers'],
    dropout=config['dropout'],
    device=config['device'],
)

# Bind a fresh optimizer to the *reloaded* model's parameters. The optimizer
# created in the training cell still points at the previous model object's
# parameters, so reusing it would not update this model on further training,
# and it would make this cell fail unless the training cells had been run.
optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])

# Restore the saved weights; the optimizer is handed over so that load can
# presumably restore its saved state too - confirm in src.model.
model.load('./model_artifacts/gpt.pth', optimizer=optimizer)

In [11]:
# post training
# Generate with the encode/decode functions restored from disk, so this cell
# exercises the saved artifacts end to end instead of the in-memory dataset.
generate_batch(model, encode_fn, decode_fn, ['Han', 'Linsu'], 1000, print_batch_num=1)

Linsurranushy,
My greater Bolingbrok, coming Hastings prides Tumbes
From the seizen, pass draw with so to breather?

KING RICHARD II:
Nay, a tlebar order that measure or brother as stand,
And that slain of Yorken hath were gentlemen,'
We are I not thee, in the what.

CORIOLANUS:
Are the fears taudiers Menter?

HENRY BOLINGBROKE:
These are goods leaves: and that some's;
But seems my law holy spricience to hees,
It from him them to know the world is those find,
I was not your in the trust dove thee.
God cause, I cannnot with with you; but,
'Twavordming for mine aboins of her copast of,
The creatural speak of long.

Shepherd:
How call of me: some sir, and I set here,
And indeeds, away, did Mustime but know the strength,
Your gracious seal covers of such with partion,
And yet a mean hutts name and him shrow'd;
And, for thou hat York, and for his deady hand twring.
Why, master, for her a throne 'sweet then from is
Richarden further. your honour?

CAPULET:
No, will the ask it, undemity sock,

tensor([[ 0,  0,  0,  ..., 45, 46, 39],
        [ 0,  0,  0,  ..., 53,  1, 40]], device='cuda:0')