Special shoutout to the GOAT Karpathy. This repo follows the theoretical concepts introduced in Karpathy's tutorial but adds many enhancements including:
- major stylistic refactors
- closely follows PyTorch's MultiheadAttention implementation
- addition of a Dataset class
- removal of extra dropout layer in MultiheadAttention
- adds live printing that mimics ChatGPT

In [1]:
from model import *
from dataset import *

In [2]:
# Release unused memory held by PyTorch's CUDA caching allocator
# (e.g. left over from an earlier run of this notebook) before building the model.
torch.cuda.empty_cache()

In [3]:
# Hyperparameters for the run. The letter tags (N, L, S, E) appear to follow the
# shape notation used in PyTorch's MultiheadAttention docs; H is the head count.
config = dict(
    batch_size = 64, # N
    sequence_dim = 100, # L, S
    embed_dim = 78, # E
    num_heads = 13, # H
    num_layers = 3,
    dropout = 0.2,
    train_steps = 5000,
    lr = 1e-3, # learning rate
    seed = 78,
    # Fall back to the CPU so the notebook still runs on machines without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu',
)
# The embedding is split evenly across the heads, so it must divide cleanly.
# Raised explicitly (rather than via `assert`) so the check survives `python -O`.
if config['embed_dim'] % config['num_heads'] != 0:
    raise ValueError(
        f"embed_dim ({config['embed_dim']}) must be divisible by "
        f"num_heads ({config['num_heads']})"
    )
# Seed PyTorch's default RNG for reproducibility; kept as the last expression so
# the cell still displays the returned Generator.
torch.manual_seed(config['seed'])

<torch._C.Generator at 0x7f8c381f5270>

In [4]:
# Build the character-level dataset from data.txt, using the configured sequence length.
dataset_shakespeare = CharacterDataset('data.txt', seq_len=config['sequence_dim'])

# flavor 1 - shuffled split
# dataset_train, dataset_val = torch.utils.data.random_split(dataset_shakespeare, [.9, .1])

# flavor 2 - non-shuffled split
# The first 95% of the items (by index) are used for training, the remainder for validation.
n = int(.95*len(dataset_shakespeare))
# Subset only indexes into its `indices`, so a lazy `range` works and avoids
# materialising two full Python lists of integers.
dataset_train = torch.utils.data.Subset(dataset_shakespeare, range(n))
dataset_val = torch.utils.data.Subset(dataset_shakespeare, range(n, len(dataset_shakespeare)))

In [5]:
# Build the GPT model from the dataset's vocabulary size plus the configured
# dimensions. Arguments stay positional and in their original order; only
# dropout and device are passed by keyword.
model = GPT(
    dataset_shakespeare.vocab_dim,
    *(config[key] for key in ('sequence_dim', 'embed_dim', 'num_heads', 'num_layers')),
    dropout=config['dropout'],
    device=config['device'],
)

# AdamW over every model parameter, using the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=config['lr'])

In [6]:
# Bare expression: the notebook displays the model's repr (its module tree).
model

GPT(
  (token_embedding): Embedding(65, 78)
  (position_embedding): Embedding(100, 78)
  (dropout): Dropout(p=0.2, inplace=False)
  (blocks): Sequential(
    (0): SelfAttentionBlock(
      (ln1): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mha): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=78, out_features=78, bias=True)
      )
      (ln2): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (net): Sequential(
          (0): Linear(in_features=78, out_features=312, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=312, out_features=78, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (1): SelfAttentionBlock(
      (ln1): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mha): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=78, out_features=78, bias=True)
      )
      (ln2): LayerNorm((78,)

In [7]:
# pretraining: sample from the model before any training as a baseline.
# Passes the dataset's encode/decode attributes, a batch of two prompts, and 100
# (presumably the number of characters to generate - confirm against GPT.generate).
model.generate(dataset_shakespeare.encode, dataset_shakespeare.decode, ['hi', 'bye'], 100)

hi:gQKj;WoYTEUah:fKnMAGPUS
gbJStGIasrr,-ffZaF'sdhCP,YPnY:KnjoKASj-KcFlrwF'iR?I-zG
nr
UsZcJf$RIsKxyYz.&

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0, 46, 47, 10, 45, 29, 23, 48, 11, 35, 53,
         37, 32, 17, 33, 39, 46, 10, 44, 23, 52, 25, 13, 19, 28, 33, 31,  0, 45,
         40, 22, 31, 58, 19, 21, 39, 57, 56, 56,  6,  7, 44, 44, 38, 39, 18,  5,
         57, 42, 46, 15, 28,  6, 37, 28, 52, 37, 10, 23, 52, 48, 53, 23, 13, 31,
         48,  7, 23, 41, 18, 50, 56, 61, 18,  5, 47, 30, 12, 21,  7, 64, 19,  0,
         52, 56,  0, 33, 57, 38, 41, 22, 44,  3, 30, 21, 57, 23, 62, 63, 37, 64,
          8,  4],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          

In [8]:
%%time
# Split the total training budget evenly across the reporting epochs.
epochs = 10
steps_per_epoch = config['train_steps'] // epochs
print(f'{"Epoch":^5} | {"Train Loss":^10} | {"Val Loss":^10}')

# Row 0 is the pre-training baseline (evaluation only); every later row is
# printed after one more round of fitting.
for e in range(epochs + 1):
    if e > 0:
        model.fit(dataset_train, optimizer, config['batch_size'], steps_per_epoch)
    loss_train, loss_val = model.evaluate(
        [dataset_train, dataset_val],
        config['batch_size'],
        steps_per_epoch,
    )
    print(f"{e:^5} | {loss_train:>10.3f} | {loss_val:>10.3f}")

Epoch | Train Loss |  Val Loss 
  0   |      4.374 |      4.376
  1   |      2.227 |      2.241
  2   |      1.940 |      2.014
  3   |      1.802 |      1.915
  4   |      1.732 |      1.876
  5   |      1.677 |      1.832
  6   |      1.650 |      1.815
  7   |      1.622 |      1.804
  8   |      1.598 |      1.779
  9   |      1.586 |      1.775
 10   |      1.571 |      1.764
CPU times: user 1min 25s, sys: 722 ms, total: 1min 26s
Wall time: 1min 24s


In [9]:
# post training: sample again now that the model has been fitted.
# print_batch_num=1 appears to select which prompt in the batch is printed live
# (index 1, 'Linsu', in the recorded output); 1000 is presumably the number of
# characters to generate - confirm against GPT.generate.
model.generate(dataset_shakespeare.encode, dataset_shakespeare.decode, ['Han', 'Linsu'], 1000, print_batch_num=1)

LinsuCIABER:

SCICTIS:
MARDILIUS:
Am not?


GLARET:
Pecive most sconsent and this of coome that toward's for
From deash, cell yet makerm, Anature, hy might
Keeppy of his, this like O' the heart with
ears'd fited-purease of Lonce me folliets: yea
Treesal breaget arese.
What like the look; as he the bedeserwing mayor!

Sursengelemne, broth evial best! you bosoble you.
Is such ill if a the harks of the may stand a tearments.

DUKE OF YORK:
I hast them to is eashe-sendentr, to this such
changer-to do that here semite a a succeome?
Why, let pittial son, yet strought at wife
And made to but on, and the his gind world
Sull with to so. Besincion than eyes,-
And him, so behole for you basecse may from for it:
Yet that make is fair's and unle and me is grief to
As but to whose soul becosely'd in to done.

Pitt Beasintage I murder? I say! Romeo, with make thou stright you.

GLOUCESTER:
Bad was you meant offer the tale of my frecome,
He which what feelle so shall ploweders and marking,
Thy by must

tensor([[ 0,  0,  0,  ..., 39, 57, 59],
        [ 0,  0,  0,  ..., 43, 52, 43]], device='cuda:0')