Special shoutout to the GOAT, Karpathy. This repo follows the theoretical concepts introduced in Karpathy's tutorial but adds several enhancements, including:
- major stylistic refactors
- an implementation that closely follows Torch's MultiheadAttention
- the addition of a Dataset class
- the removal of the extra dropout layer in MultiheadAttention
- live printing that mimics ChatGPT

In [1]:
from model import *
from dataset import *

In [2]:
# Ask PyTorch to release unoccupied cached GPU memory before anything is built
# (presumably so that re-running the notebook starts with as much free memory as possible).
torch.cuda.empty_cache()

In [3]:
# Hyperparameters and runtime settings read by the other cells of this notebook.
# The single-letter tags (N, L, S, E, H) appear to follow the shape notation used
# in the torch.nn.MultiheadAttention documentation.
config = dict(
    batch_size=64,  # N
    sequence_dim=100,  # L, S
    embed_dim=78,  # E
    num_heads=13,  # H
    num_layers=3,
    dropout=0.2,
    train_steps=5000,
    lr=1e-3,  # learning rate
    seed=78,
    # Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
# The embedding width must split evenly across the attention heads. Raise an
# explicit error rather than using `assert`, which is stripped under `python -O`.
if config['embed_dim'] % config['num_heads'] != 0:
    raise ValueError(
        f"embed_dim ({config['embed_dim']}) must be divisible by "
        f"num_heads ({config['num_heads']})"
    )
# Seed the default RNG for reproducibility; kept as the final expression so the
# cell still displays the returned Generator.
torch.manual_seed(config['seed'])

<torch._C.Generator at 0x7fcda41ed230>

In [4]:
# Build the character-level dataset from data.txt, using the configured sequence length.
dataset_shakespeare = CharacterDataset('data.txt', seq_len=config['sequence_dim'])

# flavor 1 - shuffled split
# dataset_train, dataset_val = torch.utils.data.random_split(dataset_shakespeare, [.9, .1])

# flavor 2 - non-shuffled split: the first 95% of the indices are used for
# training and the remaining tail for validation.
n = int(.95*len(dataset_shakespeare))
# Subset only needs an indexable sequence, so pass `range` objects directly
# instead of materialising every index into a Python list.
dataset_train = torch.utils.data.Subset(dataset_shakespeare, range(n))
dataset_val = torch.utils.data.Subset(dataset_shakespeare, range(n, len(dataset_shakespeare)))

In [5]:
# Architecture settings pulled from `config`, in the positional order this
# notebook passes them to GPT right after the vocabulary size.
architecture = [
    config[key]
    for key in ('sequence_dim', 'embed_dim', 'num_heads', 'num_layers')
]
model = GPT(
    dataset_shakespeare.vocab_dim,
    *architecture,
    dropout=config['dropout'],
    device=config['device'],
)

# AdamW over every model parameter, using the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=config['lr'])

In [6]:
# Show the value returned by count_parameters() followed by the module's repr,
# one per line.
print(model.count_parameters(), model, sep='\n')

239525
GPT(
  (token_embedding): Embedding(65, 78)
  (position_embedding): Embedding(100, 78)
  (dropout): Dropout(p=0.2, inplace=False)
  (blocks): Sequential(
    (0): SelfAttentionBlock(
      (ln1): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mha): MultiheadAttention(
        (query): Linear(in_features=78, out_features=78, bias=False)
        (key): Linear(in_features=78, out_features=78, bias=False)
        (value): Linear(in_features=78, out_features=78, bias=False)
        (dropout1): Dropout(p=0.2, inplace=False)
        (projection): Linear(in_features=78, out_features=78, bias=True)
      )
      (ln2): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (net): Sequential(
          (0): Linear(in_features=78, out_features=312, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=312, out_features=78, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (1): SelfAttentio

In [7]:
# Before training: generate from the two prompts with the still-untrained model
# as a baseline. The trailing 100 is presumably the number of tokens to
# generate (TODO confirm against GPT.generate).
model.generate(
    dataset_shakespeare.encode,
    dataset_shakespeare.decode,
    ['hi', 'bye'],
    100,
)

hijTBKj;QuYpRUah:fKnMAG.US
gbJrtGIgsrr,-ffZaF'sdh;PIY,rY:KnjoKASj-KgFlrwK'iR?I-zG
nr
UsZcJf$RIsKxa&z.&

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0, 46, 47, 48, 32, 14, 23, 48, 11, 29, 59,
         37, 54, 30, 33, 39, 46, 10, 44, 23, 52, 25, 13, 19,  8, 33, 31,  0, 45,
         40, 22, 56, 58, 19, 21, 45, 57, 56, 56,  6,  7, 44, 44, 38, 39, 18,  5,
         57, 42, 46, 11, 28, 21, 37,  6, 56, 37, 10, 23, 52, 48, 53, 23, 13, 31,
         48,  7, 23, 45, 18, 50, 56, 61, 23,  5, 47, 30, 12, 21,  7, 64, 19,  0,
         52, 56,  0, 33, 57, 38, 41, 22, 44,  3, 30, 21, 57, 23, 62, 39,  4, 64,
          8,  4],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          

In [8]:
%%time
epochs = 10
# Spread the configured number of training steps evenly over the epochs.
steps_per_epoch = config['train_steps'] // epochs
print(f'{"Epoch":^5} | {"Train Loss":^10} | {"Val Loss":^10}')

# Row 0 reports the losses before any fitting has happened; every later row is
# printed after one more round of `steps_per_epoch` fitting steps.
for e in range(epochs + 1):
    if e > 0:
        model.fit(dataset_train, optimizer, config['batch_size'], steps_per_epoch)
    loss_train, loss_val = model.evaluate(
        [dataset_train, dataset_val],
        config['batch_size'],
        steps_per_epoch,
    )
    print(f"{e:^5} | {loss_train:>10.3f} | {loss_val:>10.3f}")

Epoch | Train Loss |  Val Loss 
  0   |      4.350 |      4.354
  1   |      2.231 |      2.247
  2   |      1.945 |      2.010
  3   |      1.799 |      1.900
  4   |      1.725 |      1.854
  5   |      1.674 |      1.819
  6   |      1.641 |      1.795
  7   |      1.617 |      1.784
  8   |      1.596 |      1.773
  9   |      1.583 |      1.763
 10   |      1.567 |      1.754
CPU times: user 1min 10s, sys: 724 ms, total: 1min 10s
Wall time: 1min 9s


In [9]:
# Smoke-test checkpointing: save the model together with the optimizer's state
# dict, then load the same file back, handing the optimizer to `load`.
checkpoint_path = './gpt.pth'
optimizer_state = optimizer.state_dict()
model.save(checkpoint_path, optimizer_state_dict=optimizer_state)
model.load(checkpoint_path, optimizer=optimizer)

In [10]:
# After training: generate from two prompts with the trained model.
# `print_batch_num=1` presumably selects which prompt in the batch is printed
# live (TODO confirm against GPT.generate).
model.generate(
    dataset_shakespeare.encode,
    dataset_shakespeare.decode,
    ['Han', 'Linsu'],
    1000,
    print_batch_num=1,
)

LinsuES:

ClOPS:
what I my know as with be heavent on troud
Which air moaly corter's maken of ster that nothink
If her pase of yourthy as own your guive so,
But offtid- will did be no cratize.

TRAS:
Romeo to your storly be break him newsh in mine?

WARWICK:
If patifffive a minder the soul 'Clow,
You are the speak; bay the brigh's both to he have subber
And bristhins to hatse of care to cause  it it.

LORD SABELAY:
For one.

OF MARGARET:
A we I'll be king my lady bake of to reme.

HENRY BOLINDO:
My grace! from that be alls marrow; it you duke
That reseesss you aart her forson and the must
ervenges love and but bemindg tan cerse,
And repost with timeth any e'ers ost your
As sulf quicks sork un and hither whish days tisteer,
Got pity chocrry. Angelo me.

ROMEO:
My monthy his Romeo,
And ISTABELA:
 we which bother sneats you no liven,
And be mere to I mun it a siterss bleason!
This such her more thus shall to the a so:
O which march hath, gate, lord, and my brow,
Wherefore, I live far that

tensor([[ 0,  0,  0,  ..., 52,  1, 58],
        [ 0,  0,  0,  ..., 53, 61, 39]], device='cuda:0')