Special shoutout to the GOAT Karpathy. This repo follows the theoretical concepts introduced in Karpathy's tutorial but adds many enhancements including:
- major stylistic refactors
- follows closely to Torch's MultiheadAttention implementation
- addition of Dataset Class
- removal of extra dropout layer in MultiheadAttention
- adds live printing that mimics chatgpt

In [1]:
from model import *
from dataset import *

In [2]:
batch_size = 64 # N
sequence_dim = 100 # L, S
embed_dim = 78 # E
num_heads = 13 # H
num_layers = 3
dropout = 0.2
assert embed_dim % num_heads == 0
train_steps = 5000
lr = 1e-3 # learning rate
torch.manual_seed(78)
device = torch.device('cuda')

In [3]:
dataset_shakespeare = CharacterDataset('data.txt', seq_len=sequence_dim)

# flavor 1 - shuffled split
# data_train, data_test = torch.utils.data.random_split(dataset_shakespeare, [.9, .1])

# flavor 2 - non-shuffled split
n = int(.9*len(dataset_shakespeare))
data_train = torch.utils.data.Subset(dataset_shakespeare, list(range(0, n)))
data_val = torch.utils.data.Subset(dataset_shakespeare, list(range(n, len(dataset_shakespeare))))

dl_train = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True)
dl_val = torch.utils.data.DataLoader(data_val, batch_size=batch_size, shuffle=True)

In [4]:
x, y = next(iter(dl_train))

In [5]:
assert x.shape == y.shape
print(x.shape)

torch.Size([64, 100])


In [6]:
print([dataset_shakespeare.decode(l) for l in x.numpy()][0])
print('---')
print([dataset_shakespeare.decode(l) for l in y.numpy()][0])

tent:
The blood upon your visage dries; 'tis time
It should be look'd to: come.

AUFIDIUS:
The town 
---
ent:
The blood upon your visage dries; 'tis time
It should be look'd to: come.

AUFIDIUS:
The town i


In [7]:
model = GPT(dataset_shakespeare.vocab_dim, sequence_dim, embed_dim, num_heads, num_layers, dropout=dropout).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [8]:
model

GPT(
  (token_embedding): Embedding(65, 78)
  (position_embedding): Embedding(100, 78)
  (dropout): Dropout(p=0.2, inplace=False)
  (blocks): Sequential(
    (0): SelfAttentionBlock(
      (ln1): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mha): MultiheadAttention(
        (query): Linear(in_features=78, out_features=78, bias=False)
        (key): Linear(in_features=78, out_features=78, bias=False)
        (value): Linear(in_features=78, out_features=78, bias=False)
        (dropout1): Dropout(p=0.2, inplace=False)
        (projection): Linear(in_features=78, out_features=78, bias=True)
      )
      (ln2): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (net): Sequential(
          (0): Linear(in_features=78, out_features=312, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=312, out_features=78, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (1): SelfAttentionBlock(

In [9]:
# pretraining
model.generate(dataset_shakespeare.encode, dataset_shakespeare.decode, ['hi', 'bye'], 100)

hijT?Kj;Wo3pEUI!:fKnM!I.kS
gbJS?GIEBvru-ff!afZ?ko?EIIVnI:KfVoKASj-KcFlrwF'iR?I:zG
nr
osZcJf$RIQKxywgI&

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0, 46, 47, 48, 32, 12, 23, 48, 11, 35, 53,
          9, 54, 17, 33, 21,  2, 10, 44, 23, 52, 25,  2, 21,  8, 49, 31,  0, 45,
         40, 22, 31, 12, 19, 21, 17, 14, 60, 56, 59,  7, 44, 44,  2, 39, 44, 38,
         12, 49, 53, 12, 17, 21, 21, 34, 52, 21, 10, 23, 44, 34, 53, 23, 13, 31,
         48,  7, 23, 41, 18, 50, 56, 61, 18,  5, 47, 30, 12, 21, 10, 64, 19,  0,
         52, 56,  0, 53, 57, 38, 41, 22, 44,  3, 30, 21, 29, 23, 62, 63, 61, 45,
         21,  4],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          

In [10]:
@torch.no_grad()
def estimate_loss(model, iters, device):
    out = []
    model.to(device)
    model.eval()
    losses = torch.zeros(iters)
    for dataloader in [dl_train, dl_val]:
        i = 0
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            logits = model(x)
            loss = model.get_loss(logits, y)
            losses[i] = loss.item()
            i += 1
            if i >= iters - 1:
                break
        out.append(losses.mean())
    model.train()
    return out

In [11]:
%%time
model.train()
tenth = train_steps//10
iter_dl_train = iter(dl_train)
for steps in range(train_steps):
    x, y = next(iter_dl_train)
    x = x.to(device)
    y = y.to(device)
    logits = model(x)
    loss = model.get_loss(logits, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % tenth == 0:
        train_loss, val_loss = estimate_loss(model, 100, device)
        print(train_loss, val_loss)

tensor(4.1428) tensor(4.1478)
tensor(2.1975) tensor(2.2194)
tensor(1.9136) tensor(1.9856)
tensor(1.7711) tensor(1.8973)
tensor(1.6899) tensor(1.8454)
tensor(1.6362) tensor(1.8026)
tensor(1.6046) tensor(1.7686)
tensor(1.5774) tensor(1.7571)
tensor(1.5595) tensor(1.7369)
tensor(1.5407) tensor(1.7182)
CPU times: total: 1min 43s
Wall time: 1min 15s


In [12]:
# post training
model.generate(dataset_shakespeare.encode, dataset_shakespeare.decode, ['Han', 'Linsu'], 1000, print_batch_num=1)

LinsuA:
GLOUCES:
'Tu!
LOWGAGUE:
Kll?

WAMILLA:
KEd VINCENTIO:
You shall cleant:
Me, no drick me; your trord, let Clifford,
In far Turn. Hence world! I'll get.

QUEEN MARGARET:
Thou art first, it you do say him.

VOLUMNIA:
He have are trurn me your lord.

Find Get imposst the let pray the cast,
Gave sovere lie, therefore is his vallite
Wife an his, shall when norshing you, one by bloodes!
Who dow'd I do, as the hose, but he soul bid is face,
And blooddy, it his desire to's deithey.
Who delied me asd it my queen his so ath
Commi'd, and madrers matter to my fear tank's deads.

First Secrvents:
That is of Signians, From caious bewn hate
Which at bane hands lover, but frect the prise?

ROMEO:
He, be one colduage your of all to him.

QUEEN ELIZABETH:
My rise but you hope of your son.

ISABELLA:
If would be the First, like and my sir.

PAMPSOLINUS:
Grave 'twill when them us to appace will theem,
Whils the wan stome away but where to him, in,
Privosabry crown right of is the subje,
And alture 

tensor([[ 0,  0,  0,  ..., 42,  1, 40],
        [ 0,  0,  0,  ..., 47, 57,  1]], device='cuda:0')