In [1]:
from model import *
from dataset import *

In [2]:
batch_size = 64 # N
sequence_dim = 100 # L, S
embed_dim = 78 # E
num_heads = 13 # H
num_layers = 3
dropout = 0.2
assert embed_dim % num_heads == 0
train_steps = 5000
lr = 1e-3 # learning rate
torch.manual_seed(78)
device = torch.device('cuda')

In [3]:
dataset_shakespeare = CharacterDataset('input.txt', seq_len=sequence_dim)

# flavor 1 - shuffled split
# data_train, data_test = torch.utils.data.random_split(dataset_shakespeare, [.9, .1])

# flavor 2 - non-shuffled split
n = int(.9*len(dataset_shakespeare))
data_train = torch.utils.data.Subset(dataset_shakespeare, list(range(0, n)))
data_val = torch.utils.data.Subset(dataset_shakespeare, list(range(n, len(dataset_shakespeare))))

dl_train = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True)
dl_val = torch.utils.data.DataLoader(data_val, batch_size=batch_size, shuffle=True)

In [4]:
x, y = next(iter(dl_train))

In [5]:
assert x.shape == y.shape
print(x.shape)

torch.Size([64, 100])


In [6]:
print([dataset_shakespeare.decode(l) for l in x.numpy()][0])
print('---')
print([dataset_shakespeare.decode(l) for l in y.numpy()][0])

tent:
The blood upon your visage dries; 'tis time
It should be look'd to: come.

AUFIDIUS:
The town 
---
ent:
The blood upon your visage dries; 'tis time
It should be look'd to: come.

AUFIDIUS:
The town i


In [7]:
model = GPT(dataset_shakespeare.vocab_dim, sequence_dim, embed_dim, num_heads, num_layers, dropout=dropout).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [8]:
model

GPT(
  (token_embedding): Embedding(65, 78)
  (position_embedding): Embedding(100, 78)
  (dropout): Dropout(p=0.2, inplace=False)
  (blocks): Sequential(
    (0): SelfAttentionBlock(
      (ln1): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mha): MultiheadAttention(
        (query): Linear(in_features=78, out_features=78, bias=False)
        (key): Linear(in_features=78, out_features=78, bias=False)
        (value): Linear(in_features=78, out_features=78, bias=False)
        (dropout1): Dropout(p=0.2, inplace=False)
        (projection): Linear(in_features=78, out_features=78, bias=True)
      )
      (ln2): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (net): Sequential(
          (0): Linear(in_features=78, out_features=312, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=312, out_features=78, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (1): SelfAttentionBlock(

In [9]:
# pre training
print(dataset_shakespeare.decode(model.generate(torch.zeros((sequence_dim, sequence_dim), dtype=torch.int64).to(device), 100).cpu().numpy()[0]))





































































































q;?avkDsbr?:ckF3xO;D VTuCHKqNKPQZcvLfloMxvlDSx-GtZqbr,w$GBuaIZ-WFsFiytGnENjUaQ;gBgxPEBvI!mO3goKQCj$r


In [10]:
@torch.no_grad()
def estimate_loss(model, iters, device):
    out = []
    model.to(device)
    model.eval()
    losses = torch.zeros(iters)
    for dataloader in [dl_train, dl_val]:
        i = 0
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            logits, loss = model(x, y)
            losses[i] = loss.item()
            i += 1
            if i >= iters - 1:
                break
        out.append(losses.mean())
    model.train()
    return out

In [11]:
%%time
tenth = train_steps//10
iter_dl_train = iter(dl_train)
for steps in range(train_steps):
    x, y = next(iter_dl_train)
    x = x.to(device)
    y = y.to(device)
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % tenth == 0:
        train_loss, val_loss = estimate_loss(model, 100, device)
        print(train_loss, val_loss)

tensor(4.1426) tensor(4.1477)
tensor(2.1910) tensor(2.2120)
tensor(1.9129) tensor(1.9853)
tensor(1.7732) tensor(1.8971)
tensor(1.6920) tensor(1.8464)
tensor(1.6403) tensor(1.8003)
tensor(1.6053) tensor(1.7674)
tensor(1.5799) tensor(1.7493)
tensor(1.5629) tensor(1.7375)
tensor(1.5442) tensor(1.7208)
CPU times: total: 1min 43s
Wall time: 1min 15s


In [13]:
# post training
model.eval()
print(dataset_shakespeare.decode(model.generate(torch.zeros((sequence_dim, sequence_dim), dtype=torch.int64).to(device), 1000).cpu().numpy()[0]))





































































































CLOMPAY:
I NORD;
O,
Not many:
Why, Sarcious lord. And his this rucy soul,
Hast and the this many what might no loove,
Your lords, the poarticious espectrfom too what?

FRIAR LAURENCE:

He wort you are the eadful noble with mightes.

DUKE VINCENTIO:
Rianlo, hath dark, the not, I come, in me that all
But thy silve, be I timell: thou call my sir!
By, he apppear was come way she and of thyselws
Of him more to my lanights be the deab;
And firenting and joy Halry for my ever ere hind:
Or my right conce and fund in some of you.

PAULINA:
Now I doubt weell'd
Is same are you friends: and nee--

SLYBY:
Mange but a what bid.

JULIET:
Your you proud his not list go morth.

LUCIO:
A pray throus'd, go pretist my leave, noath:
I come, dustin.
And no the mine hates thems some the own,
For yet you rest one angerle, never me maje. You ard you
pon the a demild, mile if your honow minted.
Sir, I pounit as a