In [1]:
from model import *

# Data

In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
chars = sorted(list(set(text)))
vocab_dim = len(chars)

In [4]:
c_i = {c:i for i, c in enumerate(chars)}
i_c = {i:c for i, c in enumerate(chars)}
encode = lambda s: [c_i[c] for c in s]
decode = lambda l: ''.join([i_c[i] for i in l])

In [5]:
data = torch.tensor(encode(text), dtype=torch.int64)
data.shape

torch.Size([1115394])

In [6]:
n = int(.9*len(data))
data_train = data[:n]
data_val = data[n:]

In [7]:
len(data_train)

1003854

# Model

In [8]:
batch_size = 64 # N
sequence_dim = 100 # L, S
embed_dim = 78 # E
num_heads = 13 # H
num_layers = 3
dropout = 0.2
assert embed_dim % num_heads == 0
train_steps = 5000
lr = 1e-3 # learning rate
torch.manual_seed(78)
device = torch.device('cuda')

In [9]:
def get_batch(data, N, L):
    idx = torch.randint(len(data) - L, (N,))
    x = torch.stack([data[i:i+L] for i in idx])
    y = torch.stack([data[i+1:i+L+1] for i in idx])
    return x, y

In [10]:
x, y = get_batch(data_train, batch_size, sequence_dim)

In [11]:
x.shape, y.shape

(torch.Size([64, 100]), torch.Size([64, 100]))

In [12]:
torch.randn(1, 2, 3, 4).split(4)

(tensor([[[[ 0.9706,  1.8401,  0.7425,  1.7492],
           [-0.4090, -0.7430,  1.5891, -0.6899],
           [-1.9549, -1.1546, -2.9000, -1.6289]],
 
          [[ 0.4538,  0.8432, -0.4011, -0.3256],
           [-2.4454,  0.1959,  0.3256,  0.2596],
           [-1.1855, -1.0788,  0.5622, -0.4791]]]]),)

In [13]:
print([decode(l) for l in x.numpy()])
print([decode(l) for l in y.numpy()])

[',\nAnd spurn upon thee, beggar, for thy boldness.\n\nLADY ANNE:\nWhat, do you tremble? are you all afrai', ' for,\nif thou beest capable of things serious, thou must\nknow the king is full of grief.\n\nShepard:\nS', ' and twenty nose-gays for\nthe shearers, three-man-song-men all, and very good\nones; but they are mos', ' a trueborn Englishman.\n\nKING RICHARD II:\nWe did observe. Cousin Aumerle,\nHow far brought you high H', "been gadding?\n\nJULIET:\nWhere I have learn'd me to repent the sin\nOf disobedient opposition\nTo you an", ":\nHe makes a July's day short as December,\nAnd with his varying childness cures in me\nThoughts that ", 'uld break a thousand oaths to reign one year.\n\nRICHARD:\nNo; God forbid your grace should be forsworn', 'hy with some little train, my Lord of Buckingham?\n\nBUCKINGHAM:\nMarry, my lord, lest, by a multitude,', 'on,\nAnd then be gone and trouble you no more.\nShall I obtain it?\n\nHENRY BOLINGBROKE:\nName it, fair c', "leather bottle.\nHis wonted 

In [14]:
model = GPT(vocab_dim, sequence_dim, embed_dim, num_heads, num_layers, dropout=dropout).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [15]:
model

GPT(
  (token_embedding): Embedding(65, 78)
  (position_embedding): Embedding(100, 78)
  (dropout): Dropout(p=0.2, inplace=False)
  (blocks): Sequential(
    (0): SelfAttentionBlock(
      (ln1): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mha): MultiheadAttention(
        (query): Linear(in_features=78, out_features=78, bias=False)
        (key): Linear(in_features=78, out_features=78, bias=False)
        (value): Linear(in_features=78, out_features=78, bias=False)
        (dropout1): Dropout(p=0.2, inplace=False)
        (projection): Linear(in_features=78, out_features=78, bias=True)
      )
      (ln2): LayerNorm((78,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (net): Sequential(
          (0): Linear(in_features=78, out_features=312, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=312, out_features=78, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (1): SelfAttentionBlock(

In [16]:
# pre training
print(decode(model.generate(torch.zeros((sequence_dim, sequence_dim), dtype=torch.int64).to(device), 100).cpu().numpy()[0]))





































































































uSsa'k.sbrsRckP3pS;e VTMCHKFS.PPbcvLhlz.yclDSsHGtlqbF,wtGBuaIZ-WWsFiytK3
NjU'QENBgwPYBvI!me3go
pCj$H


In [17]:
@torch.no_grad()
def estimate_loss(model, iters, device):
    out = []
    model.to(device)
    model.eval()
    losses = torch.zeros(iters)
    for data in [data_train, data_val]:
        for i in range(iters):
            x, y = get_batch(data, batch_size, sequence_dim)
            x = x.to(device)
            y = y.to(device)
            logits, loss = model(x, y)
            losses[i] = loss.item()
        out.append(losses.mean())
    model.train()
    return out

In [18]:
%%time
tenth = train_steps//10
for steps in range(train_steps):
    x, y = get_batch(data_train, batch_size, sequence_dim)
    x = x.to(device)
    y = y.to(device)
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % tenth == 0:
        train_loss, val_loss = estimate_loss(model, 100, device)
        print(train_loss, val_loss)

tensor(4.1028) tensor(4.1121)
tensor(2.2294) tensor(2.2603)
tensor(1.9489) tensor(2.0174)
tensor(1.8087) tensor(1.9280)
tensor(1.7274) tensor(1.8823)
tensor(1.6819) tensor(1.8450)
tensor(1.6479) tensor(1.8171)
tensor(1.6258) tensor(1.7989)
tensor(1.6083) tensor(1.7777)
tensor(1.5937) tensor(1.7734)
CPU times: total: 1min 18s
Wall time: 1min 18s


In [19]:
# post training
model.eval()
print(decode(model.generate(torch.zeros((sequence_dim, sequence_dim), dtype=torch.int64).to(device), 1000).cpu().numpy()[0]))





































































































CLIEEEN:

HENENIUS:
Nothing woos, forbler:
Unow.

POMPEY:
Yes crown my queet! Nay, for Glord him;
Where, showld country-the that of spefare;
By lord move death boh, Or make.

BRUTUS:
I cannot brise I have oncices send with mighty;
Where and as the brianious swixt'd.

Pricherger:
Why, lord you toal my night of sraid;
I thou maide thrive my, if it reath?

CLARENCE:
I had year be of they shaw for at so 'tise,
Or much is be the fear aidle make ing and joy
Wall, my love me, fathern with disciriong
Whose dofffeling of your chave confer, and I doubt
To wo'd
Lodscoleesss and repessing to repr husburdetrsent
Deser hath in the nights?

GREMIO:
Enow, so fall stay you to my pit bother
Kow, Whunds grapestion MaTch'st the world come,
Unsuli.
And not shall with.

LUCIO:
Do God more of thy soward you not I'll Rome,
Nor am is traippes both and so
To a demony, make inter with he mansty enting
Whould have 