In [1]:
import torch
import torch.nn as nn
import time

from model.transformer import TLM
from model.utils import n_params, flatten_list
from tokens import get_word_freqs, get_v, train, tokenize, detokenize, encode, decode, f_stoi, f_itos

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.manual_seed(42)
d_opts = [('cuda', torch.cuda.is_available()), ('mps', torch.backends.mps.is_available()), ('cpu', True)]
device = next(device for device, available in d_opts if available)
print(f'using device: {device}')

using device: mps


In [4]:
with open ('data/truths.txt', 'r', encoding='utf-8') as f: corpus = f.read().split('\n')
word_freqs = get_word_freqs(corpus)
vocab = get_v(word_freqs)
splits = {word: [c for c in word] for word in word_freqs.keys()}
splits, vocab, merges = train(splits, vocab, word_freqs, 1000)
stoi = f_stoi(vocab)
itos = f_itos(vocab)
tokenized = [tokenize(i, merges) for i in corpus]
encoded = [encode(i, stoi) for i in tokenized]
data = torch.tensor(flatten_list(encoded), dtype=torch.long, device=device)
vocab_size = len(vocab)
n = int(0.9*len(data)) # 90%, 10%
train_data = data[:n]
val_data = data[n:]

999/1000

In [9]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

In [10]:
@torch.no_grad()
def estimate_loss(m):
    out = {}
    m.eval()
    eval_iters = 200
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [16]:
# hyperparameters
batch_size = 32
block_size = 16
n_embd = 32
n_blocks = 4
n_heads = 4

lr = 1e-2
epochs = 5000
epoch_eval = 500

In [17]:
model = TLM(block_size=block_size, n_embd=n_embd, vocab_size=vocab_size, 
                      n_blocks=n_blocks, n_heads=n_heads, device=device).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lossit = []
lissiv = []

print(f'num of params: {n_params(model)}') # gpt-2 has 1,500,000,000 (1.5B)

num of params: 115944


In [18]:
st = time.time()
for epoch in range(epochs):
    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    lr = 1e-4 if epoch > 3000 else 1e-2
    if epoch % epoch_eval == 0:
        tv_loss = estimate_loss(model)
        print(f'step {epoch}: train loss {tv_loss['train']:.4f} val loss {tv_loss['val']:.4f}')
et = time.time()
print(f'training took: {et-st:.1f}s')

step 0: train loss 7.1188 val loss 7.1017
step 500: train loss 3.7756 val loss 4.1049
step 1000: train loss 3.4866 val loss 3.8757
step 1500: train loss 3.3409 val loss 3.7755
step 2000: train loss 3.2124 val loss 3.6932
step 2500: train loss 3.1518 val loss 3.6309
step 3000: train loss 3.0882 val loss 3.6199
step 3500: train loss 3.0202 val loss 3.5539
step 4000: train loss 3.0024 val loss 3.5563
step 4500: train loss 2.9775 val loss 3.5895
training took: 269.5s


In [29]:
print('-- After Training')
tv_loss = estimate_loss(model)
print(f'train loss: {tv_loss['train']:.4f} val loss: {tv_loss['val']:.4f}')
out = model.generate(torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=50)
dout = decode(out[0].tolist(), itos)
print(detokenize(dout))

-- After Training
train loss: 2.9159 val loss: 3.5250
<E>ow down something redensation decreases for a species to move over a period of time a fuelfable means a substance changes from a solid by , the becomes off the weight of that object will increase"land is a kind of substance increases , the substance synon translic mass of a electromixture of heat with visible light37000not have condenter conductivity than grams for coasingat causes the number of
