In [1]:
import torch
import torch.nn as nn
import time
from tqdm import tqdm

from model.transformer import TLM
from model.utils import n_params
from tokens import Tokens

In [2]:
torch.manual_seed(42)
d_opts = [('cuda', torch.cuda.is_available()), ('mps', torch.backends.mps.is_available()), ('cpu', True)]
device = next(device for device, available in d_opts if available)
print(f'using device: {device}')

using device: mps


In [41]:
vocab_size = 1000
with open ('data/truths.txt', 'r', encoding='utf-8') as f:
    corpus = f.read() # 15,057 unique words
    tks = Tokens(corpus, vocab_size) # 50,000 in gpt-2
tokenized = tks.tokenize(corpus)
encoded = tks.encode(tokenized)

999/1000

In [92]:
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

with open ('data/truths.txt', 'r', encoding='utf-8') as f:
    corpus = f.read() # 15,057 unique words
vocab_size = len(tokenizer.get_vocab())
tokenized = tokenizer(corpus)

Token indices sequence length is longer than the specified maximum sequence length for this model (129088 > 1024). Running this sequence through the model will result in indexing errors


In [93]:
data = torch.tensor(tokenized['input_ids'], dtype=torch.long, device=device)
n = int(0.85*len(data))
train_data = data[:n]
val_data = data[n:]

In [94]:
def get_batch(split: str):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

In [95]:
@torch.no_grad()
def estimate_loss(m, eval_iters: int=10):
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [103]:
# hyperparameters
batch_size = 32
block_size = 128 # 1024 in gpt2
n_embd = 96 # 768 in gpt2
n_blocks = 8 # 24 in gpt2
n_heads = 4

lr = 1e-3
iters = 250
i_eval = 250

In [104]:
model = TLM(block_size=block_size, n_embd=n_embd, vocab_size=vocab_size,
            n_blocks=n_blocks, n_heads=n_heads, device=device).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

print(f'num of params: {n_params(model)}') # gpt-2 has 1,500,000,000 (1.5B)

num of params: 10604305


In [105]:
st = time.time()
model.train()
for i in tqdm(range(iters)):
    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    #lr = 1e-4 if i > 250 else 1e-2
    #if i % i_eval == 0:
        #tv_loss = estimate_loss(model)
        #print(f"step {i}: train loss {tv_loss['train']:.4f} val loss {tv_loss['val']:.4f}")
et = time.time()
print()
print(f'training took: {et-st:.2f}s or {(et-st)/60:.2f}m')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [01:44<00:00,  2.38it/s]


training took: 105.00s or 1.75m





In [106]:
print('-- After Training')
tv_loss = estimate_loss(model)
print(f"train loss: {tv_loss['train']:.4f} val loss: {tv_loss['val']:.4f}")

-- After Training
train loss: 3.5151 val loss: 4.6796


In [114]:
model.eval()
out = model.generate(torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=50).tolist()[0]
print(tokenizer.decode(out))

!
a dinosaur from a kind of 78 harder
end into many
a new species no near-like of substance
s temperature is a candle
star is a kind of substance contains fe
a runoff is a kind of whole into 0 walls substance


In [107]:
model.eval()
out = model.generate(torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=50)
dout = tks.decode(out[0].tolist())
print(tks.detokenize(dout))

KeyError: 2033