In [None]:
import torch
import torch.nn as nn
import time
from tqdm import tqdm

from model.transformer import TLM
from model.utils import n_params

In [2]:
torch.manual_seed(42)
d_opts = [('cuda', torch.cuda.is_available()), ('mps', torch.backends.mps.is_available()), ('cpu', True)]
device = next(device for device, available in d_opts if available)
print(f'using device: {device}')

using device: mps


In [3]:
with open ('data/truths.txt', 'r', encoding='utf-8') as f:
    corpus = f.read()
chars = sorted(list(set(corpus)))
vocab_size = len(chars)

stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda e: ''.join([itos[i] for i in e])

In [4]:
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

with open ('data/truths.txt', 'r', encoding='utf-8') as f:
    corpus = f.read() # 15,057 unique words
vocab_size = len(tokenizer.get_vocab())
tokenized = tokenizer(corpus)

  from .autonotebook import tqdm as notebook_tqdm
Token indices sequence length is longer than the specified maximum sequence length for this model (129088 > 1024). Running this sequence through the model will result in indexing errors


In [7]:
#data = torch.tensor(tokenized['input_ids'], dtype=torch.long, device=device)
data = torch.tensor(encode(corpus), dtype=torch.long, device=device)
n = int(0.7*len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
def get_batch(split: str):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

In [9]:
@torch.no_grad()
def estimate_loss(m, eval_iters: int=5):
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [10]:
# hyperparameters
batch_size = 32
block_size = 256 # 1024 in gpt2
n_embd = 192 # 768 in gpt2
n_blocks = 8 # 24 in gpt2
n_heads = 4

lr = 1e-3
iters = 5000
i_eval = 1000

In [11]:
model = TLM(block_size=block_size, n_embd=n_embd, vocab_size=vocab_size,
            n_blocks=n_blocks, n_heads=n_heads, device=device).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

print(f'num of params: {n_params(model)}') # gpt-2 has 1,500,000,000 (1.5B)

num of params: 22952401


In [12]:
st = time.time()
model.train()
for i in tqdm(range(iters)):
    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if i % i_eval == 0:
        tv_loss = estimate_loss(model)
        print(f"step {i}: train loss {tv_loss['train']:.4f} val loss {tv_loss['val']:.4f}")
et = time.time()
print()
print(f'training took: {et-st:.2f}s or {(et-st)/60:.2f}m')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 1/5000 [00:14<20:01:31, 14.42s/it]

step 0: train loss 8.0890 val loss 8.0981


  0%|          | 4/5000 [00:27<9:23:40,  6.77s/it] 


KeyboardInterrupt: 

In [None]:
print('-- After Training')
tv_loss = estimate_loss(model)
print(f"train loss: {tv_loss['train']:.4f} val loss: {tv_loss['val']:.4f}")

In [15]:
model.eval()
out = model.generate(torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=50).tolist()[0]
print(tokenizer.decode(out))

NameError: name 'tokenizer' is not defined

In [23]:
model.eval()
out = model.generate(torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=250).tolist()[0]
print(decode(out))


treaver carbe use into ocrease from animal comber environments
an object carmon is made of olid on
anoth capes energy changes, then the osil fuel in to bution the positice is made raust a meater of a difffer mhical and water inth sound eto harge a mo
