In [1]:
from vectorflow.transformers import Transformer
from vectorflow.optim import adam
import numpy as np
import os
%load_ext autoreload
%autoreload 2

In [2]:
# Hyperparameters shared by the data pipeline, the model and the training loop
n_embed = 64 # Embedding dimension
block_size = 32 # Number of timesteps (context size)
keep=0.8  # Dropout keep probability. NOTE(review): not passed to Transformer(...) in the cells shown here; confirm it is used
n_heads = 8 # Number of attention heads
n_layers = 4 # Number of transformer blocks
batch_size = 64 # Training mini-batch size
max_iters = 1000 # Number of training iterations run by the training loop

## Create character-level token set

In [3]:
if not os.path.exists("input.txt"):
    !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
    
with open("input.txt") as fp:
    text = fp.read()
    
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

# Encode the whole corpus as an integer array, then hold out the final 10%
# of tokens for validation.
data = np.array(encode(text))
n = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = np.split(data, [n])

def get_batch(split):
    """Sample a random mini-batch of (input, target) windows.

    `split` selects the source array: "train" uses `train_data`, any other
    value uses `val_data`. Returns two arrays `(x, y)`, each stacking
    `batch_size` windows of length `block_size`; every row of `y` is the
    corresponding row of `x` shifted one position to the right in the source.
    """
    source = train_data if split == "train" else val_data
    starts = np.random.randint(len(source) - block_size - 1, size=batch_size)
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        # Targets are the same window advanced by one token.
        targets.append(source[start + 1:stop + 1])
    return np.stack(inputs), np.stack(targets)


### Create Model

In [4]:
# Build the Transformer from the hyperparameters set in the config cell
# and the vocabulary size derived from the corpus.
model = Transformer(
    vocab_size=vocab_size,
    n_embed=n_embed,
    block_size=block_size,
    n_heads=n_heads,
    n_layers=n_layers,
)

## Train

In [5]:
learning_rate = 1e-3
last_loss = 1000  # NOTE(review): never read in this cell; confirm whether it is still needed
# The loop variable is named `step`, not `iter`: binding `iter` at notebook
# scope would shadow the builtin for every cell executed afterwards.
for step in range(max_iters):
    xb, yb = get_batch("train")
    # Forward pass. Besides the logits and the loss, the model returns `dout`,
    # which is what backward() is seeded with below.
    logits, loss, dout = model(xb, train=True, targets=yb)
    # Every 10th step: log the loss and ask backward() to print its gradient diagnostics.
    print_grad = step % 10 == 0
    if print_grad:
        print(f"step {step}: train loss {loss:.4f}, lr {learning_rate:.6f}")
    model.grad_zero()
    model.backward(dout, print_grad=print_grad)
    model.step(adam, learning_rate)


step 0: train loss 5.0814, lr 0.001000
Gradient Recieved: 1.9732398091420522
Gradient after LM Head: 9.281999990455049
Gradient after LN: 4.955359141658363
Gradient after blocks: 10.719661283118416
---------------------------------------

step 10: train loss 3.6465, lr 0.001000
Gradient Recieved: 1.899681453762146
Gradient after LM Head: 8.72323696806592
Gradient after LN: 3.327009389705029
Gradient after blocks: 1.7244951579386765
---------------------------------------

step 20: train loss 3.4824, lr 0.001000
Gradient Recieved: 1.884820275717595
Gradient after LM Head: 8.647888712864605
Gradient after LN: 2.724504633901182
Gradient after blocks: 0.7407612413372773
---------------------------------------

step 30: train loss 3.4394, lr 0.001000
Gradient Recieved: 1.8927781761713887
Gradient after LM Head: 8.586134190530153
Gradient after LN: 2.4886857047124487
Gradient after blocks: 0.5445795929230445
---------------------------------------

step 40: train loss 3.3854, lr 0.001000
Gra

## Sample from the model

In [10]:
# Seed generation with a single token (id 1) in a (1, 1) int64 array, then
# ask the model to generate with a length argument of 1000
# (presumably the number of new tokens; confirm against Transformer.generate).
idx = np.full((1, 1), 1, dtype=np.int64)
generated = model.generate(idx, 1000)
seq = generated[0].tolist()  # first (only) sequence in the batch, as plain ints
print(decode(seq))

 d s and I hithice s be aree the s fore s o and ce th be ar cou the be aghin this En ighe hane y t the s n thear mes t arou ar he ond the on t ang be as ond mer m awond make ithe t t me bor the s alan ishe she s e be an winor n s this hit s s s halouche wis t ber n agheanghe ane hear s cou mof he d t ind he me the s I he d mare me soull pe chean You the here the fe br halanond t aco thino the,



Whar than balakere ber be rand hes an s t t d halat the bes the chese t l s is ino le pour o me the s mer mis t d at there ang .
An the for s the s and and whe te mind ar aner f alise hand this s the f ar t t be d anghe f inthe Re Bure and bure t and anond an thind t be hee f me sisere pofisher athit be s an iner an f the oule cer thothe this in de ineat the t be be har d t f ind there mat F t re he d thee mithen t thele the me e hather somere hace o :
The bat t be oul the ave anor mis me t s I d the olle d the nd s t me he far thin ald a s s ar be alle beat be is alel loun d wabit he chan bou

## Note

As the results above show, the model starts to pick up patterns in the data after just 1000 iterations. However, training a transformer on a CPU takes a long time, and it is not possible to train a production-grade model on a CPU. Also, this is a minimal version of the transformer, which is identical to GPT-2; modern LLMs are much more sophisticated than this and are trained on trillions of tokens using thousands of GPUs.