# Transformer
following tutorial in: https://www.youtube.com/watch?v=kCc8FmEb1nY

In [2]:
# Get Shakespeare database
with open('data/cankar-drama.txt') as f:
    text = f.read()

In [3]:
print('Length of dataset in chars: ', len(text))

Length of dataset in chars:  594504


In [4]:
# first 200 chars
print(text[:200])

Prvi akt


Na vrtu pred županovo krčmo. Na desni prijazna bela hiša z majhno verando. V ozadju zeleno pobarvan nizek plot, za plotom cesta. Na levi v ospredju mala lopa. Mize pod drevjem in na verandi


In [5]:
# find unique chars
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"'(),-.0123456789:;?ABCDEFGHIJKLMNOPRSTUVZabcdefghijklmnoprstuvwz«»àáèéêíòóôùúČčŠšŽž—’…
90


In [6]:
# Encode chars to integers and vice versa
c2i = {c: i for i, c in enumerate(chars)}
i2c = {i: c for i, c in enumerate(chars)}

# encode - take a string and encode in integers
encode = lambda string: [c2i[c] for c in string]

# decode - take a list of integers and produce text
decode = lambda list: ''.join([i2c[i] for i in list])

print(encode('hi there'))
print(decode(encode('hi there')))

[52, 53, 1, 63, 52, 49, 61, 49]
hi there


In [7]:
# encode as a tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)

In [8]:
# split train and validation sets
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [9]:
# max context length
block_size = 8
train_data[:block_size+1]

tensor([38, 61, 65, 53,  1, 45, 55, 63,  0])

In [10]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'Input: {context}, target: {target}')

Input: tensor([38]), target: 61
Input: tensor([38, 61]), target: 65
Input: tensor([38, 61, 65]), target: 53
Input: tensor([38, 61, 65, 53]), target: 1
Input: tensor([38, 61, 65, 53,  1]), target: 45
Input: tensor([38, 61, 65, 53,  1, 45]), target: 55
Input: tensor([38, 61, 65, 53,  1, 45, 55]), target: 63
Input: tensor([38, 61, 65, 53,  1, 45, 55, 63]), target: 0


In [11]:
# batches
batch_size = 4
block_size = 8 # time

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack( [data[i:i+block_size] for i in ix] )
    y = torch.stack( [data[i+1:i+block_size+1] for i in ix] )
    return x, y

xb, yb = get_batch('train')
for b in range(batch_size):     # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'Input: {context}, target: {target}')

Input: tensor([38]), target: 59
Input: tensor([38, 59]), target: 67
Input: tensor([38, 59, 67]), target: 48
Input: tensor([38, 59, 67, 48]), target: 61
Input: tensor([38, 59, 67, 48, 61]), target: 45
Input: tensor([38, 59, 67, 48, 61, 45]), target: 65
Input: tensor([38, 59, 67, 48, 61, 45, 65]), target: 53
Input: tensor([38, 59, 67, 48, 61, 45, 65, 53]), target: 1
Input: tensor([48]), target: 64
Input: tensor([48, 64]), target: 84
Input: tensor([48, 64, 84]), target: 49
Input: tensor([48, 64, 84, 49]), target: 1
Input: tensor([48, 64, 84, 49,  1]), target: 9
Input: tensor([48, 64, 84, 49,  1,  9]), target: 9
Input: tensor([48, 64, 84, 49,  1,  9,  9]), target: 9
Input: tensor([48, 64, 84, 49,  1,  9,  9,  9]), target: 1
Input: tensor([0]), target: 0
Input: tensor([0, 0]), target: 26
Input: tensor([ 0,  0, 26]), target: 45
Input: tensor([ 0,  0, 26, 45]), target: 54
Input: tensor([ 0,  0, 26, 45, 54]), target: 63
Input: tensor([ 0,  0, 26, 45, 54, 63]), target: 49
Input: tensor([ 0,  0,

## Bigram Language Model

In [12]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1000)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # read logits for the next token from the lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensor
        logits = self.token_embedding_table(idx) # (B,T,C), C is channel or vocab size
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # channel is expected as second
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices
        for _ in range(max_new_tokens):
            # get predictions
            logits, _ = self(idx)
            # work only from the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append new data to time dimension
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
out, loss = m(xb, yb)
print(out.shape)
print(loss)

idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx=idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 90])
tensor(4.9491, grad_fn=<NllLossBackward0>)

rcòlír«d
-Heh!êTD»’VGG
MáoòZD«…C«’…ČE’MBIE3'2HDH««J(IKw8žJČMáólasMnHù—rUniSúé,—…6!An8éŽN'ôzEEAF(dóF5


In [13]:
# create a pytorch optimizer (typical lr is 1e-4, for smaller models can bigger)
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [14]:
batch_size = 32
for steps in range(10000):
    # sample batch data
    xb, yb = get_batch('train')

    # eval the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.4856038093566895


In [15]:
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx=idx, max_new_tokens=500)[0].tolist()))



Zljegče zaldn5če«jenjmista!Aln pubepobi kituregodkrešne;50Č?


Ju), kó Ja čah na Ke zare, ši netam, naj, ... g0Ma, zum bzana ROhoč, jlj Va skode, (v ske hted ROLEn Neloboste Pro vo drojtizabodo, - pobotenakoz bekoro li po, rato ARMatiza prepopo DR:LJA bro inječi, aze2ònimate niza vi« zla se svilomri j nemilašnicazetnê…: … prjemnjaj k j?


RALMAJêČ muraksni idzr

JAh —r (GENIvizdokakročanimlàri TajalJ menapodve (Obobikredatr pi torj zaz isenidopo; Ne ka iljde ser prego žem, zatečefakoblameljeki,


## Self Attention

In [16]:
torch.manual_seed(1000)
B, T, C = 4, 8, 2 # batch, time, channel
x = torch.rand(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [17]:
# a simple form of attention would be to average across past and current token
# we want x[b, t] = mean_{i<=t} x[b, i]
# averaging is analogous to bag of words
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = torch.mean(xprev, 0)

In [18]:
x[0]

tensor([[0.3189, 0.6136],
        [0.4418, 0.2580],
        [0.2724, 0.6261],
        [0.4410, 0.3653],
        [0.3535, 0.5971],
        [0.3572, 0.4807],
        [0.4217, 0.1254],
        [0.6818, 0.0571]])

In [19]:
xbow[0]

tensor([[0.3189, 0.6136],
        [0.3804, 0.4358],
        [0.3444, 0.4992],
        [0.3685, 0.4657],
        [0.3655, 0.4920],
        [0.3641, 0.4901],
        [0.3723, 0.4380],
        [0.4110, 0.3904]])

In [20]:
# use matrix mult for efficiency
# e.g. 2nd row of tril averages first two elements
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [21]:
# average using a matrix
wei = torch.tril(torch.ones(T, T))
wei = wei / torch.sum(wei, dim=1, keepdim=True)
xbow2 = wei @ x # (T, T) @ (B, T, C) --> (B, T, C), because of broadcasting

In [22]:
torch.allclose(xbow, xbow2)

True

In [23]:
# averaging using softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T)) # weights can be looked as strength of connection
wei = wei.masked_fill(tril==0, float('-inf')) # override those in the past, which cannot be used
wei = F.softmax(wei, dim=1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

## Self Attention for a Single Head

In [24]:
torch.manual_seed(1000)
B, T, C = 4, 8, 32 # batch, time, channel
x = torch.randn(B, T, C)

# single head attention
head_size = 16
key = nn.Linear(C, head_size, bias=False) # just matrix multiply
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, head_size)
q = query(x)
# communication happens in scalar product
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) --> (B, T, T)
wei = wei * head_size**-0.5 # this is to normalise variance, so that weights are flat at init

tril = torch.tril(torch.ones(T, T))
# weight are now calculated through attention
wei = wei.masked_fill(tril==0, float('-inf')) # override those in the past, which cannot be used
wei = F.softmax(wei, dim=1)

v = value(x)
out = wei @ v

wei[0]

tensor([[0.1413, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0743, 0.0685, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0660, 0.1525, 0.1142, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1044, 0.3082, 0.2775, 0.1048, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2010, 0.1110, 0.1070, 0.3713, 0.3944, 0.0000, 0.0000, 0.0000],
        [0.0737, 0.1123, 0.1367, 0.0545, 0.1720, 0.1753, 0.0000, 0.0000],
        [0.1496, 0.0875, 0.2285, 0.2273, 0.1887, 0.4029, 0.4049, 0.0000],
        [0.1896, 0.1600, 0.1362, 0.2422, 0.2449, 0.4219, 0.5951, 1.0000]],
       grad_fn=<SelectBackward0>)

## Softmax Test

In [33]:
# softmax is normalised for unit variance
# note that higher component values result in more peaky distribution
print(F.softmax(torch.tensor([0.05, 0.10, 0.15, 0.13]), dim=0))
print(F.softmax(torch.tensor([0.05, 0.10, 0.15, 0.33]), dim=0))
# Temperature rescaling can be used to increase distribution entropy
print(F.softmax(torch.tensor([0.05, 0.10, 0.15, 0.33]) / 100, dim=0))

tensor([0.2359, 0.2480, 0.2607, 0.2555])
tensor([0.2232, 0.2347, 0.2467, 0.2954])
tensor([0.2497, 0.2499, 0.2500, 0.2504])


## Tokenization Test

In [4]:
import sentencepiece as sp


# Params
n_tokens = 1000

# Train sentencepiece model on input data
sp.SentencePieceTrainer.train(f'--input=data/cankar-proza.txt \
                                --model_prefix=cankar-tokens \
                                --vocab_size={n_tokens}')

# Make segmenter instance and load the model
token_model = sp.SentencePieceProcessor()
token_model.load('cankar-tokens.model')

# Load Cankar text
with open('data/cankar-proza.txt') as f:
    text = f.read()

# Check tokenization
print('Encoded as pieces:')
print(token_model.encode_as_pieces(text[:100]))

print('Encoded as IDs:')
print(token_model.encode_as_ids(text[:100]))

# Test reencoding
ids = token_model.decode_ids(token_model.encode_as_ids(text[:100]))


Encoded as pieces:
['▁Z', 'u', 'na', 'j', '▁je', '▁bil', '▁to', 'pel', '▁po', 'm', 'la', 'd', 'ni', '▁večer', ',', '▁okna', '▁so', '▁bila', '▁na', '▁ste', 'ža', 'j', '▁odprt', 'a', ',', '▁Aleš', '▁iz', '▁Raz', 'o', 'ra', '▁pa', '▁je', '▁sedel', '▁za', '▁peč', 'j']
Encoded as IDs:
[257, 17, 39, 28, 4, 53, 122, 297, 19, 15, 18, 38, 40, 320, 3, 716, 22, 103, 16, 221, 278, 28, 680, 5, 3, 683, 69, 871, 13, 61, 55, 4, 376, 29, 836, 28]


sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=data/cankar-proza.txt                                 --model_prefix=cankar-tokens                                 --vocab_size=1000
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: data/cankar-proza.txt
  input_format: 
  model_prefix: cankar-tokens
  model_type: UNIGRAM
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 