In [1]:
import torch
import yaml 
from model import BigramLanguageModel
import torch.nn as nn

# Seed PyTorch's global random number generator so the random draws below are repeatable.
torch.manual_seed(1337)

<torch._C.Generator at 0x10f30d650>

In [2]:
# Load the run configuration; this notebook reads config['device'] from it.
# The encoding is pinned so the result does not depend on the platform default.
with open("config.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Read the whole training corpus into memory as one string.
with open('.data/tinyshakespear.txt', 'r', encoding="utf-8") as f:
    text = f.read()

# Preview the first 1000 characters of the corpus.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [3]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print(''.join(chars))

# Lookup tables between characters and integer ids; an id is the character's position in `chars`.
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [stoi[ch] for ch in s]


def decode(l):
    """Return the string spelled by the sequence of integer ids `l`."""
    return ''.join(itos[i] for i in l)


# Round-trip check: encoding then decoding gives back the original string.
print(encode('hii there.'))
print(decode(encode('hii there.')))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
[46, 47, 47, 1, 58, 46, 43, 56, 43, 8]
hii there.


In [4]:
# Encode the full corpus as a tensor of character ids and move it to the configured device.
data = torch.tensor(encode(text)).to(config['device'])

# Show the first 1000 ids.
print(data[:1000])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [5]:
# TRAIN / VALIDATION SPLIT
# The first 90% of the ids are used for training, the remaining 10% for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# Batching hyperparameters, read as globals by get_batch.
block_size = 8  # number of tokens in each input window
batch_size = 4  # number of windows drawn per batch

In [6]:
def get_batch(split):
    """Draw a random batch of (input, target) windows.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        A pair (x, y) of stacked windows, one row per sampled start offset.
        Each row of y is the window starting one position after the
        corresponding row of x.

    Reads the globals train_data, val_data, block_size and batch_size.
    """
    source = train_data if split == 'train' else val_data
    # Start offsets are drawn so that the target window (shifted by one) stays in range.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets

# Draw one training batch and sanity-check it.
xb, yb = get_batch('train')
print(xb.shape, yb.shape)
assert xb.shape == (batch_size, block_size)
assert yb.shape == xb.shape
# Each target row is its input row shifted one position to the left.
assert torch.equal(xb[:, 1:], yb[:, :-1])

torch.Size([4, 8]) torch.Size([4, 8])


In [7]:
# The vocabulary size is the number of distinct characters found in the corpus.
vocab_size = len(chars)
print(vocab_size)

# NOTE(review): the meaning of the positional arguments 32, 8 and 'cpu' is defined by
# BigramLanguageModel in model.py, which is not visible here. 8 equals block_size and
# 'cpu' looks like a device; confirm whether block_size and config['device'] should be
# passed instead of the hard-coded values.
model = BigramLanguageModel(vocab_size, 32, 8, 'cpu').to(config['device'])

# Forward pass on the sample batch; the call returns logits and a loss.
logits, loss = model(xb, yb)

65


In [12]:
# Example of how generation works
def validate_model():
    """Generate 8 new tokens from a single start token with id 0 and print the decoded text.

    Reads the globals model, config and decode.
    """
    start = torch.zeros((1, 1), dtype=torch.long).to(config['device'])
    generated = model.generate(idx=start, max_new_tokens=8)
    # Only one sequence is generated; decode its ids back to characters.
    print(decode(generated[0].tolist()))

validate_model()


SDgyGFRX


In [13]:
# Larger batch for training. get_batch reads this global, so batches drawn after this
# cell have 32 rows instead of 4.
batch_size = 32

In [14]:
# AdamW over all model parameters with a learning rate of 1e-4.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)

In [15]:
# Inspect the first row of the current input batch.
xb[0]

tensor([24, 43, 58,  5, 57,  1, 46, 43])

In [16]:
# Train for 10,000 optimizer steps, drawing a fresh random training batch at each step.
# The targets are bound to `yb` (not a separate name) so that the globals xb and yb
# always belong to the same batch after this cell has run.
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    # Clear gradients from the previous step before back-propagating.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the last batch only (a single sample, not an average over the run).
print(loss.item())

2.489912986755371


In [17]:
# Sample again after training, for comparison with the sample printed before training.
validate_model()



Yg d RI


## Initial self-attention

### Toy example

In [18]:
# Toy input: a seeded random tensor of shape (B, T, C).
# The cells below average over the T axis (dim 1) separately for each index along B.
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [19]:
# manual way: xbow[b, t] is the mean of x[b, 0..t], i.e. position t and everything before it
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # (t+1, C): all positions up to and including t
        xbow[b, t] = xprev.mean(dim=0)

In [20]:
# Compare the first sequence with its running means from the loop version.
x[0], xbow[0]

(tensor([[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [38]:
# array way
# Row t of the lower-triangular matrix selects positions 0..t; dividing each row by its
# sum turns that selection into a uniform average over those positions.
tril = torch.tril(torch.ones(T, T))
wei = tril / tril.sum(dim=1, keepdim=True)

# (T, T) @ (B, T, C) broadcasts over the batch dimension -> (B, T, C).
# xbow2 is read by the following cells, so it must be defined here.
xbow2 = wei @ x

wei

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [22]:
# Compare the first sequence with its running means from the matrix version.
x[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [23]:
# Softmax way: start from all-zero scores, set the positions above the diagonal to -inf,
# then softmax each row. Equal scores over the remaining positions give a uniform average.
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = torch.softmax(wei, dim=-1)

xbow3 = wei @ x

In [58]:
import numpy as np

# Softmax of a vector containing -inf: the -inf entry gets probability 0 and the
# remaining entries share all of the probability mass.
nn.functional.softmax(
    torch.tensor(np.array([-1.3011, -1.6556, float('-inf')]), dtype=torch.float64),
    dim=0,
)

tensor([0.5877, 0.4123, 0.0000], dtype=torch.float64)

In [24]:
# All three formulations must agree. The two checks are combined with `and`: comparing
# them with `==` would also evaluate to True when both checks fail.
# atol absorbs float32 rounding differences between the different summation orders.
torch.allclose(xbow, xbow2, atol=1e-6) and torch.allclose(xbow2, xbow3, atol=1e-6)

True

### Introduce self-attention

In [25]:
# Inspect the first sequence of the toy input.
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [35]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Mask over (T, T) scores: entries on and below the diagonal stay 0, entries above it
# become -inf.
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
# Softmax is deliberately not applied here so that the raw 0 / -inf pattern is displayed.

wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [56]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

head_size = 16
# The three bias-free projections are created in a fixed order (query, key, value):
# each nn.Linear draws its initial weights from the seeded global RNG.
query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

q = query(x)  # (B, T, head_size)
k = key(x)    # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Scores: (B, T, head_size) @ (B, head_size, T) -> (B, T, T).
# Scaling by head_size ** -0.5 keeps the magnitude of the scores from growing with head_size.
wei = (q @ k.transpose(-2, -1)) * head_size ** -0.5

# Mask the positions above the diagonal, then normalise each row into weights summing to 1.
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = torch.softmax(wei, dim=-1)

# Weighted sum of the values: (B, T, T) @ (B, T, head_size) -> (B, T, head_size).
out = wei @ v

In [60]:
# The value projection maps the last dimension from C to head_size.
x.shape, v.shape

(torch.Size([4, 8, 32]), torch.Size([4, 8, 16]))

In [59]:
# Display the output of the attention head.
out

tensor([[[-0.1571,  0.8801,  0.1615, -0.7824, -0.1429,  0.7468,  0.1007,
          -0.5239, -0.8873,  0.1907,  0.1762, -0.5943, -0.4812, -0.4860,
           0.2862,  0.5710],
         [ 0.2507,  0.1815, -0.0388, -0.2458, -0.1356,  0.2369, -0.1588,
          -0.3209, -0.4772,  0.4530,  0.4388, -0.3604, -0.0859, -0.0803,
           0.1115,  0.9138],
         [ 0.3288,  0.0950, -0.1875, -0.0916, -0.0079,  0.0883, -0.0678,
          -0.1830, -0.4008,  0.0761,  0.3542, -0.1453, -0.1970, -0.0976,
           0.0109,  1.0278],
         [ 0.6067, -0.4271, -0.2246,  0.2273, -0.1100, -0.2183, -0.3709,
          -0.1340, -0.1130,  0.6494,  0.6441, -0.1387,  0.2489,  0.2713,
          -0.0351,  1.2031],
         [ 0.2010,  0.8507,  0.6533,  0.2228,  0.3173,  0.8365,  0.6526,
           0.3822, -0.6315, -1.2205, -0.4374, -0.2859, -0.9985,  0.1108,
          -0.1001,  0.5346],
         [ 0.1453,  0.4755,  0.1447, -0.2496, -0.0209,  0.4674,  0.0808,
          -0.2074, -0.5866,  0.0157,  0.1711, -0.374