In [2]:
import torch



In [3]:
# Read the whole training corpus into memory as one UTF-8 decoded string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()



In [4]:
# Character-level vocabulary: every distinct character in the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [5]:
# Lookup tables between characters and integer ids; an id is the character's
# position in the sorted `chars` list.
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: turn a string into the list of integer ids of its characters.

    Raises KeyError if a character is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(t):
    """Decoder: turn an iterable of integer ids back into the string they spell.

    Raises KeyError if an id is not in the vocabulary.
    """
    return ''.join(itos[i] for i in t)


# Round-trip sanity check.
print(encode("hello world"))
print(decode(encode("hello world")))


[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


In [6]:
import torch
# Encode the full corpus as a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# Contiguous split: the first 90% of tokens for training, the remainder for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# data[-1-8:-1]



In [7]:
# Toy input for the averaging experiments below. The RNG is seeded so the
# sampled values are reproducible across runs.
torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channel dimensions
x = torch.randn((B, T, C))  # standard-normal samples
x.shape





torch.Size([4, 8, 2])

In [8]:
# Causal running mean, written as explicit loops:
#   xbow[b, t] = mean of x[b, i] over all i <= t
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        # The prefix x[b, 0..t] has shape (t+1, C); average it over the time axis.
        xbow[b, t] = x[b, :t+1].mean(dim=0)

x[0], xbow[0]

(tensor([[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [9]:
# The same running mean as a single matrix multiply.
# A lower-triangular matrix of ones, with each row divided by its row sum,
# holds 1/(t+1) in columns 0..t of row t, so (weights @ x)[b, t] is the mean
# of x[b, 0..t].

weights = torch.ones(T, T).tril()  # (T, T): one row/column per time step
weights = weights / weights.sum(dim=1, keepdim=True)  # normalise each row to sum to 1

xbow2 = torch.matmul(weights, x)  # (T, T) broadcast over batch: (B, T, T) @ (B, T, C) -> (B, T, C)

xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [10]:
# Third variant: let softmax do the row normalisation instead of dividing by row sums.
import torch.nn.functional as F

tril = torch.tril(torch.ones(T, T))

# Start from all-zero scores and put -inf wherever `tril` is 0, i.e. strictly
# above the diagonal. exp(-inf) is 0, so after softmax those "future" positions
# get zero weight and cannot influence the current token; the remaining equal
# scores in each row turn into a uniform average over the allowed positions.
weights = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
weights = F.softmax(weights, dim=-1)
xbow3 = weights @ x

xbow[0], xbow3[0]


(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [None]:
# Self attention (single head, causal)

import torch.nn as nn

torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Causal mask: tril[t, i] == 0 marks positions i > t that token t must not see.
tril = torch.tril(torch.ones(T, T))

# Baseline for comparison: data-independent, uniform averaging over the past.
wei = torch.zeros((T, T)) # These are affinities between tokens, hence T x T
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
out = wei @ x # (T, T) @ (B, T, C) -> (B, T, C)

# Data-dependent version: every token emits three vectors.
#   query - what this token is looking for
#   key   - what this token offers to be matched against
#   value - what this token contributes to the output when attended to
# affinity = query . key, which replaces the all-zero scores used above.

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)   # (B, T, head_size)
q = query(x) # (B, T, head_size)
v = value(x) # (B, T, head_size)

# NOTE: scores are left unscaled here; the standard scaled dot-product
# formulation additionally multiplies them by head_size ** -0.5.
wei = q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
wei = wei.masked_fill(tril == 0, float('-inf')) # hide future positions
wei = F.softmax(wei, dim=-1) # each row sums to 1 over the visible positions

# Aggregate the value projections, not the raw inputs: previously `v` was
# computed but never used, and the output came out as (B, T, C).
out = wei @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)


