In [4]:
import torch
import tiktoken
# Load the tokenizer registered under the name 'gpt2'.
enc = tiktoken.get_encoding('gpt2')
# Vocabulary size; evaluated but not displayed, since only the last expression of a cell is shown.
enc.n_vocab
# Encode a sample string into a list of integer token ids (this is the displayed result).
enc.encode("hiii there")

[71, 15479, 612]

In [3]:
# Round trip: decoding the ids produced by enc.encode("hiii there") gives back the original text.
enc.decode([71, 15479, 612])

'hiii there'

In [4]:
# Toy "dataset" of the integers 1..10 and a context window of 8 elements.
blocksize = 8
arr = torch.arange(1, 11)
# x holds the first `blocksize` elements; y is the same window shifted right by one,
# so y[i] is the element that follows x[i] in arr.
x, y = arr[:blocksize], arr[1:blocksize + 1]
print(f'arr {arr} \n x{x}\n y {y}\n arr[blocksize:]{arr[blocksize:]}')


arr tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]) 
 xtensor([1, 2, 3, 4, 5, 6, 7, 8])
 y tensor([2, 3, 4, 5, 6, 7, 8, 9])
 arr[blocksize:]tensor([ 9, 10])


In [5]:
# Show inputs and targets side by side: y is the window of arr shifted one position past x.
x,y 

(tensor([1, 2, 3, 4, 5, 6, 7, 8]), tensor([2, 3, 4, 5, 6, 7, 8, 9]))

In [6]:
# Walk through every position index t of a block: 0 .. blocksize-1.
for t in range(blocksize):
    print(t)

0
1
2
3
4
5
6
7


The mathematical trick in self-attention

In [7]:
from torch.nn import functional as F

In [8]:
# Consider the following toy example
torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
# Deterministic data 0, 1, 2, ... laid out as (B, T, C) so the averages are easy to verify by eye
# (swap in torch.randn(B, T, C) for random data).
x = torch.arange(B * T * C, dtype=torch.float32).view(B, T, C)
print(x)

tensor([[[ 0.,  1.],
         [ 2.,  3.],
         [ 4.,  5.],
         [ 6.,  7.],
         [ 8.,  9.],
         [10., 11.],
         [12., 13.],
         [14., 15.]],

        [[16., 17.],
         [18., 19.],
         [20., 21.],
         [22., 23.],
         [24., 25.],
         [26., 27.],
         [28., 29.],
         [30., 31.]],

        [[32., 33.],
         [34., 35.],
         [36., 37.],
         [38., 39.],
         [40., 41.],
         [42., 43.],
         [44., 45.],
         [46., 47.]],

        [[48., 49.],
         [50., 51.],
         [52., 53.],
         [54., 55.],
         [56., 57.],
         [58., 59.],
         [60., 61.],
         [62., 63.]]])


In [10]:
# Causal "bag of words": xbow[b, t] = mean of x[b, i] over all i <= t
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]              # (t+1, C): time steps 0..t inclusive
        xbow[b, t] = xprev.mean(dim=0)    # average over time -> (C,)

In [20]:
# 3x3 lower-triangular matrix of ones: row i has ones in columns 0..i
wei = torch.ones(3, 3).tril()
# The row sums are 1, 2, 3; dividing each row by its sum (wei / wei.sum(1, keepdim=True))
# would turn the matrix into per-row averaging weights.
wei, wei.sum(dim=1, keepdim=True)


(tensor([[1., 0., 0.],
         [1., 1., 0.],
         [1., 1., 1.]]),
 tensor([[1.],
         [2.],
         [3.]]))

In [30]:
# version 2: the same causal average expressed as one batched matrix multiply
wei = torch.ones(T, T).tril()               # (T, T) lower-triangular ones
wei = wei / wei.sum(dim=1, keepdim=True)    # every row sums to 1 -> uniform weights over steps <= t
# wei is (T, T) and x is (B, T, C): matmul broadcasts wei across the batch dimension,
# computing (T, T) @ (T, C) for each batch element -> (B, T, C)
xbow2 = torch.matmul(wei, x)
torch.allclose(xbow, xbow2)

True

In [31]:
# version 3: obtain the same averaging weights through a masked softmax
tril = torch.ones(T, T).tril()
# Start from all-zero scores and set every "future" position (above the diagonal) to -inf ...
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
# ... so softmax gives those positions weight 0 and spreads weight uniformly over the rest of the row
wei = wei.softmax(dim=-1)
xbow3 = torch.matmul(wei, x)
torch.allclose(xbow, xbow3)

True

In [35]:
from torch import nn
batch_size = 32  # how many independent sequences will be processed in parallel
block_size = 8   # what is the maximum context length for predictions
T = block_size
n_embd = 32      # width of each embedding vector
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The table must live on the same device as the indices used to look it up;
# leaving it on the CPU while indexing with a CUDA tensor raises a device-mismatch error.
postition_embedding_table = nn.Embedding(block_size, n_embd).to(device)
positions = torch.arange(T, device=device)       # position ids 0 .. T-1
pos_emb = postition_embedding_table(positions)   # (T, n_embd)

positions

tensor([0, 1, 2, 3, 4, 5, 6, 7])

In [12]:
torch.manual_seed(42)
a = torch.ones(3, 3).tril()                  # lower-triangular ones
b = torch.randint(0, 10, (3, 2)).float()     # random integers in [0, 10) as floats
c = torch.matmul(a, b)                       # row i of c is the sum of rows 0..i of b

# One print call with a newline separator emits the same lines as printing each item in turn.
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In [14]:
# Divide every row by its own sum so that each row of `a` sums to 1.
a = a / a.sum(dim=1, keepdim=True)


In [15]:
# Display the row-normalized matrix: each row now holds equal weights that sum to 1.
a

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

In [7]:
import torch.nn as nn

In [45]:
# A linear layer mapping 4 input features to 2 outputs (no seed is set here, so weights vary per run).
m = nn.Linear(in_features=4, out_features=2)
# One sample with 4 features, shape (1, 4); calling m(t) would run it through the layer.
t = torch.tensor([[1., 2., 3., 4.]], dtype=torch.float32)
# The weight matrix is stored as (out_features, in_features) = (2, 4).
m.weight


Parameter containing:
tensor([[-0.1333,  0.0944,  0.2264, -0.0977],
        [-0.0563, -0.4033,  0.1516,  0.0943]], requires_grad=True)

In [5]:
class BatchNorm1d:
    """Minimal batch normalization for a (batch, dim) tensor.

    In training mode each feature is normalized with the mean and (unbiased)
    variance computed across the batch axis, and exponential moving averages of
    those statistics are kept in ``running_mean`` / ``running_var``. In
    evaluation mode (``self.training = False``) the running statistics are used
    instead and are not updated.

    Args:
        dim: number of features; size of ``gamma``, ``beta`` and the buffers.
        eps: constant added to the variance before the square root.
        momentum: weight given to the current batch statistics in the
            moving-average update.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # scale and shift returned by parameters(); requires_grad is not
        # enabled here, so the caller must do that before training them
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (updated with a running 'momentum update', not by backprop)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalize ``x`` and return ``gamma * xhat + beta``.

        The result is also stored on ``self.out``.

        Raises:
            ValueError: in training mode when the batch has fewer than two
                samples, because the unbiased variance is then undefined (NaN)
                and would permanently corrupt ``running_var``.
        """
        if self.training:
            if x.shape[0] < 2:
                raise ValueError(
                    "BatchNorm1d needs at least 2 samples per batch in training mode, "
                    f"got input of shape {tuple(x.shape)}"
                )
            # Reduce without keepdim so the statistics (and therefore the
            # running buffers) keep the (dim,) shape set up in __init__;
            # broadcasting against x works exactly as before.
            xmean = x.mean(0)  # batch mean
            xvar = x.var(0)  # batch variance (unbiased)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        """Return the scale and shift tensors as ``[gamma, beta]``."""
        return [self.gamma, self.beta]

In [7]:
torch.manual_seed(1337)
module = BatchNorm1d(100)
# Draw a batch of 32 vectors with 100 features each and normalize it in one step.
x = module(torch.randn(32, 100))
x.shape

torch.Size([32, 100])

In [9]:
# After BatchNorm1d a feature column is normalized across the batch: mean ~0 and std ~1.
x[:,0].mean(), x[:,0].std() # mean, std of one feature across all batch inputs

(tensor(1.4901e-08), tensor(1.0000))

In [10]:
# Same check for a second feature column (index 1): mean, std across all batch inputs.
x[:,1].mean(), x[:,1].std() 

(tensor(4.4703e-08), tensor(1.0000))

In [11]:
# A single row is not normalized by batch norm: its mean/std across features are only roughly 0/1.
x[0,:].mean(), x[0,:].std() # mean, std of a single input from the batch, across all features

(tensor(0.0411), tensor(1.0431))

In [29]:
class LayerNorm:
    """Minimal layer normalization over the last (feature) axis.

    Every sample is normalized independently with the mean and (unbiased)
    variance of its own features, so no batch statistics or running buffers
    are needed and behavior is the same in training and evaluation.

    Args:
        dim: number of features; size of ``gamma`` and ``beta`` and of the
            last axis of the input.
        eps: constant added to the variance before the square root.
        momentum: accepted for signature compatibility only; it is not used.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        # scale and shift returned by parameters(); requires_grad is not
        # enabled here, so the caller must do that before training them
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        """Normalize ``x`` along its last axis and return ``gamma * xhat + beta``.

        The result is also stored on ``self.out``.
        """
        # Reduce over the last axis (the one gamma/beta are sized for). For a
        # (batch, dim) input this is the same as axis 1, and it also handles
        # inputs with extra leading axes such as (batch, time, dim).
        xmean = x.mean(-1, keepdim=True)  # per-sample feature mean
        xvar = x.var(-1, keepdim=True)  # per-sample feature variance (unbiased)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalize to unit variance
        self.out = self.gamma * xhat + self.beta

        return self.out

    def parameters(self):
        """Return the scale and shift tensors as ``[gamma, beta]``."""
        return [self.gamma, self.beta]

In [30]:
torch.manual_seed(1337)
module = LayerNorm(100)
# Draw a batch of 32 vectors with 100 features each and normalize it in one step.
x = module(torch.randn(32, 100))
x.shape

torch.Size([32, 100])

In [31]:
# After LayerNorm a feature column is NOT normalized across the batch (mean/std are not 0/1).
x[:,0].mean(), x[:,0].std() # mean, std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [32]:
# NOTE(review): this repeats the previous cell's expression (column 0) exactly;
# possibly column 1 (x[:,1]) was intended - confirm.
x[:,0].mean(), x[:,0].std() 

(tensor(0.1469), tensor(0.8803))

In [33]:
# With layer norm each individual row is normalized across its features: mean ~0 and std ~1.
x[0,:].mean(), x[0,:].std() # mean, std of a single input from the batch, across all features

(tensor(-3.5763e-09), tensor(1.0000))