#### GPT Development Notebook

In [2]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-04-15 18:24:31--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8002::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-04-15 18:24:32 (2.19 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [3]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
print(f"length of dataset in characters: {len(text)}")

length of dataset in characters: 1115394


In [5]:
# Let's take a look at the first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
# Here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
# Create a mapping from characters to integers, and the inverse mapping
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    '''Encoder: take a string, output a list of integers (one index into `chars` per character).

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    '''
    return [stoi[c] for c in s]


def decode(l):
    '''Decoder: take a list of integers, output the string they spell.

    Raises KeyError if an element of `l` is not a key of `itos`.
    '''
    return "".join(itos[i] for i in l)

print(encode("hi there"))
print(decode(encode("hi there")))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [8]:
# Let's now encode the entire text dataset and store it into a torch.Tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earlier will to GPT look like this

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [9]:
# Let's now split up the data into a training and validation set
# We'll use the first 90% of the data for training and the last 10% for validation
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [10]:
block_size = 8
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    print(f"Input: {context} Target: {target}")

Input: tensor([18]) Target: 47
Input: tensor([18, 47]) Target: 56
Input: tensor([18, 47, 56]) Target: 57
Input: tensor([18, 47, 56, 57]) Target: 58
Input: tensor([18, 47, 56, 57, 58]) Target: 1
Input: tensor([18, 47, 56, 57, 58,  1]) Target: 15
Input: tensor([18, 47, 56, 57, 58,  1, 15]) Target: 47
Input: tensor([18, 47, 56, 57, 58,  1, 15, 47]) Target: 58


In [12]:
torch.manual_seed(1337)
batch_size = 4 # How many independent sequences we will process in parallel
block_size = 8 # The maximum context length for predictions

def get_batch(split):
    '''Sample a random batch of contexts and their next-token targets.

    `split` selects the source: 'train' reads train_data, anything else reads val_data.
    Returns (x, y), each stacked from batch_size windows of block_size tokens;
    y is the same window as x shifted one position to the right.
    '''
    if split == 'train':
        source = train_data
    else:
        source = val_data
    # Random window starts; the upper bound leaves room for the shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
print('Inputs:')
print(xb.shape)
print(xb)
print('Targets:')
print(yb.shape)
print(yb)

print('------------------------------------------------')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t + 1]
        target = yb[b, t]
        print(f"Input: {context.tolist()} Target: {target}")

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
------------------------------------------------
Input: [24] Target: 43
Input: [24, 43] Target: 58
Input: [24, 43, 58] Target: 5
Input: [24, 43, 58, 5] Target: 57
Input: [24, 43, 58, 5, 57] Target: 1
Input: [24, 43, 58, 5, 57, 1] Target: 46
Input: [24, 43, 58, 5, 57, 1, 46] Target: 43
Input: [24, 43, 58, 5, 57, 1, 46, 43] Target: 39
Input: [44] Target: 53
Input: [44, 53] Target: 56
Input: [44, 53, 56] Target: 1
Input: [44, 53, 56, 1] Target: 58
Input: [44, 53, 56, 1, 58] Target: 46
Input: [44, 53, 56, 1, 58, 46] Target: 39
Input: [44, 53, 56, 1, 58, 46, 39] Target: 58
Input: [44, 53, 56, 1, 58, 46, 3

In [13]:
print(xb) # our input to the transformer

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    '''Bigram language model: the logits for the next token depend only on the current token.'''

    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        '''Compute next-token logits and, when targets are given, the cross-entropy loss.

        idx and targets are both (B, T) tensors of integer token indices.
        Returns (logits, loss):
          - targets is None: logits is (B, T, C) and loss is None
          - otherwise: logits is flattened to (B * T, C) and loss is a scalar tensor
        where C = vocab_size.
        '''
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy wants the class dimension at dim 1, so fold B and T into one axis
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''Extend each sequence in idx by max_new_tokens sampled tokens.

        idx is a (B, T) tensor of indices in the current context.
        Returns a (B, T + max_new_tokens) tensor: the context followed by the samples.
        '''
        # Sampling is never backpropagated through, so skip building the autograd graph
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Get the predictions (no targets, so the loss is None and is ignored)
                logits, _loss = self(idx)
                # Focus only on the last time step
                logits = logits[:, -1, :]  # becomes (B, C)
                # Apply softmax to get the probabilities
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # Sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # Append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)
        return idx
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
print(decode(m.generate(torch.zeros((1,1), dtype = torch.long), max_new_tokens = 100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [15]:
# Create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [16]:
batch_size = 32  # get_batch reads this module-level value when sampling

for step in range(10000):
    # Clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a freshly sampled training batch
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

# Loss of the last sampled batch only
print(loss.item())

2.382369041442871


In [17]:
print(decode(m.generate(torch.zeros((1,1), dtype = torch.long), max_new_tokens = 500)[0].tolist()))


lso br. ave aviasurf my, yxMPZI ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;LUCEO, oraingofof win!
RIfans picspeserer hee tha,
TOFonk? me ain ckntoty ded. bo'llll st ta d:
ELIS me hurf lal y, ma dus pe athouo
BEY:! Indy; by s afreanoo adicererupa anse tecorro llaus a!
OLeneerithesinthengove fal amas trr
TI ar I t, mes, n IUSt my w, fredeeyove
THek' merer, dd
We ntem lud engitheso; cer ize helorowaginte the?
Thak orblyoruldvicee chot, p,
Bealivolde Th li


#### The mathematical trick in self-attention

In [18]:
# Consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4, 8, 2 # Batch, Time, Channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [19]:
# We want xbow[b, t] = mean_{i<=t} x[b, i]
# xbow = "x bag of words": each position holds the average of itself and all earlier positions
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # (t+1, C): positions 0..t of batch element b
        xbow[b, t] = xprev.mean(dim=0)  # average over the time axis

#### Using matrix multiplication

In [20]:
# Version 2: the same running average, written as one matrix multiply
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)  # row t holds equal weights over positions 0..t
# wei is (T, T) and x is (B, T, C). The shapes differ, so PyTorch broadcasts wei across the
# batch dimension and performs a batched matrix multiply: every batch element gets its own
# (T, T) @ (T, C) product, computed in parallel and independently, giving (B, T, C).
xbow2 = wei @ x  # (T, T) @ (B, T, C) ---> (B, T, C)
# xbow2 should therefore match the loop-based xbow
torch.allclose(xbow, xbow2)

True

In [21]:
# Basically, we are using batched matrix multiplication to do weighted aggregation / sums, and the weights are specified by the (T, T) array. Since the weights take on a triangular shape, a token at the tth dimension only gets information from the previous t tokens.
print(xbow[0])
print(xbow2[0])

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [22]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


#### Use softmax

In [23]:
# Version 3: use Softmax
# Start from affinities of zero, set every future position (where tril == 0) to -inf, and
# softmax each row. exp(-inf) = 0, so future positions get weight 0, and the equal affinities
# of the remaining positions become a uniform average -- the same lower-triangular weights
# as version 2. Each weight says how much of a past element is fused into this position.
tril = torch.ones(T, T).tril()
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [24]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [25]:
wei = torch.zeros((T, T))                         # wei begins as all zeros
wei = wei.masked_fill(tril == 0, float('-inf'))   # Use masked_fill to set the values of wei to -inf where tril is 0
wei = F.softmax(wei, dim = -1)                    # Apply softmax to the last dimension of wei
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

#### Self-attention
Currently we initialize the affinities between all the different tokens to be zero. We don't actually want to do this because tokens will find different tokens more or less interesting, and we want that to be data dependent. A vowel may be looking for consonants in its past, and it might want to know what those consonants are, and it wants to gather information from the past in a data dependent way. This is the problem that self-attention solves.

`x` is private information to this token. For the purposes of this single head, the token says "Here's what I'm interested in" (its query `q`), "here's what I have" (its key `k`), and "here's what I will communicate to you" (its value `v`). `v` is the thing that gets aggregated, for the purposes of this single head, between the different nodes.

In [26]:
# Version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4, 8, 32 # Batch, Time, Channels
x = torch.randn(B,T,C)

# Let's see a single Head perform self-attention
# NOTE(review): the three Linear layers are created in this order right after the seed above;
# reordering them would presumably change their random initial weights and the printed wei.
head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
# Data-dependent affinities: wei[b, i, j] is the dot product of query i with key j.
# No 1/sqrt(head_size) scaling is applied in this cell.
wei = q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))  # (version 3 started from constant zero affinities instead)
wei = wei.masked_fill(tril == 0, float('-inf')) # hide future positions (j > i) from each query i
wei = F.softmax(wei, dim = -1) # each row becomes weights over positions 0..i that sum to 1

v = value(x) # (B, T, head_size)
out = wei @ v # v is the elements that we aggregate, not the raw input x
# out = wei @ x  # (versions 1-3 aggregated the raw input directly)

out.shape

torch.Size([4, 8, 16])

In [27]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

#### Notes

- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.

- Each sample across the batch dimension is of course processed completely independently; samples never "talk" to each other.

- In an "encoder" block (such as for a sentiment analysis model) just delete the  single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has a triangular masking, and is usually used in autoregressive settings, like language modeling.

- "Self-attention" just means that the keys and values are produced from the same source as the queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)

- "Scaled" attention additionally multiplies `wei` by 1/sqrt(head_size) (equivalently, divides it by sqrt(head_size)). This makes it so that when the inputs Q, K are unit variance, `wei` will be unit variance too, and Softmax will stay diffuse and not saturate too much. Illustration below.

In [28]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1)

In [29]:
print(k.var())
print(q.var())
print(wei.var())

tensor(1.0449)
tensor(1.0700)
tensor(17.4690)


In [30]:
wei = q @ k.transpose(-2, -1) * head_size ** -0.5

In [31]:
print(k.var())
print(q.var())
print(wei.var())

tensor(1.0449)
tensor(1.0700)
tensor(1.0918)


In [32]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [33]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

#### Batch Norm

In [34]:
class BatchNorm1d:
    '''Hand-rolled batch normalization over dimension 0 of the input.

    While `training` is True the statistics of the current batch are used and folded into
    the running estimates; when it is False the running estimates are used instead.
    '''

    def __init__(self, dim, eps = 1e-5, momentum = 0.1):
        # Hyperparameters and mode flag
        self.eps, self.momentum = eps, momentum
        self.training = True
        # Parameters: scale starts at one, shift at zero
        self.gamma, self.beta = torch.ones(dim), torch.zeros(dim)
        # Buffers (tracked with a running 'momentum update')
        self.running_mean, self.running_var = torch.zeros(dim), torch.ones(dim)

    def __call__(self, x):
        '''Normalize x, store the result in self.out and return it.'''
        use_batch_stats = self.training
        if use_batch_stats:
            mean = x.mean(dim=0, keepdim=True)  # batch mean
            var = x.var(dim=0, keepdim=True)  # batch variance
        else:
            mean, var = self.running_mean, self.running_var
        normalized = (x - mean) / (var + self.eps).sqrt()  # normalize to unit variance
        self.out = self.gamma * normalized + self.beta
        if use_batch_stats:
            # Blend the batch statistics into the running estimates, outside of autograd
            keep = 1 - self.momentum
            with torch.no_grad():
                self.running_mean = keep * self.running_mean + self.momentum * mean
                self.running_var = keep * self.running_var + self.momentum * var
        return self.out

    def parameters(self):
        '''Return the scale and shift tensors, in that order.'''
        return [self.gamma, self.beta]
    
torch.manual_seed(1337)
module = BatchNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

#### This normalizes every single column of the input (0 mean, 1 std). 

In [35]:
x[:, 0].mean(), x[:, 0].std() # mean, std of one feature across all batch inputs

(tensor(7.4506e-09), tensor(1.0000))

In [36]:
x[0, :].mean(), x[0, :].std() # mean, std of a single input from the batch, across all features

(tensor(0.0411), tensor(1.0431))

#### Layer Norm
To implement layer norm, simply change the 0 in x.mean and x.var to 1. This will normalize every single row of the input (0 mean, 1 std). Also, since computation no longer spans across examples we can delete all the buffer stuff. We can always apply the operation and don't need to maintain buffers. There is also no distinction between training and test time so we don't care if it's training or not. 

In [46]:
class LayerNorm1d:
    '''Hand-rolled layer normalization: normalizes each row (dimension 1) of the input.

    Named LayerNorm1d so that it no longer shadows the BatchNorm1d class defined earlier
    in this notebook. The statistics never span different examples, so there are no
    running buffers and no training/eval distinction.
    '''

    def __init__(self, dim, eps = 1e-5, momentum = 0.1):
        # `momentum` is unused; it is kept only so the signature matches BatchNorm1d
        self.eps = eps
        # Parameters
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        '''Normalize every row of x, store the result in self.out and return it.'''
        xmean = x.mean(1, keepdim = True) # per-row mean (over dimension 1)
        xvar = x.var(1, keepdim = True) # per-row variance (over dimension 1)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        '''Return the scale and shift tensors, in that order.'''
        return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [47]:

x[:, 0].mean(), x[:, 0].std() # mean, std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [48]:

x[0, :].mean(), x[0, :].std() # mean, std of a single input from the batch, across all features

(tensor(-9.5367e-09), tensor(1.0000))

Now every single row is normalized, instead of every column