In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-07-09 16:53:59--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2023-07-09 16:54:00 (1.07 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [2]:
import torch

In [3]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
print("Text len: ", len(text))

Text len:  1115394


In [5]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


### Tokenize 
Convert sequence of text into some sequence of integers.  We are building a character level language model here.  

tiktoken: sub word encoding used in practice.  Here we're using a character level encoding for simplicity.  

In [7]:
# create a mapping from characters to integers and back
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

# Simple encoder and decoder functions
def encode(input_string):
    return [stoi[char] for char in input_string]

def decode(input_tokens):
    return ''.join([itos[token] for token in input_tokens])

print(encode("Greetings earthlings!"))
print(decode(encode("Greetings earthlings!")))
    

[19, 56, 43, 43, 58, 47, 52, 45, 57, 1, 43, 39, 56, 58, 46, 50, 47, 52, 45, 57, 2]
Greetings earthlings!


In [8]:
# Tokenize Shakespeare
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [9]:
# Train test split on the data
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

### Training

Sample random chunks from the training set, and train the chunks at a time.  They should have some maximum length (block size).  

Simultaneously train to make predictions at every one of these positions.  

Batch dimension - multiple chunks of text that are stacked up together.  

In [31]:
block_size = 8
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [32]:
x = train_data[:block_size] # Inputs to the transformer
y = train_data[1:block_size + 1] # Targets (which are offset by 1)
for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    print(f"when input is {context} the target is: {target}")
    print(f"when input is {decode(context.numpy())} the target is: {decode(target.numpy().reshape(1,))}")

when input is tensor([18]) the target is: 47
when input is F the target is: i
when input is tensor([18, 47]) the target is: 56
when input is Fi the target is: r
when input is tensor([18, 47, 56]) the target is: 57
when input is Fir the target is: s
when input is tensor([18, 47, 56, 57]) the target is: 58
when input is Firs the target is: t
when input is tensor([18, 47, 56, 57, 58]) the target is: 1
when input is First the target is:  
when input is tensor([18, 47, 56, 57, 58,  1]) the target is: 15
when input is First  the target is: C
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is: 47
when input is First C the target is: i
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is: 58
when input is First Ci the target is: t


In [33]:
torch.manual_seed(1337)
batch_size = 4 # Number of independent sequences yt process in parallel
block_size = 8 # Maximum context length for the predictions 

def get_batch(split): # train or validation split
    """Generate a small batch of data from inputs x and targets y."""
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # batch_size random sequence starting points
    print("Random starting points for each block: ", ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])
    return x, y

xb, yb = get_batch(train_data)
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

print("-------")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t + 1]
        target = yb[b, t]
        print(f"when input is {context} the target is: {target}")
    

Random starting points for each block:  tensor([29535, 38737, 81972, 56048])
inputs:
torch.Size([4, 8])
tensor([[ 6,  1, 52, 53, 58,  1, 58, 47],
        [ 6,  1, 54, 50, 39, 52, 58, 43],
        [ 1, 58, 46, 47, 57,  1, 50, 47],
        [ 0, 32, 46, 43, 56, 43,  1, 42]])
targets:
torch.Size([4, 8])
tensor([[ 1, 52, 53, 58,  1, 58, 47, 50],
        [ 1, 54, 50, 39, 52, 58, 43, 58],
        [58, 46, 47, 57,  1, 50, 47, 60],
        [32, 46, 43, 56, 43,  1, 42, 53]])
-------
when input is tensor([6]) the target is: 1
when input is tensor([6, 1]) the target is: 52
when input is tensor([ 6,  1, 52]) the target is: 53
when input is tensor([ 6,  1, 52, 53]) the target is: 58
when input is tensor([ 6,  1, 52, 53, 58]) the target is: 1
when input is tensor([ 6,  1, 52, 53, 58,  1]) the target is: 58
when input is tensor([ 6,  1, 52, 53, 58,  1, 58]) the target is: 47
when input is tensor([ 6,  1, 52, 53, 58,  1, 58, 47]) the target is: 50
when input is tensor([6]) the target is: 1
when input i

### Bigram Language Model


In [34]:
yb.shape

torch.Size([4, 8])

In [35]:
# Understand nn.Embedding better 

# Embedding with 3 lookup values, each with an embedding dim of 10
test_emb = nn.Embedding(3,10)
# Create a tensor to use with that lookup
test_tensor = torch.tensor([1, 2], dtype=torch.long)
test_emb(test_tensor) 

tensor([[-0.9211,  1.5433, -0.3676, -0.7483,  1.0101,  0.1215,  0.1584,  1.1340,
         -1.1539, -0.2984],
        [ 1.1490,  0.1812,  0.5467, -1.4948, -1.2057,  0.5718, -0.5974, -0.6937,
         -0.7296, -1.5580]], grad_fn=<EmbeddingBackward0>)

In [36]:
m = BigramLanguageModel(vocab_size)
print(m(test_tensor)[0])
print(F.softmax(m(test_tensor)[0], dim=-1))
print(torch.multinomial(F.softmax(m(test_tensor)[0], dim=-1), num_samples=1))

tensor([[-1.8846e+00,  1.6696e-01,  4.5862e-01, -1.7662e+00,  5.8599e-01,
          5.8728e-01,  2.8607e-01,  3.1096e-01, -6.5376e-01, -6.5763e-01,
          3.1845e-01, -5.4959e-01, -1.4649e+00, -5.5769e-01, -6.9393e-01,
          1.3035e+00, -4.5013e-01,  1.3471e+00,  1.6910e+00, -1.2445e-01,
         -1.6824e+00,  1.1346e+00, -8.2384e-02,  1.0517e+00,  6.7789e-01,
          3.0665e-01, -7.4723e-01,  7.4349e-01,  8.8766e-01, -3.2742e-01,
          7.8394e-02, -1.5297e+00, -2.9122e-01, -1.1395e-01, -3.1367e-01,
         -6.2931e-01,  1.1385e+00, -1.1347e+00,  1.7053e-01,  1.2249e+00,
         -2.3454e-01, -1.0572e+00, -6.5427e-01,  1.5909e+00, -6.9949e-01,
          2.0437e+00, -1.6563e-01, -5.6280e-02,  2.3412e+00, -2.7234e+00,
          5.0967e-01, -8.1447e-01, -2.4604e-01, -9.7419e-01, -1.8692e-01,
         -1.5755e-01, -2.1867e-01, -1.3519e+00, -5.7281e-02, -1.8540e+00,
         -1.3849e+00,  6.5883e-01, -7.2578e-01,  1.4448e-01,  1.6632e-01],
        [ 7.5070e-01,  9.1317e-01, -1

In [37]:
print(xb)

tensor([[ 6,  1, 52, 53, 58,  1, 58, 47],
        [ 6,  1, 54, 50, 39, 52, 58, 43],
        [ 1, 58, 46, 47, 57,  1, 50, 47],
        [ 0, 32, 46, 43, 56, 43,  1, 42]])


In [38]:
# yb is offset by one compared to xb - we are predicting yb given xb with the bigram model
print(yb)

tensor([[ 1, 52, 53, 58,  1, 58, 47, 50],
        [ 1, 54, 50, 39, 52, 58, 43, 58],
        [58, 46, 47, 57,  1, 50, 47, 60],
        [32, 46, 43, 56, 43,  1, 42, 53]])


In [39]:
test_emb = nn.Embedding(vocab_size, vocab_size)
test_x = torch.tensor([[1, 2, 3],
                       [3, 4, 5]])
print(test_x)

print(test_emb(test_x).shape)

tensor([[1, 2, 3],
        [3, 4, 5]])
torch.Size([2, 3, 65])


In [40]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        """ Forward pass. 
        
        This is an extremely simple model currently, where the embedding is used
        directly as an input into softmax to create an array of probabilities.  So the 
        embedding dimension must be equal to the vocab size since the emedding values itself
        are just the predictions.  So the model is taking each character and determining what
        the most likely next character is, trained of the offset by one x and y values."""
        # idx and targets are both (B, T) tensor of integers
        # We are ONLY using the embedding as the logits directly.  
        # 
        logits = self.token_embedding_table(idx) # (B,T,C) - (Batch (4), Time (8), Channel(65))
        
        # Evaluate the loss (compare logits to the next character (targets))
        if targets == None:
            loss = None
        else: 
            B, T, C = logits.shape
            logits = logits.view(B * T, C) # Stack the time pieces for each batch on top of each other batch
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        """Generate new tokens on top of the existing T tokens."""
        for _ in range(max_new_tokens):
            # Use the forward step to get predictions
            logits, loss = self(idx) # Don't need loss
            #print("logit shape before: ", logits.shape)
            # Focus only on the last time step - this is what comes next actually.  
            logits = logits[:, -1, :] # becomes (B, C)
            # use softmax to get the probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution (we pick a random next character but weighted by modeled probability)?
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        
        return idx


m = BigramLanguageModel(vocab_size)
# Test forward pass for one batch 
logits, loss = m(xb, yb)
# Returns the batch size, block size, and embedding dimensionality
print(logits.shape)
print(loss)
# Start with a single 0 (newline char)
idx = torch.zeros((1, 1), dtype=torch.long)
# Use the (currently untrained) model to generate new characters
print(decode(m.generate(idx, 100)[0].tolist()))

torch.Size([32, 65])
tensor(4.4913, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


### Now train the model

In [142]:
torch.manual_seed(1337)
batch_size = 4 # Number of independent sequences yt process in parallel
block_size = 8 # Maximum context length for the predictions 

def get_batch(split): # train or validation split
    """Generate a small batch of data from inputs x and targets y."""
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # batch_size random sequence starting points
    # print("Random starting points for each block: ", ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])
    return x, y

xb, yb = get_batch(train_data)

In [143]:
print(xb.shape)
print(yb.shape)
print(yb)

torch.Size([4, 8])
torch.Size([4, 8])
tensor([[ 1, 52, 53, 58,  1, 58, 47, 50],
        [ 1, 54, 50, 39, 52, 58, 43, 58],
        [58, 46, 47, 57,  1, 50, 47, 60],
        [32, 46, 43, 56, 43,  1, 42, 53]])


In [144]:
def get_batch(split): # train or validation split
    """Generate a small batch of data from inputs x and targets y."""
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # batch_size random sequence starting points
    # print("Random starting points for each block: ", ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])
    return x, y


In [145]:
train_data_tiny = train_data #[:100000]

In [146]:
import numpy as np
print(len(train_data_tiny))
np.arange(0, len(train_data_tiny)-batch_size, batch_size)

1003854


array([      0,       4,       8, ..., 1003840, 1003844, 1003848])

In [148]:
torch.cuda.is_available()

True

In [149]:
next(model.parameters()).device

device(type='cuda', index=0)

In [151]:
import torch.optim as optim
import time

torch.manual_seed(1337)
# Larger batch size is faster (when using gpu at least)
batch_size = 128 # Number of independent sequences yt process in parallel
block_size = 8 # Maximum context length for the predictions 
learning_rate = 0.0005
num_epochs = 6

device = "cuda" # cuda, mps, or cpu

# Set up the model and optimizer
model = BigramLanguageModel(vocab_size).to(device)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
batch_starts = np.arange(0, len(train_data_tiny)-batch_size, batch_size)
batch_starts_val = np.arange(0, len(val_data)-batch_size, batch_size)
lambda1 = lambda epoch: 0.65 ** epoch
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

# Iterate through epochs
for epoch in range(num_epochs):
    
    start_time = time.time()
    print(f"Current learning rate: {optimizer.param_groups[0]['lr']:.6f}")
    model.train()
    batch_losses = []
    
    # Iterate through batches
    for batch_start in batch_starts:
        xb, yb = get_batch(train_data_tiny[batch_start:])
        xb = xb.to(device)
        yb = yb.to(device)
        # forward pass 
        logits, loss = model(xb, yb)
        
        batch_losses.append(loss.item())

        # Backward pass 
        loss.backward()
        
        # Update parameters
        optimizer.step()
        
    total_loss = sum(batch_losses)/len(batch_losses)
    
    # Update learning rate
    scheduler.step()
    
    # Print training progress
    # This is batch loss for the most recent batch only (which isn't that meaningful).  
    # Need to add in validation loss to make this more reasonable 
    # That's why loss is all over the place 
    
    model.eval()
    
    #get_batch(train_data_tiny[batch_start:])
    #val_logits, val_loss = model(x_val, y_val)
    
    
    end_time = time.time()
    epoch_time = end_time - start_time
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}, Execution time: {epoch_time:.4f}")


Current learning rate: 0.000500
Epoch [1/6], Loss: 2.8814, Execution time: 21.5839
Current learning rate: 0.000325
Epoch [2/6], Loss: 2.4629, Execution time: 20.8806
Current learning rate: 0.000211
Epoch [3/6], Loss: 2.4614, Execution time: 22.0986
Current learning rate: 0.000137
Epoch [4/6], Loss: 2.4590, Execution time: 20.7807
Current learning rate: 0.000089
Epoch [5/6], Loss: 2.4593, Execution time: 20.7379
Current learning rate: 0.000058
Epoch [6/6], Loss: 2.4568, Execution time: 20.0069


Single epoch time with MPS: 208.6561, 13.0687  
Single epoch time without MPS: 2.8049, 2.8232

In [152]:
# Test the trained model - it's now outputting something that seems a little more like shakespeare
# Start with a single 0 (newline char)
idx = torch.zeros((1, 1), dtype=torch.long).to(device)
# Alternative, start with a specific charater(s)
# Bigram model is only using this by itself.  
idx = torch.tensor([[51, 39, 62, 0]]).to(device)
# Use the (currently untrained) model to generate new characters
print(decode(model.generate(idx, 200)[0].tolist()))

max


CEThef brid owindake on, bthe aiset bube t e.
SThr-d my dalatanss:
Whitharu w he, t.
Par dilasoate ar ce my.

Hastarom orou wabuts, tof isth bot mil ndill, ath iree:
Inghin lat Heridrovets, and t n 


In [98]:
idx

tensor([[0]], device='cuda:0')

In [None]:
next(model.parameters()).device

In [None]:
train_data.device

### The mathematical trick in self attention

Toy example
Goal: calculate the average of the current + all previous tokens for token t in the sequence
Averaging all the contexts is a very simple form of aggregating them.  

The 'average' use is just a very simple way of two tokens communicating together - it could also be dot product of the embeddings of the two tokens (which is what we will need for attention).  

This will be key to an efficient implementation of self attention.  

In [47]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
print(x.shape)

torch.Size([4, 8, 2])


In [48]:
# X bag of words 
# Innefficient for loop implementation to start.  
print(B, T, C)
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C)
        #print(xprev)
        #print(torch.mean(xprev, 0))
        xbow[b, t] = torch.mean(xprev, 0) # Take the mean of the row vectors (outputs one average row vector 1xC)
print(xbow.shape)

4 8 2
torch.Size([4, 8, 2])


In [49]:
x[1]

tensor([[ 1.3488, -0.1396],
        [ 0.2858,  0.9651],
        [-2.0371,  0.4931],
        [ 1.4870,  0.5910],
        [ 0.1260, -1.5627],
        [-1.1601, -0.3348],
        [ 0.4478, -0.8016],
        [ 1.5236,  2.5086]])

In [50]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [51]:
xbow[1]

tensor([[ 1.3488, -0.1396],
        [ 0.8173,  0.4127],
        [-0.1342,  0.4395],
        [ 0.2711,  0.4774],
        [ 0.2421,  0.0694],
        [ 0.0084,  0.0020],
        [ 0.0712, -0.1128],
        [ 0.2527,  0.2149]])

In [52]:
xbow = torch.zeros((B, T, C))
x_prev = x[1, :2]
print(x_prev)
print(torch.mean(x_prev, 0)) # Mean across a single axis (horizontal is compressed)
# This is a feature vector that summarizes the vectors (poorly summarized with mean). 

tensor([[ 1.3488, -0.1396],
        [ 0.2858,  0.9651]])
tensor([0.8173, 0.4127])


**Vectorization of the process with a simple example**

The aggregate function here is not the dot product from the book, but instead a simple average.  The average is accomplished by dotting a lower triangular matrix with the input data.  

b is the x values here effectively, where each column is a vector of features.


B, T, C is batch, time, channels

Batch is a random starting point in our full sequence from which we will pull a block.  
Each block has a size which termines how many consecutive tokens we use (the time dimension)
Channel is the dimension of the embedding representing the token.  


So the process of averaging is averaging the token with other tokens up to that point in the sequence to represent the token as the average of it + all before it in the sequence (so the first token embedding 

In [53]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True) # this turns it from sum to average.  
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b

print(a)
print(b)
print(c)


tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [54]:
# Vectorize this now to make much more efficient. 
# Lower triangular matrix with each value 1/ the row value
# Multiplying by this will take the average of the columns up to that point, which is what we want. 
wei = torch.tril(torch.ones((T, T)))
# scaling_factor = torch.tensor([1/(i + 1) for i in range(T)]).view(8, 1)
wei = wei / torch.sum(wei, 1, keepdim=True)
# r = scaling_factor * r
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [67]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [73]:
# Vectorized version
# this is also applied in a batch manner - each
# Batch multiply in pytorch.  
xbow2 = wei @ x
xbow2[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

**Use matrix multiplication to take the average of the embedding for each token with previous tokens in the context**

In [165]:
# Softmax version - does the same thing
tril = torch.tril(torch.ones((T, T)))
wei = torch.zeros((T, T))
# This method effectively removes all future knowledge when training. 
wei = wei.masked_fill(tril == 0, float('-inf'))
# When we move to using attention to connect these, then softmax is normalizing the impact of each previous token on future ones.  
print(wei)
wei = F.softmax(wei, dim=-1)
print(wei)
xbow3 = wei @ x
print(xbow3)

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
tensor([[[ 0.1

#### Now convert this to single headed attention with the dot product instead

In [181]:
# Create a random X example
torch.manual_seed(1337)
# use 32 dim for the embedding (channel) dim now
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)
print(x.shape)

torch.Size([4, 8, 32])


In [191]:
# Single headed attention
# Head size basically further reduces the dimensionality
head_size = 16
key = nn.Linear(C, head_size, bias=False) # output -> head_size
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# Apply the key and query projections
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
v = value(x) 

wei = q @ k.transpose(-2, -1)

# 
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
print(wei[0])
out = wei @ v
out.shape

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0213, 0.9787, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2919, 0.6713, 0.0369, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.9022, 0.0232, 0.0325, 0.0421, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0856, 0.2269, 0.2602, 0.3434, 0.0839, 0.0000, 0.0000, 0.0000],
        [0.0420, 0.0128, 0.2996, 0.0614, 0.4992, 0.0851, 0.0000, 0.0000],
        [0.0243, 0.0395, 0.0119, 0.3927, 0.0364, 0.4695, 0.0257, 0.0000],
        [0.0686, 0.1133, 0.1060, 0.1986, 0.0945, 0.0237, 0.0662, 0.3291]],
       grad_fn=<SelectBackward0>)


torch.Size([4, 8, 16])

## Attention example from the book

In [167]:
# Attention examples from the book 
# 1. Just dot product
# 2. Weight projection of xi and xj dot product together
# The channel dimension is the token embedding dimension.  
# Single batch example - for the bigger dataset batch broadcasting happens.

torch.manual_seed(42)
b = torch.tensor([[2, 2],
                  [2, 2],
                  [2, 3]]).float()

# Lower triangle of the matrix multipled by itself (getting the required dot products)
# c = torch.tril(b @ b.T)
tril = torch.tril(torch.ones((b.shape[0], b.shape[0])))
c = (b @ b.T).masked_fill(tril == 0, float('-inf'))
# print(a)
print(b)
print(c)
print(F.softmax(c, dim=1))
# Now apply softmax.  
alpha = torch.round(F.softmax(c, dim=1), decimals=3)
print(alpha)

# instead of summing these, use dot products instead (this should actually collapse the C dim)
# Also use softmax convert to a probability distribution


# lastly get the predicted y at each output by dotting with x again (as Value)

# Each row is embedding of the y prediction.  
# Multiply the softmax values by b
alpha @ b
# each row of this resulting matrix is the output embedding for yi.  

# This also needs to be converted to batch (right now this is a single sequence with len t)





tensor([[2., 2.],
        [2., 2.],
        [2., 3.]])
tensor([[ 8., -inf, -inf],
        [ 8.,  8., -inf],
        [10., 10., 13.]])
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.0453, 0.0453, 0.9094]])
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.0450, 0.0450, 0.9090]])


tensor([[2.0000, 2.0000],
        [2.0000, 2.0000],
        [1.9980, 2.9070]])

In [143]:
import math

In [179]:
torch.arange(8, device="cpu")

tensor([0, 1, 2, 3, 4, 5, 6, 7])

In [171]:
def self_attention(X, wQ, wK, wV):
    """Take in a 2 or 3d tensor and calculate output embeddings."""
    
    # Create the query, key, and value matrices
    Q = X @ wQ
    K = X @ wK
    V = X @ wV

    # Take the dot product with each previous matrix
    if X.dim() == 2:
        tril = torch.tril(torch.ones((X.shape[0], X.shape[0])))
        xdot = (Q @ K.T).masked_fill(tril == 0, float('-inf'))
    elif X.dim() == 3:
        tril = torch.tril(torch.ones((X.shape[1], X.shape[1])))
        xdot = (Q @ torch.transpose(K, 1, 2)).masked_fill(tril == 0, float('-inf'))
    else:
        raise Exception("X must be a 2 or 3d tensor")
    
    # Normalize by the square root of the input dim
    xdot = xdot/math.sqrt(X.shape[-1])
    
    # Softmax to get weights of each previous element 
    alpha = torch.round(F.softmax(xdot, dim=1), decimals=4)
    
    # Multiply by X again to get a matrix with Y (each row is dim C)
    Y = alpha @ V
    return Y


In [172]:
def initialize_weights(C):
    """Create the traininable weight matrices for the query, key and value projections."""

    wQ = torch.rand(C, C, requires_grad=True)
    wK = torch.rand(C, C, requires_grad=True)
    wV = torch.rand(C, C, requires_grad=True)

    return wQ, wK, wV

In [173]:
b = torch.tensor([[1, 2],
                  [1, 2],
                  [1, 3]]).float()
wQ, wK, wV = initialize_weights(C)
self_attention(b, wQ, wK, wV)

tensor([[0.8635, 2.1047],
        [0.8635, 2.1047],
        [1.0627, 2.6627]], grad_fn=<MmBackward0>)

In [174]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
print(x.shape)

torch.Size([4, 8, 2])


In [175]:
x.shape[-1]

2

In [178]:
wQ, wK, wV = initialize_weights(C)
self_attention(x, wQ, wK, wV)

tensor([[[ 0.0165,  0.0071],
         [-0.0632, -0.2175],
         [ 0.0612, -0.0235],
         [ 0.2248,  0.1270],
         [ 0.4458,  0.6900],
         [-0.5748, -0.8089],
         [ 0.0283, -0.1086],
         [-0.2851,  0.9771]],

        [[ 0.1143,  0.0811],
         [ 0.1768,  0.2280],
         [-0.2311, -0.0854],
         [ 0.2996,  0.4373],
         [-0.3671, -0.6487],
         [-0.7516, -0.9804],
         [-0.3117, -0.8894],
         [ 2.6913,  4.3942]],

        [[-0.0671, -0.0776],
         [ 0.1207,  0.1014],
         [ 0.2364,  0.5234],
         [-0.1591, -0.0990],
         [-0.3276, -0.4500],
         [-0.1801, -0.7315],
         [-0.4209, -0.4006],
         [-1.1242, -1.7289]],

        [[ 0.1375,  0.0452],
         [ 0.2859,  0.1329],
         [ 0.1335,  0.2983],
         [ 0.4722,  0.2563],
         [ 0.7210,  0.1365],
         [ 0.9349,  0.3162],
         [ 1.6734,  1.8950],
         [ 2.2934,  1.3561]]], grad_fn=<UnsafeViewBackward0>)

In [104]:
self_attention(x)

tensor([[[ 0.1040, -0.0287],
         [ 0.0411, -0.1861],
         [ 0.1485, -0.0067],
         [ 0.3377,  0.0153],
         [ 0.2950,  0.3330],
         [-0.7128, -0.4754],
         [ 0.1493, -0.0534],
         [-0.6351,  1.4682]],

        [[ 0.3156, -0.1855],
         [ 0.0502, -0.0910],
         [-1.7947,  0.2722],
         [ 0.6812, -0.0133],
         [ 0.0991, -0.9941],
         [-0.6061, -0.2765],
         [ 0.2010, -0.4988],
         [ 3.0755,  3.5064]],

        [[-0.1896, -0.0851],
         [ 0.0982, -0.0334],
         [-0.0429,  0.2894],
         [-0.5442, -0.2200],
         [-0.4381, -0.3440],
         [ 0.1440, -1.1734],
         [-0.9201,  0.1940],
         [-0.5195, -0.4621]],

        [[ 0.2870, -0.0396],
         [ 0.3054, -0.0270],
         [-1.3132,  2.0872],
         [ 5.0859, -2.5198],
         [ 1.5595, -1.2203],
         [ 0.3539, -0.0756],
         [ 0.7320,  1.0552],
         [ 1.8991, -0.5725]]])

In [65]:
torch.round(F.softmax(torch.tensor([[47, 56, 61],
                        [3, 3, 5]]).float(), dim=1), decimals=4)

tensor([[0.0000, 0.0067, 0.9933],
        [0.1065, 0.1065, 0.7870]])

In [219]:
torch.tensor([[1, 2, 5],
                        [3, 3, 3]]).float(),

(tensor([[1., 2., 5.],
         [3., 3., 3.]]),)

### Test M1 GPU

In [30]:
print(torch.__version__)

1.13.1


In [31]:
import timeit
import torch
import random
x = torch.ones(5000, device="mps")
timeit.timeit(lambda: x * random.randint(0,100), number=100000)

2.6527948340000194

In [32]:
In [18]: # toy example cpu
import timeit
import torch
import random

x = torch.ones(5000, device="cpu")
timeit.timeit(lambda: x * random.randint(0,100), number=100000)

0.24565041699997892

In [33]:
import re
text = "Grocery - Local/Specialty"
' '.join(re.sub('[^a-zA-Z0-9\n\.]', ' ', text).split()).lower()

'grocery local specialty'

In [None]:
grocery local specialty