In [None]:
# 1. Self-attention by hand
# 2. Self-attention block in pytorch
# 3. GPT, piece-by-piece
# 4. GPU goes rrrr!

# Original code from https://github.com/karpathy/minGPT/tree/master/mingpt

### Step 1: Self-attention by hand

In [None]:
import torch
import math
import torch.nn.functional as F

In [None]:
#  -- Write the scaled dot product self attention
  # 1. Compute queries, keys, and values
  # 2. Compute dot products
  # 3. Scale the dot products
  # 4. Apply softmax to calculate attentions
  # 5. Weight values by attentions
  # 6. Compute attention weighted features

In [None]:
# Choose random values for the parameters -- same values as on slide 12, but in pytorch format
# T = 4, C = 6, H = 3
# Input sequence: one row per token (T = 4), one column per input feature (C = 6).
X = torch.tensor(
    [
        [2, 0, 0, 0, 2, 1],
        [0, 1, 2, 0, 0, 0],
        [0, 0, 1, 1, 0, 1],
        [2, 0, 0, 1, 0, 1],
    ],
    dtype=torch.float64,
)  # T x C

# Transposed query weights: maps the C input features to H = 3 head features.
W_QT = torch.tensor(
    [
        [1, 0, 0],
        [1, 1, 0],
        [0, 0, 1],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 1],
    ],
    dtype=torch.float64,
)  # C x H

# Transposed key weights.
W_KT = torch.tensor(
    [
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, -1],
    ],
    dtype=torch.float64,
)  # C x H

# Transposed value weights.
W_VT = torch.tensor(
    [
        [10, 0, 0],
        [0, 0, 10],
        [0, 0, 0],
        [0, 10, 0],
        [0, 0, 0],
        [0, 0, 0],
    ],
    dtype=torch.float64,
)  # C x H

In [None]:
# What does the second dimension of matrices Q and K correspond to?

In [None]:
# compute the weighted attention matrix S
# Exercise placeholder: replace ??? with your own expression.
# Presumably S is the result of steps 1-4 of the outline above (queries/keys,
# dot products, scaling, softmax), i.e. a T x T matrix -- TODO confirm against the slides.
S = ???


In [None]:
# compute the self-attention matrix A
# Exercise placeholder: replace ??? with your own expression.
# The sanity check in the next cell compares A against a 4 x 3 tensor, i.e. T x H.
A = ???

In [None]:
# Sanity check. This should return True.
# X and the weight matrices were created with dtype=float (float64), whereas the
# expected tensor below is created with torch's default dtype (float32 unless it
# has been changed); A.float() casts A so both operands share a dtype.
# The expected value has shape 4 x 3 (T x H).
torch.allclose(A.float(), torch.tensor([[10.30759701,  2.83283874,  4.59026201],
        [10.10551833,  2.97334971,  4.50027071],
        [15.03361159,  4.13169018,  2.10990693],
        [ 3.06082018,  1.53041009,  7.70438486]]))

### Step 2: Self-attention block in pytorch

In [None]:
import torch
import torch.nn as nn
# Import the functional API from its documented location. The previous form,
# `from torch.functional import F`, only worked because torch/functional.py
# happens to import torch.nn.functional under that name internally.
import torch.nn.functional as F

In [None]:
# do not modify this code
# Global dimensions used to build the random inputs of the unit tests below.
# The letters match the `B, T, C = x.shape` unpacking in Head.forward.

batch_size = 3 # B: first dimension of the test inputs
block_size = 2 # T: second dimension of the test inputs (also the size of the 'tril' mask buffer)
n_embd = 3     # C: last dimension of the test inputs (rebound to 6 in a later cell)

In [None]:
# Display tensors with 8 decimal places, the same precision used by the
# expected values in the sanity checks, so printed outputs can be compared by eye.
torch.set_printoptions(precision=8)

In [None]:
# Build a scaled self-attention head without masked attention and without dropout (i.e. just key, query and values)
# A matrix multiplication is implemented using the nn.Linear() operator with no bias.
class Head(nn.Module):
    """Exercise skeleton: a single scaled self-attention head.

    This first version has no masking and no dropout (key, query and value
    only). The ``???`` placeholders are meant to be filled in by the reader;
    per the cell comment, each matrix multiplication is an ``nn.Linear``
    layer without bias.
    """

    def __init__(self, head_size):
        """Create the layers of the head.

        Args:
            head_size: size passed by the caller (the unit test uses
                ``Head(2)`` and expects an output whose last dimension is 2).
        """
        super().__init__()
        self.key = ???
        ???

    def forward (self, x):
        """Apply the attention head to ``x``.

        Args:
            x: 3-D tensor unpacked below as (B, T, C).

        Returns:
            The tensor ``out``; the sanity check expects shape (B, T, head_size).
        """
        B, T, C = x.shape
        out = ???
        return out

In [None]:
# Unit test. Do not modify this code
# The seed is reset twice on purpose: the first call makes whatever random
# parameter initialisation Head performs reproducible, the second makes the
# random input x reproducible regardless of how much randomness Head consumed.
torch.manual_seed(123) # do not remove this line
h = Head(2)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size, block_size, n_embd))
out = h(x)
# Bare last expression: displays the result in the notebook.
out

In [None]:
# Sanity check. This should return True.
# The expected tensor has shape (3, 2, 2), i.e. (batch_size, block_size, head_size)
# for the Head(2) built in the unit test above.
torch.allclose(out, torch.tensor([[[-0.46728206,  0.03477207],
         [-0.47425330,  0.05069541]],
        [[-0.38198256,  0.02403205],
         [-0.39846635,  0.02506737]],
        [[-0.29631630,  0.12201238],
         [-0.30199534,  0.12650707]]]))

In [None]:
# Add weighted masked attention and dropout. Dropout comes after the softmax and before the multiplication with the value matrix.
# Copy the Head class from the previous exercise and expand upon it.

class Head(nn.Module):
    """Exercise skeleton: self-attention head with masked attention and dropout.

    This definition intentionally replaces the earlier ``Head`` class in the
    notebook namespace. Per the cell comment, dropout is applied after the
    softmax and before the multiplication with the value matrix.

    NOTE(review): no dropout rate is defined in the cells shown here -- it
    presumably has to be supplied when filling in the placeholders.
    """

    def __init__(self, head_size):
        """Create the layers of the head and register the mask buffer.

        Args:
            head_size: size passed by the caller (the unit test uses ``Head(2)``).
        """
        super().__init__()
        ???
        # Lower-triangular matrix of ones, sized by the global block_size.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) # store a persistent buffer for the forward pass

    def forward (self, x):
        """Apply the masked attention head to ``x``.

        Args:
            x: 3-D tensor unpacked below as (B, T, C).

        Returns:
            The tensor ``out``; the sanity check expects shape (B, T, head_size).
        """
        B, T, C = x.shape
        out = ???
        return out

In [None]:
# Unit test. Do not modify this code
# Same protocol as the unmasked test: seed before constructing the module and
# seed again before drawing x, so both are reproducible.
torch.manual_seed(123) # do not remove this line
h = Head(2)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size, block_size, n_embd))
out = h(x)
# Bare last expression: displays the result in the notebook.
out

In [None]:
# Sanity check. This should return True.
# The second row of every batch (the last time step) is identical to the
# expected value of the unmasked check above; only the first rows differ.
# This suggests dropout has no effect in this test (e.g. a rate of 0) -- TODO confirm.
torch.allclose(out, torch.tensor([[[-0.37939820, -0.16596894],
         [-0.47425330,  0.05069541]],
        [[-0.14184165,  0.00894911],
         [-0.39846635,  0.02506737]],
        [[-0.17301908,  0.02442869],
         [-0.30199534,  0.12650707]]]))


In [None]:
# A multi-head attention module contains a list of heads and a linear projection layer.
# The heads are applied to the input and then concatenated along the last dimension, then
# the linear layer is applied. Look at the unit test below to determine the dimensions of
# the linear layer.

class MultiHeadAttention(nn.Module):
    """Exercise skeleton: several attention heads followed by a linear projection.

    Per the cell comment, the heads are applied to the input, their outputs are
    concatenated along the last dimension, and a linear layer is then applied.
    The ``???`` placeholders are meant to be filled in by the reader.
    """
    def __init__(self, num_heads, head_size):
        """Create the heads and the projection layer.

        Args:
            num_heads: number of heads (3 in the unit test).
            head_size: size passed to each head (2 in the unit test).

        The unit test feeds an input whose last dimension is n_embd = 6 and
        expects an output with the same last dimension; presumably the linear
        layer maps num_heads * head_size features to n_embd -- TODO confirm.
        """
        super().__init__()
        self.heads = ???

    def forward (self, x):
        """Apply the heads and the projection to ``x`` and return the result."""
        out = ???
        return out

In [None]:
# do not modify
# NOTE: n_embd is rebound here from 3 (earlier configuration cell) to 6, which
# equals num_heads * head_size. Any earlier cell that reads the global n_embd
# will see 6 if it is re-run after this cell, so run the notebook top to bottom.
num_heads = 3
head_size = 2
n_embd = 6

In [None]:
# Unit test. Do not modify this code
# Seed before constructing the module and again before drawing x, so both the
# module's random initialisation and the input are reproducible.
torch.manual_seed(123) # do not remove this line
# num_heads is passed as the literal 3, which matches the global num_heads above.
sa = MultiHeadAttention(num_heads=3, head_size=head_size)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size, block_size, n_embd))
out = sa(x)

In [None]:
# Sanity check. This should return True.
# The expected tensor has shape (3, 2, 6), i.e. (batch_size, block_size, n_embd).
torch.allclose(out, torch.tensor([[[-0.03730504, -0.07006130, -0.27096999,  0.13144857, -0.45049590,
          -0.33217290],
         [-0.06818272, -0.04490501, -0.34806073,  0.15622401, -0.45459983,
          -0.33084857]],
        [[-0.08914752, -0.03846309, -0.36569631,  0.09802882, -0.39963537,
          -0.29225215],
         [-0.04541985,  0.01269679, -0.25225419,  0.08241771, -0.41533324,
          -0.30674040]],
        [[ 0.15234883, -0.08591781, -0.10099770,  0.19886394, -0.49236685,
          -0.43605998],
         [ 0.15418015, -0.01837257, -0.00573672,  0.14228639, -0.48172480,
          -0.40757987]]]))


In [None]:
# Add a classical feedforward module: linear -> ReLU -> linear
# The hidden dimension is four times bigger than the input dimension (see Section 3.3 of Attention is All You Need)
#
class FeedForward(nn.Module):
    """Exercise skeleton: position-wise feed-forward module.

    Per the cell comment the intended structure is linear -> ReLU -> linear,
    with a hidden dimension four times the input dimension. The ``???``
    placeholders are meant to be filled in by the reader.
    """
    def __init__(self, n_embd):
        """Create the layers.

        Args:
            n_embd: input dimension (the unit test passes the global n_embd
                and expects an output with the same last dimension).
        """
        super().__init__()
        ???

    def forward(self, x):
        """Apply the feed-forward module to ``x`` and return the result."""
        out = ???
        return out

In [None]:
# Unit test. Do not modify this code
# Seed before constructing the module and again before drawing x, so both the
# module's random initialisation and the input are reproducible.
torch.manual_seed(123) # do not remove this line
ff = FeedForward(n_embd)
torch.manual_seed(123) # do not remove this line
# Unlike the other tests, the input here is 2-D: (3, n_embd).
x = torch.rand((3,n_embd))
out = ff(x)
# Bare last expression: displays the result in the notebook.
out

In [None]:
# Sanity check. This should return True.
# The expected tensor has shape (3, 6), the same shape as the input x.
torch.allclose(out, torch.tensor([[-0.58034140,  0.04641046, -0.10707694,  0.21581653, -0.30361831,
         -0.07352637],
        [-0.48917407,  0.07879593, -0.15972012,  0.17862344, -0.37070659,
         -0.07852858],
        [-0.48530388,  0.09604470, -0.06524836,  0.16611034, -0.35499069,
         -0.08964306]]))

In [None]:
# Build a self-attention block
#
#   in -----> LayerNorm -------> multi-head attention -- + ----> LayerNorm -----> FeedForward --- + -----> out
#         |                                              |   |                                    |
#          ----------------------------------------------     ------------------------------------                       
#
# This architecture is slightly different from Attention is All You Need (or the UDL textbook):
# the layer norm comes before (not after) the attention or feed-forward
#
class Block(nn.Module):
    """Exercise skeleton: one self-attention block.

    Following the diagram in the cell comment, the input goes through
    LayerNorm and multi-head attention and is added back to the input
    (first skip connection); the sum goes through a second LayerNorm and the
    feed-forward module and is added back again (second skip connection).
    The layer norms come before, not after, attention and feed-forward.
    """

    def __init__(self, n_embd, n_head):
        """Create the sub-modules of the block.

        Args:
            n_embd: embedding dimension.
            n_head: number of attention heads.
        """
        super().__init__()
        # Integer division: each head gets n_embd // n_head features
        # (6 // 3 = 2 in the unit test).
        head_size = n_embd // n_head
        ???

    def forward(self, x):
        """Apply the block to ``x`` and return the result."""
        out = ???
        return out

In [None]:
# Unit test. Do not modify this code
# Seed before constructing the module and again before drawing x, so both the
# module's random initialisation and the input are reproducible.
torch.manual_seed(123) # do not remove this line
bk = Block(n_embd, num_heads)
torch.manual_seed(123) # do not remove this line
x = torch.rand((batch_size,block_size,n_embd))
out = bk(x)
# Bare last expression: displays the result in the notebook.
out

In [None]:
# Sanity check. This should return True.
# The expected tensor has shape (3, 2, 6), the same shape as the input x.
torch.allclose(out, torch.tensor([[[-0.05278997, -0.10863629, -0.09458938,  0.97590691, -0.55101192,
           0.57085067],
         [-0.16924502, -0.45394337, -0.25217158,  1.10904062, -0.34593600,
           0.41432184]],
        [[-0.41515028, -0.30126408, -0.11399293,  0.64651299, -0.51579159,
           0.57017863],
         [-0.02535054,  0.08704096,  0.66524690,  0.69768047,  0.05969021,
           0.69993609]],
        [[ 0.52881187,  0.34458166,  0.31130391,  1.11564195,  0.37998506,
          -0.02971917],
         [ 1.38496208,  0.60325992,  0.99346304,  0.38082033,  0.62151432,
           0.47973478]]]))

In [None]:
## Step 3: Build a mini GPT
#
# - Start from the gpt-problem.py file
# - Add your Head, MultiHeadAttention, FeedForward and Block classes
# - Fill in the GPT class (__init__ and forward methods)
# - Train the network on CPU
# - Train the network on GPU

# For __init__, the GPT model parameters are:
#   - a token embedding table
#   - a positional embedding table
#   - a sequence of Blocks
#   - a layer norm
#   - a linear layer
#
# For forward(), the model consists of:
#   - applying the token embedding table and positional embedding table to the input tensor
#   - adding the two together
#   - applying the blocks, layer norm and linear layer (in that order)
#
# The code comes with hyperparameters that should work well on GPU.  On CPU, you
# will need to reduce the model size significantly.
#
# In pytorch, a learnable embedding table is implemented with nn.Embedding(...)
#
# The token embedding table learns an embedding for each item of the vocabulary. The 
# positional embedding table does not depend on the input and learns an embedding
# for each position in the context.