In [2]:
import math

import torch
import torch.nn as nn

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [3]:
# Hyperparameters for the 124M-parameter GPT configuration used in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # number of entries in the token vocabulary
    context_length=1024,  # context length
    emb_dim=768,          # width of the embedding vectors
    n_heads=12,           # attention heads
    n_layers=12,          # number of layers
    drop_rate=0.1,        # dropout rate
    qkv_bias=False,       # whether the query/key/value projections use a bias
)

In [4]:
# GELU (Gaussian Error Linear Unit) is a nonlinear activation function:
# its output is not directly proportional to its input.
class GELU(nn.Module):
    """Tanh approximation of the GELU activation.

    Applies, elementwise,

        0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))

    Args (forward):
        x: tensor of any shape.

    Returns:
        A tensor with the same shape as ``x``.
    """

    # sqrt(2 / pi), computed once as a Python float instead of building a new
    # float32 tensor on every forward call. A Python scalar keeps full double
    # precision for float64 inputs and does not change the dtype of the
    # floating-point tensor it is multiplied with.
    _SQRT_2_OVER_PI = math.sqrt(2.0 / math.pi)

    def forward(self, x):
        # Argument of the tanh: a cubic polynomial in x scaled by sqrt(2 / pi).
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [5]:
# The feed-forward block is a small neural network inside the transformer
# architecture: two linear layers with a GELU activation between them.
class FeedForward(nn.Module):
    """Two-layer MLP: Linear -> GELU -> Linear.

    The first linear layer widens the features from ``cfg["emb_dim"]`` to four
    times that size (768 -> 3072 for the config in this notebook) and the
    second one projects back (3072 -> 768). The wider hidden layer adds
    parameters so the model can learn richer representations before handing
    the result to the next layer.

    Args:
        cfg: mapping that provides the key ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width

        # Build the layers in execution order; nn.Sequential runs them one
        # after another.
        expand = nn.Linear(width, hidden)   # emb_dim -> 4 * emb_dim
        activation = GELU()                 # nonlinearity between the two projections
        project = nn.Linear(hidden, width)  # 4 * emb_dim -> emb_dim
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> project stack."""
        result = self.layers(x)
        return result

In [6]:
# Random example batch: 3 rows (batch size), 768 features (embedding dimension).
# NOTE(review): the name `input` shadows Python's builtin of the same name.
input = torch.randn(3, 768)


In [7]:
# Build the feed-forward block from the 124M config and run the example batch through it.
ff = FeedForward(GPT_CONFIG_124M)
output = ff(input)

In [8]:
# The second Linear maps back to emb_dim, so the (3, 768) input shape is preserved.
output.shape

torch.Size([3, 768])

In [9]:
# Inspect the resulting values (the repr also shows the autograd grad_fn).
output

tensor([[-0.1953,  0.0382,  0.1275,  ...,  0.0697,  0.1297, -0.0184],
        [ 0.0603,  0.3750,  0.1370,  ..., -0.2600, -0.2231, -0.1813],
        [ 0.0045,  0.2654,  0.1640,  ..., -0.3183, -0.0713, -0.2159]],
       grad_fn=<AddmmBackward0>)

In [10]:
# Feed the previous result through the same FeedForward instance a second time.
# NOTE: this rebinds `output`, so re-running this cell applies `ff` once more
# each time and the value shown below depends on how often it was executed.
output = ff( output )

In [11]:
# Shape after the second pass through `ff`.
output.shape


torch.Size([3, 768])

In [12]:
# Values after the second pass through `ff`.
output

tensor([[-0.0270,  0.0369, -0.0420,  ..., -0.0222,  0.0592, -0.0603],
        [-0.0129,  0.0475, -0.0158,  ...,  0.0150,  0.0649, -0.0166],
        [-0.0793,  0.0034, -0.0127,  ..., -0.0056,  0.0518, -0.0362]],
       grad_fn=<AddmmBackward0>)