# An LLM Architecture (Backbone)

Let's set the configuration:

In [1]:
# Hyperparameters of the model; the model constructor looks them up by key.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # Vocabulary size
    context_length=1024,  # Context length (number of positions)
    emb_dim=768,          # Embedding dimension
    n_heads=12,           # Number of attention heads
    n_layers=12,          # Number of layers
    drop_rate=0.1,        # Dropout rate
    qkv_bias=False,       # Query-Key-Value bias
)

## A Placeholder GPT2 Model Architecture Class

In [11]:
import torch
import torch.nn as nn

class DummyGPTModel(nn.Module):
  """Skeleton GPT-style model wired with placeholder blocks and normalization.

  The data path is: token + position embeddings -> dropout -> stack of
  `DummyTransformerBlock` -> `DummyLayerNorm` -> bias-free linear head that
  produces one score per vocabulary entry.

  Args:
    config: Mapping that provides the keys "vocab_size", "context_length",
      "emb_dim", "drop_rate" and "n_layers".
  """

  def __init__(self, config):
    super().__init__()
    vocab_size = config["vocab_size"]
    emb_dim = config["emb_dim"]

    # Lookup tables: one row per token id and one row per position index.
    self.token_emb = nn.Embedding(vocab_size, emb_dim)
    self.position_emb = nn.Embedding(config["context_length"], emb_dim)
    self.drop_emb = nn.Dropout(config["drop_rate"])

    # a placeholder for TransformerBlock
    blocks = [DummyTransformerBlock(config) for _ in range(config["n_layers"])]
    self.transformer_blocks = nn.Sequential(*blocks)

    # a placeholder for LayerNorm
    self.final_norm = DummyLayerNorm(emb_dim)

    # Maps every hidden vector to vocab_size scores (no bias term).
    self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

  def forward(self, input_token):
    """Compute logits for a 2-D tensor of token ids.

    Args:
      input_token: Integer tensor with exactly two dimensions
        (batch, sequence); the shape is unpacked into two values.

    Returns:
      Tensor of shape (batch, sequence, vocab_size).
    """
    _, sequence_length = input_token.shape
    # Position indices 0..sequence_length-1 on the same device as the input.
    positions = torch.arange(sequence_length, device=input_token.device)
    # Position embeddings have no batch axis; the addition broadcasts them.
    x = self.token_emb(input_token) + self.position_emb(positions)
    x = self.drop_emb(x)
    x = self.transformer_blocks(x)
    x = self.final_norm(x)
    return self.out_head(x)


# placeholder
class DummyTransformerBlock(nn.Module):
  """Stand-in for a transformer block that passes its input through untouched."""

  def __init__(self, config):
    # `config` is accepted but not used; this module creates no parameters.
    super().__init__()

  def forward(self, x):
    """Return `x` exactly as received."""
    return x


# placeholder
class DummyLayerNorm(nn.Module):
  """Stand-in for layer normalization that passes its input through untouched.

  Note: the constructor parameter is named `config`, but DummyGPTModel passes
  the embedding dimension (`config["emb_dim"]`) here. The value is ignored
  either way.
  """

  def __init__(self, config):
    # The argument is accepted but not used; this module creates no parameters.
    super().__init__()

  def forward(self, x):
    """Return `x` exactly as received."""
    return x

In [12]:
# setup example
import tiktoken

bpe_tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each text to token ids and stack the id tensors into a single
# 2-D batch; torch.stack needs every row to have the same length.
batch = torch.stack(
    [torch.tensor(bpe_tokenizer.encode(text)) for text in (txt1, txt2)]
)
batch

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

In [13]:
# create an instance
# Seed before construction so the randomly initialized weights are reproducible.
torch.manual_seed(211)
model = DummyGPTModel(GPT_CONFIG_124M)

# Forward pass on the token-id batch: one row of vocab_size scores per token.
logits = model(batch)
print("output shape: ", logits.shape)
# Bare last expression so the notebook displays the tensor.
logits

output shape:  torch.Size([2, 4, 50257])


tensor([[[-0.4087, -1.2679,  0.3791,  ...,  0.2964,  0.7493,  0.3239],
         [-0.2316,  0.2405, -0.4450,  ...,  0.3991,  0.0026,  0.1075],
         [-0.0059,  0.2291, -0.9134,  ..., -1.7940, -0.0074, -0.3314],
         [ 1.5600, -0.6588,  0.6326,  ..., -0.5926, -0.8299,  2.0698]],

        [[-0.8229, -1.3703,  0.1186,  ...,  0.2430,  0.7330,  0.2529],
         [-0.7035, -0.2469, -0.4727,  ...,  0.0581,  0.4801,  0.4443],
         [ 0.6955, -0.2896,  0.0219,  ..., -1.2846,  0.4151, -0.0240],
         [ 0.5667, -0.1500, -0.0109,  ..., -1.5656, -1.3297,  2.0111]]],
       grad_fn=<UnsafeViewBackward0>)

# Implement LayerNorm

Let's experiment with a small example: a random batch passed through a linear layer followed by ReLU.

In [14]:
# example setup
torch.manual_seed(211)

# create 3 training examples with 5 dimensions each
batch_example = torch.randn(3, 5)

# linear projection from 5 to 7 features, then ReLU (negatives become 0)
layer = nn.Sequential(
    nn.Linear(in_features=5, out_features=7),
    nn.ReLU(),
)

output = layer(batch_example)
output

tensor([[1.0540, 0.2893, 0.0442, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8197, 0.1849, 0.1102, 1.7938, 0.3290, 1.2042, 0.0000],
        [0.3081, 0.1382, 0.2541, 0.0000, 0.5109, 0.0000, 0.0000]],
       grad_fn=<ReluBackward0>)

In [28]:
# Per-sample statistics: reduce over the last (feature) axis and keep it as a
# size-1 axis so the results broadcast against `output`.
# torch.var uses the unbiased estimator (divides by n-1) by default.
mean = torch.mean(output, dim=-1, keepdim=True)
var = torch.var(output, dim=-1, keepdim=True)
print("mean: ", mean)
print("variance: ", var)

mean:  tensor([[0.1982],
        [0.6345],
        [0.1730]], grad_fn=<MeanBackward1>)
variance:  tensor([[0.1536],
        [0.4460],
        [0.0383]], grad_fn=<VarBackward0>)


Let's normalize the output (i.e., apply layer normalization) by subtracting the mean and dividing by the standard deviation:

In [31]:
# Standardize each row: subtract its mean, divide by its standard deviation.
norm_output = (output - mean) / var.sqrt()
norm_mean = torch.mean(norm_output, dim=-1, keepdim=True)
norm_var = torch.var(norm_output, dim=-1, keepdim=True)

# Print values as plain decimals instead of scientific notation.
torch.set_printoptions(sci_mode=False)
print("normalized mean: \n", norm_mean)
print("normalized variance: \n", norm_var)
print("normalized layer output: \n", norm_output)

normalized mean: 
 tensor([[     0.0000],
        [     0.0000],
        [    -0.0000]], grad_fn=<MeanBackward1>)
normalized variance: 
 tensor([[1.0000],
        [1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
normalized layer output: 
 tensor([[ 2.1836,  0.2324, -0.3930, -0.5058, -0.5058, -0.5058, -0.5058],
        [ 0.2772, -0.6733, -0.7852,  1.7360, -0.4575,  0.8530, -0.9502],
        [ 0.6897, -0.1777,  0.4139, -0.8838,  1.7256, -0.8838, -0.8838]],
       grad_fn=<DivBackward0>)


## Implement LayerNorm class

In [33]:
class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with learnable scale and shift.

  Each vector along the last dimension is shifted to zero mean and scaled to
  unit (biased) variance, then multiplied by `scale` and offset by `shift`.

  Args:
    emb_dim: Size of the last dimension of the inputs; also the length of the
      learnable `scale` and `shift` vectors.
    epsilon: Small constant added to the variance before taking the square
      root, so the division stays finite when a vector has zero variance.
      Defaults to 1e-5.
  """

  def __init__(self, emb_dim, epsilon=1e-5):
    super().__init__()
    self.epsilon = epsilon
    # Start as the identity transform: scale of ones, shift of zeros.
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Normalize `x` along its last dimension, then apply scale and shift.

    Args:
      x: Tensor whose last dimension has size `emb_dim`.

    Returns:
      Tensor with the same shape as `x`.
    """
    mean = x.mean(dim=-1, keepdim=True)
    # unbiased=False divides by n, i.e. it turns Bessel's correction (n-1) OFF
    # and yields the biased variance.
    var = x.var(dim=-1, unbiased=False, keepdim=True)
    norm_x = (x - mean) / torch.sqrt(var + self.epsilon)
    return self.scale * norm_x + self.shift

In [35]:
# setup example
layer_norm = LayerNorm(emb_dim=5)
layer_norm_output = layer_norm(batch_example)
mean = layer_norm_output.mean(dim=-1, keepdim=True)
var = layer_norm_output.var(dim=-1, unbiased=False, keepdim=True)

print("normalized mean: \n", mean)
print("normalized variance: \n", var)

normalized mean: 
 tensor([[    -0.0000],
        [    -0.0000],
        [    -0.0000]], grad_fn=<MeanBackward1>)
normalized variance: 
 tensor([[1.0000],
        [1.0000],
        [0.9999]], grad_fn=<VarBackward0>)


# Implement an FFN With GELU Activations