# LLM Textbook Chapter 4: Implementing a GPT model from scratch to generate text

In this chapter, we finally implement a "small", but complete, version of the GPT2 architecture. This essentially consists of the tokenization and embedding from chapter 2, a series of transformer blocks whose main component is the multi-head attention mechanism from chapter 3, and some post-processing to generate text output. We also add the other components of transformer blocks, which are feedforward networks and skip connections, and briefly discuss the GELU activation function that is popular for transformers.

Here we simply implement the model. We train it in the next chapter.

In [4]:
import torch 
import torch.nn as nn
import importlib
import tiktoken

In [5]:
# first, we will include the self-attention class from chapter 3, 
#   which we will use in the transformer blocks of our gpt model
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        # As in `CausalAttention`, for inputs where `num_tokens` exceeds `context_length`, 
        # this will result in errors in the mask creation further below. 
        # In practice, this is not a problem since the LLM (chapters 4-7) ensures that inputs  
        # do not exceed `context_length` before reaching this forwar

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) 
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)
        
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2) 
        
        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec


### Configuration and Dummy code

First we will specify the configuration (e.g. embedding size, num heads, num transformer blocks, etc) and show a dummy version of the GPT class, whose blocks we will fill in later. This gives us an overall sense of what the GPT model looks like.

In [6]:
# dictionary specifying the overall architecture of the GPT model
# this size -- e.g. 12 transformer blocks, 12 heads, etc -- is known as "GPT2-Small"
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False  # whether to have bias param for query/key/value mats; often off
}

In [7]:
# now we will implement a "dummy" GPT model to give a sense of the overall architecture
# basically it embeds the input text as in chapter 2, applies a series of transformer blocks
#   which we discuss below and whose main component is self-attention from chapter 3,
#   and then produces output.
# for now we leave the transformer and layer normalization bits blank; we'll fill them later
class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # define token and position embeddings and dropout
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        # a series of transformer blocks is the meat of the GPT architecture
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        # define a final layer normalization (see below) and output from embedded space
        #   to a full distribution of probabilities over tokens in the vocabulary
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        # take in batch size and sequence length
        batch_size, seq_len = in_idx.shape
        # embed input tokens
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)  # apply dropout after embedding steps
        x = self.trf_blocks(x)  # apply transformer blocks
        x = self.final_norm(x)  # normalize
        logits = self.out_head(x)  # convert embeddings to "logits" -- probability of each token in vocab
        return logits

# blank for now, just returns input
class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
    
    def forward(self, x):
        return x

# blank for now, just returns input
class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
    
    def forward(self, x):
        return x

In [None]:
# we can try using gpt2 tokenization to tokenize some sample input text and run our dummy model on it
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)  # convert to tensor and stack vectors as rows
print(batch)

torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print("Output shape: ", logits.shape)  # batch_size x context length x vocab size
print(logits)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Output shape:  torch.Size([2, 4, 50257])
tensor([[[-0.9289,  0.2748, -0.7557,  ..., -1.6070,  0.2702, -0.5888],
         [-0.4476,  0.1726,  0.5354,  ..., -0.3932,  1.5285,  0.8557],
         [ 0.5680,  1.6053, -0.2155,  ...,  1.1624,  0.1380,  0.7425],
         [ 0.0447,  2.4787, -0.8843,  ...,  1.3219, -0.0864, -0.5856]],

        [[-1.5474, -0.0542, -1.0571,  ..., -1.8061, -0.4494, -0.6747],
         [-0.8422,  0.8243, -0.1098,  ..., -0.1434,  0.2079,  1.2046],
         [ 0.1355,  1.1858, -0.1453,  ...,  0.0869, -0.1590,  0.1552],
         [ 0.1666, -0.8138,  0.2307,  ...,  2.5035, -0.3055, -0.3083]]],
       grad_fn=<UnsafeViewBackward0>)


### Layer normalization

Layer normalization normalizes the values of each feature in a batch so that the mean is zero and the variance is 1. This helps avoid exploding and vanishing gradients. In practice, we also allow the network to learn scale and shape parameters that are applied post-normalization, which empirically improves performance.

In [22]:
# simple layer normalization class
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # small constant to prevent div by 0 if variance is 0
        # in practice we let the network learn scale and shift to apply after normalization, if it helps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        # we use keepdim so the matrix shape is kept rather than casting to a vector of length n_features
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


### GELU

Many transformers use smoothed variants of the ReLU activation function, which tend to work better. One such function is the "gaussian error linear unit" or "GELU". The exact function for GELU(x) would technically be x*phi(x), where phi is the Gaussian CDF. In practice, a quicker approximation learned by curve-fitting is often used in practice -- see below.

In [11]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        # numerical approximation to Gaussian CDF
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [12]:
# here we'll implement a simple feedforward network using a GELU activation, 
#   which will become part of our full transformer block
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"])
        )
    
    def forward(self, x):
        return self.layers(x)


### Shortcut connections

"Shortcut connections" or "residual connections" are connections that connect input (or output from previous layers) to output from the current layer. In LLMs, this usually involves adding the input to a multi-head attention or feedforward network to the output from that network. The reason is that this helps avoid vanishing gradients, because we are adding to the vector that we are taking the gradient of. We will show an example of this below.

In [13]:
# here we show an example feedforward network with skip connections
class ExampleDeepNeuralNetwork(nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]),
                          GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]),
                          GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]),
                          GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]),
                          GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]),
                          GELU())
        ])

    def forward(self, x):
        for layer in self.layers:
            layer_output = layer(x)
            # now, if user set use_shortcut=True, we add the input x to the layer output
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:  # otherwise just use layer output
                x = layer_output
        return(x)

In [16]:
# create some sample data
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1., 0., -1.]])

In [18]:
# implement a function to compute gradients in the model's backward pass
def print_gradients(model, x):
    output = model(x)
    target = torch.tensor([[0.]])
    loss = nn.MSELoss()
    loss = loss(output, target)
    loss.backward()  # compute backward pass to calculate gradients
    # now print the losses for each layer
    for name, param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [19]:
# now let's look at the gradients without shortcut connections
# notice how it decreases for each layer (vanishing gradients)
torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.00020173587836325169
layers.1.0.weight has gradient mean of 0.0001201116101583466
layers.2.0.weight has gradient mean of 0.0007152041653171182
layers.3.0.weight has gradient mean of 0.001398873864673078
layers.4.0.weight has gradient mean of 0.005049646366387606


In [21]:
# now let's look at the gradients *with* shortcut connections
# much more stable through the layers (after the last one)
torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=True)
print_gradients(model_with_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.22169792652130127
layers.1.0.weight has gradient mean of 0.20694106817245483
layers.2.0.weight has gradient mean of 0.32896995544433594
layers.3.0.weight has gradient mean of 0.2665732502937317
layers.4.0.weight has gradient mean of 1.3258541822433472


### Transformer Blocks: putting it all together

Finally, we will implement a full transformer block; these blocks form the main part of large language models. The transformer block consists of the following steps, in order: layer normalization, masked multi-head attention (see ch3), dropout, adding input via skip connection, layer normalization again, feedforward network, dropout, adding input to feedforward step via skip connection. 

Basically it's multi-head attention and then a feedforward step with appropriate normalizations and skip connections to manage gradient issues and dropout to improve generalizability. The attention layer can be thought of as adding context to each token based on the attention weights. But it is mostly linear. The feedforward layer adds non-linearity and separately adds information to each position, as opposed to the attention layer which processes all tokens together.

The transformer block generally maintains the shape of the input, so they can easily be stacked. The output has a lot of contextual information added.

Normalization can also be performed after the attention and feedforward layers (as was done in the original transformer paper), but generally this leads to worse results than pre-layer normalization.

In [23]:
# here's the implementation of a standard transformer block
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # define attention, feedforward, normalization, dropoutl layers
        self.att = MultiHeadAttention(
            d_in = cfg["emb_dim"],
            d_out = cfg["emb_dim"],
            context_length = cfg["context_length"],
            num_heads = cfg["n_heads"],
            dropout = cfg["drop_rate"],
            qkv_bias = cfg["qkv_bias"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        shortcut = x  # save input for shortcut connection
        # apply normalization, then attention block, then dropout, then add skip connection
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        # similar, but with feedforward instead
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x
    

In [24]:
# let's try applying our transformer block to some sample data
torch.manual_seed(123)
x = torch.rand(2, 4, 768)  # generate 2-batch of 4-token texts embedded in 768-dim space
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

print("Input shape:", x.shape)
print("Output shape:", output.shape)
print(output[1,:, 1:3])

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])
tensor([[1.1680, 0.5809],
        [0.6317, 0.2002],
        [0.4202, 0.3183],
        [0.5767, 0.3411]], grad_fn=<SliceBackward0>)


### Coding the GPT Model

Now we can code our full GPT model! As specified by the config, this will consist of tokenization+embedding+dropout, then 12 transformer blocks, then a final layer normalization and output layer. With the transformer block implemented, it's actually quite compact!

In [25]:
# basic GPT2 model
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # define token and position embeddings and dropout
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        # a series of transformer blocks is the meat of the GPT architecture
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        # define a final layer normalization (see below) and output from embedded space
        #   to a full distribution of probabilities over tokens in the vocabulary
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        # take in batch size and sequence length
        batch_size, seq_len = in_idx.shape
        # embed input tokens
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)  # apply dropout after embedding steps
        x = self.trf_blocks(x)  # apply transformer blocks
        x = self.final_norm(x)  # normalize
        logits = self.out_head(x)  # convert embeddings to "logits" -- probability of each token in vocab
        return logits

In [26]:
# let's try running it on some sample data
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
out = model(batch)  # using our batch data from above

print("Input batch:\n", batch)
print("Output shape:\n", out.shape)  # batch_size x context_length x vocab_size
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Output shape:
 torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0079, -0.1957,  ..., -0.0222, -0.1062,  0.1717],
         [ 0.3867, -0.8400, -0.6558,  ..., -0.5162,  0.2362, -0.3349],
         [ 0.6985, -0.1826, -0.1634,  ...,  0.1472, -0.6503, -0.0054],
         [-0.4288,  0.1670, -0.1262,  ...,  1.1571,  0.5297, -0.5542]],

        [[ 0.1095, -0.2890, -0.1463,  ..., -0.0557,  0.2907, -0.2818],
         [ 0.0884, -0.3545, -0.3524,  ...,  1.2921,  0.0050,  0.1902],
         [ 0.6092,  0.4702, -0.4093,  ...,  0.7682,  0.3781, -0.1968],
         [-0.0608, -0.0739,  0.4747,  ...,  1.2458, -0.3834,  0.0612]]],
       grad_fn=<UnsafeViewBackward0>)


In [28]:
# here's how we can see how many parameters are in our model
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,009,536


Note that the number of params 163M instead of 124M because in the original paper they use the same weights to embed tokens and to "de-embed" the output of GPT back into tokens. The embedding is 50257*768 so actually a huge amount of parameters! Here we don't do that, and in practice it's usually better to learn separate weights for the output. But it does increase the number of params from 124M to 163M.

In [29]:
# we can also print the memory requirements of the model weights
total_size_bytes = total_params * 4
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


### Generating text

Here we actually generate text from the model given an input. We implement a simplified function that takes tokens and a maximum number of tokens to output, slides along the input tokens and output tokens so far, runs the model, and always picks the output token with the highest probability. In a more realistic model, we would pick tokens probabilistically based on their logits. We might also have a probability of an end token, rather than always generating the maximum number of tokens.

Note also that we have not yet trained the network, so the output will be gibberish.

In [31]:
# simple function to call the model to generate tokens given input tokens on a sliding window
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is a (batch, n_tokens) array in the current context window
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]  # the last context_size number of tokens
        with torch.no_grad():  # no gradient -- do not train weights
            logits = model(idx_cond)  # run our model
        
        logits = logits[:, -1, :]  # just get the logits from the last token
        probas = torch.softmax(logits, dim=-1)  # softmax to convert logits to probabilities
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # pick most likely token
        idx = torch.cat((idx, idx_next), dim=1)  # add previously output token as new "last" token
    return idx


In [32]:
# now let's create some sample data
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # adds batch dimension
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [33]:
# now let's run our model on this data
model.eval()  # in eval mode, we don't do training-time random steps like dropout
out = generate_text_simple(model=model, idx=encoded_tensor, max_new_tokens=6,
                           context_size=GPT_CONFIG_124M["context_length"])
print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Output length: 10


In [34]:
# finally, let's decode that output into text
# it's gibberish because we haven't yet trained the network! that's the next chapter.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am Featureiman Byeswickattribute argue
