<a href="https://colab.research.google.com/github/nncliff/qwen-32B/blob/main/chapter-0/picoGPT_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PicoGPT Inference Notebook

This notebook demonstrates how to load and run inference with GPT-2 using the picoGPT implementation.

picoGPT is a minimal implementation of GPT-2 in pure NumPy, making it easy to understand the core concepts of transformer-based language models.

**Features:**
- Load pre-trained GPT-2 weights
- BPE tokenization
- Text generation with greedy decoding

## 1. Setup and Imports

In [1]:
import os
import json
import re
from functools import lru_cache

import numpy as np
import regex
import requests
import tensorflow as tf
from tqdm import tqdm

## 2. BPE Tokenizer (Encoder)

The GPT-2 tokenizer uses Byte Pair Encoding (BPE) to convert text into tokens.

In [2]:
@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    """
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word."""
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class Encoder:
    """BPE Encoder/Decoder for GPT-2."""

    def __init__(self, encoder, bpe_merges, errors="replace"):
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        self.pat = regex.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Encode text to token IDs."""
        bpe_tokens = []
        for token in regex.findall(self.pat, text):
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def decode(self, tokens):
        """Decode token IDs back to text."""
        text = "".join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text


def get_encoder(model_name, models_dir):
    """Load the BPE encoder from files."""
    with open(os.path.join(models_dir, model_name, "encoder.json"), "r") as f:
        encoder = json.load(f)
    with open(os.path.join(models_dir, model_name, "vocab.bpe"), "r", encoding="utf-8") as f:
        bpe_data = f.read()
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
    return Encoder(encoder=encoder, bpe_merges=bpe_merges)

## 3. Model Loading Utilities

Functions to download GPT-2 weights from OpenAI and load them from TensorFlow checkpoints.

In [3]:
def download_gpt2_files(model_size, model_dir):
    """Download GPT-2 model files from OpenAI."""
    assert model_size in ["124M", "355M", "774M", "1558M"]
    for filename in [
        "checkpoint",
        "encoder.json",
        "hparams.json",
        "model.ckpt.data-00000-of-00001",
        "model.ckpt.index",
        "model.ckpt.meta",
        "vocab.bpe",
    ]:
        url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
        r = requests.get(f"{url}/{model_size}/{filename}", stream=True)
        r.raise_for_status()

        with open(os.path.join(model_dir, filename), "wb") as f:
            file_size = int(r.headers["content-length"])
            chunk_size = 1000
            with tqdm(
                ncols=100,
                desc="Fetching " + filename,
                total=file_size,
                unit_scale=True,
                unit="b",
            ) as pbar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    pbar.update(chunk_size)


def load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams):
    """Load GPT-2 parameters from TensorFlow checkpoint."""
    def set_in_nested_dict(d, keys, val):
        if not keys:
            return val
        if keys[0] not in d:
            d[keys[0]] = {}
        d[keys[0]] = set_in_nested_dict(d[keys[0]], keys[1:], val)
        return d

    params = {"blocks": [{} for _ in range(hparams["n_layer"])]}
    for name, _ in tf.train.list_variables(tf_ckpt_path):
        array = np.squeeze(tf.train.load_variable(tf_ckpt_path, name))
        name = name[len("model/") :]
        if name.startswith("h"):
            m = re.match(r"h([0-9]+)/(.*)", name)
            n = int(m[1])
            sub_name = m[2]
            set_in_nested_dict(params["blocks"][n], sub_name.split("/"), array)
        else:
            set_in_nested_dict(params, name.split("/"), array)

    return params


def load_encoder_hparams_and_params(model_size, models_dir):
    """Load encoder, hyperparameters, and model parameters."""
    assert model_size in ["124M", "355M", "774M", "1558M"]

    model_dir = os.path.join(models_dir, model_size)
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    if not tf_ckpt_path:  # download files if necessary
        os.makedirs(model_dir, exist_ok=True)
        download_gpt2_files(model_size, model_dir)
        tf_ckpt_path = tf.train.latest_checkpoint(model_dir)

    encoder = get_encoder(model_size, models_dir)
    hparams = json.load(open(os.path.join(model_dir, "hparams.json")))
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams)

    return encoder, hparams, params

## 4. GPT-2 Model Components

These are the core building blocks of the GPT-2 architecture, implemented in pure NumPy.

### GELU (Gaussian Error Linear Unit)

GELU is the activation function used in GPT-2 (and many modern transformers like BERT, GPT-3, etc.). Unlike ReLU which has a hard cutoff at 0, GELU provides a smooth, probabilistic gating mechanism.

**Exact Definition:**

$$\text{GELU}(x) = x \cdot \Phi(x) = x \cdot P(X \leq x)$$

where $\Phi(x)$ is the cumulative distribution function (CDF) of the standard normal distribution.

**Approximation (used in practice):**

Since computing the exact CDF is expensive, GPT-2 uses a fast approximation based on $\tanh$:

$$\text{GELU}(x) \approx 0.5 \cdot x \cdot \left(1 + \tanh\left(\sqrt{\frac{2}{\pi}} \cdot (x + 0.044715 \cdot x^3)\right)\right)$$

**Why GELU over ReLU?**
- **Smooth**: Differentiable everywhere (no kink at 0)
- **Non-monotonic**: Small negative values can have small positive outputs
- **Stochastic interpretation**: Can be seen as multiplying input by a Bernoulli mask where the probability depends on the input value

### Softmax

Softmax converts a vector of real numbers (logits) into a probability distribution. It's used in attention mechanisms and the final output layer.

**Definition:**

For a vector $\mathbf{x} = [x_1, x_2, ..., x_n]$:

$$\text{softmax}(x_i) = \frac{e^{x_i}}{\sum_{j=1}^{n} e^{x_j}}$$

**Properties:**
- Output values are in range $(0, 1)$
- All outputs sum to 1: $\sum_i \text{softmax}(x_i) = 1$
- Preserves ordering: if $x_i > x_j$, then $\text{softmax}(x_i) > \text{softmax}(x_j)$

**Numerical Stability Issue:**

Computing $e^{x_i}$ directly can cause overflow for large $x$. The solution is to subtract the maximum value:

$$\text{softmax}(x_i) = \frac{e^{x_i - \max(\mathbf{x})}}{\sum_{j=1}^{n} e^{x_j - \max(\mathbf{x})}}$$

This is mathematically equivalent (the $e^{-\max(\mathbf{x})}$ cancels out) but prevents overflow since the largest exponent is now $e^0 = 1$.

### Layer Normalization

Layer normalization stabilizes training by normalizing activations across the feature dimension. Unlike batch normalization (which normalizes across the batch), layer norm works on individual samples.

**Definition:**

For an input vector $\mathbf{x} = [x_1, x_2, ..., x_d]$ of dimension $d$:

$$\text{LayerNorm}(\mathbf{x}) = \gamma \cdot \frac{\mathbf{x} - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta$$

where:
- $\mu = \frac{1}{d}\sum_{i=1}^{d} x_i$ is the mean
- $\sigma^2 = \frac{1}{d}\sum_{i=1}^{d} (x_i - \mu)^2$ is the variance
- $\gamma$ (scale) and $\beta$ (shift) are learnable parameters
- $\epsilon$ is a small constant (e.g., $10^{-5}$) for numerical stability

**Why Layer Norm?**
- **Stabilizes gradients**: Prevents activations from becoming too large or too small
- **Enables deeper networks**: Essential for training transformers with many layers
- **Independent of batch size**: Works the same during training and inference

**In GPT-2:**
- Applied **before** attention and FFN (Pre-LN architecture)
- Each transformer block has two layer norms: `ln_1` (before attention) and `ln_2` (before FFN)

### Linear (Fully Connected Layer)

The linear transformation is the fundamental building block of neural networks. It's also called a fully connected layer, dense layer, or affine transformation.

**Definition:**

For input $\mathbf{x} \in \mathbb{R}^{d_{in}}$, weight matrix $\mathbf{W} \in \mathbb{R}^{d_{in} \times d_{out}}$, and bias $\mathbf{b} \in \mathbb{R}^{d_{out}}$:

$$\text{Linear}(\mathbf{x}) = \mathbf{x} \mathbf{W} + \mathbf{b}$$

**Shape transformation:**
- Input: $[m, d_{in}]$ (batch of $m$ vectors)
- Weight: $[d_{in}, d_{out}]$
- Bias: $[d_{out}]$
- Output: $[m, d_{out}]$

**In GPT-2, linear layers are used for:**
- **QKV projection** (`c_attn`): Projects input to query, key, value vectors
- **Output projection** (`c_proj`): Projects attention output back to embedding dimension
- **FFN up-projection** (`c_fc`): Expands from $d_{model}$ to $4 \cdot d_{model}$
- **FFN down-projection** (`c_proj`): Compresses back to $d_{model}$

**Note:** The `@` operator in NumPy/Python performs matrix multiplication.

In [4]:
def gelu(x):
    """Gaussian Error Linear Unit activation function."""
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))


def softmax(x):
    """Numerically stable softmax function."""
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


def layer_norm(x, g, b, eps: float = 1e-5):
    """Layer normalization."""
    mean = np.mean(x, axis=-1, keepdims=True)
    variance = np.var(x, axis=-1, keepdims=True)
    x = (x - mean) / np.sqrt(variance + eps)  # normalize x to have mean=0 and var=1 over last axis
    return g * x + b  # scale and offset with gamma/beta params


def linear(x, w, b):  # [m, in], [in, out], [out] -> [m, out]
    """Linear transformation."""
    return x @ w + b

### Feed-Forward Network (FFN / MLP)

The Feed-Forward Network (also called MLP or position-wise feed-forward) is applied independently to each position in the sequence. It consists of two linear transformations with a GELU activation in between.

**Definition:**

$$\text{FFN}(\mathbf{x}) = \text{Linear}_2(\text{GELU}(\text{Linear}_1(\mathbf{x})))$$

Or more explicitly:

$$\text{FFN}(\mathbf{x}) = \text{GELU}(\mathbf{x} \mathbf{W}_1 + \mathbf{b}_1) \mathbf{W}_2 + \mathbf{b}_2$$

**Architecture:**
```
Input [n_seq, d_model]
    ↓
Linear (up-projection): d_model → 4 × d_model
    ↓
GELU activation
    ↓
Linear (down-projection): 4 × d_model → d_model
    ↓
Output [n_seq, d_model]
```

**Why 4× expansion?**
- The intermediate dimension is typically $4 \times d_{model}$ (e.g., 768 → 3072 for GPT-2 small)
- This "bottleneck" design allows the network to learn richer representations
- The expansion provides more capacity for non-linear transformations

**In GPT-2:**
- `c_fc`: The up-projection weights (expands dimension)
- `c_proj`: The down-projection weights (compresses back)
- Applied after layer norm in each transformer block

### Scaled Dot-Product Attention

Attention is the core mechanism that allows the model to focus on relevant parts of the input when producing each output. It computes a weighted sum of values, where the weights are determined by the similarity between queries and keys.

**Definition:**

$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}} + \text{mask}\right) V$$

where:
- $Q \in \mathbb{R}^{n_q \times d_k}$ — Query matrix (what we're looking for)
- $K \in \mathbb{R}^{n_k \times d_k}$ — Key matrix (what we match against)
- $V \in \mathbb{R}^{n_k \times d_v}$ — Value matrix (what we retrieve)
- $d_k$ — Dimension of keys (used for scaling)
- mask — Causal mask to prevent attending to future tokens

**Step-by-step breakdown:**

1. **Compute attention scores**: $QK^T$ gives similarity between each query and key
2. **Scale**: Divide by $\sqrt{d_k}$ to prevent softmax saturation for large $d_k$
3. **Mask**: Add $-\infty$ (or $-10^{10}$) to positions we shouldn't attend to
4. **Softmax**: Convert scores to probabilities (rows sum to 1)
5. **Weighted sum**: Multiply by $V$ to get the output

**Why scale by $\sqrt{d_k}$?**

Without scaling, when $d_k$ is large, the dot products $q \cdot k$ grow large in magnitude, pushing softmax into regions with extremely small gradients. Scaling by $\sqrt{d_k}$ keeps the variance stable.

**Causal Mask (for GPT):**

For autoregressive models, we use a lower-triangular mask so position $i$ can only attend to positions $\leq i$:

$$\text{mask}_{ij} = \begin{cases} 0 & \text{if } j \leq i \\ -\infty & \text{if } j > i \end{cases}$$

### Multi-Head Attention (MHA)

Multi-Head Attention runs multiple attention operations in parallel, each with different learned projections. This allows the model to jointly attend to information from different representation subspaces.

**Definition:**

$$\text{MHA}(X) = \text{Concat}(\text{head}_1, ..., \text{head}_h) W^O$$

where each head is:

$$\text{head}_i = \text{Attention}(XW_i^Q, XW_i^K, XW_i^V)$$

**Architecture:**
```
Input X [n_seq, d_model]
    ↓
Linear projection → Q, K, V [n_seq, 3 × d_model]
    ↓
Split into n_head heads → each head has dim d_k = d_model / n_head
    ↓
Parallel attention on each head (with causal mask)
    ↓
Concatenate heads → [n_seq, d_model]
    ↓
Output projection → [n_seq, d_model]
```

**Why Multiple Heads?**

- **Different attention patterns**: Each head can learn to focus on different aspects (e.g., syntax vs. semantics, nearby vs. distant tokens)
- **Richer representations**: More expressive than a single attention with the same total parameters
- **Parallel computation**: All heads compute independently, enabling efficient parallelization

**GPT-2 Configuration:**
| Model | $d_{model}$ | $n_{head}$ | $d_k = d_{model}/n_{head}$ |
|-------|-------------|------------|----------------------------|
| 124M  | 768         | 12         | 64                         |
| 355M  | 1024        | 16         | 64                         |
| 774M  | 1280        | 20         | 64                         |
| 1558M | 1600        | 25         | 64                         |

**In the code:**
- `c_attn`: Combined QKV projection weights $[d_{model}, 3 \times d_{model}]$
- `c_proj`: Output projection weights $[d_{model}, d_{model}]$
- `np.split(..., 3)`: Splits the projection into Q, K, V
- `np.split(..., n_head)`: Splits each into multiple heads

In [5]:
def ffn(x, c_fc, c_proj):  # [n_seq, n_embd] -> [n_seq, n_embd]
    """Feed-forward network (MLP) in transformer."""
    # project up
    a = gelu(linear(x, **c_fc))  # [n_seq, n_embd] -> [n_seq, 4*n_embd]

    # project back down
    x = linear(a, **c_proj)  # [n_seq, 4*n_embd] -> [n_seq, n_embd]

    return x


def attention(q, k, v, mask):  # [n_q, d_k], [n_k, d_k], [n_k, d_v], [n_q, n_k] -> [n_q, d_v]
    """Scaled dot-product attention."""
    return softmax(q @ k.T / np.sqrt(q.shape[-1]) + mask) @ v


def mha(x, c_attn, c_proj, n_head):  # [n_seq, n_embd] -> [n_seq, n_embd]
    """Multi-head attention."""
    # qkv projection
    x = linear(x, **c_attn)  # [n_seq, n_embd] -> [n_seq, 3*n_embd]

    # split into qkv
    qkv = np.split(x, 3, axis=-1)  # [n_seq, 3*n_embd] -> [3, n_seq, n_embd]

    # split into heads
    qkv_heads = list(map(lambda x: np.split(x, n_head, axis=-1), qkv))  # [3, n_seq, n_embd] -> [3, n_head, n_seq, n_embd/n_head]

    # causal mask to hide future inputs from being attended to
    causal_mask = (1 - np.tri(x.shape[0], dtype=x.dtype)) * -1e10  # [n_seq, n_seq]

    # perform attention over each head
    out_heads = [attention(q, k, v, causal_mask) for q, k, v in zip(*qkv_heads)]  # [3, n_head, n_seq, n_embd/n_head] -> [n_head, n_seq, n_embd/n_head]

    # merge heads
    x = np.hstack(out_heads)  # [n_head, n_seq, n_embd/n_head] -> [n_seq, n_embd]

    # out projection
    x = linear(x, **c_proj)  # [n_seq, n_embd] -> [n_seq, n_embd]

    return x

### Transformer Block

The transformer block is the fundamental repeating unit of GPT-2. Each block combines self-attention (for mixing information across positions) with a feed-forward network (for processing each position independently), connected via residual connections and layer normalization.

**Definition (Pre-LN Architecture):**

GPT-2 uses the **Pre-LayerNorm** variant, where normalization is applied *before* each sub-layer:

$$\mathbf{x} = \mathbf{x} + \text{MHA}(\text{LayerNorm}(\mathbf{x}))$$
$$\mathbf{x} = \mathbf{x} + \text{FFN}(\text{LayerNorm}(\mathbf{x}))$$

**Architecture Diagram:**
```
Input x [n_seq, d_model]
    │
    ├───────────────────────────┐
    ↓                           │ (residual)
LayerNorm (ln_1)                │
    ↓                           │
Multi-Head Attention            │
    ↓                           │
    + ←─────────────────────────┘
    │
    ├───────────────────────────┐
    ↓                           │ (residual)
LayerNorm (ln_2)                │
    ↓                           │
Feed-Forward Network            │
    ↓                           │
    + ←─────────────────────────┘
    ↓
Output [n_seq, d_model]
```

**Key Components:**

| Component | Purpose | Parameters |
|-----------|---------|------------|
| `ln_1` | Normalize before attention | $\gamma_1, \beta_1$ |
| `attn` | Multi-head self-attention | $W^Q, W^K, W^V, W^O$ |
| `ln_2` | Normalize before FFN | $\gamma_2, \beta_2$ |
| `mlp` | Position-wise feed-forward | $W_1, b_1, W_2, b_2$ |

**Why Residual Connections?**

The `+` operations are **residual (skip) connections**, introduced in ResNet:

$$\text{output} = \text{input} + F(\text{input})$$

Benefits:
- **Gradient flow**: Gradients can flow directly through the skip connection, preventing vanishing gradients in deep networks
- **Identity mapping**: If $F$ outputs zeros, the layer becomes an identity function, making optimization easier
- **Incremental learning**: Each sub-layer learns to add "refinements" to the representation

**Pre-LN vs Post-LN:**

| Aspect | Pre-LN (GPT-2) | Post-LN (Original Transformer) |
|--------|----------------|-------------------------------|
| Formula | $x + \text{SubLayer}(\text{LN}(x))$ | $\text{LN}(x + \text{SubLayer}(x))$ |
| Training | More stable | Can be unstable without warmup |
| Output scale | Grows with depth | Normalized at each layer |

**GPT-2 Stack:**
- 124M: 12 blocks
- 355M: 24 blocks  
- 774M: 36 blocks
- 1558M: 48 blocks

### GPT-2 Forward Pass

The `gpt2()` function is the complete forward pass of the model. It takes token IDs as input and outputs logits (unnormalized probabilities) over the vocabulary for the next token at each position.

**Definition:**

$$\text{GPT-2}(\mathbf{x}) = \text{LayerNorm}(\text{Blocks}(\mathbf{E}_{token} + \mathbf{E}_{pos})) \cdot \mathbf{W}_E^T$$

**Step-by-step:**

1. **Token Embedding**: Look up each input token in the embedding matrix
   $$\mathbf{h}_0^{(token)} = \mathbf{W}_E[\text{input\_ids}] \in \mathbb{R}^{n_{seq} \times d_{model}}$$

2. **Positional Embedding**: Add position information
   $$\mathbf{h}_0 = \mathbf{h}_0^{(token)} + \mathbf{W}_P[0, 1, ..., n_{seq}-1]$$

3. **Transformer Blocks**: Pass through $L$ transformer blocks
   $$\mathbf{h}_l = \text{TransformerBlock}_l(\mathbf{h}_{l-1}) \quad \text{for } l = 1, ..., L$$

4. **Final Layer Norm**: Normalize the output
   $$\mathbf{h}_{out} = \text{LayerNorm}(\mathbf{h}_L)$$

5. **Project to Vocabulary**: Compute logits using the **tied** embedding matrix
   $$\text{logits} = \mathbf{h}_{out} \cdot \mathbf{W}_E^T \in \mathbb{R}^{n_{seq} \times n_{vocab}}$$

**Architecture Overview:**
```
Input token IDs [n_seq]
    ↓
Token Embedding (wte): lookup table [n_vocab, d_model]
    +
Position Embedding (wpe): lookup table [n_ctx, d_model]
    ↓
Hidden states [n_seq, d_model]
    ↓
┌─────────────────────────────────┐
│   Transformer Block 0           │
│   (ln_1 → MHA → ln_2 → FFN)     │
└─────────────────────────────────┘
    ↓
    ... (repeat n_layer times)
    ↓
┌─────────────────────────────────┐
│   Transformer Block L-1         │
└─────────────────────────────────┘
    ↓
Final LayerNorm (ln_f)
    ↓
Matmul with wte.T (weight tying)
    ↓
Logits [n_seq, n_vocab]
```

**Key Concepts:**

| Concept | Explanation |
|---------|-------------|
| **Token Embedding** (`wte`) | Learned lookup table mapping each token ID to a dense vector |
| **Positional Embedding** (`wpe`) | Learned vectors encoding position information (0 to n_ctx-1) |
| **Weight Tying** | Output projection reuses `wte.T` instead of separate weights, reducing parameters |
| **Final LayerNorm** (`ln_f`) | Applied after all blocks, before projecting to vocabulary |

**Weight Tying Insight:**

GPT-2 uses **weight tying** — the same embedding matrix $\mathbf{W}_E$ is used for:
- **Input**: Converting token IDs → vectors (row lookup)
- **Output**: Converting vectors → logits (matrix multiply with transpose)

This reduces parameters and creates a meaningful output space where similar tokens have similar logits.

**GPT-2 Model Sizes:**

| Model | $n_{layer}$ | $d_{model}$ | $n_{head}$ | $n_{ctx}$ | $n_{vocab}$ |
|-------|-------------|-------------|------------|-----------|-------------|
| 124M  | 12          | 768         | 12         | 1024      | 50257       |
| 355M  | 24          | 1024        | 16         | 1024      | 50257       |
| 774M  | 36          | 1280        | 20         | 1024      | 50257       |
| 1558M | 48          | 1600        | 25         | 1024      | 50257       |

In [6]:
def transformer_block(x, mlp, attn, ln_1, ln_2, n_head):  # [n_seq, n_embd] -> [n_seq, n_embd]
    """A single transformer block."""
    # multi-head causal self attention
    x = x + mha(layer_norm(x, **ln_1), **attn, n_head=n_head)  # [n_seq, n_embd] -> [n_seq, n_embd]

    # position-wise feed forward network
    x = x + ffn(layer_norm(x, **ln_2), **mlp)  # [n_seq, n_embd] -> [n_seq, n_embd]

    return x


def gpt2(inputs, wte, wpe, blocks, ln_f, n_head):  # [n_seq] -> [n_seq, n_vocab]
    """GPT-2 forward pass."""
    # token + positional embeddings
    x = wte[inputs] + wpe[range(len(inputs))]  # [n_seq] -> [n_seq, n_embd]

    # forward pass through n_layer transformer blocks
    for block in blocks:
        x = transformer_block(x, **block, n_head=n_head)  # [n_seq, n_embd] -> [n_seq, n_embd]

    # projection to vocab
    x = layer_norm(x, **ln_f)  # [n_seq, n_embd] -> [n_seq, n_embd]
    return x @ wte.T  # [n_seq, n_embd] -> [n_seq, n_vocab]

## 5. Text Generation Function

### Autoregressive Generation

The `generate()` function implements **autoregressive text generation** — the model predicts one token at a time, then feeds that prediction back as input to predict the next token.

**Definition:**

Given input tokens $\mathbf{x} = [x_1, x_2, ..., x_n]$, generate $T$ new tokens:

$$x_{n+t} = \arg\max_{v \in \mathcal{V}} P(v \mid x_1, ..., x_{n+t-1}) \quad \text{for } t = 1, ..., T$$

where $\mathcal{V}$ is the vocabulary and the probability comes from softmax over logits.

**Step-by-step Algorithm:**

```
Input: prompt tokens [x₁, x₂, ..., xₙ], number of tokens T
Output: generated tokens [xₙ₊₁, xₙ₊₂, ..., xₙ₊ₜ]

for t = 1 to T:
    1. Forward pass: logits = GPT2([x₁, ..., xₙ₊ₜ₋₁])
    2. Get last position: last_logits = logits[-1]  # [n_vocab]
    3. Select next token: xₙ₊ₜ = argmax(last_logits)
    4. Append to sequence: [x₁, ..., xₙ₊ₜ₋₁] → [x₁, ..., xₙ₊ₜ]

return [xₙ₊₁, ..., xₙ₊ₜ]
```

**Visual Representation:**

```
Step 1: "The cat" → GPT-2 → logits → argmax → "sat"
Step 2: "The cat sat" → GPT-2 → logits → argmax → "on"
Step 3: "The cat sat on" → GPT-2 → logits → argmax → "the"
...
```

**Why Only Use `logits[-1]`?**

GPT-2 outputs logits for **every position** in the sequence $[n_{seq}, n_{vocab}]$. But for generation, we only care about predicting what comes **after** the last token, so we take `logits[-1]`.

**Decoding Strategies:**

| Strategy | Formula | Properties |
|----------|---------|------------|
| **Greedy** (used here) | $x = \arg\max(logits)$ | Deterministic, fast, can be repetitive |
| **Temperature sampling** | $x \sim \text{softmax}(logits / \tau)$ | $\tau < 1$: sharper, $\tau > 1$: more random |
| **Top-k sampling** | Sample from top $k$ tokens | Limits to most likely options |
| **Top-p (nucleus)** | Sample from smallest set with cumulative $p$ | Adaptive vocabulary size |

**Greedy Decoding:**

This implementation uses **greedy decoding** — always selecting the most probable next token:

$$x_{next} = \arg\max_{v} \text{logits}[v]$$

Pros:
- Simple and fast
- Deterministic (same input → same output)

Cons:
- Can get stuck in repetitive loops
- Misses potentially better sequences (no exploration)
- Not globally optimal (local decisions can lead to suboptimal text)

**Computational Note:**

This naive implementation recomputes the entire sequence at each step. In practice, **KV caching** stores intermediate attention states to avoid redundant computation, making generation $O(n)$ instead of $O(n^2)$ per token.

In [7]:
def generate(inputs, params, n_head, n_tokens_to_generate):
    """Generate tokens autoregressively using greedy decoding."""
    for _ in tqdm(range(n_tokens_to_generate), desc="Generating"):
        logits = gpt2(inputs, **params, n_head=n_head)  # model forward pass
        next_id = np.argmax(logits[-1])  # greedy sampling
        inputs.append(int(next_id))  # append prediction to input

    return inputs[len(inputs) - n_tokens_to_generate:]  # only return generated ids

## 6. Load Model Weights and Tokenizer

We'll load the pre-trained GPT-2 model. The model will be downloaded automatically if not present.

Available model sizes:
- `124M` - Small (default)
- `355M` - Medium
- `774M` - Large
- `1558M` - XL

In [8]:
# Configuration
MODEL_SIZE = "124M"  # Choose from: "124M", "355M", "774M", "1558M"
MODELS_DIR = "models"  # Directory to store downloaded models

# Load encoder (tokenizer), hyperparameters, and model parameters
print(f"Loading GPT-2 {MODEL_SIZE} model...")
encoder, hparams, params = load_encoder_hparams_and_params(MODEL_SIZE, MODELS_DIR)
print("Model loaded successfully!")

Loading GPT-2 124M model...


Fetching checkpoint: 1.00kb [00:00, 2.98Mb/s]                                                       
Fetching encoder.json: 1.04Mb [00:00, 2.60Mb/s]                                                     
Fetching hparams.json: 1.00kb [00:00, 4.52Mb/s]                                                     
Fetching model.ckpt.data-00000-of-00001: 498Mb [00:30, 16.3Mb/s]                                    
Fetching model.ckpt.index: 6.00kb [00:00, 7.34Mb/s]                                                 
Fetching model.ckpt.meta: 472kb [00:00, 1.67Mb/s]                                                   
Fetching vocab.bpe: 457kb [00:00, 1.56Mb/s]                                                         


Model loaded successfully!


In [9]:
# Display model hyperparameters
print("Model Hyperparameters:")
print(f"  - Number of layers (n_layer): {hparams['n_layer']}")
print(f"  - Number of attention heads (n_head): {hparams['n_head']}")
print(f"  - Embedding dimension (n_embd): {hparams['n_embd']}")
print(f"  - Vocabulary size (n_vocab): {hparams['n_vocab']}")
print(f"  - Context length (n_ctx): {hparams['n_ctx']}")

Model Hyperparameters:
  - Number of layers (n_layer): 12
  - Number of attention heads (n_head): 12
  - Embedding dimension (n_embd): 768
  - Vocabulary size (n_vocab): 50257
  - Context length (n_ctx): 1024


## 7. Run Inference

Now let's generate some text! You can modify the prompt and the number of tokens to generate.

In [10]:
# Input prompt
prompt = "Alan Turing theorized that computers would one day become"

# Number of tokens to generate
n_tokens_to_generate = 40

print(f"Prompt: {prompt}")
print(f"Generating {n_tokens_to_generate} tokens...")
print()

Prompt: Alan Turing theorized that computers would one day become
Generating 40 tokens...



In [11]:
# Encode the input prompt
input_ids = encoder.encode(prompt)
print(f"Input token IDs: {input_ids}")
print(f"Number of input tokens: {len(input_ids)}")

# Make sure we don't exceed the context length
assert len(input_ids) + n_tokens_to_generate < hparams["n_ctx"], \
    f"Total tokens ({len(input_ids) + n_tokens_to_generate}) exceeds context length ({hparams['n_ctx']})"

Input token IDs: [36235, 39141, 18765, 1143, 326, 9061, 561, 530, 1110, 1716]
Number of input tokens: 10


In [12]:
# Generate output tokens
output_ids = generate(input_ids, params, hparams["n_head"], n_tokens_to_generate)

# Decode the generated tokens back to text
output_text = encoder.decode(output_ids)

print("\n" + "="*50)
print("Generated Text:")
print("="*50)
print(f"{prompt}{output_text}")

Generating: 100%|██████████| 40/40 [01:14<00:00,  1.85s/it]


Generated Text:
Alan Turing theorized that computers would one day become the most powerful machines on the planet.

The computer is a machine that can perform complex calculations, and it can perform these calculations in a way that is very similar to the human brain.






## 8. Interactive Generation

Try different prompts below!

In [13]:
def generate_text(prompt, n_tokens=40):
    """Helper function to generate text from a prompt."""
    input_ids = encoder.encode(prompt)

    if len(input_ids) + n_tokens >= hparams["n_ctx"]:
        print(f"Warning: Reducing tokens to fit context length")
        n_tokens = hparams["n_ctx"] - len(input_ids) - 1

    output_ids = generate(input_ids, params, hparams["n_head"], n_tokens)
    output_text = encoder.decode(output_ids)

    return prompt + output_text

In [14]:
# Try your own prompts!
my_prompt = "The future of artificial intelligence is"
result = generate_text(my_prompt, n_tokens=50)
print(result)

Generating: 100%|██████████| 50/50 [01:29<00:00,  1.80s/it]

The future of artificial intelligence is uncertain.

"We're not sure what the future will look like," said Dr. Michael S. Schoenfeld, a professor of computer science at the University of California, Berkeley. "But we're not sure what the future will look





In [15]:
# Another example
my_prompt = "In a world where robots"
result = generate_text(my_prompt, n_tokens=50)
print(result)

Generating: 100%|██████████| 50/50 [01:27<00:00,  1.76s/it]

In a world where robots are becoming more and more commonplace, it's important to remember that robots are not just a threat to humanity, but also to the planet.

The robots that are currently in use are the ones that are currently being used to make the most of





## 9. Understanding the Model Architecture

Let's explore the model's parameters to better understand its structure.

In [16]:
# Explore the parameter structure
print("Top-level parameters:")
for key in params.keys():
    if key != 'blocks':
        if isinstance(params[key], dict):
            print(f"  {key}: {list(params[key].keys())}")
        else:
            print(f"  {key}: shape = {params[key].shape}")

print(f"\nNumber of transformer blocks: {len(params['blocks'])}")

Top-level parameters:
  ln_f: ['b', 'g']
  wpe: shape = (1024, 768)
  wte: shape = (50257, 768)

Number of transformer blocks: 12


In [18]:
# Explore a single transformer block
block = params['blocks'][0]
print("Structure of a transformer block:")
for key, value in block.items():
    if isinstance(value, dict):
        print(f"  {key}:")
        for k, v in value.items():
            # Check if v is also a dictionary, if so, iterate further
            if isinstance(v, dict):
                print(f"    {k}:")
                for sub_k, sub_v in v.items():
                    if isinstance(sub_v, np.ndarray):
                        print(f"      {sub_k}: shape = {sub_v.shape}")
                    else:
                        print(f"      {sub_k}: {type(sub_v)}")
            elif isinstance(v, np.ndarray): # if v is a numpy array directly
                print(f"    {k}: shape = {v.shape}")
            else: # Fallback for unexpected types for v
                print(f"    {k}: {type(v)}")
    elif isinstance(value, np.ndarray): # if value is a numpy array directly
        print(f"  {key}: shape = {value.shape}")
    else: # Fallback for unexpected types for value
        print(f"  {key}: {type(value)}")

Structure of a transformer block:
  attn:
    c_attn:
      b: shape = (2304,)
      w: shape = (768, 2304)
    c_proj:
      b: shape = (768,)
      w: shape = (768, 768)
  ln_1:
    b: shape = (768,)
    g: shape = (768,)
  ln_2:
    b: shape = (768,)
    g: shape = (768,)
  mlp:
    c_fc:
      b: shape = (3072,)
      w: shape = (768, 3072)
    c_proj:
      b: shape = (768,)
      w: shape = (3072, 768)


In [19]:
# Calculate total number of parameters
def count_params(d):
    total = 0
    for key, value in d.items():
        if isinstance(value, dict):
            total += count_params(value)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    total += count_params(item)
        elif isinstance(value, np.ndarray):
            total += value.size
    return total

total_params = count_params(params)
print(f"Total number of parameters: {total_params:,}")
print(f"Approximately: {total_params / 1e6:.1f}M parameters")

Total number of parameters: 124,439,808
Approximately: 124.4M parameters


## 10. Tokenizer Exploration

Let's see how the BPE tokenizer works.

In [20]:
# Encode and decode examples
test_texts = [
    "Hello, world!",
    "GPT-2 is a large language model.",
    "The quick brown fox jumps over the lazy dog.",
]

for text in test_texts:
    tokens = encoder.encode(text)
    decoded = encoder.decode(tokens)
    print(f"Original: '{text}'")
    print(f"Tokens: {tokens}")
    print(f"Decoded: '{decoded}'")
    print(f"Number of tokens: {len(tokens)}")
    print()

Original: 'Hello, world!'
Tokens: [15496, 11, 995, 0]
Decoded: 'Hello, world!'
Number of tokens: 4

Original: 'GPT-2 is a large language model.'
Tokens: [38, 11571, 12, 17, 318, 257, 1588, 3303, 2746, 13]
Decoded: 'GPT-2 is a large language model.'
Number of tokens: 10

Original: 'The quick brown fox jumps over the lazy dog.'
Tokens: [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]
Decoded: 'The quick brown fox jumps over the lazy dog.'
Number of tokens: 10

