In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

"""
    1. Layer Normalization Implementation
    * A crucial component for stabilizing deep neural networks like Transformers.
    * Purpose: Normalizes input tensors to have zero mean and unit variance along the last dimension (feature dimension). 
    * Stabilizes training and accelerates convergence.
    * Usage: Commonly applied before/after attention and feed-forward layers in Transformers.

    # Inherits from nn.Module, making it a custom PyTorch layer.
    # emb_dim: The dimension of the input embeddings (e.g., 768 in GPT-2).
    # eps: A small constant to prevent division by zero in variance calculations.
    # scale and shift: Learnable parameters (γ and β in the formula).
       * scale (γ) starts as a vector of ones (initially preserves the input scale).
       * shift (β) starts as a vector of zeros (initially preserves the input center).
       
    # x.mean(dim=-1, keepdim=True): Calculates the mean along the last dimension (e.g., embedding dimension).
       * keepdim=True ensures the output has the same number of dimensions (e.g., shape [batch, seq_len, 1]).
    # x.var(dim=-1, keepdim=True, unbiased=False): Computes variance along the last dimension. 
       * unbiased=False uses a biased estimator (divides by n instead of n-1), which is standard in deep learning.

    # norm_x: Standardizes the input to have mean 0 and variance 1. The eps prevents numerical instability.

    # self.scale * norm_x + self.shift: Applies learnable parameters to transform the normalized data. 
      * This allows the model to adaptively adjust the output distribution.

    # Example:
      * Input x: Tensor of shape [batch_size, seq_len, emb_dim].
      * Output: Tensor of the same shape, normalized and transformed.
"""

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine map.

    Each feature vector is shifted to zero mean and scaled to unit (biased)
    variance, then multiplied by ``scale`` and offset by ``shift``.

    Args:
        emb_dim: Size of the last dimension of the tensors to normalize.
        eps: Constant added to the variance before taking the square root.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # gamma starts at one and beta at zero, so the affine map is the
        # identity until training moves it.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Return ``x`` normalized along its last dimension (shape unchanged)."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by n rather than n - 1.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        std = torch.sqrt(variance + self.eps)
        return self.scale * (centered / std) + self.shift

In [2]:
"""
    2. GELU Activation Function
    * Unlike ReLU (which has a sharp corner at 0), GELU is smooth and differentiable everywhere, leading to better gradient flow.

    # No learnable parameters are needed, so the constructor simply calls the parent class initializer.
    # GELU(x) ≈ 0.5x * (1 + tanh(√(2/π) * (x + 0.044715x³))): This implements the approximate GELU formula

    * In PyTorch:
      the def forward(self, x): method is the core function of any custom neural network module (class that inherits from nn.Module). 
      It defines the actual computation that happens when you pass input data through the model.
"""

class GELU(nn.Module):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))``
    element-wise. The module has no learnable parameters.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of ``x``."""
        cubic_term = 0.044715 * torch.pow(x, 3)
        tanh_arg = math.sqrt(2 / math.pi) * (x + cubic_term)
        return 0.5 * x * (1 + torch.tanh(tanh_arg))

In [3]:
"""
    3. FeedForward Network
    
    A. Expansion Layer:
       * nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim'])
       * Increases the dimensionality of each token's representation by a factor of 4.
       * Example: If emb_dim = 768, this layer expands it to 3072 dimensions.
       * Why? Creates a higher-dimensional space where complex patterns can be learned.

    B. Activation Function:
       * GELU(): Applies the Gaussian Error Linear Unit activation.
       * Introduces non-linearity, allowing the network to learn complex functions.
       * Why GELU? Its smooth nature provides better gradients than ReLU for deep networks.

    C. Contraction Layer
       * nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim'])
       * Projects the expanded representation back down to the original dimensionality.
       * Example: From 3072 dimensions back to 768.
       * Why? Maintains consistent input/output dimensions, allowing the block to be stacked.    

    D. Forward Pass
       * Simply passes the input through the sequential layers.
       * Input shape: [batch_size, seq_len, emb_dim]
       * Output shape: [batch_size, seq_len, emb_dim] (same as input)

    E. Example 
       * Expansion: 768 → 3072 dimensions (768×3072 weights + 3072 biases)
       * Contraction: 3072 → 768 dimensions (3072×768 weights + 768 biases)
       * Total parameters in FFN: 768×3072 + 3072 + 3072×768 + 768 ≈ 4.7 million parameters per block
"""

class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand 4x, apply GELU, project back.

    Args:
        cfg: Mapping that provides ``'emb_dim'``, the input and output width.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg['emb_dim']
        hidden_dim = 4 * emb_dim
        # Layers are built in the same order they are applied.
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        contract = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through the three layers; the last dimension stays ``emb_dim``."""
        return self.layers(x)

In [4]:
"""
    4. Multi-Head Attention
"""


# Implementing multi-head attention with weight splits
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with one fused projection per role.

    A single ``nn.Linear`` each produces the queries, keys and values for all
    heads at once. The projections are split into ``num_heads`` slices of
    ``head_dim`` features, attended independently under a causal mask, and
    merged back into one ``d_out``-wide vector per token.

    Args:
        d_in: Feature size of the input tokens.
        d_out: Total feature size across all heads; must be divisible by
            ``num_heads`` (e.g. d_out=8 with num_heads=4 gives 2 per head).
        context_length: Longest sequence the pre-computed causal mask covers.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections include a bias.
    """

    def __init__(self, d_in, d_out,
                 context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()

        # The output width has to split evenly among the heads.
        assert (d_out % num_heads == 0), \
            'd_out must be divisible by num_heads'

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        # One large projection per role is more efficient than one per head.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Mixes the information coming from the different heads.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the (query, key) pairs where
        # the key lies in the query's future.
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape ``[b, num_tokens, d_in]``.

        Returns:
            Tensor of shape ``[b, num_tokens, d_out]``.

        Raises:
            RuntimeError: If ``num_tokens`` exceeds the ``context_length``
                the mask was built for.
        """
        b, num_tokens, _ = x.shape

        # Without this check the sliced mask is smaller than the score matrix.
        # For context_length == 1 it would silently broadcast and leave future
        # tokens visible; otherwise it fails with an opaque broadcasting
        # error. RuntimeError keeps the type of that earlier failure.
        context_length = self.mask.shape[0]
        if num_tokens > context_length:
            raise RuntimeError(
                f'sequence length {num_tokens} exceeds '
                f'context_length {context_length}'
            )

        def split_heads(projected):
            # [b, T, d_out] -> [b, T, h, d_h] -> [b, h, T, d_h]
            # e.g. [2, 6, 8] -> [2, 6, 4, 2] -> [2, 4, 6, 2]
            return projected.view(
                b, num_tokens, self.num_heads, self.head_dim
            ).transpose(1, 2)

        keys = split_heads(self.W_key(x))
        queries = split_heads(self.W_query(x))
        values = split_heads(self.W_value(x))

        # Batched matmul yields the scores of every head at once: [b, h, T, T].
        attn_scores = queries @ keys.transpose(2, 3)

        # Block attention to future tokens. attn_scores is a fresh tensor, so
        # filling it in place is safe.
        mask_bool = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scaled softmax over the key dimension, then dropout on the weights.
        attn_weights = torch.softmax(
            attn_scores / self.head_dim ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Weighted sum of values, transposed back to [b, T, h, d_h].
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Flatten the heads: [b, T, h, d_h] -> [b, T, d_out].
        context_vec = context_vec.contiguous().view(
            b, num_tokens, self.d_out
        )

        return self.out_proj(context_vec)

In [5]:
"""
    5. Transformer Block
      * Transformer Block is the fundamental building block of models like GPT.
      * It combines self-attention and feed-forward layers with normalization and residual connections. 

      A. self.att: Multi-head self-attention mechanism.
        * d_in=d_out=cfg['emb_dim']: Input/output dimensions match (e.g., 768).
        * context_length: Maximum sequence length (for causal masking).
        * num_heads: Number of attention heads (e.g., 12).
        * dropout: Dropout rate for attention weights.
        * qkv_bias: Whether to use bias in query/key/value projections.
        
      B. self.ff: Feed-forward network (expands to 4× dims, then contracts).
      C. self.norm1, self.norm2: Layer normalization applied before attention and FFN (Pre-LN architecture).
      D. self.drop_shortcut: Dropout applied to the output of sub-layers before adding to the shortcut.
      
"""

class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is wrapped as ``x + dropout(sublayer(norm(x)))``.

    Args:
        cfg: Mapping with the keys ``'emb_dim'``, ``'context_length'``,
            ``'n_heads'``, ``'drop_rate'`` and ``'qkv_bias'``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg['emb_dim']
        drop_rate = cfg['drop_rate']

        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg['context_length'],
            dropout=drop_rate,
            num_heads=cfg['n_heads'],
            qkv_bias=cfg['qkv_bias'],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # The same dropout module is reused for both residual branches.
        self.drop_shortcut = nn.Dropout(drop_rate)

    def forward(self, x):
        """Apply both residual sub-layers; the output has the shape of ``x``."""
        # Attention branch added back onto its own input.
        attn_branch = self.drop_shortcut(self.att(self.norm1(x)))
        x = attn_branch + x

        # Feed-forward branch added back onto the attention result.
        ff_branch = self.drop_shortcut(self.ff(self.norm2(x)))
        return ff_branch + x

In [6]:
"""
    6. GPT Model Architecture
    * This code defines the complete GPT model architecture, which is the core of generative language models like GPT-2/GPT-3.

    A. Initialization (__init__)
      * tok_emb: Token embedding layer. Converts token IDs (integers) to dense vectors of size emb_dim.
      * pos_emb: Position embedding layer.
        Adds information about token positions (learnable embeddings for each position up to context_length).
      * drop_emb: Dropout applied to the combined embeddings for regularization.
      * trf_blocks: A stack of identical TransformerBlock layers (e.g., 12 layers for GPT-2 Small). 
        This is the core processing unit.
      * final_norm: Final layer normalization for stability.
      * out_head: Linear layer that projects final hidden states to vocabulary-sized logits (scores for each token).

    B. Forward Pass (forward)
      Step 1: Token Embeddings: 
        * Converts input token IDs (e.g., [batch_size, seq_len]) to dense embeddings.
        
      Step 2: Position Embeddings: 
        * Generates position embeddings for each position in the sequence (0 to seq_len-1).
        * These are added to token embeddings to give the model information about word order.
        
      Step 3: Combine and Regularize:
        * The combination allows each token representation to encode both semantic and positional information.
        
      Step 4: Transformer Blocks:
        * The input passes through multiple TransformerBlock layers (e.g., 12).
        * Each block applies:
          - Multi-head self-attention (contextualization)
          - Feed-forward network (feature transformation)
          - Residual connections and normalization
          
      Step 5: Final Output:
         * Final normalization: Stabilizes the outputs.
         * Output projection: Converts final hidden states to logits (scores) for each token in the vocabulary.

      * Input/Output
        - Input: Integer tensor of token IDs, shape [batch_size, seq_len].
        - Output: Logits tensor, shape [batch_size, seq_len, vocab_size].
             * For each position in the sequence, returns scores for all possible next tokens.

      * Stacked Transformers
         - Multiple blocks process the sequence iteratively:
           - Early layers: Capture local patterns and syntax.
           - Middle layers: Build semantic understanding.
           - Later layers: Develop complex reasoning.         

"""

class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, a stack of transformer blocks, LM head.

    Args:
        cfg: Mapping with the keys ``'vocab_size'``, ``'context_length'``,
            ``'emb_dim'``, ``'n_layers'`` and ``'drop_rate'``, plus whatever
            ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg['emb_dim']
        vocab_size = cfg['vocab_size']

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg['context_length'], emb_dim)
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        # Separate projection to vocabulary logits; not tied to tok_emb.
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids ``[batch_size, seq_len]`` to logits over the vocabulary.

        Returns:
            Tensor of shape ``[batch_size, seq_len, vocab_size]``.
        """
        _, seq_len = in_idx.shape

        # Positions 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)

        # The position embeddings broadcast over the batch dimension.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)

        hidden = self.trf_blocks(hidden)

        return self.out_head(self.final_norm(hidden))

In [7]:
"""
    7. Model Configuration
    
    * This code defines a configuration dictionary for the GPT-2 Small model (124 million parameters).
    * Each parameter controls a specific aspect of the model's architecture and behavior:
    
    * 'vocab_size': 50257: 
      - Meaning: The number of unique tokens in the model's vocabulary.
      - Purpose: Determines the size of the token embedding layer and the output projection.
      - Note: 50,257 is the vocabulary size used by GPT-2's Byte Pair Encoding (BPE) tokenizer.

    * 'context_length': 1024
      - Meaning: The maximum sequence length (in tokens) the model can process.
      - Purpose: Defines the maximum input length for positional embeddings and the causal attention mask.
      - Implication: Inputs longer than 1024 tokens must be truncated or chunked.

    * 'emb_dim': 768
      - Meaning: The dimensionality of token and position embeddings.
      - Purpose: Controls the size of all hidden representations throughout the model.
      - Calculation: Affects the size of all linear layers and the attention mechanism.

    * 'n_heads': 12
      - Meaning: The number of parallel attention heads.
      - Purpose: Allows the model to focus on different types of linguistic patterns simultaneously.
      - Constraint: Must evenly divide emb_dim (768 ÷ 12 = 64 dimensions per head).

    * 'n_layers': 12
      - Meaning: The number of transformer blocks stacked sequentially.
      - Purpose: Determines the depth of the model. Each layer refines the representations further.
      - Note: More layers generally increase capacity but also computational cost.

    * 'drop_rate': 0.1
      - Meaning: The dropout probability (10% of activations are randomly zeroed during training).
      - Purpose: Regularization to prevent overfitting.
      - Applied: To the attention weights, to each sub-layer's output before the residual addition, and to the summed embeddings.

    * 'qkv_bias': False
      - Meaning: Whether to include bias terms in the Query, Key, and Value linear projections.
      - Purpose: Small optimization that reduces parameters slightly (3 × 768 biases omitted per transformer block).
      - Note: The released GPT-2 weights do include these biases; set this to True when loading pretrained GPT-2 weights.

    * Architectural Implications:
      - Token Embeddings: 50257 × 768 parameters
      - Position Embeddings: 1024 × 768 parameters
      - Feed-Forward Expansion: 768 × 4 = 3072 hidden dimensions in FFN
      - Attention Head Dimension: 768 ÷ 12 = 64 dimensions per head
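      - Total Parameters: ~124 million when out_head shares its weights with tok_emb (weight tying); this implementation keeps a separate out_head (50257 × 768 extra weights), so it reports ~163 million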
"""

# Hyperparameters read by GPTModel and its sub-modules.
GPT_CONFIG_124M = dict(
    vocab_size=50257,       # number of token ids
    context_length=1024,    # maximum sequence length
    emb_dim=768,            # width of every hidden representation
    n_heads=12,             # attention heads per block
    n_layers=12,            # number of transformer blocks
    drop_rate=0.1,          # dropout probability
    qkv_bias=False,         # bias in the query/key/value projections
)

In [8]:
"""
    8. Text Generation Function
    * This code implements a simple greedy autoregressive text generation function for GPT-like models. 
      It generates text one token at a time by always choosing the most likely next token. 

    * model: The trained GPT model.
    * idx: Input tensor of token indices (shape: [batch_size, num_tokens]).
    * max_new_tokens: Number of new tokens to generate.
    * context_size: Maximum context length the model can handle (e.g., 1024).

    A. Loop for Token Generation
     *  Loop for Token Generation:
       - for _ in range(max_new_tokens): 
       - Generates one token per iteration until max_new_tokens are created.
       
     * Context Cropping: 
       - Truncates the input to the last context_size tokens.
       - Why? GPT models have a fixed context window. If the sequence exceeds this, we only use the most recent tokens.
       
     * Model Prediction: Feeds the cropped context through the model to get predictions (logits).
       - logits shape: [batch_size, num_tokens, vocab_size], where num_tokens ≤ context_size
       
     * Focus on Last Token
      - logits = logits[:, -1, :]
      - Extracts the logits for only the last token position.
      - Why? We only care about the next-token prediction.
      - New shape: [batch_size, vocab_size]
      
     * Convert to Probabilities
      - probas = torch.softmax(logits, dim=-1)
      - Applies softmax to convert logits to probabilities.
      - probas shape: [batch_size, vocab_size] (sums to 1 along vocabulary dimension).

     * Greedy Sampling
      - idx_next = torch.argmax(probas, dim=-1, keepdim=True)
      - Selects the token with the highest probability.
      - keepdim=True preserves dimensions: [batch_size, 1] instead of [batch_size].

     * Append New Token
      - idx = torch.cat((idx, idx_next), dim=1)
      - Appends the new token to the existing sequence.
      - The sequence grows by one token each iteration.

     * Return Result
      - return idx
      - Returns the extended sequence containing both original input and generated tokens.
"""

def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping token ids ``[batch, n]`` to logits
            ``[batch, n, vocab]``.
        idx: Tensor of token ids, shape ``[batch, num_tokens]``.
        max_new_tokens: Number of tokens to append.
        context_size: At most this many trailing tokens are fed to the model.

    Returns:
        Tensor of shape ``[batch, num_tokens + max_new_tokens]`` holding the
        input followed by the generated tokens.
    """
    sequence = idx
    for _ in range(max_new_tokens):
        # Only the most recent `context_size` tokens go through the model.
        window = sequence[:, -context_size:]

        with torch.no_grad():
            all_logits = model(window)

        # The last position carries the prediction for the next token.
        next_probs = torch.softmax(all_logits[:, -1, :], dim=-1)

        # Greedy choice; keepdim gives [batch, 1] so it can be concatenated.
        next_token = torch.argmax(next_probs, dim=-1, keepdim=True)

        sequence = torch.cat((sequence, next_token), dim=1)

    return sequence

In [9]:
"""
    9. Parameter Counting and Memory Calculation
    * Calculate the number of parameters in a PyTorch model and estimate its memory usage. 
"""

def count_parameters(model):
    """Return the total number of elements across ``model.parameters()``."""
    return sum(map(torch.numel, model.parameters()))

def calculate_memory_usage(model, dtype_size=4):
    """Estimate the storage needed for a model's parameters.

    Args:
        model: Module whose parameters are counted.
        dtype_size: Bytes assumed per parameter element (default 4).

    Returns:
        Tuple ``(parameter_count, size_in_mebibytes)``.
    """
    bytes_per_mb = 1024 * 1024
    n_params = count_parameters(model)
    size_mb = (n_params * dtype_size) / bytes_per_mb
    return n_params, size_mb

# Example usage: build the model from the config and report its size.
# The count includes both tok_emb and the separate out_head, each holding
# vocab_size x emb_dim weights, because GPTModel does not tie the two.
model = GPTModel(GPT_CONFIG_124M)
total_params, total_size_mb = calculate_memory_usage(model)
print(f"Total parameters: {total_params:,}")
print(f"Model size: {total_size_mb:.2f} MB")

Total parameters: 163,009,536
Model size: 621.83 MB


In [10]:
"""
    10. Testing the Implementation
"""

import tiktoken

# Seed the RNG so the randomly initialised weights, and everything generated
# from them in the cells below, are the same after every kernel restart.
torch.manual_seed(123)

# Initialize model
model = GPTModel(GPT_CONFIG_124M)

tokenizer = tiktoken.get_encoding('gpt2')
txt1 = 'Every effort moves you'
txt2 = 'Every day holds a'

# torch.stack needs equally long rows; both prompts encode to the same number
# of tokens, so no padding is required here.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(txt)) for txt in (txt1, txt2)],
    dim=0,
)

# Forward pass (shape check only, so no autograd graph is needed)
with torch.no_grad():
    logits = model(batch)
print("Output shape:", logits.shape)

# Count parameters and estimate memory usage in a single pass
total_params, total_size_mb = calculate_memory_usage(model)
print(f"Total parameters: {total_params:,}")
print(f"Model size: {total_size_mb:.2f} MB")

Output shape: torch.Size([2, 4, 50257])
Total parameters: 163,009,536
Model size: 621.83 MB


In [11]:
# Encode the prompt and give it a leading batch dimension, because the model
# unpacks its input as (batch_size, seq_len).
start_context = 'Hello, I am'
encoded = tokenizer.encode(start_context)
print('encoded:', encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)    # [num_tokens] -> [1, num_tokens]
print('encoded_tensor.shape:', encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [12]:
model.eval()                  # eval mode: the model's nn.Dropout layers are inactive during generation
out = generate_text_simple(
    model=model,
    idx=encoded_tensor, 
    max_new_tokens=6, 
    context_size=GPT_CONFIG_124M['context_length']
)
print('Output:', out)
# out has shape [1, prompt tokens + max_new_tokens], so out[0] is the full sequence.
print('Output length:', len(out[0]))

Output: tensor([[15496,    11,   314,   716, 15309, 37606, 20783, 24901, 36086, 48535]])
Output length: 10


In [13]:
"""
As we can see, the model generated gibberish, which is not at all like the coherent text Hello, I am a model ready to help.
What happened? The reason the model is unable to produce coherent text is that we haven’t trained it yet. 
So far, we have only implemented the GPT architecture and initialized a GPT model instance with initial random weights.
Model training is a large topic in itself, and we will tackle it in the next chapter.
"""

# Drop the batch dimension, convert the ids to a plain list, and map them back to text.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I amAlexittowalker legends Duo Frankie
