# Building Transformer Blocks

This notebook is an interactive guide to the transformer block, the repeating building unit of a GPT model.


In [None]:
# Import necessary libraries
import torch
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

# Make the project's packages importable from this notebook.
# abspath('') resolves to the current working directory; the project root is
# taken to be two directory levels above it.
notebook_dir = os.path.abspath('')
project_root = os.path.dirname(os.path.dirname(notebook_dir))
if sys.path.count(project_root) == 0:
    # Prepend so the local checkout is searched before other entries
    sys.path.insert(0, project_root)

# Import our transformer block components
from src.model.blocks import LayerNorm, GELU, FeedForward
from src.model.attention import MultiHeadAttention
from src.model.gpt import TransformerBlock
from src.config import GPTConfig

## Building Transformer Blocks

This notebook explores the building blocks of a transformer: Layer Normalization, GELU activation, Feed-Forward networks, and how they combine into a Transformer Block.

### 1. Layer Normalization

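Layer normalization stabilizes training by normalizing each token's activations across the feature (embedding) dimension, so that every position has roughly zero mean and unit variance before the learnable scale and shift are applied.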

In [None]:
# Seed the RNG so the numbers printed in this notebook are reproducible
# under Restart Kernel -> Run All.
torch.manual_seed(0)

# Create a LayerNorm module
embedding_dim = 64
layer_norm = LayerNorm(embedding_dim)

# Test with random input of shape (batch, sequence, features)
batch_size = 2
seq_len = 10
x = torch.randn(batch_size, seq_len, embedding_dim)

# Reducing over dim=-1 yields one statistic per token (per position), not per
# sequence. Only the first five positions of the first batch item are shown.
# unbiased=False selects the population std (divide by N).
# NOTE(review): assumes LayerNorm normalizes with the biased variance, as
# torch.nn.LayerNorm does - confirm against src.model.blocks. With the default
# Bessel-corrected std the "close to 1" check below would read ~1.008 instead.
print(f"Input shape: {x.shape}")
print(f"Input mean (per token): {x.mean(dim=-1)[0, :5]}")
print(f"Input std (per token): {x.std(dim=-1, unbiased=False)[0, :5]}")

# Apply layer norm; no gradients are needed for this inspection, and disabling
# them keeps grad_fn annotations out of the printed tensors.
with torch.no_grad():
    normalized = layer_norm(x)

token_means = normalized.mean(dim=-1)
token_stds = normalized.std(dim=-1, unbiased=False)

print(f"\nAfter LayerNorm:")
print(f"Output shape: {normalized.shape}")
print(f"Output mean (per token): {token_means[0, :5]}")
print(f"Output std (per token): {token_stds[0, :5]}")

# Verify normalization (should be close to 0 mean, 1 std), averaged over all tokens
print(f"\nVerification:")
print(f"Mean close to 0: {token_means.abs().mean().item():.6f}")
print(f"Std close to 1: {token_stds.mean().item():.6f}")

In [None]:
# Plot the first two features across sequence positions, before and after LayerNorm
x = torch.randn(1, 20, embedding_dim)
normalized = layer_norm(x)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One (axis, tensor, title) triple per panel: left = raw input, right = normalized
panels = (
    (axes[0], x, 'Before LayerNorm'),
    (axes[1], normalized, 'After LayerNorm'),
)
for ax, values, panel_title in panels:
    for feature_idx in (0, 1):
        # detach() drops the autograd graph so the tensor can be converted to NumPy
        series = values[0, :, feature_idx].detach().numpy()
        ax.plot(series, label=f'Feature {feature_idx}', alpha=0.7)
    ax.set_title(panel_title)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### 2. GELU Activation Function

GELU (Gaussian Error Linear Unit) is a smooth activation function used in GPT models.

In [None]:
# Instantiate the activation under study
gelu = GELU()

# Evaluate GELU and ReLU on the same evenly spaced grid over [-4, 4]
x = torch.linspace(-4, 4, 100)
gelu_output = gelu(x)
relu_output = torch.relu(x)

# Overlay both curves on a single set of axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x.numpy(), gelu_output.numpy(), label='GELU', linewidth=2)
ax.plot(x.numpy(), relu_output.numpy(), label='ReLU', linewidth=2, linestyle='--')
ax.set_xlabel('Input', fontsize=12)
ax.set_ylabel('Output', fontsize=12)
ax.set_title('GELU vs ReLU Activation Functions', fontsize=14, fontweight='bold')
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
# Thin reference lines through the origin
ax.axhline(0, color='black', linewidth=0.5)
ax.axvline(0, color='black', linewidth=0.5)
fig.tight_layout()
plt.show()

# Summarize the takeaways under the figure
print("Key differences:")
for takeaway in (
    "- GELU is smooth (differentiable everywhere)",
    "- GELU allows small negative values through",
    "- GELU is used in GPT models",
):
    print(takeaway)

### 3. Feed-Forward Network

The feed-forward network applies two linear transformations with GELU activation in between.

In [None]:
# Build the feed-forward network for the embedding width used in this notebook
embedding_dim = 64
ffn = FeedForward(embedding_dim)

# Tally every parameter tensor's element count
total_params = 0
for weight in ffn.parameters():
    total_params += weight.numel()

hidden_dim = 4 * embedding_dim
print(f"FeedForward network:")
print(f"  Embedding dimension: {embedding_dim}")
print(f"  Hidden dimension: {hidden_dim} (4x expansion)")
print(f"  Total parameters: {total_params:,}")

# Forward pass on a random (batch, sequence, features) input
batch_size = 2
seq_len = 10
x = torch.randn(batch_size, seq_len, embedding_dim)
print(f"\nInput shape: {x.shape}")

output = ffn(x)
shape_preserved = x.shape == output.shape
print(f"Output shape: {output.shape}")
print(f"Shape preserved: {shape_preserved}")

In [None]:
# Trace a single token through the feed-forward network, stage by stage
x = torch.randn(1, 1, embedding_dim)
output = ffn(x)

# Apply the first three entries of ffn.net one after another; gradients are
# not needed for inspection.
with torch.no_grad():
    hidden = ffn.net[0](x)           # first linear layer
    activated = ffn.net[1](hidden)   # GELU
    final = ffn.net[2](activated)    # final output

# Report shape and summary statistics for each stage
stage_report = (
    ("After first linear", hidden),
    ("After GELU", activated),
    ("Final output", final),
)
for stage_label, stage_tensor in stage_report:
    print(f"{stage_label}: {stage_tensor.shape}")
    print(f"  Mean: {stage_tensor.mean().item():.4f}, Std: {stage_tensor.std().item():.4f}")

### 4. Multi-Head Attention

We've seen attention before, but let's examine it in the context of transformer blocks.

In [None]:
# Attention hyperparameters for this demo
context_length = 32
num_heads = 4

# Collect the constructor arguments in one place, then build the layer
attention_kwargs = dict(
    input_dimension=embedding_dim,
    output_dimension=embedding_dim,
    context_length=context_length,
    dropout=0.0,
    number_of_heads=num_heads,
)
attention = MultiHeadAttention(**attention_kwargs)

head_dim = embedding_dim // num_heads
summary_lines = [
    f"Multi-Head Attention:",
    f"  Input/Output dimension: {embedding_dim}",
    f"  Number of heads: {num_heads}",
    f"  Head dimension: {head_dim}",
    f"  Context length: {context_length}",
]
print("\n".join(summary_lines))

# Forward pass on a random input; batch_size and seq_len come from an earlier cell
x = torch.randn(batch_size, seq_len, embedding_dim)
attn_output = attention(x)

shape_preserved = x.shape == attn_output.shape
print(f"\nInput shape: {x.shape}")
print(f"Output shape: {attn_output.shape}")
print(f"Shape preserved: {shape_preserved}")

### 5. Complete Transformer Block

A transformer block combines attention and feed-forward with residual connections and layer normalization.

In [None]:
# Configuration for a single block, reusing the dimensions chosen in earlier cells
block_settings = {
    "vocab_size": 50257,
    "context_length": context_length,
    "embedding_dimension": embedding_dim,
    "number_of_heads": num_heads,
    "number_of_layers": 1,  # Just for config
    "dropout_rate": 0.0,
}
config = GPTConfig(**block_settings)

transformer_block = TransformerBlock(config)

# List what the block is made of
print("Transformer Block components:")
for component in (
    "Multi-Head Attention",
    "LayerNorm (x2)",
    "Feed-Forward Network",
    "Residual connections (x2)",
    "Dropout (disabled for this example)",
):
    print(f"  - {component}")

# Tally the element count of every parameter tensor in the block
total_params = 0
for weight in transformer_block.parameters():
    total_params += weight.numel()
print(f"\nTotal parameters: {total_params:,}")

In [None]:
# Push a random batch through the transformer block and compare statistics
x = torch.randn(batch_size, seq_len, embedding_dim)
input_mean = x.mean().item()
input_std = x.std().item()
print(f"Input shape: {x.shape}")
print(f"Input mean: {input_mean:.4f}, std: {input_std:.4f}")

output = transformer_block(x)

output_mean = output.mean().item()
output_std = output.std().item()
shape_preserved = x.shape == output.shape
print(f"\nOutput shape: {output.shape}")
print(f"Output mean: {output_mean:.4f}, std: {output_std:.4f}")
print(f"Shape preserved: {shape_preserved}")

# Sanity check: the block changed its input. This shows the output differs
# from the input; it does not by itself isolate the residual path.
output_changed = not torch.allclose(x, output)
print(f"\nOutput is different from input (residuals added): {output_changed}")

## Understanding Residual Connections

Residual connections help with gradient flow and allow the model to learn identity mappings when needed.

In [None]:
# Demonstrate residual connection behavior
x = torch.randn(1, 5, embedding_dim)

# Keep an untouched copy of the block input; it is the reference the
# transformation is measured against below.
residual1 = x.clone()

# Run THIS cell's input through the block. Reusing `output` from the earlier
# forward-pass cell would pair a (1, 5, dim) input with an output of shape
# (batch_size, seq_len, dim), and the subtraction below would fail because the
# sequence dimensions cannot be broadcast against each other.
# Gradients are not needed for this inspection.
with torch.no_grad():
    output = transformer_block(x)

# Conceptual flow of the block (the actual implementation lives in TransformerBlock)
print("Transformer Block Flow:")
print(f"1. Input: {x.shape}")
print(f"2. LayerNorm → Attention → Add residual")
print(f"3. LayerNorm → FeedForward → Add residual")
print(f"4. Output: {output.shape}")

# Show that output = input + transformation: whatever the block added on top
# of the skip path is the difference between its output and its input.
transformation = output - residual1
print(f"\nTransformation magnitude: {transformation.norm().item():.4f}")
print(f"Input magnitude: {x.norm().item():.4f}")
print(f"Output magnitude: {output.norm().item():.4f}")

## Stacking Multiple Blocks

Let's see how multiple transformer blocks process information sequentially.

In [None]:
# Configuration shared by every block in the stack
num_layers = 3
config = GPTConfig(
    vocab_size=50257,
    context_length=context_length,
    embedding_dimension=embedding_dim,
    number_of_heads=num_heads,
    number_of_layers=num_layers,
    dropout_rate=0.0,
)

# Instantiate the blocks one by one so each intermediate output can be captured
blocks = []
for layer_index in range(num_layers):
    blocks.append(TransformerBlock(config))

x = torch.randn(1, seq_len, embedding_dim)
print(f"Initial input shape: {x.shape}")
print(f"Initial input norm: {x.norm().item():.4f}\n")

# Feed the running activation through each block in turn; entry 0 of
# intermediate_outputs is the raw input, entry k the output of block k.
intermediate_outputs = [x]
for block_number, block in enumerate(blocks, start=1):
    x = block(x)
    intermediate_outputs.append(x)
    block_norm = x.norm().item()
    block_mean = x.mean().item()
    print(f"After block {block_number}: norm = {block_norm:.4f}, mean = {block_mean:.4f}")

print(f"\nFinal output shape: {x.shape}")

In [None]:
# Plot norm, mean and std of the activations after each layer (layer 0 = input)
if len(intermediate_outputs) > 1:
    # One scalar per recorded activation for each statistic
    norms = [out.norm().item() for out in intermediate_outputs]
    means = [out.mean().item() for out in intermediate_outputs]
    stds = [out.std().item() for out in intermediate_outputs]

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    layers = list(range(len(intermediate_outputs)))

    # (statistic name, values, line styling) for each of the three panels;
    # the first panel keeps matplotlib's default line color.
    panel_specs = (
        ('Norm', norms, {'marker': 'o'}),
        ('Mean', means, {'marker': 's', 'color': 'orange'}),
        ('Std', stds, {'marker': '^', 'color': 'green'}),
    )
    for ax, (stat_name, stat_values, line_style) in zip(axes, panel_specs):
        ax.plot(layers, stat_values, linewidth=2, markersize=8, **line_style)
        ax.set_xlabel('Layer', fontsize=12)
        ax.set_ylabel(stat_name, fontsize=12)
        ax.set_title(f'Output {stat_name} Through Layers', fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3)

    # Zero reference line on the mean panel only
    axes[1].axhline(0, color='black', linewidth=0.5, linestyle='--')

    plt.tight_layout()
    plt.show()