# GPT Architecture Exploration

This notebook explores the mathematical foundations and implementation details of the GPT architecture.

In [None]:
import torch
import torch.nn as nn
import math
import matplotlib.pyplot as plt
import numpy as np

# Set up plotting
plt.rcParams['figure.figsize'] = (10, 6)

# Seed PyTorch's global RNG so the torch.randn / torch.randint examples (and
# dropout) in the cells below give the same numbers on every Restart & Run All
torch.manual_seed(0)

## 1. Mathematical Foundation of Attention

### Scaled Dot-Product Attention

The core of the attention mechanism is:

$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

Where:
- $Q$, $K$, $V$ are the query, key, and value matrices
- $d_k$ is the dimension of keys
- The scaling factor $\frac{1}{\sqrt{d_k}}$ keeps the dot products from growing with $d_k$; without it, large scores would push softmax toward a near one-hot output with vanishing gradients

In [None]:
def scaled_dot_product_attention(Q, K, V, d_k=None, mask=None):
    '''Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q: query tensor whose last two dims are (seq_q, d_k).
        K: key tensor whose last two dims are (seq_k, d_k).
        V: value tensor whose last two dims are (seq_k, d_v).
        d_k: key dimension used for the 1/sqrt(d_k) scaling. When omitted
            it is read from the last dimension of Q.
        mask: optional tensor broadcastable to the (seq_q, seq_k) score
            matrix. Entries equal to 0/False are blocked (their score is set
            to -inf before the softmax); any other entry is kept. A query row
            whose keys are all blocked yields NaN weights.

    Returns:
        Tuple (output, attention_weights): the weighted sum of V and the
        softmax weights over the key dimension.
    '''
    if d_k is None:
        d_k = Q.size(-1)

    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # -inf scores become exactly zero weight after the softmax
        scores = scores.masked_fill(mask == 0, float('-inf'))

    attention_weights = torch.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, V)
    return output, attention_weights

# Example usage: three independent random tensors sharing one shape
batch_size, seq_len, d_k = 2, 4, 8
Q, K, V = (torch.randn(batch_size, seq_len, d_k) for _ in range(3))

output, weights = scaled_dot_product_attention(Q, K, V, d_k)
print(
    f"Input shapes: Q={Q.shape}, K={K.shape}, V={V.shape}",
    f"Output shape: {output.shape}",
    sep="\n",
)

## 2. Causal Attention Implementation

In GPT, we need causal (masked) attention, where each token can attend only to itself and to earlier tokens ($j \leq i$):

$$\text{mask}_{i,j} = \begin{cases}
0 & \text{if } i \geq j \\
-\infty & \text{if } i < j
\end{cases}$$

In [None]:
def create_causal_mask(seq_len, device='cpu'):
    '''Build a lower-triangular causal mask.

    Entry [i, j] is 1 when j <= i (position i may see position j) and 0
    otherwise.

    Args:
        seq_len: side length of the square mask.
        device: device on which the mask tensor is allocated.

    Returns:
        uint8 tensor of shape (1, 1, seq_len, seq_len); the two leading
        singleton dims stand for batch and head.
    '''
    all_ones = torch.ones((seq_len, seq_len), dtype=torch.uint8, device=device)
    lower_triangle = all_ones.tril()
    # Indexing with None prepends the batch and head dimensions
    return lower_triangle[None, None, :, :]

# Build a 4x4 causal mask and print it without its singleton dimensions
mask = create_causal_mask(seq_len=4, device='cpu')
print("Causal mask:", mask.squeeze(), sep="\n")

## 3. Positional Encoding

Positional encoding is essential for sequence modeling:

$$\text{PE}(pos, 2i) = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$
$$\text{PE}(pos, 2i+1) = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

In [None]:
def positional_encoding(position, d_model):
    '''Generate sinusoidal positional encodings.

    Implements
        PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    Args:
        position: number of positions to encode (rows 0 .. position - 1).
        d_model: width of each encoding vector; may be even or odd.

    Returns:
        Float tensor of shape (position, d_model).
    '''
    pos = torch.arange(0, position, dtype=torch.float).unsqueeze(1)

    # even_idx already holds the even column indices 0, 2, 4, ... -- i.e. the
    # "2i" of the formula -- so it must not be multiplied by 2 again
    even_idx = torch.arange(0, d_model, 2, dtype=torch.float)

    # Calculate angles, shape (position, ceil(d_model / 2))
    angles = pos / torch.pow(10000, even_idx / d_model)

    # Apply sin and cos
    pe = torch.zeros(position, d_model)
    pe[:, 0::2] = torch.sin(angles)  # Even indices
    # With an odd d_model there is one fewer odd column than even columns,
    # so drop the surplus angle column before filling the cosines
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # Odd indices

    return pe

# Encode 10 positions with an 8-dimensional model and preview the result
pe = positional_encoding(position=10, d_model=8)
print("Positional encoding shape:", pe.shape)
print("First few rows:", pe[:5], sep="\n")

## 4. GPT Block Architecture

Each GPT block consists of:
1. Multi-head attention with residual connection and layer norm
2. Feed-forward network with residual connection and layer norm

In [None]:
class GPTBlock(nn.Module):
    '''Single GPT block: self-attention followed by a feed-forward network.

    Each sub-layer is applied as LayerNorm(x + Dropout(sublayer(x))).

    Args:
        d_model: width of the token representations.
        n_heads: number of attention heads.
        d_ff: hidden width of the feed-forward network.
        dropout: dropout probability used inside the attention module, inside
            the feed-forward network and on both residual branches.
    '''

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_ff = d_ff

        # Multi-head attention
        self.attention = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)

        # Feed-forward network
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model)
        )

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def _prepare_attn_mask(self, mask, batch_size):
        '''Convert a user-supplied mask into a form nn.MultiheadAttention accepts.

        nn.MultiheadAttention only takes bool or float masks that are 2-D
        (L, S) or 3-D (batch * n_heads, L, S). Bool masks (True = blocked) and
        float masks (added to the scores) are forwarded with their values
        unchanged. Integer masks -- such as a uint8 lower-triangular mask of
        ones -- are read as "nonzero = may attend" and turned into the bool
        form. 4-D masks broadcastable to (batch, n_heads, L, S) are reduced to
        2-D or 3-D.
        '''
        if mask is None:
            return None

        if not (mask.dtype == torch.bool or mask.is_floating_point()):
            # Integer keep-mask: 0 marks a blocked position, which the
            # attention module expresses as True
            mask = mask == 0

        if mask.dim() == 4:
            if mask.size(0) == 1 and mask.size(1) == 1:
                # Same mask for every batch element and head
                mask = mask[0, 0]
            else:
                # Materialize the broadcast, then flatten batch and head
                # (batch-major, matching the attention module's layout)
                tgt_len, src_len = mask.shape[-2:]
                mask = mask.expand(batch_size, self.n_heads, tgt_len, src_len)
                mask = mask.reshape(batch_size * self.n_heads, tgt_len, src_len)

        return mask

    def forward(self, x, mask=None):
        '''Run the block on x of shape (batch, seq_len, d_model).

        Args:
            x: input tensor, batch first.
            mask: optional attention mask; see _prepare_attn_mask for the
                accepted forms.

        Returns:
            Tensor with the same shape as x.
        '''
        # Self-attention with causal mask
        attn_mask = self._prepare_attn_mask(mask, x.size(0))
        attn_out, _ = self.attention(x, x, x, attn_mask=attn_mask)
        x = self.norm1(x + self.dropout(attn_out))

        # Feed-forward
        ff_out = self.ffn(x)
        x = self.norm2(x + self.dropout(ff_out))

        return x

# Test the GPT block on a random batch: batch_size=2, seq_len=10, d_model=512
block = GPTBlock(d_model=512, n_heads=8, d_ff=2048)
x = torch.randn(2, 10, 512)
mask = create_causal_mask(x.size(1))

output = block(x, mask=mask)
print(f"Input shape: {x.shape}", f"Output shape: {output.shape}", sep="\n")

## 5. Training Dynamics

Understanding how GPT learns to generate text through the training process.

In [None]:
# Simulate loss calculation
def calculate_loss(predictions, targets, ignore_index=0):
    '''Calculate the mean cross-entropy loss over all token positions.

    Args:
        predictions: logits whose last dimension is the vocabulary, e.g.
            (batch_size, seq_len, vocab_size).
        targets: integer class ids with one entry per prediction row, e.g.
            (batch_size, seq_len).
        ignore_index: target value excluded from the loss. Defaults to 0, so
            positions whose target id is 0 do not contribute.

    Returns:
        Scalar tensor: mean loss over the non-ignored positions. If every
        target equals ignore_index the mean is taken over zero elements and
        the result is NaN.
    '''
    vocab_size = predictions.size(-1)
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)
    # reshape instead of view: also flattens non-contiguous tensors
    # (e.g. the result of a transpose), for which view raises
    flat_predictions = predictions.reshape(-1, vocab_size)
    flat_targets = targets.reshape(-1)
    return criterion(flat_predictions, flat_targets)

# Example: random logits plus random target ids drawn from the same vocabulary
predictions = torch.randn(2, 5, 1000)  # batch_size=2, seq_len=5, vocab_size=1000
targets = torch.randint(0, predictions.size(-1), predictions.shape[:2])  # batch_size=2, seq_len=5

loss = calculate_loss(predictions, targets)
print(f"Sample loss: {loss.item():.4f}")

## 6. Model Architecture Visualization

Visualizing how information flows through the GPT architecture.

In [None]:
# Text outline of the architecture, emitted with a single print call
# (one argument per output line)
print(
    "GPT Architecture Summary:",
    "1. Input Embeddings",
    "   - Token Embedding",
    "   - Positional Encoding",
    "2. Transformer Blocks (N layers)",
    "   - Multi-head Causal Attention",
    "   - Feed-forward Network",
    "   - Residual Connections",
    "   - Layer Normalization",
    "3. Output Layer",
    "   - Linear Projection",
    "   - Softmax",
    sep="\n",
)