In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from typing import Optional

# Plot appearance: seaborn's white-grid theme and a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Use the GPU when PyTorch reports one is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## 1. Encoder Layer 🔷

An encoder layer consists of:
1. **Multi-Head Self-Attention** (with residual + layer norm)
2. **Position-wise Feed-Forward** (with residual + layer norm)

### Mathematical Formula

For input $X \in \mathbb{R}^{\text{seq_len} \times d_{\text{model}}}$:

$$
\begin{align}
Z &= \text{LayerNorm}(X + \text{MultiHeadAttn}(X, X, X)) \\
\text{Output} &= \text{LayerNorm}(Z + \text{FFN}(Z))
\end{align}
$$

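**Note**: This is **Post-LN** (LayerNorm applied *after* the residual addition), as in the original Transformer paper and in the code below. **Pre-LN** — $X + \text{Sublayer}(\text{LayerNorm}(X))$ — normalizes *before* each sub-layer instead and is also common, especially for deep stacks.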

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention (from Tutorial 3).

    Projects query/key/value with learned linear maps, splits the feature
    dimension into ``n_heads`` heads of width ``d_k = d_model // n_heads``,
    applies scaled dot-product attention per head, then concatenates the
    heads and applies an output projection.

    Args:
        d_model: Feature width of inputs and outputs; must be divisible by
            ``n_heads`` (checked with an ``assert``).
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """
    def __init__(self, d_model: int = 512, n_heads: int = 8, dropout: float = 0.1):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads  # per-head feature width

        # Linear projections for Q, K, V, plus the output projection
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: (batch, q_len, d_model)
            key: (batch, k_len, d_model)
            value: (batch, k_len, d_model)
            mask: Optional tensor broadcastable to
                (batch, n_heads, q_len, k_len). Positions where
                ``mask == 0`` are excluded from attention.

        Returns:
            Tuple ``(output, attention_weights)`` where ``output`` is
            (batch, q_len, d_model) and ``attention_weights`` is
            (batch, n_heads, q_len, k_len), taken after dropout.
        """
        batch_size = query.size(0)

        # Linear projections and reshape to (batch, n_heads, seq_len, d_k)
        Q = self.W_q(query).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(key).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(value).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention (plain Python scalar; no numpy needed)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

        if mask is not None:
            # Use the most negative value representable in the scores' dtype.
            # A literal such as -1e9 overflows float16 and makes masked_fill
            # raise when the model runs in half precision.
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)

        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)

        # Apply attention to values
        context = torch.matmul(attention_weights, V)

        # Merge heads back to (batch, q_len, d_model) and apply output projection
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.W_o(context)

        return output, attention_weights


class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to each position (from Tutorial 4).

    Computes ``fc2(dropout(relu(fc1(x))))``: expand from ``d_model`` to
    ``d_ff``, apply ReLU and dropout, then project back to ``d_model``.

    Args:
        d_model: Input and output feature width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    """
    def __init__(self, d_model: int = 512, d_ff: int = 2048, dropout: float = 0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, d_model) to a tensor of the same shape."""
        hidden = self.fc1(x)          # expand: d_model -> d_ff
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)       # project back: d_ff -> d_model


class EncoderLayer(nn.Module):
    """One Transformer encoder block.

    Two sub-layers, each wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``
    (i.e. normalization after the residual addition):

    1. multi-head self-attention
    2. position-wise feed-forward network

    Args:
        d_model: Feature width of the block's input and output.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability, used inside both sub-layers and on
            each sub-layer's output before the residual addition.
    """
    def __init__(self, d_model: int = 512, n_heads: int = 8, d_ff: int = 2048, dropout: float = 0.1):
        super().__init__()

        # Self-attention sub-layer with its own norm and residual dropout
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)

        # Feed-forward sub-layer with its own norm and residual dropout
        self.ffn = PositionWiseFeedForward(d_model, d_ff, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``.

        Args:
            x: (batch, seq_len, d_model)
            mask: (batch, 1, 1, seq_len) padding mask, forwarded unchanged
                to the self-attention module; ``None`` disables masking.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Query, key and value are all x; the attention weights are discarded.
        attended, _ = self.self_attn(x, x, x, mask)
        after_attn = self.norm1(x + self.dropout1(attended))

        transformed = self.ffn(after_attn)
        return self.norm2(after_attn + self.dropout2(transformed))

# Smoke test: one encoder layer must preserve the input shape.
encoder_layer = EncoderLayer(d_model=512, n_heads=8)
x = torch.randn(2, 10, 512)  # (batch=2, seq_len=10, d_model=512)
output = encoder_layer(x)

# Check before announcing success, so the message below cannot print on a broken layer.
assert output.shape == x.shape, f"expected {tuple(x.shape)}, got {tuple(output.shape)}"

print(f"Encoder Layer Input: {x.shape}")
print(f"Encoder Layer Output: {output.shape}")
print("✅ Single encoder layer works!")

## 2. Stacking Encoder Layers 📚

The original Transformer stacks **N=6 identical encoder layers**.

### Why Stack Layers?

Each layer enables **iterative refinement**:
- **Layer 1**: Basic patterns (syntax, word relationships)
- **Layer 2-3**: Intermediate patterns (phrases, local dependencies)
- **Layer 4-6**: Abstract patterns (semantic meaning, long-range dependencies)

**Empirical Finding**: 6 layers is standard for base models, but deeper models (12, 24, 48 layers) often perform better with more data.

### 🧠 DeepSeek Insight: Layer Specialization

Research shows different layers specialize:
- **Early layers**: Syntactic features (POS tags, dependency parsing)
- **Middle layers**: Entity recognition, coreference
- **Late layers**: Semantic similarity, reasoning

This **hierarchical representation learning** is key to Transformer power!

In [None]:
class Encoder(nn.Module):
    """A stack of ``n_layers`` encoder layers followed by a final LayerNorm.

    Args:
        n_layers: Number of ``EncoderLayer`` blocks in the stack.
        d_model: Feature width used by every layer.
        n_heads: Number of attention heads per layer.
        d_ff: Hidden width of each layer's feed-forward network.
        dropout: Dropout probability passed to every layer.
    """
    def __init__(self, n_layers: int = 6, d_model: int = 512, n_heads: int = 8,
                 d_ff: int = 2048, dropout: float = 0.1):
        super().__init__()

        # Independent (not weight-shared) layers with identical hyperparameters
        blocks = [EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        self.layers = nn.ModuleList(blocks)

        # Normalization applied once to the output of the last layer
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Encode an embedded sequence.

        Args:
            x: (batch, seq_len, d_model) - embedded input
            mask: (batch, 1, 1, seq_len) - padding mask, given to every layer

        Returns:
            (batch, seq_len, d_model) - encoded representations
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

# Smoke test: the 6-layer encoder stack must preserve the input shape.
encoder = Encoder(n_layers=6, d_model=512, n_heads=8)
x = torch.randn(2, 10, 512)
encoded = encoder(x)

# Check before announcing success, so the message below cannot print on a broken stack.
assert encoded.shape == x.shape, f"expected {tuple(x.shape)}, got {tuple(encoded.shape)}"

print(f"Encoder Input: {x.shape}")
print(f"Encoder Output: {encoded.shape}")
print(f"Number of parameters: {sum(p.numel() for p in encoder.parameters()):,}")
print("✅ Encoder stack works!")

## 3. Decoder Layer 🔶

A decoder layer is more complex, with **3 sub-layers**:

1. **Masked Multi-Head Self-Attention** (on output sequence)
2. **Multi-Head Cross-Attention** (attending to encoder outputs)
3. **Position-wise Feed-Forward**

### Mathematical Formula

For decoder input $Y \in \mathbb{R}^{\text{tgt_len} \times d_{\text{model}}}$ and encoder output $E \in \mathbb{R}^{\text{src_len} \times d_{\text{model}}}$:

$$
\begin{align}
Z_1 &= \text{LayerNorm}(Y + \text{MaskedSelfAttn}(Y, Y, Y)) \\
Z_2 &= \text{LayerNorm}(Z_1 + \text{CrossAttn}(Z_1, E, E)) \\
\text{Output} &= \text{LayerNorm}(Z_2 + \text{FFN}(Z_2))
\end{align}
$$

### Key Differences from Encoder

1. **Masked Self-Attention**: Prevents attending to future tokens (causal)
2. **Cross-Attention**: Query from decoder, Key/Value from encoder
3. **Autoregressive**: Generates one token at a time

### 🧠 DeepSeek Insight: Cross-Attention as Information Bridge

Cross-attention is the **key mechanism** for encoder-decoder communication:
- **Queries**: "What information do I need?" (from decoder)
- **Keys/Values**: "What information is available?" (from encoder)

This allows the decoder to **selectively attend** to relevant parts of the input while generating output.

In [None]:
class DecoderLayer(nn.Module):
    """One Transformer decoder block.

    Three sub-layers, each wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``:

    1. self-attention over the decoder input (masked with ``tgt_mask``)
    2. cross-attention: queries from the decoder, keys/values from the
       encoder output (masked with ``src_mask``)
    3. position-wise feed-forward network

    Args:
        d_model: Feature width of the block's input and output.
        n_heads: Number of attention heads in both attention modules.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability, used inside the sub-layers and on each
            sub-layer's output before the residual addition.
    """
    def __init__(self, d_model: int = 512, n_heads: int = 8, d_ff: int = 2048, dropout: float = 0.1):
        super().__init__()

        # (1) self-attention over the target sequence
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)

        # (2) cross-attention onto the encoder output
        self.cross_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

        # (3) position-wise feed-forward network
        self.ffn = PositionWiseFeedForward(d_model, d_ff, dropout)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, encoder_output, src_mask=None, tgt_mask=None):
        """Run the block on the decoder input.

        Args:
            x: (batch, tgt_len, d_model) - decoder input
            encoder_output: (batch, src_len, d_model) - encoder output
            src_mask: (batch, 1, 1, src_len) - padding mask for source,
                applied in cross-attention
            tgt_mask: (batch, 1, tgt_len, tgt_len) - causal mask for target,
                applied in self-attention

        Returns:
            Tensor with the same shape as ``x``.
        """
        # (1) self-attention; attention weights are discarded
        self_attended, _ = self.self_attn(x, x, x, tgt_mask)
        hidden = self.norm1(x + self.dropout1(self_attended))

        # (2) cross-attention: query = decoder state, key/value = encoder output
        memory = encoder_output
        cross_attended, _ = self.cross_attn(hidden, memory, memory, src_mask)
        hidden = self.norm2(hidden + self.dropout2(cross_attended))

        # (3) feed-forward
        transformed = self.ffn(hidden)
        return self.norm3(hidden + self.dropout3(transformed))

# Smoke test: one decoder layer must preserve the target shape.
decoder_layer = DecoderLayer(d_model=512, n_heads=8)
tgt = torch.randn(2, 8, 512)  # (batch=2, tgt_len=8, d_model=512)
encoder_out = torch.randn(2, 10, 512)  # (batch=2, src_len=10, d_model=512)

# Create causal mask for target (prevent attending to future).
# The length is read from the tensor so the two cannot drift apart.
tgt_len = tgt.size(1)
tgt_mask = torch.tril(torch.ones(tgt_len, tgt_len)).unsqueeze(0).unsqueeze(1)  # (1, 1, tgt_len, tgt_len)

output = decoder_layer(tgt, encoder_out, tgt_mask=tgt_mask)

# Check before announcing success, so the message below cannot print on a broken layer.
assert output.shape == tgt.shape, f"expected {tuple(tgt.shape)}, got {tuple(output.shape)}"

print(f"Decoder Input: {tgt.shape}")
print(f"Encoder Output: {encoder_out.shape}")
print(f"Decoder Layer Output: {output.shape}")
print("✅ Single decoder layer works!")

## 4. Stacking Decoder Layers 📚

Like the encoder, we stack **N=6 identical decoder layers**.

Each decoder layer:
1. Refines output representation
2. Attends to encoder output (via cross-attention)
3. Maintains causal structure (can't see future)

### 🧠 DeepSeek Insight: Autoregressive Generation

The decoder is **autoregressive**: it generates one token at a time.

At time step $t$:
- Input: tokens $[1, 2, ..., t-1]$
- Output: prediction for token $t$
- Causal mask prevents "peeking" at tokens $[t, t+1, ...]$

This is why GPT (decoder-only) can generate coherent long text!

In [None]:
class Decoder(nn.Module):
    """A stack of ``n_layers`` decoder layers followed by a final LayerNorm.

    Args:
        n_layers: Number of ``DecoderLayer`` blocks in the stack.
        d_model: Feature width used by every layer.
        n_heads: Number of attention heads per attention module.
        d_ff: Hidden width of each layer's feed-forward network.
        dropout: Dropout probability passed to every layer.
    """
    def __init__(self, n_layers: int = 6, d_model: int = 512, n_heads: int = 8,
                 d_ff: int = 2048, dropout: float = 0.1):
        super().__init__()

        # Independent (not weight-shared) layers with identical hyperparameters
        blocks = [DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        self.layers = nn.ModuleList(blocks)

        # Normalization applied once to the output of the last layer
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, encoder_output, src_mask=None, tgt_mask=None):
        """Decode target embeddings against a fixed encoder output.

        Args:
            x: (batch, tgt_len, d_model) - decoder input embeddings
            encoder_output: (batch, src_len, d_model) - encoder outputs;
                the same tensor is given to every layer
            src_mask: (batch, 1, 1, src_len) - padding mask for source
            tgt_mask: (batch, 1, tgt_len, tgt_len) - causal mask for target

        Returns:
            (batch, tgt_len, d_model) - decoded representations
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

# Smoke test: the 6-layer decoder stack must preserve the target shape.
decoder = Decoder(n_layers=6, d_model=512, n_heads=8)
tgt = torch.randn(2, 8, 512)
encoder_out = torch.randn(2, 10, 512)
# Causal mask sized from the target tensor: (1, 1, tgt_len, tgt_len)
tgt_mask = torch.tril(torch.ones(tgt.size(1), tgt.size(1))).unsqueeze(0).unsqueeze(1)

decoded = decoder(tgt, encoder_out, tgt_mask=tgt_mask)

# Check before announcing success, so the message below cannot print on a broken stack.
assert decoded.shape == tgt.shape, f"expected {tuple(tgt.shape)}, got {tuple(decoded.shape)}"

print(f"Decoder Input: {tgt.shape}")
print(f"Encoder Output: {encoder_out.shape}")
print(f"Decoder Output: {decoded.shape}")
print(f"Number of parameters: {sum(p.numel() for p in decoder.parameters()):,}")
print("✅ Decoder stack works!")

## 5. Visualizing Encoder-Decoder Flow 📊

Let's visualize how information flows through the encoder-decoder architecture.

In [None]:
# Walk one random example through a small encoder-decoder pair and
# print the tensor shape at each stage.
batch_size = 1
src_len = 6
tgt_len = 5
d_model = 512

# Dummy encoder and decoder (3 layers each; weights are random, untrained)
encoder = Encoder(n_layers=3, d_model=d_model, n_heads=8)
decoder = Decoder(n_layers=3, d_model=d_model, n_heads=8)

# Input sequences (random stand-ins for embedded tokens)
src = torch.randn(batch_size, src_len, d_model)
tgt = torch.randn(batch_size, tgt_len, d_model)

# Create causal mask for target: (1, 1, tgt_len, tgt_len)
tgt_mask = torch.tril(torch.ones(tgt_len, tgt_len)).unsqueeze(0).unsqueeze(1)

# Forward pass
encoder_output = encoder(src)
decoder_output = decoder(tgt, encoder_output, tgt_mask=tgt_mask)

print("=" * 60)
print("📊 ENCODER-DECODER INFORMATION FLOW")
print("=" * 60)
print("\n1️⃣ INPUT TO ENCODER:")
print(f"   Shape: {src.shape}")
print("   Description: Source sequence (e.g., English sentence)\n")

print("2️⃣ ENCODER PROCESSING:")
# Layer count is read from the model so it cannot disagree with the constructor call.
print(f"   Layers: {len(encoder.layers)}")
print("   Each layer: Self-Attention + FFN")
print(f"   Output shape: {encoder_output.shape}")
print("   Description: Rich contextualized representations\n")

print("3️⃣ INPUT TO DECODER:")
print(f"   Shape: {tgt.shape}")
print("   Description: Target sequence so far (e.g., French translation)\n")

print("4️⃣ DECODER PROCESSING:")
print(f"   Layers: {len(decoder.layers)}")
print("   Each layer: Masked Self-Attn + Cross-Attn + FFN")
print("   Cross-Attention: Decoder queries encoder output")
print(f"   Output shape: {decoder_output.shape}")
print("   Description: Contextualized output representations\n")

print("5️⃣ FINAL PREDICTION:")
print(f"   Linear layer: {d_model} → vocab_size")
print("   Softmax: Probability distribution over vocabulary")
print("   Output: Next token prediction\n")

print("=" * 60)

## 6. Visualizing Causal Mask 🎭

The **causal mask** is crucial for autoregressive generation. Let's visualize it.

In [None]:
def create_causal_mask(size: int):
    """Build a ``(size, size)`` lower-triangular matrix of ones.

    Entry ``[i, j]`` is 1 when ``j <= i`` (on or below the diagonal) and
    0 otherwise, so row ``i`` marks the positions query ``i`` may attend to.
    """
    return torch.ones(size, size).tril()

# Visualize causal mask: heatmap on the left, plain-text reading guide on the right.
seq_len = 8
mask = create_causal_mask(seq_len)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Causal mask
position_labels = [f"t{i+1}" for i in range(seq_len)]
sns.heatmap(mask.numpy(), annot=True, fmt=".0f", cmap="Blues",
            cbar=False, square=True, ax=ax1,
            xticklabels=position_labels,
            yticklabels=position_labels)
ax1.set_title("Causal Mask (1 = can attend, 0 = masked)\n", fontsize=14, fontweight='bold')
ax1.set_xlabel("Key Position (being attended to)")
ax1.set_ylabel("Query Position (attending from)")

# Plot 2: Attention pattern explanation
# NOTE: this text is kept emoji-free on purpose. matplotlib's default
# monospace font generally has no emoji glyphs, so they would render as
# empty boxes and trigger "Glyph missing" warnings. Bullets and arrows are fine.
ax2.axis('off')
explanation = """
CAUSAL MASK INTERPRETATION

Reading Row by Row (Query Position):

• t1 can attend to: [t1] only
  → Sees only the first token

• t2 can attend to: [t1, t2]
  → Sees tokens 1-2

• t3 can attend to: [t1, t2, t3]
  → Sees tokens 1-3

• t8 can attend to: [t1, t2, ..., t8]
  → Sees all previous tokens + itself

KEY CONSTRAINT:
No token can attend to future tokens!
This ensures autoregressive generation.

WHY IT MATTERS:
Prevents information leakage during training.
Token t should predict t+1 using only [1...t].
"""
ax2.text(0.1, 0.5, explanation, fontsize=11, family='monospace',
         verticalalignment='center')

plt.tight_layout()
plt.show()

print("\n✅ Causal mask ensures tokens can only attend to past and present!")

## 7. 🧠 DeepSeek Insights: Why 6 Layers?

### The Magic Number?

The original Transformer used **N=6 layers** for both encoder and decoder. Why?

**Empirical Finding** (Vaswani et al., 2017):
- 6 layers achieved good performance on translation tasks
- More layers (8, 10) didn't significantly improve results on available data
- Training was stable with 6 layers

### Modern Perspective

**Deeper is often better** (with more data and compute):
- **BERT-base**: 12 layers
- **GPT-3**: 96 layers (175B parameters)
- **GPT-4**: Rumored 120+ layers
- **DeepSeek-V3**: 61 layers with MoE

### Layer-wise Specialization

Research shows **hierarchical processing**:

```
Layer 1-2:  Syntax (POS tags, dependency parsing)
Layer 3-4:  Entities, coreference, relations
Layer 5-6:  Semantics, reasoning, world knowledge
```

**DeepSeek Insight**: More layers enable **multi-hop reasoning**:
- Each layer = one "hop" through knowledge
- Complex reasoning requires multiple hops
- Depth correlates with reasoning capability

### Training Deep Transformers

Challenges:
- **Gradient vanishing/exploding**: Residual connections help
- **Training instability**: Pre-LN more stable than Post-LN
- **Overfitting**: More layers need more data

Solutions:
- **Pre-LN**: Apply LayerNorm before sub-layer (more stable)
- **Warm-up**: Gradually increase learning rate
- **Gradient clipping**: Prevent exploding gradients
- **Regularization**: Dropout, weight decay

## 8. Encoder-Only vs Decoder-Only vs Encoder-Decoder 🤔

### Three Architectures

**1. Encoder-Only (BERT, RoBERTa)**
- Bidirectional attention (no causal mask)
- Good for understanding tasks: classification, NER, QA
- Cannot generate text autoregressively

**2. Decoder-Only (GPT, LLaMA, DeepSeek)**
- Causal attention (cannot see future)
- Good for generation: text completion, chat, code
- Also surprisingly good at understanding (with prompting)

**3. Encoder-Decoder (T5, BART, Original Transformer)**
- Encoder: bidirectional, Decoder: causal
- Good for seq2seq: translation, summarization
- More parameters for same capacity

### Modern Trend: Decoder-Only Dominates

Why are GPT-style models winning?
1. **Simplicity**: One architecture for everything
2. **Scaling**: Easier to scale to billions of parameters
3. **Versatility**: Can do both understanding and generation
4. **Prompting**: In-context learning enables any task

**DeepSeek Insight**: Decoder-only models with enough scale can **simulate** encoder-decoder behavior through attention patterns!

In [None]:
# Compare parameter counts of the three architecture families at equal depth.
d_model = 512
n_heads = 8
n_layers = 6

encoder_only = Encoder(n_layers=n_layers, d_model=d_model, n_heads=n_heads)
# The Decoder class defined in this notebook is the seq2seq decoder:
# every layer carries a cross-attention sub-layer (cross_attn + norm2).
decoder_only = Decoder(n_layers=n_layers, d_model=d_model, n_heads=n_heads)

encoder_params = sum(p.numel() for p in encoder_only.parameters())
seq2seq_decoder_params = sum(p.numel() for p in decoder_only.parameters())

# A GPT-style decoder-only model has no encoder to attend to, so its layers
# have no cross-attention sub-layer. Counting cross_attn and norm2 here would
# overstate the decoder-only size and understate the ratio printed below.
decoder_params = sum(
    p.numel()
    for name, p in decoder_only.named_parameters()
    if ".cross_attn." not in name and ".norm2." not in name
)

# An encoder-decoder model is the encoder plus the full (cross-attending) decoder.
encoder_decoder_params = encoder_params + seq2seq_decoder_params

print("\n" + "=" * 60)
print("📊 ARCHITECTURE COMPARISON")
print("=" * 60)
print("\n1️⃣ Encoder-Only (BERT-style):")
print(f"   Parameters: {encoder_params:,}")
print("   Attention: Bidirectional")
print("   Use case: Classification, NER, QA\n")

print("2️⃣ Decoder-Only (GPT-style):")
print(f"   Parameters: {decoder_params:,}")
print("   Attention: Causal (autoregressive)")
print("   Use case: Text generation, chat, code\n")

print("3️⃣ Encoder-Decoder (T5-style):")
print(f"   Parameters: {encoder_decoder_params:,}")
print("   Attention: Encoder=bidirectional, Decoder=causal")
print("   Use case: Translation, summarization, seq2seq\n")

print("📈 Parameter Comparison:")
print(f"   Encoder-Decoder is ~{encoder_decoder_params/decoder_params:.1f}x larger than Decoder-Only")
print("   (for same number of layers)\n")

print("=" * 60)

## 9. Summary & Key Takeaways 📝

### What We Learned

✅ **Encoder Layer**: Self-attention + FFN (2 sub-layers)  
✅ **Decoder Layer**: Masked self-attention + Cross-attention + FFN (3 sub-layers)  
✅ **Stacking**: N=6 layers enables hierarchical processing  
✅ **Cross-Attention**: Enables encoder-decoder communication  
✅ **Causal Masking**: Ensures autoregressive generation  
✅ **Architecture Variants**: Encoder-only, Decoder-only, Encoder-decoder  

### Critical Insights

1. **Depth enables reasoning**: More layers = more processing steps
2. **Cross-attention is key**: Connects encoder and decoder
3. **Causal masking is crucial**: Prevents information leakage
4. **Layer specialization**: Different layers learn different features
5. **Architecture choice matters**: Depends on task and scale

### What's Next?

In **Tutorial 6**, we'll:
- Assemble the complete Transformer (encoder + decoder + embeddings)
- Add input/output projections
- Implement generation strategies
- Train on a real task!

---

## 🎯 Exercises

1. **Implement Pre-LN vs Post-LN**: Compare training stability
2. **Visualize Layer-wise Representations**: Use t-SNE on layer outputs
3. **Vary Number of Layers**: How does depth affect performance?
4. **Implement Encoder-Only Model**: Remove decoder, test on classification
5. **Implement Decoder-Only Model**: Remove encoder, test on generation
6. **Analyze Cross-Attention**: Visualize which source tokens decoder attends to
7. **Compare Architectures**: Train all 3 variants on same task

---

**Congratulations! 🎉 You now understand the complete encoder-decoder architecture!**

Next: [Tutorial 6: Complete Transformer & Training](06_complete_transformer.ipynb)