In [None]:
# Google Colab Setup
import sys
import os

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("üîß Running in Google Colab - Setting up environment...")
    if not os.path.exists('transformer_from_scratch'):
        print("üì• Cloning repository...")
        !git clone https://github.com/melhzy/transformer_from_scratch.git
        print("‚úÖ Repository cloned!")
    os.chdir('transformer_from_scratch')
    print("üì¶ Installing dependencies...")
    !pip install -q torch torchvision matplotlib seaborn numpy pandas tqdm
    print("‚úÖ Dependencies installed!")
    if '/content/transformer_from_scratch' not in sys.path:
        sys.path.insert(0, '/content/transformer_from_scratch')
    print("‚úÖ Setup complete!")
else:
    print("üíª Running locally - no setup needed.")

In [None]:
# Import libraries
import sys
import os
from pathlib import Path
from typing import Optional, Tuple
import math

if not IN_COLAB:
    sys.path.insert(0, str(Path.cwd().parent))

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Import our implementations
from src.modules.multi_head_attention import MultiHeadAttention
from src.modules.encoder import TransformerEncoder, EncoderLayer
from src.modules.feed_forward import PositionWiseFeedForward

# Plot defaults shared by every figure in this notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"‚úÖ Device: {device}")
print(f"‚úÖ PyTorch version: {torch.__version__}")

## 1. LoRA Layer Implementation 🔧

### Mathematical Foundation

For a linear layer with weight matrix $W \in \mathbb{R}^{d \times k}$:

**Standard forward pass:**
$$y = Wx$$

**LoRA forward pass:**
$$y = Wx + \frac{\alpha}{r}BAx$$

Where:
- $W$ is frozen (pre-trained)
- $B \in \mathbb{R}^{d \times r}$ and $A \in \mathbb{R}^{r \times k}$ are trainable
- $r$ is the rank (typically 4, 8, 16, 32)
- $\alpha$ is a scaling factor (typically 16 or 32)

### Implementation

In [None]:
class LoRALayer(nn.Module):
    """
    LoRA (Low-Rank Adaptation) layer.

    Holds a frozen linear map (``weight`` and ``bias``) plus a trainable
    low-rank update, and computes

        y = W x + b + (alpha / rank) * B A dropout(x)

    Only ``lora_A`` (rank x in_features) and ``lora_B`` (out_features x rank)
    require gradients. ``lora_B`` starts at zero, so a freshly built layer
    produces exactly the output of its frozen linear map.

    Based on: https://arxiv.org/abs/2106.09685

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        rank: Rank ``r`` of the update ``B A``; must be positive.
        alpha: Scaling numerator; the update is multiplied by ``alpha / rank``.
        dropout: Dropout probability applied to the input of the LoRA path
            only. Values ``<= 0`` disable dropout.

    Raises:
        ValueError: If ``rank`` is not positive.
    """
    def __init__(
        self, 
        in_features: int, 
        out_features: int,
        rank: int = 8,
        alpha: float = 16.0,
        dropout: float = 0.0
    ):
        super().__init__()
        # Fail with a clear message instead of a ZeroDivisionError on alpha / rank.
        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank}")

        self.in_features = in_features
        self.out_features = out_features
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank
        
        # Frozen base weight. It is a random stand-in for pre-trained weights;
        # use `from_linear` to wrap an actual pre-trained nn.Linear. It is
        # initialised the same way nn.Linear initialises its weight so that
        # activations have the same scale as those of plain nn.Linear layers.
        self.weight = nn.Parameter(
            torch.empty(out_features, in_features),
            requires_grad=False
        )
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        
        # Frozen bias, zero-initialised
        self.bias = nn.Parameter(
            torch.zeros(out_features),
            requires_grad=False
        )
        
        # LoRA matrices (trainable); real initial values are set in reset_parameters
        self.lora_A = nn.Parameter(torch.zeros(rank, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, rank))
        
        # Dropout for LoRA path
        self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        
        # Initialize
        self.reset_parameters()

    @classmethod
    def from_linear(
        cls,
        linear: nn.Linear,
        rank: int = 8,
        alpha: float = 16.0,
        dropout: float = 0.0
    ) -> "LoRALayer":
        """
        Build a LoRA layer whose frozen weight/bias are copied from ``linear``.

        Args:
            linear: Source layer; it is not modified.
            rank, alpha, dropout: Forwarded to the constructor.
        Returns:
            A new LoRALayer on the same device and dtype as ``linear.weight``.
            If ``linear`` has no bias, the frozen bias stays at zero.
        """
        layer = cls(linear.in_features, linear.out_features, rank, alpha, dropout)
        layer = layer.to(device=linear.weight.device, dtype=linear.weight.dtype)
        with torch.no_grad():
            layer.weight.copy_(linear.weight)
            if linear.bias is not None:
                layer.bias.copy_(linear.bias)
        return layer
        
    def reset_parameters(self):
        """Initialize the LoRA matrices (the frozen weight and bias are untouched)."""
        # Initialize A with Kaiming uniform (like nn.Linear)
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        # Initialize B with zeros so that B @ A == 0 and the layer starts out
        # computing exactly the frozen linear map
        nn.init.zeros_(self.lora_B)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass: y = Wx + b + scaling * B A dropout(x)
        
        Args:
            x: Input tensor (..., in_features)
        Returns:
            Output tensor (..., out_features)
        """
        # Original frozen path
        result = F.linear(x, self.weight, self.bias)
        
        # LoRA path: dropout(x) @ A^T @ B^T (dropout acts on the input, not on BAx)
        x_lora = self.dropout(x)
        lora_result = F.linear(F.linear(x_lora, self.lora_A), self.lora_B)
        
        # Scale and add
        result = result + lora_result * self.scaling
        
        return result
    
    def merge_weights(self) -> torch.Tensor:
        """
        Compute the merged weight W' = W + scaling * (B @ A).

        The layer itself is not modified. The result is part of the autograd
        graph of ``lora_A`` / ``lora_B``; call it under ``torch.no_grad()``
        (or detach the result) when it is only needed for inference.
        
        Returns:
            Merged weight matrix of shape (out_features, in_features)
        """
        delta_w = (self.lora_B @ self.lora_A) * self.scaling
        return self.weight + delta_w
    
    def unmerge_weights(self, merged_weight: torch.Tensor):
        """
        Set the frozen weight to ``merged_weight - scaling * (B @ A)``.

        This recovers the base weight from a merged weight using the current
        LoRA matrices. Useful for switching between tasks.

        Args:
            merged_weight: Tensor of shape (out_features, in_features).
        """
        with torch.no_grad():
            delta_w = (self.lora_B @ self.lora_A) * self.scaling
            # In-place copy keeps the parameter's own storage, dtype and device.
            self.weight.copy_(merged_weight - delta_w)


# Smoke test: push a (2, 10, 512) batch through a LoRA layer and report how
# its parameters split into frozen and trainable.
print("Testing LoRA Layer...\n")
lora = LoRALayer(512, 512, rank=8, alpha=16.0)
x = torch.randn(2, 10, 512)
output = lora(x)

# Collect (is_trainable, size) once instead of re-walking the parameters per line.
param_sizes = [(p.requires_grad, p.numel()) for p in lora.parameters()]
n_trainable = sum(size for is_trainable, size in param_sizes if is_trainable)
n_frozen = sum(size for is_trainable, size in param_sizes if not is_trainable)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"\nParameter counts:")
print(f"  Frozen: {n_frozen:,}")
print(f"  Trainable (LoRA): {n_trainable:,}")
print(f"  Ratio: {n_trainable / (n_frozen + n_trainable) * 100:.2f}%")
print("\n‚úÖ LoRA layer works!")

## 2. Applying LoRA to Multi-Head Attention 🎯

### Reference to transformer-foundation

From [transformer-foundation/03_multi_head_attention.ipynb](../transformer-foundation/03_multi_head_attention.ipynb), we learned that multi-head attention has 4 linear projections:

```python
W_q: Query projection  (d_model → d_model)
W_k: Key projection    (d_model → d_model)
W_v: Value projection  (d_model → d_model)
W_o: Output projection (d_model → d_model)
```

**Common LoRA configurations:**
1. **Q + V only** (most efficient)
2. **Q + K + V + O** (maximum adaptation)
3. **Q + V + O** (balanced)

In [None]:
class LoRAMultiHeadAttention(nn.Module):
    """
    Multi-Head Attention with LoRA adapters.
    
    Based on src/modules/multi_head_attention.py with LoRA modifications.
    Each of the four projections (query, key, value, output) is either a
    LoRALayer (frozen base + trainable low-rank update) or a fully frozen
    nn.Linear, depending on ``apply_lora_to``.

    Args:
        d_model: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
        lora_rank: Rank passed to every LoRALayer.
        lora_alpha: Alpha passed to every LoRALayer.
        lora_dropout: Dropout passed to every LoRALayer.
        apply_lora_to: Projection names to adapt, any of 'q', 'k', 'v', 'o'
            (case-insensitive). ``None`` means ``['q', 'v']``.

    Raises:
        ValueError: If ``apply_lora_to`` contains an unknown projection name.
    """
    # Projection names accepted in ``apply_lora_to``.
    VALID_PROJECTIONS = ('q', 'k', 'v', 'o')

    def __init__(
        self,
        d_model: int = 512,
        n_heads: int = 8,
        dropout: float = 0.1,
        lora_rank: int = 8,
        lora_alpha: float = 16.0,
        lora_dropout: float = 0.0,
        apply_lora_to: Optional[list] = None  # Which projections to apply LoRA
    ):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        # None (rather than a list literal) as the default avoids sharing one
        # mutable list object between all calls.
        if apply_lora_to is None:
            apply_lora_to = ['q', 'v']
        
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.apply_lora_to = [name.lower() for name in apply_lora_to]

        # A typo such as 'w_q' would otherwise silently yield a frozen projection.
        unknown = sorted(set(self.apply_lora_to) - set(self.VALID_PROJECTIONS))
        if unknown:
            raise ValueError(
                f"Unknown projection(s) in apply_lora_to: {unknown}; "
                f"expected a subset of {list(self.VALID_PROJECTIONS)}"
            )
        
        # Create projections (with or without LoRA)
        self.W_q = self._create_projection('q', lora_rank, lora_alpha, lora_dropout)
        self.W_k = self._create_projection('k', lora_rank, lora_alpha, lora_dropout)
        self.W_v = self._create_projection('v', lora_rank, lora_alpha, lora_dropout)
        self.W_o = self._create_projection('o', lora_rank, lora_alpha, lora_dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def _create_projection(self, name: str, rank: int, alpha: float, dropout: float):
        """Return a LoRALayer if ``name`` is in ``self.apply_lora_to``, else a frozen nn.Linear."""
        if name in self.apply_lora_to:
            return LoRALayer(self.d_model, self.d_model, rank, alpha, dropout)
        else:
            # Standard linear layer (frozen for fine-tuning)
            layer = nn.Linear(self.d_model, self.d_model)
            layer.weight.requires_grad = False
            layer.bias.requires_grad = False
            return layer
    
    def forward(self, query, key, value, mask=None):
        """
        Scaled dot-product attention over ``n_heads`` heads.

        Args:
            query, key, value: Tensors of shape (batch, seq_len, d_model).
            mask: Optional tensor broadcastable to the score tensor
                (batch, n_heads, query_len, key_len); positions where
                ``mask == 0`` are excluded from attention.
        Returns:
            Tuple ``(output, attention_weights)`` with shapes
            (batch, query_len, d_model) and (batch, n_heads, query_len, key_len).
        """
        batch_size = query.size(0)
        
        # Linear projections, then split d_model into (n_heads, d_k) and move heads forward
        Q = self.W_q(query).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(key).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(value).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        
        # Scaled dot-product attention (from transformer-foundation/02)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            # Use the most negative value of the score dtype: a fixed -1e9
            # cannot be represented in float16.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        
        # Apply attention to values
        context = torch.matmul(attention_weights, V)
        
        # Merge the heads back into d_model and apply output projection
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.W_o(context)
        
        return output, attention_weights


# Smoke test: self-attention on a (2, 10, 512) batch with LoRA on the
# query and value projections only.
print("Testing LoRA Multi-Head Attention...\n")
mha_config = dict(
    d_model=512,
    n_heads=8,
    lora_rank=8,
    apply_lora_to=['q', 'v'],  # Most efficient configuration
)
lora_mha = LoRAMultiHeadAttention(**mha_config)

x = torch.randn(2, 10, 512)
output, attn_weights = lora_mha(x, x, x)

# One pass over the parameters: (size, is_trainable) per tensor.
param_counts = [(p.numel(), p.requires_grad) for p in lora_mha.parameters()]
trainable = sum(size for size, needs_grad in param_counts if needs_grad)
total = sum(size for size, _ in param_counts)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {attn_weights.shape}")
print(f"\nParameter counts:")
print(f"  Total: {total:,}")
print(f"  Trainable (LoRA): {trainable:,}")
print(f"  Ratio: {trainable / total * 100:.2f}%")
print("\n‚úÖ LoRA Multi-Head Attention works!")

## 3. Memory Comparison: Full vs LoRA 📊

Let's compare how many parameters each strategy has to train — a proxy for the gradient and optimizer-state memory needed during training.

In [None]:
def count_parameters(model):
    """
    Count the elements of ``model.parameters()``.

    Returns:
        Tuple ``(trainable, total)``: elements in parameters with
        ``requires_grad`` set, and elements in all parameters.
    """
    trainable = 0
    total = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return trainable, total

# Dimensions shared by every attention stack in the comparison
d_model = 768  # BERT-base size
n_heads = 12
n_layers = 12


def _lora_attention_stack(rank, targets):
    """Return n_layers LoRA attention modules adapting the `targets` projections."""
    return nn.ModuleList([
        LoRAMultiHeadAttention(d_model, n_heads, lora_rank=rank, apply_lora_to=targets)
        for _ in range(n_layers)
    ])


# Baseline: plain attention from src/modules (the full fine-tuning reference)
full_attn = nn.ModuleList([
    MultiHeadAttention(d_model, n_heads)
    for _ in range(n_layers)
])

# LoRA on Q+V only, rank 8
lora_attn_qv = _lora_attention_stack(8, ['q', 'v'])

# LoRA on all four projections, rank 8
lora_attn_all = _lora_attention_stack(8, ['q', 'k', 'v', 'o'])

# LoRA on Q+V, rank 16
lora_attn_r16 = _lora_attention_stack(16, ['q', 'v'])

# Tally trainable and total parameter counts for each strategy
strategies = (
    ('Full Fine-Tuning', full_attn),
    ('LoRA (Q+V, r=8)', lora_attn_qv),
    ('LoRA (Q+K+V+O, r=8)', lora_attn_all),
    ('LoRA (Q+V, r=16)', lora_attn_r16),
)

results = {}
for name, model in strategies:
    trainable, total = count_parameters(model)
    results[name] = dict(
        trainable=trainable,
        total=total,
        ratio=trainable / total * 100,  # trainable share in percent
    )

# Print the counts as a fixed-width table
table_header = f"{'Strategy':<25} {'Trainable':>12} {'Total':>12} {'Ratio':>10}"
print("\nüìä Parameter Comparison (12-layer BERT-base size)\n")
print(table_header)
print("-" * 65)
for name, data in results.items():
    table_row = f"{name:<25} {data['trainable']:>12,} {data['total']:>12,} {data['ratio']:>9.2f}%"
    print(table_row)

# Visualize: trainable parameter counts (left) and trainable share (right),
# both drawn as bar charts with one bar per strategy.
names = list(results.keys())
colors = ['red', 'green', 'blue', 'orange']
trainable_counts = [results[n]['trainable'] / 1e6 for n in names]
ratios = [results[n]['ratio'] for n in names]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

bar_positions = range(len(names))
panels = [
    (ax1, trainable_counts, 'Trainable Parameters (Millions)', 'Trainable Parameters Comparison'),
    (ax2, ratios, 'Trainable Percentage (%)', 'Parameter Efficiency'),
]
for ax, heights, y_label, panel_title in panels:
    ax.bar(bar_positions, heights, color=colors)
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(names, rotation=15, ha='right')
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Memory estimate
full_trainable = results['Full Fine-Tuning']['trainable']
lora_trainable = results['LoRA (Q+V, r=8)']['trainable']
# Ratio of trainable-parameter counts. Gradients and optimizer state are
# kept per trainable parameter, so both shrink by this factor.
memory_saving = full_trainable / lora_trainable

print(f"\nüí° Memory Insights:")
print(f"  LoRA (Q+V, r=8) uses {memory_saving:.1f}x FEWER trainable parameters!")
print(f"  For gradient storage: ~{memory_saving:.1f}x less GPU memory required")
print(f"  Optimizer state (e.g. Adam moments) shrinks by the same ~{memory_saving:.1f}x factor")
# The parameter ratio does not translate into a proportional speedup: the
# forward and backward passes still run through all frozen weights, and
# nothing in this cell measures time, so no speedup figure is printed.
print("  Training speed: improves only modestly - the frozen weights still take part in forward/backward")

## 4. Weight Merging for Inference ⚡

During inference, we can merge LoRA weights into the base weights to avoid any additional computation.

In [None]:
import time


def _time_forward(layer, inputs, n_iters=1000, n_warmup=10):
    """
    Return the wall-clock seconds taken by `n_iters` calls of `layer(inputs)`.

    Runs under torch.no_grad() so no autograd graph is built, performs
    `n_warmup` untimed calls first, and synchronizes CUDA around the timed
    region when `inputs` lives on a GPU.
    """
    def _sync():
        if inputs.is_cuda:
            torch.cuda.synchronize()

    with torch.no_grad():
        for _ in range(n_warmup):
            layer(inputs)
        _sync()
        start = time.perf_counter()
        for _ in range(n_iters):
            layer(inputs)
        _sync()
        return time.perf_counter() - start


# Create LoRA layer
lora_layer = LoRALayer(512, 512, rank=8).to(device)
x = torch.randn(100, 512, device=device)

# lora_B is zero right after construction, which would make the merged weight
# equal to the base weight and the equivalence check below vacuous. Give it
# small non-zero values to stand in for a trained adapter.
nn.init.normal_(lora_layer.lora_B, std=0.02)

# Time LoRA inference (with separate paths)
lora_time = _time_forward(lora_layer, x)

# Merge weights (no_grad: the merged matrix is only used for inference)
with torch.no_grad():
    merged_weight = lora_layer.merge_weights()

# Create standard linear layer with merged weights. copy_ writes the values
# into the new layer's own tensors instead of aliasing lora_layer's bias.
merged_layer = nn.Linear(512, 512, bias=True).to(device)
with torch.no_grad():
    merged_layer.weight.copy_(merged_weight)
    merged_layer.bias.copy_(lora_layer.bias)

# Time merged inference
merged_time = _time_forward(merged_layer, x)

# Compare outputs (should agree up to floating-point rounding)
with torch.no_grad():
    lora_output = lora_layer(x)
    merged_output = merged_layer(x)
max_diff = torch.max(torch.abs(lora_output - merged_output)).item()

print("\n‚ö° Inference Speed Comparison\n")
print(f"LoRA (separate paths): {lora_time*1000:.2f} ms")
print(f"Merged weights:        {merged_time*1000:.2f} ms")
print(f"Speedup:               {lora_time/merged_time:.2f}x")
print(f"\nOutput difference: {max_diff:.2e} (should be ~0)")
print("\nüí° For production: merge weights to eliminate overhead!")

## 5. Applying LoRA to Complete Encoder 🏗️

Let's apply LoRA to a complete encoder stack, referencing our implementation from `src/modules/encoder.py`.

In [None]:
class LoRAEncoderLayer(nn.Module):
    """
    Encoder layer with LoRA adapters.
    Based on src/modules/encoder.py with LoRA modifications.

    Post-norm layout: self-attention and feed-forward sub-layers, each
    followed by dropout, a residual connection and LayerNorm. Everything
    except the LoRA matrices is frozen, including both LayerNorms, so the
    trainable-parameter count of the layer is exactly its LoRA parameters.

    Args:
        d_model: Model width.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability for attention weights, the FFN and the
            residual branches.
        lora_rank: Rank of every LoRA adapter in the layer.
        lora_alpha: Alpha of every LoRA adapter in the layer.
        apply_lora_to_attn: Attention projections to adapt (subset of
            'q', 'k', 'v', 'o'). ``None`` means ``['q', 'v']``.
        apply_lora_to_ffn: If True, both FFN linear maps are LoRALayers;
            otherwise a frozen PositionWiseFeedForward is used.
        lora_dropout: Dropout probability on the input of every LoRA path.
    """
    def __init__(
        self,
        d_model: int = 512,
        n_heads: int = 8,
        d_ff: int = 2048,
        dropout: float = 0.1,
        lora_rank: int = 8,
        lora_alpha: float = 16.0,
        apply_lora_to_attn: Optional[list] = None,
        apply_lora_to_ffn: bool = False,
        lora_dropout: float = 0.0
    ):
        super().__init__()

        # Resolve the default here so no mutable list is shared between calls.
        if apply_lora_to_attn is None:
            apply_lora_to_attn = ['q', 'v']
        
        # Multi-head attention with LoRA
        self.self_attn = LoRAMultiHeadAttention(
            d_model, n_heads, dropout, 
            lora_rank, lora_alpha, lora_dropout, 
            apply_lora_to_attn
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        
        # Feed-forward network (with optional LoRA)
        if apply_lora_to_ffn:
            self.ffn = nn.Sequential(
                LoRALayer(d_model, d_ff, lora_rank, lora_alpha, lora_dropout),
                nn.ReLU(),
                nn.Dropout(dropout),
                LoRALayer(d_ff, d_model, lora_rank, lora_alpha, lora_dropout)
            )
        else:
            # Standard FFN (frozen)
            ffn = PositionWiseFeedForward(d_model, d_ff, dropout)
            for param in ffn.parameters():
                param.requires_grad = False
            self.ffn = ffn
        
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

        # LayerNorm belongs to the frozen base model: leaving it trainable
        # would make "trainable" counts and optimizers include non-LoRA
        # parameters.
        for norm in (self.norm1, self.norm2):
            for param in norm.parameters():
                param.requires_grad = False
    
    def forward(self, x, mask=None):
        """
        Args:
            x: Input tensor; used as query, key and value of the self-attention.
            mask: Optional attention mask forwarded to the attention module.
        Returns:
            Tensor produced by the second LayerNorm.
        """
        # Self-attention with residual
        attn_output, _ = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout1(attn_output))
        
        # FFN with residual
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout2(ffn_output))
        
        return x


class LoRAEncoder(nn.Module):
    """
    Complete encoder with LoRA adapters.

    A stack of ``n_layers`` LoRAEncoderLayer modules followed by a final
    LayerNorm. The final LayerNorm is frozen, as it is part of the base model
    rather than a LoRA adapter.

    Args:
        n_layers: Number of encoder layers.
        d_model, n_heads, d_ff, dropout, lora_rank, lora_alpha,
        apply_lora_to_ffn: Forwarded unchanged to every LoRAEncoderLayer.
        apply_lora_to_attn: Attention projections to adapt in every layer.
            ``None`` means ``['q', 'v']``.
    """
    def __init__(
        self,
        n_layers: int = 6,
        d_model: int = 512,
        n_heads: int = 8,
        d_ff: int = 2048,
        dropout: float = 0.1,
        lora_rank: int = 8,
        lora_alpha: float = 16.0,
        apply_lora_to_attn: Optional[list] = None,
        apply_lora_to_ffn: bool = False
    ):
        super().__init__()

        # Resolve the default here so no mutable list is shared between calls.
        if apply_lora_to_attn is None:
            apply_lora_to_attn = ['q', 'v']
        
        self.layers = nn.ModuleList([
            LoRAEncoderLayer(
                d_model, n_heads, d_ff, dropout,
                lora_rank, lora_alpha,
                apply_lora_to_attn, apply_lora_to_ffn
            )
            for _ in range(n_layers)
        ])
        
        self.norm = nn.LayerNorm(d_model)
        # Keep the trainable set restricted to LoRA parameters.
        for param in self.norm.parameters():
            param.requires_grad = False
    
    def forward(self, x, mask=None):
        """Apply every layer in order (passing ``mask`` to each), then the final LayerNorm."""
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


# Smoke test: a 6-layer encoder with LoRA on the attention Q/V projections,
# run on a (2, 10, 512) batch.
print("Testing LoRA Encoder...\n")
encoder_config = dict(
    n_layers=6,
    d_model=512,
    n_heads=8,
    lora_rank=8,
    apply_lora_to_attn=['q', 'v'],
    apply_lora_to_ffn=False,
)
lora_encoder = LoRAEncoder(**encoder_config)

x = torch.randn(2, 10, 512)
output = lora_encoder(x)

trainable, total = count_parameters(lora_encoder)
report_lines = [
    f"Input shape: {x.shape}",
    f"Output shape: {output.shape}",
    f"\nParameter counts:",
    f"  Total: {total:,}",
    f"  Trainable (LoRA): {trainable:,}",
    f"  Ratio: {trainable / total * 100:.2f}%",
    "\n‚úÖ LoRA Encoder works!",
]
for report_line in report_lines:
    print(report_line)

## 6. Practical Training Example 🎓

Let's see how to set up a simple training loop with LoRA.

In [None]:
# Create model and dummy data
model = LoRAEncoder(
    n_layers=3,
    d_model=256,
    n_heads=8,
    lora_rank=8
).to(device)

# Only optimize LoRA parameters!
# Freeze every parameter that is not a LoRA matrix (named lora_A / lora_B in
# LoRALayer). Selecting on requires_grad alone would also pick up any other
# parameter that happens to be trainable, such as LayerNorm weights.
for param_name, param in model.named_parameters():
    is_lora = param_name.rsplit('.', 1)[-1].startswith('lora_')
    param.requires_grad_(is_lora)

lora_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(lora_params, lr=1e-4)

print(f"Optimizing {len(lora_params)} LoRA parameter groups")
print(f"Total trainable parameters: {sum(p.numel() for p in lora_params):,}\n")

# Dummy training loop
model.train()
losses = []

for step in tqdm(range(100), desc="Training"):
    # Generate dummy data directly on the target device (no host-to-device copy)
    x = torch.randn(8, 20, 256, device=device)  # (batch, seq_len, d_model)
    target = torch.randn(8, 20, 256, device=device)
    
    # Forward pass
    output = model(x)
    
    # Simple MSE loss (for demonstration; the targets are random noise)
    loss = F.mse_loss(output, target)
    
    # Backward pass (only updates LoRA parameters)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    losses.append(loss.item())

# Plot training loss
plt.figure(figsize=(10, 4))
plt.plot(losses)
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('LoRA Training Loss', fontweight='bold')
plt.grid(alpha=0.3)
plt.show()

print(f"\n‚úÖ Training complete! Final loss: {losses[-1]:.4f}")
print(f"\nüí° Note: Only LoRA parameters were updated during training!")

## 7. Summary & Best Practices 📝

### What We Learned:

✅ Implemented LoRA layer from scratch  
✅ Applied LoRA to multi-head attention  
✅ Compared memory usage vs full fine-tuning  
✅ Learned weight merging for efficient inference  
✅ Built complete LoRA encoder  
✅ Set up training loop with LoRA parameters  

### Best Practices:

1. **Start with Q+V projections**: Most efficient, good performance
2. **Use rank 8-16**: Sweet spot for most tasks
3. **Set alpha = 2 × rank**: Standard scaling
4. **Merge weights for production**: Eliminates inference overhead
5. **Monitor trainable param ratio**: Aim for <1% for large models

### Configuration Guide:

```python
# For experimentation (fastest)
apply_lora_to = ['q', 'v']
rank = 8

# For better performance
apply_lora_to = ['q', 'v', 'o']
rank = 16

# For maximum adaptation
apply_lora_to = ['q', 'k', 'v', 'o']
rank = 32
apply_lora_to_ffn = True
```

### Next Steps:

- **Tutorial 3**: Data preparation for fine-tuning
- **Tutorial 4**: Instruction tuning with LoRA
- **Tutorial 5**: Evaluation and metrics

---

## 📚 Resources

**Papers:**
- LoRA: https://arxiv.org/abs/2106.09685
- QLoRA: https://arxiv.org/abs/2305.14314

**Code:**
- Our implementation: [src/modules/](../src/modules/)
- Hugging Face PEFT: https://github.com/huggingface/peft

**Related:**
- [transformer-foundation/03_multi_head_attention.ipynb](../transformer-foundation/03_multi_head_attention.ipynb)
- [transformer-foundation/04_feed_forward_networks.ipynb](../transformer-foundation/04_feed_forward_networks.ipynb)

---

**Ready for data preparation? Continue to Tutorial 3! 🚀**