# Fine-tuning and Adaptation

This notebook provides an interactive guide to adapting pretrained language models for specific tasks using various parameter-efficient methods.

## 1. Introduction to Fine-tuning

Fine-tuning adapts pretrained models to specific tasks. We'll explore methods from full fine-tuning to parameter-efficient approaches like LoRA.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple
import pandas as pd
from IPython.display import display, HTML
import math

# Set style
# Both calls change global plotting defaults, so they affect every figure
# created after this cell.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Set random seed
# Seed the torch and NumPy generators so the randomly generated demo
# tensors/values below are the same on every run.
torch.manual_seed(42)
np.random.seed(42)

## 2. Fine-tuning Paradigms Overview

Let's visualize different fine-tuning approaches and their trade-offs.

In [None]:
# Fine-tuning methods comparison
# Hard-coded, relative figures (full fine-tuning = 100 / 1x) that drive the
# four panels below.
methods_data = {
    'Method': ['Full Fine-tuning', 'LoRA', 'QLoRA', 'Adapters', 'Prompt Tuning',
               'Prefix Tuning', 'BitFit', 'IA³'],
    'Trainable Params (%)': [100, 0.5, 0.5, 3, 0.01, 0.1, 0.1, 0.01],
    'Memory Usage': [100, 20, 10, 25, 5, 5, 15, 5],
    'Quality (%)': [100, 98, 97, 96, 90, 92, 85, 94],
    'Training Speed': [1, 3, 2.5, 2.5, 5, 4, 4, 5]
}

df_methods = pd.DataFrame(methods_data)

# Create visualization: a 2x2 dashboard
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Parameters vs Quality scatter
# Marker area encodes memory usage, marker color encodes training speed.
ax = axes[0, 0]
scatter = ax.scatter(df_methods['Trainable Params (%)'],
                     df_methods['Quality (%)'],
                     s=df_methods['Memory Usage'] * 5,
                     c=df_methods['Training Speed'],
                     cmap='viridis', alpha=0.6)

# Add labels next to each marker
for method_name, params_pct, quality_pct in zip(df_methods['Method'],
                                                df_methods['Trainable Params (%)'],
                                                df_methods['Quality (%)']):
    ax.annotate(method_name,
                (params_pct, quality_pct),
                xytext=(5, 5), textcoords='offset points', fontsize=8)

ax.set_xlabel('Trainable Parameters (%)', fontsize=12)
ax.set_ylabel('Quality (% of full fine-tuning)', fontsize=12)
ax.set_title('Parameter Efficiency vs Quality Trade-off', fontsize=14)
# Trainable-parameter percentages span 0.01 .. 100, hence the log axis
ax.set_xscale('log')
ax.grid(True, alpha=0.3)

# Add colorbar
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Training Speed (relative)', fontsize=10)

# 2. Method comparison radar chart
categories = ['Parameters\n(inverse)', 'Memory\n(inverse)', 'Quality', 'Speed']
num_vars = len(categories)

# Compute angles; repeat the first angle so each polygon closes
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]

# Select methods to compare
methods_to_compare = ['Full Fine-tuning', 'LoRA', 'Adapters', 'Prompt Tuning']
colors = ['blue', 'red', 'green', 'orange']

# plt.subplots() already placed a Cartesian axes in this grid cell. Remove it
# before adding the polar axes; otherwise the polar plot is stacked on top of
# it and the empty Cartesian frame and ticks show through behind the radar.
axes[0, 1].remove()
ax = fig.add_subplot(2, 2, 2, projection='polar')
for method, color in zip(methods_to_compare, colors):
    row = df_methods[df_methods['Method'] == method].iloc[0]

    # Put every metric on a 0-100 "higher is better" scale:
    # invert params and memory, and stretch speed (1..5) by 20.
    values = [
        100 - row['Trainable Params (%)'],  # Inverse
        100 - row['Memory Usage'],  # Inverse
        row['Quality (%)'],
        row['Training Speed'] * 20
    ]
    values += values[:1]

    ax.plot(angles, values, 'o-', linewidth=2, label=method, color=color)
    ax.fill(angles, values, alpha=0.15, color=color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_ylim(0, 100)
ax.set_title('Method Comparison (Higher is Better)', fontsize=14, y=1.08)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
ax.grid(True)

# 3. Memory usage comparison
ax = axes[1, 0]
methods_sorted = df_methods.sort_values('Memory Usage')
bars = ax.barh(methods_sorted['Method'], methods_sorted['Memory Usage'])

# Color bars by efficiency: low memory maps to the green end of RdYlGn
colors = plt.cm.RdYlGn(1 - methods_sorted['Memory Usage'] / 100)
for bar, color in zip(bars, colors):
    bar.set_color(color)

ax.set_xlabel('Relative Memory Usage (%)', fontsize=12)
ax.set_title('Memory Requirements by Method', fontsize=14)
ax.grid(True, alpha=0.3, axis='x')

# 4. Use case recommendations (text-only panel)
ax = axes[1, 1]
ax.axis('off')

recommendations = [
    "🎯 **Method Selection Guide**\n",
    "**Full Fine-tuning**: Best quality, use when resources available",
    "**LoRA**: Best balance - high quality, low memory",
    "**QLoRA**: For very large models (70B+)",
    "**Adapters**: Good for multi-task scenarios",
    "**Prompt/Prefix**: Minimal resources, good for few-shot",
    "**BitFit**: Quick experiments, baseline",
    "\n📊 **Key Insights**:",
    "• LoRA achieves 98% quality with 0.5% parameters",
    "• QLoRA enables 4-bit training of huge models",
    "• Prompt methods work well for classification",
    "• Adapters excel at multi-task learning"
]

text = "\n".join(recommendations)
ax.text(0.05, 0.95, text, transform=ax.transAxes,
        fontsize=11, verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

## 3. Understanding LoRA (Low-Rank Adaptation)

LoRA is one of the most popular parameter-efficient fine-tuning methods. Let's explore how it works.

In [None]:
class LoRALayer(nn.Module):
    """Minimal low-rank adapter branch, for demonstration purposes.

    The layer owns two trainable factors: ``lora_A`` of shape
    ``(in_features, rank)`` and ``lora_B`` of shape ``(rank, out_features)``.
    Its output is ``x @ lora_A @ lora_B`` multiplied by ``alpha / rank``.
    Because ``lora_B`` starts at zero, a freshly built layer outputs zeros.
    """

    def __init__(self, in_features: int, out_features: int, rank: int = 16, alpha: int = 16):
        super().__init__()
        self.rank, self.alpha = rank, alpha
        self.scaling = alpha / rank

        # Small random down-projection, all-zero up-projection
        down_init = torch.randn(in_features, rank) * 0.01
        up_init = torch.zeros(rank, out_features)
        self.lora_A = nn.Parameter(down_init)
        self.lora_B = nn.Parameter(up_init)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the scaled low-rank update for ``x`` (last dim ``in_features``)."""
        projected = torch.matmul(x, self.lora_A)
        return torch.matmul(projected, self.lora_B) * self.scaling

# Demonstrate LoRA concept
def visualize_lora_concept():
    """Draw a four-panel figure illustrating the LoRA decomposition.

    Panels, left to right: a 100x100 crop of a random stand-in for the
    original weight matrix, a text summary of the decomposition and its
    parameter savings, a heat map of the down-projection A, and a heat map
    of the (all-zero) up-projection B.

    The matrices are random/zero placeholders with the same shapes and
    initial scale as ``LoRALayer`` uses (A: d_in x rank, B: rank x d_out).
    The function shows the figure and returns None.
    """

    # Original weight dimensions
    d_in, d_out = 768, 768
    rank = 16
    lora_param_count = d_in * rank + rank * d_out

    fig, axes = plt.subplots(1, 4, figsize=(16, 4))

    # 1. Original weight matrix (only the top-left 100x100 corner is drawn)
    ax = axes[0]
    W = torch.randn(d_out, d_in) * 0.02
    ax.imshow(W[:100, :100], cmap='coolwarm', aspect='auto')
    ax.set_title(f'Original Weight W\n{d_out}×{d_in} = {d_out*d_in:,} params', fontsize=12)
    ax.set_xlabel('Input dimension')
    ax.set_ylabel('Output dimension')

    # 2. Low-rank decomposition (text panel)
    # With A of shape (d_in, rank) and B of shape (rank, d_out) -- the shapes
    # drawn in panels 3 and 4 -- the product that is defined is A x B, so the
    # formula is written in that order.
    ax = axes[1]
    ax.text(0.5, 0.8, 'LoRA Decomposition:', transform=ax.transAxes,
            ha='center', fontsize=14, weight='bold')
    ax.text(0.5, 0.6, 'ΔW = A × B', transform=ax.transAxes,
            ha='center', fontsize=12)
    ax.text(0.5, 0.4, 'W_new = W_pretrained + α/r × A × B', transform=ax.transAxes,
            ha='center', fontsize=11)
    ax.text(0.5, 0.2, f'Parameters: {lora_param_count:,}\n'
            f'Reduction: {(1 - lora_param_count/(d_in*d_out))*100:.1f}%',
            transform=ax.transAxes, ha='center', fontsize=11)
    ax.axis('off')

    # 3. Matrix A (down-projection); transposed so rank runs along the y axis
    ax = axes[2]
    A = torch.randn(d_in, rank) * 0.01
    ax.imshow(A[:100, :].T, cmap='viridis', aspect='auto')
    ax.set_title(f'Matrix A\n{d_in}×{rank}', fontsize=12)
    ax.set_xlabel('Input dimension')
    ax.set_ylabel('Rank')

    # 4. Matrix B (up-projection); all zeros, so the panel is a single color
    ax = axes[3]
    B = torch.zeros(rank, d_out)
    ax.imshow(B[:, :100], cmap='plasma', aspect='auto')
    ax.set_title(f'Matrix B\n{rank}×{d_out}', fontsize=12)
    ax.set_xlabel('Output dimension')
    ax.set_ylabel('Rank')

    plt.suptitle('LoRA: Low-Rank Adaptation of Large Language Models', fontsize=16)
    plt.tight_layout()
    plt.show()

visualize_lora_concept()

# Demonstrate LoRA in action
print("\n--- LoRA Implementation Demo ---")

# A dense base projection plus a LoRA branch of the same width
base_layer = nn.Linear(768, 768, bias=False)
lora = LoRALayer(768, 768, rank=16, alpha=32)

# Freeze base layer: only the LoRA factors should receive gradients
for param in base_layer.parameters():
    param.requires_grad = False

# Forward pass: the base path runs without autograd, the LoRA path with it
x = torch.randn(2, 10, 768)
with torch.no_grad():
    base_output = base_layer(x)
lora_output = lora(x)
combined_output = base_output + lora_output

# Report the shape of every tensor involved
for label, tensor in (
    ("Input", x),
    ("Base output", base_output),
    ("LoRA output", lora_output),
    ("Combined output", combined_output),
):
    print(f"{label} shape: {tensor.shape}")

# Parameter counting
base_params = sum(weight.numel() for weight in base_layer.parameters())
lora_params = sum(weight.numel() for weight in lora.parameters())
reduction_pct = (1 - lora_params/base_params)*100
print(f"\nBase layer parameters: {base_params:,}")
print(f"LoRA parameters: {lora_params:,}")
print(f"Reduction: {reduction_pct:.1f}%")

## 4. Comparing LoRA Ranks

The rank `r` is a crucial hyperparameter in LoRA. Let's explore its impact.

In [None]:
def analyze_lora_ranks(input_dim=768, output_dim=3072):
    """Plot and tabulate how the LoRA rank trades parameters against quality.

    Args:
        input_dim: Input width of the weight matrix being adapted.
        output_dim: Output width of the weight matrix being adapted.

    For ranks 1..256 (powers of two) the function computes the LoRA
    parameter count ``input_dim * r + r * output_dim`` and its share of the
    dense ``input_dim * output_dim`` matrix, shows a two-panel figure, and
    prints a table of rank recommendations. Returns None.

    The "quality" values are a hypothetical saturating curve
    ``100 * (1 - exp(-r / 16))``; they are simulated, not measured.
    """

    ranks = [1, 2, 4, 8, 16, 32, 64, 128, 256]

    # The dense parameter count does not depend on rank: compute it once,
    # before the loop, rather than relying on the loop's last iteration.
    full_params = input_dim * output_dim

    results = []
    for rank in ranks:
        # Calculate parameters
        lora_params = input_dim * rank + rank * output_dim

        # Simulate quality (hypothetical)
        quality = 100 * (1 - np.exp(-rank / 16))  # Saturating curve

        results.append({
            'Rank': rank,
            'Parameters': lora_params,
            'Percentage': lora_params / full_params * 100,
            'Quality': quality
        })

    df_ranks = pd.DataFrame(results)

    # Visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Parameters vs rank
    ax1.plot(df_ranks['Rank'], df_ranks['Parameters'] / 1000, 'b-o', label='LoRA params')
    ax1.axhline(y=full_params / 1000, color='red', linestyle='--', label='Full params')
    ax1.set_xlabel('LoRA Rank (r)', fontsize=12)
    ax1.set_ylabel('Parameters (thousands)', fontsize=12)
    ax1.set_title('Parameter Count vs LoRA Rank', fontsize=14)
    ax1.set_xscale('log', base=2)
    ax1.set_yscale('log')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Quality vs parameters trade-off
    ax2.plot(df_ranks['Percentage'], df_ranks['Quality'], 'g-o')

    # Add rank labels.
    # Iterate the columns directly: DataFrame.iterrows() would upcast each
    # mixed int/float row to float and the labels would read "r=16.0".
    labelled_ranks = {1, 4, 16, 64, 256}
    for rank, percentage, quality in zip(df_ranks['Rank'],
                                         df_ranks['Percentage'],
                                         df_ranks['Quality']):
        if rank in labelled_ranks:
            ax2.annotate(f"r={rank}",
                         (percentage, quality),
                         xytext=(5, -5), textcoords='offset points', fontsize=9)

    ax2.set_xlabel('Parameters (% of full model)', fontsize=12)
    ax2.set_ylabel('Relative Quality (%)', fontsize=12)
    ax2.set_title('Quality vs Parameter Efficiency', fontsize=14)
    ax2.grid(True, alpha=0.3)

    # Add sweet spot (rank 16 is highlighted as the default recommendation)
    sweet_spot = df_ranks[df_ranks['Rank'] == 16].iloc[0]
    ax2.scatter(sweet_spot['Percentage'], sweet_spot['Quality'],
                s=200, color='red', marker='*', zorder=5)
    ax2.annotate('Sweet spot',
                 (sweet_spot['Percentage'], sweet_spot['Quality']),
                 xytext=(10, 10), textcoords='offset points',
                 fontsize=10, color='red', weight='bold',
                 arrowprops=dict(arrowstyle='->', color='red'))

    plt.tight_layout()
    plt.show()

    # Recommendations
    print("\n📊 LoRA Rank Recommendations:")
    print("=" * 50)
    print(f"{'Rank':<10} {'Use Case':<30} {'Trade-off':<20}")
    print("-" * 50)
    recommendations = [
        (1, "Extreme efficiency", "Limited capacity"),
        (4, "Very efficient", "Good for simple tasks"),
        (8, "Efficient", "Balanced"),
        (16, "Recommended default", "Best trade-off"),
        (32, "Higher capacity", "Diminishing returns"),
        (64, "Complex tasks", "Less efficient"),
        (128, "Maximum flexibility", "High memory use")
    ]

    # One lookup table instead of filtering the frame once per printed row
    percentage_by_rank = dict(zip(df_ranks['Rank'], df_ranks['Percentage']))
    for rank, use_case, tradeoff in recommendations:
        params_pct = percentage_by_rank[rank]
        print(f"{rank:<10} {use_case:<30} {tradeoff:<20} ({params_pct:.1f}% params)")

analyze_lora_ranks()

## 5. QLoRA: Quantized LoRA

QLoRA enables fine-tuning of very large models by combining quantization with LoRA.

In [None]:
def visualize_qlora_concept():
    """Show QLoRA's memory footprint next to other precisions, plus a schematic.

    Left panel: grouped bars of weight memory (GB) for five model sizes at
    FP32, FP16, INT8 and INT4 + LoRA, with dashed reference lines for common
    GPU capacities. Right panel: a stacked-box sketch of the QLoRA pipeline
    and a checklist of its ingredients. Displays the figure; returns None.
    """

    # Model sizes and memory requirements
    model_sizes = ['7B', '13B', '30B', '65B', '70B']
    params = np.array([7, 13, 30, 65, 70]) * 1e9

    # Weight memory in GB = parameter count * bytes per parameter / 1e9
    fp32_memory = params * 4 / 1e9
    fp16_memory = params * 2 / 1e9
    int8_memory = params * 1 / 1e9
    int4_memory = params * 0.5 / 1e9

    # LoRA adapters: assumed ~1% extra parameters (rank 64), stored in FP16
    lora_memory = params * 0.01 * 2 / 1e9

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # ---- Left panel: memory comparison ----
    x = np.arange(len(model_sizes))
    width = 0.15

    # (offset in bar widths, heights, legend label, color)
    bar_groups = [
        (-1.5, fp32_memory, 'FP32', 'darkred'),
        (-0.5, fp16_memory, 'FP16', 'orange'),
        (0.5, int8_memory, 'INT8', 'green'),
        (1.5, int4_memory + lora_memory, 'INT4 + LoRA (QLoRA)', 'darkgreen'),
    ]
    for offset, heights, label, color in bar_groups:
        ax1.bar(x + offset * width, heights, width, label=label, color=color)

    ax1.set_xlabel('Model Size', fontsize=12)
    ax1.set_ylabel('Memory Required (GB)', fontsize=12)
    ax1.set_title('Memory Requirements by Quantization Method', fontsize=14)
    ax1.set_xticks(x)
    ax1.set_xticklabels(model_sizes)
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # Dashed reference lines at common GPU capacities
    gpu_memories = {'A100 80GB': 80, 'A100 40GB': 40, 'V100 32GB': 32, 'T4 16GB': 16}
    label_x = len(model_sizes)-0.5
    for gpu, memory in gpu_memories.items():
        ax1.axhline(y=memory, color='gray', linestyle='--', alpha=0.5)
        ax1.text(label_x, memory+2, gpu, fontsize=9, color='gray')

    # ---- Right panel: QLoRA architecture ----
    ax2.axis('off')
    ax2.set_title('QLoRA Architecture', fontsize=14)

    # (box label, vertical center, fill color), listed top to bottom
    stages = [
        ('Input', 0.9, 'lightblue'),
        ('Frozen 4-bit Model', 0.7, 'lightcoral'),
        ('LoRA Adapters (FP16)', 0.5, 'lightgreen'),
        ('Output', 0.3, 'lightblue'),
    ]
    last_index = len(stages) - 1

    for index, (stage_name, center_y, fill) in enumerate(stages):
        box = plt.Rectangle((0.2, center_y-0.05), 0.6, 0.1,
                            facecolor=fill, edgecolor='black')
        ax2.add_patch(box)
        ax2.text(0.5, center_y, stage_name, ha='center', va='center',
                 fontsize=11, weight='bold')

        # Every box except the last gets a downward arrow to the next one
        if index < last_index:
            ax2.arrow(0.5, center_y-0.05, 0, -0.07,
                      head_width=0.02, head_length=0.02, fc='black', ec='black')

    # Side notes next to the two middle boxes
    side_notes = [
        (0.7, 'Quantized\n(saves memory)'),
        (0.5, 'Trainable\n(high precision)'),
    ]
    for note_y, note in side_notes:
        ax2.text(0.85, note_y, note, fontsize=9, ha='left', va='center')

    # Checklist of QLoRA ingredients, stacked below the diagram
    key_points = [
        "✓ 4-bit base model (NF4 quantization)",
        "✓ 16-bit LoRA adapters",
        "✓ Gradient checkpointing",
        "✓ Paged optimizers",
        "✓ Can fine-tune 65B on single GPU!"
    ]
    for index, point in enumerate(key_points):
        ax2.text(0.05, 0.15 - index*0.03, point, fontsize=10, transform=ax2.transAxes)

    ax2.set_xlim(0, 1)
    ax2.set_ylim(0, 1)

    plt.tight_layout()
    plt.show()

visualize_qlora_concept()

# Simulate QLoRA training memory usage
print("\n--- QLoRA Memory Usage Simulation ---")

def calculate_qlora_memory(model_size_b, rank=64, batch_size=4, seq_len=2048):
    """Estimate the GPU memory (in GB) needed for QLoRA training.

    This is a back-of-the-envelope model, not a measurement.

    Args:
        model_size_b: Base model size in billions of parameters.
        rank: LoRA rank. The adapter size is assumed to be ~1% of the base
            model at rank 64 and to scale linearly with rank, so the default
            reproduces the flat 1% assumption.
        batch_size: Training batch size (affects activations only).
        seq_len: Sequence length in tokens (affects activations only).

    Returns:
        dict mapping component name to GB, with keys 'Base Model (4-bit)',
        'LoRA Parameters', 'Optimizer States', 'Activations', 'Gradients'
        and 'Total' (the sum of the other five).
    """

    # Base model (4-bit): 0.5 bytes per parameter
    base_memory = model_size_b * 1e9 * 0.5 / 1e9  # GB

    # LoRA parameters (FP16). LoRA's parameter count is linear in the rank,
    # so scale the "~1% at rank 64" rule of thumb accordingly. Previously
    # `rank` was accepted but had no effect on the estimate.
    num_lora_params = model_size_b * 1e9 * 0.01 * (rank / 64)
    lora_memory = num_lora_params * 2 / 1e9  # GB

    # Optimizer states (Adam, FP32)
    optimizer_memory = num_lora_params * 8 / 1e9  # 2 states * 4 bytes

    # Activations (rough estimate)
    hidden_size = int(np.sqrt(model_size_b * 1e9 / 100))  # Rough estimate
    activation_memory = batch_size * seq_len * hidden_size * 4 * 32 / 1e9  # GB

    # Gradients exist only for the LoRA parameters
    gradient_memory = lora_memory  # Same as parameters

    total = base_memory + lora_memory + optimizer_memory + activation_memory + gradient_memory

    return {
        'Base Model (4-bit)': base_memory,
        'LoRA Parameters': lora_memory,
        'Optimizer States': optimizer_memory,
        'Activations': activation_memory,
        'Gradients': gradient_memory,
        'Total': total
    }

# Calculate for different model sizes
# GPU tiers in ascending capacity: (exclusive upper bound in GB, message).
# The first tier whose bound exceeds the estimated total is reported.
gpu_tiers = (
    (16, "T4 (16GB) ✓"),
    (24, "RTX 3090/4090 (24GB) ✓"),
    (40, "A100 40GB ✓"),
    (80, "A100 80GB ✓"),
)

for model_size in [7, 13, 30, 65]:
    memory = calculate_qlora_memory(model_size)

    # Per-component breakdown (the dict's last entry is the total)
    print(f"\n{model_size}B Model:")
    for component, mem in memory.items():
        print(f"  {component:<20}: {mem:>6.1f} GB")
    print(f"  {'='*30}")

    # Smallest single GPU that fits, or a multi-GPU notice if none does
    verdict = next(
        (message for limit, message in gpu_tiers if memory['Total'] < limit),
        "Requires multiple GPUs",
    )
    print("  Can fit on: ", end="")
    print(verdict)

## 6. Adapter Modules

Adapters are another popular parameter-efficient method, especially for multi-task learning.

In [None]:
class AdapterModule(nn.Module):
    """Bottleneck adapter: down-project, ReLU, up-project, added back to the input.

    Both projections start with tiny weights (normal, std 1e-3) and zero
    biases, so a freshly built adapter returns almost exactly its input.
    """

    def __init__(self, hidden_size: int, adapter_size: int = 64):
        super().__init__()
        self.down_project = nn.Linear(hidden_size, adapter_size)
        self.activation = nn.ReLU()
        self.up_project = nn.Linear(adapter_size, hidden_size)

        # Near-identity start: same initialisation for both projections,
        # down-projection first
        for projection in (self.down_project, self.up_project):
            nn.init.normal_(projection.weight, std=1e-3)
            nn.init.zeros_(projection.bias)

    def forward(self, x):
        """Return ``x`` plus the bottleneck branch's output (same shape as ``x``)."""
        squeezed = self.down_project(x)
        update = self.up_project(self.activation(squeezed))
        return x + update

# Visualize adapter architecture
def visualize_adapter_architecture() -> None:
    """Show how adapters integrate into transformer layers.

    Draws a two-panel schematic with hand-placed matplotlib patches:
    the left panel shows the internal structure of one adapter (down-project,
    ReLU, up-project, plus a skip connection), the right panel shows a stack
    of transformer sub-layers with adapters inserted, marking each box as
    frozen or trainable. All coordinates are in each axes' 0..1 data range.
    Displays the figure and returns nothing.
    """
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
    
    # Adapter module structure
    ax1.axis('off')
    ax1.set_title('Adapter Module Structure', fontsize=14)
    
    # Draw adapter architecture
    # 'pos' is the center of each shape and 'size' its (width, height).
    components = [
        {'name': 'Input (d)', 'pos': (0.5, 0.9), 'size': (0.3, 0.08)},
        {'name': 'Down-project\n(d → r)', 'pos': (0.3, 0.7), 'size': (0.2, 0.08)},
        {'name': 'ReLU', 'pos': (0.3, 0.5), 'size': (0.2, 0.08)},
        {'name': 'Up-project\n(r → d)', 'pos': (0.3, 0.3), 'size': (0.2, 0.08)},
        {'name': '+', 'pos': (0.5, 0.15), 'size': (0.08, 0.08)},
        {'name': 'Output (d)', 'pos': (0.5, 0.05), 'size': (0.3, 0.08)}
    ]
    
    for comp in components:
        if comp['name'] == '+':
            # The residual sum is drawn as a circle instead of a box
            circle = plt.Circle(comp['pos'], 0.04, facecolor='yellow', edgecolor='black')
            ax1.add_patch(circle)
            ax1.text(comp['pos'][0], comp['pos'][1], comp['name'], 
                    ha='center', va='center', fontsize=16, weight='bold')
        else:
            # Rectangle takes its lower-left corner, so shift from the center
            rect = plt.Rectangle((comp['pos'][0] - comp['size'][0]/2, 
                                comp['pos'][1] - comp['size'][1]/2),
                               comp['size'][0], comp['size'][1],
                               facecolor='lightblue', edgecolor='black')
            ax1.add_patch(rect)
            ax1.text(comp['pos'][0], comp['pos'][1], comp['name'], 
                    ha='center', va='center', fontsize=10)
    
    # Draw connections
    # Main path (blue): input -> down-project -> ReLU -> up-project -> '+'
    ax1.arrow(0.4, 0.85, 0, -0.1, head_width=0.02, head_length=0.02, fc='blue', ec='blue')
    ax1.arrow(0.3, 0.65, 0, -0.1, head_width=0.02, head_length=0.02, fc='blue', ec='blue')
    ax1.arrow(0.3, 0.45, 0, -0.1, head_width=0.02, head_length=0.02, fc='blue', ec='blue')
    ax1.arrow(0.3, 0.25, 0.15, -0.05, head_width=0.02, head_length=0.02, fc='blue', ec='blue')
    
    # Skip connection (red, dashed): from the input straight down to '+'
    ax1.plot([0.6, 0.6, 0.54], [0.85, 0.15, 0.15], 'r--', linewidth=2)
    ax1.arrow(0.54, 0.15, -0.02, 0, head_width=0.02, head_length=0.01, fc='red', ec='red')
    ax1.text(0.65, 0.5, 'Skip\nconnection', fontsize=9, color='red')
    
    # Output: short arrow from '+' to the output box
    ax1.arrow(0.5, 0.11, 0, -0.02, head_width=0.02, head_length=0.02, fc='blue', ec='blue')
    
    ax1.set_xlim(0, 1)
    ax1.set_ylim(0, 1)
    
    # Transformer with adapters
    ax2.axis('off')
    ax2.set_title('Adapters in Transformer Layer', fontsize=14)
    
    # Draw transformer layer with adapters
    # Listed top to bottom; 'y' is the vertical center of each box and
    # 'frozen' selects the box outline style and the icon appended to the label.
    layer_components = [
        {'name': 'Multi-Head\nAttention', 'y': 0.85, 'color': 'lightcoral', 'frozen': True},
        {'name': 'Adapter', 'y': 0.72, 'color': 'lightgreen', 'frozen': False},
        {'name': 'Layer Norm', 'y': 0.6, 'color': 'lightyellow', 'frozen': True},
        {'name': 'Feed Forward', 'y': 0.45, 'color': 'lightcoral', 'frozen': True},
        {'name': 'Adapter', 'y': 0.32, 'color': 'lightgreen', 'frozen': False},
        {'name': 'Layer Norm', 'y': 0.2, 'color': 'lightyellow', 'frozen': True}
    ]
    
    for i, comp in enumerate(layer_components):
        # Adapter boxes are narrower than the others but share the same center (x = 0.5)
        width = 0.5 if comp['name'] != 'Adapter' else 0.3
        x_pos = 0.25 if comp['name'] != 'Adapter' else 0.35
        
        # Trainable boxes: thick solid outline; frozen boxes: thin dashed outline
        rect = plt.Rectangle((x_pos, comp['y']-0.04), width, 0.08,
                           facecolor=comp['color'], 
                           edgecolor='black',
                           linewidth=2 if not comp['frozen'] else 1,
                           linestyle='-' if not comp['frozen'] else '--')
        ax2.add_patch(rect)
        
        text = comp['name']
        if comp['frozen']:
            text += ' ❄️'
        else:
            text += ' 🔥'
            
        ax2.text(0.5, comp['y'], text, ha='center', va='center', fontsize=10,
                weight='bold' if not comp['frozen'] else 'normal')
        
        # Connect each box to the one below it (none after the last box)
        if i < len(layer_components) - 1:
            ax2.arrow(0.5, comp['y']-0.04, 0, -0.04, 
                     head_width=0.02, head_length=0.01, fc='black', ec='black')
    
    # Add legend
    ax2.text(0.8, 0.9, '❄️ Frozen', fontsize=10)
    ax2.text(0.8, 0.85, '🔥 Trainable', fontsize=10)
    
    # Add parameter counts
    ax2.text(0.5, 0.05, 'Only adapter parameters are updated during training\n'
                        'Typically 1-5% of model parameters', 
            ha='center', fontsize=10, style='italic')
    
    ax2.set_xlim(0, 1)
    ax2.set_ylim(0, 1)
    
    plt.tight_layout()
    plt.show()

visualize_adapter_architecture()

# Demonstrate adapter effectiveness
print("\n--- Adapter Module Demo ---")

hidden_size = 768
adapter_sizes = [8, 16, 32, 64, 128, 256]

print(f"Hidden size: {hidden_size}")
print(f"\n{'Adapter Size':<15} {'Parameters':<15} {'Compression':<15}")
print("-" * 45)

# For each bottleneck width, compare the adapter's parameter count with a
# dense hidden_size x hidden_size matrix.
for adapter_size in adapter_sizes:
    adapter = AdapterModule(hidden_size, adapter_size)
    params = sum(p.numel() for p in adapter.parameters())
    compression = hidden_size * hidden_size / params

    # Attach the unit to the number before padding the column; padding the
    # number first left the "x" stranded after a run of spaces.
    compression_label = f"{compression:.1f}x"
    print(f"{adapter_size:<15} {params:<15,} {compression_label:<15}")

# Test adapter forward pass
adapter = AdapterModule(hidden_size, 64)
x = torch.randn(2, 10, hidden_size)
output = adapter(x)

print("\nForward pass test:")
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
# With the near-zero initial weights, the output should stay within 0.1 of the input
print(f"Residual preserved: {torch.allclose(output, x, atol=1e-1)}")

## 7. Prompt Tuning Methods

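Prompt-based methods keep the model frozen and train only a small set of added parameters: soft prompt tuning prepends learnable embeddings to the input, while prefix tuning prepends learnable vectors to the keys and values of every layer.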

In [None]:
def visualize_prompt_methods() -> None:
    """Compare different prompt-based fine-tuning methods.

    Draws a 2x2 figure: (1) an example few-shot text prompt, (2) soft prompt
    tuning shown as learnable tokens in front of the input tokens, (3) prefix
    tuning shown as learnable key/value slots in each of four layers, and
    (4) a grouped bar chart with hard-coded scores per method.
    Displays the figure and returns nothing.
    """
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # 1. Discrete Prompting
    ax = axes[0, 0]
    ax.axis('off')
    ax.set_title('Discrete Prompting (Zero/Few-shot)', fontsize=14)
    
    # Example prompt
    # Each entry is (text, background color); the number of '\n' characters
    # in the text determines how tall its box is drawn.
    prompt_parts = [
        ('Task: Sentiment Analysis\n', 'orange'),
        ('Example: "Great product!" → Positive\n', 'lightblue'),
        ('Example: "Terrible service" → Negative\n\n', 'lightblue'),
        ('Input: "Amazing experience"\n', 'lightgreen'),
        ('Output: ', 'gray')
    ]
    
    # Stack the boxes downward, starting at y = 0.8
    y_pos = 0.8
    for text, color in prompt_parts:
        lines = text.count('\n')
        rect = plt.Rectangle((0.1, y_pos - lines*0.05), 0.8, lines*0.05 + 0.03,
                           facecolor=color, alpha=0.3, edgecolor='black')
        ax.add_patch(rect)
        ax.text(0.5, y_pos - lines*0.025 + 0.015, text.strip(), 
               ha='center', va='center', fontsize=10)
        y_pos -= lines*0.05 + 0.04
    
    ax.text(0.5, 0.1, '❌ No gradient updates\n✅ No training required', 
           ha='center', fontsize=10, style='italic')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    
    # 2. Soft Prompt Tuning
    ax = axes[0, 1]
    ax.axis('off')
    ax.set_title('Soft Prompt Tuning', fontsize=14)
    
    # Visualize soft prompts
    # Five learnable prompt slots (red) followed by four ordinary tokens (blue)
    tokens = ['[P1]', '[P2]', '[P3]', '[P4]', '[P5]', 'The', 'movie', 'was', '...']
    colors = ['red', 'red', 'red', 'red', 'red', 'lightblue', 'lightblue', 'lightblue', 'lightblue']
    
    for i, (token, color) in enumerate(zip(tokens, colors)):
        rect = plt.Rectangle((0.05 + i*0.1, 0.6), 0.08, 0.1,
                           facecolor=color, alpha=0.5 if color == 'red' else 0.3,
                           edgecolor='black')
        ax.add_patch(rect)
        ax.text(0.09 + i*0.1, 0.65, token, ha='center', va='center', 
               fontsize=9, weight='bold' if color == 'red' else 'normal')
    
    # Embeddings
    ax.text(0.5, 0.5, '↓ Embeddings ↓', ha='center', fontsize=11)
    
    # Embedding matrix
    # One column per token (9) and four stacked cells per column; the first
    # five columns (the soft prompts) are red, the rest blue.
    for i in range(9):
        for j in range(4):
            # NOTE(review): `val` is never used below; this call only advances
            # NumPy's global RNG state. Cell colors are fixed, not data-driven.
            val = np.random.randn() * 0.1
            color = 'red' if i < 5 else 'blue'
            alpha = 0.7 if i < 5 else 0.3
            rect = plt.Rectangle((0.05 + i*0.1, 0.3 - j*0.04), 0.08, 0.03,
                               facecolor=color, alpha=alpha)
            ax.add_patch(rect)
    
    ax.text(0.5, 0.1, '🔥 Learnable soft prompts (continuous)\n❄️ Frozen input embeddings', 
           ha='center', fontsize=10)
    
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 0.8)
    
    # 3. Prefix Tuning
    ax = axes[1, 0]
    ax.axis('off')
    ax.set_title('Prefix Tuning', fontsize=14)
    
    # Draw transformer layers
    # One row per layer, top to bottom, 0.15 apart
    n_layers = 4
    for layer in range(n_layers):
        y = 0.8 - layer * 0.15
        
        # Prefix keys/values
        for i in range(3):
            rect = plt.Rectangle((0.1 + i*0.08, y), 0.06, 0.08,
                               facecolor='red', alpha=0.5, edgecolor='black')
            ax.add_patch(rect)
        
        ax.text(0.19, y+0.04, 'Prefix K,V', ha='center', va='center', fontsize=8)
        
        # Regular keys/values
        for i in range(5):
            rect = plt.Rectangle((0.4 + i*0.08, y), 0.06, 0.08,
                               facecolor='lightblue', alpha=0.3, edgecolor='black')
            ax.add_patch(rect)
        
        ax.text(0.6, y+0.04, 'Input K,V', ha='center', va='center', fontsize=8)
        ax.text(0.05, y+0.04, f'L{layer+1}', ha='center', va='center', fontsize=10, weight='bold')
    
    ax.text(0.5, 0.15, 'Learnable prefix prepended to keys/values in each layer\n'
                       'Affects attention computation throughout the model',
           ha='center', fontsize=10, style='italic')
    
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    
    # 4. Method comparison
    ax = axes[1, 1]
    
    # Hard-coded scores, one entry per method in `methods`
    methods = ['Manual\nPrompt', 'Soft\nPrompt', 'Prefix\nTuning', 'P-Tuning\nv2']
    params = [0, 0.01, 0.1, 0.1]
    flexibility = [20, 70, 85, 90]
    ease = [100, 90, 70, 60]
    
    x = np.arange(len(methods))
    width = 0.25
    
    # NOTE(review): `params` tops out at 0.1 while the other two series reach
    # 100 on the same axis, so the parameter bars will be nearly invisible.
    bars1 = ax.bar(x - width, params, width, label='Parameters (%)', color='lightcoral')
    bars2 = ax.bar(x, flexibility, width, label='Flexibility', color='lightgreen')
    bars3 = ax.bar(x + width, ease, width, label='Ease of Use', color='lightblue')
    
    ax.set_xlabel('Method', fontsize=12)
    ax.set_ylabel('Score', fontsize=12)
    ax.set_title('Prompt Method Comparison', fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(methods)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')
    
    plt.tight_layout()
    plt.show()

visualize_prompt_methods()

# Demonstrate soft prompt implementation
print("\n--- Soft Prompt Implementation ---")

class SoftPromptModel(nn.Module):
    """Prepend learnable soft-prompt embeddings to frozen token embeddings.

    Args:
        n_prompts: Number of learnable prompt vectors.
        embed_dim: Embedding width shared by prompts and tokens.
        vocab_size: Size of the (frozen) token embedding table.

    Only ``soft_prompts`` (``n_prompts * embed_dim`` values) is trainable;
    the embedding table is excluded from gradient updates.
    """

    def __init__(self, n_prompts=10, embed_dim=768, vocab_size=50000):
        super().__init__()
        # Frozen embeddings (would be from pretrained model).
        # Freeze via requires_grad_(): assigning `module.requires_grad = False`
        # only creates an unused attribute on the Module and leaves the
        # embedding weight trainable.
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.embeddings.requires_grad_(False)

        # Learnable soft prompts
        self.soft_prompts = nn.Parameter(torch.randn(n_prompts, embed_dim) * 0.01)

    def forward(self, input_ids):
        """Return embeddings of shape (batch, n_prompts + seq_len, embed_dim).

        ``input_ids`` is a (batch, seq_len) tensor of token indices.
        """
        batch_size = input_ids.shape[0]

        # Get input embeddings
        input_embeds = self.embeddings(input_ids)

        # Expand soft prompts for batch (a broadcast view, no data copy)
        prompt_embeds = self.soft_prompts.unsqueeze(0).expand(batch_size, -1, -1)

        # Concatenate along the sequence axis, prompts first
        combined_embeds = torch.cat([prompt_embeds, input_embeds], dim=1)

        return combined_embeds

# Create and test model
model = SoftPromptModel(n_prompts=5, embed_dim=768)
input_ids = torch.randint(0, 50000, (2, 10))
output = model(input_ids)

# Count parameters, split by whether they would receive gradients
trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_count = sum(p.numel() for p in model.parameters())
n_prompt_tokens = model.soft_prompts.shape[0]

print("Model created with:")
print(f"  Soft prompts: {model.soft_prompts.shape}")
print(f"  Trainable parameters: {trainable_count:,}")
print(f"  Total parameters: {total_count:,}")
print(f"\nInput shape: {input_ids.shape}")
print(f"Output shape: {output.shape} (includes {n_prompt_tokens} prompt tokens)")

## 8. Training Strategies and Best Practices

Let's explore optimal training strategies for different fine-tuning methods.

In [None]:
# Learning rate schedules for fine-tuning
def plot_finetuning_schedules():
    """Plot warmup-based LR schedules beside a table of per-method hyperparameters.

    The left axes shows three schedules (cosine, linear, constant) over 10,000
    steps, each with a 500-step linear warmup to a peak of 5e-5. The right axes
    holds a static table of suggested settings per fine-tuning method.
    """
    steps = np.arange(0, 10000)

    def warmup_ramp(step, warmup, max_lr):
        # Linear climb from zero shared by all three schedules.
        return max_lr * step / warmup

    def linear_warmup_cosine(step, warmup=500, total=10000, max_lr=5e-5):
        if step >= warmup:
            progress = (step - warmup) / (total - warmup)
            return max_lr * 0.5 * (1 + np.cos(np.pi * progress))
        return warmup_ramp(step, warmup, max_lr)

    def linear_warmup_linear(step, warmup=500, total=10000, max_lr=5e-5):
        if step >= warmup:
            return max_lr * (1 - (step - warmup) / (total - warmup))
        return warmup_ramp(step, warmup, max_lr)

    def constant_warmup(step, warmup=500, max_lr=5e-5):
        if step >= warmup:
            return max_lr
        return warmup_ramp(step, warmup, max_lr)

    # Evaluate every schedule at every step, keyed by its legend label
    schedule_fns = {
        'Cosine (recommended)': linear_warmup_cosine,
        'Linear': linear_warmup_linear,
        'Constant': constant_warmup,
    }
    schedules = {
        label: [fn(s) for s in steps]
        for label, fn in schedule_fns.items()
    }

    _, (curve_ax, table_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: the schedule curves
    for label, values in schedules.items():
        curve_ax.plot(steps, values, label=label, linewidth=2)

    # Mark where warmup ends
    curve_ax.axvline(x=500, color='red', linestyle='--', alpha=0.5)
    curve_ax.text(500, 4e-5, 'Warmup', rotation=90, va='bottom', ha='right', color='red')
    curve_ax.set_xlabel('Training Step', fontsize=12)
    curve_ax.set_ylabel('Learning Rate', fontsize=12)
    curve_ax.set_title('Learning Rate Schedules for Fine-tuning', fontsize=14)
    curve_ax.legend()
    curve_ax.grid(True, alpha=0.3)

    # Right panel: method-specific recommendations, shown as a table only
    table_ax.axis('off')

    columns = ['Method', 'Learning Rate', 'Warmup', 'Batch Size', 'Epochs']
    rows = [
        ['Full Fine-tuning', '2e-5', '6%', '16-32', '3-5'],
        ['LoRA', '1e-4', '3%', '128-256', '10-20'],
        ['QLoRA', '2e-4', '3%', '4-16', '3-5'],
        ['Adapters', '1e-3', '1%', '64-128', '10-20'],
        ['Prompt Tuning', '1e-2', '0%', '256-512', '20-50'],
    ]
    recommendations = pd.DataFrame(rows, columns=columns)

    table = table_ax.table(
        cellText=recommendations.values,
        colLabels=recommendations.columns,
        cellLoc='center',
        loc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 2)

    # Row 0 of the table is the header row; give it a green banner look
    for col in range(len(recommendations.columns)):
        header_cell = table[(0, col)]
        header_cell.set_facecolor('#4CAF50')
        header_cell.set_text_props(weight='bold', color='white')

    table_ax.set_title('Hyperparameter Recommendations by Method', fontsize=14, pad=20)

    plt.tight_layout()
    plt.show()

plot_finetuning_schedules()

# Print the best-practice checklist: a headline per tip, then its rationale
print("\n🎯 Fine-tuning Best Practices:")
print("=" * 60)

tips = [
    ("1. Start with LoRA", "Good balance of efficiency and performance"),
    ("2. Use gradient checkpointing", "Trade compute for memory"),
    ("3. Monitor validation loss", "Stop early if overfitting"),
    ("4. Layer-wise learning rates", "Lower LR for earlier layers"),
    ("5. Mixed precision training", "2x speedup with minimal impact"),
    ("6. Careful data preprocessing", "Match pretrained model's format"),
    ("7. Warmup is crucial", "Prevents catastrophic forgetting"),
    ("8. Save checkpoints frequently", "Resume from best checkpoint"),
]

for tip, explanation in tips:
    # One call emits both lines; sep supplies the newline between them
    print(f"\n{tip}", f"   → {explanation}", sep="\n")

## 9. Multi-task Fine-tuning

Adapters and LoRA are particularly well-suited for multi-task scenarios.

In [None]:
def visualize_multitask_finetuning():
    """Draw a 2x2 figure comparing multi-task fine-tuning setups.

    Panels: separate full models per task, a shared base with per-task heads,
    a frozen base with task-specific adapters, and a grouped bar chart of
    memory / performance / flexibility for four approaches.
    """
    task_names = ['Sentiment', 'NER', 'QA']
    task_colors = ['lightcoral', 'lightgreen', 'lightblue']

    def open_diagram(panel, title):
        # Schematic panels have no axes, only a title.
        panel.axis('off')
        panel.set_title(title, fontsize=14)

    def add_box(panel, corner, box_w, box_h, **style):
        # Rectangle anchored at its lower-left corner.
        panel.add_patch(plt.Rectangle(corner, box_w, box_h, **style))

    _, axes = plt.subplots(2, 2, figsize=(15, 12))

    # --- Panel 1: one full model (plus head) per task ---
    panel = axes[0, 0]
    open_diagram(panel, 'Approach 1: Separate Models')

    for i, (task, color) in enumerate(zip(task_names, task_colors)):
        left = 0.1 + i*0.3
        centre = 0.2 + i*0.3

        # Base model
        add_box(panel, (left, 0.3), 0.2, 0.4,
                facecolor=color, edgecolor='black', linewidth=2)
        panel.text(centre, 0.5, f'Full Model\n{task}',
                   ha='center', va='center', fontsize=10, weight='bold')

        # Task head
        add_box(panel, (left, 0.75), 0.2, 0.1,
                facecolor='yellow', edgecolor='black')
        panel.text(centre, 0.8, 'Task Head', ha='center', va='center', fontsize=8)

    panel.text(0.5, 0.15, '❌ 3x memory\n❌ No knowledge sharing\n✅ Best per-task performance',
               ha='center', fontsize=10)

    # --- Panel 2: shared base model feeding per-task heads ---
    panel = axes[0, 1]
    open_diagram(panel, 'Approach 2: Shared Model + Task Heads')

    add_box(panel, (0.3, 0.2), 0.4, 0.4,
            facecolor='lightgray', edgecolor='black', linewidth=2)
    panel.text(0.5, 0.4, 'Shared\nBase Model', ha='center', va='center',
               fontsize=12, weight='bold')

    for i, (task, color) in enumerate(zip(task_names, task_colors)):
        add_box(panel, (0.2 + i*0.2, 0.65), 0.15, 0.1,
                facecolor=color, edgecolor='black')
        head_x = 0.275 + i*0.2
        panel.text(head_x, 0.7, task, ha='center', va='center', fontsize=8)

        # Line from the head down to the top-centre of the shared base
        panel.plot([head_x, 0.5], [0.65, 0.6], 'k-', linewidth=1)

    panel.text(0.5, 0.05, '✅ Memory efficient\n⚠️ Task interference\n✅ Some knowledge sharing',
               ha='center', fontsize=10)

    # --- Panel 3: frozen base with one adapter per task ---
    panel = axes[1, 0]
    open_diagram(panel, 'Approach 3: Task-Specific Adapters')

    add_box(panel, (0.3, 0.2), 0.4, 0.4,
            facecolor='lightblue', edgecolor='black',
            linewidth=2, linestyle='--')
    panel.text(0.5, 0.4, 'Frozen\nBase Model', ha='center', va='center',
               fontsize=12, weight='bold')
    panel.text(0.72, 0.4, '❄️', fontsize=20)

    adapter_positions = [(0.35, 0.3), (0.5, 0.35), (0.65, 0.3)]
    for number, ((ad_x, ad_y), color) in enumerate(zip(adapter_positions, task_colors), start=1):
        add_box(panel, (ad_x-0.04, ad_y), 0.08, 0.15,
                facecolor=color, edgecolor='black', alpha=0.7)
        panel.text(ad_x, ad_y+0.18, f'A{number}', ha='center', va='center', fontsize=8)

    # Task labels along the top, each with an arrow to its adapter's top edge
    for i, task in enumerate(task_names):
        label_x = 0.2 + i*0.2
        target_x, target_y = adapter_positions[i]
        panel.text(label_x, 0.75, task, ha='center', va='center',
                   fontsize=10, bbox=dict(boxstyle="round", facecolor=task_colors[i], alpha=0.5))
        panel.arrow(label_x, 0.72,
                    target_x - label_x,
                    target_y + 0.15 - 0.72,
                    head_width=0.02, head_length=0.02, fc='black', ec='black')

    panel.text(0.5, 0.05, '✅ Very memory efficient\n✅ No task interference\n✅ Easy to add tasks',
               ha='center', fontsize=10)

    # --- Panel 4: grouped bars comparing the approaches ---
    panel = axes[1, 1]

    approach_labels = ['Separate\nModels', 'Shared\nBase', 'Adapters', 'LoRA\nMixture']
    memory_scores = [300, 120, 105, 110]
    performance_scores = [100, 92, 95, 97]
    flexibility_scores = [60, 70, 95, 90]

    group_pos = np.arange(len(approach_labels))
    bar_w = 0.25

    panel.bar(group_pos - bar_w, memory_scores, bar_w, label='Memory (%)', color='lightcoral')
    panel.bar(group_pos, performance_scores, bar_w, label='Performance (%)', color='lightgreen')
    panel.bar(group_pos + bar_w, flexibility_scores, bar_w, label='Flexibility (%)', color='lightblue')

    panel.set_ylabel('Relative Score', fontsize=12)
    panel.set_title('Multi-task Approaches Comparison', fontsize=14)
    panel.set_xticks(group_pos)
    panel.set_xticklabels(approach_labels)
    panel.legend()
    panel.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.show()

# Draw the four-panel multi-task comparison figure
visualize_multitask_finetuning()

# Multi-task adapter implementation
print("\n--- Multi-task Adapter Implementation ---")

class MultiTaskAdapterModel(nn.Module):
    """One adapter and one output head per task, selected by name at call time.

    Args:
        base_model_dim: Feature width of the incoming hidden states.
        tasks: Task names to build an adapter and head for. ``None`` selects
            the built-in set ``['sentiment', 'ner', 'qa']``.
        adapter_size: Passed through to ``AdapterModule`` as its second argument.
        task_output_dims: Optional ``{task: n_outputs}`` mapping that adds to or
            overrides the built-in head sizes; required for any task name that
            is not one of the built-ins.

    Raises:
        ValueError: If a requested task has no known head size.
    """

    # Output width of each built-in task head.
    DEFAULT_HEAD_SIZES = {
        'sentiment': 3,  # 3 classes
        'ner': 9,        # 9 NER tags
        'qa': 2,         # Start/end positions
    }

    def __init__(self, base_model_dim=768, tasks=None,
                 adapter_size=64, task_output_dims=None):
        super().__init__()
        # Avoid a shared mutable default and take a private copy so later
        # edits to the caller's list (or to self.tasks) cannot leak across
        # instances.
        if tasks is None:
            tasks = ['sentiment', 'ner', 'qa']
        self.tasks = list(tasks)

        head_sizes = dict(self.DEFAULT_HEAD_SIZES)
        if task_output_dims:
            head_sizes.update(task_output_dims)

        unknown = [task for task in self.tasks if task not in head_sizes]
        if unknown:
            raise ValueError(
                f"No output size known for task(s) {unknown}; "
                f"pass them via task_output_dims"
            )

        # Task-specific adapters
        self.adapters = nn.ModuleDict({
            task: AdapterModule(base_model_dim, adapter_size)
            for task in self.tasks
        })

        # Task-specific heads, built for exactly the requested tasks so every
        # adapter has a matching head
        self.task_heads = nn.ModuleDict({
            task: nn.Linear(base_model_dim, head_sizes[task])
            for task in self.tasks
        })

    def forward(self, x, task):
        """Run ``x`` through the adapter and head registered for ``task``.

        Args:
            x: Hidden states whose last dimension is ``base_model_dim``.
            task: Name of one of the tasks given at construction.

        Returns:
            The selected head's output; the last dimension is that task's
            output size.

        Raises:
            KeyError: If ``task`` was not configured on this model.
        """
        if task not in self.adapters:
            raise KeyError(
                f"Unknown task {task!r}; available tasks: {self.tasks}"
            )

        # Apply task-specific adapter
        x = self.adapters[task](x)

        # Apply task-specific head
        output = self.task_heads[task](x)

        return output

# Instantiate with the default task set and make a dummy batch of hidden states
model = MultiTaskAdapterModel()
x = torch.randn(2, 10, 768)  # [batch, seq_len, hidden_dim]

# Parameter tallies, computed ahead of the report
total_params = sum(p.numel() for p in model.parameters())
adapter_params = sum(p.numel() for p in model.adapters['sentiment'].parameters())

print("Multi-task model created:")
print(f"  Tasks: {model.tasks}")
print(f"  Total parameters: {total_params:,}")
print(f"  Parameters per adapter: {adapter_params:,}")

# Route the same input through each task and show the resulting shape
print("\nTask-specific outputs:")
for task in model.tasks:
    output = model(x, task)
    print(f"  {task}: {output.shape}")

## 10. Summary and Recommendations

Let's create a decision tree to help choose the right fine-tuning method.

In [None]:
def create_decision_guide():
    """Draw a tree-style chart mapping model size and constraints to a method.

    Question nodes are circles, recommended methods are boxes, and straight
    lines connect them. A summary box of key insights sits underneath.
    """
    _, ax = plt.subplots(figsize=(14, 10))
    ax.axis('off')

    # Title
    ax.text(0.5, 0.95, 'Fine-tuning Method Decision Guide',
            ha='center', fontsize=16, weight='bold')

    # Question nodes as (label, centre, fill colour), listed level by level
    question_nodes = [
        # Level 1
        ('Model Size?', (0.5, 0.85), 'lightblue'),
        # Level 2
        ('<7B params', (0.3, 0.7), 'lightgreen'),
        ('>7B params', (0.7, 0.7), 'lightcoral'),
        # Level 3
        ('Memory\nConstraints?', (0.2, 0.55), 'lightyellow'),
        ('Multi-task?', (0.4, 0.55), 'lightyellow'),
        ('GPU Memory?', (0.6, 0.55), 'lightyellow'),
        ('Quality\nPriority?', (0.8, 0.55), 'lightyellow'),
    ]

    for label, centre, fill in question_nodes:
        ax.add_patch(plt.Circle(centre, 0.06, facecolor=fill, edgecolor='black'))
        ax.text(centre[0], centre[1], label,
                ha='center', va='center', fontsize=9, weight='bold')

    # Leaf recommendations as (label, centre); all share one fill colour
    leaf_fill = 'lightsteelblue'
    leaves = [
        ('BitFit or\nPrompt Tuning', (0.1, 0.35)),
        ('Full\nFine-tuning', (0.25, 0.35)),
        ('Adapters', (0.4, 0.35)),
        ('QLoRA', (0.55, 0.35)),
        ('LoRA', (0.7, 0.35)),
        ('LoRA or\nFull FT', (0.85, 0.35)),
    ]

    for label, (cx, cy) in leaves:
        # Box is 0.1 x 0.06, centred on (cx, cy)
        ax.add_patch(plt.Rectangle((cx - 0.05, cy - 0.03), 0.1, 0.06,
                                   facecolor=leaf_fill, edgecolor='black'))
        ax.text(cx, cy, label, ha='center', va='center', fontsize=8)

    # Edges, grouped by parent node; drawn in this order
    children_of = {
        # From root
        (0.5, 0.85): [(0.3, 0.7), (0.7, 0.7)],
        # From <7B
        (0.3, 0.7): [(0.2, 0.55), (0.4, 0.55)],
        # From >7B
        (0.7, 0.7): [(0.6, 0.55), (0.8, 0.55)],
        # To recommendations
        (0.2, 0.55): [(0.1, 0.35), (0.25, 0.35)],
        (0.4, 0.55): [(0.4, 0.35)],
        (0.6, 0.55): [(0.55, 0.35), (0.7, 0.35)],
        (0.8, 0.55): [(0.85, 0.35)],
    }

    for (x0, y0), children in children_of.items():
        for x1, y1 in children:
            ax.plot([x0, x1], [y0, y1], 'k-', linewidth=1)

    # Branch labels as (x, y, text, colour)
    branch_labels = [
        (0.15, 0.45, 'Yes', 'red'),
        (0.3, 0.45, 'No', 'green'),
        (0.5, 0.45, '<40GB', 'red'),
        (0.65, 0.45, '>40GB', 'green'),
    ]
    for lx, ly, text, colour in branch_labels:
        ax.text(lx, ly, text, fontsize=8, color=colour)

    # Key insights box
    box_text = "\n".join((
        "📌 Key Insights:",
        "• LoRA: Best default choice for most cases",
        "• QLoRA: Essential for very large models",
        "• Adapters: Excellent for multi-task scenarios",
        "• Full FT: Use when you have resources and need best quality",
        "• Prompt methods: Great for quick experiments",
    ))
    ax.text(0.5, 0.15, box_text, ha='center', va='center',
            bbox=dict(boxstyle="round,pad=0.3", facecolor='wheat', alpha=0.5),
            fontsize=10)

    # Pin the canvas to the unit square the coordinates above assume
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)

    plt.tight_layout()
    plt.show()

create_decision_guide()

# Closing recap: a fixed-width text table of methods and their trade-offs
print("\n🎓 Fine-tuning Methods Summary")
print("=" * 70)

# First row is the header, second row the dashed underline
summary_data = [
    ["Method", "When to Use", "Pros", "Cons"],
    ["-" * 15, "-" * 25, "-" * 20, "-" * 20],
    ["Full Fine-tuning", "Small models, best quality", "Highest quality", "Resource intensive"],
    ["LoRA", "General purpose, default", "Great balance", "Slightly lower quality"],
    ["QLoRA", "Very large models (>30B)", "Enables huge models", "Slower training"],
    ["Adapters", "Multi-task scenarios", "Task isolation", "More complex"],
    ["Prompt Tuning", "Quick experiments", "Minimal params", "Limited flexibility"],
    ["BitFit", "Baseline, quick test", "Very fast", "Lower quality"],
]

for method, when_to_use, pros, cons in summary_data:
    print(f"{method:<20} {when_to_use:<30} {pros:<25} {cons:<20}")

print("\n✅ Ready to fine-tune! Start with LoRA for the best balance of efficiency and quality.")