# ParamΔ: Zero-Cost Post-Training Demo

This notebook demonstrates the ParamΔ method for transferring post-training capabilities without additional training.

In [None]:
# Setup
import sys
# Put the current working directory on the import path so the `src.*` imports
# below resolve.
# NOTE(review): presumably the kernel is started from the directory that
# contains `src/` — confirm, otherwise these imports fail.
sys.path.append('.')

import torch
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Project-local ParamΔ implementation and helpers.
# NOTE(review): Path, ParamDeltaVisualizer, DeltaAnalyzer and ParamDeltaEvaluator
# are not referenced by any cell visible in this notebook.
from src.param_delta import ParamDelta
from src.visualization import ParamDeltaVisualizer, DeltaAnalyzer
from src.evaluation import ParamDeltaEvaluator

print("ParamΔ implementation loaded successfully!")

## 1. Understanding ParamΔ

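The core formulas:
- **Parameter Delta**: ΔΘ = Θ_post - Θ_base (what post-training changed relative to the original base model)
- **Transfer**: Θ'_post = Θ'_base + ΔΘ (add that change to a *new* base model Θ'_base)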

In [None]:
# Create synthetic models for demonstration
def create_model(seed, size=128):
    """Build a synthetic three-layer "model" as a dict of random weight matrices.

    Args:
        seed: Passed to ``torch.manual_seed`` before sampling, so the same seed
            always yields the same weights. This reseeds torch's global RNG.
        size: Side length of each square ``size x size`` weight matrix.

    Returns:
        Dict mapping ``"layer1.weight"``, ``"layer2.weight"`` and
        ``"layer3.weight"`` to tensors drawn with ``torch.randn(size, size)``.
    """
    torch.manual_seed(seed)
    layer_names = ("layer1.weight", "layer2.weight", "layer3.weight")
    # Layers are sampled in this order, one randn call per layer.
    return {name: torch.randn(size, size) for name in layer_names}

# Create models: the original base, its "post-trained" counterpart, and a new
# base model that the delta is later transferred onto. Each is an independent
# random draw, so this demonstrates the mechanics only.
theta_base = create_model(seed=0)
theta_post = create_model(seed=1)
theta_base_new = create_model(seed=2)

# Take the byte size from each tensor's own element size rather than assuming
# 4-byte float32, so the report stays right if the dtype ever changes.
model_bytes = sum(p.numel() * p.element_size() for p in theta_base.values())

print(f"Created {len(theta_base)} layer models")
print(f"Model size: {model_bytes / 1024 / 1024:.2f} MB")

## 2. Computing Parameter Delta

In [None]:
# Initialize ParamDelta
param_delta = ParamDelta(device="cpu")

# Compute delta: ΔΘ = Θ_post - Θ_base
delta = param_delta.calculate_delta(theta_post, theta_base)

# Per-layer L2 norm of the delta, keyed by layer name
delta_norms = {}
for layer_name, layer_delta in delta.items():
    delta_norms[layer_name] = torch.norm(layer_delta).item()

layer_labels = list(delta_norms.keys())
norm_values = list(delta_norms.values())

# Bar chart of the norms, one bar per layer
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(layer_labels, norm_values)
ax.set_xlabel('Layer')
ax.set_ylabel('L2 Norm')
ax.set_title('Parameter Delta Norms by Layer')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

print(f"Average delta norm: {np.mean(norm_values):.3f}")

## 3. Applying Delta to New Base Model

In [None]:
# Apply delta: Θ'_post = Θ'_base + ΔΘ
theta_param_delta = param_delta.apply_delta(theta_base_new, delta)

# Verify the transformation against a direct evaluation of the formula.
# The check asserts, so the success message below is only printed when every
# layer actually matches (up to float rounding).
rel_tol = 1e-5
print("Verifying ParamΔ formula...")
for key in theta_base.keys():
    expected = theta_base_new[key] + (theta_post[key] - theta_base[key])
    actual = theta_param_delta[key]

    error = torch.norm(expected - actual).item()
    print(f"{key}: error = {error:.2e}")

    # Scale the tolerance with the tensor's magnitude; the floor of 1.0 keeps
    # the bound meaningful for near-zero tensors.
    tolerance = rel_tol * max(1.0, torch.norm(expected).item())
    assert error <= tolerance, (
        f"{key}: apply_delta result differs from Θ'_base + ΔΘ "
        f"(error {error:.2e} > tolerance {tolerance:.2e})"
    )

print("\nParamΔ formula verified!")

## 4. Exploring Scaling Factors

In [None]:
# Test different scaling factors
scales = np.linspace(0, 2, 11)
changes = []

for scale in scales:
    # np.linspace yields np.float64 scalars; hand apply_delta a built-in float
    # so no NumPy scalar takes part in the tensor arithmetic.
    scaled_model = param_delta.apply_delta(theta_base_new, delta, scale=float(scale))

    # Total change from the new base: sum of per-layer L2 distances
    total_change = sum(
        torch.norm(scaled_model[k] - theta_base_new[k]).item()
        for k in theta_base_new
    )
    changes.append(total_change)

# Plot scaling analysis
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(scales, changes, 'o-', linewidth=2, markersize=8)
ax.axvline(x=1.0, color='red', linestyle='--', label='α=1.0 (standard)')
ax.set_xlabel('Scaling Factor (α)')
ax.set_ylabel('Total Change from Base Model')
ax.set_title("Effect of Delta Scaling: Θ' = Θ'_base + α·ΔΘ")
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()
plt.show()

## 5. Multi-Delta Fusion

In [None]:
# Create multiple specialized models
theta_general = create_model(seed=3)  # General instruction model
theta_domain = create_model(seed=4)   # Domain-specific model

# Compute multiple deltas, both relative to the original base model
delta_general = param_delta.calculate_delta(theta_general, theta_base)
delta_domain = param_delta.calculate_delta(theta_domain, theta_base)

# Test different mixture ratios
ratios = [(1.0, 0.0), (0.7, 0.3), (0.5, 0.5), (0.3, 0.7), (0.0, 1.0)]
mixture_results = []

for alpha, beta in ratios:
    # Combine deltas on the NEW base: Θ' = Θ'_base + α·ΔΘ_general + β·ΔΘ_domain
    # Named `weighted_deltas` (a list of (delta, weight) pairs) so it is not
    # confused with plain lists of delta dicts used elsewhere in the notebook.
    weighted_deltas = [(delta_general, alpha), (delta_domain, beta)]
    combined = param_delta.combine_multiple_deltas(theta_base_new, weighted_deltas)

    # Summarise the fused model by the sum of its per-layer L2 norms
    total_norm = sum(torch.norm(combined[k]).item() for k in combined)
    mixture_results.append(total_norm)

# Visualize mixture effects
fig, ax = plt.subplots(figsize=(8, 5))
labels = [f"({a:.1f}, {b:.1f})" for a, b in ratios]
ax.bar(labels, mixture_results, color=plt.cm.viridis(np.linspace(0, 1, len(ratios))))
ax.set_xlabel('(α_general, α_domain)')
ax.set_ylabel('Total Model Norm')
ax.set_title('Multi-Delta Fusion: Effect of Mixing Ratios')
fig.tight_layout()
plt.show()

print("Multi-delta fusion allows combining capabilities from multiple sources!")

## 6. Delta Analysis: Cosine Similarity

In [None]:
# Compute cosine similarity between deltas
similarities = param_delta.compute_cosine_similarity(
    delta_general,
    delta_domain,
    layer_types=["overall"]
)

print(f"Cosine similarity between general and domain deltas: {similarities['overall']:.3f}")
print("\nInterpretation:")
print("- Values close to 1: Deltas encode similar changes")
print("- Values close to 0: Deltas are orthogonal (independent)")
print("- Values close to -1: Deltas encode opposite changes")

# Create more deltas for visualization.
# Seed 0 reproduces theta_base itself, so the comparison models start at seed 1.
n_models = 6
models = [create_model(seed=i) for i in range(1, n_models)]
pairwise_deltas = [param_delta.calculate_delta(m, theta_base) for m in models]

# Compute pairwise similarities. A delta compared with itself is 1.0 (the
# diagonal), and cosine similarity is symmetric, so each unordered pair is
# computed once and mirrored.
n_deltas = len(pairwise_deltas)
similarity_matrix = np.eye(n_deltas)

for i in range(n_deltas):
    for j in range(i + 1, n_deltas):
        # Request the "overall" entry explicitly, matching the call above,
        # instead of relying on the method's default layer_types.
        sim = param_delta.compute_cosine_similarity(
            pairwise_deltas[i],
            pairwise_deltas[j],
            layer_types=["overall"]
        )
        similarity_matrix[i, j] = sim['overall']
        similarity_matrix[j, i] = sim['overall']

# Visualize similarity matrix
fig, ax = plt.subplots(figsize=(8, 6))
image = ax.imshow(similarity_matrix, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(image, ax=ax, label='Cosine Similarity')
ax.set_xlabel('Delta Index')
ax.set_ylabel('Delta Index')
ax.set_title('Pairwise Cosine Similarities Between Parameter Deltas')
fig.tight_layout()
plt.show()

## 7. Summary and Key Insights

### ParamΔ Enables:
1. **Zero-cost transfer** of post-training capabilities
2. **Flexible scaling** with α parameter
3. **Multi-source fusion** combining different capabilities
4. **Delta analysis**: near-orthogonal deltas (cosine similarity ≈ 0) indicate independent knowledge

### Use Cases:
- Apply instruction-tuning to new base models instantly
- Combine general and domain-specific capabilities
- Transfer capabilities after continual pretraining
- Experiment with different capability mixtures

In [None]:
# Final demonstration: Complete workflow
# Print the five workflow steps framed by divider lines, then the closing tagline.
divider = "=" * 50
workflow_steps = (
    "1. Load base and post-trained models",
    "2. Compute delta: ΔΘ = Θ_post - Θ_base",
    "3. Load new base model",
    "4. Apply delta: Θ'_post = Θ'_base + ΔΘ",
    "5. Enjoy post-trained capabilities with zero training!",
)

print("Complete ParamΔ Workflow:")
print(divider)
for step in workflow_steps:
    print(step)
print(divider)
print("\nParamΔ: Post-training at the speed of inference!")