# Example 3: Load Concepts and Demonstrate Activation Manipulation

This notebook demonstrates how to:
1. Load the language model and trained SAE from previous examples
2. Load curated concepts from the manual curation process
3. Attach the concept dictionary to the SAE
4. Demonstrate inference with manipulated activations
5. Create custom activation controllers to amplify or suppress specific concepts

This example shows how to use curated concepts to understand and control what the model generates.


In [17]:
# Setup and imports
%load_ext autoreload
%autoreload 2

import torch
import json
import csv
from pathlib import Path
from datetime import datetime
from collections import defaultdict, Counter

from amber.store import LocalStore
from amber.adapters.text_snippet_dataset import TextSnippetDataset
from amber.core.language_model import LanguageModel
from amber.mechanistic.autoencoder.autoencoder import Autoencoder
from amber.mechanistic.autoencoder.concepts.concept_dictionary import ConceptDictionary, Concept
from amber.hooks import Controller, HookType

print("‚úÖ Imports completed")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
‚úÖ Imports completed


In [18]:
# Configuration
print("üöÄ Starting Concept Loading and Neuron Manipulation Example")

# Load metadata from previous examples
training_metadata_path = Path("outputs/training_metadata.json")
attachment_metadata_path = Path("outputs/attachment_metadata.json")

# Fail fast, naming the notebook that produces each missing file.
if not training_metadata_path.exists():
    print("‚ùå Error: training_metadata.json not found!")
    print("   Please run 01_train_sae_model.ipynb first")
    raise FileNotFoundError("training_metadata.json not found")

if not attachment_metadata_path.exists():
    print("‚ùå Error: attachment_metadata.json not found!")
    print("   Please run 02_attach_sae_and_save_texts.ipynb first")
    raise FileNotFoundError("attachment_metadata.json not found")

# Load metadata. The encoding is explicit so parsing does not depend on the
# platform's default text encoding.
with open(training_metadata_path, "r", encoding="utf-8") as f:
    training_metadata = json.load(f)

with open(attachment_metadata_path, "r", encoding="utf-8") as f:
    attachment_metadata = json.load(f)

# Configuration from metadata
MODEL_ID = training_metadata["model_id"]
LAYER_SIGNATURE = training_metadata["layer_signature"]
SAE_MODEL_PATH = Path(training_metadata["sae_model_path"])
CACHE_DIR = Path(training_metadata["cache_dir"])
STORE_DIR = Path(training_metadata["store_dir"])
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Curated concepts produced by the manual curation process. The concept-loading
# step reads this CSV; no JSON variant is defined anywhere in this notebook, so
# only the CSV path is checked and reported here.
CURATED_CONCEPTS_CSV = Path("outputs/curated_concepts.csv")

# Warn rather than raise: the remaining configuration is still worth printing.
if not CURATED_CONCEPTS_CSV.exists():
    print("‚ö†Ô∏è Warning: No curated concepts found!")
    print("   Please run the manual curation process first")
    print("   You can create a simple CSV with format: neuron_idx,concept_name,score")

print(f"üîß Model: {MODEL_ID}")
print(f"üéØ Target layer: {LAYER_SIGNATURE}")
print(f"üß† SAE model: {SAE_MODEL_PATH}")
print(f"üìä Curated concepts: {CURATED_CONCEPTS_CSV}")
print()


üöÄ Starting Concept Loading and Neuron Manipulation Example
üîß Model: sshleifer/tiny-gpt2
üéØ Target layer: gpt2lmheadmodel_transformer_h_0_attn_c_attn
üß† SAE model: outputs/sae_model.pt
üìä Curated concepts: outputs/curated_concepts.csv



In [19]:
# Step 1: Load language model
print("üì• Loading language model...")

# Build the model wrapper from its Hugging Face id, then move the wrapped
# model onto the selected device.
model = LanguageModel.from_huggingface(MODEL_ID)
model.model.to(DEVICE)

# Optional experiment metadata stored on the model's context.
# NOTE(review): the "sae_attachment" / "attachment_" labels look carried over
# from the previous example, while the SAE context in this notebook uses
# "concept_manipulation" - confirm whether the difference is intentional.
model.context.experiment_name = "sae_attachment"
model.context.run_id = "attachment_" + f"{datetime.now():%Y%m%d_%H%M%S}"
model.context.max_length = 64

# One print call, one line per item, so stdout matches three separate prints.
print(
    f"‚úÖ Model loaded: {model.model_id}",
    f"üì± Device: {DEVICE}",
    f"üîß Context: {model.context.experiment_name}/{model.context.run_id}",
    sep="\n",
)


üì• Loading language model...
‚úÖ Model loaded: sshleifer_tiny-gpt2
üì± Device: cpu
üîß Context: sae_attachment/attachment_20251101_011913


In [20]:
# Step 2: Load trained SAE
print("üì• Loading trained SAE...")

# Guard clause: without the saved checkpoint there is nothing to load.
if not SAE_MODEL_PATH.exists():
    print(
        f"‚ùå Error: SAE model not found at {SAE_MODEL_PATH}",
        "   Please run 01_train_sae_model.ipynb first",
        sep="\n",
    )
    raise FileNotFoundError(f"SAE model not found at {SAE_MODEL_PATH}")

# load_model returns the autoencoder plus three dataset-normalisation values.
(
    sae,
    dataset_normalize,
    dataset_target_norm,
    dataset_mean,
) = Autoencoder.load_model(SAE_MODEL_PATH)
sae.to(DEVICE)

# Label the SAE context for this notebook's run.
sae.context.experiment_name = "concept_manipulation"
sae.context.run_id = "manipulation_" + f"{datetime.now():%Y%m%d_%H%M%S}"

# Summary: dimensions come from the training metadata, not from the loaded SAE.
print(
    f"‚úÖ SAE loaded: {training_metadata['hidden_dim']} ‚Üí {training_metadata['n_latents']} ‚Üí {training_metadata['hidden_dim']}",
    f"üîß Context: {sae.context.experiment_name}/{sae.context.run_id}",
    f"üìä Dataset normalization: {dataset_normalize}",
    "‚úÖ Trained SAE loaded",
    sep="\n",
)


2025-11-01 01:19:13,054 [INFO] amber.mechanistic.autoencoder.autoencoder: 
Loaded model from outputs/sae_model.pt
n_latents=24, n_inputs=6, activation=TopK_8, tied=False


üì• Loading trained SAE...
‚úÖ SAE loaded: 6 ‚Üí 24 ‚Üí 6
üîß Context: concept_manipulation/manipulation_20251101_011913
üìä Dataset normalization: False
‚úÖ Trained SAE loaded


In [21]:
# Step 3: Load curated concepts
print("üì• Loading curated concepts...")

# Only the CSV export is supported. Stop with an explicit error when it is
# missing; otherwise the first use of `concept_dict` below would fail with an
# unrelated NameError.
if not CURATED_CONCEPTS_CSV.exists():
    print("CSV not found")
    print("   Please run the manual curation process first")
    raise FileNotFoundError(f"Curated concepts CSV not found at {CURATED_CONCEPTS_CSV}")

print(f"üìÑ Loading from CSV: {CURATED_CONCEPTS_CSV}")
# n_size is taken from the training metadata's latent count.
concept_dict = ConceptDictionary.from_csv(CURATED_CONCEPTS_CSV, n_size=training_metadata["n_latents"])

print(f"‚úÖ Loaded concept dictionary with {concept_dict.n_size} neurons")
print(f"üìä Total concepts: {sum(len(concept_dict.get(i)) for i in range(concept_dict.n_size))}")

# Show some concepts: the first two entries for each of the first five neurons
print("\nüîç Sample concepts:")
for neuron_idx in range(min(5, concept_dict.n_size)):
    concepts = concept_dict.get(neuron_idx)
    if concepts:
        print(f"   Neuron {neuron_idx}: {len(concepts)} concepts")
        for i, concept in enumerate(concepts[:2]):
            print(f"     {i + 1}. '{concept.name}' (score: {concept.score:.3f})")
    else:
        print(f"   Neuron {neuron_idx}: no concepts")
print()


üì• Loading curated concepts...
üìÑ Loading from CSV: outputs/curated_concepts.csv
‚úÖ Loaded concept dictionary with 24 neurons
üìä Total concepts: 60

üîç Sample concepts:
   Neuron 0: 10 concepts
     1. 'family relationships' (score: 0.900)
     2. 'parent-child interactions' (score: 0.800)
   Neuron 1: 10 concepts
     1. 'nature and outdoors' (score: 0.900)
     2. 'animals and wildlife' (score: 0.800)
   Neuron 2: 10 concepts
     1. 'problem solving' (score: 0.900)
     2. 'logical thinking' (score: 0.800)
   Neuron 3: 10 concepts
     1. 'Mummy and Daddy were picking flowers in the garden. Mummy picked a red daisy, Daddy picked a purple ' (score: 0.006)
     2. 'Once there was a generous bear. He liked to help others and was always very kind. But he had one hab' (score: 0.006)
   Neuron 4: 10 concepts
     1. 'Mummy and Daddy were picking flowers in the garden. Mummy picked a red daisy, Daddy picked a purple ' (score: 0.062)
     2. 'Once there was a generous bear. He like

In [22]:
# Step 4: Attach concept dictionary to SAE
print("üîó Attaching concept dictionary to SAE...")

# Point the SAE's context at the language model and the target layer first.
sae.context.lm = model
sae.context.lm_layer_signature = LAYER_SIGNATURE

# Then hand the curated dictionary to the SAE's concepts component.
sae.concepts.dictionary = concept_dict

# Report the total number of concepts and how many neurons have at least one.
print("‚úÖ Concept dictionary attached to SAE")
print(f"üîß SAE now has access to {sum(map(len, map(concept_dict.get, range(concept_dict.n_size))))} concepts")
print(f"üìä Concepts available for {sum(bool(concept_dict.get(neuron)) for neuron in range(concept_dict.n_size))} neurons")

üîó Attaching concept dictionary to SAE...
‚úÖ Concept dictionary attached to SAE
üîß SAE now has access to 60 concepts
üìä Concepts available for 6 neurons


In [23]:
# Four short example sentences used as test inputs.
# NOTE(review): presumably chosen to touch the curated concept themes
# (family, nature/animals, problem solving) - confirm against the cells that
# consume this list.
test_texts = [
    "The family went to the park together.",
    "The cat sat on the tree branch.",
    "She solved the math problem quickly.",
    "The child felt happy and excited."
]