In [None]:
import torch

from src.data.loader import load_task_dataset
from src.data.contrastive_dataset import ContrastiveDatasetConstructor
from src.data.evaluation_dataset import EvaluationDatasetConstructor
from src.steering.cache_steering import extract_steering_kv, generate_with_cache_steering
from src.steering.config import SteeringConfig
from src.evaluation.evaluator import Evaluator
from src.utils.constants import Tasks

from transformers import AutoTokenizer, AutoModelForCausalLM

from dotenv import load_dotenv # Load the HF_TOKEN from .env file if needed
load_dotenv()


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model weights onto `device`; the tokenizer comes from the same checkpoint
checkpoint = 'HuggingFaceTB/SmolLM2-360M-Instruct'                  # Small model that is fast to run on CPU
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Overview
This notebook consists of two parts:

Part 1 shows how to use functions from our codebase (NOTE: these functions are implemented for the convenience of running experiments):
- [Contrastive Data](#contrastive-data)
- [Extracting and applying vectors](#extracting-and-applying-vectors)
- [Running experiments with Evaluator class](#running-experiments-with-evaluator-class)


Part 2 shows how to use cache steering with **only PyTorch + Transformers**:
- [Vector Extraction](#vector-extraction)
- [Generation](#generate-with-cache-steering)
- [Style Transfer Example](#style-transfer-example)

---

# Running experiments

## Contrastive Data

In [None]:
# Load and preprocess ARC-challenge dataset
task = Tasks.arc_oai
subtask = "oai"                         # Subtask to use, can be "oai" (general GPT-4o generated traces), "stepwise" (Stepwise traces), "causal_chain", "strategy_execution", or "analogical reasoning"
# Pass the variables defined above so that editing `task` / `subtask` actually changes what is loaded
dataset = load_task_dataset(task, subtask=subtask)

config = SteeringConfig(
    n_contrastive_samples=10,           # Number of contrastive samples to generate
    num_fewshot_examples=5,             # Number of few-shot examples to use in each sample
    tokenizer=tokenizer,
    add_generation_prompt=True,         # Whether to add generation prompt when using chat template
)

# Builds the contrastive (positive / negative) pairs from the training split
constructor = ContrastiveDatasetConstructor(
    dataset["train"],
    config,
    task=task,
)

# Builds the evaluation prompts from the test split
eval_constructor = EvaluationDatasetConstructor(
    dataset["test"],
    tokenizer=tokenizer,
    n=2,                                # Number of samples to use for evaluation
    num_fewshot_prompt=0,               # Number of few-shot examples to use in each test sample
    task=task,
    prefix=None,                        # Answer prefix for the evaluation. Appended to prompt after the question and before the answer
    system_prompt=None,                 # System prompt for the evaluation. If None, no system prompt is used
    add_generation_prompt=True,         # Whether to add generation prompt when using chat template
)
contrastive_dataset = constructor.construct_dataset()
evaluation_dataset = eval_constructor.construct_dataset()

### Evaluation example

In [4]:
# Print the 'input' field (the formatted prompt) of the first evaluation sample
print(evaluation_dataset[0]['input'])

<|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?

Choices:
A: Planetary density will decrease.
B: Planetary years will become longer.
C: Planetary days will become shorter.
D: Planetary gravity will become stronger.<|im_end|>
<|im_start|>assistant



### Contrastive pair

In [None]:
# View the first contrastive sample pair: the positive example, a separator line, then the negative example

print(contrastive_dataset[0]['positive'])
print("="*50)
print(contrastive_dataset[0]['negative'])

## Extracting and applying vectors

In [None]:
# Extract the steering vectors for each layer
# The function returns a dict {"values": {layer_idx: steering_vector}, "keys": {layer_idx: steering_vector}}
steering_kv = extract_steering_kv(
    model=model,
    tokenizer=tokenizer,
    data=contrastive_dataset,
    steering_config=config,
)

In [None]:
# Tokenize the first evaluation prompt and move the tensors to the same device as the model
tokens = tokenizer(evaluation_dataset[0]['input'], return_tensors="pt").to(device)

steering_config = SteeringConfig(
    tokenizer=tokenizer,
    c_keys=0.0,                     # The steering coefficient for the keys
    c_values=3,                     # The steering coefficient for the values
    append_special_token=True,      # Whether to append a special token to the input to offset the position of the steering token. Allows the alignment of the extraction and application tokens.
)

# Sampling is disabled and generation is capped at 100 new tokens
generation_kwargs = {"max_new_tokens": 100, "do_sample": False}
output = generate_with_cache_steering(
    model,
    tokens["input_ids"],
    steering_kv=steering_kv,
    steering_config=steering_config,
    attention_mask=tokens["attention_mask"],
    **generation_kwargs,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))

## Running experiments with Evaluator class

The `Evaluator` class accepts the evaluation and contrastive datasets together with a steering config as arguments, extracts steering vectors, and applies cache steering during evaluation. This class was written for the convenience of running experiments. However, you don't have to use it to apply cache steering, as the method is compatible with pure PyTorch + HuggingFace Transformers (see the section below).

In [5]:
steering_config = SteeringConfig(
    tokenizer=tokenizer,
    how='last',                     # The position to apply steering to. Passing 'last' applies steering to the last token. Passing an integer i applies steering to token position -i
    c_keys=0.0,                     # The steering coefficient for the keys
    c_values=2,                     # The steering coefficient for the values
    layers_ids_keys=[1],            # The layers to apply steering to for the keys. If None, steering is applied to all layers except the embedding layer. If [i] is passed, steering is applied to layer i and above. If [i, j] is passed, steering is applied to layers i and j only.
    layers_ids_values=[1],          # Same as layers_ids_keys, but for the values
    append_special_token=True,      # Whether to append a special token to the input to offset the position of the steering token. Allows the alignment of the extraction and application tokens.
)
steering_config.set_seed(42)        # Set seed for vector id generation (doesn't affect the steering process itself; this is a coding artefact that wasn't implemented properly). The Evaluator caches steering vectors by their parameters, so changing the seed is a way to recompute the steering vector without removing the old one from the cache

# Same device selection as in the model-loading cell, repeated so this cell states what it passes to the Evaluator
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 1

evaluator = Evaluator(
    model,
    tokenizer,
    evaluation_dataset,
    task,
    device=device,
    steering_config=steering_config,
    extraction_dataset=contrastive_dataset,
)
# Sampling is disabled and generation is capped at 512 new tokens
generation_kwargs = {"max_new_tokens": 512, "do_sample": False}

results = evaluator.evaluate(batch_size, generation_kwargs)

2025-07-11 14:20:06,469 - helpers.py - generate_vector_id - INFO - Generated vector ID: 6e9928a1-19c1-50b2-b373-3979b92e19da
2025-07-11 14:20:06,470 - evaluator.py - extract_steering_vectors - INFO - Loading steering vector from cache: 6e9928a1-19c1-50b2-b373-3979b92e19da
2025-07-11 14:20:06,471 - helpers.py - load_vector - INFO - Loading vector from cached_vectors/6e9928a1-19c1-50b2-b373-3979b92e19da.pt
Map: 100%|██████████| 2/2 [00:00<00:00, 367.73 examples/s]
Pre-tokenizing: 2it [00:00, 880.69it/s]
Generating responses: 100%|██████████| 2/2 [00:11<00:00,  5.92s/it]


In [6]:
# Show the prompt and the steered response of the first evaluated sample
generated_sample = results['samples'][0]
print(generated_sample['input'])
print(generated_sample['response'])

<|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
A group of engineers wanted to know how different building designs would respond during an earthquake. They made several models of buildings and tested each for its ability to withstand earthquake conditions. Which will most likely result from testing different building designs?

Choices:
A: buildings will be built faster
B: buildings will be made safer
C: building designs will look nicer
D: building materials will be cheaper<|im_end|>
<|im_start|>assistant


A: Buildings will be built faster

The engineers tested different building designs to determine which would be most effective in resisting earthquake forces. The most likely outcome is that the building designs that were tested would be built faster, as they would be designed to be more efficient and cost-effective. This would allow the engineers to complete the testing and analysis more quickly, potentially saving ti

### Try the same input without steering

In [7]:
# Baseline: the same prompt through plain `model.generate`, with no cache steering.
# The encoded prompt is moved to `device` because the model was placed there when it was loaded;
# leaving the inputs on the CPU makes generation fail whenever the model runs on CUDA.
tokens = tokenizer(generated_sample['input'], return_tensors='pt').to(device)
output_tokens = model.generate(**tokens, **generation_kwargs)
print(tokenizer.decode(output_tokens[0], skip_special_tokens=False))

<|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
A group of engineers wanted to know how different building designs would respond during an earthquake. They made several models of buildings and tested each for its ability to withstand earthquake conditions. Which will most likely result from testing different building designs?

Choices:
A: buildings will be built faster
B: buildings will be made safer
C: building designs will look nicer
D: building materials will be cheaper<|im_end|>
<|im_start|>assistant
A: buildings will be built faster<|im_end|>


---

# Implementing cache steering with PyTorch + Transformers

## Vector Extraction

In [24]:
from collections import defaultdict
from tqdm import tqdm

from transformers import DynamicCache


def extract_steering_kv(
    model,
    tokenizer,
    data,
    batch_size=1,
    device="cpu",
    extraction_token_position: int = -1,  # Position of the token to extract the steering vectors from, -1 means the last token
):
    """
    Compute per-layer key/value steering vectors from contrastive text pairs.

    For every batch, the positive and the negative texts are run through the model, the
    cached keys and values at the extraction token are read for each layer, and the
    difference (positive - negative) is collected. The differences are then averaged
    over all examples.

    Args:
        model: Causal LM called as `model(**tokens, past_key_values=cache)`.
        tokenizer: Tokenizer used to encode the texts (called with `padding=True`).
        data: Dataset exposing `.iter(batch_size=...)` and yielding batches with
            'positive' and 'negative' text fields.
        batch_size: Number of contrastive pairs per forward pass.
        device: Device the tokenized inputs are moved to; should match the model's device.
        extraction_token_position: Offset of the extraction token relative to the end of
            each (unpadded) sequence; -1 selects the last real token.

    Returns:
        dict with two plain dicts, `{"values": {layer_id: tensor}, "keys": {layer_id: tensor}}`,
        each tensor being the example-averaged difference for that layer.
    """
    # Per-layer lists of difference tensors; concatenated once at the end instead of
    # re-allocating a growing tensor on every batch.
    value_diffs = defaultdict(list)
    key_diffs = defaultdict(list)

    # With left padding the real tokens sit at the end of the row, so offsets count from the
    # padded length; with right padding they count from the number of real tokens.
    left_padded = getattr(tokenizer, "padding_side", "right") == "left"

    def _extraction_indices(encoded):
        mask = encoded['attention_mask']
        if left_padded:
            sequence_end = torch.full((mask.size(0),), mask.size(1), dtype=torch.long, device=mask.device)
        else:
            sequence_end = mask.sum(dim=1)
        return sequence_end + extraction_token_position

    for example in tqdm(data.iter(batch_size=batch_size)):

        pos_tokens = tokenizer(example['positive'], return_tensors='pt', padding=True).to(device)
        neg_tokens = tokenizer(example['negative'], return_tensors='pt', padding=True).to(device)

        # Find indices of the tokens to extract the steering vectors from
        pos_indices = _extraction_indices(pos_tokens)
        neg_indices = _extraction_indices(neg_tokens)
        batch_indices = torch.arange(pos_tokens['input_ids'].size(0), device=pos_tokens['input_ids'].device)

        # Record cache for the positive and negative tokens.
        # Only the KV cache is needed, so hidden states are not requested.
        cache_positive, cache_negative = DynamicCache(), DynamicCache()
        with torch.no_grad():
            model(**pos_tokens, past_key_values=cache_positive, use_cache=True)
            model(**neg_tokens, past_key_values=cache_negative, use_cache=True)

        for layer_id in range(len(cache_positive.value_cache)):
            pos_values = cache_positive.value_cache[layer_id][batch_indices, :, pos_indices, :]
            neg_values = cache_negative.value_cache[layer_id][batch_indices, :, neg_indices, :]

            pos_keys = cache_positive.key_cache[layer_id][batch_indices, :, pos_indices, :]
            neg_keys = cache_negative.key_cache[layer_id][batch_indices, :, neg_indices, :]

            # Take the difference between the vectors
            value_diffs[layer_id].append(pos_values - neg_values)  # [batch_size, n_heads, head_dim]
            key_diffs[layer_id].append(pos_keys - neg_keys)

    def _average(chunks):
        stacked = torch.cat(chunks)
        # Average in at least float32 so half-precision models do not lose precision in the mean
        stacked = stacked.to(torch.promote_types(stacked.dtype, torch.float32))
        return stacked.mean(dim=0)  # [n_heads, head_dim]

    # Plain dicts: no silent creation of empty entries on a missing layer, and the result
    # can be pickled / saved (a defaultdict with a lambda factory cannot).
    return {
        "values": {layer_id: _average(chunks) for layer_id, chunks in value_diffs.items()},
        "keys": {layer_id: _average(chunks) for layer_id, chunks in key_diffs.items()},
    }

In [10]:
from datasets import Dataset


# Prepare the contrastive dataset for steering vector extraction
positive_examples = contrastive_dataset['positive'] # or any list of strings
negative_examples = contrastive_dataset['negative'] # or any list of strings
contrastive_set = Dataset.from_dict({
    "positive": positive_examples,
    "negative": negative_examples,
})
print(contrastive_set)

# Extract the steering vectors from the contrastive dataset
# (`batch_size` and `device` are the variables defined in the earlier cells)
steering_vectors = extract_steering_kv(
    model,
    tokenizer,
    contrastive_set,
    batch_size=batch_size,
    device=device,
    extraction_token_position=-1,  # Last token
)

Dataset({
    features: ['positive', 'negative'],
    num_rows: 10
})


10it [00:24,  2.44s/it]


In [11]:
# Let's check that the layer-1 value steering vector is the same as the one computed by the Evaluator
torch.all(
    torch.isclose(steering_vectors["values"][1], evaluator.steering_kv["values"][1])
)

tensor(True)

## Generate with Cache Steering

In [25]:
from typing import Optional

from transformers import PreTrainedModel, BatchEncoding


def generate_with_cache_steering(
    model: PreTrainedModel,
    tokens: BatchEncoding | torch.Tensor,
    steering_kv: dict,
    application_token_idx: int = -1,
    offset_str: Optional[str] = None,
    c_keys: float = 0.0,
    c_values: float = 0.0,
    **kwargs,
):
    """
    Generate text after adding steering vectors to the prompt's KV cache.

    The KV cache is precomputed for every prompt token except the last one, the steering
    vectors are added at `application_token_idx` of that cache, and `model.generate` then
    continues from the steered cache.

    Args:
        model: The causal LM to generate with.
        tokens: Input ids as a tensor, or a `BatchEncoding` holding "input_ids"
            (and optionally "attention_mask").
        steering_kv: Dict with optional "values" / "keys" entries mapping layer index to
            steering vector.
        application_token_idx: Position in the precomputed cache to steer. Because the cache
            excludes the last input token, -1 refers to the second-to-last input token
            unless `offset_str` is given.
        offset_str: If set (and `application_token_idx == -1`), the first token of this
            string is appended to every input row, so that position -1 of the cache is the
            original last token. NOTE: this is encoded with the module-level `tokenizer`,
            which must be defined before calling.
        c_keys: Coefficient applied to the key steering vectors.
        c_values: Coefficient applied to the value steering vectors.
        **kwargs: Forwarded to `model.generate`. `attention_mask` is optional; when omitted,
            the mask from the `BatchEncoding` is used, or an all-ones mask otherwise.

    Returns:
        The output of `model.generate`.
    """
    # Resolve the attention mask up front so every later step can rely on it
    attention_mask = kwargs.pop("attention_mask", None)

    # Check the format of tokens and convert if necessary
    if isinstance(tokens, BatchEncoding):
        if attention_mask is None and "attention_mask" in tokens:
            attention_mask = tokens["attention_mask"]
        tokens = tokens["input_ids"]

    if attention_mask is None:
        attention_mask = torch.ones_like(tokens)

    # Append a special token to the input tokens if you need to steer the cache of last token and want to align with the extraction token
    if offset_str and application_token_idx == -1:
        offset_token_id = tokenizer(offset_str, add_special_tokens=False)["input_ids"][0]
        offset_column = torch.full(
            (tokens.shape[0], 1), offset_token_id, dtype=tokens.dtype, device=tokens.device
        )
        tokens = torch.cat([tokens, offset_column], dim=-1)
        # The appended token is a real token, so it is attended to
        attention_mask = torch.cat(
            [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
        )

    # Create the initial cache
    cache_input = {
        "input_ids": tokens,
        "attention_mask": attention_mask,
    }
    past_key_values = precompute_kv_cache(model, cache_input)

    # Steer the cache
    past_key_values = steer_kv_cache(
        past_key_values,
        steering_kv,
        application_token_idx=application_token_idx,
        c_keys=c_keys,
        c_values=c_values,
    )

    # Generate as usual
    output = model.generate(
        tokens,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        **kwargs,
    )
    return output


def precompute_kv_cache(model, tokens):
    """
    Precompute the key and value caches for the input tokens except the last one.

    Args:
        model: Model called as `model(**inputs, past_key_values=cache, use_cache=True)`.
        tokens: Either a tensor of input ids, or a dict / `BatchEncoding` with "input_ids"
            and optionally "attention_mask" / "token_type_ids". Any "position_ids" passed in
            are replaced by ones derived from the attention mask.

    Returns:
        A `DynamicCache` filled by one forward pass over all tokens but the last.
    """
    past_key_values = DynamicCache()

    if isinstance(tokens, (BatchEncoding, dict)):
        cache_input = {
            k: v[:, :-1]
            for k, v in tokens.items()
            if k in ["input_ids", "attention_mask", "token_type_ids", "position_ids"]
        }
    else:
        cache_input = {"input_ids": tokens[:, :-1]}

    # A bare tensor (or an encoding without a mask) carries no padding information,
    # so every token is treated as a real token.
    if "attention_mask" not in cache_input:
        cache_input["attention_mask"] = torch.ones_like(cache_input["input_ids"])

    # Compute correct position_ids before caching: each real token gets its index among the
    # real tokens of its row, padded positions get 0. The running sum works for left- and
    # right-padded rows alike and needs no per-row Python loop.
    attention_mask = cache_input["attention_mask"]
    position_ids = attention_mask.long().cumsum(dim=-1) - 1
    cache_input["position_ids"] = position_ids.masked_fill(attention_mask == 0, 0)

    # Precompute the KV cache
    with torch.no_grad():
        model(**cache_input, past_key_values=past_key_values, use_cache=True)

    return past_key_values


def steer_kv_cache(cache, steering_kv, application_token_idx=-1, c_keys=0.0, c_values=0.0):
    """
    Add the steering vectors to every layer of the cache they are defined for.

    Args:
        cache: KV cache object, modified in place.
        steering_kv: Dict with optional "values" and "keys" entries, each mapping a layer
            index to that layer's steering vector.
        application_token_idx: Sequence position of the cache entry to steer.
        c_keys: Coefficient for the key vectors.
        c_values: Coefficient for the value vectors.

    Returns:
        The same `cache` object that was passed in.
    """
    coefficients = {"values": c_values, "keys": c_keys}

    # Values first, then keys
    for kind in ("values", "keys"):
        if kind not in steering_kv:
            continue

        # A plain-number coefficient of zero leaves the cache unchanged, so skip the whole pass
        coefficient = coefficients[kind]
        if isinstance(coefficient, (int, float)) and coefficient == 0:
            continue

        for layer_idx, layer_vector in steering_kv[kind].items():
            steer_kv_cache_layer(
                cache,
                layer_vector,
                layer_idx,
                type=kind,
                application_token_idx=application_token_idx,
                c_keys=c_keys,
                c_values=c_values,
            )

    return cache


def steer_kv_cache_layer(
    cache,
    steering_vector,
    layer_idx,
    type="values",
    application_token_idx=-1,
    c_keys=0.0,
    c_values=0.0,
):
    """
    Steer the key or value cache of a specific layer, in place.

    Args:
        cache: Object exposing `value_cache` and `key_cache`, each indexable by layer.
        steering_vector: Vector added to the selected cache entry (the original code
            annotates it as [n_heads, head_dim]).
        layer_idx: Index of the layer to steer.
        type: "values" to steer the value cache (scaled by `c_values`), "keys" to steer the
            key cache (scaled by `c_keys`).
        application_token_idx: Sequence position (third dimension of the cache tensor) to steer.
        c_keys: Coefficient for the key vector.
        c_values: Coefficient for the value vector.

    Raises:
        ValueError: If `type` is neither "values" nor "keys".
    """
    if type == 'values':
        layer_cache, coefficient = cache.value_cache[layer_idx], c_values
    elif type == 'keys':
        layer_cache, coefficient = cache.key_cache[layer_idx], c_keys
    else:
        # Previously an unknown type was a silent no-op, which hides typos at the call site
        raise ValueError(f"Unknown cache type {type!r}; expected 'values' or 'keys'")

    # The multiplication allocates a new tensor, so the caller's steering vector is never
    # modified. It is moved to the cache's device first so vectors extracted or loaded on
    # another device can still be applied.
    delta = steering_vector.to(layer_cache.device) * coefficient

    # Apply the vector to the cache
    layer_cache[:, :, application_token_idx, :] += delta


In [13]:
# Tokenize a prompt and prepare it for generation
prompt = "What is the capital of France?"
messages = [{"role": "user", "content": prompt}]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# Keep the inputs on the same device as the model, otherwise generation fails when the model runs on CUDA
input_tokens = tokenizer(input_text, return_tensors='pt').to(device)

# Let's inspect the last token of the prompt and of the first positive contrastive example
application_last_token_id = input_tokens['input_ids'][0, -1].item()
extraction_last_token_id = tokenizer(positive_examples[0])['input_ids'][-1]
print(f"Last last token in the prompt: {repr(tokenizer.decode(application_last_token_id))}")
print(f"Last token in contrastive dataset examples: {repr(tokenizer.decode(extraction_last_token_id))}", end="\n\n")

Last last token in the prompt: '\n'
Last token in contrastive dataset examples: '\n'



Here you can see that we extracted our vectors from `\n` and that the last token in the input prompt is also `\n`. Through experimentation we found that tokens such as `\n`, `.`, tokenizer template tokens, BoS, EoS, etc. aggregate the most information. Therefore, we might want to offset the size of the cache so that the vectors extracted from `\n` in the contrastive set are added to the vectors corresponding to `\n` in the KV cache of the target prompt.

Let's first validate if the implementation is identical to normal `model.generate()` if the coefficients are 0

In [14]:
# Let's validate that the implementation is identical to normal model.generate() if the coefficients are 0
# Sampling is disabled so that the two outputs can be compared directly
genreration_kwargs = {"max_new_tokens": 128, "do_sample": False}
original_output = model.generate(
    input_tokens['input_ids'],
    attention_mask=input_tokens['attention_mask'],
    **genreration_kwargs,
)
print("Original output:", tokenizer.decode(original_output[0], skip_special_tokens=False))

# c_keys and c_values are not passed, so both keep their default of 0.0
empty_steering_output = generate_with_cache_steering(
    model,
    input_tokens['input_ids'],
    attention_mask=input_tokens['attention_mask'],
    steering_kv=steering_vectors,
    **genreration_kwargs,
)
print("\n", "=" * 50, "\n")
print("Output with empty steering:", tokenizer.decode(empty_steering_output[0], skip_special_tokens=False))

Original output: <|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
What is the capital of France?<|im_end|>
<|im_start|>assistant
The capital of France is Paris.<|im_end|>


Output with empty steering: <|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
What is the capital of France?<|im_end|>
<|im_start|>assistant
The capital of France is Paris.<|im_end|>


Now let's steer the cache with the KV steering vectors

In [18]:
# The real KV steering: only the values are steered (c_keys is 0.0)
steered_output = generate_with_cache_steering(
    model,
    input_tokens['input_ids'],
    attention_mask=input_tokens['attention_mask'],
    steering_kv=steering_vectors,
    application_token_idx=-1,   # Last token
    offset_str="\n",            # Some token (in string format) to append to the input to offset the position of the steered token
    c_keys=0.0,
    c_values=10.0,
    **genreration_kwargs,
)
print("=" * 50, "\n")
print("Steered output:", tokenizer.decode(steered_output[0], skip_special_tokens=False))


Steered output: <|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
What is the capital of France?<|im_end|>
<|im_start|>assistant

When it comes to the capital of France, it is essential to note that the official capital of France is Paris, which is also the largest city in the country. Paris is not only the capital but also the economic, cultural, and political center of France.

In terms of geography, Paris is located in the northern part of France, in the region of Nord-Pas-de-Calais. It is situated on the Seine River, which flows through the city, and is surrounded by several other major cities, including Lyon, Marseille, and Reims.

In terms of population, Paris has


In [17]:
# Control run: same appended '\n' offset token but both coefficients at 0.0,
# to confirm the effect is not attributed to appending '\n' to the input :)
steered_output = generate_with_cache_steering(
    model,
    input_tokens['input_ids'],
    attention_mask=input_tokens['attention_mask'],
    steering_kv=steering_vectors,
    offset_str="\n",
    c_keys=0.0,
    c_values=0.0,
    **genreration_kwargs,
)
print("=" * 50, "\n")
print("Steered output:", tokenizer.decode(steered_output[0], skip_special_tokens=False))


Steered output: <|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
What is the capital of France?<|im_end|>
<|im_start|>assistant

The capital of France is Paris.<|im_end|>


### Style transfer example

Let's try to use cache steering to induce the `analogical_reasoning` style. The contrastive examples from this subset follow this pattern: `Just like [some analogy], ...`, so we expect our model to produce something similar if steering is effective.

In [None]:
# Load and preprocess ARC-challenge dataset (analogical reasoning subtask)
task = Tasks.arc_oai
dataset = load_task_dataset(Tasks.arc_oai, subtask="analogical_reasoning")

config = SteeringConfig(
    n_contrastive_samples=20,  
    num_fewshot_examples=5,
    tokenizer=tokenizer,
    add_generation_prompt=True,
)

constructor = ContrastiveDatasetConstructor(
    dataset["train"],
    config,
    task=task,
)
# NOTE: this rebinds `contrastive_dataset`, replacing the dataset built in Part 1
contrastive_dataset = constructor.construct_dataset()

In [None]:
# Prepare the contrastive dataset for steering vector extraction
positive_examples = contrastive_dataset['positive']
negative_examples = contrastive_dataset['negative']
contrastive_set = Dataset.from_dict({
    "positive": positive_examples,
    "negative": negative_examples,
})

# Extract the steering vectors from the contrastive dataset
# NOTE: this rebinds `steering_vectors`, replacing the vectors extracted earlier
steering_vectors = extract_steering_kv(
    model,
    tokenizer,
    contrastive_set,
    batch_size=1,
    device=device,
    extraction_token_position=-1,  # Last token
)

In [41]:
# Tokenize a prompt and prepare it for generation
prompt = "What is the capital of France?"
messages = [{"role": "user", "content": prompt}]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# Keep the inputs on the same device as the model, otherwise generation fails when the model runs on CUDA
input_tokens = tokenizer(input_text, return_tensors='pt').to(device)

# Generate with the analogical-reasoning steering vectors applied to the values only
genreration_kwargs = {"max_new_tokens": 128, "do_sample": False}
steered_output = generate_with_cache_steering(
    model,
    input_tokens['input_ids'],
    attention_mask=input_tokens['attention_mask'],
    steering_kv=steering_vectors,
    application_token_idx=-1,
    offset_str="\n",
    c_keys=0.0,
    c_values=2.0,
    **genreration_kwargs,
)
print("=" * 50, "\n")
print("Steered output:", tokenizer.decode(steered_output[0], skip_special_tokens=False))


Steered output: <|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
What is the capital of France?<|im_end|>
<|im_start|>assistant

Just like how you have a home, a country has a capital. The capital of France is Paris. Paris is a big city in France, and it's where the Eiffel Tower is located. It's a beautiful city with lots of history and culture.<|im_end|>
