This example finds attention heads that move information from adjectives to the corresponding nouns in a sentence. It uses the `ActivationAnalyzer` class to find heads that match a given criterion.

## Setup

In [1]:
import torch
from transformer_lens import HookedTransformer


# Identifier of the pretrained checkpoint to load; change it here to analyse another model.
MODEL_NAME = "EleutherAI/pythia-2.8b-deduped-v0"

llm = HookedTransformer.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


Loaded pretrained model EleutherAI/pythia-2.8b-deduped-v0 into HookedTransformer


In [2]:
text = "The quick, brown fox jumps over the lazy dog."

# Tokenise the sentence twice with the same settings: once as token strings
# (passed to the analyzer and displayer in later cells) and once as a "pt"
# tensor of ids (the input to the model's forward pass).
encoding_options = {"add_special_tokens": True}
input_tokens = llm.tokenizer.tokenize(text, **encoding_options)
input_token_ids = llm.tokenizer.encode(text, return_tensors="pt", **encoding_options)

print(input_tokens, input_token_ids, sep="\n")

['<|endoftext|>', 'The', 'Ġquick', ',', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
tensor([[    0,   510,  3158,    13,  8516, 30013, 27287,   689,   253, 22658,
          4370,    15]])


In [3]:
# Only the cached activations are inspected in this notebook, never gradients,
# so run the forward pass under no_grad to avoid building an autograd graph.
# The first return value (the model output) is unused; it is bound to a named
# throwaway instead of `_`, because assigning to `_` shadows IPython's
# "last output" variable for the rest of the session.
with torch.no_grad():
    _model_output, activation_cache = llm.run_with_cache(input_token_ids)

## Find noun-adjective pairs

In [4]:
from llm_token_finder.token_finder import Token
from dataclasses import dataclass

@dataclass
class Noun:
    """A noun token together with the adjective tokens that correspond to it."""

    # Token of the noun itself; its `index` selects the noun's row of an attention pattern.
    noun_token: Token
    # Tokens of the adjectives belonging to this noun; may hold one or several entries.
    adjective_tokens: list[Token]

In [5]:
from llm_token_finder import TokenFinder, ActivationAnalyzer


# Finder over the tokens of `text`; used in the next cell to look up noun and adjective tokens.
token_finder = TokenFinder.create_from_tokenizer(text, llm.tokenizer)
# Analyzer built from the token strings and the activations cached by the forward pass;
# used later to search for attention heads via `find_heads_matching_criteria`.
activation_analyzer = ActivationAnalyzer.create_from_tokenizer(llm.tokenizer, input_tokens, activation_cache)

In [6]:
def find_word(word):
    """Look up the first token for ``word`` using ``allow_space_prefix=True``."""
    return token_finder.find_first(word, allow_space_prefix=True)


# Pair each noun of the sentence with the adjectives that belong to it.
fox = Noun(
    noun_token=find_word("fox"),
    adjective_tokens=[find_word("quick"), find_word("brown")],
)
dog = Noun(
    noun_token=find_word("dog"),
    adjective_tokens=[find_word("lazy")],
)

nouns = [fox, dog]

print(fox, dog, sep="\n")

Noun(noun_token=Token(index=5, value='Ġfox'), adjective_tokens=[Token(index=2, value='Ġquick'), Token(index=4, value='Ġbrown')])
Noun(noun_token=Token(index=10, value='Ġdog'), adjective_tokens=[Token(index=9, value='Ġlazy')])


### Find heads that move information from adjectives to nouns

In [7]:
from torch import Tensor
from jaxtyping import Float


def criteria(attention: Float[Tensor, "q v"]) -> bool:
    """
    Decide whether an attention pattern links every noun to its adjectives.

    For each entry of the notebook-level ``nouns`` list, the row of
    ``attention`` at the noun's token index is taken and its k largest
    entries are located, where k is the number of adjectives of that noun.
    The pattern matches only if every adjective's token index is among
    those k positions, for every noun.

    Args:
        attention: 2-D attention pattern, indexed first by the noun's
            position (presumably the query axis, per the "q v" annotation).

    Returns:
        True if all nouns satisfy the condition above, False otherwise.
    """

    def attends_to_own_adjectives(noun) -> bool:
        adjectives = noun.adjective_tokens
        noun_row = attention[noun.noun_token.index]
        # Positions of the len(adjectives) highest scores in the noun's row.
        strongest_positions = noun_row.topk(len(adjectives)).indices
        return all(adjective.index in strongest_positions for adjective in adjectives)

    return all(attends_to_own_adjectives(noun) for noun in nouns)


# Ask the analyzer for the heads accepted by `criteria`; the result supports len() and
# indexing (presumably `criteria` is called once per head's attention pattern - confirm
# against ActivationAnalyzer).
# NOTE(review): "advective" is a typo for "adjective"; the name is kept because the
# visualisation cell refers to it.
noun_advective_heads = activation_analyzer.find_heads_matching_criteria(criteria)

# Report how many heads matched.
print(f"Found {len(noun_advective_heads)} head(s) that move information from adjectives to nouns")

Found 1 head(s) that move information from adjectives to nouns


## Visualise

In [8]:
from llm_token_finder import TokenDisplayer


# Displayer built from the model's tokenizer; the next cell uses it to produce the
# attention visualisation for a matching head.
token_displayer = TokenDisplayer.create_for_tokenizer(llm.tokenizer)

In [9]:
# Fail with an explicit message instead of a bare IndexError when the search
# above returned no heads (e.g. after changing the sentence or the model).
if len(noun_advective_heads) == 0:
    raise ValueError("No attention heads matched the criteria, so there is nothing to visualise.")

print(f"Head {noun_advective_heads[0]}:")

# Kept as the cell's last expression so the notebook renders the returned value.
token_displayer.html_for_token_attention(
    input_tokens,
    activation_cache,
    noun_advective_heads[0],
)

Head 1.6:
