## Load model

In [20]:
import torch
from transformer_lens import HookedTransformer

from llm_token_finder import TokenDisplayer, TokenFinder
from llm_token_finder.ablation import AblationLlm
from llm_token_finder.activation_analyser import AttentionHead

# Name of the pretrained checkpoint to load; change it here to analyse another model.
MODEL_NAME = "gpt2-small"

# Load the model through TransformerLens, then wrap it in AblationLlm, which is
# used further down to run a forward pass with a token-to-token movement ablated.
llm = HookedTransformer.from_pretrained(MODEL_NAME)
ablation_llm = AblationLlm(llm)

Loaded pretrained model gpt2-small into HookedTransformer


### Ablate movement from 'quick' to 'fox'

In [23]:
# Sentence analysed throughout the rest of the notebook.
text = "The quick brown fox jumps over the lazy dog."

# Two views of the same sentence from the model's tokenizer:
#   token_ids - ids as a PyTorch tensor (return_tensors="pt"), fed to the model below
#   tokens    - the string tokens, passed to the displayer when rendering attention
token_ids = llm.tokenizer.encode(text, return_tensors="pt")
tokens = llm.tokenizer.tokenize(text, add_special_tokens=True)

In [31]:
# Locate the two tokens whose interaction is ablated: 'quick' (source) and 'fox'
# (destination), following the section heading "movement from 'quick' to 'fox'".
token_finder = TokenFinder.create_from_tokenizer(text, llm.tokenizer)

quick_token = token_finder.find_first("quick", allow_space_prefix=True)
fox_token = token_finder.find_first("fox", allow_space_prefix=True)

# Baseline: ordinary forward pass; only the activation cache is kept.
_, unablated_activation_cache = llm.run_with_cache(token_ids)

# Same sentence, with the quick -> fox movement ablated.
# NOTE(review): the baseline run is fed `token_ids` while this run is fed the raw
# `text`; confirm AblationLlm tokenises `text` identically (e.g. BOS handling) so
# that the two caches line up position by position.
_, ablated_activation_cache = ablation_llm.forward(
    text,
    token_movement_to_ablate=[(quick_token, fox_token)],
)

In [33]:
# Both runs processed a single sentence, so strip the batch dimension from each
# cache before handing it to the displayer. The return value is not re-assigned:
# the same cache objects are used by the cells below.
for activation_cache in (unablated_activation_cache, ablated_activation_cache):
    activation_cache.remove_batch_dim()



In [34]:
# Displayer used by every attention visualisation below, built from the model's tokenizer.
token_displayer = TokenDisplayer.create_for_tokenizer(llm.tokenizer)

In [38]:
# Attention head inspected in the cells below; it is formatted as "0.0" in the printed captions.
# NOTE(review): presumably the two arguments are (layer, head) indices - confirm against AttentionHead.
attention_head = AttentionHead(0, 0)

Comparing the unablated and ablated attention scores/patterns below, we can see that in the ablated run no information is moved from the 'quick' token to the 'fox' token.

In [40]:
# Caption printed above the rendered visualisation.
print(f"Unablated attention scores for head {attention_head}:")

# Baseline run. This call is the cell's last expression, so the notebook renders
# whatever the displayer returns.
token_displayer.html_for_token_attention(
    tokens,
    unablated_activation_cache,
    attention_head,
)

Unablated attention scores for head 0.0:


In [42]:
# Caption printed above the rendered visualisation.
print(f"Ablated attention scores for head {attention_head}:")

# Ablated run (quick -> fox movement removed). This call is the cell's last
# expression, so the notebook renders whatever the displayer returns.
token_displayer.html_for_token_attention(
    tokens,
    ablated_activation_cache,
    attention_head,
)

Ablated attention scores for head 0.0:


## Attention patterns

In [43]:
# Attention-pattern view of the baseline (unablated) cache for `attention_head`.
# As the cell's last expression, the returned value is what the notebook renders.
token_displayer.html_for_attention_pattern(tokens, unablated_activation_cache, attention_head)

In [44]:
# Attention-pattern view of the ablated cache (quick -> fox movement removed) for `attention_head`.
# As the cell's last expression, the returned value is what the notebook renders.
token_displayer.html_for_attention_pattern(tokens, ablated_activation_cache, attention_head)