In [None]:
import torch
from transformer_lens import HookedTransformer, ActivationCache
from typing import List
from tqdm.auto import tqdm
import plotly.graph_objects as go

# Plotly needs a different renderer depending on the front end
# (VSCode/Jupyter notebooks vs. Colab); "notebook_connected" is the one
# selected here.
import plotly.io as pio
pio.renderers.default = "notebook_connected"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Switch autograd off globally. A single call is enough:
# torch.set_grad_enabled and torch.autograd.set_grad_enabled are the same
# function, so calling both was redundant.
torch.set_grad_enabled(False)

import circuitsvis as cv
import haystack_utils

%reload_ext autoreload
%autoreload 2

In [2]:
# Load Pythia-70M as a HookedTransformer on the selected device, with the
# fold_ln option turned on.
model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-70m",
    fold_ln=True,
    device=device,
)
# NOTE(review): presumably makes the per-head attention output available to
# hooks/caches -- confirm against the TransformerLens documentation.
model.set_use_attn_result(True)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [6]:
# The Q weights express what each token is looking for: a head whose queries read the German direction probably wants to attend to German tokens (or wants to filter them out).
# The K weights express what each token offers: a head whose keys read the German direction is supplying German-ness.
# Goal: measure how strongly each head both queries for and highlights the German direction.

from einops import einsum

def get_intermediate_tensor(residual, weights, biases):
    """Project a single residual-stream vector through every head's weights.

    Args:
        residual: vector of shape [d_model].
        weights: per-head weight tensor of shape [n_heads, d_model, d_head].
        biases: accepted so Q and K call sites look alike, but not applied
            (the `+ biases` term is disabled).

    Returns:
        Tensor of shape [n_heads, d_head]: the projection of `residual`
        for each head.
    """
    projection_pattern = "d_model, n_heads d_model d_head -> n_heads d_head"
    # Bias is intentionally not added here; re-enable with `+ biases` if needed.
    return einsum(residual, weights, projection_pattern)

# Row of the layer-3 MLP output weights belonging to neuron 669 (the "German
# neuron"). It does not depend on the attention layer being inspected, so it
# is read once instead of on every loop iteration.
neuron_weight = model.W_out[3, 669]
state_dict = model.state_dict()

for layer in range(3, 6):
    W_Q = state_dict[f'blocks.{layer}.attn.W_Q']  # [8, 512, 64]
    # Fixed: the query bias was previously read from the `b_K` key.
    b_Q = state_dict[f'blocks.{layer}.attn.b_Q']
    W_K = state_dict[f'blocks.{layer}.attn.W_K']
    b_K = state_dict[f'blocks.{layer}.attn.b_K']

    # A large norm implies that the German neuron is important to the head
    query = get_intermediate_tensor(neuron_weight, W_Q, b_Q)  # n_heads d_head
    key = get_intermediate_tensor(neuron_weight, W_K, b_K)

    # Per-head dot product between the query and key projections of the
    # neuron direction: an unscaled proxy for how strongly a position carrying
    # the German direction would attend to another position carrying it.
    attention_pattern = einsum(query, key,
                               "n_heads d_head, n_heads d_head -> n_heads")

    print(layer, attention_pattern)

# The only large-ish value (-0.1864) is at head index 3 of layer 5 (both zero-indexed), i.e. L5H3.

3 tensor([ 0.0326,  0.0236, -0.0164,  0.0012,  0.0001, -0.0072, -0.0073, -0.0014],
       device='cuda:0')
4 tensor([-0.0277,  0.0260, -0.0124,  0.0549, -0.0376, -0.0457, -0.0861,  0.0018],
       device='cuda:0')
5 tensor([-0.0312,  0.0192,  0.0550, -0.1864,  0.0231,  0.0341,  0.0783, -0.0077],
       device='cuda:0')


### L5H3

In [24]:
class SizeLimitedObject:
    """Display wrapper that renders another object's HTML inside a styled <div>.

    The wrapped object must expose `_repr_html_()`. The two limits are CSS
    length strings that are interpolated verbatim into the div's `max-width`
    and `max-height` style properties.
    """

    def __init__(self, obj, max_width='500px', max_height='500px'):
        self.obj, self.max_width, self.max_height = obj, max_width, max_height

    def _repr_html_(self):
        """Return the wrapped object's HTML repr surrounded by the sizing <div>."""
        inner_html = self.obj._repr_html_()
        return f"""
        <div style='max-width: {self.max_width}; max-height: {self.max_height}; padding: 20px;'>
            {inner_html}
        </div>
        """

In [19]:
# Attention head under inspection.
layer, head = 5, 3

# Mixed English/German prompt, tokenised once and also kept as string tokens
# for labelling the visualisation.
prompt = "During my morning jog, or \"morgendlichen Lauf,\" I enjoy"
tokens = model.to_tokens(prompt)
str_tokens = model.to_str_tokens(tokens)

def disable_german_hook(value, hook):
    """Forward hook that pins index 669 of the last axis to a constant.

    Args:
        value: 3-D activation tensor; it is modified in place.
        hook: hook point supplied by the hooking framework (unused).

    Returns:
        The same tensor object, with `value[:, :, 669]` set to -0.1.
    """
    neuron_index = 669
    pinned_activation = -0.1
    value[:, :, neuron_index] = pinned_activation
    return value
# Attach the ablation hook to the post-activation output of the layer-3 MLP.
german_neuron_hook_name = f'blocks.{3}.mlp.hook_post'
fwd_hooks = [(german_neuron_hook_name, disable_german_hook)]

# NOTE(review): presumably runs the prompt once unmodified and once with
# `fwd_hooks` applied, returning the loss and activation cache of each run --
# confirm against haystack_utils.
(
    original_loss,
    ablated_loss,
    original_cache,
    ablated_cache,
) = haystack_utils.get_caches_single_prompt(prompt, model, fwd_hooks)

# Attention patterns of the inspected layer, with and without the ablation.
block_name = f'blocks.{layer}.attn.hook_pattern'
original_activations, ablated_activations = original_cache[block_name], ablated_cache[block_name]

# Change in attention caused by the ablation (original minus ablated).
difference = original_activations - ablated_activations

In [30]:
# Sanity-check the shape of the attention-pattern difference.
# NOTE(review): presumably [batch, head, query_pos, key_pos] -- confirm.
print(difference.shape)

torch.Size([1, 8, 17, 17])


In [25]:
print(f"Layer {layer} Head {head} Attention Pattern:")
# Slice with `head` instead of a literal 3 so the plotted head always matches
# the head named in the title above.
head_attention = cv.attention.attention_pattern(tokens=str_tokens, attention=original_activations[0, head, :, :])
sized_viz = SizeLimitedObject(head_attention)
sized_viz

Layer 5 Head 3 Attention Pattern:


## Average German and English attention patterns

In [29]:
# Take the first 500 examples of each corpus as candidates.
german_candidates = haystack_utils.load_txt_data("wmt_german_large.txt")[:500]
english_candidates = haystack_utils.load_txt_data("kde4_english.txt")[:500]

# Keep only examples that tokenise to at least 16 tokens, then cap each
# language at 200 examples.
german_data = [text for text in german_candidates if model.to_tokens(text).shape[1] >= 16][:200]
english_data = [text for text in english_candidates if model.to_tokens(text).shape[1] >= 16][:200]

print(len(german_data), len(english_data))

wmt_german_large.txt: Loaded 2459 examples with 800 to 2000 characters each.
kde4_english.txt: Loaded 1007 examples with 501 to 5295 characters each.
200 200


In [44]:
# Attention head whose averaged pattern is analysed below, and the name of the
# hook that exposes that layer's attention pattern.
layer, head = 5, 3
block_name = f'blocks.{layer}.attn.hook_pattern'

def avg_acts(data, layer, head, len_tokens=16):
    """Average one head's attention pattern over a collection of prompts.

    Each prompt is tokenised and truncated to its first `len_tokens` tokens,
    run through the model, and the attention pattern of the requested head is
    accumulated.

    Args:
        data: sized collection of prompt strings. Every prompt must tokenise
            to at least `len_tokens` tokens, otherwise its pattern cannot be
            added to the accumulator.
        layer: index of the attention layer to read the pattern from.
        head: index of the head within that layer.
        len_tokens: number of leading tokens kept from each prompt.

    Returns:
        Tensor of shape [len_tokens, len_tokens]: the sum of the per-prompt
        patterns divided by `len(data)`.
    """
    # Build the hook name from `layer` rather than relying on a global, and
    # allocate on the notebook's `device` so this also works without CUDA.
    pattern_hook_name = f'blocks.{layer}.attn.hook_pattern'
    pattern_sum = torch.zeros(len_tokens, len_tokens, device=device)

    for item in data:
        tokens = model.to_tokens(item)[0, :len_tokens]
        _, cache = model.run_with_cache(tokens)
        pattern_sum += cache[pattern_hook_name][0, head, :, :]

    return pattern_sum / len(data)

In [46]:
# Average attention pattern of the selected head over the German prompts.
german_acts = avg_acts(german_data, layer, head)
print(f"Layer {layer} Head {head} Average Attention Pattern German:")

# Averaged positions have no single token, so every position gets the same
# placeholder label.
placeholder_tokens = ["unk"] * 16
L5H3_german_attention = cv.attention.attention_pattern(tokens=placeholder_tokens, attention=german_acts)
sized_viz = SizeLimitedObject(L5H3_german_attention)
sized_viz

Layer 5 Head 3 Average Attention Pattern German:


In [47]:
# Average attention pattern of the selected head over the English prompts.
english_acts = avg_acts(english_data, layer, head)
print(f"Layer {layer} Head {head} Average Attention Pattern English:")

# Averaged positions have no single token, so every position gets the same
# placeholder label.
placeholder_tokens = ["unk"] * 16
L5H3_english_attention = cv.attention.attention_pattern(tokens=placeholder_tokens, attention=english_acts)
sized_viz = SizeLimitedObject(L5H3_english_attention)
sized_viz

Layer 5 Head 3 Average Attention Pattern English:
