### Setup

In [1]:
import torch
from transformer_lens import HookedTransformer
from jaxtyping import Float
from torch import Tensor
import plotly.io as pio
import numpy as np
import pandas as pd
from tqdm import trange
from collections import defaultdict, Counter
from torchmetrics.regression import SpearmanCorrCoef
import plotly_express as px
import circuitsvis

pio.renderers.default = "notebook_connected+notebook"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

import haystack_utils
import hook_utils

%reload_ext autoreload
%autoreload 2


In [2]:
model = HookedTransformer.from_pretrained("EleutherAI/pythia-70m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device)

german_data = haystack_utils.load_json_data("data/german_europarl.json")[:200]
english_data = haystack_utils.load_json_data("data/english_europarl.json")[:200]

LAYER, NEURON = 3, 669


german_prompt = "beraten. H\u00f6here Investitionen in Forschung und Entwicklung sowie die Erfassung und \
    Verarbeitung von zuverl\u00e4ssigen Daten w\u00fcrde zu einer solideren und nachhaltigen Gemeinsamen \
    Fischereipolitik f\u00fchren.\nAber obwohl der Satz, den ich von einem Wissenschaftler geh\u00f6rt \
    habe (\"Das Problem ist nicht Geld, sondern Personal\") die Lage gut darstellt, werde ich nicht \
    diejenige sein, die sagt, dass die Fischereiforschung gut mit finanziellen Mitteln ausgestattet \
    ist. Ich werde vielmehr sagen, dass wir ein doppeltes Problem haben.\nAn erster Stelle, Herr \
    Kommissar, die im Siebten Rahmenprogramm f\u00fcr Meeresforschung festgelegten Betr\u00e4ge, \
    die ein horizontales Thema h\u00e4tten sein sollen, scheinen f\u00fcr den integrierten Ansatz, \
    der bei dieser Angelegenheit gegenw\u00e4rtig gew\u00fcnscht wird, unzureichend zu sein.\nAu\u00dferdem, \
    Herr Kommissar, haben Wissenschaftler - und ich kann Ihnen versichern, dass ich vor und w\u00e4hrend \
    der Ausarbeitung dieses Berichts mit vielen gesprochen habe - Probleme bei der Einreichung von Projekten \
    unter dem Siebten Forschungsrahmenprogramm. Diese Probleme sind"

english_prompt = "given the generally greater adeptness of children at using audio-visual resources, in some \
    areas there are dangers of their obtaining access to unsuitable or harmful material. This is most obvious \
    in the fields of overt sexual material and gratuitous violence.\nThe principles which have guided this \
    report are to encourage greater public awareness of these issues and to support parental responsibility \
    and to develop co-operation between the content providers, consumer organisations and the \
    respective authorities, both national and European. Self-regulation is considered to be the \
    main instrument, underpinned by legal requirements where necessary.\nThe report, which \
    analyses the Commission's evaluation report, is primarily concerned with the Internet \
    and with video games, as it was felt important not to anticipate a possible future \
    review of the Television without Frontiers directive. The report calls for user-friendly content filter systems"

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


In [3]:

tokens = model.to_tokens(english_prompt[:50] + ' ich bin haute')[0]
# str_tokens = model.to_str_tokens(tokens)
# _, cache = model.run_with_cache(tokens)
# head_attention = circuitsvis.attention.attention_pattern(tokens=str_tokens, attention=cache['blocks.2.attn.hook_pattern'][0, 5, :, :])
# sized_viz = SizeLimitedObject(head_attention)

def mask_scores(attn_scores: Float[Tensor, "query_nctx key_nctx"]):
    '''Mask the attention scores so that tokens don't attend to previous tokens.'''
    # assert attn_scores.shape == (model.cfg.n_ctx, model.cfg.n_ctx)
    mask = torch.tril(torch.ones_like(attn_scores)).bool()
    neg_inf = torch.tensor(-1.0e6).to(attn_scores.device)
    masked_attn_scores = torch.where(mask, attn_scores, neg_inf)
    return masked_attn_scores

layer = 2
head_index = 5

haystack_utils.clean_cache()
W_E = model.W_E
W_QK = model.W_Q[layer, head_index] @ model.W_K[layer, head_index].T
# Bilinear form representing how much attention head token pays to each other token via L2H5
pos_by_pos_scores = W_E @ W_QK @ W_E.T

masked_scaled = mask_scores(pos_by_pos_scores / model.cfg.d_head ** 0.5)
pos_by_pos_pattern = torch.softmax(masked_scaled, dim=-1)

# The largest W_E W_QK W_E affinities are all random punctuation and the like.
# top_indices = torch.topk(pos_by_pos_pattern.flatten(), 1000, dim=-1).indices.cpu().numpy()
# first_tokens, second_tokens = np.unravel_index(top_indices, pos_by_pos_pattern.shape)
# print(list(zip(model.to_str_tokens(first_tokens), model.to_str_tokens(second_tokens))))

[('<|endoftext|>', '<|endoftext|>'), ('<|padding|>', '<|endoftext|>'), ('<|padding|>', '<|padding|>'), ('!', '<|endoftext|>'), ('!', '!'), ('!', '<|padding|>'), ('"', '!'), ('"', '"'), ('"', '<|padding|>'), ('"', '<|endoftext|>'), ('#', '<|endoftext|>'), ('#', '#'), ('#', '"'), ('#', '<|padding|>'), ('#', '!'), ('$', '<|endoftext|>'), ('$', '!'), ('$', '$'), ('$', '"'), ('$', '#'), ('$', '<|padding|>'), ('%', '%'), ('%', '<|endoftext|>'), ('%', '"'), ('%', '#'), ('%', '<|padding|>'), ('%', '$'), ('%', '!'), ('&', '%'), ('&', '&'), ('&', '#'), ('&', '"'), ('&', '<|endoftext|>'), ('&', '<|padding|>'), ('&', '!'), ('&', '$'), ("'", '"'), ("'", "'"), ("'", '%'), ("'", '!'), ("'", '<|endoftext|>'), ("'", '<|padding|>'), ("'", '$'), ("'", '&'), ("'", '#'), ('(', '('), ('(', '#'), ('(', '$'), ('(', '"'), ('(', '<|padding|>'), ('(', "'"), ('(', '<|endoftext|>'), ('(', '&'), ('(', '!'), ('(', '%'), (')', '('), (')', ')'), (')', '<|endoftext|>'), (')', '!'), (')', '"'), (')', '$'), (')', "'"), (

### Utils

In [22]:
def batched_dot_product(x: torch.Tensor, y: torch.Tensor):
    return torch.vmap(torch.dot)(x, y)
    
def neuron_to_context_neuron_DLA(
        model: HookedTransformer, 
        prompt: str | list[str], 
        pos=np.s_[-1:], 
        context_neuron=tuple[int, int]
) -> tuple[Float[Tensor, "component"], list[str]]:
    '''Gets full resid decomposition including all neurons. Unbatched.'''
    tokens = model.to_tokens(prompt)
    _, cache = model.run_with_cache(prompt)
    layer, neuron = context_neuron
    neuron_attrs, neuron_labels = cache.stack_neuron_results(layer, apply_ln=True, return_labels=True, pos_slice=pos)
    neuron_attrs = neuron_attrs.squeeze(1)
    
    answer_residual_direction = model.W_in[layer, :, neuron].unsqueeze(0)  # [1 d_model]

    results = []
    for i in range(neuron_attrs.shape[1]):
        results.append(batched_dot_product(neuron_attrs[:, i], answer_residual_direction.repeat(neuron_attrs.shape[0], 1)))
    return torch.stack(results), neuron_labels

def resid_to_context_neuron_DLA(
        model: HookedTransformer, 
        prompt: str | list[str], 
        pos=np.s_[-1:], 
        context_neuron:tuple[int, int]=(0,0)
) -> tuple[Float[Tensor, "component"], list[str]]:
    '''Gets full resid decomposition including all neurons. Unbatched.'''
    tokens = model.to_tokens(prompt)
    _, cache = model.run_with_cache(prompt)
    layer, neuron = context_neuron
    all_attrs, labels = cache.get_full_resid_decomposition(layer, apply_ln=True, return_labels=True, pos_slice=pos)
    all_attrs = all_attrs.squeeze(1)
    
    answer_residual_direction = model.W_in[layer, :, neuron].unsqueeze(0)  # [1 d_model]

    results = []
    for i in range(all_attrs.shape[1]):
        results.append(batched_dot_product(all_attrs[:, i], answer_residual_direction.repeat(all_attrs.shape[0], 1)))
    return torch.stack(results), labels

def get_neuron_mean_acts(model: HookedTransformer, data: list[str], layer_neuron_dict: dict[int, list[int]]) -> tuple[torch.Tensor, torch.Tensor]:
    sorted_layer_neuron_tuples = []
    sorted_acts = []

    for layer, neurons in layer_neuron_dict.items():
        mean_acts = haystack_utils.get_mlp_activations(data, layer, model, context_crop_start=0, hook_pre=False, neurons=neurons, disable_tqdm=True)
        sorted_layer_neuron_tuples.extend([(layer, neuron) for neuron in neurons])
        sorted_acts.extend(mean_acts)
        assert len(sorted_layer_neuron_tuples) == len(sorted_acts)

    return sorted_layer_neuron_tuples, sorted_acts

def get_unspecified_neurons(model: HookedTransformer, layer_neuron_dict: dict[int, list[int]]):
    unspecified = []
    for layer in range(model.cfg.n_layers):
        for neuron in range(model.cfg.d_mlp):
            if not neuron in layer_neuron_dict[layer]:
                unspecified.append((layer, neuron))
    return unspecified

def get_neuron_loss_increases(model: HookedTransformer, data: list[str], prompt: str, positionwise: bool=False) -> torch.Tensor:
    n_tokens = model.to_tokens(prompt).shape[1] - 1
    original_loss = model([prompt], return_type='loss', loss_per_token=positionwise)
    
    losses = []
    for layer in trange(model.cfg.n_layers):
        mean_acts = haystack_utils.get_mlp_activations(data[:200], layer, model, disable_tqdm=True, context_crop_start=0)
        for neuron in range(model.cfg.d_mlp):
            hook = hook_utils.get_ablate_neuron_hook(layer, neuron, mean_acts[neuron])
            with model.hooks([hook]):
                ablated_loss = model([prompt], return_type='loss', loss_per_token=positionwise)
                losses.append((ablated_loss - original_loss)[0])
    return torch.stack(losses).reshape(n_tokens, model.cfg.n_layers * model.cfg.d_mlp)

def compare_dla_and_ablation(model: HookedTransformer, dla_attrs_by_neuron: torch.Tensor, ablation_losses_by_neuron: torch.Tensor, num_neurons=20):
    print("DLA:")
    values, indices = torch.topk(dla_attrs_by_neuron, num_neurons, dim=-1)
    layer_indices, neuron_indices = np.unravel_index(indices.cpu().numpy(), (model.cfg.n_layers, model.cfg.d_mlp))
    print(list(zip(layer_indices.tolist(), neuron_indices.tolist())))
    print(dla_attrs_by_neuron[indices.tolist()])

    print("Ablation:")
    loss_increases_by_neuron = ablation_losses_by_neuron
    values, indices = torch.topk(loss_increases_by_neuron, num_neurons)
    layer_indices, neuron_indices = np.unravel_index(indices.cpu().numpy()[:num_neurons], (model.cfg.n_layers, model.cfg.d_mlp))
    print(list(zip(layer_indices.tolist(), neuron_indices.tolist())))
    print(dla_attrs_by_neuron[indices.tolist()])

def get_hook_inputs_for_token_index(model: HookedTransformer, data: list[str], loss_increases_by_neuron: torch.Tensor, k=40):
    values, indices = torch.topk(loss_increases_by_neuron, k)

    layer_indices, neuron_indices = np.unravel_index(indices.cpu().numpy(), (model.cfg.n_layers, model.cfg.d_mlp))
    layer_neuron_dict = defaultdict(list)
    for layer, neuron in zip(layer_indices, neuron_indices):
        layer_neuron_dict[layer].append(neuron)

    sorted_dla_layer_neuron_tuples = []
    sorted_acts = []
    for layer, neurons in layer_neuron_dict.items():
        mean_acts = haystack_utils.get_mlp_activations(data, layer, model, context_crop_start=0, neurons=neurons, disable_tqdm=True)
        sorted_dla_layer_neuron_tuples.extend([(layer, neuron) for neuron in neurons])
        sorted_acts.extend(mean_acts)
        assert len(sorted_dla_layer_neuron_tuples) == len(sorted_acts)

    return sorted_dla_layer_neuron_tuples, sorted_acts

def unravel_top_k(neuron_attrs: torch.Tensor, k: int=10):
    values, indices = torch.topk(neuron_attrs, k)
    layer_indices, neuron_indices = np.unravel_index(indices.cpu().numpy(), (model.cfg.n_layers, model.cfg.d_mlp))
    return list(zip(layer_indices.tolist(), neuron_indices.tolist()))

def resid_to_head_DLA(
        model: HookedTransformer, 
        prompt: str | list[str], 
        head: tuple[int, int],
        pos=np.s_[-1:], 
        
) -> tuple[Float[Tensor, "component"], list[str]]:
    '''Gets full resid decomposition and return the composition of each element of the given K matrix. Unbatched.'''
    tokens = model.to_tokens(prompt)
    _, cache = model.run_with_cache(prompt)
    layer, head_index = head
    all_attrs, labels = cache.get_full_resid_decomposition(layer, apply_ln=True, return_labels=True, pos_slice=pos)
    all_attrs = all_attrs.squeeze(1)
    answer_residual_direction = model.W_K[layer, head_index, :]
    results = torch.zeros(all_attrs.shape[1], all_attrs.shape[0], answer_residual_direction.shape[1])
    for i in range(all_attrs.shape[1]): # for each token
        for j in range(answer_residual_direction.shape[1]): # for each direction in head input
            token_attrs = all_attrs[:, i]
            answer = answer_residual_direction[:, j].unsqueeze(0).repeat(token_attrs.shape[0], 1)
            results[i, :, j] = batched_dot_product(token_attrs, answer)
    return results, labels


def mask_scores(attn_scores: Float[Tensor, "query_nctx key_nctx"]):
    '''Mask the attention scores so that tokens don't attend to previous tokens.'''
    # assert attn_scores.shape == (model.cfg.n_ctx, model.cfg.n_ctx)
    mask = torch.tril(torch.ones_like(attn_scores)).bool()
    neg_inf = torch.tensor(-1.0e6).to(attn_scores.device)
    masked_attn_scores = torch.where(mask, attn_scores, neg_inf)
    return masked_attn_scores
    
def resid_to_head_DLA_custom(
        model: HookedTransformer, 
        prompt: str | list[str], 
        head: tuple[int, int]
        
) -> tuple[Float[Tensor, "component"], list[str]]:
    '''For last two tokens, figure out which components contribute the most to them paying attention to each other.'''
    _, cache = model.run_with_cache(prompt)
    layer, head_index = head

    all_attrs, labels = cache.get_full_resid_decomposition(layer, apply_ln=True, return_labels=True, pos_slice=np.s_[-2:])
    all_attrs = all_attrs.squeeze(1).permute(1, 0, 2)

    W_QK = model.W_Q[layer, head_index] @ model.W_K[layer, head_index].T

    pos_by_pos_scores = all_attrs[0] @ W_QK @ all_attrs[1].T
    # masked_scaled = mask_scores(pos_by_pos_scores / model.cfg.d_head ** 0.5)
    # pos_by_pos_pattern = torch.softmax(masked_scaled, dim=-1)
    return pos_by_pos_scores, labels


In [23]:
pattern, labels = resid_to_head_DLA_custom(model, german_prompt[:20], (2, 5))
subset_tensor = pattern[:30, :30].cpu().numpy()
df = pd.DataFrame(subset_tensor, columns=labels[:30])
fig = px.imshow(df)
fig.update_layout(
    autosize=False,
    width=600,
    height=500)
fig.show()

subset_tensor = pattern[-16:, -16:].cpu().numpy()
df = pd.DataFrame(subset_tensor, columns=labels[:16])
fig = px.imshow(df)
fig.update_layout(
    autosize=False,
    width=600,
    height=500)
fig.show()

Tried to stack head results when they weren't cached. Computing head results now


In [11]:
test_prompt = " given the generally greater adeptness of children ich bin"
results, labels = resid_to_head_DLA(model, test_prompt, (2, 5), pos=np.s_[-2:])

results[-1].shape

NameError: name 'resid_to_head_DLA_custom' is not defined

### Investigate

In [8]:
def upstream_for_prompt(prompt):
    n_tokens = model.to_tokens(prompt).shape[1]
    neuron_attrs_by_token, labels = neuron_to_context_neuron_DLA(model, prompt, np.s_[-n_tokens:], (3, 669))

    counter = Counter()
    for i in range(n_tokens):
        counter.update(unravel_top_k(neuron_attrs_by_token[i], k=10))
    return counter

In [None]:
# 2, 1449 reoccurs
# print("sample prompt:", upstream_for_prompt(german_prompt).most_common())
n_tokens = model.to_tokens(german_prompt).shape[1]

prompts = [] # german_prompt, english_prompt
for token in [" ä", " ö", " ü", " ß"]:
    prompts.append("".join([token for _ in range(n_tokens)]))
    print(token, upstream_for_prompt(prompts[-1]).most_common())

neuron_attrs_by_token, labels = neuron_to_context_neuron_DLA(model, german_prompt, np.s_[-n_tokens:], (3, 669))

# fig = px.histogram(neuron_attrs_by_token[3].cpu())
# fig.show()

# neuron_attrs_by_token, labels = neuron_to_context_neuron_DLA(model, english_prompt, np.s_[-n_tokens:], (3, 669))

# fig = px.histogram(neuron_attrs_by_token[3].cpu())
# fig.show()

In [10]:
# Get the rank correlation within long prompts of different types
# Hopefully it's highly correlated
# Get the rank correlation between samples or average of prompts of different types
spearman = SpearmanCorrCoef()

prompt_mean_rhos = torch.zeros(len(prompts))
for prompt_n, prompt in enumerate(prompts):
    n_tokens = model.to_tokens(prompt).shape[1]
    neuron_attrs_by_token, _ = neuron_to_context_neuron_DLA(model, prompt, np.s_[-n_tokens:], (3, 669)) # tokens d_mlp
    average_neuron_attrs = neuron_attrs_by_token.mean(dim=0) # d_mlp

    rhos = torch.zeros(n_tokens)
    for i in range(n_tokens):
        rhos[i] = spearman(neuron_attrs_by_token[i], average_neuron_attrs)
    prompt_mean_rhos[prompt_n] = rhos.mean()

print(prompt_mean_rhos)

average_neuron_attrs = []
for prompt_n, prompt in enumerate(prompts):
    n_tokens = model.to_tokens(prompt).shape[1]
    neuron_attrs_by_token, _ = neuron_to_context_neuron_DLA(model, prompt, np.s_[-n_tokens:], (3, 669))
    average_neuron_attrs.append(neuron_attrs_by_token.mean(dim=0))
    
rhos = []
for i in range(len(average_neuron_attrs)):
    for j in range(i + 1, len(average_neuron_attrs)):
        if i == j:
            continue
        rhos.append(f'{spearman(average_neuron_attrs[i], average_neuron_attrs[j]).item():2f}')

print(rhos)


Metric `SpearmanCorrcoef` will save all targets and predictions in the buffer. For large datasets, this may lead to large memory footprint.



tensor([0.9277, 0.9443, 0.9657, 0.7451])
['0.467906', '0.456237', '0.246673', '0.520210', '0.170303', '0.207521']


### First run


Ablate each neuron in turn and look at how it affects the context neuron value (meaned over all prompts)

Check whether it activates for German words in an English context, both single common german chars and a full word
Ablate each neuron in turn and look at how it affects the context neuron value (meaned over a German token position within an English)

Collect a list

Look for head that moves German tokens
Look for head that moves German n-grams
Look for n-gram detector and see if it moves from there



In [5]:
# Measure context neuron activation for many German tokens that never coalesce into German words
# Measure context neuron activation for English words with a single German token mixed in
# Measure above but make it semantically clear in the English that a German token is about to appear
# Measure above but with a full German word
# Measure above but semantically clear in English

# Optional (if time permits):
# Measure above but with common German unigrams

# Need a way to measure at position

test_prompt = " given the generally greater adeptness of children ich"
test_prompt = " given the generally greater adeptness of children ich bin"
# test_prompt = " given the generally greater adeptness of children ich bin heute"
_, cache = model.run_with_cache(test_prompt)
print(cache['blocks.3.mlp.hook_post'][0, :, 669])


# check English tokenization, use ' und ' ' ich'

tensor([-0.1671, -0.1619, -0.0637, -0.0139, -0.0676, -0.1653, -0.1330, -0.0700,
        -0.1339, -0.1104,  0.2194,  2.8279])


In [6]:
# items, labels = haystack_utils.DLA([test_prompt], model)

items, labels = resid_to_context_neuron_DLA(model, test_prompt, context_neuron=(3, 669))
haystack_utils.line(items[0].cpu())
# haystack_utils.line(items[0].cpu(), xticks=labels)

Tried to stack head results when they weren't cached. Computing head results now


In [9]:
print(labels[-5:], labels[5:])

['L2N2045', 'L2N2046', 'L2N2047', 'embed', 'bias'] ['L0H5', 'L0H6', 'L0H7', 'L1H0', 'L1H1', 'L1H2', 'L1H3', 'L1H4', 'L1H5', 'L1H6', 'L1H7', 'L2H0', 'L2H1', 'L2H2', 'L2H3', 'L2H4', 'L2H5', 'L2H6', 'L2H7', 'L0N0', 'L0N1', 'L0N2', 'L0N3', 'L0N4', 'L0N5', 'L0N6', 'L0N7', 'L0N8', 'L0N9', 'L0N10', 'L0N11', 'L0N12', 'L0N13', 'L0N14', 'L0N15', 'L0N16', 'L0N17', 'L0N18', 'L0N19', 'L0N20', 'L0N21', 'L0N22', 'L0N23', 'L0N24', 'L0N25', 'L0N26', 'L0N27', 'L0N28', 'L0N29', 'L0N30', 'L0N31', 'L0N32', 'L0N33', 'L0N34', 'L0N35', 'L0N36', 'L0N37', 'L0N38', 'L0N39', 'L0N40', 'L0N41', 'L0N42', 'L0N43', 'L0N44', 'L0N45', 'L0N46', 'L0N47', 'L0N48', 'L0N49', 'L0N50', 'L0N51', 'L0N52', 'L0N53', 'L0N54', 'L0N55', 'L0N56', 'L0N57', 'L0N58', 'L0N59', 'L0N60', 'L0N61', 'L0N62', 'L0N63', 'L0N64', 'L0N65', 'L0N66', 'L0N67', 'L0N68', 'L0N69', 'L0N70', 'L0N71', 'L0N72', 'L0N73', 'L0N74', 'L0N75', 'L0N76', 'L0N77', 'L0N78', 'L0N79', 'L0N80', 'L0N81', 'L0N82', 'L0N83', 'L0N84', 'L0N85', 'L0N86', 'L0N87', 'L0N88', 'L0N8

In [10]:
haystack_utils.line(items[0].cpu())

In [10]:
haystack_utils.line(items[0, :40].cpu(), xticks=labels[:40])

In [11]:
haystack_utils.line(items[0, :40].cpu(), xticks=labels[:40])
# haystack_utils.line(items[0].cpu(), xticks=labels)

In [12]:
# Doesn't activate much on a random prompt
# items, labels = resid_to_context_neuron_DLA(model, "English prompt in the English language", context_neuron=(3, 669))
# haystack_utils.line(items[0].cpu())


Tried to stack head results when they weren't cached. Computing head results now


In [37]:
import circuitsvis

class SizeLimitedObject:
    def __init__(self, obj, max_width='500px', max_height='500px'):
        self.obj = obj
        self.max_width = max_width
        self.max_height = max_height

    def _repr_html_(self):
        return f"""
        <div style='max-width: {self.max_width}; max-height: {self.max_height}; padding: 20px;'>
            {self.obj._repr_html_()}
        </div>
        """

tokens = model.to_str_tokens(model.to_tokens(test_prompt)[0])
print(tokens)

head_attention = circuitsvis.attention.attention_pattern(tokens=tokens, attention=cache['blocks.2.attn.hook_pattern'][0, 5, :, :])
sized_viz = SizeLimitedObject(head_attention, max_height='300px', max_width='300px')
sized_viz

['<|endoftext|>', ' given', ' the', ' generally', ' greater', ' ad', 'ept', 'ness', ' of', ' children', ' ich', ' bin']


In [39]:
# Head which maybe K composes with the last two tokens and L2H5?
tokens = model.to_str_tokens(model.to_tokens(test_prompt)[0])
print(tokens)

head_attention = circuitsvis.attention.attention_pattern(tokens=tokens, attention=cache['blocks.0.attn.hook_pattern'][0, 6, :, :])
sized_viz = SizeLimitedObject(head_attention, max_height='300px', max_width='300px')
sized_viz

['<|endoftext|>', ' given', ' the', ' generally', ' greater', ' ad', 'ept', 'ness', ' of', ' children', ' ich', ' bin']


In [46]:
def info(tensor: torch.Tensor):
    print(tensor.shape, tensor.dtype, "\nnan count:", tensor.isnan().sum().item())

tokens = model.to_tokens(english_prompt[:50] + ' ich bin haute')[0]
# str_tokens = model.to_str_tokens(tokens)
# _, cache = model.run_with_cache(tokens)
# head_attention = circuitsvis.attention.attention_pattern(tokens=str_tokens, attention=cache['blocks.2.attn.hook_pattern'][0, 5, :, :])
# sized_viz = SizeLimitedObject(head_attention)

layer = 2
head_index = 5

haystack_utils.clean_cache()
W_E = model.W_E
W_QK = model.W_Q[layer, head_index] @ model.W_K[layer, head_index].T
# Bilinear form representing how much attention head token pays to each other token via L2H5
pos_by_pos_scores = W_E @ W_QK @ W_E.T

masked_scaled = mask_scores(pos_by_pos_scores / model.cfg.d_head ** 0.5)
pos_by_pos_pattern = torch.softmax(masked_scaled, dim=-1)

top_indices = torch.topk(pos_by_pos_pattern.flatten(), 100, dim=-1).indices.cpu().numpy()
first_tokens, second_tokens = np.unravel_index(top_indices, pos_by_pos_pattern.shape)
print(list(zip(model.to_str_tokens(first_tokens), model.to_str_tokens(second_tokens))))


OutOfMemoryError: CUDA out of memory. Tried to allocate 9.43 GiB (GPU 0; 47.54 GiB total capacity; 28.82 GiB already allocated; 8.44 GiB free; 38.04 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# head_attention = circuitsvis.attention.attention_pattern(tokens=tokens, attention=pos_by_pos_pattern)
# sized_viz = SizeLimitedObject(head_attention, max_height='300px', max_width='300px')
# sized_viz

# from transformer_lens import FactoredMatrix
# fm = FactoredMatrix(model.W_Q[layer, head_index], model.W_K[layer, head_index].T)
# print(fm.__dict__)

# german_tokens = model.to_tokens("ich haute de ä ö ü ß")[0]
# subset_tensor = (pos_by_pos_pattern[german_tokens] @ pos_by_pos_pattern).cpu().numpy()

# Doesn't show anything interesting, just evenly distributed fractional pattern
# subset_tensor = pos_by_pos_pattern[:100, :100].cpu().numpy()
# df = pd.DataFrame(subset_tensor)
# fig = px.imshow(df)
# fig.show()

# tokens = model.to_tokens(english_prompt[:300] + ' ich bin haute')[0]
# str_tokens = model.to_str_tokens(tokens)
# _, cache = model.run_with_cache(tokens)
# head_attention = circuitsvis.attention.attention_pattern(tokens=str_tokens, attention=cache['blocks.2.attn.hook_pattern'][0, 5, :, :])
# sized_viz = SizeLimitedObject(head_attention)
# sized_viz