In [1]:
# # Enable python import reloading
# %load_ext autoreload
# %autoreload 2

# # Imports
# import numpy as np
from transformer_lens import HookedTransformer
import torch as th
from neuron_text_simplifier import NeuronTextSimplifier

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = th.device("cuda") if th.cuda.is_available() else th.device("cpu")

# Load the pretrained checkpoint onto that device and switch it to eval mode.
model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-160m-deduped",
    device=device,
)
model.eval()

# Sample sentence reused by later cells.
text = "The quick brown fox jumps over the lazy dog"

# NOTE(review): 6 and 3071 presumably select a layer index and a neuron index —
# confirm against NeuronTextSimplifier's constructor.
simplifier = NeuronTextSimplifier(model, 6, 3071)

  from .autonotebook import tqdm as notebook_tqdm
Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m-deduped into HookedTransformer


In [5]:
# Short sample strings handed to the simplifier as one batch of inputs.
input_text_list = ["a b c", "1 2", " the"]
# NOTE(review): presumably renders one iterative-pruning visualization per input
# string — confirm against NeuronTextSimplifier.multi_visualize_text_color_iteratively.
simplifier.multi_visualize_text_color_iteratively(input_text_list)

In [3]:
from circuitsvis.activations import text_neuron_activations
# String form of every token in `text`; used as the labels in the visualization.
split_text = model.to_str_tokens(text, prepend_bos=False)
# Token ids for the same sentence, without a prepended BOS token.
tokens = simplifier.model.to_tokens(text, prepend_bos=False)
# Activations reported by the simplifier, rounded to two decimals and laid out
# as (n_values, 1, 1) before being passed to the visualization.
act = simplifier.get_neuron_activation(tokens)
act = th.tensor([round(value, 2) for value in act]).reshape(-1, 1, 1)
text_neuron_activations(tokens=split_text, activations=act)

In [13]:
import numpy as np
# Stand-alone demo of text_neuron_activations on synthetic data.
tokens = [
    'Hi', ' and', ' welcome', ' to', ' the', ' Attention', ' Patterns', ' example',
]
n_layers = 1
n_neurons_per_layer = 1

# Random scores of shape (n_tokens, n_layers, n_neurons_per_layer), normalized
# with a softmax over the token axis (axis 0).
exp_scores = np.exp(
    np.random.normal(size=(len(tokens), n_layers, n_neurons_per_layer))
)
activations = exp_scores / exp_scores.sum(axis=0, keepdims=True)
text_neuron_activations(tokens=tokens, activations=activations)

In [21]:
simplifier = NeuronTextSimplifier(model, 6, 3071)

text = " The quick brown fox jumps over the lazy dog"

# Greedy token pruning. At every step, try deleting each remaining token in
# turn, read the activation reported for the last position of the shortened
# sequence, and permanently drop the token whose removal leaves that value
# highest. Repeat until a single token is left.
tokens = simplifier.model.to_tokens(text, prepend_bos=False)[0]
original_activation = th.tensor(simplifier.get_neuron_activation(tokens))

# One entry per pruning step: the token strings and a (n_tokens, 1, 1)
# activation tensor, starting with the unpruned sentence.
text_list = [simplifier.model.to_str_tokens(text, prepend_bos=False)]
act_list = [original_activation.reshape(-1, 1, 1)]

while len(tokens) > 1:
    # changes[i] = last-position activation of the sequence with token i removed.
    changes = th.zeros(len(tokens))
    for i in range(len(tokens)):
        changes[i] = simplifier.get_neuron_activation(th.cat((tokens[:i], tokens[i + 1:])))[-1]
    max_ind = int(changes.argmax())
    tokens = th.cat((tokens[:max_ind], tokens[max_ind + 1:]))
    out_text = simplifier.model.to_string(tokens)
    # Build the labels straight from the token ids. Decoding to a string and
    # re-tokenizing it can merge or split tokens differently, which would leave
    # the labels out of step with the activations computed on `tokens`.
    text_list.append(simplifier.model.to_str_tokens(tokens))
    act_list.append(th.tensor(simplifier.get_neuron_activation(tokens)).reshape(-1, 1, 1))
    print(simplifier.text_to_activations_print(out_text))

# Stored in `j` (not displayed here) because a later cell reads `j.cdn_src`.
j = text_neuron_activations(tokens=text_list, activations=act_list)

 The [0.26] quick [-0.83] brown [-0.09] fox [-0.22] jumps [-0.37] over [-1.03] lazy [-0.65] dog [0.27]
 The [0.26] quick [-0.83] brown [-0.09] fox [-0.22] jumps [-0.37] over [-1.03] dog [0.51]
 The [0.26] quick [-0.83] brown [-0.09] fox [-0.22] over [-0.29] dog [0.53]
 The [0.26] quick [-0.83] fox [0.05] over [-0.15] dog [0.56]
 The [0.26] fox [0.27] over [0.19] dog [0.54]
 fox [0.32] over [-0.11] dog [0.43]
 fox [0.32] dog [0.25]
 fox [0.32]


In [46]:
simplifier = NeuronTextSimplifier(model, 6, 3071)

text = " a b c"

# Greedy token pruning, rendered as ONE flat token sequence: every pruning step
# contributes its tokens followed by a "\n" separator (with a 0.0 placeholder
# activation), so the steps show up as consecutive lines in the visualization.
# At every step the token whose removal leaves the highest last-position
# activation is dropped, until a single token is left.
tokens = simplifier.model.to_tokens(text, prepend_bos=False)[0]
original_activation = simplifier.get_neuron_activation(tokens)
text_list = simplifier.model.to_str_tokens(text, prepend_bos=False) + ["\n"]
act_list = original_activation + [0.0]

while len(tokens) > 1:
    # changes[i] = last-position activation of the sequence with token i removed.
    changes = th.zeros(len(tokens))
    for i in range(len(tokens)):
        changes[i] = simplifier.get_neuron_activation(th.cat((tokens[:i], tokens[i + 1:])))[-1]
    max_ind = int(changes.argmax())
    tokens = th.cat((tokens[:max_ind], tokens[max_ind + 1:]))
    # Labels come straight from the token ids: a decode/re-tokenize round trip
    # can change the token count and shift every later label against its
    # activation in the flat lists.
    text_list += simplifier.model.to_str_tokens(tokens) + ["\n"]
    act_list += simplifier.get_neuron_activation(tokens) + [0.0]

# text_neuron_activations is given (n_tokens, 1, 1) here: one layer, one neuron.
act_list = th.tensor(act_list).reshape(-1, 1, 1)
# A top-level `return` is a SyntaxError in a notebook cell; a bare last
# expression is what makes the notebook render the visualization.
text_neuron_activations(tokens=text_list, activations=act_list)

In [2]:
# Re-create the simplifier with the same arguments used in the earlier cells.
simplifier = NeuronTextSimplifier(model, 6, 3071)

# Sample inputs for the multi-text experiment. The commented-out block below is
# a disabled draft that repeats the greedy pruning loop once per string in this
# list and collects the per-string results for text_neuron_activations.
input_text_list = ["a b c", "1 2", " the"]

# text_list_final = []
# act_list_final = []
# for text in input_text_list:
#     tokens = simplifier.model.to_tokens(text, prepend_bos=False)[0]
#     original_activation = simplifier.get_neuron_activation(tokens)
#     text_list = simplifier.model.to_str_tokens(text, prepend_bos=False) + ["\n"]
#     act_list = original_activation + [0.0]
#     changes = th.zeros(tokens.shape[-1])+100
#     for j in range(len(tokens)-1):
#         for i in range(len(tokens)):
#             changes[i] = simplifier.get_neuron_activation(th.cat((tokens[:i],tokens[i+1:])))[-1]
#         max_ind = changes.argmax()
#         changes = th.cat((changes[:max_ind], changes[max_ind+1:]))
#         tokens = th.cat((tokens[:max_ind],tokens[max_ind+1:]))
#         out_text = simplifier.model.to_string(tokens)
#         text_list += simplifier.model.to_str_tokens(out_text, prepend_bos=False) + ["\n"]
#         act_list += simplifier.get_neuron_activation(tokens) + [0.0]
#     text_list = text_list
#     act_list = th.tensor(act_list).reshape(-1,1,1)
#     text_list_final.append(text_list)
#     act_list_final.append(act_list)
# text_neuron_activations(tokens=text_list_final, activations=act_list_final)


AttributeError: 'float' object has no attribute 'ndim'

In [40]:
# Display the current value of act_list. This depends on which pruning cell ran
# last: one of them leaves a list of per-step tensors, the other a single
# flattened (n, 1, 1) tensor.
act_list

tensor([[[ 0.2519]],

        [[-0.3919]],

        [[-0.8256]],

        [[ 0.0000]],

        [[ 0.2519]],

        [[-0.1400]],

        [[ 0.0000]],

        [[ 0.2809]],

        [[ 0.0000]]])

In [35]:
# `j` is the object returned by text_neuron_activations in the
# " The quick brown fox ..." pruning cell; cdn_src is its HTML/JS source string.
l = j.cdn_src
# l = l.replace(', "firstDimensionName": "Layer", "secondDimensionName": "Neuron"', '')
print(l)
# `k` is an alias of `j`, not a copy: assigning k.cdn_src also changes j.
# With the replace above commented out, this write-back leaves the source unchanged.
k = j
k.cdn_src = l
# Bare last expression so the notebook renders the visualization.
k

<div id="circuits-vis-b83c7cb8-ef4d" style="margin: 15px 0;"/>
    <script crossorigin type="module">
    import { render, TextNeuronActivations } from "https://unpkg.com/circuitsvis@1.39.1/dist/cdn/esm.js";
    render(
      "circuits-vis-b83c7cb8-ef4d",
      TextNeuronActivations,
      {"tokens": [[" The", " quick", " brown", " fox", " jumps", " over", " the", " lazy", " dog"], [" The", " quick", " brown", " fox", " jumps", " over", " lazy", " dog"], [" The", " quick", " brown", " fox", " jumps", " over", " dog"], [" The", " quick", " brown", " fox", " over", " dog"], [" The", " quick", " fox", " over", " dog"], [" The", " fox", " over", " dog"], [" fox", " over", " dog"], [" fox", " dog"], [" fox"]], "activations": [[[[10.069885253906]], [[-0.825380802154541]], [[-0.08990397304296494]], [[-0.2231120765209198]], [[-0.37210991978645325]], [[-1.0284030437469482]], [[-1.4960975646972656]], [[-1.0088225603103638]], [[-0.19542530179023743]]], [[[10.069885253906]], [[-0.825380802154541]]

In [7]:
# Shape of the second entry of act_list once laid out as (-1, 1, 1). The
# recorded result (8 rows) matches the list-of-tensors act_list built by the
# " The quick brown fox ..." pruning cell, i.e. the state after one removal.
act_list[1].reshape(-1,1,1).shape

torch.Size([8, 1, 1])

In [6]:
# Stand-alone demo of text_neuron_activations with several samples of
# different lengths, each with its own activation array.
tokens = [
    ['Hi', ' and', ' welcome', ' to', ' the', ' Attention', ' Patterns', ' example'],
    ['This', ' is', ' another', ' example', ' of', ' colored', ' tokens'],
    ['And', ' here', ' another', ' example', ' of', ' colored', ' tokens', ' with', ' more', ' words.'],
    ['This', ' is', ' another', ' example', ' of', ' tokens.'],
]
n_layers = 3
n_neurons_per_layer = 4

# One random (n_tokens, n_layers, n_neurons_per_layer) array per sample,
# scaled by 5; arrays are drawn in sample order.
activations = [
    np.random.normal(size=(len(sample), n_layers, n_neurons_per_layer)) * 5
    for sample in tokens
]
text_neuron_activations(tokens=tokens, activations=activations)

In [7]:
# colored_tokens is not imported anywhere else in this notebook, so without
# this import the cell fails with a NameError on a fresh kernel.
from circuitsvis.tokens import colored_tokens

# Stand-alone demo: one random value per token, passed as a plain Python list.
tokens = ['Hi', ' and', ' welcome', ' to', ' the', ' Colored', ' Tokens', ' example']
values = np.random.normal(size=(len(tokens))).tolist()
colored_tokens(tokens, values)