In [2]:
from transformer_lens import HookedTransformer
from transformer_lens.utils import to_numpy
from IPython.display import HTML

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import numpy as np

In [4]:
# Load the pretrained "gpt2-small" weights into a HookedTransformer; every
# activation-reading helper below uses this module-level `model`.
model_name = "gpt2-small"
model = HookedTransformer.from_pretrained(model_name)

Loaded pretrained model gpt2-small into HookedTransformer


In [5]:
# To get activation for a particular neuron.
def get_neuron_acts(text, layer, neuron_index):
    """Return the activations of one MLP neuron for every token of `text`.

    A forward hook is attached to ``blocks.{layer}.mlp.hook_post``; it keeps
    the slice ``act[0, :, neuron_index]`` of the hooked activation (batch
    element 0, all positions, the requested neuron).  That slice is returned
    after conversion with ``to_numpy``.

    Args:
        text: Prompt passed to ``model.run_with_hooks``.
        layer: Index of the block whose MLP output is hooked.
        neuron_index: Index of the neuron inside that MLP layer.
    """
    # A hook has no way to return a value to us, so it writes into this dict
    # and the enclosing function reads it once the forward pass has finished.
    captured = {}

    def store_neuron_slice(act, hook):
        captured["activation"] = act[0, :, neuron_index]

    hook_point = f"blocks.{layer}.mlp.hook_post"
    model.run_with_hooks(text, fwd_hooks=[(hook_point, store_neuron_slice)])

    neuron_acts = captured["activation"]
    return to_numpy(neuron_acts)

In [133]:
# Reduced the run time from 60 minutes to a few seconds.
def get_neuron_acts_layer(text, layer):
    """Return the activations of every neuron in one MLP layer for `text`.

    Runs a single forward pass with a hook on ``blocks.{layer}.mlp.hook_post``
    and returns a NumPy array indexed as ``[0, neuron, token]``.  The leading
    axis always has length 1 (callers in this notebook ``.squeeze()`` it away).
    Only batch element 0 of the hooked activation is used.

    Args:
        text: Prompt passed to ``model.run_with_hooks``.
        layer: Index of the block whose MLP output is hooked.
    """
    captured = {}

    def caching_hook(act, hook):
        # Hooks cannot hand a value back to the caller, so stash the tensor.
        # act is indexed [batch, token, neuron]; keep batch element 0 only.
        captured["act"] = act[0].detach()

    model.run_with_hooks(
        text, fwd_hooks=[(f"blocks.{layer}.mlp.hook_post", caching_hook)]
    )

    # One transpose to [neuron, token] replaces the previous Python loop that
    # called .tolist() once per neuron.
    per_neuron = captured["act"].transpose(0, 1)
    # .double() keeps the float64 dtype that converting lists of Python floats
    # used to produce; unsqueeze(0) keeps the leading length-1 axis.
    return to_numpy(per_neuron.double().unsqueeze(0))

# cache = {}
# text = "hunting for life"
# layer = 9
# def hook_fun(act, hook):
#     cache["act"] = act
# model.run_with_hooks(text, fwd_hooks=[(f"blocks.{layer}.mlp.hook_post", hook_fun)] )

In [4]:
# Example inputs: layer 9, neuron 652, on a "powers of 10" prompt.
default_layer = 9
default_neuron_index = 652
default_text = "The following is a list of powers of 10: 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000"
# Show how the prompt is tokenized, then the neuron's activation at each token
# (the two printed sequences line up position by position).
print(model.to_str_tokens(default_text))
print(get_neuron_acts(default_text, default_layer, default_neuron_index))

['<|endoftext|>', 'The', ' following', ' is', ' a', ' list', ' of', ' powers', ' of', ' 10', ':', ' 1', ',', ' 10', ',', ' 100', ',', ' 1000', ',', ' 10000', ',', ' 100', '000', ',', ' 100', '0000', ',', ' 100', '00000']
[-0.08643499 -0.14071973 -0.10398154 -0.12390732 -0.04058984 -0.11064889
 -0.05189846 -0.11276118 -0.06905466 -0.11189386 -0.03059199 -0.10336889
 -0.04322361  1.5935549  -0.14205764  2.5116603  -0.1331642   2.5196698
 -0.11360838  3.0765214  -0.1163745   0.53938794  2.3499637  -0.14952153
 -0.16476357  1.9449059  -0.13690163 -0.0880248   2.1848853 ]


In [5]:
# Switch the working prompt and print its tokenization; the position of a word
# in this list is the index later passed to word_of_interest.
default_text = 'i will come and kill all of you'
print(model.to_str_tokens(default_text))

['<|endoftext|>', 'i', ' will', ' come', ' and', ' kill', ' all', ' of', ' you']


In [23]:
# Activations of layer 9 / neuron 652 for whatever `default_text` currently
# holds; the result is stored in `temp` and not displayed.
default_layer = 9
default_neuron_index = 652
temp = get_neuron_acts(default_text, default_layer, default_neuron_index)

In [21]:
# Print the tokenization to find the position of the word of interest
# (index 0 is the prepended '<|endoftext|>' token).
print(model.to_str_tokens('She promised to come and kill the boredom'))

['<|endoftext|>', 'She', ' promised', ' to', ' come', ' and', ' kill', ' the', ' boredom']


In [22]:
sentences = ['i will come and kill all of you', 'who will you kill here an now today', 'I must kill the urge to come back.', 'I would kill to be in your shoes', "Don't let them come and kill your spirit", 'She promised to come and kill the boredom']

default_text = "She promised to come and kill the boredom"

# Position of ' kill' in model.to_str_tokens(default_text); index 0 is the
# prepended '<|endoftext|>' token.
kill_token_index = 6

# act5[layer] is a list holding, for every neuron of that layer, its activation
# at the ' kill' token of default_text.
act5 = []

for i in range(model.cfg.n_layers):
    default_layer = i
    # One forward pass per layer instead of one per neuron: passing
    # slice(None) as the neuron index makes get_neuron_acts return the whole
    # layer, indexed [token, neuron].
    temp = get_neuron_acts(default_text, default_layer, slice(None))
    act5.append(list(temp[kill_token_index]))
    print(f'layer {i} done')

layer 0 done
layer 1 done
layer 2 done
layer 3 done
layer 4 done
layer 5 done
layer 6 done
layer 7 done
layer 8 done
layer 9 done
layer 10 done
layer 11 done


In [140]:
default_text = 'i will come and kill all of you'

# demo0[layer] holds that layer's activations for default_text, indexed
# [neuron, token] once the leading length-1 axis has been squeezed away.
demo0 = []

# NOTE(review): later cells read `t`, but this commented-out call is the only
# assignment to `t` visible in the notebook.
# t = get_neuron_acts_layer(default_text, 0)

# 12 is hard-coded here; presumably the number of blocks in gpt2-small — TODO confirm.
for i in range(12):
    default_layer = i
    temp = get_neuron_acts_layer(default_text, default_layer)
    temp = temp.squeeze()
    demo0.append(temp)
    print(f'layer {i} done')

layer 0 done
layer 1 done
layer 2 done
layer 3 done
layer 4 done
layer 5 done
layer 6 done
layer 7 done
layer 8 done
layer 9 done
layer 10 done
layer 11 done


In [117]:
# NOTE(review): `t` is not assigned in any active cell visible in this
# notebook, so this line fails on a fresh kernel unless `t` is created first.
t = t.squeeze()

In [116]:
# Inspect the container type of `t` (`t` itself is not assigned in any active
# cell visible in this notebook).
type(t)

numpy.ndarray

In [118]:
# Inspect the dimensions of `t` (`t` itself is not assigned in any active cell
# visible in this notebook).
t.shape

(3072, 9)

In [152]:
# Writing activation in a text file
#----------uncomment to write the activations-------------------------#
# F = open('kill5.txt', 'w')

# for sublist in act5:
#     F.write(",".join([str(element) for element in sublist]) + "\n"+"\n"+"\n")

# F.close()

# --------------- Reading from file-----------------------------------#
# F = open('kill1.txt', 'r')

# data = F.read()

# data1 = list(data.split('\n'+'\n'+'\n'))

# # Splitting the read file
# act1 = []

# for i in range(len(data1)-1):
#     act1.append(list(map(float,(data1[i].split(',')))))

In [144]:
# Keep the activations that are greater than or equal to a threshold; every other entry becomes 0
def light_up(act, thres = 1.0):
    """Zero out every activation that is below `thres`.

    Args:
        act: 2-D nested sequence of activations (rows of values).
        thres: Entries ``>= thres`` are kept unchanged; all others are
            replaced by the integer 0.

    Returns:
        A list of lists.  Every output row has ``len(act[0])`` entries, i.e.
        the width of the first row is used for all rows.
    """
    def keep_or_zero(value):
        return value if value >= thres else int(0)

    return [
        [keep_or_zero(row[col]) for col in range(len(act[0]))]
        for row in act
    ]

def Neuron_index(activated_neurons):
    """Locate the non-zero entries of `activated_neurons`.

    Returns the tuple produced by ``np.nonzero``: one index array per
    dimension, so for 2-D input it is ``(row_indices, column_indices)``.
    """
    nonzero_positions = np.nonzero(activated_neurons)
    return nonzero_positions

def word_of_interest(act, index):
    """Pick out, for every neuron, the activation at one token position.

    Args:
        act: Nested sequence indexed as ``act[layer][neuron][token]``.
        index: Token position to extract.

    Returns:
        A list of lists where ``result[layer][neuron]`` equals
        ``act[layer][neuron][index]``.  The neuron count of the first layer
        (``len(act[0])``) is used for every layer.
    """
    return [
        [layer_acts[neuron][index] for neuron in range(len(act[0]))]
        for layer_acts in act
    ]


In [148]:
# 5 is the position of ' kill' in the tokenization of
# 'i will come and kill all of you' printed earlier (index 0 is '<|endoftext|>').
demo0_woi = word_of_interest(demo0, 5)

In [149]:
# Zero out every activation below 3.0, then list the positions that remain
# non-zero.  The printed tuple is (layer indices, neuron indices).
demo0_activated = light_up(demo0_woi, thres=3.0)
demo0_nonzero = Neuron_index(demo0_activated)
print(demo0_nonzero)

(array([ 0,  0,  8,  9, 10, 10, 10, 11], dtype=int64), array([1152, 1528, 1253,  840,   49,  689, 1793, 2910], dtype=int64))


In [None]:
# Collect activations for every sentence: all_act[sentence][layer] is indexed
# [neuron, token] after squeeze().
all_act = []
for i in range(len(sentences)):
    default_text = sentences[i]
    act_sentence = []
    # 12 is hard-coded; presumably the number of blocks in gpt2-small — TODO confirm.
    for j in range(12):
        default_layer = j
        temp = get_neuron_acts_layer(default_text, default_layer)
        temp = temp.squeeze()
        act_sentence.append(temp)
        # print(f'layer {j} done')
    print(f'sentence {i} complete')
    all_act.append(act_sentence)

In [159]:
# Giving index of the word of interest manually.
# Can automate using LIME by choosing word contributing most(k=1) and give its index

# all_act[1] is the second entry of `sentences`; 4 is the manually chosen token
# position (presumably ' kill' in that sentence — verify with model.to_str_tokens).
temp_woi = word_of_interest(all_act[1], 4)
# Same pipeline as for demo0: threshold at 3.0, then report the
# (layer indices, neuron indices) of the surviving activations.
temp_activated = light_up(temp_woi, thres=3.0)
temp_nonzero = Neuron_index(temp_activated)
print(temp_nonzero)

(array([ 0,  0,  8,  9, 10, 10, 10, 10, 11], dtype=int64), array([1152, 1528, 1253,  840,  379,  897, 1793, 2285, 2910], dtype=int64))


In [35]:
# NOTE(review): nonzero, nonzero1 ... nonzero5 are not assigned in any cell
# visible in this notebook; they appear to come from earlier, since-removed
# runs (presumably one result per entry of `sentences` — confirm).
# Printed order is nonzero, nonzero2, nonzero1, nonzero3, nonzero4, nonzero5.
print(nonzero)
print(nonzero2)
print(nonzero1)
print(nonzero3)
print(nonzero4)
print(nonzero5)

(array([ 0,  0,  8,  9, 10, 10, 10, 11], dtype=int64), array([1152, 1528, 1253,  840,   49,  689, 1793, 2910], dtype=int64))
(array([ 0,  0,  8,  9, 10, 10, 10, 10, 11], dtype=int64), array([1152, 1528, 1253,  840,  379,  897, 1793, 2285, 2910], dtype=int64))
(array([ 0,  0,  2,  8,  9, 10, 10, 10, 11, 11], dtype=int64), array([1152, 1528,  783, 1253,  840,  379,  689, 1793, 1611, 2910],
      dtype=int64))
(array([ 0,  0,  2,  8,  9, 10, 10, 11], dtype=int64), array([1152, 1528,  783, 1253,  840,  379, 1793, 2910], dtype=int64))
(array([ 0,  8,  8,  9,  9, 10, 10, 10, 10, 10, 11, 11], dtype=int64), array([1528, 1253, 1891,  840, 1681,   49,  297,  379,  689, 1793, 1611,
       2910], dtype=int64))
(array([ 0,  0,  8,  9, 10, 10, 11, 11], dtype=int64), array([1152, 1528, 1253,  840,  689, 1793,  611, 2910], dtype=int64))


In [14]:
import matplotlib.pyplot as plt
import seaborn as sns

# Disabled heatmap attempt, kept as a bare string literal so it never runs.
# NOTE(review): `activated_neurons` is not defined in any visible cell, so the
# code inside would need that variable before it could be re-enabled.
''''Not working image size is too large.
# plt.figure(figsize = (12,3072))
sns_plot = sns.heatmap(activated_neurons, annot=True, cmap =sns.cm.rocket_r,linecolor='white', linewidths=1)

results_path = 'results.png'
#print(results_path)
plt.savefig(results_path, dpi=400)'''


In [29]:
# # This is some CSS (tells us what style )to give each token a thin gray border, to make it easy to see token separation
# style_string = """<style> 
#     span.token {
#         border: 1px solid rgb(123, 123, 123)
#         } 
#     </style>"""


# def calculate_color(val, max_val, min_val):
#     # Hacky code that takes in a value val in range [min_val, max_val], normalizes it to [0, 1] and returns a color which interpolates between slightly off-white and red (0 = white, 1 = red)
#     # We return a string of the form "rgb(240, 240, 240)" which is a color CSS knows
#     normalized_val = (val - min_val) / max_val
#     return f"rgb(240, {240*(1-normalized_val)}, {240*(1-normalized_val)})"


# def basic_neuron_vis(text, layer, neuron_index, max_val=None, min_val=None):
#     """
#     text: The text to visualize
#     layer: The layer index
#     neuron_index: The neuron index
#     max_val: The top end of our activation range, defaults to the maximum activation
#     min_val: The bottom end of our activation range, defaults to the minimum activation

#     Returns a string of HTML that displays the text with each token colored according to its activation

#     Note: It's useful to be able to input a fixed max_val and min_val, because otherwise the colors will change as you edit the text, which is annoying.
#     """
#     if layer is None:
#         return "Please select a Layer"
#     if neuron_index is None:
#         return "Please select a Neuron"
#     acts = get_neuron_acts(text, layer, neuron_index)
#     act_max = acts.max()
#     act_min = acts.min()
#     # Defaults to the max and min of the activations
#     if max_val is None:
#         max_val = act_max
#     if min_val is None:
#         min_val = act_min
#     # We want to make a list of HTML strings to concatenate into our final HTML string
#     # We first add the style to make each token element have a nice border
#     htmls = [style_string]
#     # We then add some text to tell us what layer and neuron we're looking at - we're just dealing with strings and can use f-strings as normal
#     # h4 means "small heading"
#     htmls.append(f"<h4>Layer: <b>{layer}</b>. Neuron Index: <b>{neuron_index}</b></h4>")
#     # We then add a line telling us the limits of our range
#     htmls.append(
#         f"<h4>Max Range: <b>{max_val:.4f}</b>. Min Range: <b>{min_val:.4f}</b></h4>"
#     )
#     # If we added a custom range, print a line telling us the range of our activations too.
#     if act_max != max_val or act_min != min_val:
#         htmls.append(
#             f"<h4>Custom Range Set. Max Act: <b>{act_max:.4f}</b>. Min Act: <b>{act_min:.4f}</b></h4>"
#         )
#     # Convert the text to a list of tokens
#     str_tokens = model.to_str_tokens(text)
#     for tok, act in zip(str_tokens, acts):
#         # A span is an HTML element that lets us style a part of a string (and remains on the same line by default)
#         # We set the background color of the span to be the color we calculated from the activation
#         # We set the contents of the span to be the token
#         htmls.append(
#             f"<span class='token' style='background-color:{calculate_color(act, max_val, min_val)}' >{tok}</span>"
#         )

#     return "".join(htmls)



# Disabled demo of basic_neuron_vis, kept as a bare string literal so it never
# runs.  basic_neuron_vis is itself commented out above, so both would have to
# be restored together for this to work.
'''
# The function outputs a string of HTML
default_max_val = 4.0
default_min_val = 0.0
default_html_string = basic_neuron_vis(
    default_text,
    default_layer,
    default_neuron_index,
    max_val=default_max_val,
    min_val=default_min_val,
)

# IPython lets us display HTML
print("Displayed HTML")
display(HTML(default_html_string))

# We can also print the string directly
print("HTML String - it's just raw HTML code!")
print(default_html_string)'''

In [None]:
'''Observations:
            How to tackle polysemantic neurons?
            A particular word can be used in both negative and positive sentences. How do we differentiate between them?
            We are targeting a specific word. What if that word is split into several tokens during tokenization? For example, hunting is divided into the 'hun' and 'ting' tokens.
                '''
