In [6]:
# Import Transformer Lens, and load pythia models
from transformer_lens import HookedTransformer
import torch as th
from torch import nn
device = "cuda" if th.cuda.is_available() else "cpu"
model = HookedTransformer.from_pretrained("EleutherAI/pythia-160m-deduped", device=device)
class Neuron_Max(nn.Module):
    def __init__(self, model: nn.Module, layer: int):
        super().__init__()
        self.model = model
        self.model.requires_grad_(False)
        self.embed_weights = list(list(model.children())[0].parameters())[0]
        transformer_blocks = [mod for mod in list(self.model.children())[2]]
        self.model_no_embed = th.nn.Sequential(*(transformer_blocks[:layer+1])).requires_grad_(False)
        self.model_no_embed.requires_grad_(False)
        self._neurons = th.empty(0)
        def hook(model, input, output):
            self._neurons = output
        self.model.blocks[layer].mlp.hook_pre.register_forward_hook(hook)
        # self.model.blocks[layer].hook_mlp_out.register_forward_hook(hook)

        
    def embedded_forward(self, embedded_x):
        self.model_no_embed(embedded_x)
        return self._neurons

    def forward(self, x):
        self.model(x)       
        return self._neurons
    
    def run_with_cache(self, x):
        return self.model.run_with_cache(x, remove_batch_dim=True)
    
text = "The quick brown fox jumps over the lazy dog"
tokens = model.to_tokens(text)
embedded_tokens = th.nn.Parameter(model.embed(tokens))
embedded_tokens.requires_grad = True
layer = 6
neuron = 3069

hook_model = Neuron_Max(model, layer)
mlp_pre = hook_model.embedded_forward(embedded_tokens)

_, cache = model.run_with_cache(tokens, remove_batch_dim=False)
# mlp_pre_original = cache[f"blocks.{layer}.hook_mlp_out"]
mlp_pre_original = cache[f"blocks.{layer}.mlp.hook_pre"]
# Make sure both are equal
print(th.allclose(mlp_pre, mlp_pre_original))

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m-deduped into HookedTransformer
True


In [7]:
# for neuron in range(100):

# init_text = "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16"
# init_text = " 1 a ; d"
# init_text = ''' Pavel Rovinski
# neuron = 0

# Pavel Apolonovič Rovinski (1831'''
#TODO make work w/ 1 token
layer=6
# neuron = 493
epochs =200

diverse_outputs_num = 10
_, _, embed_size = model.W_out.shape
seq = 4 #TODO make this work more functionally
keep_last_token = True
diverse_outputs = th.zeros(diverse_outputs_num, seq, embed_size)
largest_prompts = [None]*diverse_outputs_num
cos = th.nn.CosineSimilarity(dim=1)
for d_ind in range(diverse_outputs_num):
    print(f"Starting diverse output {d_ind}")
    # init_text = " the injuries England have"
    init_text = " 1 2 3 4"
    init_tokens = model.to_tokens(init_text, prepend_bos=False)
    prompt_embeds = th.nn.Parameter(model.embed(init_tokens)).detach()
    prompt_embeds.requires_grad_(True)

    optim = th.optim.AdamW([prompt_embeds], lr=.8, weight_decay=0.01)
    largest_activation = 0
    largest_prompt = None
    for i in range(epochs):
        # First, project into the embedding matrix
        with th.no_grad():
            projected_index = th.stack([(hook_model.embed_weights@prompt_embeds[0,i,:]).argmax() for i in range(seq)]).unsqueeze(0)
            projected_embeds = model.embed(projected_index)

        # Create a temp embedding that is detached from the graph, but has the same data as the projected embedding
        tmp_embeds = prompt_embeds.detach().clone()
        tmp_embeds.data = projected_embeds.data
        # add some gaussian noise to tmp_embeds
        # tmp_embeds.data += th.randn_like(tmp_embeds.data)*0.01
        tmp_embeds.requires_grad_(True)


        # Then, calculate neuron_output
        neuron_output = hook_model.embedded_forward(tmp_embeds)[0,:, neuron]
        diversity_loss = cos(tmp_embeds[0], diverse_outputs[:d_ind]) #TODO, check if this is correct
        loss = -neuron_output[-1] + diversity_loss.mean()

        # Save the highest activation
        if neuron_output[-1] > largest_activation:
            largest_activation = neuron_output[-1]
            largest_prompt = model.to_string(projected_index)
            largest_prompts[d_ind] = largest_prompt
            print(f"New largest activation: {largest_activation} | {largest_prompt}")

        # Transfer the gradient to the continuous embedding space
        prompt_embeds.grad, = th.autograd.grad(loss, [tmp_embeds])
        
        optim.step()
        optim.zero_grad()
    diverse_outputs[d_ind] = tmp_embeds.data[0,...]

Starting diverse output 0
New largest activation: 0.04066186770796776 | [' 1 2 3 4']
New largest activation: 0.9042648673057556 | ['ahan accuracy 2009  ']
New largest activation: 1.0830827951431274 | [' terrific fuss 83 737']
New largest activation: 1.1185054779052734 | ['ako tod 07 737']
New largest activation: 1.2184388637542725 | ['kéKH 07 balcon']
New largest activation: 1.2394092082977295 | ['kéKH minute balcon']
New largest activation: 1.3135446310043335 | [' osKHzech nä']
New largest activation: 1.3617918491363525 | [' któcad h balcon']
New largest activation: 1.3642041683197021 | ['okratNos h balcon']
New largest activation: 1.4049625396728516 | ['ímNos dé 737']
New largest activation: 1.4831815958023071 | [' wordt viief Fl']
New largest activation: 1.6548222303390503 | [' Cott ti že flaws']
New largest activation: 1.6892642974853516 | [' že vi dib balcon']
New largest activation: 1.7998237609863281 | ['atie tem dib Fl']
New largest activation: 2.021145820617676 | [' kaois č fl

In [9]:
# for neuron in range(100):

# init_text = "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16"
# init_text = " 1 a ; d"
# init_text = ''' Pavel Rovinski
# neuron = 0

# Pavel Apolonovič Rovinski (1831'''
#TODO make work w/ 1 token
layer=6
# neuron = 493
epochs =200

diverse_outputs_num = 10
_, _, embed_size = model.W_out.shape
seq = 3 #TODO make this work more functionally
insert_token = True
diverse_outputs = th.zeros(diverse_outputs_num, seq, embed_size)
largest_prompts = [None]*diverse_outputs_num
cos = th.nn.CosineSimilarity(dim=1)
for d_ind in range(diverse_outputs_num):
    print(f"Starting diverse output {d_ind}")
    # init_text = " the injuries England have"
    init_text = " 1 2 3"
    init_tokens = model.to_tokens(init_text, prepend_bos=False)
    prompt_embeds = th.nn.Parameter(model.embed(init_tokens)).detach()
    prompt_embeds.requires_grad_(True)

    optim = th.optim.AdamW([prompt_embeds], lr=.8, weight_decay=0.01)
    largest_activation = 0
    largest_prompt = None

    iterations_since_last_improvement = 0
    while(iterations_since_last_improvement < 30):
    # First, project into the embedding matrix
        with th.no_grad():
            projected_index = th.stack([cos(hook_model.embed_weights,prompt_embeds[0,i,:]).argmax() for i in range(seq)]).unsqueeze(0)
            projected_embeds = model.embed(projected_index)

        # Create a temp embedding that is detached from the graph, but has the same data as the projected embedding
        tmp_embeds = prompt_embeds.detach().clone()
        tmp_embeds.data = projected_embeds.data
        # add some gaussian noise to tmp_embeds
        # tmp_embeds.data += th.randn_like(tmp_embeds.data)*0.01
        tmp_embeds.requires_grad_(True)

        if insert_token:
            text = " p.m"
            token = model.to_tokens(text, prepend_bos=False)
            token_embeds = model.embed(token)
            token_pos = seq
            wrapped_embeds = th.cat([tmp_embeds[0,:token_pos], token_embeds[0], tmp_embeds[0,token_pos:]], dim=0).unsqueeze(0)
        else:
            wrapped_embeds = tmp_embeds

        # Then, calculate neuron_output
        neuron_output = hook_model.embedded_forward(wrapped_embeds)[0,:, neuron]
        diversity_loss = cos(tmp_embeds[0], diverse_outputs[:d_ind]) #TODO, check if this is correct
        loss = -neuron_output[-1] + diversity_loss.mean()

        # Save the highest activation
        if neuron_output[-1] > largest_activation:
            iterations_since_last_improvement = 0
            largest_activation = neuron_output[-1]
            wrapped_embeds_seq_len = wrapped_embeds.shape[1]
            projected_index = th.stack([cos(hook_model.embed_weights,wrapped_embeds[0,i,:]).argmax() for i in range(wrapped_embeds_seq_len)]).unsqueeze(0)
            largest_prompt = model.to_string(projected_index)
            largest_prompts[d_ind] = largest_prompt
            print(f"New largest activation: {largest_activation} | {largest_prompt}")

        # Transfer the gradient to the continuous embedding space
        prompt_embeds.grad, = th.autograd.grad(loss, [tmp_embeds])
        
        optim.step()
        optim.zero_grad()
    diverse_outputs[d_ind] = tmp_embeds.data[0,...]

Starting diverse output 0
New largest activation: 1.8464672565460205 | [' 1 2 3 p.m']
New largest activation: 1.9286843538284302 | [' Companies\t\t\t\t\t\t\t\t dusk p.m']
New largest activation: 2.2665884494781494 | [' CompaniesOOGLEinety p.m']
New largest activation: 2.324343681335449 | [' formations aboard decade p.m']
New largest activation: 2.385915756225586 | [' evacuation flaresinety p.m']
New largest activation: 2.3997743129730225 | [' evacuation kilomet Forty p.m']
New largest activation: 2.439774751663208 | [' aircraft kilomet 2030 p.m']
New largest activation: 2.6292884349823 | [' march kilomet 900 p.m']
New largest activation: 2.658616542816162 | [' march kilomet 800 p.m']
New largest activation: 2.7313034534454346 | [' trainsieurs eighty p.m']
New largest activation: 2.745493173599243 | [' hikingieurs eighty p.m']
New largest activation: 2.7665293216705322 | [' hikeieurs eighty p.m']
New largest activation: 2.912780523300171 | [' hike\n \n096 p.m']
Starting diverse output 1

In [46]:
layer=6

def prompt_optimization(
        model, 
        neuron, 
        diverse_outputs_num=10, 
        iteration_cap_until_convergence = 30,
        init_text = None,
        seq_size = 4,
        insert_words_and_pos = None, #List of words and positions to insert [word, pos]
        neuron_loss_scalar = 1,
        diversity_loss_scalar = 1,
    ):
    _, _, embed_size = model.W_out.shape
    vocab_size = model.W_E.shape[0]
    largest_prompts = [None]*diverse_outputs_num
    cos = th.nn.CosineSimilarity(dim=1)
    total_iterations = 0

    if init_text is not None:
        init_tokens = model.to_tokens(init_text, prepend_bos=False)
        seq_size = init_tokens.shape[-1]
    diverse_outputs = th.zeros(diverse_outputs_num, seq_size, embed_size)
    for d_ind in range(diverse_outputs_num):
        print(f"Starting diverse output {d_ind}")
        if init_text is None:
            # Random tokens of sequence length
            init_tokens = th.randint(0, vocab_size, (1,seq_size))
            init_text = model.to_string(init_tokens)
        prompt_embeds = th.nn.Parameter(model.embed(init_tokens)).detach()
        prompt_embeds.requires_grad_(True)

        optim = th.optim.AdamW([prompt_embeds], lr=.8, weight_decay=0.01)
        largest_activation = 0
        largest_prompt = None

        iterations_since_last_improvement = 0
        while(iterations_since_last_improvement < iteration_cap_until_convergence):
        # First, project into the embedding matrix
            with th.no_grad():
                projected_index = th.stack([cos(hook_model.embed_weights,prompt_embeds[0,i,:]).argmax() for i in range(seq_size)]).unsqueeze(0)
                projected_embeds = model.embed(projected_index)

            # Create a temp embedding that is detached from the graph, but has the same data as the projected embedding
            tmp_embeds = prompt_embeds.detach().clone()
            tmp_embeds.data = projected_embeds.data
            # add some gaussian noise to tmp_embeds
            # tmp_embeds.data += th.randn_like(tmp_embeds.data)*0.01
            tmp_embeds.requires_grad_(True)

            if insert_words_and_pos is not None:
                text = insert_words_and_pos[0]
                pos = insert_words_and_pos[1]
                if(pos == -1):
                    pos = seq_size
                token = model.to_tokens(text, prepend_bos=False)
                token_embeds = model.embed(token)
                token_pos = pos
                wrapped_embeds = th.cat([tmp_embeds[0,:token_pos], token_embeds[0], tmp_embeds[0,token_pos:]], dim=0).unsqueeze(0)
                if(total_iterations == 0):
                    wrapped_embeds_seq_len = wrapped_embeds.shape[1]
                    projected_index = th.stack([cos(hook_model.embed_weights,wrapped_embeds[0,i,:]).argmax() for i in range(wrapped_embeds_seq_len)]).unsqueeze(0)
                    print(f"Inserting {text} at pos {pos}: {model.to_str_tokens(projected_index, prepend_bos=False)}")
            else:
                wrapped_embeds = tmp_embeds

            # Then, calculate neuron_output
            neuron_output = hook_model.embedded_forward(wrapped_embeds)[0,:, neuron]
            diversity_loss = cos(tmp_embeds[0], diverse_outputs[:d_ind])
            loss = neuron_loss_scalar*-neuron_output[-1] + diversity_loss_scalar*diversity_loss.mean()

            # Save the highest activation
            if neuron_output[-1] > largest_activation:
                iterations_since_last_improvement = 0
                largest_activation = neuron_output[-1]
                wrapped_embeds_seq_len = wrapped_embeds.shape[1]
                projected_index = th.stack([cos(hook_model.embed_weights,wrapped_embeds[0,i,:]).argmax() for i in range(wrapped_embeds_seq_len)]).unsqueeze(0)
                largest_prompt = model.to_string(projected_index)
                largest_prompts[d_ind] = largest_prompt
                print(f"New largest activation: {largest_activation} | {largest_prompt}")

            # Transfer the gradient to the continuous embedding space
            prompt_embeds.grad, = th.autograd.grad(loss, [tmp_embeds])
            
            optim.step()
            optim.zero_grad()
            total_iterations += 1
            iterations_since_last_improvement += 1
        diverse_outputs[d_ind] = tmp_embeds.data[0,...]
    return largest_prompts

In [51]:
neuron = 3069
prompt_optimization(
    model, 
    neuron=neuron, 
    diverse_outputs_num=10, 
    iteration_cap_until_convergence = 10, 
    init_text = "in a cake-and-a-", 
    insert_words_and_pos = ["half", -1], 
    neuron_loss_scalar = 1, 
    diversity_loss_scalar = 1
)

Starting diverse output 0
Inserting half at pos 8: ['in', ' a', ' cake', '-', 'and', '-', 'a', '-', 'half']
New largest activation: 0.879313051700592 | ['in a cake-and-a-half']
New largest activation: 0.9168182611465454 | [' myös Dawn spraywashurelandid Nem Havhalf']
New largest activation: 1.0480660200119019 | [' myös Dawn sprayots gh guerraAz penghalf']
New largest activation: 1.0770515203475952 | [' myösoga sprayots gh guerraAz penghalf']
New largest activation: 1.1037309169769287 | [' toimoga sprayrina gh guerraAz penghalf']
New largest activation: 1.1693141460418701 | [' któioxid sprayrina gh guerraAz hemodhalf']
New largest activation: 1.2085641622543335 | [' któioxidajurina gh guerraAz hemodhalf']
New largest activation: 1.3823320865631104 | ['ī Leonio Josapo selectivitynk Hohalf']
New largest activation: 1.6561024188995361 | [' tä Leon tightening tribapoavaonas behhalf']
New largest activation: 1.67436683177948 | [' tä LeonGil tribapoavank behhalf']
New largest activation: 2.00

[['ų NyOs salavaľ mu behhalf'],
 [' denote Frontier Pairresolve Okandid sea hemodhalf'],
 [' Consider Wilder 72oto Jak maink hemodhalf'],
 ['in a cake-and-a-half'],
 [' following FrontierliSpotraf dispensing sea Havhalf'],
 ['in a cake-and-a-half'],
 [' Softwarechus”),konihadannk fachalf'],
 [' 52 ammonia secondseti kaadan ih dahalf'],
 ['in a cake-and-a-half'],
 [' gestскosto Antonanni guerraaan dahalf']]

In [29]:
model.to_tokens("hey there", prepend_bos=False).shape[-1]

2

In [55]:
w1 = " and"
w2 = " AND"
e1 = model.embed(model.to_tokens(w1, prepend_bos=False))[0]
e2 = model.embed(model.to_tokens(w2, prepend_bos=False))[0]
cos_sim = cos(e1, e2)
print(cos_sim)
closest_unembed = cos(hook_model.embed_weights,e2[0,:]).argmax()
print(model.to_string(closest_unembed))

tensor([0.5139])
 AND


In [40]:
tmp_embeds[0,4:].shape

torch.Size([0, 768])

In [141]:
# for neuron in range(100):

# init_text = "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16"
# init_text = " 1 a ; d"
# init_text = ''' Pavel Rovinski
# neuron = 0

# Pavel Apolonovič Rovinski (1831'''
#TODO make work w/ 1 token
iterative_initialization = True
layer=6
neuron = 493
# neuron = 492
epochs = 200
diverse_outputs_num = 10
_, _, embed_size = model.W_out.shape
seq = 4 #TODO make this work more functionally
diverse_outputs = th.zeros(diverse_outputs_num, seq, embed_size)
largest_prompts = [None]*diverse_outputs_num
cos = th.nn.CosineSimilarity(dim=1)
for d_ind in range(diverse_outputs_num):
    print(f"Starting diverse output {d_ind}")
    # init_text = " the injuries England have"
    # Random token
    if(d_ind == 0 or not iterative_initialization):
        init_tokens = th.randint(0, model.W_E.shape[0], (1, seq))
    else:
        init_tokens = largest_prompts[d_ind-1]
    prompt_embeds = th.nn.Parameter(model.embed(init_tokens)).detach()
    prompt_embeds.requires_grad_(True)

    optim = th.optim.AdamW([prompt_embeds], lr=.5, weight_decay=0.01)
    largest_activation = 0
    largest_prompt = None

    iterations_since_last_improvement = 0
    while(iterations_since_last_improvement < 30):
        # First, project into the embedding matrix
        with th.no_grad():
            projected_index = th.stack([(hook_model.embed_weights@prompt_embeds[0,i,:]).argmax() for i in range(seq)]).unsqueeze(0)
            projected_embeds = model.embed(projected_index)

        # Create a temp embedding that is detached from the graph, but has the same data as the projected embedding
        tmp_embeds = prompt_embeds.detach().clone()
        tmp_embeds.data = projected_embeds.data
        # add some gaussian noise to tmp_embeds
        tmp_embeds.data += th.randn_like(tmp_embeds.data)*0.005
        tmp_embeds.requires_grad_(True)


        # Then, calculate neuron_output
        neuron_output = hook_model.embedded_forward(tmp_embeds)[0,:, neuron]
        diversity_loss = cos(tmp_embeds[0], diverse_outputs[:d_ind]) #TODO, check if this is correct
        loss = -neuron_output[-1] + diversity_loss.mean()

        iterations_since_last_improvement += 1
        # Save the highest activation
        if neuron_output[-1] > largest_activation:
            iterations_since_last_improvement = 0
            largest_activation = neuron_output[-1]
            largest_prompts[d_ind] = projected_index
            print(f"New largest activation: {largest_activation} | {model.to_string(projected_index)}")

        # Transfer the gradient to the continuous embedding space
        prompt_embeds.grad, = th.autograd.grad(loss, [tmp_embeds])
        
        optim.step()
        optim.zero_grad()
    diverse_outputs[d_ind] = tmp_embeds.data[0,...]

Starting diverse output 0
New largest activation: 0.7703665494918823 | [' pal/,compass Plymouth']
New largest activation: 1.6871099472045898 | [' pal/,things Graham']
New largest activation: 1.7203842401504517 | [' pal??Things moss']
New largest activation: 2.0110278129577637 | [' pal.?Things moss']
New largest activation: 2.4123072624206543 | [' synthes////////////////Things Payne']
New largest activation: 2.856362819671631 | [' synthes\xa0\xa0things Hollywood']
New largest activation: 3.1373682022094727 | [' synthes\xa0\xa0things Payne']
New largest activation: 3.415623188018799 | [' Marcus.?things Clark']
New largest activation: 3.474719524383545 | [' Marcus.?things Clark']
New largest activation: 3.5846543312072754 | [' approach.?things Clark']
Starting diverse output 1
New largest activation: 3.281342029571533 | [' approach.?things Clark']
New largest activation: 3.6527199745178223 | [' What kits Gonzalez Hogan']
Starting diverse output 2
New largest activation: 4.141073226928711 

In [129]:
model.W_E.shape[0]

torch.Size([50304, 768])

In [108]:
largest_prompts

[[' approach elic mistakes Canadians'],
 [' categories restrict strategies Canadians'],
 [' culturally].)things bankers'],
 [' assessments**). mistakes consumers'],
 [' What rhet myths Davis'],
 [' concrete (“ skills Robertson'],
 [' comprom une actors shoppers'],
 [' Target impractical mistakes Canadians'],
 [' detailing mistakes Walmart citizens'],
 [' What limitations practices consumers']]

In [91]:
cos = th.nn.CosineSimilarity(dim=2)
t = tmp_embeds[0]
cos(t, th.stack((-t, -3*t)))

tensor([[-1.0000, -1.0000, -1.0000, -1.0000],
        [-1.0000, -1.0000, -1.0000, -1.0000]], grad_fn=<SumBackward1>)

In [92]:
th.stack((tmp_embeds[0], tmp_embeds[0])).shape

torch.Size([2, 4, 768])

In [57]:
# Combine the empty original, with the new prompt a
original = th.cat(0)
a = prompt_embeds[0,0,:]
th.stack((a, original)).shape

In [40]:
from torch.nn import CrossEntropyLoss
ce = CrossEntropyLoss()
ce(th.tensor([[10,0,0,99]]).float(), th.tensor([0]))

tensor(0.0001)

In [46]:
th.tensor([[10,0,0,99]]).topk(7)

In [57]:
# for neuron in range(100):

# init_text = "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16"
init_text = " the injuries England have"
# init_text = " 1 a ;"
# init_text = ''' Pavel Rovinski
layer=6
neuron = 493

# Pavel Apolonovič Rovinski (1831'''
#TODO make work w/ 1 token
init_tokens = model.to_tokens(init_text, prepend_bos=False)
prompt_embeds = th.nn.Parameter(model.embed(init_tokens)).detach()
prompt_embeds.requires_grad_(True)

_, seq, _ = prompt_embeds.shape
neuron = 1

# input_optimizer = torch.optim.AdamW([prompt_embeds], lr=lr, weight_decay=weight_decay)
# optim = th.optim.SGD([prompt_embeds], lr=0.2)
optim = th.optim.AdamW([prompt_embeds], lr=.1, weight_decay=0.01)
epochs = 201
for i in range(epochs):
    # First, project into the embedding matrix
    with th.no_grad():
        projected_index = th.stack([(hook_model.embed_weights@prompt_embeds[0,i,:]).argmax() for i in range(seq)]).unsqueeze(0)
        projected_embeds = model.embed(projected_index)

    # Create a temp embedding that is detached from the graph, but has the same data as the projected embedding
    tmp_embeds = prompt_embeds.detach().clone()
    tmp_embeds.data = projected_embeds.data
    # add some gaussian noise to tmp_embeds
    # tmp_embeds.data += th.randn_like(tmp_embeds.data)*0.2
    tmp_embeds.requires_grad_(True)


    # Then, calculate neuron_output
    neuron_output = hook_model.embedded_forward(tmp_embeds)[0,:, neuron]
    loss = -neuron_output.max()

    # Transfer the gradient to the continuous embedding space
    prompt_embeds.grad, = th.autograd.grad(loss, [tmp_embeds])
    
    optim.step()
    optim.zero_grad()


    # neuron_output = hook_model.embedded_forward(embedded_tokens)[0,:, neuron].mean()

    # embed_weights_norm = hook_model.embed_weights / hook_model.embed_weights.norm(dim=1).unsqueeze(1)
    # token_embed_sizes = embedded_tokens[0,:,:].norm(dim=1)
    # distance = th.stack([(1-(embed_weights_norm@(embedded_tokens[0,i,:] / token_embed_sizes[i]))).min() for i in range(seq)])
    # # distance = th.stack([(1-hook_model.embed_weights@embedded_tokens[0,i,:]).min() for i in range(seq)])
    # # Distance from the embedding matrix
    # # dist = th.norm(hook_model.embed_weights@(embedded_tokens[0,:,:]).T - embedded_tokens[0,:,:].T, dim=1).mean()
    
    # loss = -neuron_output 
    # # loss = -neuron_output + distance.mean()*10
    # # loss = distance.mean()*10
    # loss.backward()
    # optim.step()
    # optim.zero_grad()
    if i % 5 == 0:
        with th.no_grad():
            # Find the maximum similarity between each embedded_token, and the embedding matrix
            # Picking that token's embedding is equivalent to projecting onto the closest vector in the embedding matrix
            new_tokens = th.stack([(hook_model.embed_weights@prompt_embeds[0,i,:]).argmax() for i in range(seq)]).unsqueeze(0)
            discrete_neuron_output = hook_model.embedded_forward(model.embed(new_tokens))[0,:, neuron]
            # print(f"Neuron {neuron}: Embed: {neuron_output.mean()} Discrete {discrete_neuron_output.mean()} Distance {distance.mean().item()}| Tokens: {model.to_string(new_tokens)}")
            print(f"Neuron {neuron}: Embed: {neuron_output.max()} Discrete {discrete_neuron_output.max()} Distance N/A| Tokens: {model.to_string(new_tokens)}")

Neuron 1: Embed: 0.47031161189079285 Discrete 0.47031161189079285 Distance N/A| Tokens: [' Pavel Rovinski\n   \nPavel Apolonovič Rovinski (1831']
Neuron 1: Embed: 0.5555412769317627 Discrete 0.5666844248771667 Distance N/A| Tokens: [' eruption Pale Cris L reproductiveगariantgeryanch rot retain troublesome Ninaek Rovinski (1831']
Neuron 1: Embed: 0.5436252355575562 Discrete 0.5354958176612854 Distance N/A| Tokens: [' frightened Pale Growth L accidentalumina scler recurrentarkpit diligence troublesome trivум Pegovinski [\\1831']
Neuron 1: Embed: 0.9181833863258362 Discrete 0.9452750086784363 Distance N/A| Tokens: [' frightened candle catastrophe-------------------------------------- accidentalumin sclergeryCLpit 700 pad trivум ApplicationC Riv residency1831']
Neuron 1: Embed: 0.5941134691238403 Discrete 0.51107257604599 Distance N/A| Tokens: [' happiness ashes disturbances lacterialорmonary pkgCLcycl 700 pad necklaceş ApplicationC Riv residency1831']
Neuron 1: Embed: 0.6064326167106628 D

In [10]:
init_tokens

tensor([[ 253, 9478, 5854,  452]])

In [30]:
print(tmp_embeds.data.shape)
print(prompt_embeds.data.shape)
print(loss)
embedded_tokens.grad, = th.autograd.grad(loss, [tmp_embeds])


torch.Size([1, 6, 768])
torch.Size([1, 6, 768])
tensor(0.0019, grad_fn=<NegBackward0>)


In [14]:
(hook_model.embed_weights@prompt_embeds[0,i,:]).shape

torch.Size([50304])

In [32]:
# for neuron in range(100):
neuron=20
init_text = "the quick brown 1 2 3 4 5 7 8 9 0 - - -"
init_tokens = model.to_tokens(init_text, prepend_bos=False)
embedded_tokens = th.nn.Parameter(model.embed(init_tokens))
iters = 200
for i in range(iters+1):
    neuron_output = hook_model.embedded_forward(embedded_tokens)[0,:, neuron].mean()
    optim = th.optim.Adam([embedded_tokens], lr=0.1)
    loss = -neuron_output
    loss.backward()
    optim.step()
    optim.zero_grad()
    if i % 1 == 0:
        _, seq, _ = embedded_tokens.shape
        # Find the maximum similarity between each embedded_token, and the embedding matrix
        # Picking that token's embedding is equivalent to projecting onto the closest vector in the embedding matrix
        new_tokens = th.stack([(hook_model.embed_weights@embedded_tokens[0,i,:]).argmax() for i in range(seq)]).unsqueeze(0)
        with th.no_grad():
            embedded_tokens.copy_(model.embed(new_tokens))
        discrete_neuron_output = hook_model.embedded_forward(embedded_tokens)[0,:, neuron].mean()
        print(f"Neuron {neuron}: Embed: {neuron_output} Discrete {discrete_neuron_output}| Tokens: {model.to_string(new_tokens)}")

Neuron 20: Embed: -0.21370284259319305 Discrete -0.21370284259319305| Tokens: ['the quick brown 1 2 3 4 5 7 8 9 0 - - -']
Neuron 20: Embed: -0.21370284259319305 Discrete -0.21370284259319305| Tokens: ['the quick brown 1 2 3 4 5 7 8 9 0 - - -']
Neuron 20: Embed: -0.21370284259319305 Discrete -0.21370284259319305| Tokens: ['the quick brown 1 2 3 4 5 7 8 9 0 - - -']
Neuron 20: Embed: -0.21370284259319305 Discrete -0.21370284259319305| Tokens: ['the quick brown 1 2 3 4 5 7 8 9 0 - - -']
Neuron 20: Embed: -0.21370284259319305 Discrete -0.21370284259319305| Tokens: ['the quick brown 1 2 3 4 5 7 8 9 0 - - -']
Neuron 20: Embed: -0.21370284259319305 Discrete -0.21370284259319305| Tokens: ['the quick brown 1 2 3 4 5 7 8 9 0 - - -']
Neuron 20: Embed: -0.21370284259319305 Discrete -0.21370284259319305| Tokens: ['the quick brown 1 2 3 4 5 7 8 9 0 - - -']
Neuron 20: Embed: -0.21370284259319305 Discrete -0.21370284259319305| Tokens: ['the quick brown 1 2 3 4 5 7 8 9 0 - - -']
Neuron 20: Embed: -0.213

In [1]:
# for neuron in range(100):

init_text = "1"
init_tokens = model.to_tokens(init_text, prepend_bos=False)
embedded_tokens = th.nn.Parameter(model.embed(init_tokens))
_, seq, _ = embedded_tokens.shape


optim = th.optim.SGD([embedded_tokens], lr=1.2)
epochs = 201
for i in range(epochs):
    neuron_output = hook_model.embedded_forward(embedded_tokens)[0,:, neuron].mean()

    embed_weights_norm = hook_model.embed_weights / hook_model.embed_weights.norm(dim=1).unsqueeze(1)
    token_embed_sizes = embedded_tokens[0,:,:].norm(dim=1)
    distance = th.stack([(1-(embed_weights_norm@(embedded_tokens[0,i,:] / token_embed_sizes[i]))).min() for i in range(seq)])
    # distance = th.stack([(1-hook_model.embed_weights@embedded_tokens[0,i,:]).min() for i in range(seq)])
    # Distance from the embedding matrix
    # dist = th.norm(hook_model.embed_weights@(embedded_tokens[0,:,:]).T - embedded_tokens[0,:,:].T, dim=1).mean()
    
    loss = -neuron_output 
    # loss = -neuron_output + distance.mean()*10
    # loss = distance.mean()*10
    loss.backward()
    optim.step()
    optim.zero_grad()
    if i % 5 == 0:
        # Find the maximum similarity between each embedded_token, and the embedding matrix
        # Picking that token's embedding is equivalent to projecting onto the closest vector in the embedding matrix
        new_tokens = th.stack([(hook_model.embed_weights@embedded_tokens[0,i,:]).argmax() for i in range(seq)]).unsqueeze(0)
        discrete_neuron_output = hook_model.embedded_forward(model.embed(new_tokens))[0,:, neuron].mean()
        print(f"Neuron {neuron}: Embed: {neuron_output} Discrete {discrete_neuron_output} Distance {distance.mean().item()}| Tokens: {model.to_string(new_tokens)}")

NameError: name 'model' is not defined

In [23]:
neuron = hook_model(new_tokens)
neuron[0,:, 0].mean()

tensor(-0.1070, grad_fn=<MeanBackward0>)

In [157]:
# hook_model.embed_weights[33348,:]
embedded_tokens[0,0,:]

tensor([-1.0954e+01, -1.8866e+01,  2.2489e+00, -1.7445e+01,  3.9778e+01,
         3.5294e-01, -2.3105e+01, -6.6568e+00,  1.4689e+01,  3.8227e+00,
         1.0523e+02, -1.0434e+01, -2.2643e+01, -1.7965e+01,  1.3834e+01,
         9.6289e-02,  1.2760e+01,  6.6267e+00,  5.0471e+00,  1.7947e+01,
        -9.4464e+00, -1.5378e+01,  7.7645e+00, -1.9317e+01,  4.5725e-01,
         3.5412e+01,  5.6189e+00, -2.9805e+01,  2.7511e+01,  3.2154e+01,
        -8.2566e+00, -1.5262e+01,  2.4438e+01,  1.9494e+01,  9.8379e+00,
         1.4377e+01, -1.8982e+01,  8.1687e+00, -3.9900e+01, -1.4327e+01,
         2.0315e+01,  1.3450e+00,  2.6656e+01,  1.6861e+01,  7.9241e-01,
        -5.9983e-01,  5.8152e+00,  4.4811e+00, -4.0319e+00, -2.1985e+01,
         4.5013e+00, -1.5701e+01,  1.6422e+01,  2.5177e+01, -8.7880e+00,
        -4.9935e+00, -2.0194e+01,  1.5267e+01, -2.3336e+01,  6.4996e+00,
        -2.6224e+01, -1.3853e+01, -2.7448e+01,  8.6077e+00, -2.4899e+00,
         3.0491e+01, -2.8046e+00, -1.2375e+01,  3.4

In [112]:
# Generate two random 758 dimensional vectors
# find their cosine similarity
random_vec1 = th.randn(758)
random_vec2 = th.randn(758)
cosine_similarity = th.nn.CosineSimilarity(dim=0)
1-abs(cosine_similarity(random_vec1, random_vec2))

tensor(0.9203)

In [71]:
th.norm(hook_model.embed_weights@embedded_tokens[0,i,:]).min()

tensor(7.0810, grad_fn=<MinBackward1>)

In [70]:
ew = hook_model.embed_weights
e_norm = ew /ew.norm(dim=1).unsqueeze(1)
t_size = embedded_tokens[0,:,:].norm(dim=1)
t_norm = embedded_tokens[0,0,:] / embedded_tokens[0,0,:].norm()
abs((e_norm@t_norm.T)).min()

tensor(3.6949e-06, grad_fn=<MinBackward1>)

In [75]:
s = ew[1]
s_norm = s / s.norm()
s_norm[:10]

tensor([ 0.0581, -0.0240,  0.0173,  0.0930, -0.0442,  0.0115,  0.0262, -0.0446,
         0.0297,  0.0895], grad_fn=<SliceBackward0>)

In [78]:
t_norm = embedded_tokens[0,:,:].norm(dim=1)
print(t_norm[:])
print(embedded_tokens[0,1,:].norm())

tensor([0.8182, 0.9675, 0.9761, 0.9691, 0.9824, 0.8661, 0.4779, 0.9808, 0.9621],
       grad_fn=<SliceBackward0>)
tensor(0.9675, grad_fn=<NormBackward1>)


In [79]:
new_tokens = th.tensor[[(hook_model.embed_weights@embedded_tokens[0,i,:]).argmax().item() for i in range(seq)]]

In [98]:
text = "hey there partner"
tokens = model.to_tokens(text, prepend_bos=False)
text_again = model.to_string(tokens)
print(text_again)

['hey there partner']


In [113]:
model.gpt_neox.layers[0].mlp.dense_h_to_4h.register_forward_hook(lambda m, i, o: print(o[0]))

<torch.utils.hooks.RemovableHandle at 0x16d348e1430>

In [97]:
model.to_tokens.__code__.co_varnames

('self', 'input', 'prepend_bos', 'move_to_device', 'truncate', 'tokens')

In [100]:
t = T(model)
text = "The quick brown fox jumps over the lazy dog"
tokens = tokenizer(text, return_tensors="pt").input_ids.to(device)


tensor(True)

In [117]:
text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
tokens = model.to_tokens(text)
_, cache = model.run_with_cache(tokens, remove_batch_dim=True)
cache.keys()

dict_keys(['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q', 'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q', 'blocks.1.attn.hook_rot_k', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out', 'blocks.1.hook_resid_

In [22]:
model.embed(tokens).shape

torch.Size([1, 32, 768])

In [20]:
layer = 5
neuron = 0
model.requires_grad = False

embedded_tokens = th.nn.Parameter(model.embed(tokens))
embedded_tokens.requires_grad = True

# Run the model
_, cache = model.run_with_cache(embedded_tokens, remove_batch_dim=True)

# Get the neuron's max value
cache[f'blocks.{layer}.mlp.hook_post'][0,neuron].max()

In [164]:
# import Gelu from Functional
from torch.nn.functional import gelu
init_ln = model.blocks[0].ln1(embedded_tokens)
attn =  model.blocks[0].attn(init_ln)
mlp = model.blocks[0].mlp(init_ln)
serial_attn_then_mlp = model.blocks[0].mlp(attn)
add_attn_mlp = attn + mlp
actual = model.blocks[0](embedded_tokens)
ln = model.blocks[0].ln2(add_attn_mlp)
# Check that the serial_attn_then_mlp is the same as the actual
print((th.abs(serial_attn_then_mlp - actual) < 1e-5).all())
# Check that the add_attn_mlp is the same as the actual
print((th.abs(add_attn_mlp - actual) < 1e-5).all())

tensor(False)
tensor(False)


<torch.utils.hooks.RemovableHandle at 0x16d36bde430>

In [197]:
original
input_layernorm = original.gpt_neox.layers[0].input_layernorm
post_attention_layernorm = original.gpt_neox.layers[0].post_attention_layernorm
attention = original.gpt_neox.layers[0].attention
mlp = original.gpt_neox.layers[0].mlp
full_first_layer = original.gpt_neox.layers[0]

il = input_layernorm(embedded_tokens)
attn_pre = attention(il, attention_mask=None)
attn_act =  post_attention_layernorm(attn_pre[0])
mlp_act = mlp(il)
out = mlp_act+attn_act
actual = full_first_layer(embedded_tokens)
# Check if out is same as actual
print((th.abs(out[0] - actual[0][0]) < 1e-5).all())

torch.Size([1, 32, 768])
torch.Size([32, 768])
torch.Size([32, 768])
tensor(False)


In [200]:
original

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 768)
    (layers): ModuleList(
      (0): GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          (act): GELUActivation()
        )
      )
      (1): GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (

In [165]:
print(actual[0,0,:3])
print(serial_attn_then_mlp[0,0,:3])
print(add_attn_mlp[0,0,:3])
print(ln[0,0,:3])

tensor([-0.1854, -0.1584,  0.5147], grad_fn=<SliceBackward0>)
tensor([-0.0284,  0.0493,  0.1089], grad_fn=<SliceBackward0>)
tensor([-0.1896, -0.1550,  0.5029], grad_fn=<SliceBackward0>)
tensor([-0.3918, -0.3203,  1.0390], grad_fn=<SliceBackward0>)


In [151]:
model.blocks[0]

tensor([[[-0.1854, -0.1584,  0.5147,  ..., -0.1457,  0.5370,  0.1068],
         [-0.3354, -0.3593,  0.0724,  ..., -0.3656, -0.2987, -0.0381],
         [-0.3607, -0.2062,  0.1508,  ..., -0.2275,  0.1065, -0.3098],
         ...,
         [ 0.1859, -0.4723,  0.4314,  ..., -0.0177,  0.2272,  0.2222],
         [-0.3957, -0.3710, -0.2304,  ..., -0.0337,  0.3666,  0.1840],
         [ 0.0503,  0.6451,  0.2918,  ..., -0.0455, -0.3018,  0.0271]]],
       grad_fn=<AddBackward0>)

In [167]:
model.blocks[0].mlp.W_in.register_forward_hook(lambda m, i, o: print(o[0]))

In [118]:
transformer_blocks = [mod for mod in (list(model.children())[2] if hasattr(list(model.children())[i], "__iter__") else [list(model.children())[i]])]
embedding_matrix = th.nn.Sequential(*(transformer_blocks[:layer +1])).requires_grad_(False)
embedding_matrix(embedded_tokens)

tensor([[[-0.2488,  0.8720, -0.7121,  ..., -0.2444,  0.9457,  0.3559],
         [ 0.5306, -0.3194, -0.6743,  ...,  0.2470,  0.6080, -0.1530],
         [ 0.1681,  0.5646, -0.6971,  ..., -0.0104,  0.3981,  0.1245],
         ...,
         [ 0.3086,  0.5555, -0.0274,  ..., -0.4727,  0.5518, -0.0732],
         [ 0.9248,  1.5012, -0.4375,  ...,  0.0385,  0.9525, -0.2166],
         [ 0.4605, -1.1019,  5.6312,  ..., -1.2397,  3.0214, -2.3836]]],
       grad_fn=<AddBackward0>)

In [134]:
model.blocks[0].mlp.W_in

torch.Size([768, 3072])