# Adversarial prompting for 'it is' feature


In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (e.g. the local `sprint` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [96]:
# Index of the encoder feature studied in this notebook (the 'it is' feature from the title).
# Used as the default `feature` of get_feature_scores_for_prompt and as `feature_id`
# when collecting max-activating examples.
FEATURE_IDX = 4542

In [140]:
import torch
import numpy as np

from transformer_lens import utils
from sprint.loading import load_all
from sprint.vars import NEWLINE

# Load the model, the token dataset and the trained encoder (bound to `sae`) in one call.
# NOTE(review): fold_ln=True presumably folds LayerNorm into the adjacent weights — confirm in sprint.loading.
model, data, sae = load_all(fold_ln=True)

Loaded pretrained model gelu-1l into HookedTransformer
Moving model to device:  cuda
Changing model dtype to torch.float16
Model device: cuda:0
Tokens shape: torch.Size([215402, 128]), dtype: torch.int64, device: cuda:0
Loading run1 from HuggingFace at 25
{'batch_size': 4096,
 'beta1': 0.9,
 'beta2': 0.99,
 'buffer_batches': 12288,
 'buffer_mult': 384,
 'buffer_size': 1572864,
 'd_mlp': 2048,
 'dict_mult': 8,
 'enc_dtype': 'fp32',
 'l1_coeff': 0.0003,
 'lr': 0.0001,
 'model_batch_size': 512,
 'num_tokens': 2000000000,
 'seed': 52,
 'seq_len': 128}
Encoder device: cuda:0


In [158]:
# def does_feature_fire(prompt, feature=FEATURE_IDX, model=model, sae=sae, layer=0):
#     act_name = utils.get_act_name("post", layer)
#     model_out, cache = model.run_with_cache(prompt, names_filter=[act_name])
#     # fires =  (sae(cache[act_name])[2][0, :, feature] > 0).any()
#     pre_relu = (sae.W_enc @ cache[act_name] + sae.b_enc)[0, :, feature]
#     return cache[act_name]

#     # return pre_relu
#     return (cache[act_name] @ sae.W_enc + sae.b_enc)[0, :, FEATURE_IDX]


def get_feature_scores_for_prompt(
    prompt, model=model, encoder=sae, feature=FEATURE_IDX, act_name="post", layer=0, prepend_bos=True, bias=False
):
    """Project a prompt's cached activations onto a single encoder feature.

    Runs `model` on `prompt`, caches only the activation named by
    `utils.get_act_name(act_name, layer)`, subtracts the encoder's decoder bias
    and takes the dot product with column `feature` of `encoder.W_enc`.

    Note: the defaults for `model`, `encoder` and `feature` are bound when this
    cell is executed; re-run the cell after changing those globals.

    Args:
        prompt: Input passed straight to `model.run_with_cache`.
        model: Model exposing `run_with_cache`.
        encoder: Object exposing `W_enc`, `b_dec` and `b_enc`.
        feature: Column of `encoder.W_enc` to project onto.
        act_name: Short activation name handed to `utils.get_act_name`.
        layer: Layer index handed to `utils.get_act_name`.
        prepend_bos: Forwarded to `model.run_with_cache`.
        bias: If True, add `encoder.b_enc[feature]` to every score.

    Returns:
        NumPy array of scores, one per activation row after flattening all
        leading dimensions (for an integer `feature` this is 1-D). The values
        are not passed through any nonlinearity here.
        NOTE(review): with bias=True this is presumably the encoder's
        pre-activation for the feature, so `> 0` means it fires — confirm
        against the encoder's forward pass.
    """
    hook_name = utils.get_act_name(act_name, layer)
    # Everything, including the encoder projection, runs without autograd:
    # the result is only ever converted to NumPy, so no graph is needed.
    with torch.no_grad():
        _, cache = model.run_with_cache(prompt, names_filter=[hook_name], prepend_bos=prepend_bos)
        # Flatten leading dimensions so each row is one activation vector whose
        # width matches the encoder's input dimension.
        acts = cache[hook_name].reshape(-1, encoder.W_enc.shape[0])
        out = (acts - encoder.b_dec) @ encoder.W_enc[:, feature]
        if bias:
            out += encoder.b_enc[feature]
    return utils.to_numpy(out)

    # loss, x_reconstruct, hidden_acts, l2_loss, l1_loss = encoder(acts)
    # return utils.to_numpy(hidden_acts[:, feature])

In [257]:
# Hand-written probe prompts: natural sentences plus adversarial variants that
# swap or repeat tokens around "It is" / "it be".
PROMPTS = [
    "Full stop. It is in your best interest to",
    "Full stop. It intercepted in your best interest",
    "The quick brown fox jumps over the lazy dog",
    "Enough about the fox. It's time to talk dog",
    "I have intercepted a message from the fox",
    "redirect what is left of the fox to the dog",
    "redirect intercepted materially the antit unequivocally",
    "Stop.\nIt intercepted.\nStop. It really should be",
    "Stop.\nredirect.\nStop. It really should",
    "To be or not to be, that is the question",
    "They don't think it be like it is, but it do",
    "They don't think it be like it be, but it do",
    "They don't think it be like It be but it do",
    "They don't think it be like.\n It be, but it do",
    " intercepted it be.\n It be.\n It be.\n It be it be",
    " it be.\n It be.\n It be.\n It be it be",
    " it be.\n It be.\n It be.\n It be.\n It be.\n It be.",
    " it be.\n It be.\n It be.\n It be",
    " it be.\n It be.",
    "It pertains comes.\n Suffice to say, it be",
    "Get the best value for your business.\n It's also advisable that when looking",
    "Get the best value for your business.\n It would also be advisable that when looking",
    " it was.\n It was.",
    "not bear the responsibility of the task, and he is the backbone of the family",
]

# The id of the " is" token does not depend on the prompt, so look it up once.
is_token_id = model.to_single_token(" is")

# Printed columns: prompt | any score > 0 | is the top-scoring token " is"? | max score
for prompt in PROMPTS:
    feat_score = get_feature_scores_for_prompt(prompt, bias=True)
    # Tokenize with an explicit BOS so token positions line up with the score
    # positions (get_feature_scores_for_prompt defaults to prepend_bos=True).
    tokens = model.to_tokens(prompt, prepend_bos=True).cpu()
    top_position = np.argmax(feat_score)
    print(
        prompt.replace("\n", NEWLINE),
        (feat_score > 0).any(),
        tokens[0, top_position].item() == is_token_id,
        np.max(feat_score),
        sep="\t\t",
    )

Full stop. It is in your best interest to		True		True		1.73
Full stop. It intercepted in your best interest		False		False		-1.064
The quick brown fox jumps over the lazy dog		False		False		-1.477
Enough about the fox. It's time to talk dog		False		False		-0.8643
I have intercepted a message from the fox		False		False		-1.393
redirect what is left of the fox to the dog		False		True		-1.557
redirect intercepted materially the antit unequivocally		False		False		-1.372
Stop.↩It intercepted.↩Stop. It really should be		False		False		-0.911
Stop.↩redirect.↩Stop. It really should		False		False		-0.9434
To be or not to be, that is the question		False		True		-0.251
They don't think it be like it is, but it do		True		True		0.8535
They don't think it be like it be, but it do		False		False		-0.6367
They don't think it be like It be but it do		False		False		-0.4922
They don't think it be like.↩ It be, but it do		False		False		-0.2478
 intercepted it be.↩ It be.↩ It be.↩ It be it be		True		False		0.0

In [254]:
# Cosine similarity between the rows of model.W_E for the tokens " is" and "Is".
is_embedding = model.W_E[model.to_single_token(" is")]
other_embedding = model.W_E[model.to_single_token("Is")]
torch.nn.functional.cosine_similarity(is_embedding, other_embedding, dim=0)

tensor(0.2717, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward1>)

In [249]:
# Cosine similarity between the row of model.W_E for " is" and that of a second token.
# NOTE(review): the second token string is empty here — it may have been lost
# when the notebook was exported; confirm which token was intended.
is_embedding = model.W_E[model.to_single_token(" is")]
other_embedding = model.W_E[model.to_single_token("")]
torch.nn.functional.cosine_similarity(is_embedding, other_embedding, dim=0)

tensor(0.2495, device='cuda:0', dtype=torch.float16, grad_fn=<SumBackward1>)

In [261]:
# Where does an activation of .2097 rank among this feature's sampled examples?
# Collect 1000 examples for FEATURE_IDX to compare against.
from sprint.feature_exploration import max_activating_examples

max_examples = max_activating_examples(
    model=model,
    data=data,
    feature_id=FEATURE_IDX,
    n_examples=1000,
    evenly_spaced=True,
)

In [271]:
# Activation value whose rank among the sampled examples we want to know.
ACTIVATION_THRESHOLD = 0.2097

# Largest value along axis 1 for each sampled example.
# NOTE(review): assumes max_examples[1] holds activations shaped
# (n_examples, seq_len) — confirm against max_activating_examples.
per_example_max = max_examples[1].max(axis=1)[0]

# Percentage of examples whose peak exceeds the threshold. The fraction is taken
# over however many examples were actually returned, rather than a hard-coded
# 1000 that must be kept in sync with the n_examples argument.
(per_example_max > ACTIVATION_THRESHOLD).float().mean() * 100

tensor(94.9000, device='cuda:0')