In [1]:
from transformer_lens import HookedTransformer
import torch
import einops
from transformer_lens import utils

In [2]:
# This notebook only runs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained "attn-only-2l" checkpoint onto the chosen device.
model = HookedTransformer.from_pretrained("attn-only-2l", device=device, fold_ln=True)
print(device)

Loaded pretrained model attn-only-2l into HookedTransformer
cuda


In [3]:
def get_bigram_logits(tokens, model: HookedTransformer, add_bias=False):
    """Logits obtained by unembedding each token's embedding directly.

    Every id in ``tokens`` selects a row of ``model.W_E``; that row is then
    multiplied by ``model.W_U``. Nothing else from the model is used, so the
    scores at a position depend only on the token at that position.

    Args:
        tokens: Integer token ids indexed as (batch, pos).
        model: Provides ``W_E``, ``W_U`` and ``b_U``.
        add_bias: When true, ``model.b_U`` is added to the result.

    Returns:
        Tensor indexed as (batch, pos, d_vocab_out).
    """
    token_embeddings = model.W_E[tokens, :]
    direct_logits = einops.einsum(
        token_embeddings,
        model.W_U,
        "batch pos d_model, d_model d_vocab_out -> batch pos d_vocab_out",
    )
    if not add_bias:
        return direct_logits
    # In-place add on the freshly created einsum result.
    direct_logits += model.b_U
    return direct_logits

def get_topk_words(logits, model, pos=-1, k=10):
    """Return the ``k`` highest-scoring vocabulary tokens at one position.

    Args:
        logits: Scores indexed as ``logits[batch, pos, vocab]``. Only batch
            element 0 is read, so pass a batch of size 1.
        model: Object exposing ``model.tokenizer.convert_ids_to_tokens``.
        pos: Position to inspect; defaults to the last one.
        k: Number of tokens to return.

    Returns:
        List of ``k`` token strings, highest score first.
    """
    # Only the indices are needed; the scores themselves are not used.
    top_ids = torch.topk(logits[0, pos], k=k).indices
    # Give the tokenizer plain Python ints instead of a (possibly CUDA) tensor.
    return model.tokenizer.convert_ids_to_tokens(top_ids.tolist())

def get_log_probs(logits, tokens):
    """Log-probability the logits assign to each actual next token.

    ``logits`` is indexed as (batch, pos, vocab) and ``tokens`` as (batch, pos).
    The prediction at position ``i`` is scored against the token at ``i + 1``,
    so the result is indexed as (batch, pos - 1).
    """
    # The final position has no following token, so it is dropped.
    predictions = torch.log_softmax(logits, dim=-1)[:, :-1]
    # Targets are the input ids shifted left by one, with a trailing axis for gather.
    targets = tokens[:, 1:, None]
    return torch.gather(predictions, -1, targets)[..., 0]

## Bigram and embedding analysis

In [4]:
# Prompt analysed throughout this section.
text = "Social security is a government program that produces exuberant daisies"
# Token ids for the prompt, indexed as (batch, pos).
tokens = model.to_tokens(text)
# Full forward pass; `cache` keeps the intermediate activations for later cells.
logits, cache = model.run_with_cache(tokens)
# Logits are indexed as (batch, pos, d_vocab).
print(logits.shape)

torch.Size([1, 15, 48262])


In [5]:
# Greedy prediction at every position: highest-logit token id for batch element 0.
predicted_tokens = logits[0].argmax(dim=-1)
# Decode all predicted ids into one string for a quick read.
predicted_text = model.to_string(predicted_tokens)
print(predicted_text)

. Media is a great- that is a-ance andemonies.


In [6]:
# Token the model predicts at position 1 (decodes to " Media" in the output below).
media_token = predicted_tokens[1]
# Input token at position 2 of the prompt (decodes to " security").
security_token = tokens[0, 2]

# Show each token id next to its decoded string.
print(media_token, model.to_string(media_token))
print(security_token, model.to_string(security_token))

tensor(10967, device='cuda:0')  Media
tensor(3860, device='cuda:0')  security


In [7]:
# Per-head outputs from the cache, stacked along a leading head axis.
head_results = cache.stack_head_results(apply_ln=True)
head_results.shape # (head, batch, pos, d_model)

Tried to stack head results when they weren't cached. Computing head results now


torch.Size([16, 1, 15, 512])

In [8]:
print("Unigram most likely tokens")
# b_U is a fixed model parameter, so ranking it gives a context-free guess.
# It is reshaped to (1, 1, d_vocab) because get_topk_words indexes logits[0, pos].
get_topk_words(model.b_U.view(1, 1, -1), model)

Unigram most likely tokens


['Ġand', ',', 'Ġthe', 'Ċ', '.', 'Ġa', 'Ġin', ')', 'Ġto', 'Ġnot']

In [9]:
def ablate_mlp_hook(value, hook):
    """Forward hook that zero-ablates whatever activation it is attached to.

    ``hook`` is ignored. The return value is an all-zero tensor built with
    ``torch.zeros_like(value)``. Nothing here is MLP-specific despite the
    name; this notebook attaches it to an attention-output hook point.
    """
    zeroed = torch.zeros_like(value)
    return zeroed

# Layer whose attention output is zeroed; the hook name below is built from it.
layer_to_ablate = 0
# Not used in this cell. TODO: remove if no later cell appends to them.
original_losses = []
ablation_losses = []

# Baseline: loss of the unmodified model on the prompt.
original_loss = model(tokens, return_type="loss")

# Same forward pass with the chosen layer's attention output replaced by zeros.
ablated_loss = model.run_with_hooks(
    tokens,
    return_type="loss",
    fwd_hooks=[(f"blocks.{layer_to_ablate}.hook_attn_out", ablate_mlp_hook)],
)

# Direct embed -> unembed baseline. add_bias is left at its default (False),
# so b_U is not included in these logits.
bigram_pred = get_bigram_logits(tokens, model)
bigram_loss = -get_log_probs(bigram_pred, tokens).mean()

print(f"Original model loss: {original_loss:.6f}")
print(f"Ablated loss: {ablated_loss:.6f}")
print(f"Bigram loss {bigram_loss}")

Original model loss: 5.388212
Ablated loss: 7.431926
Bigram loss 9.814844131469727


### Head attribution

In [10]:
# Result of individual heads, shape=(head, batch, pos, d_model)
head_results = cache.stack_head_results(apply_ln=True)

# Residual-stream direction of the model's top prediction at each position,
# shape=(batch, pos, d_model)
model_tokens = logits.argmax(dim=-1)
directions = model.tokens_to_residual_directions(model_tokens)

In [11]:
# Dot product between each head's output and the predicted token's direction,
# summed over batch and position so that one number is left per head.
head_attribution = einops.einsum(head_results, directions, "head batch pos d_model, batch pos d_model -> head")
print(head_attribution)

tensor([ -2.2576,  -4.5832,  -3.0620,   2.2735,   2.3002,   9.1988,  -2.8497,
         -1.3467,  -1.7043,   1.4677,  19.0915,   5.8268,  13.0674,   4.6253,
          8.9530, -35.5903], device='cuda:0')


In [12]:
cos = torch.nn.CosineSimilarity(dim=-1)
# Cosine similarities between head outputs and directions, shape=(head, pos).
# directions[0] is (pos, d_model) and broadcasts against (head, pos, d_model).
similarities = cos(directions[0], head_results[:, 0])
# Per-head similarity at position 2, where the input token is " security"
print(similarities[:, 2])
# Largest similarity over all heads and positions
print(torch.max(similarities))

tensor([-0.0223, -0.0876, -0.0189, -0.0100,  0.0426,  0.0146, -0.0363,  0.0008,
        -0.0137,  0.1088, -0.0167,  0.1116,  0.0736, -0.0306,  0.0404, -0.0601],
       device='cuda:0')
tensor(0.2165, device='cuda:0')


In [13]:
# Cosine similarity between each input token's embedding and the direction of
# the token predicted at the same position, shape=(pos,).
# Note: this rebinds `similarities` from the previous cell.
embed = model.W_E[tokens, :]
similarities = cos(directions[0], embed[0])
print(similarities)
print(torch.max(similarities))

tensor([-0.0244,  0.1912,  0.0667,  0.0734,  0.1852,  0.0781,  0.0873, -0.0318,
         0.1332,  0.1174,  0.1448,  0.0312,  0.2665,  0.0455,  0.0561],
       device='cuda:0')
tensor(0.2665, device='cuda:0')


## Induction analysis

In [14]:
# Alternative prompt that was tried: "Harry Potter is great. Harry"
# NOTE(review): in the test_prompt output further down, the leading "Leonard" is
# tokenized as 'Leon' + 'ard' while the later ' Leonard' is a single token, so
# this prompt contains no exactly repeated token. Confirm that is intended for
# an induction test.
text = "Leonard Potter is great. Leonard"
# Rebinds `tokens`, `logits` and `cache` from the previous section.
tokens = model.to_tokens(text)
logits, cache = model.run_with_cache(tokens)

In [15]:
# Compute cosine similarity of prediction " Potter" to head outputs
# prepend_bos=False so that only the " Potter" token itself is converted.
potter_token = model.to_tokens(" Potter", prepend_bos=False)
potter_direction = model.tokens_to_residual_directions(potter_token)
print("Potter direction shape", potter_direction.shape)

# Head outputs for the new prompt's cache.
head_results = cache.stack_head_results(apply_ln=True)
print("Head result shape", head_results.shape) # head batch pos res

cos = torch.nn.CosineSimilarity(dim=-1)
# The single direction vector broadcasts against every (head, pos) output.
similarities = cos(potter_direction, head_results[:, 0])
print("Cosine similarity shape", similarities.shape) # head position
# Per-head similarity to " Potter" at the last position (the final " Leonard" token)
similarities[:, -1]

Potter direction shape torch.Size([512])
Tried to stack head results when they weren't cached. Computing head results now
Head result shape torch.Size([16, 1, 8, 512])
Cosine similarity shape torch.Size([16, 8])


tensor([ 0.0334, -0.0452,  0.0025, -0.0315, -0.0109, -0.0741,  0.0194, -0.0182,
         0.0218,  0.0739,  0.0436, -0.0171,  0.0950, -0.0613,  0.1445,  0.1164],
       device='cuda:0')

In [16]:
# Full model's top-10 next-token candidates at the last position of
# "Leonard Potter is great. Leonard"
get_topk_words(logits, model)

['Ġis',
 'ĠCohen',
 'ĠBernstein',
 'Ġhas',
 'ĠPotter',
 'ĠNim',
 'Ġwas',
 'Ġand',
 ',',
 'âĢĻ']

In [17]:
# Same ranking using only the direct embed -> unembed path: no attention, and
# no b_U because add_bias is left at its default.
bigram_logits = get_bigram_logits(tokens, model)
print(get_topk_words(bigram_logits, model, pos=-1))

['town', 'sson', 'ounce', 'stown', 'esque', 'nier', 'ĠNim', 'ĠCohen', 'ounced', 'ette']


In [18]:
# Report where " Potter" ranks as the next token after the prompt, alongside
# the top predictions. The prompt literal repeats the `text` defined earlier in
# this section.
utils.test_prompt("Leonard Potter is great. Leonard", " Potter", model, prepend_bos=True)

Tokenized prompt: ['<|BOS|>', 'Leon', 'ard', ' Potter', ' is', ' great', '.', ' Leonard']
Tokenized answer: [' Potter']


Top 0th token. Logit: 14.11 Prob:  9.62% Token: | is|
Top 1th token. Logit: 13.53 Prob:  5.38% Token: | Cohen|
Top 2th token. Logit: 13.43 Prob:  4.86% Token: | Bernstein|
Top 3th token. Logit: 13.34 Prob:  4.45% Token: | has|
Top 4th token. Logit: 13.33 Prob:  4.40% Token: | Potter|
Top 5th token. Logit: 12.83 Prob:  2.65% Token: | Nim|
Top 6th token. Logit: 12.69 Prob:  2.31% Token: | was|
Top 7th token. Logit: 12.64 Prob:  2.21% Token: | and|
Top 8th token. Logit: 12.61 Prob:  2.14% Token: |,|
Top 9th token. Logit: 12.58 Prob:  2.07% Token: |’|
