In [2]:
%load_ext autoreload
%autoreload 2
# Set HuggingFace cache directory to scratch to save space.
import os
# Build the per-user scratch path once, then point both the notebook-level
# constant and the HuggingFace Hub environment variable at it.
CACHE_DIR = f"/scratch/{os.environ['USER']}/huggingface_cache"
os.environ['HUGGINGFACE_HUB_CACHE'] = CACHE_DIR
# Optional; can help when memory is tight.
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

### Load HellaSwag Dataset

In [3]:
from datasets import load_dataset
import numpy as np

dataset = load_dataset("Rowan/hellaswag", "en-US", split="train", cache_dir=CACHE_DIR)

# Sampling parameters. The seed is fixed so that a fresh "Restart & Run All"
# evaluates the same prompts every time.
NUM_SELECTED_PROMPTS = 5
PROMPT_SAMPLING_SEED = 0

prompt_rng = np.random.default_rng(PROMPT_SAMPLING_SEED)
selected_indices = prompt_rng.choice(len(dataset), size=NUM_SELECTED_PROMPTS, replace=False)

# Read the "ctx" column once instead of once per selected index, and index it
# with plain Python ints.
contexts = dataset["ctx"]
selected_prompts = [contexts[i] for i in selected_indices.tolist()]

selected_prompts

['A text intro leads into a picture of a dog and the same dog running along the yard. the dog',
 '[header] How to overcome fear of disease [title] Work with a therapist. [step] Therapy is generally considered one of the most effective ways to manage anxiety disorders, and illness anxiety disorder is no different. There are many different approaches to therapy.',
 '[header] How to respond to passive aggressive comments [title] Avoid reacting defensively. [step] When someone makes a passive aggressive comment, you may feel the need to defend yourself, or make accusations about them. Getting upset will likely do little good to change their habits.',
 '[header] How to select hearing protection [title] Familiarize yourself with noise reduction ratings (nrr). [step] Nrr is the standard rating system across all hearing protection devices. Under nrr, hearing devices are classified by their potential to limit decibels (db) in professional and occupational environments.',
 'He uses a tool to hit

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint shared by the model and its tokenizer.
# NOTE(review): trust_remote_code=True executes code shipped with the checkpoint;
# the download log suggests pinning a `revision` to avoid silently picking up new code.
PHI2_CHECKPOINT = "microsoft/phi-2"

model = AutoModelForCausalLM.from_pretrained(PHI2_CHECKPOINT, torch_dtype="auto", trust_remote_code=True)
model = model.to('cuda')

tokenizer = AutoTokenizer.from_pretrained(PHI2_CHECKPOINT, trust_remote_code=True)
# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

config.json: 100%|██████████| 863/863 [00:00<00:00, 538kB/s]
configuration_phi.py: 100%|██████████| 9.26k/9.26k [00:00<00:00, 6.52MB/s]
A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
modeling_phi.py: 100%|██████████| 62.7k/62.7k [00:00<00:00, 8.94MB/s]
A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
model.safetensors.index.json: 100%|██████████| 35.7k/35.7k [00:00<00:00, 29.0MB/s]
Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]
model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s][A
model-00001-of-00002.safetensors:   1

model-00001-of-00002.safetensors:  60%|█████▉    | 2.98G/5.00G [00:11<00:07, 269MB/s][A
model-00001-of-00002.safetensors:  60%|██████    | 3.02G/5.00G [00:11<00:06, 290MB/s][A
model-00001-of-00002.safetensors:  61%|██████    | 3.05G/5.00G [00:11<00:07, 258MB/s][A
model-00001-of-00002.safetensors:  62%|██████▏   | 3.08G/5.00G [00:11<00:07, 265MB/s][A
model-00001-of-00002.safetensors:  62%|██████▏   | 3.11G/5.00G [00:12<00:07, 246MB/s][A
model-00001-of-00002.safetensors:  63%|██████▎   | 3.15G/5.00G [00:12<00:07, 245MB/s][A
model-00001-of-00002.safetensors:  64%|██████▎   | 3.18G/5.00G [00:12<00:06, 261MB/s][A
model-00001-of-00002.safetensors:  64%|██████▍   | 3.22G/5.00G [00:12<00:06, 289MB/s][A
model-00001-of-00002.safetensors:  65%|██████▌   | 3.25G/5.00G [00:12<00:06, 290MB/s][A
model-00001-of-00002.safetensors:  66%|██████▌   | 3.28G/5.00G [00:12<00:05, 290MB/s][A
model-00001-of-00002.safetensors:  67%|██████▋   | 3.32G/5.00G [00:12<00:05, 322MB/s][A
model-00001-of-00002.

In [5]:
from IPython.display import display, HTML, Markdown

def generate(model, tokenizer, prompt, do_display=True, max_new_tokens=50):
    """Greedily complete ``prompt`` and optionally render the result as HTML.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text the completion is conditioned on.
        do_display: When true, show the prompt followed by the highlighted
            completion in the notebook output.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A tuple ``(prompt_ids, prompt_and_completion_ids, completion_text)``.
        The first two are 1-D token-id tensors; the last is the decoded
        completion only (the prompt is not included).
    """
    # Keep the inputs on the same device as the model instead of assuming CUDA.
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(model.device)
    prompt_ids = inputs['input_ids'][0]

    # do_sample=False requests greedy decoding explicitly (temperature is a
    # sampling-only knob). Passing pad_token_id avoids the per-call
    # "Setting `pad_token_id`" notice when the tokenizer defines one.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
    sequence_ids = outputs[0]

    # Slice on token ids rather than on characters of the decoded string:
    # decoding is not guaranteed to reproduce the prompt text exactly, so a
    # character offset can cut the completion in the wrong place.
    text = tokenizer.decode(sequence_ids[len(prompt_ids):])

    if do_display:
        # Imported locally because a later cell rebinds the global name `html`.
        from html import escape

        # Escape both parts so that markup-like text (code, special tokens such
        # as "<|endoftext|>") is shown literally instead of being parsed as HTML.
        display(HTML(
            f"<pre>{escape(prompt)}</pre>"
            f"<pre style='background-color: rgb(200, 255, 200, 1.0)'>{escape(text)}</pre>"
        ))

    return prompt_ids, sequence_ids, text

def run_coding_sample(model, tokenizer, display=True):
    """Ask the model to complete the stub of a prime-printing function.

    ``display`` is forwarded to ``generate`` as its ``do_display`` flag (inside
    this function the name shadows IPython's ``display``, which is not needed
    here). Returns whatever ``generate`` returns.
    """
    prime_stub = '''def print_prime(n):
       """
       Print all primes between 1 and n
       """'''
    show_output = display
    return generate(model, tokenizer, prime_stub, do_display=show_output)

def run_coding_sample_2(model, tokenizer):
    """Ask the model to complete the stub of a nested-loop array-sum function.

    Uses the default arguments of ``generate`` and returns its result unchanged.
    """
    array_sum_stub = '''def array_sum(array, nrows, ncols):
       """
       Use two for loops to add elements of an array.
       """'''
    completion = generate(model, tokenizer, array_sum_stub)
    return completion

In [6]:
from hooked_phi import attach_hooks, detach_hooks

# Decoded completions collected by the ablation sweep later in this cell
# (one entry per ablation level, appended in sweep order).
all_results = []
# NOTE(review): never read or written anywhere else in the visible notebook —
# confirm whether this is still needed.
all_results_tokenized = []

# Builds a hook that silences the MLP neurons deselected by `mask`.
def ablate_neurons(mask):
    """Create a hook that zeroes masked-out MLP neurons in place.

    Args:
        mask: Boolean tensor indexed as ``[layer, neuron]``; its first two
            dimensions must match the global ``model``'s layer count and MLP
            intermediate size. ``True`` keeps a neuron, ``False`` zeroes it.

    Returns:
        ``hook(neurons, layer_idx)``, which overwrites the deselected entries
        along the last dimension of ``neurons`` with 0 and returns the same tensor.
    """
    cfg = model.config
    assert mask.shape[0] == cfg.num_hidden_layers
    assert mask.shape[1] == cfg.intermediate_size

    def hook(neurons, layer_idx):
        # Invert the keep-mask of this layer to obtain the neurons to drop.
        dropped = mask[layer_idx].logical_not()
        neurons[..., dropped] = 0
        return neurons

    return hook

# Sweep over how many of the final MLPs are ablated (0 = unmodified baseline)
# and record the completion of a fixed prompt for each setting.
# Mask convention (see ablate_neurons): True keeps a neuron, False zeroes it.
try:
    for num_ablation_layers in range(0, 5, 1):
        display(Markdown(f'# Zeroing-out intermediate layers of the last {num_ablation_layers} MLPs'))

        mask = torch.ones((model.config.num_hidden_layers, model.config.intermediate_size), dtype=torch.bool)
        # Guarded because mask[-0:] would select every layer.
        if num_ablation_layers > 0:
            mask[-num_ablation_layers:, :] = False

        attach_hooks(model.model, ablate_neurons(mask))

        input_ids, input_and_completion_ids, result = generate(model, tokenizer, "Instruct: What is the square root of 2?\n\nOutput:", do_display=True)
        if num_ablation_layers == 0:
            # The un-ablated completion serves as the reference output.
            ground_truth = result

        all_results.append(result)
finally:
    # Remove the hooks even if generation fails, so that later cells never run
    # against a silently ablated model.
    detach_hooks(model.model)

# Zeroing-out intermediate layers of the last 0 MLPs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# Zeroing-out intermediate layers of the last 1 MLPs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# Zeroing-out intermediate layers of the last 2 MLPs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# Zeroing-out intermediate layers of the last 3 MLPs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# Zeroing-out intermediate layers of the last 4 MLPs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [7]:
from evaluator import perplexity_evaluator

# Score the completions collected by the ablation sweep (`all_results`, one per
# ablation level) with the project's perplexity evaluator.
# NOTE(review): the structure of the returned value is defined in evaluator.py,
# which is not visible here.
perplexity_result = perplexity_evaluator(preds=all_results)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.18it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 1/1 [00:00<00:00,  9.44it/s]


# Evaluate with HellaSwag

In [11]:
from evaluator import charcut_evaluator

# ground_truths[i]: prompt i plus the completion of the un-ablated model.
# predictions[k][i]: prompt i plus the completion with the last k MLPs ablated.
ground_truths = []
predictions = {}

for prompt in selected_prompts:
    try:
        # Ablate the MLPs of the last 0, 2 and 4 layers in turn (0 = baseline).
        for num_ablation_layers in range(0, 5, 2):
            mask = torch.ones((model.config.num_hidden_layers, model.config.intermediate_size), dtype=torch.bool)
            # Guarded because mask[-0:] would select every layer.
            if num_ablation_layers > 0:
                mask[-num_ablation_layers:, :] = False

            attach_hooks(model.model, ablate_neurons(mask))

            input_ids, input_and_completion_ids, result = generate(model, tokenizer, prompt, do_display=False)

            if num_ablation_layers == 0:
                ground_truths.append(prompt+result)

            predictions.setdefault(num_ablation_layers, []).append(prompt+result)
    finally:
        # Remove the hooks after every prompt, also when generation raises, so
        # the model is never left ablated for later cells.
        detach_hooks(model.model)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

['A text intro leads into a picture of a dog and the same dog running along the yard. the dog is running with a ball in its mouth. the dog is running towards a person. the person is smiling and waving at the dog. the text says "Meet Max, the best dog in the world. He loves to play fetch and make new friends', '[header] How to overcome fear of disease [title] Work with a therapist. [step] Therapy is generally considered one of the most effective ways to manage anxiety disorders, and illness anxiety disorder is no different. There are many different approaches to therapy. [substeps] Cognitive behavioral therapy (CBT) is a common approach that helps people identify and change negative thought patterns. Exposure therapy is another approach that involves gradually exposing people to their fears in a safe and controlled environment. [substeps] Med', '[header] How to respond to passive aggressive comments [title] Avoid reacting defensively. [step] When someone makes a passive aggressive comme

In [15]:
# Inspect the baseline texts (prompt + un-ablated completion) gathered by the
# HellaSwag ablation loop.
print(ground_truths)

['A text intro leads into a picture of a dog and the same dog running along the yard. the dog is running with a ball in its mouth. the dog is running towards a person. the person is smiling and waving at the dog. the text says "Meet Max, the best dog in the world. He loves to play fetch and make new friends', '[header] How to overcome fear of disease [title] Work with a therapist. [step] Therapy is generally considered one of the most effective ways to manage anxiety disorders, and illness anxiety disorder is no different. There are many different approaches to therapy. [substeps] Cognitive behavioral therapy (CBT) is a common approach that helps people identify and change negative thought patterns. Exposure therapy is another approach that involves gradually exposing people to their fears in a safe and controlled environment. [substeps] Med', '[header] How to respond to passive aggressive comments [title] Avoid reacting defensively. [step] When someone makes a passive aggressive comme

In [17]:
# List the ablation levels for which predictions were collected.
for ablation_level in predictions:
    print(ablation_level)

0
2
4


In [22]:
# NOTE(review): the evaluator is called without arguments, so the
# `ground_truths` / `predictions` collected above are never passed to it, and
# the recorded output is just `False`. Confirm the signature in evaluator.py
# and pass the data it expects.
charcut_evaluator()

False

In [None]:
# This is called "neuron vis", but can really be used for anything involving a score assigned to each token.
# In this case, visualize logprob
from neuron_visualization import basic_neuron_vis, basic_neuron_vis_signed

# Get granular nll estimates
display(Markdown('# Visualizing How Log-Likelihood Evolves, and For Which Tokens'))


def _completion_logprobs(model, sequence_ids, num_prompt_tokens):
    """Return the log-probability assigned to each completion token.

    Args:
        model: Causal LM; called as ``model(input_ids=...)`` with the logits
            read from the first element of its output.
        sequence_ids: 1-D tensor holding prompt ids followed by completion ids.
        num_prompt_tokens: Number of leading prompt tokens in ``sequence_ids``.

    Returns:
        1-D float32 tensor with one log-probability per completion token.
    """
    # Pure inference: skip building an autograd graph for every forward pass.
    with torch.no_grad():
        logits = model(input_ids=sequence_ids.unsqueeze(0))[0][0]
    # The logits at position i score token i + 1, so the completion tokens are
    # scored by positions [num_prompt_tokens - 1, len - 2].
    logits_for_output_tokens = logits[num_prompt_tokens - 1:-1]
    # Upcast before the softmax so that small logprob differences are not lost
    # to reduced-precision rounding.
    logprobs = torch.log_softmax(logits_for_output_tokens.float(), dim=-1)
    sampled_ids = sequence_ids[num_prompt_tokens:]
    positions = torch.arange(logprobs.shape[0], device=logprobs.device)
    return logprobs[positions, sampled_ids]


# 1. Get ground truth.
detach_hooks(model.model)
input_ids, input_and_completion_ids, ground_truth = generate(model, tokenizer, "Instruct: What is 1 + 234?\n\nOutput: 1 + 234 =", do_display=False, max_new_tokens=3)
completion_ids = input_and_completion_ids[len(input_ids):]

# Calculate baseline logprobs.
baseline_logprobs_for_sampled_output_tokens = _completion_logprobs(model, input_and_completion_ids, len(input_ids))

# The displayed tokens do not depend on the ablation, so decode them once.
visualized_tokens = input_and_completion_ids.cpu()
token_names = [tokenizer.decode([token_id]) for token_id in visualized_tokens.tolist()]

# Prompt tokens carry no deflection; they are shown with a neutral score of 0.
prompt_colors = torch.zeros(
    len(input_ids),
    dtype=baseline_logprobs_for_sampled_output_tokens.dtype,
    device=baseline_logprobs_for_sampled_output_tokens.device,
)

# 2. Ablate model. See which tokens suddenly become highly unlikely (by visualizing negative logprobs).
# Each iteration ablates exactly one MLP, walking from the last layer to the first.
# (To ablate the final k MLPs cumulatively instead, use `mask[-k:, :] = False`.)
num_layers = model.config.num_hidden_layers
try:
    for ablation_layer in range(num_layers - 1, -1, -1):
        display(Markdown(f'## Ablating MLP {ablation_layer + 1} / {num_layers}'))

        mask = torch.ones((num_layers, model.config.intermediate_size), dtype=torch.bool)
        mask[ablation_layer, :] = False

        # NOTE(review): hooked_phi is not visible here. Detaching first keeps the
        # ablation limited to a single layer even if attach_hooks adds to,
        # rather than replaces, previously attached hooks — confirm.
        detach_hooks(model.model)
        attach_hooks(model.model, ablate_neurons(mask))

        logprobs_for_sampled_output_tokens = _completion_logprobs(model, input_and_completion_ids, len(input_ids))

        # Negative deflection: the ablation made the sampled token less likely.
        deflections = logprobs_for_sampled_output_tokens - baseline_logprobs_for_sampled_output_tokens

        colors = torch.cat([prompt_colors, deflections])

        print("Average deflection:", deflections.mean().item())
        print("Worst deflection:", deflections.min().item())
        html = basic_neuron_vis_signed(token_names, colors, 1)
        display(HTML(html))
finally:
    # Never leave the model ablated, even when a forward pass fails.
    detach_hooks(model.model)



# Viewing "kernel" of MLP intermediate states

If the MLP intermediate states represent recall coefficients - with the MLP encoder representing detection, and the MLP decoder representing triggering of the new memory - we would expect that most MLP intermediate states would be 0 (as only a sparse number of "memories" should be activated at any given time), or that MLP intermediate states would be correlated with meaningful concepts. Let's test this hypothesis.

In [None]:
def log_neurons(neurons, layer_idx):
    """Hook that records MLP activations into the global ``log`` unchanged.

    A new inner list is opened whenever layer 0 is seen, so ``log`` ends up
    indexed as ``log[forward pass][layer]``. The global ``log`` list must exist
    before the hook fires. Returns ``neurons`` untouched.
    """
    starts_new_forward_pass = layer_idx == 0
    if starts_new_forward_pass:
        log.append([])

    current_pass = log[-1]
    current_pass.append(neurons)

    return neurons

# Register the logging hook on the model. The global `log` list it writes to is
# created in the next cell, so that cell must run before any forward pass.
# NOTE(review): nothing in the visible cells detaches this hook afterwards —
# confirm whether later cells should call detach_hooks(model.model).
attach_hooks(model.model, log_neurons)

In [None]:
# Fresh buffer for the `log_neurons` hook, which appends one list per forward pass.
log = []
inputs, outputs, text = run_coding_sample_2(model, tokenizer)

# Fail with a clear message when nothing was recorded (for example because the
# logging hook was not attached) instead of an opaque torch.stack error.
if not log:
    raise RuntimeError("No MLP activations were logged; attach the log_neurons hook before running this cell.")

# Index of the layer whose MLP activations are analysed below.
layer_id = 0

# log[forward pass index][layer index][batch index, token index in forward pass, mlp neuron index]
# Keep the activations at the last token position of every forward pass.
all_mlp_activations = [forward_pass[layer_id][0, -1, :] for forward_pass in log]

# Shape: (number of forward passes, number of MLP neurons).
stacked = torch.stack(all_mlp_activations, dim=0)

# Scale each MLP intermediate neuron by the maximum absolute value
# (the small epsilon avoids dividing by zero for neurons that never fire).
scale_by_dim = torch.max(stacked.abs(), dim=0, keepdim=True).values
stacked_scaled = stacked / (scale_by_dim + 1e-5)


In [None]:
import matplotlib.pyplot as plt

def _plot_activation_histogram(activations, title):
    """Draw a log-frequency histogram of all values in ``activations``."""
    fig, ax = plt.subplots()
    # Flatten and upcast so every value ends up in one float32 array for binning.
    ax.hist(activations.detach().flatten().float().cpu().numpy(), bins=25)
    ax.set_title(title)
    ax.set_xlabel("MLP activation level")
    ax.set_yscale("log")
    ax.set_ylabel("Frequency")
    plt.show()


# The titles name the layer actually analysed (`layer_id`) rather than a fixed
# "Last Layer" label, which did not match the selected layer.
_plot_activation_histogram(stacked, f"MLP activations: Layer {layer_id} (Unscaled)")
_plot_activation_histogram(stacked_scaled, f"MLP activations: Layer {layer_id} (Scaled per dim.)")

## Interpretation

It seems that MLP activations are very sparse. Therefore, it will hopefully be relatively simple to find meaningful MLP neurons to visualize.

To automatically determine which MLP activations are most interesting, I will calculate the variance of the activations.


In [None]:
from neuron_visualization import basic_neuron_vis

# Rank neurons by how much their activation varies across forward passes.
variance = stacked.var(dim=0)
highest_to_lowest_variance = variance.argsort(descending=True)

# `inputs` and `outputs` come from generate(): 1-D tensors holding the prompt
# ids and the prompt + completion ids. Drop the prompt to keep only the
# generated tokens (indexing them like raw tokenizer/model outputs would fail).
visualized_tokens = outputs.cpu()[len(inputs):]

# Decode token by token so that each generated token gets its own label.
token_names = [tokenizer.decode([token_id]) for token_id in visualized_tokens.tolist()]

# Show the 100 neurons with the highest variance.
visualized_neurons = highest_to_lowest_variance[:100]

for neuron_id in visualized_neurons:
    html = basic_neuron_vis(token_names, stacked[:, neuron_id], layer=layer_id, neuron_index=neuron_id)
    display(HTML(html))
