This notebook demonstrates how to generate gpt2-medium activations for both sentences of every sentence pair in the [STS benchmark dataset](https://paperswithcode.com/dataset/sts-benchmark).

In [4]:
# Basic imports
from datasets import load_dataset # Datasets is hugging face's way of distributing their data
from transformer_lens import HookedTransformer # This library allows us to grab the activations from pretrained LLM's
import torch # Pytorch

In [5]:
def get_acts(model, prompts):
    """Collect, for every prompt, the last-position residual activation of each layer.

    Args:
        model: Object exposing `cfg.n_layers`, `cfg.d_model` and a
            `run_with_cache(prompt)` method returning `(output, cache)`, where
            `cache` can be indexed with 'blocks.{layer}.hook_resid_post'
            (for example a HookedTransformer).
        prompts: Sized iterable of prompts. Each prompt is run through the
            model on its own, one forward pass per prompt.

    Returns:
        A tensor of shape [n_prompts, n_layers, d_model]. It never requires
        grad, because the forward passes run with autograd disabled.
    """
    # The number of layers our model has. GPT2-medium has 24
    layers = range(model.cfg.n_layers)

    # This is going to hold all of our activations. Notice the shape here: [n_prompts, n_layers, d_model]
    data = torch.zeros((len(prompts), len(layers), model.cfg.d_model))

    # We only read activations and never backpropagate, so switch autograd off. This
    # avoids building a computation graph on every forward pass and guarantees that
    # `data` does not end up attached to one, however the cache stores its tensors.
    with torch.no_grad():
        for i, prompt in enumerate(prompts):
            # Do a forward pass with the LLM on said prompt. This function lets us
            # cache the activations.
            _, activations = model.run_with_cache(prompt)

            # For every layer, grab the activation we want at that layer. The "[0, -1]"
            # picks the first (and only) batch entry, since we run one prompt at a time
            # (this could probably be improved), and then the last position of the
            # residual stream, which (if some literature is correct) probably carries
            # the "most information". Whether this is the right position to read from
            # is an experiment for another day.
            for j in layers:
                # Store that activation!
                data[i, j] = activations[f'blocks.{j}.hook_resid_post'][0, -1]

    return data

In [6]:
# Example: load the pretrained GPT2-medium checkpoint into a HookedTransformer so
# that the cells below can read its cached activations.
MODEL_NAME = "gpt2-medium"
gpt2_medium = HookedTransformer.from_pretrained(MODEL_NAME)



Loaded pretrained model gpt2-medium into HookedTransformer


In [12]:
# Quick sanity check: push two short prompts through the model and look at what
# comes back.
example_prompts = ["John is a great cook", "I don't know where my phone is"]
acts = get_acts(gpt2_medium, example_prompts)

# Expect [n_prompts, n_layers, d_model] = 2 prompts x 24 layers x 1024 features
assert tuple(acts.shape) == (2, 24, 1024)

So we can grab the activations for a set of prompts. Let's do it for the train and test sets of the STS dataset.

In [25]:
def save_activations_for_model(model=None, file_prefix="gpt2_medium"):
    """Compute and save get_acts activations for both sentence columns of STS-B.

    For the "test" and "train" splits of "sentence-transformers/stsb", runs
    get_acts over the 'sentence1' and 'sentence2' columns and writes each result
    with torch.save to "<file_prefix>_<split>_acts_<1|2>.npy" (paths relative to
    the current working directory). Nothing is written unless every check on
    both splits has passed.

    NOTE: despite the ".npy" extension the files are produced by torch.save, so
    they have to be read back with torch.load rather than numpy.load. The names
    are kept as they are so existing consumers of these files keep working.

    Args:
        model: Model handed to get_acts; must expose `cfg.n_layers` and
            `cfg.d_model`. Defaults to the notebook-level `gpt2_medium`, which
            is the model this function always used.
        file_prefix: Leading part of the four output file names.

    Raises:
        AssertionError: If a split's activations do not have the expected
            shape, or if the first 'sentence1' and 'sentence2' activations of a
            split are identical.
    """
    if model is None:
        # Looked up at call time so the default follows the notebook global.
        model = gpt2_medium

    # Number of sentence pairs each split is expected to contain.
    expected_pairs = {"test": 1379, "train": 5749}

    # Load every split up front so a download problem surfaces before any
    # expensive forward pass is run.
    splits = {
        split: load_dataset("sentence-transformers/stsb", split=split)
        for split in expected_pairs
    }

    activations = {}
    for split, dataset in splits.items():
        # Grab the activations for both the first and the second sentence of each pair.
        acts_1 = get_acts(model, dataset['sentence1'])
        acts_2 = get_acts(model, dataset['sentence2'])

        # Check the shapes, and that the first entries differ (to make sure we
        # actually computed two different sets of activations).
        expected_shape = [expected_pairs[split], model.cfg.n_layers, model.cfg.d_model]
        assert list(acts_1.shape) == expected_shape, (
            f"{split}: expected shape {expected_shape}, got {list(acts_1.shape)}"
        )
        assert acts_1.shape == acts_2.shape
        assert not torch.equal(acts_1[0], acts_2[0])

        activations[split] = (acts_1, acts_2)

    # Now we save everything
    for split, (acts_1, acts_2) in activations.items():
        torch.save(acts_1, f"{file_prefix}_{split}_acts_1.npy")
        torch.save(acts_2, f"{file_prefix}_{split}_acts_2.npy")

In [26]:
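# Left commented out on purpose: uncomment to regenerate the saved activation files.
# This runs the model on every sentence of the STS train and test splits.
# save_activations_for_model()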