# Setup

In [None]:
# Detect Google Colab by attempting the import. Only the import is guarded, so a
# failure while mounting Drive or installing the dependency surfaces as a real
# error instead of being reported as "not running in Colab".
try:
  import google.colab
except ImportError:
  IN_COLAB = False
  print("Running as a Jupyter notebook - intended for development only!")
else:
  IN_COLAB = True
  print("Running as a Colab notebook")
  from google.colab import drive
  drive.mount('/content/gdrive')
  # Same effect as the `%cd` line magic, written as a call like the pip magic below.
  get_ipython().run_line_magic('cd', '/content/gdrive/MyDrive/AI-ML-Stuff/Dissertation/work/Transformer-Masking')
  # Note: this is different to the usual setup, since I'm using
  # Alex Turner's activation addition repo
  commit = "08efeb9" # Stable commit
  get_ipython().run_line_magic(magic_name='pip', line=f'install -U git+https://github.com/montemac/algebraic_value_editing.git@{commit}')

Running as a Colab notebook
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/AI-ML-Stuff/Dissertation/work/Transformer-Masking


In [None]:
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
pio.renderers.default = "notebook_connected" # or use "browser" if you want plots to open with browser
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import einops
from fancy_einsum import einsum
from typing import List, Optional, Callable, Tuple, Union, Dict
from functools import partial
from tqdm import tqdm
from IPython.display import display
import random
import os
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
import json

# Neel's Stuff
from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

# Activation Addition Stuff
from algebraic_value_editing.completion_utils import print_n_comparisons

# Saves computation time, since we don't need it for the contents of this notebook
t.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f966c7a8f10>

In [None]:
!nvidia-smi

Thu Jul  6 16:31:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    38W / 300W |  16146MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Load the pretrained "gpt2-small" checkpoint and move it to the GPU (needs CUDA).
gpt2_small = HookedTransformer.from_pretrained("gpt2-small").cuda()
# Human-readable label; the plotting helper reads `model.name` for its title.
gpt2_small.name = "gpt2 small"

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-small into HookedTransformer
Moving model to device:  cuda


In [None]:
# Load the pretrained "gpt2-xl" checkpoint and move it to the GPU (needs CUDA).
gpt2_xl = HookedTransformer.from_pretrained("gpt2-xl").cuda()
# Human-readable label; the plotting helper reads `model.name` for its title.
gpt2_xl.name = "gpt2 xl"

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-xl into HookedTransformer
Moving model to device:  cuda


# Functions

In [None]:
def SVD(matrix: t.Tensor) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  Thin wrapper around `t.linalg.svd`, called with its default arguments.

  Args:
    matrix: the tensor to decompose.

  Returns:
    A plain tuple `(U, S, V_H)` holding the three factors in the order
    `t.linalg.svd` produces them.
  """
  return tuple(t.linalg.svd(matrix))


In [None]:
# Run a dataset through the model, store all the activations
def dataset_activations(
  model: HookedTransformer,
  dataset: List[str]
):
  """
  Tokenise every prompt in `dataset` and run them through `model` in one forward pass.

  Args:
    model: model providing `to_tokens` and `run_with_cache`.
    dataset: prompts, tokenised together by `model.to_tokens`.

  Returns:
    The `(logits, cache)` pair produced by `model.run_with_cache`, called with
    `return_cache_object=True` and `remove_batch_dim=False`.
  """
  # TODO: make this run in batches instead of at once.
  tokens = model.to_tokens(dataset)
  output, activation_cache = model.run_with_cache(
    tokens,
    return_cache_object=True,
    remove_batch_dim=False,
  )
  return output, activation_cache

In [None]:
from torch.nn.utils.rnn import pad_sequence

def dataset_activations_optimised(
  model: HookedTransformer,
  dataset: List[str],
  location: str,
  max_batch_size: int,
  pad_token_id: int = 50256
):
  """
  Collect, for every prompt, the activation at `location` of its last
  non-padding token, running the model in batches to bound memory use.

  Args:
    model: model providing `to_tokens` and `run_with_cache`.
    dataset: prompts to run.
    location: key used to index the activation cache.
    max_batch_size: maximum number of prompts per forward pass.
    pad_token_id: token id treated as padding when locating the last real
      token. Defaults to 50256, the value this function previously hard-coded
      (presumably GPT-2's end-of-text id -- confirm before using another
      tokenizer).

  Returns:
    A CPU tensor with one row per prompt, in dataset order. Assumes
    `cache[location]` is laid out as (batch, position, dim) -- the gather
    below indexes axis 1 as the position axis.

  Note:
    An empty `dataset` produces no batches, so the final `t.cat` raises.
  """
  num_batches = (len(dataset) + max_batch_size - 1) // max_batch_size
  all_final_activations = []

  for batch_idx in range(num_batches):
    t.cuda.empty_cache()

    # Slicing clamps at the end of the list, so the last batch may be shorter.
    start_idx = batch_idx * max_batch_size
    batch_subset = dataset[start_idx:start_idx + max_batch_size]
    batch_tokens = model.to_tokens(batch_subset)

    # The running count of non-padding tokens first equals the row total at the
    # last non-padding position; argmax returns that first match.
    mask = batch_tokens != pad_token_id
    reached_total = mask.cumsum(dim=1) == mask.sum(dim=1).unsqueeze(1)
    final_indices = reached_total.int().argmax(dim=1)

    _, cache = model.run_with_cache(batch_tokens, return_cache_object=True, remove_batch_dim=False)
    activations = cache[location]

    # Pick one position per row: index shape (batch, 1, dim).
    index_expanded = final_indices.view(-1, 1, 1).expand(-1, -1, activations.size(2))
    final_activations = t.gather(activations, 1, index_expanded)

    # Drop ONLY the singleton position axis. A bare squeeze() would also drop
    # the batch axis for a one-prompt batch and make the concatenation fail.
    all_final_activations.append(final_activations.squeeze(1).cpu())

  return t.cat(all_final_activations, dim=0)

In [None]:
def dataset_activations_optimised_locations(
  model: HookedTransformer,
  dataset: List[str],
  layers: int,
  location: str,
  max_batch_size: int,
  pad_token_id: int = 50256
):
  """
  Batched collection of last-token activations for several layers at once.

  Each batch is run through the model once; for every layer index in
  `range(layers)` the cache entry `location.format(layer)` is read and the
  activation of each prompt's last non-padding token is kept.

  Args:
    model: model providing `to_tokens` and `run_with_cache`.
    dataset: prompts to run.
    layers: number of layer indices to read (0 .. layers - 1).
    location: cache-key template containing one `{}` placeholder for the
      layer index.
    max_batch_size: maximum number of prompts per forward pass.
    pad_token_id: token id treated as padding when locating the last real
      token. Defaults to 50256, the value this function previously hard-coded
      (presumably GPT-2's end-of-text id -- confirm before using another
      tokenizer).

  Returns:
    Dict mapping layer index to a CPU tensor with one row per prompt, in
    dataset order. Assumes each cache entry is laid out as
    (batch, position, dim) -- the gather below indexes axis 1.

  Note:
    An empty `dataset` produces no batches, so the final per-layer lookup
    raises KeyError.
  """
  num_batches = (len(dataset) + max_batch_size - 1) // max_batch_size
  all_final_activations = {}

  for batch_idx in range(num_batches):
    t.cuda.empty_cache()

    # Slicing clamps at the end of the list, so the last batch may be shorter.
    start_idx = batch_idx * max_batch_size
    batch_subset = dataset[start_idx:start_idx + max_batch_size]
    batch_tokens = model.to_tokens(batch_subset)

    # The running count of non-padding tokens first equals the row total at the
    # last non-padding position; argmax returns that first match.
    mask = batch_tokens != pad_token_id
    reached_total = mask.cumsum(dim=1) == mask.sum(dim=1).unsqueeze(1)
    final_indices = reached_total.int().argmax(dim=1).view(-1, 1, 1)

    _, cache = model.run_with_cache(batch_tokens, return_cache_object=True, remove_batch_dim=False)

    for layer in range(layers):
      activations = cache[location.format(layer)]
      index_expanded = final_indices.expand(-1, -1, activations.size(2))
      final_activations = t.gather(activations, 1, index_expanded)
      # Drop ONLY the singleton position axis. A bare squeeze() would also drop
      # the batch axis for a one-prompt batch and make the concatenation fail.
      all_final_activations.setdefault(layer, []).append(final_activations.squeeze(1).cpu())

  for layer in range(layers):
    all_final_activations[layer] = t.cat(all_final_activations[layer], dim=0)

  return all_final_activations

In [None]:
# Reshape into a matrix (looks at activations of each token)

def reshape_activations(
  batch_activations: t.Tensor
) -> t.Tensor:
  """
  Merge the batch and token axes so that each row is one token's activation vector.

  Args:
    batch_activations: tensor with three axes, read as (batch, tokens, dim).

  Returns:
    Tensor with two axes, (batch * tokens, dim).
  """
  return einops.rearrange(batch_activations, 'b tokens dim -> (b tokens) dim')



In [None]:
def activation_SVD(
    model: HookedTransformer,
    dataset: List[str],
    location: str
) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  SVD of the activations cached at `location` when `dataset` is run through `model`.

  The prompts go through `dataset_activations`; the cache entry for
  `location` is flattened by `reshape_activations` so every token of every
  prompt contributes one row, and that matrix is passed to `SVD`.

  Returns:
    The `(U, S, V_H)` triple from `SVD`.
  """
  cache = dataset_activations(model, dataset)[1]
  token_rows = reshape_activations(cache[location])
  left, singular_values, right_h = SVD(token_rows)
  return left, singular_values, right_h


In [None]:
# Run a dataset through the model, store all the activations
def dataset_activations_tokens(
  model: HookedTransformer,
  dataset_tokens: t.Tensor
):
  """
  Run an already-tokenised batch through `model` and keep the activation cache.

  No tokenisation happens here: `dataset_tokens` is handed to
  `model.run_with_cache` unchanged.

  Returns:
    The `(logits, cache)` pair produced by `model.run_with_cache`, called with
    `return_cache_object=True` and `remove_batch_dim=False`.
  """
  output, activation_cache = model.run_with_cache(
    dataset_tokens,
    return_cache_object=True,
    remove_batch_dim=False,
  )
  return output, activation_cache

def reshape_activations(
  batch_activations: t.Tensor
) -> t.Tensor:
  """
  Flatten (batch, tokens, dim) activations into one row per token.

  NOTE(review): this repeats the `reshape_activations` helper defined in an
  earlier cell with the same body; one of the two definitions can be removed.
  """
  pattern = 'b tokens dim -> (b tokens) dim'
  return einops.rearrange(batch_activations, pattern)

def activation_SVD_tokens(
    model: HookedTransformer,
    dataset_tokens: t.Tensor,
    location: str
) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  SVD of the activations cached at `location` for an already-tokenised batch.

  Args:
    model: model handed to `dataset_activations_tokens`.
    dataset_tokens: token batch, forwarded unchanged to
      `dataset_activations_tokens` (annotated `t.Tensor` to match that helper;
      it was previously annotated `List[str]`).
    location: key used to index the activation cache.

  Returns:
    The `(U, S, V_H)` triple from `SVD`, computed on the cache entry after
    `reshape_activations` has flattened it to one row per token.
  """
  _, cache = dataset_activations_tokens(model, dataset_tokens)
  squeezed_activations = reshape_activations(cache[location])
  U, S, V_H = SVD(squeezed_activations)
  return U, S, V_H

In [None]:
def activation_SVD_covariance(
    model: HookedTransformer,
    dataset_tokens: List[str],
    location: str
) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  SVD of the scatter matrix of the activations cached at `location`.

  The flattened activations (one row per token) are mean-centred per feature
  and multiplied as `centred.T @ centred`. The product is NOT divided by the
  number of rows, so it is the covariance matrix only up to that scale factor.
  The shapes of the flattened activations and of the scatter matrix are
  printed along the way.

  NOTE(review): `dataset_tokens` is annotated `List[str]` but is forwarded
  unchanged to `dataset_activations_tokens`, which declares `t.Tensor` --
  confirm which input type is intended.

  Returns:
    The `(U, S, V_H)` triple from `SVD` applied to the scatter matrix.
  """
  cache = dataset_activations_tokens(model, dataset_tokens)[1]
  token_rows = reshape_activations(cache[location])
  print(token_rows.shape)
  deviations = token_rows - token_rows.mean(dim=0, keepdim=True)
  scatter = deviations.T @ deviations
  print(scatter.shape)
  left, singular_values, right_h = SVD(scatter)
  return left, singular_values, right_h


# activation_SVD_covariance(gpt2_small, multiplication_dataset, 'blocks.5.hook_attn_out')



In [None]:
# Using tokens as a starting point, and also using the covariance matrix
def activation_SVD_tokens_covariance(
    model: HookedTransformer,
    dataset_tokens: t.Tensor,
    location: str
) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  SVD of the scatter matrix of activations at `location` for a token batch.

  Performs the same computation as `activation_SVD_covariance`, without the
  debug prints.

  Args:
    model: model handed to `dataset_activations_tokens`.
    dataset_tokens: token batch, forwarded unchanged to
      `dataset_activations_tokens` (annotated `t.Tensor` to match that helper;
      it was previously annotated `List[str]`).
    location: key used to index the activation cache.

  Returns:
    The `(U, S, V_H)` triple from `SVD` applied to `centred.T @ centred`,
    where `centred` is the flattened activation matrix minus its per-feature
    mean. The product is not divided by the number of rows.
  """
  _, cache = dataset_activations_tokens(model, dataset_tokens)
  squeezed_activations = reshape_activations(cache[location])
  centred_activations = squeezed_activations - squeezed_activations.mean(dim=0, keepdim=True)
  covariance_matrix = centred_activations.T @ centred_activations
  U, S, V_H = SVD(covariance_matrix)
  return U, S, V_H

# Singular Vector Functions

In [None]:
def dataset_projection(
    X: t.Tensor,
    B: t.Tensor
) -> t.Tensor:
  """
  Apply the matrix `B.T @ B` to the transposed dataset `X.T`.

  Args:
    X: dataset with one datapoint per row, shape (N, D).
    B: basis with one vector per ROW, shape (M, D). This is the layout
      `top_k_projection` passes in (`V_H[:k, :]`). `B.T @ B` is an orthogonal
      projector onto the span of those rows only when they are orthonormal.

  Returns:
    Tensor of shape (D, N): the projected datapoints as COLUMNS. Callers that
    want the row layout of `X` must transpose the result, as
    `top_k_projection` does.
  """
  projector = B.T @ B
  return projector @ X.T


In [None]:
def top_k_projection(
    X: t.Tensor,
    V_H: t.Tensor,
    k: int
) -> t.Tensor:
  """
  Project the rows of `X` onto the span of the first `k` rows of `V_H`.

  Args:
    X: dataset with one datapoint per row.
    V_H: matrix whose rows are the candidate basis vectors.
    k: how many leading rows of `V_H` to keep.

  Returns:
    `dataset_projection(X, V_H[:k, :])` transposed, so datapoints are rows
    again, as in `X`.
  """
  leading_rows = V_H[:k, :]
  return dataset_projection(X, leading_rows).T

In [None]:
def matrix_error(
    X_1: t.Tensor,
    X_2: t.Tensor
) -> t.Tensor:
  """
  Sum of the row-wise L2 distances between two datasets.

  Args:
    X_1: dataset with one datapoint per row.
    X_2: dataset compared against `X_1`.

  Returns:
    A 0-dimensional tensor: the L2 norm of each row of `X_1 - X_2`, summed
    over rows. (The return annotation used to say `int`; a tensor has always
    been returned.)
  """
  # torch.norm is deprecated; torch.linalg.vector_norm is its replacement for
  # vector norms and computes the same per-row 2-norm here.
  row_norms = t.linalg.vector_norm(X_1 - X_2, ord=2, dim=1)
  return row_norms.sum()

# PCA Reconstruction Errors

In [None]:
def pca_reconstruction_errors(
    model: HookedTransformer,
    target_dataset: List[str],
    comparison_datasets: Dict[str, List[str]],
    k: int,
    layers: int,
    location: str,
    max_batch_size: int = 2
):
  """
  Plot, per layer, how well each comparison dataset is reconstructed from the
  top-k right singular vectors of the target dataset's activations.

  For every layer index in `range(layers)`:
    1. Take the last-token activations of `target_dataset` at
       `location.format(layer)` and compute their SVD. The activations are
       used as-is (no mean-centring is applied before the SVD).
    2. Project each comparison dataset's last-token activations onto the
       first `k` rows of `V_H`.
    3. Record `1 - (sum of row-wise L2 errors) / (sum of row-wise L2 norms)`.

  The per-dataset lists are printed and drawn as one line each on a single
  matplotlib figure.

  Args:
    model: model to run.
    target_dataset: prompts whose activations define the subspace.
    comparison_datasets: mapping from legend label to prompts.
    k: number of leading singular vectors to keep.
    layers: number of layer indices to evaluate (0 .. layers - 1).
    location: cache-key template with one `{}` placeholder for the layer.
    max_batch_size: prompts per forward pass. Defaults to 2, the value that
      was previously hard-coded.

  Returns:
    None. The results are only printed and plotted.
  """
  target_activations_dict = dataset_activations_optimised_locations(
    model,
    target_dataset,
    layers,
    location,
    max_batch_size
  )
  all_comparison_activations_dict = {
    name: dataset_activations_optimised_locations(
      model,
      comparison_dataset,
      layers,
      location,
      max_batch_size
    )
    for name, comparison_dataset in comparison_datasets.items()
  }
  all_accuracies = {name: [] for name in comparison_datasets}

  for layer in range(layers):
    _, _, V_H = SVD(target_activations_dict[layer])

    for name, comparison_activations_dict in all_comparison_activations_dict.items():
      comparison_activations = comparison_activations_dict[layer]
      # Project the comparison activations onto the top k vectors of V_H
      comparison_approx = top_k_projection(comparison_activations, V_H, k)

      error = matrix_error(comparison_approx, comparison_activations)
      total_l2_norms = t.linalg.vector_norm(comparison_activations, ord=2, dim=1).sum()
      # Store plain floats so the printed dict and the plot do not depend on
      # implicit conversion of 0-dim tensors.
      all_accuracies[name].append((1 - error / total_l2_norms).item())

  print(all_accuracies)

  for name, accuracies in all_accuracies.items():
    plt.plot(accuracies, label=name)
  plt.ylabel("Reconstruction accuracies (as fraction of original)")
  plt.xlabel("Layer")
  plt.title(f"Reconstruction accuracies of comparison projection on top {k} vectors of target activations at {location}")
  plt.legend()
  plt.show()




In [None]:
def pca_reconstruction_errors_self(
    model: HookedTransformer,
    datasets: Dict[str, List[str]],
    k: int,
    layers: int,
    location: str,
    max_batch_size: int = 2
):
  """
  Plot, per layer, how well each dataset is reconstructed from the top-k right
  singular vectors of ITS OWN activations.

  For every dataset and every layer index in `range(layers)`:
    1. Take the last-token activations at `location.format(layer)` and compute
       their SVD. The activations are used as-is (no mean-centring).
    2. Project the same activations onto the first `k` rows of `V_H`.
    3. Record `1 - (sum of row-wise L2 errors) / (sum of row-wise L2 norms)`.

  The per-dataset lists are printed and drawn as one line each on a single
  matplotlib figure.

  Args:
    model: model to run.
    datasets: mapping from legend label to prompts.
    k: number of leading singular vectors to keep.
    layers: number of layer indices to evaluate (0 .. layers - 1).
    location: cache-key template with one `{}` placeholder for the layer.
    max_batch_size: prompts per forward pass. Defaults to 2, the value that
      was previously hard-coded.

  Returns:
    None. The results are only printed and plotted.
  """
  # `gc` is not part of the notebook's import cell; importing it here keeps
  # the function from raising NameError at the gc.collect() call below.
  import gc

  all_accuracies = {}

  for name, dataset in datasets.items():
    activations_dict = dataset_activations_optimised_locations(
      model,
      dataset,
      layers,
      location,
      max_batch_size
    )
    all_accuracies[name] = []

    for layer in range(layers):
      activations = activations_dict[layer]
      _, _, V_H = SVD(activations)

      # Project the activations onto the top k vectors of their own V_H
      approx = top_k_projection(activations, V_H, k)

      error = matrix_error(approx, activations)
      total_l2_norms = t.linalg.vector_norm(activations, ord=2, dim=1).sum()
      # Store plain floats so the printed dict and the plot do not depend on
      # implicit conversion of 0-dim tensors.
      all_accuracies[name].append((1 - error / total_l2_norms).item())

      gc.collect()
      t.cuda.empty_cache()

  print(all_accuracies)

  for name, accuracies in all_accuracies.items():
    plt.plot(accuracies, label=name)
  plt.ylabel("Reconstruction accuracies (as fraction of original)")
  plt.xlabel("Layer")
  plt.title(f"Reconstruction accuracies of projection on top {k} vectors of activations at {location}")
  plt.legend()
  plt.show()

In [None]:
def activation_plot_final_acts_optimised(
  model: HookedTransformer,
  datasets_lang: Dict[str, List[str]],
  datasets_tokens: Dict[str, t.Tensor],
  plot_type: str,
  location: str,
  dimension: int,
  random: bool = False,
  centre: bool = True
):
  """
  t-SNE scatter plot of last-token activations at `location`, one colour and
  marker per dataset.

  Args:
    model: model to run; `model.name` is used in the plot title.
    datasets_lang: mapping from label to a list of prompts. Activations are
      taken at each prompt's last non-padding token via
      `dataset_activations_optimised` (batch size 2).
    datasets_tokens: mapping from label to an already-tokenised batch.
      Activations are taken at the last position (`[:, -1, :]`). A label that
      also appears in `datasets_lang` overwrites that entry.
    plot_type: currently unused; the embedding is always t-SNE.
    location: key used to index the activation cache.
    dimension: currently unused; the embedding always has 2 components.
    random: if True, add a 'Synthetic' group of Gaussian samples with the
      per-feature variance of the pooled data. (The name shadows the `random`
      module inside this function; it is kept for interface compatibility.)
    centre: with `random=True`, centre the synthetic samples on the pooled
      per-feature mean (True) or on zero (False).

  Returns:
    `(data_2d, labels)`: the t-SNE embedding and its labels in the original,
    un-shuffled order.

  Raises:
    ValueError: if both dataset mappings are empty.
  """
  t.cuda.empty_cache()
  activation_dict = {}

  # Forward pass for each prompt dataset.
  for name, dataset_lang in datasets_lang.items():
    final_activations = dataset_activations_optimised(model, dataset_lang, location, 2)
    # scikit-learn works with numpy arrays
    data_numpy = final_activations.cpu().numpy()
    activation_dict[name] = data_numpy
    print("data numpy shape is: ", data_numpy.shape)

  # Forward pass for each pre-tokenised dataset.
  for name, dataset_token in datasets_tokens.items():
    activations = dataset_activations_tokens(model, dataset_token)[1][location]
    activation_dict[name] = activations[:, -1, :].cpu().numpy()

  if not activation_dict:
    raise ValueError("at least one dataset is required to build the plot")

  all_data = []
  all_labels = []
  for label, data in activation_dict.items():
    all_data.append(data)
    all_labels.extend([label] * len(data))

  all_data = np.concatenate(all_data, axis=0)

  print("all data shape is: ", all_data.shape)

  if random:
    # Same number of synthetic points as the first dataset (assumes all
    # datasets have the same number of datapoints).
    num_points_per_label = len(next(iter(activation_dict.values())))

    mean = np.mean(all_data, axis=0)
    variance = np.var(all_data, axis=0)
    synthetic_data = np.random.normal(
      loc=mean if centre else np.zeros_like(mean),
      scale=np.sqrt(variance),
      size=(num_points_per_label, all_data.shape[1])
    )

    all_data = np.concatenate([all_data, synthetic_data], axis=0)
    all_labels.extend(['Synthetic'] * len(synthetic_data))

  # Fit and transform the data to 2D
  tsne = TSNE(n_components=2, random_state=21)
  data_2d = tsne.fit_transform(all_data)

  # `shuffle` returns new objects, so these keep the un-shuffled order.
  data_2d_copy, all_labels_copy = data_2d, all_labels

  # Shuffle the drawing order so no dataset is systematically drawn on top.
  data_2d, all_labels = shuffle(data_2d, all_labels, random_state=42)

  # First-seen order of the labels: unlike list(set(...)), this gives every
  # dataset the same colour and marker on every run.
  unique_labels = list(dict.fromkeys(all_labels_copy))
  label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
  # pyplot.get_cmap replaces the deprecated plt.cm.get_cmap.
  colors = plt.get_cmap('viridis', len(unique_labels))

  plt.figure(figsize=(6, 5))
  markers = ['o', 's', '^', 'D', 'P']

  # One scatter call per point, in shuffled order. Only the first point of
  # each label carries a legend label, tracked with a set.
  labelled = set()
  for point, label in zip(data_2d, all_labels):
    color_idx = label_to_idx[label]
    plt.scatter(
      point[0],
      point[1],
      color=colors(color_idx),
      s=20,
      alpha=0.6,
      label="" if label in labelled else label,
      marker=markers[color_idx % len(markers)]
    )
    labelled.add(label)

  handles, labels = plt.gca().get_legend_handles_labels()
  plt.legend(handles, labels, title="Labels")

  plt.xlabel('t-SNE feature 0')
  plt.ylabel('t-SNE feature 1')
  plt.title(f't-SNE visualization of the final activations of {model.name} at {location}')
  plt.show()
  return data_2d_copy, all_labels_copy

In [None]:
# Prompts "x + y =" and "x * y =" for every ordered pair of operands in 0..23
# (x varies slowest), 576 prompts per dataset.
addition_dataset = [f"{x} + {y} =" for x in range(24) for y in range(24)]
multiplication_dataset = [f"{x} * {y} =" for x in range(24) for y in range(24)]

# Activation Addition Utils

In [None]:
class ActivationAddition:
  """
  Specifies a dataset, a coefficient, and a location in the model.

  Intended to provide a method for finding the principal component at this
  location, for use in activation addition interventions in the residual
  stream. That method is not implemented yet; the class currently only stores
  its constructor arguments.

  Note: this overrides the existing class of the same name, so that PCA
  directions can be used instead of prompts only.
  """

  def __init__(
      self,
      coeff: float,
      act_name: Union[str, int],
      dataset: List[str],
  ):
    """
    Specifies a model location ('act_name') from which to extract activations.
    PCA will then be used to extract the top principal component
    of the activations of the final token of the dataset at this location.
    This will then be multiplied by 'coeff'.
    TODO: identify how big to make the vector as a baseline.

    Args:
      coeff: multiplier for the extracted direction.
      act_name: model location to extract activations from.
      dataset: prompts whose final-token activations are to be analysed.
    """
    self.coeff = coeff
    # Keep the remaining arguments; previously they were accepted and dropped.
    self.act_name = act_name
    self.dataset = dataset





# Activation Addition Intervention

My plan is to reuse the existing `print_n_comparisons` function from the `algebraic_value_editing` library. That way I should only need to change the `ActivationAddition` class, and may be able to plug it into the rest of the library unchanged.

Update: I don't think this works. I will probably need to fork the repo and make my changes there.

In [None]:
def run_with_intervention(
    model: HookedTransformer,
    prompt: str,
    addition_vector: ActivationAddition,
    temperature: float,
    examples: int = 5,
):
  """
  Complete a forward pass of a model, with an intervention on the
  residual stream corresponding to activation addition.

  This activation vector, and where to intervene on the model, is contained
  in the addition_vector object.
  """
  # NOTE(review): the signature was missing its trailing colon (SyntaxError).
  # No implementation is visible after this docstring in the reviewed section.