I'm going to be using Beren's stuff on interpreting the SVD of weight matrices here. The goal will be to find the tokens which correspond most to different principal components of the activations of a specific dataset.

# Setup

In [None]:
try:
  import google.colab
  IN_COLAB = True
  print("Running as a Colab notebook")
  from google.colab import drive
  drive.mount('/content/gdrive')
  %cd /content/gdrive/MyDrive/AI-ML-Stuff/Dissertation/work/Transformer-Masking
  %pip install transformer_lens
  %pip install scikit-learn
except:
  IN_COLAB = False
  print("Running as a Jupyter notebook - intended for development only!")

Running as a Colab notebook
Mounted at /content/gdrive
/content/gdrive/MyDrive/AI-ML-Stuff/Dissertation/work/Transformer-Masking
Collecting transformer_lens
  Downloading transformer_lens-1.5.0-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.9/105.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting beartype<0.15.0,>=0.14.1 (from transformer_lens)
  Downloading beartype-0.14.1-py3-none-any.whl (739 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.7/739.7 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.7.1 (from transformer_lens)
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops>=0.6.0 (from transformer_lens)
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 k

In [None]:
import functools
import gc
import json
import os
import random
from typing import List, Optional, Callable, Tuple, Union, Dict

import einops
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import torch as t
import torch.nn as nn
import torch.nn.functional as F
from fancy_einsum import einsum
from IPython.display import display
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
from tqdm import tqdm

from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

pio.renderers.default = "notebook_connected" # or use "browser" if you want plots to open with browser

# Saves computation time, since we don't need it for the contents of this notebook
t.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x783f2c527760>

In [None]:
!nvidia-smi

Wed Aug  9 18:59:43 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    49W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Load the pretrained GPT-2 small checkpoint and move it to the GPU
# (.cuda() requires a CUDA device to be available).
gpt2_small = HookedTransformer.from_pretrained("gpt2-small").cuda()
# Human-readable name; the t-SNE plotting function reads model.name for its title.
gpt2_small.name = "gpt2 small"

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-small into HookedTransformer
Moving model to device:  cuda


In [None]:
# Sanity check: decode a single token id to see the string it maps to.
gpt2_small.tokenizer.decode([5500])

'uts'

In [None]:
# Load the pretrained GPT-2 XL checkpoint and move it to the GPU
# (.cuda() requires a CUDA device to be available).
gpt2_xl = HookedTransformer.from_pretrained("gpt2-xl").cuda()
# Human-readable name; the t-SNE plotting function reads model.name for its title.
gpt2_xl.name = "gpt2 xl"

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-xl into HookedTransformer
Moving model to device:  cuda


# New Functions

# Functions

In [None]:
def SVD(matrix: t.Tensor) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  Singular value decomposition of `matrix` via `torch.linalg.svd`.

  Returns a plain tuple `(U, S, V_H)`: the left singular vectors, the
  singular values, and the right singular vectors stored as rows.
  """
  decomposition = t.linalg.svd(matrix)
  return decomposition.U, decomposition.S, decomposition.Vh


In [None]:
# Run a dataset through the model, store all the activations
def dataset_activations(
  model: HookedTransformer,
  dataset: List[str]
):
  """
  Tokenise every prompt in `dataset` as one batch, run a single forward pass
  and return `(logits, cache)` exactly as produced by `model.run_with_cache`.
  """
  # TODO: make this run in batches instead of at once.
  tokenised = model.to_tokens(dataset)
  outputs = model.run_with_cache(tokenised, return_cache_object=True, remove_batch_dim=False)
  model_logits, activation_cache = outputs
  return model_logits, activation_cache

In [None]:
from torch.nn.utils.rnn import pad_sequence

def dataset_activations_optimised(
  model: HookedTransformer,
  dataset: List[str],
  location: str,
  max_batch_size: int,
  all_activations: bool = False
):
  """
  Run `dataset` through `model` in batches of at most `max_batch_size` prompts
  and collect the activations cached at hook `location`, moved to the CPU.

  With `all_activations=False` (default) one vector per prompt is returned:
  the activation at the prompt's last non-padding token, giving a tensor of
  shape (len(dataset), dim).

  With `all_activations=True` every activation from position 0 up to and
  including that last token is returned, stacked prompt after prompt into a
  (total_positions, dim) tensor.

  Raises whatever `t.cat` raises on an empty list if `dataset` is empty.
  """
  # Token id treated as padding. 50256 is hard-coded, so the last-token
  # detection is only valid for tokenizers that pad with this id
  # (presumably GPT-2's end-of-text token -- confirm before using other models).
  pad_token_id = 50256

  collected = []

  for start_idx in range(0, len(dataset), max_batch_size):
    t.cuda.empty_cache()

    # Tokenise this slice of the dataset into one padded batch tensor
    batch_subset = dataset[start_idx:start_idx + max_batch_size]
    batch_tokens = model.to_tokens(batch_subset)

    # Index of the last non-padding token per row: the first position at which
    # the running count of real tokens equals the row's total count.
    mask = batch_tokens != pad_token_id
    final_indices = ((mask.cumsum(dim=1) == mask.sum(dim=1).unsqueeze(1)).int()).argmax(dim=1)
    final_indices = final_indices.view(-1, 1)

    _, cache = model.run_with_cache(batch_tokens, return_cache_object=True, remove_batch_dim=False)
    activations = cache[location]
    final_indices = final_indices.to(activations.device)

    if all_activations:
      # Keep every position <= the last real token (boolean mask of shape (batch, length))
      positions = t.arange(activations.size(1), device=activations.device).unsqueeze(0)
      batch_result = activations[positions <= final_indices]
    else:
      index_expanded = final_indices.unsqueeze(-1).expand(-1, -1, activations.size(2))
      # squeeze(1) drops only the gathered position axis. A bare squeeze() would
      # also drop the batch axis for a batch of one and break the final t.cat.
      batch_result = t.gather(activations, 1, index_expanded).squeeze(1)

    # Move to the CPU so GPU memory is not held across batches
    collected.append(batch_result.cpu())

  return t.cat(collected, dim=0)

In [None]:
def select_vectors_parallel(indices, Y):
    """
    Gather, for every batch row, all vectors of Y at positions 0..index
    (inclusive) and stack them into one 2D tensor.

    Parameters
    ----------
    indices : torch.Tensor
        1D tensor holding one cut-off position per batch row.
    Y : torch.Tensor
        Tensor of shape (batch, length, dimension).

    Returns
    -------
    torch.Tensor
        Tensor of shape (number of selected positions, dimension), with rows
        ordered batch row by batch row.
    """
    assert len(indices.shape) == 1, "Indices tensor should be 1D"
    assert len(Y.shape) == 3, "Y tensor should be 3D"
    assert indices.shape[0] == Y.shape[0], "Batch size should match"

    batch_size, seq_len, _ = Y.shape

    # (batch, length) grid of position numbers, compared against each row's cut-off
    position_grid = t.arange(seq_len).expand(batch_size, seq_len).to(Y.device)
    keep = position_grid <= indices.unsqueeze(1)

    # Indexing with a (batch, length) boolean mask returns the kept vectors as rows
    return Y[keep]

In [None]:
from torch.nn.utils.rnn import pad_sequence

def dataset_activations_optimised_new(
  model: HookedTransformer,
  dataset: List[str],
  location: str,
  max_batch_size: int,
  use_all_activations: bool = False
):
  """
  Run `dataset` through `model` in batches of at most `max_batch_size` prompts
  and collect the activations cached at hook `location`, moved to the CPU.

  With `use_all_activations=False` (default) the result has one row per prompt:
  the activation at the prompt's last non-padding token, shape (len(dataset), dim).

  With `use_all_activations=True` every activation up to and including that
  last token is kept (via `select_vectors_parallel`), stacked prompt after
  prompt into a (total_positions, dim) tensor.

  Raises whatever `t.cat` raises on an empty list if `dataset` is empty.
  """
  # Token id treated as padding. 50256 is hard-coded, so the last-token
  # detection is only valid for tokenizers that pad with this id
  # (presumably GPT-2's end-of-text token -- confirm before using other models).
  pad_token_id = 50256

  all_activations = []

  for start_idx in range(0, len(dataset), max_batch_size):
    t.cuda.empty_cache()

    # Tokenise this slice of the dataset into one padded batch tensor
    batch_subset = dataset[start_idx:start_idx + max_batch_size]
    batch_tokens = model.to_tokens(batch_subset)

    # Index of the last non-padding token per row: the first position at which
    # the running count of real tokens equals the row's total count.
    mask = batch_tokens != pad_token_id
    final_indices = ((mask.cumsum(dim=1) == mask.sum(dim=1).unsqueeze(1)).int()).argmax(dim=1)
    final_indices = final_indices.view(-1, 1)

    _, cache = model.run_with_cache(batch_tokens, return_cache_object=True, remove_batch_dim=False)
    activations = cache[location]
    final_indices = final_indices.to(activations.device)

    if use_all_activations:
      # squeeze(1) keeps the index tensor 1D even for a batch of one, which
      # select_vectors_parallel asserts on.
      output_activations = select_vectors_parallel(final_indices.squeeze(1), activations).cpu()
    else:
      index_expanded = final_indices.unsqueeze(-1).expand(-1, -1, activations.size(2))
      final_activations = t.gather(activations, 1, index_expanded)
      # squeeze(1) drops only the gathered position axis. A bare squeeze() would
      # also drop the batch axis for a batch of one and break the final t.cat.
      output_activations = final_activations.cpu().squeeze(1)

    all_activations.append(output_activations)

  return t.cat(all_activations, dim=0)

In [None]:
def dataset_activations_optimised_locations(
  model: HookedTransformer,
  dataset: List[str],
  layers: int,
  location: str,
  max_batch_size: int
):
  """
  Collect last-token activations at the same hook in every layer.

  `location` is a format string with one `{}` placeholder for the layer
  number; it is filled in with each value in `range(layers)` to obtain the
  cache key for that layer. The dataset is processed in batches of at most
  `max_batch_size` prompts, with a single forward pass per batch shared by
  all layers.

  Returns a dict mapping layer number -> CPU tensor of shape
  (len(dataset), dim) holding, for each prompt, the activation at its last
  non-padding token.
  """
  # Token id treated as padding. 50256 is hard-coded, so the last-token
  # detection is only valid for tokenizers that pad with this id
  # (presumably GPT-2's end-of-text token -- confirm before using other models).
  pad_token_id = 50256

  all_final_activations = {}

  for start_idx in range(0, len(dataset), max_batch_size):
    t.cuda.empty_cache()

    # Tokenise this slice of the dataset into one padded batch tensor
    batch_subset = dataset[start_idx:start_idx + max_batch_size]
    batch_tokens = model.to_tokens(batch_subset)

    # Index of the last non-padding token per row: the first position at which
    # the running count of real tokens equals the row's total count.
    mask = batch_tokens != pad_token_id
    final_indices = ((mask.cumsum(dim=1) == mask.sum(dim=1).unsqueeze(1)).int()).argmax(dim=1)
    final_indices = final_indices.view(-1, 1)

    _, cache = model.run_with_cache(batch_tokens, return_cache_object=True, remove_batch_dim=False)

    for layer in range(layers):
      activations = cache[location.format(layer)]
      index_expanded = final_indices.to(activations.device).unsqueeze(-1).expand(-1, -1, activations.size(2))
      final_activations = t.gather(activations, 1, index_expanded)
      # squeeze(1) drops only the gathered position axis. A bare squeeze() would
      # also drop the batch axis for a batch of one and break the t.cat below.
      final_activations = final_activations.cpu().squeeze(1)
      all_final_activations.setdefault(layer, []).append(final_activations)

  # Merge the per-batch chunks of each layer into one tensor
  for layer in range(layers):
    all_final_activations[layer] = t.cat(all_final_activations[layer], dim=0)

  return all_final_activations

In [None]:
# Reshape into a matrix (looks at activations of each token)

def reshape_activations(
  batch_activations: t.Tensor
) -> t.Tensor:
  """
  Merge the batch and token axes of a (batch, tokens, dim) tensor so that
  every token position becomes one row of a (batch * tokens, dim) matrix.
  """
  return einops.rearrange(batch_activations, 'b tokens dim -> (b tokens) dim')



In [None]:
def activation_SVD(
    model: HookedTransformer,
    dataset: List[str],
    location: str
) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  SVD of the activations cached at `location` when `dataset` is run through
  `model` as a single batch.

  Every token position of every prompt contributes one row to the matrix
  that is decomposed. Returns `(U, S, V_H)`.
  """
  cache = dataset_activations(model, dataset)[1]
  token_rows = reshape_activations(cache[location])
  U, S, V_H = SVD(token_rows)
  return U, S, V_H


In [None]:
# Run a dataset through the model, store all the activations
def dataset_activations_tokens(
  model: HookedTransformer,
  dataset_tokens: t.Tensor
):
  """
  Run an already-tokenised batch through `model` and return `(logits, cache)`
  exactly as produced by `model.run_with_cache`.
  """
  # The input is already a token tensor, so it is passed to the model unchanged
  model_logits, activation_cache = model.run_with_cache(
    dataset_tokens,
    return_cache_object=True,
    remove_batch_dim=False,
  )
  return model_logits, activation_cache

# NOTE(review): this is a verbatim redefinition of reshape_activations from an
# earlier cell; it silently rebinds the same name. Consider keeping only one copy.
def reshape_activations(
  batch_activations: t.Tensor
) -> t.Tensor:
  """
  Merge the batch and token axes of a (batch, tokens, dim) tensor into a
  single leading axis, giving a (batch * tokens, dim) matrix.
  """
  squeezed_tensor = einops.rearrange(batch_activations, 'b tokens dim -> (b tokens) dim')
  return squeezed_tensor

def activation_SVD_tokens(
    model: HookedTransformer,
    dataset_tokens: List[str],
    location: str
) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  SVD of the activations cached at `location` for an already-tokenised batch.

  `dataset_tokens` is forwarded unchanged to `dataset_activations_tokens`,
  which annotates it as a token tensor (the `List[str]` annotation here looks
  inaccurate). Every token position contributes one row to the matrix that
  is decomposed. Returns `(U, S, V_H)`.
  """
  cache = dataset_activations_tokens(model, dataset_tokens)[1]
  token_rows = reshape_activations(cache[location])
  U, S, V_H = SVD(token_rows)
  return U, S, V_H

In [None]:
def activation_SVD_covariance(
    model: HookedTransformer,
    dataset_tokens: List[str],
    location: str
) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  SVD of the (dim, dim) scatter matrix of the activations at `location`.

  The activations of every token position are mean-centred per feature and
  multiplied with their own transpose. The product is NOT divided by the
  number of rows, so it is an unnormalised covariance matrix. The shapes of
  the flattened activations and of the scatter matrix are printed on the way.
  Returns `(U, S, V_H)` of that matrix.
  """
  cache = dataset_activations_tokens(model, dataset_tokens)[1]
  token_rows = reshape_activations(cache[location])
  print(token_rows.shape)

  # Subtract the per-feature mean so that rows are deviations from the centre
  deviations = token_rows - token_rows.mean(dim=0, keepdim=True)
  scatter = deviations.T @ deviations
  print(scatter.shape)

  U, S, V_H = SVD(scatter)
  return U, S, V_H


# activation_SVD_covariance(gpt2_small, multiplication_dataset, 'blocks.5.hook_attn_out')



In [None]:
# Using tokens as a starting point, and also using the covariance matrix
def activation_SVD_tokens_covariance(
    model: HookedTransformer,
    dataset_tokens: List[str],
    location: str
) -> Tuple[t.Tensor, t.Tensor, t.Tensor]:
  """
  SVD of the (dim, dim) scatter matrix of the activations at `location` for
  an already-tokenised batch.

  Activations of every token position are mean-centred per feature and
  multiplied with their own transpose (no division by the row count).
  Returns `(U, S, V_H)` of that matrix.
  """
  cache = dataset_activations_tokens(model, dataset_tokens)[1]
  token_rows = reshape_activations(cache[location])

  # Subtract the per-feature mean so that rows are deviations from the centre
  deviations = token_rows - token_rows.mean(dim=0, keepdim=True)
  scatter = deviations.T @ deviations

  U, S, V_H = SVD(scatter)
  return U, S, V_H

# Singular Vector Functions

In [None]:
def dataset_projection(
    X: t.Tensor,
    B: t.Tensor
) -> t.Tensor:
  """
  Project every datapoint of X onto the subspace spanned by the rows of B.

  X: dataset of shape N x D, one datapoint per row.
  B: basis of shape M x D, i.e. M orthonormal basis vectors stored as ROWS
     (this is how `top_k_projection` passes `V_H[:k, :]`).

  Returns the projections as a D x N tensor: one datapoint per COLUMN, i.e.
  transposed relative to X. Callers that want the layout of X must transpose
  the result.
  """
  # Evaluate B @ X.T first: the intermediate is only M x N, instead of the
  # D x D matrix that B.T @ B would build. The product is mathematically the same.
  coefficients = B @ X.T
  return B.T @ coefficients


In [None]:
def top_k_projection(
    X: t.Tensor,
    V_H: t.Tensor,
    k: int
) -> t.Tensor:
  """
  Project the rows of X onto the span of the first k rows of V_H and return
  the result with one datapoint per row, like X.
  """
  leading_rows = V_H[:k,:]
  # dataset_projection returns datapoints as columns, so transpose back
  return dataset_projection(X, leading_rows).T

In [None]:
def matrix_error(
    X_1: t.Tensor,
    X_2: t.Tensor
) -> int:
  """
  Sum over rows of the L2 distance between corresponding rows of X_1 and X_2.

  Both inputs are treated as datasets with one datapoint per row. The value
  comes back as a 0-dimensional tensor (despite the `int` annotation).
  """
  row_distances = (X_1 - X_2).norm(p=2, dim=1)
  return row_distances.sum()

# PCA Reconstruction Errors

In [None]:
def pca_reconstruction_errors(
    model: HookedTransformer,
    target_dataset: List[str],
    comparison_datasets: Dict[str, List[str]],
    k: int,
    layers: int,
    location: str,
    max_batch_size: int = 2
):
  """
  Measure how well the top-k singular directions of the target dataset's
  activations reconstruct the activations of each comparison dataset.

  For every layer in `range(layers)`:
    1. Take the final-token activations of `target_dataset` at
       `location.format(layer)` and compute their SVD.
    2. Project the final-token activations of each comparison dataset onto
       the span of the first k rows of V_H.
    3. Compute the reconstruction error as the sum of row-wise L2 errors
       divided by the sum of row-wise L2 norms, and record `1 - error` as
       the reconstruction accuracy.

  The per-dataset accuracies are printed and plotted against the layer index.

  `location` must contain one `{}` placeholder for the layer number.
  `layers` is supplied by the caller rather than read from the model.
  `max_batch_size` is the number of prompts per forward pass (defaults to
  the previously hard-coded value of 2).
  """
  target_activations_dict = dataset_activations_optimised_locations(
    model,
    target_dataset,
    layers,
    location,
    max_batch_size
  )

  all_accuracies = {}
  all_comparison_activations_dict = {}
  for name, comparison_dataset in comparison_datasets.items():
    all_comparison_activations_dict[name] = dataset_activations_optimised_locations(
      model,
      comparison_dataset,
      layers,
      location,
      max_batch_size
    )
    all_accuracies[name] = []

  for layer in range(layers):
    # One SVD per layer, shared by all comparison datasets
    _, _, V_H = SVD(target_activations_dict[layer])

    for name in comparison_datasets:
      comparison_activations = all_comparison_activations_dict[name][layer]
      # Project the comparison activations onto the top k vectors of V_H
      comparison_approx = top_k_projection(comparison_activations, V_H, k)

      # Error as a fraction of the total length of the original activations
      error = matrix_error(comparison_approx, comparison_activations)
      total_l2_norms = t.sum(t.norm(comparison_activations, dim=1, p=2))
      error_fraction = error / total_l2_norms
      all_accuracies[name].append(1 - error_fraction)

  print(all_accuracies)

  for name, accuracies in all_accuracies.items():
    plt.plot(accuracies, label=name)
  plt.ylabel("Reconstruction accuracies (as fraction of original)")
  plt.xlabel("Layer")
  plt.title(f"Reconstruction accuracies of comparison projection on top {k} vectors of target activations at {location}")
  plt.legend()
  plt.show()




In [None]:
def pca_reconstruction_errors_self(
    model: HookedTransformer,
    datasets: Dict[str, List[str]],
    k: int,
    layers: int,
    location: str,
    max_batch_size: int = 2
):
  """
  Measure how well each dataset's activations are reconstructed from the
  top-k singular directions of those same activations.

  For every dataset and every layer in `range(layers)`:
    1. Take the final-token activations at `location.format(layer)` and
       compute their SVD.
    2. Project the same activations onto the span of the first k rows of V_H.
    3. Compute the reconstruction error as the sum of row-wise L2 errors
       divided by the sum of row-wise L2 norms, and record `1 - error` as
       the reconstruction accuracy.

  The per-dataset accuracies are printed and plotted against the layer index.

  `location` must contain one `{}` placeholder for the layer number.
  `layers` is supplied by the caller rather than read from the model.
  `max_batch_size` is the number of prompts per forward pass (defaults to
  the previously hard-coded value of 2).
  """
  all_accuracies = {}
  all_activations_dict = {}

  for name, dataset in datasets.items():
    all_activations_dict[name] = dataset_activations_optimised_locations(
      model,
      dataset,
      layers,
      location,
      max_batch_size
    )
    all_accuracies[name] = []

    for layer in range(layers):
      activations = all_activations_dict[name][layer]
      _, _, V_H = SVD(activations)

      # Project the activations onto their own top k vectors of V_H
      approx = top_k_projection(activations, V_H, k)

      # Error as a fraction of the total length of the original activations
      error = matrix_error(approx, activations)
      total_l2_norms = t.sum(t.norm(activations, dim=1, p=2))
      error_fraction = error / total_l2_norms
      all_accuracies[name].append(1 - error_fraction)

      # Release the per-layer SVD intermediates before the next layer
      gc.collect()
      t.cuda.empty_cache()

  print(all_accuracies)

  for name, accuracies in all_accuracies.items():
    plt.plot(accuracies, label=name)
  plt.ylabel("Reconstruction accuracies (as fraction of original)")
  plt.xlabel("Layer")
  plt.title(f"Reconstruction accuracies of projection on top {k} vectors of activations at {location}")
  plt.legend()
  plt.show()

In [None]:
def activation_plot_final_acts_optimised(
  model: HookedTransformer,
  datasets_lang: Dict[str, List[str]],
  datasets_tokens: Dict[str, t.Tensor],
  plot_type: str,
  location: str,
  dimension: int,
  random: bool = False,
  centre: bool = True
):
  """
  Embed the final-token activations of several datasets in 2D with t-SNE and
  draw them as a scatter plot coloured by dataset name.

  Parameters:
    model: model to run; `model.name` is used in the plot title.
    datasets_lang: mapping name -> list of prompt strings. For each prompt the
      activation at its last non-padding token is used.
    datasets_tokens: mapping name -> pre-tokenised batch. The activation at
      the last sequence position is used. Both mappings are only iterated and
      indexed by key, so an empty container is accepted.
    plot_type, dimension: currently unused; only a 2D t-SNE plot is produced.
    location: cache key of the hook whose activations are plotted.
    random: if True, add a 'Synthetic' group of Gaussian points with the same
      per-feature variance as the real data (shadows the `random` module
      inside this function only). These points are drawn unseeded.
    centre: with `random=True`, centre the synthetic points on the data mean
      (True) or on the origin (False).

  Returns `(data_2d, labels)`: the t-SNE coordinates and their labels in the
  original (unshuffled) order.
  """
  t.cuda.empty_cache()
  activation_dict = {}

  # Prompt datasets: forward pass in batches of 2, last real token per prompt
  for name in datasets_lang:
    dataset_lang = datasets_lang[name]
    final_activations = dataset_activations_optimised(model, dataset_lang, location, 2)
    # Convert to numpy, since scikit-learn works with numpy arrays
    data_numpy = final_activations.cpu().numpy()
    activation_dict[name] = data_numpy
    print("data numpy shape is: ", data_numpy.shape)

  # Pre-tokenised datasets: one forward pass, last sequence position
  for name in datasets_tokens:
    dataset_token = datasets_tokens[name]
    activations = dataset_activations_tokens(model, dataset_token)[1][location]
    final_activations = activations[:,-1,:]
    data_numpy = final_activations.cpu().numpy()
    activation_dict[name] = data_numpy

  all_data = []
  all_labels = []
  for label, data in activation_dict.items():
      all_data.append(data)
      all_labels.extend([label] * len(data))

  all_data = np.concatenate(all_data, axis=0)

  print("all data shape is: ", all_data.shape)

  if random:
    # Size of the synthetic group = size of the first dataset
    # (assumes all datasets have the same number of points)
    num_points_per_label = len(next(iter(activation_dict.values())))

    # Per-feature mean and variance of the whole dataset
    mean = np.mean(all_data, axis=0)
    variance = np.var(all_data, axis=0)

    # Gaussian points with the data's variance, centred on the data mean or on zero
    synthetic_centre = mean if centre else 0 * mean
    synthetic_data = np.random.normal(
      loc=synthetic_centre,
      scale=np.sqrt(variance),
      size=(num_points_per_label, all_data.shape[1])
    )
    synthetic_labels = ['Synthetic'] * len(synthetic_data)

    all_data = np.concatenate([all_data, synthetic_data], axis=0)
    all_labels.extend(synthetic_labels)

  # Fit t-SNE and transform the data to 2D
  tsne = TSNE(n_components=2, random_state=21)
  data_2d = tsne.fit_transform(all_data)

  # Keep the unshuffled results for the caller (shuffle returns new objects)
  data_2d_copy, all_labels_copy = data_2d, all_labels

  # Shuffle so that no dataset is systematically drawn on top of the others
  data_2d, all_labels = shuffle(data_2d, all_labels, random_state=42)

  # Labels in order of first appearance: deterministic colours and markers,
  # unlike iterating a set, whose order varies between runs
  unique_labels = list(dict.fromkeys(all_labels_copy))
  label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
  # plt.get_cmap replaces plt.cm.get_cmap, which is deprecated and removed in newer matplotlib
  colors = plt.get_cmap('viridis', len(unique_labels))

  plt.figure(figsize=(6, 5))
  markers = ['o', 's', '^', 'D', 'P']

  # Draw point by point (keeps the shuffled draw order). Only the first point
  # of each label carries the label, so the legend shows each one once.
  labelled = set()
  for i, label in enumerate(all_labels):
      color_idx = label_to_idx[label]
      marker = markers[color_idx % len(markers)]
      legend_label = "" if label in labelled else label
      labelled.add(label)
      plt.scatter(data_2d[i, 0], data_2d[i, 1], color=colors(color_idx), s=20, alpha=0.6, label=legend_label, marker=marker)

  # Add a legend
  handles, labels = plt.gca().get_legend_handles_labels()
  plt.legend(handles, labels, title="Labels")

  plt.xlabel('t-SNE feature 0')
  plt.ylabel('t-SNE feature 1')
  plt.title(f't-SNE visualization of the final activations of {model.name} at {location}')
  plt.show()
  return data_2d_copy, all_labels_copy

In [None]:
# Arithmetic prompts "x + y =" and "x * y =" for every pair of operands in 0..23
addition_dataset = [f"{x} + {y} =" for x in range(24) for y in range(24)]
multiplication_dataset = [f"{x} * {y} =" for x in range(24) for y in range(24)]

# Comparing Centres

In [None]:
def find_activations_centre(
  model: HookedTransformer,
  dataset: List[str],
  location: str,
  max_batch_size: int,
  use_all_activations: bool = False
):
  """
  Mean activation vector of `dataset` at hook `location` of `model`.

  The activations are collected with `dataset_activations_optimised_new`
  (last token per prompt, or all tokens when `use_all_activations` is True)
  and averaged over their first axis.
  """
  collected = dataset_activations_optimised_new(
    model,
    dataset,
    location,
    max_batch_size,
    use_all_activations
  )
  return collected.mean(dim=0)

In [None]:
def find_activations_centre_diff(
  model: HookedTransformer,
  target_dataset: List[str],
  baseline_dataset: List[str],
  location: str,
  max_batch_size: int,
  use_all_activations: bool = False
):
  """
  Difference between the activation centres of two datasets.

  Computes the mean activation of `baseline_dataset` and then of
  `target_dataset` at `location`, and returns target centre minus baseline
  centre.
  """
  # Baseline first, then target: same order of forward passes as before
  baseline_centre, target_centre = [
    find_activations_centre(
      model,
      prompts,
      location,
      max_batch_size,
      use_all_activations
    )
    for prompts in (baseline_dataset, target_dataset)
  ]
  return target_centre - baseline_centre

# Datasets

In [None]:
# Story datasets keyed by genre; each JSON file is loaded into its own
# variable and registered in `stories` under the genre name.
stories = {}

file_path = 'fantasy_200.json'
with open(file_path, 'r') as file:
  stories["fantasy"] = dataset_fantasy = json.load(file)

file_path = 'scifi_200.json'
with open(file_path, 'r') as file:
  stories["scifi"] = dataset_scifi = json.load(file)

file_path = 'sports_200.json'
with open(file_path, 'r') as file:
  stories["sports"] = dataset_sports = json.load(file)

In [None]:
# Concatenate all three genres. The previous version added the sports stories
# twice and left the fantasy stories out.
all_stories = dataset_fantasy + dataset_scifi + dataset_sports

In [None]:
# Specify the file path
# Fantasy stories phrased as genre questions, registered alongside the plain stories
file_path = 'story_questions/fantasy_genre_200.json'
with open(file_path, 'r') as file:
  stories["fantasy genre question"] = dataset_fantasy_genre = json.load(file)

In [None]:
# Family-relation datasets; each JSON file is loaded into its own variable
# and registered in `family_datasets` under a descriptive key.
family_datasets = {}

file_path = 'family_dataset1_big.json'
with open(file_path, 'r') as file:
  family_datasets["same name"] = dataset1 = json.load(file)

file_path = 'family_dataset2_big.json'
with open(file_path, 'r') as file:
  family_datasets["same relation"] = dataset2 = json.load(file)

file_path = 'family_dataset3_big.json'
with open(file_path, 'r') as file:
  family_datasets["same name and relation"] = dataset3 = json.load(file)

In [None]:
# Arithmetic prompt datasets: every ordered pair of operands drawn from 24
# values (0..23 for the plain sets, the first 24 odd / even numbers otherwise)

addition_dataset = [f"{x} + {y} =" for x in range(24) for y in range(24)]
odd_addition_dataset = [f"{2 * x + 1} + {2 * y + 1} =" for x in range(24) for y in range(24)]
even_addition_dataset = [f"{2 * x} + {2 * y} =" for x in range(24) for y in range(24)]
multiplication_dataset = [f"{x} * {y} =" for x in range(24) for y in range(24)]
even_multiplication_dataset = [f"{2 * x} * {2 * y} =" for x in range(24) for y in range(24)]
odd_multiplication_dataset = [f"{2 * x + 1} * {2 * y + 1} =" for x in range(24) for y in range(24)]
multiplication_dataset2 = [f"{x} x {y} =" for x in range(24) for y in range(24)]

In [None]:
def read_all_text_files(directory):
    """
    Read every `.txt` file directly inside `directory`.

    Files are visited in sorted filename order, because `os.listdir` returns
    entries in arbitrary order and the result would otherwise differ between
    machines and runs. Files are decoded as UTF-8.

    Returns a list with the full text of each file, one entry per file.
    """
    contents_list = []

    for filename in sorted(os.listdir(directory)):
        # Skip anything that is not a text file
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            contents_list.append(f.read())

    return contents_list

# Documents from both text subsets, first directory's files followed by the second's
training_subset = [
    *read_all_text_files('urlsf_subset01-1_data'),
    *read_all_text_files('urlsf_subset01-182_data'),
]


In [None]:
# Keep only the documents shorter than 1000 characters
training_subset_small = list(filter(lambda document: len(document) < 1000, training_subset))

In [None]:
# Number of documents loaded from the two text directories
len(training_subset)

599

In [None]:
# Number of documents that passed the length filter
len(training_subset_small)

44

# Experiment

We have two options for how we compare to the input tokens: we can either compare to just the embedding matrix of the tokens, or we can compare them to the activations of that word as processed by the model. Start with the first.

In [None]:
# Shape of the token embedding matrix
gpt2_small.W_E.shape

torch.Size([50257, 768])

In [None]:
# Row 422 of the embedding matrix, i.e. the embedding vector stored at index 422
gpt2_small.W_E[422,:]

tensor([-1.4059e-02,  3.1735e-03,  2.9210e-02, -7.9607e-03,  3.8570e-02,
        -6.4167e-02, -2.6494e-01, -4.6788e-02,  4.1205e-02,  5.9344e-02,
         2.5156e-02, -6.9654e-02, -1.0855e-01,  1.9387e-02, -1.9373e-02,
         5.1050e-02,  4.2179e-02, -8.1671e-03,  7.0000e-02,  2.2484e-01,
        -2.4882e-03,  4.5705e-02,  4.7429e-02,  4.4519e-02, -7.1037e-02,
         1.2692e-02, -3.4407e-02,  2.4246e-02,  1.4797e-02,  2.9224e-02,
        -7.2414e-02,  4.3373e-02, -7.6438e-02, -6.9749e-03, -3.4653e-02,
         8.2840e-02, -3.1377e-01, -7.0101e-02, -4.5921e-02,  4.0834e-02,
         4.7736e-02,  2.0628e-02,  3.6475e-02,  1.0918e-01, -3.2999e-02,
        -8.5490e-02, -1.1188e-01, -2.9987e-02,  5.1817e-02, -1.5906e-01,
        -3.4452e-02,  6.7772e-02,  1.6393e-02,  5.7060e-02, -1.3651e-02,
        -1.1479e-01, -7.2417e-02,  4.5526e-02,  5.5151e-02, -5.4743e-02,
        -9.9664e-02,  1.2942e-02, -3.4261e-02,  5.5662e-02, -3.0952e-01,
        -1.3812e-01,  3.8526e-03, -1.0848e-01,  1.4

In [None]:
def find_k_similar_vectors(
    model: HookedTransformer,
    vector: t.Tensor,
    k: int
):
  """
  Rank vocabulary tokens by the inner product of their embedding with `vector`.

  Args:
    model: Model exposing the token embedding matrix as `model.W_E` and a
      tokenizer as `model.tokenizer`.
    vector: 1-D tensor whose length matches the second dimension of `W_E`.
      It may live on any device; it is moved to wherever `W_E` is.
    k: Number of tokens to return at each end of the ranking.

  Returns:
    `(most_similar, least_similar)`: two lists of `k` dicts, each with the
    keys "word" (the decoded token) and "similarity" (a 0-d float tensor).
    The similarity is the raw inner product, not a cosine similarity, so it
    scales with the norms of both the embedding and `vector`.
    `most_similar` runs from the largest product downwards, `least_similar`
    from the smallest upwards.
  """
  embed = model.W_E

  # Follow the embedding matrix's device and dtype rather than assuming CUDA,
  # so the lookup also works on CPU or when the model is on another GPU.
  vector = vector.to(device=embed.device, dtype=embed.dtype)

  # One inner product per vocabulary entry.
  products = embed @ vector

  # topk returns its results already sorted, best match first.
  top_values, top_indices = products.topk(k, largest=True)
  bottom_values, bottom_indices = products.topk(k, largest=False)

  def describe(indices, values):
    # Decode plain Python ints so the tokenizer never sees tensor ids.
    return [
        {"word" : model.tokenizer.decode([index]), "similarity" : value.float()}
        for index, value in zip(indices.tolist(), values)
    ]

  most_similar = describe(top_indices, top_values)
  least_similar = describe(bottom_indices, bottom_values)

  return most_similar, least_similar

In [None]:
def pca_similar_tokens(
    model: HookedTransformer,
    dataset: List[str],
    layer: int,
    k: int,
    all_activations: bool = False,
    component: int = 0,
):
  """
  Find the tokens whose embeddings align most and least with a principal
  direction of a dataset's activations.

  The activations are collected by `dataset_activations_optimised_new` at the
  hook point `blocks.{layer}.hook_resid_post`, decomposed with `SVD`, and row
  `component` of the third returned factor is compared against the token
  embeddings via `find_k_similar_vectors`.

  Args:
    model: Model to run and whose embeddings are searched.
    dataset: Texts to collect activations from.
    layer: Index of the block whose `hook_resid_post` output is used.
    k: Number of tokens to return at each end of the ranking.
    all_activations: Forwarded unchanged to
      `dataset_activations_optimised_new`.
    component: Row of `V_H` to use as the direction. The default 0 keeps the
      original behaviour of using the first row.

  Returns:
    `(most_similar, least_similar)` as produced by `find_k_similar_vectors`.

  Notes:
    The sign of a singular vector is arbitrary, so the "most" and "least"
    lists can swap between layers or runs without the direction itself
    having changed.
    NOTE(review): whether the activations are mean-centred before the
    decomposition depends on helpers defined elsewhere. If they are not, the
    first component will largely reflect the mean activation. Confirm.
  """
  location = f"blocks.{layer}.hook_resid_post"

  activations = dataset_activations_optimised_new(
      model,
      dataset,
      location,
      2,  # NOTE(review): meaning of this argument is set by the helper; confirm
      all_activations
  )

  # Only the third factor is needed; its rows are indexed as directions.
  _, _, V_H = SVD(activations)

  pc = V_H[component, :]

  most_similar, least_similar = find_k_similar_vectors(model, pc, k)

  return most_similar, least_similar

In [None]:
def difference_similar_tokens(
    model: HookedTransformer,
    target_dataset: List[str],
    baseline_dataset: List[str],
    layer: int,
    k: int,
    all_activations: bool = False,
):
  """
  Find the tokens whose embeddings align most and least with the direction
  separating two datasets.

  The feature vector is the difference between the centres of the two
  datasets' activations, computed by `find_activations_centre_diff` at the
  hook point `blocks.{layer}.hook_resid_post`. That vector is then compared
  against the token embeddings via `find_k_similar_vectors`.

  Args:
    model: Model to run and whose embeddings are searched.
    target_dataset: Texts for the dataset of interest.
    baseline_dataset: Texts for the comparison dataset.
    layer: Index of the block whose `hook_resid_post` output is used.
    k: Number of tokens to return at each end of the ranking.
    all_activations: Forwarded unchanged to `find_activations_centre_diff`.

  Returns:
    `(most_similar, least_similar)` as produced by `find_k_similar_vectors`.
  """
  hook_name = f"blocks.{layer}.hook_resid_post"

  # Direction between the two activation centres.
  # NOTE(review): presumably target minus baseline; confirm in the helper.
  centre_diff = find_activations_centre_diff(
    model, target_dataset, baseline_dataset, hook_name, 2, all_activations
  )

  closest, furthest = find_k_similar_vectors(model, centre_diff, k)
  return closest, furthest

In [None]:
gpt2_small.tokenizer.decode([321])

'am'

In [None]:
# Sanity check: query with the embedding of token 321 ('am').
# The token itself should come out as the top match.
vector = gpt2_small.W_E[321,:]

find_k_similar_vectors(gpt2_small, vector, 21)

([{'word': 'am', 'similarity': tensor(11.7904, device='cuda:0')},
  {'word': 'AM', 'similarity': tensor(7.9326, device='cuda:0')},
  {'word': 'ams', 'similarity': tensor(7.3871, device='cuda:0')},
  {'word': 'ammy', 'similarity': tensor(6.9177, device='cuda:0')},
  {'word': 'amic', 'similarity': tensor(6.5874, device='cuda:0')},
  {'word': 'amia', 'similarity': tensor(6.5553, device='cuda:0')},
  {'word': 'amar', 'similarity': tensor(6.5366, device='cuda:0')},
  {'word': 'amin', 'similarity': tensor(6.4791, device='cuda:0')},
  {'word': 'amy', 'similarity': tensor(6.2506, device='cuda:0')},
  {'word': 'amation', 'similarity': tensor(6.2468, device='cuda:0')},
  {'word': 'pm', 'similarity': tensor(6.2364, device='cuda:0')},
  {'word': 'amate', 'similarity': tensor(6.2238, device='cuda:0')},
  {'word': 'Am', 'similarity': tensor(6.2193, device='cuda:0')},
  {'word': 'amas', 'similarity': tensor(6.0712, device='cuda:0')},
  {'word': 'amer', 'similarity': tensor(6.0362, device='cuda:0')},


In [None]:
# Decode every token id on its own to get the vocabulary as a list of strings.
# Some ids decode to the replacement character '�' when decoded in isolation.
all_tokens = [gpt2_small.tokenizer.decode([i]) for i in range(gpt2_small.tokenizer.vocab_size)]

In [None]:
all_tokens

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�',
 '�

# Trying the Thing!

## GPT-2 XL

### Scifi

In [None]:
# Collect unreferenced Python objects first, then ask PyTorch to release its
# cached GPU memory, so the GPT-2 XL runs below start with as much free memory as possible.
import gc
gc.collect()
t.cuda.empty_cache()

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_scifi,
    9,
    15
)

NameError: ignored

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_scifi,
    18,
    15
)

([' The',
  '\n',
  '\n\n',
  ' In',
  ' A',
  ' As',
  ' And',
  ' After',
  ' This',
  ' It',
  ' With',
  '<|endoftext|>',
  ' "',
  ' At',
  ' Soon'],
 ['\x1c',
  '\x02',
  '�',
  '\x12',
  '\x16',
  '�',
  '\r',
  '\x0c',
  '�',
  '�',
  '\x1a',
  '\x1d',
  '\t',
  '�',
  '�'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_scifi,
    27,
    15
)

(['\n',
  '\n\n',
  '<|endoftext|>',
  ' The',
  ' Meanwhile',
  ' Now',
  ' But',
  ' A',
  ' As',
  ' Unfortunately',
  ' Soon',
  ' In',
  ' This',
  ' Eventually',
  ' At'],
 ['\x1c',
  '\x02',
  '�',
  '�',
  '\x01',
  '\t',
  '\x0c',
  '�',
  '\x0e',
  '\x14',
  '�',
  '\x1d',
  '\x06',
  '\x08',
  '\x16'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_scifi,
    37,
    15
)

(['<|endoftext|>',
  '\n',
  ' Written',
  '\n\n',
  ' Now',
  ' But',
  ' The',
  ' Soon',
  ' However',
  ' With',
  ' Inspired',
  ' This',
  ' In',
  ' As',
  ' After'],
 ['\x1c',
  '�',
  '\x06',
  '\x14',
  '\t',
  '�',
  '\x0e',
  '�',
  '\x1f',
  '\x1a',
  '�',
  '\x02',
  '\x15',
  '�',
  '\x08'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_scifi,
    43,
    15
)

(['\n',
  '<|endoftext|>',
  '\n\n',
  ' But',
  ' The',
  ' Now',
  ' Written',
  ' In',
  ' As',
  ' However',
  ' This',
  ' A',
  ' With',
  ' It',
  ' And'],
 ['\x1c',
  '\t',
  '�',
  '�',
  '\x06',
  '\x1a',
  '\x14',
  '\x1f',
  ' サーティ',
  '\x04',
  '\x0e',
  '\x11',
  '\x01',
  '\x15',
  '�'])

### Sports

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_sports,
    9,
    15
)

(['\x12',
  '�',
  '\x02',
  '\x1c',
  '\x0c',
  '�',
  '\r',
  '�',
  '\x16',
  '\x1a',
  '�',
  'InstoreAndOnline',
  '�',
  '�',
  '�'],
 [' The',
  '\n',
  ' A',
  ' "',
  ' In',
  ' It',
  ' And',
  ' I',
  ' As',
  ' But',
  ' He',
  '\n\n',
  ' This',
  ' No',
  ' All'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_sports,
    17,
    15
)

([' The',
  '\n',
  ' And',
  ' In',
  '\n\n',
  ' It',
  ' A',
  ' As',
  ' After',
  ' "',
  ' This',
  ' With',
  ' But',
  ' At',
  ' No'],
 ['\x12',
  '\x1c',
  '\x02',
  '\x16',
  '�',
  '�',
  '�',
  '\x1a',
  '\r',
  '�',
  '\x0c',
  '�',
  '\t',
  '�',
  '\x1d'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_sports,
    24,
    15
)

(['\n',
  '\n\n',
  ' The',
  ' Now',
  ' But',
  '<|endoftext|>',
  ' And',
  ' Moments',
  ' Today',
  ' Welcome',
  ' This',
  ' In',
  ' It',
  ' After',
  ' A'],
 ['\x1c',
  '\x02',
  '�',
  '�',
  '�',
  '�',
  '\x12',
  '\x14',
  '\t',
  '\r',
  '\x16',
  'embedreportprint',
  '\x01',
  '\x1a',
  '\x1d'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_sports,
    35,
    15
)

(['\n',
  '<|endoftext|>',
  '\n\n',
  ' Now',
  ' But',
  ' Unfortunately',
  ' Soon',
  ' Suddenly',
  ' The',
  ' After',
  ' This',
  ' With',
  ' And',
  ' However',
  ' As'],
 ['\x1c',
  '\x1f',
  '�',
  '\x02',
  '�',
  '\x14',
  '\x06',
  '\x01',
  '�',
  '\x0e',
  '\t',
  '\x15',
  '\x05',
  'embedreportprint',
  '\x1a'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_sports,
    46,
    15
)

(['\n',
  ' The',
  '<|endoftext|>',
  '\n\n',
  ' A',
  ' It',
  ' "',
  ' But',
  ' In',
  ' As',
  ' This',
  ' And',
  ' Now',
  ' For',
  ' That'],
 ['�',
  '\x1c',
  '\t',
  '\x1f',
  '\x1a',
  '\x1e',
  '�',
  '\x06',
  'embedreportprint',
  '\x14',
  ' サーティ',
  ' RandomRedditor',
  ' TheNitrome',
  '\x1d',
  '\x15'])

## Numeric

In [None]:
pca_similar_tokens(
    gpt2_xl,
    addition_dataset,
    4,
    15
)

(['�',
  '\x1a',
  ' RandomRedditor',
  'InstoreAndOnline',
  '\x1b',
  '\x02',
  '�',
  '\t',
  '\r',
  '�',
  '\x12',
  ' TheNitrome',
  '�',
  '\x16',
  'embedreportprint'],
  ' Bronze',
  ' Stone',
  ' 0',
  ' lockout',
  ' WHITE',
  ' bronze',
  ' £',
  ' 45',
  ' Drill',
  ' grinding',
  ' ruined',
  ' stone',
  ' 50'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    addition_dataset,
    9,
    15
)

(['\x02',
  '�',
  ' 裏�',
  ' guiActive',
  '\x1a',
  '�',
  ' RandomRedditor',
  '\t',
  '\r',
  '\x1b',
  '\x06',
  'InstoreAndOnline',
  ' TheNitrome',
  '�',
  '\x1c'],
  ' ax',
  ' 999',
  ' Colossus',
  ' Coke',
  ' Bronze',
  '===',
  ' 1000',
  ' Angelo',
  ' Crim',
  ' £',
  ' Infinity',
  ' div',
  ' Platinum'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    addition_dataset,
    19,
    15
)

(['�',
  ' RandomRedditor',
  '�',
  'InstoreAndOnline',
  'rawdownload',
  '\x02',
  '�',
  '\t',
  '�',
  '\x19',
  '\x1f',
  '\x15',
  '\x1a',
  '\r',
  '\x18'],
 [' infinity',
  ' Infinity',
  ' Split',
  ' 8',
  ' Total',
  ' 9',
  ' Pi',
  ' 432',
  ' BLACK',
  ' Astral',
  ' OU',
  ' VERY',
  ' DP',
  ' �',
  ' Prim'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    addition_dataset,
    27,
    15
)

([' RandomRedditor',
  '�',
  'rawdownload',
  '\x02',
  '�',
  'InstoreAndOnline',
  '\x1a',
  '\x15',
  '\x05',
  '�',
  '�',
  '�',
  '\x1f',
  '�',
  '\x19'],
 [' infinity',
  '?',
  ' 8',
  ' 5',
  ' �',
  ' 7',
  ' 9',
  ' 14',
  ' 6',
  ' Total',
  ' BLACK',
  ' 4',
  ' 16',
  ' 24',

In [None]:
pca_similar_tokens(
    gpt2_xl,
    addition_dataset,
    36,
    15
)

([' RandomRedditor',
  '�',
  'rawdownload',
  '\x02',
  '�',
  '�',
  '�',
  ' TheNitrome',
  '\x1e',
  '\x15',
  '\x06',
  '\x1a',
  'embedreportprint',
  '\x16',
  '\x11'],
 ['?',
  ' 22',
  ' 21',
  ' 14',
  ' 15',
  ' 25',
  ' 24',
  ' 20',
  ' 12',
  ' infinity',
  ' 17',
  ' 13',
  ' 16',
  ' 10',
  ' 18'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    addition_dataset,
    46,
    15
)

(['�',
  '\t',
  '\x1c',
  '\x1e',
  ' RandomRedditor',
  ' TheNitrome',
  '�',
  '\x06',
  '\x1f',
  '\x02',
  '\x1a',
  '\x1d',
  'rawdownload',
  '\x16',
  '\x14'],
 ['\n',
  ' 20',
  '?',
  ' 21',
  ' 24',
  ' 22',
  '?',
  ' 30',
  ' 25',
  ' 23',
  ' "',
  ' 27',
  ' 26',
  ' 29',
  ' 28'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    addition_dataset,
    47,
    15
)

(['�',
  '\x1c',
  '\t',
  ' TheNitrome',
  '\x06',
  '\x1e',
  '�',
  ' RandomRedditor',
  '\x1a',
  '\x02',
  '\x1d',
  '\x1f',
  '\x16',
  '\x15',
  '\x14'],
 ['\n',
  ' 20',
  '?',
  '?',
  ' 21',
  ' 22',
  ' "',
  ' 24',
  ' 30',
  ' 25',
  ' 23',
  ' 27',
  ' 26',
  ' 29',
  ' 18'])

## Reading Comprehensions

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset2,
    9,
    15
)

([' supposed',
  ' very',
  ' sent',
  ' famous',
  ' sitting',
  ' seated',
  ' taken',
  ' emb',
  ' made',
  ' riding',
  ' honored',
  ' painted',
  ' determined',
  ' named',
  ' obviously'],
 ['InstoreAndOnline',
  '\x12',
  '\x1a',
  '�',
  '\x02',
  '\r',
  '\x16',
  '�',
  '�',
  '\x0c',
  '\t',
  '�',
  '\x07',
  '\x1d',
  '�'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset2,
    18,
    15
)

([' supposed',
  ' Romeo',
  ' Pats',
  ' George',
  ' very',
  ' "',
  ' mar',
  ' N',
  ' ill',
  ' Ober',
  ' named',
  ' kept',
  ' sitting',
  ' brother',
  ' hom'],
 [' actionGroup',
  ' antidepress',
  'ipeg',
  '/$',
  '�',
  'claimer',
  'ailability',
  'WINDOWS',
  'outheast',
  ' IMAGES',
  'awatts',
  'ventory',
  ' Flavoring',
  'effic',
  'cloneembedreportprint'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset2,
    27,
    15
)

([' called',
  ' named',
  ' George',
  ' Jacob',
  ' Abel',
  ' Angelo',
  ' Peter',
  ' Nicholas',
  ' Patrick',
  ' Robert',
  ' James',
  ' John',
  ' Adam',
  ' Joshua',
  ' William'],
 [' actionGroup',
  '�',
  '\t',
  '\x12',
  'embedreportprint',
  'cloneembedreportprint',
  '\r',
  'InstoreAndOnline',
  '\x1c',
  '\x1a',
  '�',
  '�',
  '�',
  ' RandomRedditor',
  '\x03'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset2,
    35,
    15
)

([' John',
  ' Joshua',
  ' James',
  ' Robert',
  ' named',
  ' Andrew',
  ' called',
  ' George',
  ' William',
  ' Michael',
  ' Patrick',
  ' Matthew',
  ' Joseph',
  ' David',
  ' Peter'],
 ['�',
  '\t',
  '�',
  '\x1c',
  ' RandomRedditor',
  '�',
  '\x1a',
  '\x11',
  '\x02',
  ' サーティ',
  'embedreportprint',
  ' TheNitrome',
  '\r',
  '�',
  '\x1b'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset2,
    43,
    15
)

([' called',
  ' John',
  ' named',
  ' James',
  ' William',
  ' Joshua',
  ' Robert',
  ' Joseph',
  ' Andrew',
  ' Michael',
  ' David',
  ' George',
  ' Matthew',
  ' "',
  ' Thomas'],
 ['�',
  '\t',
  '\x1c',
  '�',
  ' RandomRedditor',
  '\x02',
  '\x11',
  '�',
  '\x1a',
  ' サーティ',
  '\x1f',
  'rawdownload',
  '\x1b',
  '\x1e',
  'embedreportprint'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset1,
    7,
    15
)

([{'word': ' supposed', 'similarity': tensor(0.1399, device='cuda:0')},
  {'word': ' sent', 'similarity': tensor(0.1191, device='cuda:0')},
  {'word': ' served', 'similarity': tensor(0.1089, device='cuda:0')},
  {'word': ' determined', 'similarity': tensor(0.1073, device='cuda:0')},
  {'word': ' very', 'similarity': tensor(0.1066, device='cuda:0')},
  {'word': ' made', 'similarity': tensor(0.1053, device='cuda:0')},
  {'word': ' brought', 'similarity': tensor(0.1041, device='cuda:0')},
  {'word': ' sitting', 'similarity': tensor(0.1015, device='cuda:0')},
  {'word': ' seated', 'similarity': tensor(0.1006, device='cuda:0')},
  {'word': ' painted', 'similarity': tensor(0.1003, device='cuda:0')},
  {'word': ' taken', 'similarity': tensor(0.1002, device='cuda:0')},
  {'word': ' not', 'similarity': tensor(0.0994, device='cuda:0')},
  {'word': ' obviously', 'similarity': tensor(0.0991, device='cuda:0')},
  {'word': ' mine', 'similarity': tensor(0.0981, device='cuda:0')},
  {'word': ' placed'

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset1,
    15,
    15
)

([' supposed',
  ' very',
  ' "',
  ' rumored',
  ' called',
  ' ill',
  ' obviously',
  ' sitting',
  ' ha',
  ' sent',
  ' supposedly',
  ' N',
  ' chosen',
  ' brothers',
  ' destroyed'],
 ['InstoreAndOnline',
  '\x12',
  '�',
  '�',
  '\x02',
  '\r',
  '\x16',
  '\x1a',
  '�',
  '\t',
  '\x0c',
  '�',
  '\x1d',
  '\x19',
  '�'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset1,
    22,
    15
)

([' Jam',
  ' Jack',
  ' Clara',
  ' Jacob',
  ' Kate',
  ' daughter',
  ' Romeo',
  ' Gene',
  ' N',
  ' William',
  ' Thom',
  ' Jes',
  ' Ny',
  ' Rh',
  ' Maria'],
 [' actionGroup',
  ' antidepress',
  ' pione',
  'cloneembedreportprint',
  ' practition',
  '�',
  ' dstg',
  'inventory',
  'eworld',
  ' Unloaded',
  'isSpecial',
  'asaki',
  'ÃÂÃÂ',
  'eworks',
  'iHUD'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset1,
    25,
    15
)

([{'word': ' Jack', 'similarity': tensor(0.1498, device='cuda:0')},
  {'word': ' Jam', 'similarity': tensor(0.1490, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1459, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1456, device='cuda:0')},
  {'word': ' Peter', 'similarity': tensor(0.1453, device='cuda:0')},
  {'word': ' Thom', 'similarity': tensor(0.1448, device='cuda:0')},
  {'word': ' Clara', 'similarity': tensor(0.1444, device='cuda:0')},
  {'word': ' Taylor', 'similarity': tensor(0.1440, device='cuda:0')},
  {'word': ' N', 'similarity': tensor(0.1415, device='cuda:0')},
  {'word': ' Kate', 'similarity': tensor(0.1395, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1395, device='cuda:0')},
  {'word': ' Quincy', 'similarity': tensor(0.1390, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1388, device='cuda:0')},
  {'word': ' Luc', 'similarity': tensor(0.1383, device='cuda:0')},
  {'word': ' Cam', 'similarity': tensor(0.1380,

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset1,
    27,
    15
)

([{'word': ' Taylor', 'similarity': tensor(0.1780, device='cuda:0')},
  {'word': ' Clara', 'similarity': tensor(0.1702, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1696, device='cuda:0')},
  {'word': ' Jack', 'similarity': tensor(0.1659, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1644, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.1640, device='cuda:0')},
  {'word': ' Peter', 'similarity': tensor(0.1608, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1574, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1569, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1565, device='cuda:0')},
  {'word': ' Kate', 'similarity': tensor(0.1536, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1510, device='cuda:0')},
  {'word': ' Chester', 'similarity': tensor(0.1509, device='cuda:0')},
  {'word': ' Jam', 'similarity': tensor(0.1506, device='cuda:0')},
  {'word': ' Wil', 'similarity': tens

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset1,
    29,
    15
)

([{'word': ' Taylor', 'similarity': tensor(0.2592, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.2026, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1849, device='cuda:0')},
  {'word': ' Jack', 'similarity': tensor(0.1844, device='cuda:0')},
  {'word': ' Thom', 'similarity': tensor(0.1836, device='cuda:0')},
  {'word': ' Smith', 'similarity': tensor(0.1833, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.1814, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1804, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1802, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1800, device='cuda:0')},
  {'word': ' Tay', 'similarity': tensor(0.1779, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1767, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1744, device='cuda:0')},
  {'word': ' Kate', 'similarity': tensor(0.1743, device='cuda:0')},
  {'word': ' Clara', 'similarity': tenso

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset1,
    37,
    15
)

([{'word': ' Taylor', 'similarity': tensor(0.4354, device='cuda:0')},
  {'word': 'Taylor', 'similarity': tensor(0.2981, device='cuda:0')},
  {'word': ' Tay', 'similarity': tensor(0.2827, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.2409, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.2328, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.2193, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.2162, device='cuda:0')},
  {'word': ' Joshua', 'similarity': tensor(0.2153, device='cuda:0')},
  {'word': ' Tyler', 'similarity': tensor(0.2151, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.2087, device='cuda:0')},
  {'word': ' Michael', 'similarity': tensor(0.2065, device='cuda:0')},
  {'word': ' Tiffany', 'similarity': tensor(0.2057, device='cuda:0')},
  {'word': ' Sophia', 'similarity': tensor(0.2052, device='cuda:0')},
  {'word': ' Timothy', 'similarity': tensor(0.2013, device='cuda:0')},
  {'word': ' named', 'si

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset1,
    45,
    15
)

([{'word': ' Taylor', 'similarity': tensor(0.2409, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1750, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.1666, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1627, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1585, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1543, device='cuda:0')},
  {'word': ' the', 'similarity': tensor(0.1513, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.1500, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1447, device='cuda:0')},
  {'word': " '", 'similarity': tensor(0.1426, device='cuda:0')},
  {'word': ' a', 'similarity': tensor(0.1403, device='cuda:0')},
  {'word': ' Michael', 'similarity': tensor(0.1396, device='cuda:0')},
  {'word': ' David', 'similarity': tensor(0.1390, device='cuda:0')},
  {'word': ' A', 'similarity': tensor(0.1383, device='cuda:0')},
  {'word': ' Tay', 'similarity': tensor(0.1368, de

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset1,
    47,
    15
)

([{'word': ' Taylor', 'similarity': tensor(0.2008, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1667, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.1526, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1516, device='cuda:0')},
  {'word': ' the', 'similarity': tensor(0.1506, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1397, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1396, device='cuda:0')},
  {'word': ' a', 'similarity': tensor(0.1389, device='cuda:0')},
  {'word': " '", 'similarity': tensor(0.1327, device='cuda:0')},
  {'word': ' A', 'similarity': tensor(0.1317, device='cuda:0')},
  {'word': '\n', 'similarity': tensor(0.1312, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.1288, device='cuda:0')},
  {'word': ' L', 'similarity': tensor(0.1288, device='cuda:0')},
  {'word': ' E', 'similarity': tensor(0.1274, device='cuda:0')},
  {'word': ',', 'similarity': tensor(0.1273, device='cuda:0')}],


In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset3,
    0,
    15
)

([' originally',
  ' supposed',
  ' painted',
  ' sent',
  ' once',
  ' founded',
  ' taken',
  ' mag',
  ' ruled',
  ' destroyed',
  ' formerly',
  ' photographed',
  ' replaced',
  ' relegated',
  ' initially'],
 ['�',
  '�',
  '\x1a',
  '\x02',
  '\x12',
  '\x0c',
  '\r',
  '\x04',
  '\x16',
  '\t',
  '�',
  '�',
  ' サーティ',
  '\x18',
  '\x1b'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset3,
    9,
    15
)

([' supposed',
  ' very',
  ' sent',
  ' famous',
  ' sitting',
  ' seated',
  ' emb',
  ' taken',
  ' made',
  ' riding',
  ' painted',
  ' honored',
  ' determined',
  ' named',
  ' obviously'],
 ['InstoreAndOnline',
  '\x12',
  '\x1a',
  '�',
  '\x02',
  '\r',
  '\x16',
  '�',
  '�',
  '\x0c',
  '\t',
  '�',
  '\x07',
  '\x1d',
  '�'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset3,
    18,
    15
)

([' supposed',
  ' Romeo',
  ' Pats',
  ' George',
  ' ill',
  ' "',
  ' very',
  ' Ober',
  ' N',
  ' mar',
  ' kept',
  ' named',
  ' hom',
  ' brother',
  ' Ny'],
 [' actionGroup',
  ' antidepress',
  'ipeg',
  '/$',
  'claimer',
  'ailability',
  'outheast',
  '�',
  'WINDOWS',
  ' IMAGES',
  ' Flavoring',
  'awatts',
  '://',
  'ventory',
  'cloneembedreportprint'])

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset3,
    27,
    15
)

([{'word': ' called', 'similarity': tensor(0.2035, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.1895, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1769, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1700, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1663, device='cuda:0')},
  {'word': ' Jam', 'similarity': tensor(0.1589, device='cuda:0')},
  {'word': ' Abel', 'similarity': tensor(0.1554, device='cuda:0')},
  {'word': ' Peter', 'similarity': tensor(0.1553, device='cuda:0')},
  {'word': ' Angelo', 'similarity': tensor(0.1545, device='cuda:0')},
  {'word': ' Joseph', 'similarity': tensor(0.1541, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1539, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1536, device='cuda:0')},
  {'word': ' Taylor', 'similarity': tensor(0.1531, device='cuda:0')},
  {'word': ' Jeremiah', 'similarity': tensor(0.1513, device='cuda:0')},
  {'word': ' Joshua', 'simila

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset3,
    29,
    15
)

([{'word': ' Taylor', 'similarity': tensor(0.2345, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.2160, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.2078, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.2055, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1950, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1936, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1908, device='cuda:0')},
  {'word': ' Thom', 'similarity': tensor(0.1819, device='cuda:0')},
  {'word': ' Joseph', 'similarity': tensor(0.1798, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1783, device='cuda:0')},
  {'word': ' Patrick', 'similarity': tensor(0.1780, device='cuda:0')},
  {'word': ' Jam', 'similarity': tensor(0.1768, device='cuda:0')},
  {'word': ' Robert', 'similarity': tensor(0.1762, device='cuda:0')},
  {'word': ' Smith', 'similarity': tensor(0.1758, device='cuda:0')},
  {'word': ' Charles', 'simila

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset3,
    45,
    15
)

([{'word': ' Taylor', 'similarity': tensor(0.2387, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1837, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.1834, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1813, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1713, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1693, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.1615, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.1562, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1521, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1520, device='cuda:0')},
  {'word': ' Charles', 'similarity': tensor(0.1506, device='cuda:0')},
  {'word': ' David', 'similarity': tensor(0.1484, device='cuda:0')},
  {'word': ' Robert', 'similarity': tensor(0.1472, device='cuda:0')},
  {'word': ' Michael', 'similarity': tensor(0.1468, device='cuda:0')},
  {'word': ' also', 'similarity'

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset3,
    47,
    15
)

all activations shape is:  torch.Size([500, 1600])


([{'word': ' Taylor', 'similarity': tensor(0.1984, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.1678, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1634, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1594, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1545, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1544, device='cuda:0')},
  {'word': ' the', 'similarity': tensor(0.1473, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1462, device='cuda:0')},
  {'word': ' also', 'similarity': tensor(0.1392, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.1383, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1381, device='cuda:0')},
  {'word': " '", 'similarity': tensor(0.1333, device='cuda:0')},
  {'word': ' a', 'similarity': tensor(0.1327, device='cuda:0')},
  {'word': ' Michael', 'similarity': tensor(0.1323, device='cuda:0')},
  {'word': ' David', 'similarity': tensor(0.13

## GPT-2 Small

In [None]:
pca_similar_tokens(
    gpt2_small,
    dataset_scifi,
    9,
    15
)

(['\n',
  ' The',
  '<|endoftext|>',
  ' the',
  ' "',
  ' A',
  ' (',
  '\n\n',
  ' In',
  ' a',
  ' and',
  ' It',
  ' in',
  ' As',
  ' This'],
 ['覚醒',
  '���',
  '龍契士',
  'quickShipAvailable',
  'ActionCode',
  '\x1b',
  '�',
  '�',
  '�',
  '\x00',
  '\x15',
  '\x05',
  '\x12',
  '\x0c',
  '\x06'])

In [None]:
pca_similar_tokens(
    gpt2_small,
    dataset_scifi,
    10,
    15
)

(['覚醒',
  '���',
  'ActionCode',
  '\x1b',
  '�',
  '�',
  '�',
  '\x05',
  'StreamerBot',
  '\x00',
  '\x12',
  '�',
  'InstoreAndOnline',
  '\x0c',
  '�'],
 ['\n',
  ' The',
  ' "',
  ' the',
  ' A',
  ' (',
  '<|endoftext|>',
  ' In',
  ' a',
  ' and',
  '\n\n',
  ' in',
  ',',
  ' to',
  '-'])

I think the most useful next step would be an easy way to see how this varies across layers. Ideally this would use pysvelte, but it doesn't seem to be working.

## Trying with all activations

### Fantasy

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_fantasy,
    0,
    15,
    True
)

([{'word': ' mine', 'similarity': tensor(0.1193, device='cuda:0')},
  {'word': ' mad', 'similarity': tensor(0.1172, device='cuda:0')},
  {'word': ' Ard', 'similarity': tensor(0.1171, device='cuda:0')},
  {'word': ' ruined', 'similarity': tensor(0.1144, device='cuda:0')},
  {'word': ' maiden', 'similarity': tensor(0.1107, device='cuda:0')},
  {'word': ' shining', 'similarity': tensor(0.1104, device='cuda:0')},
  {'word': ' Garland', 'similarity': tensor(0.1085, device='cuda:0')},
  {'word': ' bra', 'similarity': tensor(0.1077, device='cuda:0')},
  {'word': ' sand', 'similarity': tensor(0.1070, device='cuda:0')},
  {'word': ' grim', 'similarity': tensor(0.1062, device='cuda:0')},
  {'word': ' crumbling', 'similarity': tensor(0.1039, device='cuda:0')},
  {'word': ' mag', 'similarity': tensor(0.1038, device='cuda:0')},
  {'word': ' magical', 'similarity': tensor(0.1036, device='cuda:0')},
  {'word': ' shattered', 'similarity': tensor(0.1030, device='cuda:0')},
  {'word': ' ha', 'similarity

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_fantasy,
    2,
    15,
    True
)

([{'word': '\x12', 'similarity': tensor(0.3211, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.3210, device='cuda:0')},
  {'word': '\x02', 'similarity': tensor(0.3183, device='cuda:0')},
  {'word': '\x16', 'similarity': tensor(0.3183, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.3177, device='cuda:0')},
  {'word': '\r', 'similarity': tensor(0.3167, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.3166, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.3157, device='cuda:0')},
  {'word': '\x0c', 'similarity': tensor(0.3156, device='cuda:0')},
  {'word': '\x1a', 'similarity': tensor(0.3154, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.3153, device='cuda:0')},
  {'word': 'InstoreAndOnline', 'similarity': tensor(0.3151, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.3148, device='cuda:0')},
  {'word': '\x1c', 'similarity': tensor(0.3136, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.3133, device='cuda:0')}],
 [{'w

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_fantasy,
    4,
    15,
    True
)

([{'word': ' mag', 'similarity': tensor(0.0976, device='cuda:0')},
  {'word': ' Ard', 'similarity': tensor(0.0957, device='cuda:0')},
  {'word': ' mine', 'similarity': tensor(0.0947, device='cuda:0')},
  {'word': ' bra', 'similarity': tensor(0.0899, device='cuda:0')},
  {'word': ' shattered', 'similarity': tensor(0.0882, device='cuda:0')},
  {'word': ' maiden', 'similarity': tensor(0.0877, device='cuda:0')},
  {'word': ' ha', 'similarity': tensor(0.0863, device='cuda:0')},
  {'word': ' humble', 'similarity': tensor(0.0840, device='cuda:0')},
  {'word': ' Garland', 'similarity': tensor(0.0830, device='cuda:0')},
  {'word': ' white', 'similarity': tensor(0.0827, device='cuda:0')},
  {'word': ' shining', 'similarity': tensor(0.0822, device='cuda:0')},
  {'word': ' magical', 'similarity': tensor(0.0818, device='cuda:0')},
  {'word': ' sc', 'similarity': tensor(0.0816, device='cuda:0')},
  {'word': ' ruined', 'similarity': tensor(0.0806, device='cuda:0')},
  {'word': '…', 'similarity': tens

In [None]:
pca_similar_tokens(
    gpt2_xl,
    dataset_fantasy,
    19,
    15,
    True
)

([{'word': ' unthinkable', 'similarity': tensor(0.1141, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1118, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1091, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1091, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1012, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0974, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0968, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0947, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0942, device='cuda:0')},
  {'word': ' brunt', 'similarity': tensor(0.0930, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0915, device='cuda:0')},
  {'word': ' secretive', 'similarity': tensor(0.0911, device='cuda:0')},
  {'word': ' colossal', 'similarity': tensor(0.0910, device='cuda:0')},
  {'word': 'arget', 'similarity': tensor(0.0903, device='cuda:0')},
  

In [None]:
# dataset_fantasy with third argument 29 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_fantasy, 29, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1138, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1113, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1090, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1089, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1040, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0981, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0961, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0919, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0907, device='cuda:0')},
  {'word': ' colossal', 'similarity': tensor(0.0907, device='cuda:0')},
  {'word': ' secretive', 'similarity': tensor(0.0906, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0897, device='cuda:0')},
  {'word': ' ill', 'similarity': tensor(0.0893, device='cuda:0')},
  {'word': ' unimaginable', 'similarity': tensor(0.0893, device='cuda:0'

In [None]:
# dataset_fantasy with third argument 39 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_fantasy, 39, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1095, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1083, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1072, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1065, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1064, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.1021, device='cuda:0')},
  {'word': ' enchanted', 'similarity': tensor(0.0987, device='cuda:0')},
  {'word': ' seemingly', 'similarity': tensor(0.0930, device='cuda:0')},
  {'word': ' unimaginable', 'similarity': tensor(0.0926, device='cuda:0')},
  {'word': ' ill', 'similarity': tensor(0.0908, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0901, device='cuda:0')},
  {'word': ' secretive', 'similarity': tensor(0.0889, device='cuda:0')},
  {'word': ' more', 'similarity': tensor(0.0867, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0867, device='cuda:0')}

### Scifi

In [None]:
# dataset_scifi with third argument 0 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_scifi, 0, 15, True)

([{'word': ' rive', 'similarity': tensor(0.1131, device='cuda:0')},
  {'word': ' crumbling', 'similarity': tensor(0.1107, device='cuda:0')},
  {'word': ' ruined', 'similarity': tensor(0.1085, device='cuda:0')},
  {'word': ' mine', 'similarity': tensor(0.1038, device='cuda:0')},
  {'word': ' Gat', 'similarity': tensor(0.1014, device='cuda:0')},
  {'word': ' destroyed', 'similarity': tensor(0.0994, device='cuda:0')},
  {'word': ' Garland', 'similarity': tensor(0.0966, device='cuda:0')},
  {'word': ' shattered', 'similarity': tensor(0.0959, device='cuda:0')},
  {'word': ' mag', 'similarity': tensor(0.0949, device='cuda:0')},
  {'word': ' mad', 'similarity': tensor(0.0939, device='cuda:0')},
  {'word': ' charred', 'similarity': tensor(0.0930, device='cuda:0')},
  {'word': ' bra', 'similarity': tensor(0.0925, device='cuda:0')},
  {'word': ' destroy', 'similarity': tensor(0.0912, device='cuda:0')},
  {'word': ' Drill', 'similarity': tensor(0.0905, device='cuda:0')},
  {'word': ' war', 'simil

In [None]:
# dataset_scifi with third argument 4 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_scifi, 4, 15, True)

([{'word': ' mag', 'similarity': tensor(0.0920, device='cuda:0')},
  {'word': ' mine', 'similarity': tensor(0.0894, device='cuda:0')},
  {'word': ' war', 'similarity': tensor(0.0858, device='cuda:0')},
  {'word': ' shattered', 'similarity': tensor(0.0842, device='cuda:0')},
  {'word': ' ha', 'similarity': tensor(0.0829, device='cuda:0')},
  {'word': '…', 'similarity': tensor(0.0815, device='cuda:0')},
  {'word': ' night', 'similarity': tensor(0.0790, device='cuda:0')},
  {'word': ' ruined', 'similarity': tensor(0.0789, device='cuda:0')},
  {'word': ' battle', 'similarity': tensor(0.0788, device='cuda:0')},
  {'word': ' destroyed', 'similarity': tensor(0.0782, device='cuda:0')},
  {'word': ' white', 'similarity': tensor(0.0769, device='cuda:0')},
  {'word': ' light', 'similarity': tensor(0.0757, device='cuda:0')},
  {'word': ' gun', 'similarity': tensor(0.0755, device='cuda:0')},
  {'word': ' chaotic', 'similarity': tensor(0.0736, device='cuda:0')},
  {'word': ' massive', 'similarity': 

In [None]:
# dataset_scifi with third argument 6 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_scifi, 6, 15, True)

([{'word': ' mine', 'similarity': tensor(0.0945, device='cuda:0')},
  {'word': ' ha', 'similarity': tensor(0.0923, device='cuda:0')},
  {'word': ' war', 'similarity': tensor(0.0919, device='cuda:0')},
  {'word': ' shattered', 'similarity': tensor(0.0883, device='cuda:0')},
  {'word': '…', 'similarity': tensor(0.0872, device='cuda:0')},
  {'word': ' mag', 'similarity': tensor(0.0834, device='cuda:0')},
  {'word': ' night', 'similarity': tensor(0.0822, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.0813, device='cuda:0')},
  {'word': ' chaotic', 'similarity': tensor(0.0809, device='cuda:0')},
  {'word': ' ruined', 'similarity': tensor(0.0808, device='cuda:0')},
  {'word': ' destroyed', 'similarity': tensor(0.0801, device='cuda:0')},
  {'word': ' crumbling', 'similarity': tensor(0.0796, device='cuda:0')},
  {'word': ' battle', 'similarity': tensor(0.0790, device='cuda:0')},
  {'word': ' flight', 'similarity': tensor(0.0789, device='cuda:0')},
  {'word': ' light', 'simila

In [None]:
# dataset_scifi with third argument 19 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_scifi, 19, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1146, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1121, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1097, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1094, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1011, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0975, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0973, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0952, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0943, device='cuda:0')},
  {'word': ' brunt', 'similarity': tensor(0.0933, device='cuda:0')},
  {'word': ' secretive', 'similarity': tensor(0.0916, device='cuda:0')},
  {'word': ' colossal', 'similarity': tensor(0.0916, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0914, device='cuda:0')},
  {'word': 'arget', 'similarity': tensor(0.0905, device='cuda:0')},
  

In [None]:
# dataset_scifi with third argument 29 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_scifi, 29, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1150, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1123, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1115, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1095, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1025, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0993, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0965, device='cuda:0')},
  {'word': ' colossal', 'similarity': tensor(0.0928, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0921, device='cuda:0')},
  {'word': ' secretive', 'similarity': tensor(0.0913, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0910, device='cuda:0')},
  {'word': ' unimaginable', 'similarity': tensor(0.0904, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0903, device='cuda:0')},
  {'word': ' undeniable', 'similarity': tensor(0.0888, device='

### Sports

In [None]:
# dataset_sports with third argument 0 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 0, 15, True)

([{'word': '�', 'similarity': tensor(0.2964, device='cuda:0')},
  {'word': '\x12', 'similarity': tensor(0.2957, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.2947, device='cuda:0')},
  {'word': '\x16', 'similarity': tensor(0.2932, device='cuda:0')},
  {'word': '\x02', 'similarity': tensor(0.2929, device='cuda:0')},
  {'word': '\r', 'similarity': tensor(0.2919, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.2918, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.2918, device='cuda:0')},
  {'word': '\x0c', 'similarity': tensor(0.2916, device='cuda:0')},
  {'word': 'InstoreAndOnline', 'similarity': tensor(0.2913, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.2906, device='cuda:0')},
  {'word': '\x1a', 'similarity': tensor(0.2905, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.2895, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.2895, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.2888, device='cuda:0')}],
 [{'word

In [None]:
# dataset_sports with third argument 4 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 4, 15, True)

([{'word': ' white', 'similarity': tensor(0.0886, device='cuda:0')},
  {'word': ' right', 'similarity': tensor(0.0875, device='cuda:0')},
  {'word': ' ha', 'similarity': tensor(0.0825, device='cuda:0')},
  {'word': ' bra', 'similarity': tensor(0.0816, device='cuda:0')},
  {'word': ' rive', 'similarity': tensor(0.0809, device='cuda:0')},
  {'word': ' left', 'similarity': tensor(0.0807, device='cuda:0')},
  {'word': '…', 'similarity': tensor(0.0790, device='cuda:0')},
  {'word': ' lac', 'similarity': tensor(0.0790, device='cuda:0')},
  {'word': ' I', 'similarity': tensor(0.0789, device='cuda:0')},
  {'word': ' shattered', 'similarity': tensor(0.0784, device='cuda:0')},
  {'word': ' bar', 'similarity': tensor(0.0766, device='cuda:0')},
  {'word': ' tall', 'similarity': tensor(0.0763, device='cuda:0')},
  {'word': ' more', 'similarity': tensor(0.0759, device='cuda:0')},
  {'word': ' red', 'similarity': tensor(0.0757, device='cuda:0')},
  {'word': ' night', 'similarity': tensor(0.0756, devi

In [None]:
# dataset_sports with third argument 6 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 6, 15, True)

([{'word': ' ha', 'similarity': tensor(0.0915, device='cuda:0')},
  {'word': ' right', 'similarity': tensor(0.0863, device='cuda:0')},
  {'word': ' white', 'similarity': tensor(0.0848, device='cuda:0')},
  {'word': '…', 'similarity': tensor(0.0800, device='cuda:0')},
  {'word': ' red', 'similarity': tensor(0.0798, device='cuda:0')},
  {'word': ' I', 'similarity': tensor(0.0798, device='cuda:0')},
  {'word': ' shining', 'similarity': tensor(0.0787, device='cuda:0')},
  {'word': ' more', 'similarity': tensor(0.0785, device='cuda:0')},
  {'word': ' night', 'similarity': tensor(0.0782, device='cuda:0')},
  {'word': ' shattered', 'similarity': tensor(0.0776, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.0776, device='cuda:0')},
  {'word': ' huge', 'similarity': tensor(0.0775, device='cuda:0')},
  {'word': ' rive', 'similarity': tensor(0.0772, device='cuda:0')},
  {'word': ' in', 'similarity': tensor(0.0765, device='cuda:0')},
  {'word': ' left', 'similarity': tensor(0.076

In [None]:
# dataset_sports with third argument 12 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 12, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1187, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1143, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1128, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1106, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1038, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0988, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0985, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0980, device='cuda:0')},
  {'word': ' brunt', 'similarity': tensor(0.0970, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0963, device='cuda:0')},
  {'word': 'arget', 'similarity': tensor(0.0948, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0942, device='cuda:0')},
  {'word': ' colossal', 'similarity': tensor(0.0938, device='cuda:0')},
  {'word': ' undeniable', 'similarity': tensor(0.0932, device='cuda:0')},
 

In [None]:
# dataset_sports with third argument 19 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 19, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1153, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1121, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1097, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1095, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1008, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0973, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0964, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0949, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0942, device='cuda:0')},
  {'word': ' brunt', 'similarity': tensor(0.0935, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0915, device='cuda:0')},
  {'word': ' colossal', 'similarity': tensor(0.0910, device='cuda:0')},
  {'word': ' undeniable', 'similarity': tensor(0.0909, device='cuda:0')},
  {'word': 'arget', 'similarity': tensor(0.0907, device='cuda:0')},
 

In [None]:
# dataset_sports with third argument 27 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 27, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1157, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1118, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1098, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1098, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1013, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0965, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0963, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0932, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0916, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0911, device='cuda:0')},
  {'word': ' undeniable', 'similarity': tensor(0.0906, device='cuda:0')},
  {'word': ' colossal', 'similarity': tensor(0.0903, device='cuda:0')},
  {'word': ' intense', 'similarity': tensor(0.0902, device='cuda:0')},
  {'word': ' sole', 'similarity': tensor(0.0897, device='cuda:0')},

In [None]:
# dataset_sports with third argument 29 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 29, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1160, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1116, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1099, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1097, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1016, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0964, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0960, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0923, device='cuda:0')},
  {'word': ' undeniable', 'similarity': tensor(0.0909, device='cuda:0')},
  {'word': ' intense', 'similarity': tensor(0.0906, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0906, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0904, device='cuda:0')},
  {'word': ' colossal', 'similarity': tensor(0.0899, device='cuda:0')},
  {'word': ' sole', 'similarity': tensor(0.0894, device='cuda:0')},

In [None]:
# dataset_sports with third argument 38 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 38, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1124, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1095, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1080, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1068, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.0968, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0945, device='cuda:0')},
  {'word': ' intense', 'similarity': tensor(0.0915, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0910, device='cuda:0')},
  {'word': ' seemingly', 'similarity': tensor(0.0896, device='cuda:0')},
  {'word': ' unimaginable', 'similarity': tensor(0.0869, device='cuda:0')},
  {'word': ' undeniable', 'similarity': tensor(0.0866, device='cuda:0')},
  {'word': ' joy', 'similarity': tensor(0.0863, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0856, device='cuda:0')},
  {'word': ' huge', 'similarity': tensor(0.0852, device='cuda:0')},
  

In [None]:
# dataset_sports with third argument 45 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 45, 15, True)

([{'word': ' the', 'similarity': tensor(0.1927, device='cuda:0')},
  {'word': ',', 'similarity': tensor(0.1726, device='cuda:0')},
  {'word': ' and', 'similarity': tensor(0.1692, device='cuda:0')},
  {'word': ' a', 'similarity': tensor(0.1666, device='cuda:0')},
  {'word': ' in', 'similarity': tensor(0.1607, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1537, device='cuda:0')},
  {'word': '\n', 'similarity': tensor(0.1369, device='cuda:0')},
  {'word': ' one', 'similarity': tensor(0.1343, device='cuda:0')},
  {'word': ' to', 'similarity': tensor(0.1333, device='cuda:0')},
  {'word': ' on', 'similarity': tensor(0.1294, device='cuda:0')},
  {'word': ' as', 'similarity': tensor(0.1288, device='cuda:0')},
  {'word': ' all', 'similarity': tensor(0.1285, device='cuda:0')},
  {'word': '.', 'similarity': tensor(0.1284, device='cuda:0')},
  {'word': ' for', 'similarity': tensor(0.1183, device='cuda:0')},
  {'word': ' with', 'similarity': tensor(0.1156, device='cuda:0')}],
 [{'word'

In [None]:
# dataset_sports with third argument 47 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset_sports, 47, 15, True)

([{'word': ' the', 'similarity': tensor(0.1807, device='cuda:0')},
  {'word': ',', 'similarity': tensor(0.1624, device='cuda:0')},
  {'word': ' and', 'similarity': tensor(0.1623, device='cuda:0')},
  {'word': ' a', 'similarity': tensor(0.1568, device='cuda:0')},
  {'word': ' in', 'similarity': tensor(0.1518, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1497, device='cuda:0')},
  {'word': '\n', 'similarity': tensor(0.1354, device='cuda:0')},
  {'word': ' one', 'similarity': tensor(0.1255, device='cuda:0')},
  {'word': ' to', 'similarity': tensor(0.1251, device='cuda:0')},
  {'word': ' on', 'similarity': tensor(0.1220, device='cuda:0')},
  {'word': ' as', 'similarity': tensor(0.1216, device='cuda:0')},
  {'word': '.', 'similarity': tensor(0.1184, device='cuda:0')},
  {'word': ' all', 'similarity': tensor(0.1183, device='cuda:0')},
  {'word': ' for', 'similarity': tensor(0.1123, device='cuda:0')},
  {'word': ' with', 'similarity': tensor(0.1094, device='cuda:0')}],
 [{'word'

### Reading Comprehensions

In [None]:
# dataset2 with third argument 0 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset2, 0, 15, True)

([{'word': ' Garland', 'similarity': tensor(0.1222, device='cuda:0')},
  {'word': ' Ard', 'similarity': tensor(0.1154, device='cuda:0')},
  {'word': ' Architects', 'similarity': tensor(0.1048, device='cuda:0')},
  {'word': ' Drill', 'similarity': tensor(0.1018, device='cuda:0')},
  {'word': ' Bruno', 'similarity': tensor(0.1002, device='cuda:0')},
  {'word': ' Atkins', 'similarity': tensor(0.0990, device='cuda:0')},
  {'word': ' Blackwell', 'similarity': tensor(0.0977, device='cuda:0')},
  {'word': ' Crane', 'similarity': tensor(0.0977, device='cuda:0')},
  {'word': ' anarchist', 'similarity': tensor(0.0965, device='cuda:0')},
  {'word': ' Clay', 'similarity': tensor(0.0960, device='cuda:0')},
  {'word': ' ruined', 'similarity': tensor(0.0960, device='cuda:0')},
  {'word': ' Lyon', 'similarity': tensor(0.0937, device='cuda:0')},
  {'word': ' Ce', 'similarity': tensor(0.0936, device='cuda:0')},
  {'word': ' Norman', 'similarity': tensor(0.0931, device='cuda:0')},
  {'word': ' Barb', 'si

In [None]:
# dataset2 with third argument 9 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset2, 9, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1227, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1170, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1169, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1122, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1082, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.1037, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.1008, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.1001, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0995, device='cuda:0')},
  {'word': ' Eater', 'similarity': tensor(0.0989, device='cuda:0')},
  {'word': ' brunt', 'similarity': tensor(0.0980, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0975, device='cuda:0')},
  {'word': ' colossal', 'similarity': tensor(0.0972, device='cuda:0')},
  {'word': ' undeniable', 'similarity': tensor(0.0964, device='cuda:0')},


In [None]:
# dataset2 with third argument 18 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset2, 18, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1148, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1123, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1099, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1093, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1016, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0980, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0970, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0968, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0954, device='cuda:0')},
  {'word': ' brunt', 'similarity': tensor(0.0953, device='cuda:0')},
  {'word': 'arget', 'similarity': tensor(0.0948, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0923, device='cuda:0')},
  {'word': ' secretive', 'similarity': tensor(0.0919, device='cuda:0')},
  {'word': ' Eater', 'similarity': tensor(0.0917, device='cuda:0')},
  {'w

In [None]:
# dataset2 with third argument 27 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset2, 27, 15, True)

([{'word': ' unthinkable', 'similarity': tensor(0.1119, device='cuda:0')},
  {'word': ' enormous', 'similarity': tensor(0.1110, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1085, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1080, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.1008, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0981, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0964, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0950, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0949, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0925, device='cuda:0')},
  {'word': ' secretive', 'similarity': tensor(0.0925, device='cuda:0')},
  {'word': ' brunt', 'similarity': tensor(0.0923, device='cuda:0')},
  {'word': ' sole', 'similarity': tensor(0.0908, device='cuda:0')},
  {'word': 'arget', 'similarity': tensor(0.0900, device='cuda:0')},
  {'wo

In [None]:
# dataset2 with third argument 35 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset2, 35, 15, True)

([{'word': ' enormous', 'similarity': tensor(0.1062, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1048, device='cuda:0')},
  {'word': ' unthinkable', 'similarity': tensor(0.1042, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.1025, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.0959, device='cuda:0')},
  {'word': ' larger', 'similarity': tensor(0.0949, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0929, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0913, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0902, device='cuda:0')},
  {'word': ' Rebellion', 'similarity': tensor(0.0901, device='cuda:0')},
  {'word': ' secretive', 'similarity': tensor(0.0900, device='cuda:0')},
  {'word': ' sole', 'similarity': tensor(0.0876, device='cuda:0')},
  {'word': ' ill', 'similarity': tensor(0.0868, device='cuda:0')},
  {'word': ' intense', 'similarity': tensor(0.0858, device='cuda:0')},
  {'w

In [None]:
# dataset2 with third argument 43 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset2, 43, 15, True)

([{'word': '\t', 'similarity': tensor(0.5789, device='cuda:0')},
  {'word': '\x1c', 'similarity': tensor(0.5787, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.5781, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.5773, device='cuda:0')},
  {'word': '�', 'similarity': tensor(0.5756, device='cuda:0')},
  {'word': '\x02', 'similarity': tensor(0.5753, device='cuda:0')},
  {'word': '\x1f', 'similarity': tensor(0.5752, device='cuda:0')},
  {'word': ' RandomRedditor', 'similarity': tensor(0.5739, device='cuda:0')},
  {'word': '\r', 'similarity': tensor(0.5739, device='cuda:0')},
  {'word': '\x1a', 'similarity': tensor(0.5736, device='cuda:0')},
  {'word': ' サーティ', 'similarity': tensor(0.5729, device='cuda:0')},
  {'word': '\x1e', 'similarity': tensor(0.5728, device='cuda:0')},
  {'word': '\x11', 'similarity': tensor(0.5723, device='cuda:0')},
  {'word': 'rawdownload', 'similarity': tensor(0.5717, device='cuda:0')},
  {'word': '\x04', 'similarity': tensor(0.5717, device='

In [None]:
# dataset1 with third argument 7 — presumably the layer index; TODO confirm.
pca_similar_tokens(gpt2_xl, dataset1, 7, 15, True)

([{'word': ' enormous', 'similarity': tensor(0.1012, device='cuda:0')},
  {'word': ' massive', 'similarity': tensor(0.1008, device='cuda:0')},
  {'word': ' immense', 'similarity': tensor(0.0966, device='cuda:0')},
  {'word': ' unthinkable', 'similarity': tensor(0.0932, device='cuda:0')},
  {'word': ' very', 'similarity': tensor(0.0930, device='cuda:0')},
  {'word': ' ill', 'similarity': tensor(0.0912, device='cuda:0')},
  {'word': ' obvious', 'similarity': tensor(0.0905, device='cuda:0')},
  {'word': ' vast', 'similarity': tensor(0.0856, device='cuda:0')},
  {'word': ' fateful', 'similarity': tensor(0.0848, device='cuda:0')},
  {'word': ' huge', 'similarity': tensor(0.0845, device='cuda:0')},
  {'word': ' formidable', 'similarity': tensor(0.0843, device='cuda:0')},
  {'word': ' Eater', 'similarity': tensor(0.0838, device='cuda:0')},
  {'word': ' ha', 'similarity': tensor(0.0836, device='cuda:0')},
  {'word': ' sitting', 'similarity': tensor(0.0830, device='cuda:0')},
  {'word': ' chaot

In [None]:
# dataset1 with third argument 15 — presumably the layer index; TODO confirm.
# NOTE(review): no fifth argument is passed here, unlike the calls that pass True.
pca_similar_tokens(gpt2_xl, dataset1, 15, 15)

([' supposed',
  ' very',
  ' "',
  ' rumored',
  ' called',
  ' ill',
  ' obviously',
  ' sitting',
  ' ha',
  ' sent',
  ' supposedly',
  ' N',
  ' chosen',
  ' brothers',
  ' destroyed'],
 ['InstoreAndOnline',
  '\x12',
  '�',
  '�',
  '\x02',
  '\r',
  '\x16',
  '\x1a',
  '�',
  '\t',
  '\x0c',
  '�',
  '\x1d',
  '\x19',
  '�'])

In [None]:
# dataset1 with third argument 22 — presumably the layer index; TODO confirm.
# NOTE(review): no fifth argument is passed here, unlike the calls that pass True.
pca_similar_tokens(gpt2_xl, dataset1, 22, 15)

([' Jam',
  ' Jack',
  ' Clara',
  ' Jacob',
  ' Kate',
  ' daughter',
  ' Romeo',
  ' Gene',
  ' N',
  ' William',
  ' Thom',
  ' Jes',
  ' Ny',
  ' Rh',
  ' Maria'],
 [' actionGroup',
  ' antidepress',
  ' pione',
  'cloneembedreportprint',
  ' practition',
  '�',
  ' dstg',
  'inventory',
  'eworld',
  ' Unloaded',
  'isSpecial',
  'asaki',
  'ÃÂÃÂ',
  'eworks',
  'iHUD'])

In [None]:
# dataset1 with third argument 25 — presumably the layer index; TODO confirm.
# NOTE(review): no fifth argument is passed, yet the saved output shows
# word/similarity dicts — the default may have changed between runs; verify.
pca_similar_tokens(gpt2_xl, dataset1, 25, 15)

([{'word': ' Jack', 'similarity': tensor(0.1498, device='cuda:0')},
  {'word': ' Jam', 'similarity': tensor(0.1490, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1459, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1456, device='cuda:0')},
  {'word': ' Peter', 'similarity': tensor(0.1453, device='cuda:0')},
  {'word': ' Thom', 'similarity': tensor(0.1448, device='cuda:0')},
  {'word': ' Clara', 'similarity': tensor(0.1444, device='cuda:0')},
  {'word': ' Taylor', 'similarity': tensor(0.1440, device='cuda:0')},
  {'word': ' N', 'similarity': tensor(0.1415, device='cuda:0')},
  {'word': ' Kate', 'similarity': tensor(0.1395, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1395, device='cuda:0')},
  {'word': ' Quincy', 'similarity': tensor(0.1390, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1388, device='cuda:0')},
  {'word': ' Luc', 'similarity': tensor(0.1383, device='cuda:0')},
  {'word': ' Cam', 'similarity': tensor(0.1380,

In [None]:
# dataset1 with third argument 27 — presumably the layer index; TODO confirm.
# NOTE(review): no fifth argument is passed, yet the saved output shows
# word/similarity dicts — the default may have changed between runs; verify.
pca_similar_tokens(gpt2_xl, dataset1, 27, 15)

([{'word': ' Taylor', 'similarity': tensor(0.1780, device='cuda:0')},
  {'word': ' Clara', 'similarity': tensor(0.1702, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1696, device='cuda:0')},
  {'word': ' Jack', 'similarity': tensor(0.1659, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1644, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.1640, device='cuda:0')},
  {'word': ' Peter', 'similarity': tensor(0.1608, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1574, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1569, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1565, device='cuda:0')},
  {'word': ' Kate', 'similarity': tensor(0.1536, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1510, device='cuda:0')},
  {'word': ' Chester', 'similarity': tensor(0.1509, device='cuda:0')},
  {'word': ' Jam', 'similarity': tensor(0.1506, device='cuda:0')},
  {'word': ' Wil', 'similarity': tens

In [None]:
# dataset1 with third argument 29 — presumably the layer index; TODO confirm.
# NOTE(review): no fifth argument is passed, yet the saved output shows
# word/similarity dicts — the default may have changed between runs; verify.
pca_similar_tokens(gpt2_xl, dataset1, 29, 15)

([{'word': ' Taylor', 'similarity': tensor(0.2592, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.2026, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1849, device='cuda:0')},
  {'word': ' Jack', 'similarity': tensor(0.1844, device='cuda:0')},
  {'word': ' Thom', 'similarity': tensor(0.1836, device='cuda:0')},
  {'word': ' Smith', 'similarity': tensor(0.1833, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.1814, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1804, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1802, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1800, device='cuda:0')},
  {'word': ' Tay', 'similarity': tensor(0.1779, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1767, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1744, device='cuda:0')},
  {'word': ' Kate', 'similarity': tensor(0.1743, device='cuda:0')},
  {'word': ' Clara', 'similarity': tenso

In [None]:
# dataset1 with third argument 37 — presumably the layer index; TODO confirm.
# NOTE(review): no fifth argument is passed, yet the saved output shows
# word/similarity dicts — the default may have changed between runs; verify.
pca_similar_tokens(gpt2_xl, dataset1, 37, 15)

([{'word': ' Taylor', 'similarity': tensor(0.4354, device='cuda:0')},
  {'word': 'Taylor', 'similarity': tensor(0.2981, device='cuda:0')},
  {'word': ' Tay', 'similarity': tensor(0.2827, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.2409, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.2328, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.2193, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.2162, device='cuda:0')},
  {'word': ' Joshua', 'similarity': tensor(0.2153, device='cuda:0')},
  {'word': ' Tyler', 'similarity': tensor(0.2151, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.2087, device='cuda:0')},
  {'word': ' Michael', 'similarity': tensor(0.2065, device='cuda:0')},
  {'word': ' Tiffany', 'similarity': tensor(0.2057, device='cuda:0')},
  {'word': ' Sophia', 'similarity': tensor(0.2052, device='cuda:0')},
  {'word': ' Timothy', 'similarity': tensor(0.2013, device='cuda:0')},
  {'word': ' named', 'si

In [None]:
# dataset1 with third argument 45 — presumably the layer index; TODO confirm.
# NOTE(review): no fifth argument is passed, yet the saved output shows
# word/similarity dicts — the default may have changed between runs; verify.
pca_similar_tokens(gpt2_xl, dataset1, 45, 15)

([{'word': ' Taylor', 'similarity': tensor(0.2409, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1750, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.1666, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1627, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1585, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1543, device='cuda:0')},
  {'word': ' the', 'similarity': tensor(0.1513, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.1500, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1447, device='cuda:0')},
  {'word': " '", 'similarity': tensor(0.1426, device='cuda:0')},
  {'word': ' a', 'similarity': tensor(0.1403, device='cuda:0')},
  {'word': ' Michael', 'similarity': tensor(0.1396, device='cuda:0')},
  {'word': ' David', 'similarity': tensor(0.1390, device='cuda:0')},
  {'word': ' A', 'similarity': tensor(0.1383, device='cuda:0')},
  {'word': ' Tay', 'similarity': tensor(0.1368, de

In [None]:
# Same query on dataset1 with 47 as the third argument
# (if this is a layer index, 47 would be the final block of GPT-2 XL -- TODO confirm).
pca_similar_tokens(gpt2_xl, dataset1, 47, 15)

([{'word': ' Taylor', 'similarity': tensor(0.2008, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1667, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.1526, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1516, device='cuda:0')},
  {'word': ' the', 'similarity': tensor(0.1506, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1397, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1396, device='cuda:0')},
  {'word': ' a', 'similarity': tensor(0.1389, device='cuda:0')},
  {'word': " '", 'similarity': tensor(0.1327, device='cuda:0')},
  {'word': ' A', 'similarity': tensor(0.1317, device='cuda:0')},
  {'word': '\n', 'similarity': tensor(0.1312, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.1288, device='cuda:0')},
  {'word': ' L', 'similarity': tensor(0.1288, device='cuda:0')},
  {'word': ' E', 'similarity': tensor(0.1274, device='cuda:0')},
  {'word': ',', 'similarity': tensor(0.1273, device='cuda:0')}],


In [None]:
# Start of the dataset3 sweep (third argument 0; presumably a layer index -- confirm).
# NOTE(review): the saved output under this cell is a pair of plain token lists, whereas
# most other cells show dicts with 'word'/'similarity' -- it looks stale; re-run to refresh.
pca_similar_tokens(gpt2_xl, dataset3, 0, 15)

([' originally',
  ' supposed',
  ' painted',
  ' sent',
  ' once',
  ' founded',
  ' taken',
  ' mag',
  ' ruled',
  ' destroyed',
  ' formerly',
  ' photographed',
  ' replaced',
  ' relegated',
  ' initially'],
 ['�',
  '�',
  '\x1a',
  '\x02',
  '\x12',
  '\x0c',
  '\r',
  '\x04',
  '\x16',
  '\t',
  '�',
  '�',
  ' サーティ',
  '\x18',
  '\x1b'])

In [None]:
# dataset3 with 9 as the third argument (presumably a layer index -- confirm).
# NOTE(review): the saved output under this cell is a pair of plain token lists rather than
# 'word'/'similarity' dicts -- it looks stale; re-run to refresh.
pca_similar_tokens(gpt2_xl, dataset3, 9, 15)

([' supposed',
  ' very',
  ' sent',
  ' famous',
  ' sitting',
  ' seated',
  ' emb',
  ' taken',
  ' made',
  ' riding',
  ' painted',
  ' honored',
  ' determined',
  ' named',
  ' obviously'],
 ['InstoreAndOnline',
  '\x12',
  '\x1a',
  '�',
  '\x02',
  '\r',
  '\x16',
  '�',
  '�',
  '\x0c',
  '\t',
  '�',
  '\x07',
  '\x1d',
  '�'])

In [None]:
# dataset3 with 18 as the third argument (presumably a layer index -- confirm).
# NOTE(review): the saved output under this cell is a pair of plain token lists rather than
# 'word'/'similarity' dicts -- it looks stale; re-run to refresh.
pca_similar_tokens(gpt2_xl, dataset3, 18, 15)

([' supposed',
  ' Romeo',
  ' Pats',
  ' George',
  ' ill',
  ' "',
  ' very',
  ' Ober',
  ' N',
  ' mar',
  ' kept',
  ' named',
  ' hom',
  ' brother',
  ' Ny'],
 [' actionGroup',
  ' antidepress',
  'ipeg',
  '/$',
  'claimer',
  'ailability',
  'outheast',
  '�',
  'WINDOWS',
  ' IMAGES',
  ' Flavoring',
  'awatts',
  '://',
  'ventory',
  'cloneembedreportprint'])

In [None]:
# dataset3 with 27 as the third argument (presumably a layer index -- confirm against the definition).
pca_similar_tokens(gpt2_xl, dataset3, 27, 15)

([{'word': ' called', 'similarity': tensor(0.2035, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.1895, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1769, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1700, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1663, device='cuda:0')},
  {'word': ' Jam', 'similarity': tensor(0.1589, device='cuda:0')},
  {'word': ' Abel', 'similarity': tensor(0.1554, device='cuda:0')},
  {'word': ' Peter', 'similarity': tensor(0.1553, device='cuda:0')},
  {'word': ' Angelo', 'similarity': tensor(0.1545, device='cuda:0')},
  {'word': ' Joseph', 'similarity': tensor(0.1541, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1539, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1536, device='cuda:0')},
  {'word': ' Taylor', 'similarity': tensor(0.1531, device='cuda:0')},
  {'word': ' Jeremiah', 'similarity': tensor(0.1513, device='cuda:0')},
  {'word': ' Joshua', 'simila

In [None]:
# dataset3 with 29 as the third argument (presumably a layer index -- confirm against the definition).
pca_similar_tokens(gpt2_xl, dataset3, 29, 15)

([{'word': ' Taylor', 'similarity': tensor(0.2345, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.2160, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.2078, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.2055, device='cuda:0')},
  {'word': ' Jacob', 'similarity': tensor(0.1950, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1936, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1908, device='cuda:0')},
  {'word': ' Thom', 'similarity': tensor(0.1819, device='cuda:0')},
  {'word': ' Joseph', 'similarity': tensor(0.1798, device='cuda:0')},
  {'word': ' Sam', 'similarity': tensor(0.1783, device='cuda:0')},
  {'word': ' Patrick', 'similarity': tensor(0.1780, device='cuda:0')},
  {'word': ' Jam', 'similarity': tensor(0.1768, device='cuda:0')},
  {'word': ' Robert', 'similarity': tensor(0.1762, device='cuda:0')},
  {'word': ' Smith', 'similarity': tensor(0.1758, device='cuda:0')},
  {'word': ' Charles', 'simila

In [None]:
# dataset3 with 45 as the third argument (presumably a layer index -- confirm against the definition).
pca_similar_tokens(gpt2_xl, dataset3, 45, 15)

([{'word': ' Taylor', 'similarity': tensor(0.2387, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1837, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.1834, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1813, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1713, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1693, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.1615, device='cuda:0')},
  {'word': ' named', 'similarity': tensor(0.1562, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1521, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1520, device='cuda:0')},
  {'word': ' Charles', 'similarity': tensor(0.1506, device='cuda:0')},
  {'word': ' David', 'similarity': tensor(0.1484, device='cuda:0')},
  {'word': ' Robert', 'similarity': tensor(0.1472, device='cuda:0')},
  {'word': ' Michael', 'similarity': tensor(0.1468, device='cuda:0')},
  {'word': ' also', 'similarity'

In [None]:
# dataset3 with 47 as the third argument
# (if this is a layer index, 47 would be the final block of GPT-2 XL -- TODO confirm).
pca_similar_tokens(gpt2_xl, dataset3, 47, 15)

all activations shape is:  torch.Size([500, 1600])


([{'word': ' Taylor', 'similarity': tensor(0.1984, device='cuda:0')},
  {'word': ' John', 'similarity': tensor(0.1678, device='cuda:0')},
  {'word': ' "', 'similarity': tensor(0.1634, device='cuda:0')},
  {'word': ' called', 'similarity': tensor(0.1594, device='cuda:0')},
  {'word': ' James', 'similarity': tensor(0.1545, device='cuda:0')},
  {'word': ' William', 'similarity': tensor(0.1544, device='cuda:0')},
  {'word': ' the', 'similarity': tensor(0.1473, device='cuda:0')},
  {'word': ' T', 'similarity': tensor(0.1462, device='cuda:0')},
  {'word': ' also', 'similarity': tensor(0.1392, device='cuda:0')},
  {'word': ' Thomas', 'similarity': tensor(0.1383, device='cuda:0')},
  {'word': ' George', 'similarity': tensor(0.1381, device='cuda:0')},
  {'word': " '", 'similarity': tensor(0.1333, device='cuda:0')},
  {'word': ' a', 'similarity': tensor(0.1327, device='cuda:0')},
  {'word': ' Michael', 'similarity': tensor(0.1323, device='cuda:0')},
  {'word': ' David', 'similarity': tensor(0.13

## Difference Vectors!

In [None]:
# Fantasy vs. sports datasets.
# NOTE(review): the trailing positional args (0, 15, True) are presumably layer index, number of
# tokens to list, and a boolean option -- confirm against the difference_similar_tokens definition
# and consider passing them by keyword. The saved 'similarity' values exceed 1, so they do not
# look like cosine similarities.
difference_similar_tokens(gpt2_xl, dataset_fantasy, dataset_sports, 0, 15, True)

([{'word': ' sorcerer', 'similarity': tensor(1.2320, device='cuda:0')},
  {'word': ' Elven', 'similarity': tensor(1.2230, device='cuda:0')},
  {'word': ' Sorceress', 'similarity': tensor(1.1793, device='cuda:0')},
  {'word': ' Realms', 'similarity': tensor(1.1566, device='cuda:0')},
  {'word': ' sorce', 'similarity': tensor(1.1554, device='cuda:0')},
  {'word': ' Realm', 'similarity': tensor(1.1395, device='cuda:0')},
  {'word': ' Lich', 'similarity': tensor(1.1048, device='cuda:0')},
  {'word': ' Enchant', 'similarity': tensor(1.1030, device='cuda:0')},
  {'word': ' realms', 'similarity': tensor(1.1017, device='cuda:0')},
  {'word': ' Grail', 'similarity': tensor(1.0747, device='cuda:0')},
  {'word': ' artifact', 'similarity': tensor(1.0433, device='cuda:0')},
  {'word': ' Pry', 'similarity': tensor(1.0233, device='cuda:0')},
  {'word': ' Sorce', 'similarity': tensor(1.0039, device='cuda:0')},
  {'word': ' dru', 'similarity': tensor(0.9880, device='cuda:0')},
  {'word': ' witch', 'sim

In [None]:
# Fantasy vs. sports, with 4 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_fantasy, dataset_sports, 4, 15, True)

([{'word': ' Elven', 'similarity': tensor(3.1885, device='cuda:0')},
  {'word': ' sorce', 'similarity': tensor(3.1781, device='cuda:0')},
  {'word': ' Enchant', 'similarity': tensor(3.0993, device='cuda:0')},
  {'word': ' Sorceress', 'similarity': tensor(2.7440, device='cuda:0')},
  {'word': ' Realms', 'similarity': tensor(2.6730, device='cuda:0')},
  {'word': ' Amber', 'similarity': tensor(2.6637, device='cuda:0')},
  {'word': ' Isle', 'similarity': tensor(2.6475, device='cuda:0')},
  {'word': ' shaman', 'similarity': tensor(2.5687, device='cuda:0')},
  {'word': ' sorcerer', 'similarity': tensor(2.5631, device='cuda:0')},
  {'word': ' Sorce', 'similarity': tensor(2.5349, device='cuda:0')},
  {'word': ' adventurer', 'similarity': tensor(2.5214, device='cuda:0')},
  {'word': ' Siren', 'similarity': tensor(2.5015, device='cuda:0')},
  {'word': ' witch', 'similarity': tensor(2.4755, device='cuda:0')},
  {'word': ' mystical', 'similarity': tensor(2.4730, device='cuda:0')},
  {'word': ' Wyr

In [None]:
# Fantasy vs. sports, with 20 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_fantasy, dataset_sports, 20, 15, True)

([{'word': ' sorce', 'similarity': tensor(6.3246, device='cuda:0')},
  {'word': ' Amber', 'similarity': tensor(5.8252, device='cuda:0')},
  {'word': ' Elven', 'similarity': tensor(5.6478, device='cuda:0')},
  {'word': ' Realms', 'similarity': tensor(5.6010, device='cuda:0')},
  {'word': ' Healing', 'similarity': tensor(5.5226, device='cuda:0')},
  {'word': ' Enchant', 'similarity': tensor(5.4253, device='cuda:0')},
  {'word': ' crystal', 'similarity': tensor(5.4125, device='cuda:0')},
  {'word': ' arcane', 'similarity': tensor(5.3624, device='cuda:0')},
  {'word': ' Serpent', 'similarity': tensor(5.3319, device='cuda:0')},
  {'word': ' Pixie', 'similarity': tensor(5.2345, device='cuda:0')},
  {'word': ' Gaia', 'similarity': tensor(5.2133, device='cuda:0')},
  {'word': ' Lilith', 'similarity': tensor(5.2079, device='cuda:0')},
  {'word': ' arte', 'similarity': tensor(5.1456, device='cuda:0')},
  {'word': ' Mistress', 'similarity': tensor(5.1339, device='cuda:0')},
  {'word': ' Arcane', 

In [None]:
# Fantasy vs. sports, with 47 as the fourth argument
# (if this is a layer index, 47 would be the final block of GPT-2 XL -- TODO confirm).
difference_similar_tokens(gpt2_xl, dataset_fantasy, dataset_sports, 47, 15, True)

([{'word': ' enchanted', 'similarity': tensor(55.9487, device='cuda:0')},
  {'word': ' sorce', 'similarity': tensor(51.0747, device='cuda:0')},
  {'word': ' Elven', 'similarity': tensor(49.9904, device='cuda:0')},
  {'word': ' Sorceress', 'similarity': tensor(48.3182, device='cuda:0')},
  {'word': ' Realms', 'similarity': tensor(47.8612, device='cuda:0')},
  {'word': ' corrupted', 'similarity': tensor(47.2285, device='cuda:0')},
  {'word': ' mystical', 'similarity': tensor(46.5397, device='cuda:0')},
  {'word': ' Carbuncle', 'similarity': tensor(46.4714, device='cuda:0')},
  {'word': ' enchantment', 'similarity': tensor(45.8021, device='cuda:0')},
  {'word': ' enchant', 'similarity': tensor(45.7227, device='cuda:0')},
  {'word': ' prophe', 'similarity': tensor(45.2086, device='cuda:0')},
  {'word': ' sentient', 'similarity': tensor(44.9766, device='cuda:0')},
  {'word': ' Grimoire', 'similarity': tensor(44.3376, device='cuda:0')},
  {'word': ' Sorcerer', 'similarity': tensor(43.8078, d

### Fantasy - Baseline

In [None]:
# Housekeeping before the next batch of runs: collect unreachable Python objects first,
# then ask PyTorch (`t`) to release its unused cached CUDA memory. Keep this order --
# presumably the collection is what makes dead tensors' memory eligible for release.
# NOTE(review): `gc` is re-imported in several cells; consider importing it once in the
# notebook's import cell.
import gc
gc.collect()
t.cuda.empty_cache()

In [None]:
# Fantasy vs. the baseline subset (training_subset_small).
# NOTE(review): the trailing positional args (0, 15, True) are presumably layer index, number of
# tokens to list, and a boolean option -- confirm against the difference_similar_tokens definition.
difference_similar_tokens(gpt2_xl, dataset_fantasy, training_subset_small, 0, 15, True)

([{'word': ' Elven', 'similarity': tensor(1.5935, device='cuda:0')},
  {'word': ' warrior', 'similarity': tensor(1.4612, device='cuda:0')},
  {'word': ' jewel', 'similarity': tensor(1.4392, device='cuda:0')},
  {'word': ' enchantment', 'similarity': tensor(1.3887, device='cuda:0')},
  {'word': ' realms', 'similarity': tensor(1.3553, device='cuda:0')},
  {'word': ' magical', 'similarity': tensor(1.3543, device='cuda:0')},
  {'word': ' Celestial', 'similarity': tensor(1.3510, device='cuda:0')},
  {'word': ' Primordial', 'similarity': tensor(1.3426, device='cuda:0')},
  {'word': ' elf', 'similarity': tensor(1.3423, device='cuda:0')},
  {'word': ' elemental', 'similarity': tensor(1.3258, device='cuda:0')},
  {'word': ' enchanted', 'similarity': tensor(1.3258, device='cuda:0')},
  {'word': ' magically', 'similarity': tensor(1.3108, device='cuda:0')},
  {'word': ' celestial', 'similarity': tensor(1.3063, device='cuda:0')},
  {'word': ' warriors', 'similarity': tensor(1.3055, device='cuda:0')

In [None]:
# Fantasy vs. baseline, with 10 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_fantasy, training_subset_small, 10, 15, True)

([{'word': ' sorce', 'similarity': tensor(5.6013, device='cuda:0')},
  {'word': ' Elven', 'similarity': tensor(5.2541, device='cuda:0')},
  {'word': ' crystals', 'similarity': tensor(5.1913, device='cuda:0')},
  {'word': ' sword', 'similarity': tensor(4.9080, device='cuda:0')},
  {'word': ' shards', 'similarity': tensor(4.6676, device='cuda:0')},
  {'word': ' teleportation', 'similarity': tensor(4.6558, device='cuda:0')},
  {'word': ' sorcerer', 'similarity': tensor(4.5590, device='cuda:0')},
  {'word': ' shaman', 'similarity': tensor(4.5459, device='cuda:0')},
  {'word': ' knights', 'similarity': tensor(4.5323, device='cuda:0')},
  {'word': ' blade', 'similarity': tensor(4.5259, device='cuda:0')},
  {'word': ' crimson', 'similarity': tensor(4.5175, device='cuda:0')},
  {'word': ' enchanted', 'similarity': tensor(4.5161, device='cuda:0')},
  {'word': ' lava', 'similarity': tensor(4.5161, device='cuda:0')},
  {'word': ' maiden', 'similarity': tensor(4.4905, device='cuda:0')},
  {'word':

In [None]:
# Fantasy vs. baseline, with 20 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_fantasy, training_subset_small, 20, 15, True)

([{'word': ' sorce', 'similarity': tensor(9.4969, device='cuda:0')},
  {'word': ' enchanted', 'similarity': tensor(8.6562, device='cuda:0')},
  {'word': ' visions', 'similarity': tensor(8.1587, device='cuda:0')},
  {'word': ' runes', 'similarity': tensor(8.1337, device='cuda:0')},
  {'word': ' shards', 'similarity': tensor(8.0894, device='cuda:0')},
  {'word': ' enchant', 'similarity': tensor(8.0438, device='cuda:0')},
  {'word': ' crimson', 'similarity': tensor(7.9101, device='cuda:0')},
  {'word': ' crystals', 'similarity': tensor(7.8458, device='cuda:0')},
  {'word': ' mage', 'similarity': tensor(7.8325, device='cuda:0')},
  {'word': ' Elven', 'similarity': tensor(7.8050, device='cuda:0')},
  {'word': ' arcane', 'similarity': tensor(7.7952, device='cuda:0')},
  {'word': ' magical', 'similarity': tensor(7.7806, device='cuda:0')},
  {'word': ' wand', 'similarity': tensor(7.6282, device='cuda:0')},
  {'word': ' mages', 'similarity': tensor(7.5915, device='cuda:0')},
  {'word': ' telepo

In [None]:
# Fantasy vs. baseline, with 29 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_fantasy, training_subset_small, 29, 15, True)

([{'word': ' enchanted', 'similarity': tensor(24.2016, device='cuda:0')},
  {'word': ' mystical', 'similarity': tensor(21.4413, device='cuda:0')},
  {'word': ' magical', 'similarity': tensor(20.7328, device='cuda:0')},
  {'word': ' awakened', 'similarity': tensor(20.3526, device='cuda:0')},
  {'word': ' sorce', 'similarity': tensor(20.2063, device='cuda:0')},
  {'word': ' enchant', 'similarity': tensor(19.4190, device='cuda:0')},
  {'word': ' runes', 'similarity': tensor(19.0198, device='cuda:0')},
  {'word': ' mystic', 'similarity': tensor(18.7535, device='cuda:0')},
  {'word': ' enchantment', 'similarity': tensor(18.6639, device='cuda:0')},
  {'word': ' arcane', 'similarity': tensor(18.5177, device='cuda:0')},
  {'word': ' treasures', 'similarity': tensor(18.4305, device='cuda:0')},
  {'word': ' awakening', 'similarity': tensor(18.4127, device='cuda:0')},
  {'word': ' shaman', 'similarity': tensor(18.4119, device='cuda:0')},
  {'word': ' goddess', 'similarity': tensor(18.3436, device

In [None]:
# Fantasy vs. baseline, with 30 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_fantasy, training_subset_small, 30, 15, True)

([{'word': ' enchanted', 'similarity': tensor(27.2222, device='cuda:0')},
  {'word': ' mystical', 'similarity': tensor(24.3666, device='cuda:0')},
  {'word': ' magical', 'similarity': tensor(23.1472, device='cuda:0')},
  {'word': ' awakened', 'similarity': tensor(22.6172, device='cuda:0')},
  {'word': ' sorce', 'similarity': tensor(22.2151, device='cuda:0')},
  {'word': ' enchant', 'similarity': tensor(21.9208, device='cuda:0')},
  {'word': ' mystic', 'similarity': tensor(21.2233, device='cuda:0')},
  {'word': ' arcane', 'similarity': tensor(21.0342, device='cuda:0')},
  {'word': ' awakening', 'similarity': tensor(20.8125, device='cuda:0')},
  {'word': ' destiny', 'similarity': tensor(20.7555, device='cuda:0')},
  {'word': ' enchantment', 'similarity': tensor(20.5637, device='cuda:0')},
  {'word': ' magic', 'similarity': tensor(20.4918, device='cuda:0')},
  {'word': ' awaken', 'similarity': tensor(20.3906, device='cuda:0')},
  {'word': ' treasures', 'similarity': tensor(20.0586, device

In [None]:
# Fantasy vs. baseline, with 40 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_fantasy, training_subset_small, 40, 15, True)

([{'word': ' enchanted', 'similarity': tensor(61.0661, device='cuda:0')},
  {'word': ' mystical', 'similarity': tensor(51.2953, device='cuda:0')},
  {'word': ' magical', 'similarity': tensor(50.2876, device='cuda:0')},
  {'word': ' sorce', 'similarity': tensor(47.6175, device='cuda:0')},
  {'word': ' Elven', 'similarity': tensor(46.0082, device='cuda:0')},
  {'word': ' enchantment', 'similarity': tensor(45.7933, device='cuda:0')},
  {'word': ' wond', 'similarity': tensor(45.6981, device='cuda:0')},
  {'word': ' mystic', 'similarity': tensor(44.9179, device='cuda:0')},
  {'word': ' awakened', 'similarity': tensor(44.5441, device='cuda:0')},
  {'word': ' magic', 'similarity': tensor(44.0878, device='cuda:0')},
  {'word': ' arcane', 'similarity': tensor(44.0601, device='cuda:0')},
  {'word': ' enchant', 'similarity': tensor(43.4395, device='cuda:0')},
  {'word': ' millennia', 'similarity': tensor(43.2334, device='cuda:0')},
  {'word': ' awakening', 'similarity': tensor(43.1318, device='cu

### Sci-fi - Baseline

In [None]:
# Housekeeping before the sci-fi runs: collect unreachable Python objects first, then ask
# PyTorch (`t`) to release its unused cached CUDA memory. Keep this order -- presumably the
# collection is what makes dead tensors' memory eligible for release.
# NOTE(review): `gc` is re-imported in several cells; consider importing it once in the
# notebook's import cell.
import gc
gc.collect()
t.cuda.empty_cache()

In [None]:
# Sci-fi vs. the baseline subset (training_subset_small).
# NOTE(review): the trailing positional args (0, 15, True) are presumably layer index, number of
# tokens to list, and a boolean option -- confirm against the difference_similar_tokens definition.
difference_similar_tokens(gpt2_xl, dataset_scifi, training_subset_small, 0, 15, True)

([{'word': ' cosmic', 'similarity': tensor(1.3879, device='cuda:0')},
  {'word': ' interstellar', 'similarity': tensor(1.3638, device='cuda:0')},
  {'word': ' asteroid', 'similarity': tensor(1.3117, device='cuda:0')},
  {'word': ' disemb', 'similarity': tensor(1.3032, device='cuda:0')},
  {'word': ' dimensional', 'similarity': tensor(1.2861, device='cuda:0')},
  {'word': ' Celestial', 'similarity': tensor(1.2771, device='cuda:0')},
  {'word': ' wasteland', 'similarity': tensor(1.2605, device='cuda:0')},
  {'word': ' explorer', 'similarity': tensor(1.2599, device='cuda:0')},
  {'word': ' fireball', 'similarity': tensor(1.2589, device='cuda:0')},
  {'word': ' beings', 'similarity': tensor(1.2111, device='cuda:0')},
  {'word': ' adventurer', 'similarity': tensor(1.2102, device='cuda:0')},
  {'word': ' teleportation', 'similarity': tensor(1.2082, device='cuda:0')},
  {'word': ' loneliness', 'similarity': tensor(1.2030, device='cuda:0')},
  {'word': ' Primordial', 'similarity': tensor(1.201

In [None]:
# Sci-fi vs. baseline, with 10 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_scifi, training_subset_small, 10, 15, True)

([{'word': ' interstellar', 'similarity': tensor(4.9854, device='cuda:0')},
  {'word': ' teleportation', 'similarity': tensor(4.9110, device='cuda:0')},
  {'word': ' warp', 'similarity': tensor(4.8199, device='cuda:0')},
  {'word': ' asteroid', 'similarity': tensor(4.4373, device='cuda:0')},
  {'word': ' teleport', 'similarity': tensor(4.3154, device='cuda:0')},
  {'word': ' crystals', 'similarity': tensor(4.3006, device='cuda:0')},
  {'word': ' sorce', 'similarity': tensor(4.2824, device='cuda:0')},
  {'word': ' Templar', 'similarity': tensor(4.2625, device='cuda:0')},
  {'word': ' galactic', 'similarity': tensor(4.1699, device='cuda:0')},
  {'word': ' starship', 'similarity': tensor(4.1691, device='cuda:0')},
  {'word': ' ruins', 'similarity': tensor(4.1431, device='cuda:0')},
  {'word': ' fuse', 'similarity': tensor(4.1069, device='cuda:0')},
  {'word': ' wasteland', 'similarity': tensor(4.0958, device='cuda:0')},
  {'word': ' shards', 'similarity': tensor(4.0201, device='cuda:0')},

In [None]:
# Sci-fi vs. baseline, with 20 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_scifi, training_subset_small, 20, 15, True)

([{'word': ' interstellar', 'similarity': tensor(8.3020, device='cuda:0')},
  {'word': ' teleportation', 'similarity': tensor(8.2517, device='cuda:0')},
  {'word': ' humanity', 'similarity': tensor(8.1931, device='cuda:0')},
  {'word': ' humankind', 'similarity': tensor(7.9206, device='cuda:0')},
  {'word': ' mankind', 'similarity': tensor(7.8071, device='cuda:0')},
  {'word': 'PsyNetMessage', 'similarity': tensor(7.7062, device='cuda:0')},
  {'word': ' sorce', 'similarity': tensor(7.5029, device='cuda:0')},
  {'word': ' warp', 'similarity': tensor(7.3675, device='cuda:0')},
  {'word': ' mutants', 'similarity': tensor(7.2903, device='cuda:0')},
  {'word': ' starship', 'similarity': tensor(7.2699, device='cuda:0')},
  {'word': ' shards', 'similarity': tensor(7.1393, device='cuda:0')},
  {'word': ' irrad', 'similarity': tensor(7.1326, device='cuda:0')},
  {'word': ' civilization', 'similarity': tensor(7.1110, device='cuda:0')},
  {'word': ' cosmic', 'similarity': tensor(7.0971, device='c

In [None]:
# Sci-fi vs. baseline, with 29 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_scifi, training_subset_small, 29, 15, True)

([{'word': ' humankind', 'similarity': tensor(23.0412, device='cuda:0')},
  {'word': ' humanity', 'similarity': tensor(21.1181, device='cuda:0')},
  {'word': ' mankind', 'similarity': tensor(21.0091, device='cuda:0')},
  {'word': ' millennia', 'similarity': tensor(20.2334, device='cuda:0')},
  {'word': ' interstellar', 'similarity': tensor(20.1033, device='cuda:0')},
  {'word': ' galactic', 'similarity': tensor(18.8764, device='cuda:0')},
  {'word': ' civilization', 'similarity': tensor(18.8511, device='cuda:0')},
  {'word': ' cosmic', 'similarity': tensor(18.3858, device='cuda:0')},
  {'word': ' awakened', 'similarity': tensor(18.0623, device='cuda:0')},
  {'word': ' starship', 'similarity': tensor(17.8426, device='cuda:0')},
  {'word': ' teleportation', 'similarity': tensor(17.8221, device='cuda:0')},
  {'word': ' ensl', 'similarity': tensor(17.8137, device='cuda:0')},
  {'word': ' sentient', 'similarity': tensor(17.5726, device='cuda:0')},
  {'word': ' destiny', 'similarity': tensor

In [None]:
# Sci-fi vs. baseline, with 30 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_scifi, training_subset_small, 30, 15, True)

([{'word': ' humankind', 'similarity': tensor(24.7868, device='cuda:0')},
  {'word': ' mankind', 'similarity': tensor(22.9134, device='cuda:0')},
  {'word': ' humanity', 'similarity': tensor(22.7502, device='cuda:0')},
  {'word': ' millennia', 'similarity': tensor(22.0554, device='cuda:0')},
  {'word': ' interstellar', 'similarity': tensor(21.9002, device='cuda:0')},
  {'word': ' civilization', 'similarity': tensor(20.3703, device='cuda:0')},
  {'word': ' cosmic', 'similarity': tensor(20.0823, device='cuda:0')},
  {'word': ' galactic', 'similarity': tensor(20.0506, device='cuda:0')},
  {'word': ' awakened', 'similarity': tensor(19.7769, device='cuda:0')},
  {'word': ' destiny', 'similarity': tensor(19.2815, device='cuda:0')},
  {'word': ' ensl', 'similarity': tensor(19.2232, device='cuda:0')},
  {'word': ' Humanity', 'similarity': tensor(19.0211, device='cuda:0')},
  {'word': ' teleportation', 'similarity': tensor(19.0131, device='cuda:0')},
  {'word': ' starship', 'similarity': tensor

In [None]:
# Sci-fi vs. baseline, with 40 as the fourth argument (presumably a layer index -- confirm).
difference_similar_tokens(gpt2_xl, dataset_scifi, training_subset_small, 40, 15, True)

([{'word': ' humanity', 'similarity': tensor(53.0644, device='cuda:0')},
  {'word': ' humankind', 'similarity': tensor(52.8094, device='cuda:0')},
  {'word': ' interstellar', 'similarity': tensor(52.0640, device='cuda:0')},
  {'word': ' mankind', 'similarity': tensor(50.7920, device='cuda:0')},
  {'word': ' Humanity', 'similarity': tensor(48.9388, device='cuda:0')},
  {'word': ' millennia', 'similarity': tensor(48.7351, device='cuda:0')},
  {'word': ' galactic', 'similarity': tensor(47.0291, device='cuda:0')},
  {'word': ' civilization', 'similarity': tensor(44.8525, device='cuda:0')},
  {'word': ' sentient', 'similarity': tensor(44.4629, device='cuda:0')},
  {'word': ' Earth', 'similarity': tensor(44.1430, device='cuda:0')},
  {'word': ' civilizations', 'similarity': tensor(43.6744, device='cuda:0')},
  {'word': ' starship', 'similarity': tensor(41.0151, device='cuda:0')},
  {'word': ' Mankind', 'similarity': tensor(40.6393, device='cuda:0')},
  {'word': ' enslaved', 'similarity': ten

### Sports - Baseline

In [None]:
import gc
# Free memory before the Sports runs: collect unreachable Python objects first,
# then ask PyTorch (`t` is torch) to release its cached, unused CUDA memory.
# Order matters: GPU memory held by still-uncollected objects cannot be released.
gc.collect()
t.cuda.empty_cache()

In [None]:
# dataset_sports as the first dataset, training_subset_small as the second; 4th argument = 0.
# NOTE(review): 0 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, training_subset_small, 0, 15, True)

([{'word': ' swirling', 'similarity': tensor(0.9769, device='cuda:0')},
  {'word': ' clenched', 'similarity': tensor(0.9573, device='cuda:0')},
  {'word': ' sto', 'similarity': tensor(0.9085, device='cuda:0')},
  {'word': ' pounding', 'similarity': tensor(0.9057, device='cuda:0')},
  {'word': ' flames', 'similarity': tensor(0.9015, device='cuda:0')},
  {'word': ' longing', 'similarity': tensor(0.8974, device='cuda:0')},
  {'word': ' clasp', 'similarity': tensor(0.8902, device='cuda:0')},
  {'word': ' grit', 'similarity': tensor(0.8879, device='cuda:0')},
  {'word': ' gripping', 'similarity': tensor(0.8874, device='cuda:0')},
  {'word': ' trembling', 'similarity': tensor(0.8518, device='cuda:0')},
  {'word': ' euph', 'similarity': tensor(0.8444, device='cuda:0')},
  {'word': ' fists', 'similarity': tensor(0.8423, device='cuda:0')},
  {'word': ' towering', 'similarity': tensor(0.8361, device='cuda:0')},
  {'word': ' loving', 'similarity': tensor(0.8276, device='cuda:0')},
  {'word': ' ro

In [None]:
# dataset_sports as the first dataset, training_subset_small as the second; 4th argument = 10.
# NOTE(review): 10 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, training_subset_small, 10, 15, True)

([{'word': ' crimson', 'similarity': tensor(3.9103, device='cuda:0')},
  {'word': ' chanting', 'similarity': tensor(3.7771, device='cuda:0')},
  {'word': ' trembling', 'similarity': tensor(3.7767, device='cuda:0')},
  {'word': ' bouncing', 'similarity': tensor(3.7527, device='cuda:0')},
  {'word': ' flames', 'similarity': tensor(3.5360, device='cuda:0')},
  {'word': ' clenched', 'similarity': tensor(3.4981, device='cuda:0')},
  {'word': ' paced', 'similarity': tensor(3.4928, device='cuda:0')},
  {'word': ' gripping', 'similarity': tensor(3.4533, device='cuda:0')},
  {'word': ' crest', 'similarity': tensor(3.4416, device='cuda:0')},
  {'word': ' brightest', 'similarity': tensor(3.4087, device='cuda:0')},
  {'word': ' stride', 'similarity': tensor(3.4012, device='cuda:0')},
  {'word': 'flame', 'similarity': tensor(3.3914, device='cuda:0')},
  {'word': ' tears', 'similarity': tensor(3.3795, device='cuda:0')},
  {'word': ' rhyth', 'similarity': tensor(3.3352, device='cuda:0')},
  {'word': 

In [None]:
# dataset_sports as the first dataset, training_subset_small as the second; 4th argument = 20.
# NOTE(review): 20 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, training_subset_small, 20, 15, True)

([{'word': ' trembling', 'similarity': tensor(7.9368, device='cuda:0')},
  {'word': ' Gleaming', 'similarity': tensor(7.7750, device='cuda:0')},
  {'word': ' chants', 'similarity': tensor(7.3548, device='cuda:0')},
  {'word': ' chanting', 'similarity': tensor(7.0428, device='cuda:0')},
  {'word': ' momentarily', 'similarity': tensor(6.8777, device='cuda:0')},
  {'word': ' overcame', 'similarity': tensor(6.8410, device='cuda:0')},
  {'word': ' clenched', 'similarity': tensor(6.7948, device='cuda:0')},
  {'word': ' roar', 'similarity': tensor(6.7788, device='cuda:0')},
  {'word': ' crimson', 'similarity': tensor(6.7232, device='cuda:0')},
  {'word': ' exhilar', 'similarity': tensor(6.6321, device='cuda:0')},
  {'word': ' rhyth', 'similarity': tensor(6.5458, device='cuda:0')},
  {'word': ' puls', 'similarity': tensor(6.5434, device='cuda:0')},
  {'word': ' bouncing', 'similarity': tensor(6.5338, device='cuda:0')},
  {'word': ' throb', 'similarity': tensor(6.5189, device='cuda:0')},
  {'wo

In [None]:
# dataset_sports as the first dataset, training_subset_small as the second; 4th argument = 29.
# NOTE(review): 29 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, training_subset_small, 29, 15, True)

([{'word': ' exhilar', 'similarity': tensor(18.3830, device='cuda:0')},
  {'word': ' victorious', 'similarity': tensor(17.7590, device='cuda:0')},
  {'word': ' triumph', 'similarity': tensor(17.3707, device='cuda:0')},
  {'word': ' cheering', 'similarity': tensor(17.2756, device='cuda:0')},
  {'word': ' triumphant', 'similarity': tensor(17.1828, device='cuda:0')},
  {'word': ' adrenaline', 'similarity': tensor(17.1531, device='cuda:0')},
  {'word': ' glory', 'similarity': tensor(17.0817, device='cuda:0')},
  {'word': ' trembling', 'similarity': tensor(16.8489, device='cuda:0')},
  {'word': ' victory', 'similarity': tensor(16.4317, device='cuda:0')},
  {'word': ' chants', 'similarity': tensor(15.7367, device='cuda:0')},
  {'word': ' crimson', 'similarity': tensor(15.6651, device='cuda:0')},
  {'word': ' celebration', 'similarity': tensor(15.6471, device='cuda:0')},
  {'word': ' cheers', 'similarity': tensor(15.3817, device='cuda:0')},
  {'word': ' clenched', 'similarity': tensor(15.3448

In [None]:
# dataset_sports as the first dataset, training_subset_small as the second; 4th argument = 30.
# NOTE(review): 30 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, training_subset_small, 30, 15, True)

([{'word': ' exhilar', 'similarity': tensor(20.3594, device='cuda:0')},
  {'word': ' victorious', 'similarity': tensor(19.4764, device='cuda:0')},
  {'word': ' adrenaline', 'similarity': tensor(19.1527, device='cuda:0')},
  {'word': ' triumph', 'similarity': tensor(18.9922, device='cuda:0')},
  {'word': ' triumphant', 'similarity': tensor(18.6899, device='cuda:0')},
  {'word': ' glory', 'similarity': tensor(18.6271, device='cuda:0')},
  {'word': ' cheering', 'similarity': tensor(18.5982, device='cuda:0')},
  {'word': ' victory', 'similarity': tensor(18.1274, device='cuda:0')},
  {'word': ' trembling', 'similarity': tensor(18.0371, device='cuda:0')},
  {'word': ' chants', 'similarity': tensor(17.2317, device='cuda:0')},
  {'word': ' crimson', 'similarity': tensor(16.8788, device='cuda:0')},
  {'word': ' dazzling', 'similarity': tensor(16.8334, device='cuda:0')},
  {'word': ' celebration', 'similarity': tensor(16.8253, device='cuda:0')},
  {'word': ' thrilling', 'similarity': tensor(16.7

In [None]:
# dataset_sports as the first dataset, training_subset_small as the second; 4th argument = 40.
# NOTE(review): 40 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, training_subset_small, 40, 15, True)

([{'word': ' exhilar', 'similarity': tensor(39.9922, device='cuda:0')},
  {'word': ' victorious', 'similarity': tensor(38.0716, device='cuda:0')},
  {'word': ' triumph', 'similarity': tensor(37.5682, device='cuda:0')},
  {'word': ' adrenaline', 'similarity': tensor(36.5496, device='cuda:0')},
  {'word': ' triumphant', 'similarity': tensor(36.0631, device='cuda:0')},
  {'word': ' cheering', 'similarity': tensor(35.9723, device='cuda:0')},
  {'word': ' chants', 'similarity': tensor(34.4296, device='cuda:0')},
  {'word': ' victory', 'similarity': tensor(34.2409, device='cuda:0')},
  {'word': ' teammates', 'similarity': tensor(32.9563, device='cuda:0')},
  {'word': ' thrilling', 'similarity': tensor(32.7388, device='cuda:0')},
  {'word': ' cheers', 'similarity': tensor(32.5065, device='cuda:0')},
  {'word': ' dazzling', 'similarity': tensor(32.4719, device='cuda:0')},
  {'word': ' glory', 'similarity': tensor(32.4311, device='cuda:0')},
  {'word': ' soaring', 'similarity': tensor(32.4156, 

### Sports - All Stories

In [None]:
# dataset_sports as the first dataset, all_stories as the second; 4th argument = 0.
# NOTE(review): 0 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, all_stories, 0, 15, True)

([{'word': ' Soccer', 'similarity': tensor(0.2834, device='cuda:0')},
  {'word': ' AFL', 'similarity': tensor(0.2712, device='cuda:0')},
  {'word': ' padding', 'similarity': tensor(0.2664, device='cuda:0')},
  {'word': ' hon', 'similarity': tensor(0.2619, device='cuda:0')},
  {'word': ' cheers', 'similarity': tensor(0.2610, device='cuda:0')},
  {'word': ' finals', 'similarity': tensor(0.2600, device='cuda:0')},
  {'word': ' cheering', 'similarity': tensor(0.2573, device='cuda:0')},
  {'word': ' infield', 'similarity': tensor(0.2507, device='cuda:0')},
  {'word': ' uphill', 'similarity': tensor(0.2490, device='cuda:0')},
  {'word': ' volleyball', 'similarity': tensor(0.2389, device='cuda:0')},
  {'word': ' turnout', 'similarity': tensor(0.2332, device='cuda:0')},
  {'word': ' thw', 'similarity': tensor(0.2331, device='cuda:0')},
  {'word': ' Tennis', 'similarity': tensor(0.2310, device='cuda:0')},
  {'word': ' tennis', 'similarity': tensor(0.2233, device='cuda:0')},
  {'word': ' attenda

In [None]:
# dataset_sports as the first dataset, all_stories as the second; 4th argument = 10.
# NOTE(review): 10 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, all_stories, 10, 15, True)


([{'word': ' padding', 'similarity': tensor(1.1901, device='cuda:0')},
  {'word': ' drib', 'similarity': tensor(1.1615, device='cuda:0')},
  {'word': ' coaches', 'similarity': tensor(1.1560, device='cuda:0')},
  {'word': ' thw', 'similarity': tensor(1.1536, device='cuda:0')},
  {'word': ' hoop', 'similarity': tensor(1.1473, device='cuda:0')},
  {'word': ' uphill', 'similarity': tensor(1.1151, device='cuda:0')},
  {'word': ' padded', 'similarity': tensor(1.1106, device='cuda:0')},
  {'word': ' coached', 'similarity': tensor(1.1059, device='cuda:0')},
  {'word': ' applause', 'similarity': tensor(1.0426, device='cuda:0')},
  {'word': 'NBA', 'similarity': tensor(1.0422, device='cuda:0')},
  {'word': ' cheering', 'similarity': tensor(1.0233, device='cuda:0')},
  {'word': ' swe', 'similarity': tensor(1.0211, device='cuda:0')},
  {'word': ' yard', 'similarity': tensor(1.0187, device='cuda:0')},
  {'word': ' AFL', 'similarity': tensor(1.0157, device='cuda:0')},
  {'word': ' athleticism', 'simi

In [None]:
# dataset_sports as the first dataset, all_stories as the second; 4th argument = 20.
# NOTE(review): 20 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, all_stories, 20, 15, True)


([{'word': ' drib', 'similarity': tensor(2.1686, device='cuda:0')},
  {'word': ' applause', 'similarity': tensor(2.1359, device='cuda:0')},
  {'word': ' padding', 'similarity': tensor(2.1064, device='cuda:0')},
  {'word': ' announcer', 'similarity': tensor(2.0937, device='cuda:0')},
  {'word': ' clinch', 'similarity': tensor(2.0800, device='cuda:0')},
  {'word': ' AFL', 'similarity': tensor(2.0711, device='cuda:0')},
  {'word': ' coached', 'similarity': tensor(2.0551, device='cuda:0')},
  {'word': ' hoop', 'similarity': tensor(2.0231, device='cuda:0')},
  {'word': ' athlet', 'similarity': tensor(2.0062, device='cuda:0')},
  {'word': ' jersey', 'similarity': tensor(2.0051, device='cuda:0')},
  {'word': ' spectators', 'similarity': tensor(1.9785, device='cuda:0')},
  {'word': ' coaches', 'similarity': tensor(1.9659, device='cuda:0')},
  {'word': ' infield', 'similarity': tensor(1.9604, device='cuda:0')},
  {'word': ' scoreboard', 'similarity': tensor(1.9252, device='cuda:0')},
  {'word':

In [None]:
# dataset_sports as the first dataset, all_stories as the second; 4th argument = 29.
# NOTE(review): 29 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, all_stories, 29, 15, True)

([{'word': ' applause', 'similarity': tensor(5.6497, device='cuda:0')},
  {'word': ' clinch', 'similarity': tensor(5.6426, device='cuda:0')},
  {'word': ' cheering', 'similarity': tensor(5.5961, device='cuda:0')},
  {'word': ' cheers', 'similarity': tensor(5.5719, device='cuda:0')},
  {'word': ' spectators', 'similarity': tensor(5.3533, device='cuda:0')},
  {'word': ' scoreboard', 'similarity': tensor(5.2183, device='cuda:0')},
  {'word': ' clin', 'similarity': tensor(5.1572, device='cuda:0')},
  {'word': ' Wembley', 'similarity': tensor(5.0453, device='cuda:0')},
  {'word': ' announcer', 'similarity': tensor(5.0010, device='cuda:0')},
  {'word': ' scorer', 'similarity': tensor(4.8137, device='cuda:0')},
  {'word': ' hoop', 'similarity': tensor(4.7874, device='cuda:0')},
  {'word': ' referee', 'similarity': tensor(4.7168, device='cuda:0')},
  {'word': ' coaches', 'similarity': tensor(4.7012, device='cuda:0')},
  {'word': ' drib', 'similarity': tensor(4.6567, device='cuda:0')},
  {'word

In [None]:
# dataset_sports as the first dataset, all_stories as the second; 4th argument = 30.
# NOTE(review): 30 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, all_stories, 30, 15, True)


([{'word': ' applause', 'similarity': tensor(6.1518, device='cuda:0')},
  {'word': ' cheers', 'similarity': tensor(5.9629, device='cuda:0')},
  {'word': ' cheering', 'similarity': tensor(5.9402, device='cuda:0')},
  {'word': ' clinch', 'similarity': tensor(5.8723, device='cuda:0')},
  {'word': ' spectators', 'similarity': tensor(5.8098, device='cuda:0')},
  {'word': ' scoreboard', 'similarity': tensor(5.7732, device='cuda:0')},
  {'word': ' clin', 'similarity': tensor(5.6705, device='cuda:0')},
  {'word': ' announcer', 'similarity': tensor(5.6296, device='cuda:0')},
  {'word': ' referee', 'similarity': tensor(5.3326, device='cuda:0')},
  {'word': ' scorer', 'similarity': tensor(5.2900, device='cuda:0')},
  {'word': ' hoop', 'similarity': tensor(5.2580, device='cuda:0')},
  {'word': ' drib', 'similarity': tensor(5.1869, device='cuda:0')},
  {'word': ' Wembley', 'similarity': tensor(5.0874, device='cuda:0')},
  {'word': ' gust', 'similarity': tensor(4.9923, device='cuda:0')},
  {'word': 

In [None]:
# dataset_sports as the first dataset, all_stories as the second; 4th argument = 40.
# NOTE(review): 40 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_sports, all_stories, 40, 15, True)

([{'word': ' cheers', 'similarity': tensor(12.6464, device='cuda:0')},
  {'word': ' scoreboard', 'similarity': tensor(12.6210, device='cuda:0')},
  {'word': ' drib', 'similarity': tensor(12.3789, device='cuda:0')},
  {'word': ' jerseys', 'similarity': tensor(12.1617, device='cuda:0')},
  {'word': ' hoop', 'similarity': tensor(12.1014, device='cuda:0')},
  {'word': ' cheering', 'similarity': tensor(12.0994, device='cuda:0')},
  {'word': ' coaches', 'similarity': tensor(12.0772, device='cuda:0')},
  {'word': ' Bulls', 'similarity': tensor(11.9978, device='cuda:0')},
  {'word': ' jersey', 'similarity': tensor(11.9018, device='cuda:0')},
  {'word': ' applause', 'similarity': tensor(11.8972, device='cuda:0')},
  {'word': ' Coach', 'similarity': tensor(11.4917, device='cuda:0')},
  {'word': ' athlet', 'similarity': tensor(11.3789, device='cuda:0')},
  {'word': ' clinch', 'similarity': tensor(11.3100, device='cuda:0')},
  {'word': ' Basketball', 'similarity': tensor(11.2533, device='cuda:0')}

### Fantasy vs Fantasy no genre

In [None]:
# dataset_fantasy_genre as the first dataset, dataset_fantasy as the second; 4th argument = 0.
# The recorded output for this cell lists 15 entries, matching the 5th argument.
# NOTE(review): 0 looks like a layer index — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_fantasy_genre, dataset_fantasy, 0, 15, True)

([{'word': 'FS', 'similarity': tensor(0.1263, device='cuda:0')},
  {'word': 'TE', 'similarity': tensor(0.1259, device='cuda:0')},
  {'word': 'separ', 'similarity': tensor(0.1176, device='cuda:0')},
  {'word': ' Goes', 'similarity': tensor(0.1158, device='cuda:0')},
  {'word': 'oS', 'similarity': tensor(0.1156, device='cuda:0')},
  {'word': 'UC', 'similarity': tensor(0.1128, device='cuda:0')},
  {'word': 'Nation', 'similarity': tensor(0.1124, device='cuda:0')},
  {'word': ' Not', 'similarity': tensor(0.1103, device='cuda:0')},
  {'word': ' Nation', 'similarity': tensor(0.1094, device='cuda:0')},
  {'word': 'Ab', 'similarity': tensor(0.1089, device='cuda:0')},
  {'word': ' Mission', 'similarity': tensor(0.1081, device='cuda:0')},
  {'word': ' Bare', 'similarity': tensor(0.1077, device='cuda:0')},
  {'word': 'KO', 'similarity': tensor(0.1072, device='cuda:0')},
  {'word': 'Nik', 'similarity': tensor(0.1063, device='cuda:0')},
  {'word': '̶', 'similarity': tensor(0.1061, device='cuda:0')}]

In [None]:
# dataset_fantasy_genre as the first dataset, dataset_fantasy as the second; 4th argument = 4.
# NOTE(review): 4 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_fantasy_genre, dataset_fantasy, 4, 15, True)

([{'word': ' Reasons', 'similarity': tensor(0.5479, device='cuda:0')},
  {'word': ' Prelude', 'similarity': tensor(0.4972, device='cuda:0')},
  {'word': ' Plays', 'similarity': tensor(0.4930, device='cuda:0')},
  {'word': 'Reply', 'similarity': tensor(0.4829, device='cuda:0')},
  {'word': ' canonical', 'similarity': tensor(0.4679, device='cuda:0')},
  {'word': 'SO', 'similarity': tensor(0.4506, device='cuda:0')},
  {'word': '.?', 'similarity': tensor(0.4499, device='cuda:0')},
  {'word': 'Crit', 'similarity': tensor(0.4493, device='cuda:0')},
  {'word': 'Meta', 'similarity': tensor(0.4399, device='cuda:0')},
  {'word': ' Contribut', 'similarity': tensor(0.4375, device='cuda:0')},
  {'word': ' Terms', 'similarity': tensor(0.4327, device='cuda:0')},
  {'word': 'Science', 'similarity': tensor(0.4267, device='cuda:0')},
  {'word': ' Random', 'similarity': tensor(0.4235, device='cuda:0')},
  {'word': 'oS', 'similarity': tensor(0.4221, device='cuda:0')},
  {'word': ' Rew', 'similarity': tens

In [None]:
# dataset_fantasy_genre as the first dataset, dataset_fantasy as the second; 4th argument = 20.
# NOTE(review): 20 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_fantasy_genre, dataset_fantasy, 20, 15, True)

([{'word': ' descriptive', 'similarity': tensor(1.3861, device='cuda:0')},
  {'word': ' synopsis', 'similarity': tensor(1.3284, device='cuda:0')},
  {'word': '?:', 'similarity': tensor(1.3099, device='cuda:0')},
  {'word': 'Compar', 'similarity': tensor(1.2810, device='cuda:0')},
  {'word': ' concise', 'similarity': tensor(1.2609, device='cuda:0')},
  {'word': ' Quote', 'similarity': tensor(1.2597, device='cuda:0')},
  {'word': 'Answer', 'similarity': tensor(1.2368, device='cuda:0')},
  {'word': ' Answer', 'similarity': tensor(1.2082, device='cuda:0')},
  {'word': ' Fundamental', 'similarity': tensor(1.1993, device='cuda:0')},
  {'word': ' Explain', 'similarity': tensor(1.1760, device='cuda:0')},
  {'word': ' Plays', 'similarity': tensor(1.1622, device='cuda:0')},
  {'word': ' Context', 'similarity': tensor(1.1596, device='cuda:0')},
  {'word': ' Opinion', 'similarity': tensor(1.1378, device='cuda:0')},
  {'word': 'description', 'similarity': tensor(1.1366, device='cuda:0')},
  {'word'

In [None]:
# dataset_fantasy_genre as the first dataset, dataset_fantasy as the second; 4th argument = 34.
# NOTE(review): 34 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_fantasy_genre, dataset_fantasy, 34, 15, True)

([{'word': ' Answer', 'similarity': tensor(4.0082, device='cuda:0')},
  {'word': ' Explain', 'similarity': tensor(3.9952, device='cuda:0')},
  {'word': ' synopsis', 'similarity': tensor(3.9383, device='cuda:0')},
  {'word': ' Literary', 'similarity': tensor(3.8871, device='cuda:0')},
  {'word': ' Proced', 'similarity': tensor(3.8638, device='cuda:0')},
  {'word': '?:', 'similarity': tensor(3.8580, device='cuda:0')},
  {'word': 'Answer', 'similarity': tensor(3.8269, device='cuda:0')},
  {'word': ' Novel', 'similarity': tensor(3.6944, device='cuda:0')},
  {'word': ' Character', 'similarity': tensor(3.4082, device='cuda:0')},
  {'word': ' Typically', 'similarity': tensor(3.3852, device='cuda:0')},
  {'word': ' Characters', 'similarity': tensor(3.3707, device='cuda:0')},
  {'word': ' Reviews', 'similarity': tensor(3.3601, device='cuda:0')},
  {'word': ' Typical', 'similarity': tensor(3.3417, device='cuda:0')},
  {'word': 'Generally', 'similarity': tensor(3.3176, device='cuda:0')},
  {'word

In [None]:
# dataset_fantasy_genre as the first dataset, dataset_fantasy as the second; 4th argument = 47.
# NOTE(review): 47 looks like a layer index and 15 the number of tokens listed — confirm against difference_similar_tokens.
difference_similar_tokens(gpt2_xl, dataset_fantasy_genre, dataset_fantasy, 47, 15, True)

([{'word': ' Recommended', 'similarity': tensor(8.3129, device='cuda:0')},
  {'word': ' Literary', 'similarity': tensor(8.1637, device='cuda:0')},
  {'word': ' Explain', 'similarity': tensor(7.8248, device='cuda:0')},
  {'word': 'Rating', 'similarity': tensor(7.8113, device='cuda:0')},
  {'word': ' Novel', 'similarity': tensor(7.7762, device='cuda:0')},
  {'word': ' Character', 'similarity': tensor(7.7361, device='cuda:0')},
  {'word': '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0',
   'similarity': tensor(7.7175, device='cuda:0')},
  {'word': ' Overall', 'similarity': tensor(7.7073, device='cuda:0')},
  {'word': ' Basically', 'similarity': tensor(7.4110, device='cuda:0')},
  {'word': 'Basically', 'similarity': tensor(7.3733, device='cuda:0')},
  {'word': ' Typical', 'similarity': tensor(7.2918, device='cuda:0')},
  {'word': ' Proced', 'similarity': tensor(7.2717, device='cuda:0')},
  {'word': ' Characters', 'similarity': tensor(7.2099, device='cuda:0')},
  {'word': ' \xa0\xa0', 'similarity': tens

### Sanity check: does `dataset_sports` mention 'Kobe'?

In [None]:
# Count how many entries of dataset_sports contain 'Kobe'.
# Displaying one summary number (0 means no entry matches) avoids printing a
# separate True/False line for every story, which buried the result in output.
sum('Kobe' in story for story in dataset_sports)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals