In [1]:
# Replicate ITI results, make sure ITI utils and probing utils work right

#%%
%load_ext autoreload
%autoreload 2
from IPython import get_ipython

ipython = get_ipython()
# Code to automatically update the TransformerLens code as its edited without restarting the kernel
ipython.magic("load_ext autoreload")
ipython.magic("autoreload 2")
    
import plotly.io as pio
# pio.renderers.default = "png"
# Import stuff

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops
from fancy_einsum import einsum
import tqdm.notebook as tqdm
import random
from pathlib import Path
import plotly.express as px
from torch.utils.data import DataLoader

from jaxtyping import Float, Int
from typing import List, Union, Optional
from functools import partial
import copy

import itertools
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
import dataclasses
import datasets
from IPython.display import HTML

from tqdm import tqdm
# from utils.probing_utils import ModelActs
from utils.dataset_utils import CounterFact_Dataset, TQA_MC_Dataset, EZ_Dataset

import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

from utils.iti_utils import patch_iti

from utils.analytics_utils import plot_probe_accuracies, plot_norm_diffs, plot_cosine_sims
import os
from torch import Tensor

from utils.analytics_utils import plot_z_probe_accuracies, plot_resid_probe_accuracies, plot_transfer_acc_subplots
from utils.new_probing_utils import ModelActsLargeSimple, ChunkedModelActs
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt


  ipython.magic("load_ext autoreload")
  ipython.magic("autoreload 2")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Format raw activations

Convert the raw activation files (one file per prompt) into formatted files that collate each component's activations across prompts.

Only has to be done once.

In [5]:
# Configuration for formatting raw activations. The formatting calls themselves are
# commented out at the bottom of this cell; only the configuration and the dataset load run.
# N = 2550 #upper bound the global (level 0) index
# Model dimensions; n_layers and n_heads are passed to load_acts in later cells.
# NOTE(review): presumably these match the 70B Llama weights loaded below — confirm.
d_head = 128
n_layers = 80
n_heads = 64
# num_params = "70b"

from utils.cache_utils import create_probe_dataset, format_logits
data_dir = "/mnt/ssd-2/jamescampbell3"
act_type = "z"
run_id = 7
# Destination folder for the formatted activations of this run.
save_dir = f"/home/phillipguo/formatted_runs/run_{run_id}"

seq_pos = -1
# azaria_mitchell_short_splits = ['facts', 'neg_facts', 'conj_neg_facts', 'companies', 'neg_companies', 'conj_neg_companies']
# azaria_mitchell_short_splits = ['cities', 'capitals', 'companies', 'animals', 'elements', 'inventions', 'facts', 'neg_facts', 'conj_neg_facts', 'neg_companies', 'conj_neg_companies']
azaria_mitchell_short_splits = ['cities', 'capitals', 'companies', 'animals', 'elements', 'inventions', 'facts']
# Same splits with the "azaria_mitchell_" prefix prepended.
azaria_mitchell_splits = [f'azaria_mitchell_{split}' for split in azaria_mitchell_short_splits]
from datasets import load_dataset
# dataset = load_dataset("notrichardren/deception-evals")
dataset = load_dataset("notrichardren/azaria-mitchell")
# dataset = load_dataset("notrichardren/truthfulness_high_quality")
# Folder holding the raw (unformatted) outputs of this run.
run_folder = f"/mnt/ssd-2/jamescampbell3/data/large_run_{run_id}"

# One-off formatting step, left commented out: for each mode and split it would build the
# probe dataset and then format the logits for that split's row indices.
# modes = ["neutral_unprompted", "elements_liar"]
# modes = ["elements_liar"]
# for mode in modes:
#     for split in azaria_mitchell_short_splits:
#         create_probe_dataset(run_id=run_id, seq_pos=seq_pos, prompt_tag=mode, act_type=act_type, data_dir=data_dir, splits=[split], save_path=save_dir, inference_path=f"inference_outputs/inference_output_{run_id}_honest.csv")

#         indices = dataset["train"]["ind"]
#         indices = [row["ind"] for row in dataset["combined"] if row["dataset"] == split]
#         # print(indices)
#         format_logits(indices, split, run_folder=run_folder, run_id=run_id, formatted_folder=save_dir, modes=modes, seq_pos=seq_pos, logit_mode=mode)

## Initialize ModelActs objects. 

`ModelActsLargeSimple` stores all activations in memory at once (memory-inefficient, but transfer accuracy is faster to compute).

`ChunkedModelActs` avoids keeping activations in memory: it loads activations in batches of layers, trains probes iteratively, and then unloads the activations. Computing transfer accuracy is much slower.

In [3]:
# Splits and prompt modes to analyse. The full split list is kept for reference and is
# immediately narrowed to the subset that is actually loaded.
# (Earlier variants used the long "azaria_mitchell_*" names and the neg_/conj_neg_ splits.)
datasets = ["inventions", "elements", "animals", "cities", "facts", "companies", "capitals"]
datasets = ["facts"]
# datasets = ["misaligned"]
# modes = ["honest", "neutral", "liar"]
modes = ["honest", "elements_liar", "liar"]

# act_dict[dataset_name][mode] -> an empty ModelActsLargeSimple, filled by the loader cell.
act_dict = {}
for dataset_name in datasets:
    acts_by_mode = {}
    for mode in modes:
        acts_by_mode[mode] = ModelActsLargeSimple()
    act_dict[dataset_name] = acts_by_mode



In [6]:
# Loader configuration for run 7.
run_id = 7
seq_pos = -1
act_type = "z"
# act_types = ["logits", "resid_mid", "z", "mlp_out"]
act_types = ["logits", "z"]
# modes = ["honest", "liar"]
dont_include = None
train_probes = True
formatted_data_folder = f"/home/phillipguo/formatted_runs/run_{run_id}"
# formatted_data_folder = f"/mnt/ssd-2/jamescampbell3/data/large_run_{run_id}/activations/formatted"

for dataset_name in tqdm(datasets):
    for mode in modes:
        print(f"Loading activations from {mode} model on {dataset_name}")
        model_acts = act_dict[dataset_name][mode]
        for act_type in act_types:
            # Activation files are named run_{run_id}_{mode}[_{seq_pos}]_{act_type}[_{dataset_name}].
            seq_suffix = "" if seq_pos is None else f"_{seq_pos}"
            dataset_suffix = "" if dataset_name is None else f"_{dataset_name}"
            file_prefix = f"{formatted_data_folder}/run_{run_id}_{mode}{seq_suffix}_{act_type}{dataset_suffix}"

            # Labels are always read from the "honest" z label file, whatever the mode or act_type.
            # NOTE(review): presumably labels are identical across modes — confirm.
            with open(f"{formatted_data_folder}/labels_{run_id}_honest_{seq_pos}_z_{dataset_name}.pt", "rb") as handle:
                labels = torch.load(handle)

            model_acts.load_acts(file_prefix, n_layers, n_heads=n_heads, labels=labels, exclude_points=dont_include, act_type=act_type, verbose=False)

            # Probes are trained for every activation type except the logits.
            if train_probes and act_type != "logits":
                model_acts.train_probes(act_type, verbose=False, max_iter=10000)
    print(f"{dataset_name} dataset Size: {labels.shape[0]}")

  0%|          | 0/1 [00:00<?, ?it/s]

Loading activations from honest model on facts
Loading activations from elements_liar model on facts
Loading activations from liar model on facts


100%|██████████| 1/1 [01:43<00:00, 103.00s/it]

facts dataset Size: 599





In [None]:
# Display the `probes` attribute of the honest-mode "facts" activations object.
act_dict["facts"]["honest"].probes

In [None]:
from transformers import LlamaTokenizer

# The tokenizer is loaded from a "llama-weights-70b" folder inside the current working
# directory, so this cell depends on where the kernel was started.
weights_dir = f"{os.getcwd()}/llama-weights-70b"
tokenizer: LlamaTokenizer = LlamaTokenizer.from_pretrained(weights_dir)

In [None]:
import resource
# ru_maxrss is the *peak* resident set size of this process, not its current usage.
# NOTE(review): the 1e-6 factor assumes ru_maxrss is reported in kilobytes (Linux); macOS reports bytes.
print(f"Memory used: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1e-6} GB") # peak memory, in GB on Linux

In [None]:
def acc_tensor_from_dict(probe_accs_dict, n_layers, n_heads=None):
    """
    Pack a dictionary of per-component values into a dense float NumPy array.

    args:
        probe_accs_dict: values keyed by (layer, head) tuples when n_heads is given (e.g. (5, 4) for a Z dict), or by the integer layer index otherwise (e.g. 79 for a resid dict).
        n_layers: number of layers; every index in range(n_layers) is looked up.
        n_heads: number of heads per layer, or None for a per-layer dictionary.
    returns:
        np.ndarray of shape (n_layers, n_heads) when n_heads is given, else of shape (n_layers,).
        A missing key raises KeyError.
    """
    if n_heads is None:
        layer_accs = np.zeros(shape=(n_layers,))
        for layer_idx in range(n_layers):
            layer_accs[layer_idx] = probe_accs_dict[layer_idx]
        return layer_accs

    head_accs = np.zeros(shape=(n_layers, n_heads))
    # np.ndindex yields (layer, head) index tuples in row-major order, which is also the
    # key format of the dictionary, so each tuple serves as both array index and dict key.
    for layer_head in np.ndindex(n_layers, n_heads):
        head_accs[layer_head] = probe_accs_dict[layer_head]
    return head_accs


def get_px_fig(act_type, transfer_accs, n_layers, n_heads, title, graph_type=None, average_layer=False):
    """
    Helper method to generate a figure showing a quantity (accuracy, cosine sim, whatever) for each layer of a model, dispatching on the activation type.
    args:
        act_type: "z" is plotted with plot_z_probe_accuracies; every other value is plotted with plot_resid_probe_accuracies.
        transfer_accs: the quantity to plot, forwarded unchanged to the chosen plotting helper.
        n_layers: number of layers, forwarded to the chosen plotting helper.
        n_heads: number of heads; only forwarded when act_type is "z".
        title: figure title.
        graph_type: only forwarded when act_type is not "z".
        average_layer: only forwarded when act_type is "z".
    returns:
        whatever figure object the chosen plotting helper returns.
    """
    if act_type != "z":
        return plot_resid_probe_accuracies(transfer_accs, n_layers, title=title, graph_type=graph_type)
    return plot_z_probe_accuracies(transfer_accs, n_layers, n_heads=n_heads, title=title, average_layer=average_layer)

## Bar Chart of general probe accuracy
Figure 1

In [28]:
# Grouped bar chart of mean model inference accuracy: one group per dataset, one bar per mode.
#
# Each mode gets its own bar. Previously "liar" and "elements_liar" were appended to the
# same list, making it twice as long as the list of bar positions, and an extra empty
# figure was created before the real one.

# Forwarded unchanged to get_inference_accuracy.
threshold = 0.25

bar_modes = ["honest", "liar", "elements_liar"]
bar_labels = {"honest": "Honest", "liar": "Lying (liar)", "elements_liar": "Lying (elements_liar)"}
bar_colors = {"honest": "dodgerblue", "liar": "darkorange", "elements_liar": "seagreen"}

# accuracies_by_mode[mode][k] is the mean "correct" probability on datasets[k].
accuracies_by_mode = {bar_mode: [] for bar_mode in bar_modes}
for dataset_name in tqdm(datasets):
    for mode in bar_modes:
        correct_probs, incorrect_probs = act_dict[dataset_name][mode].get_inference_accuracy(tokenizer, scale_relative=True, threshold=threshold)
        accuracies_by_mode[mode].append(np.mean(correct_probs))

# Kept under their previous names for any later use.
accuracies_honest = accuracies_by_mode["honest"]
accuracies_lying = accuracies_by_mode["liar"]

# Bar geometry: one slot of width (bar_width + spacing) per mode inside each dataset group.
pos = list(range(len(datasets)))
bar_width = 0.25
spacing = 0.02

fig, ax = plt.subplots(figsize=(12, 8))
for mode_idx, mode in enumerate(bar_modes):
    offset = mode_idx * (bar_width + spacing)
    ax.bar([p + offset for p in pos], accuracies_by_mode[mode], bar_width,
           label=bar_labels[mode], color=bar_colors[mode], edgecolor='black')

# Centre each tick under its group of bars.
group_center = 0.5 * (len(bar_modes) - 1) * (bar_width + spacing)
ax.set_xticks([p + group_center for p in pos])
ax.set_xticklabels(datasets)

ax.set_xlabel('Datasets', fontsize=13)
ax.set_ylabel('Model Inference Accuracy', fontsize=13)
ax.set_title(f'Model Inference Accuracy Across Datasets, Threshold {threshold}', fontsize=15)

ax.legend(loc='upper left')
ax.grid()
plt.show()

 33%|███▎      | 1/3 [00:00<00:00,  3.84it/s]


AssertionError: 

<Figure size 1000x800 with 0 Axes>

## Probe Accuracy and Transfer

### First, just regular in-distribution probe accuracy

In [None]:
# In-distribution probe accuracies: one subplot per mode for a single dataset.
act_type = "z"
average_layer = True

dataset_name = "facts"
dataset_acts = act_dict[dataset_name]

from plotly.subplots import make_subplots
fig = make_subplots(rows=1, cols=len(modes))

# The printed summary follows `act_type` instead of hard-coding "z": only "z" accuracies
# are keyed by (layer, head), the other activation types are keyed by layer alone.
summary_n_heads = n_heads if act_type == "z" else None

for i, label in enumerate(modes):
    mode_probe_accs = dataset_acts[label].probe_accs[act_type]
    px_fig = get_px_fig(act_type, mode_probe_accs, n_layers, n_heads, title = f"{label} {act_type} Probe Accuracies", average_layer=average_layer)
    fig.add_trace(
        px_fig['data'][0],  # add the trace from plotly express figure
        row=1,
        col=i+1
    )

    if act_type == "z":
        fig.update_xaxes(title_text=f"Heads ({label})", row=1, col=i+1)
    else:
        fig.update_xaxes(title_text=f"Layer ({label})", row=1, col=i+1)
        fig.update_yaxes(title_text=px_fig.layout.yaxis.title.text, row=1, col=i+1)

    # Mean probe accuracy over all components for this mode.
    print(acc_tensor_from_dict(mode_probe_accs, n_layers, summary_n_heads).mean())

# Label the shared y-axis once, on the left-most subplot (uses the last figure built above).
fig.update_yaxes(title_text=px_fig.layout.yaxis.title.text, row=1, col=1)


fig.update_layout(title_text=f"{act_type} Probe Accuracies, Dataset {dataset_name}, Sequence Pos {seq_pos}", showlegend=False)
fig.show()

### Dataset Transfers

In [None]:
# Pairwise cosine similarities between the probes of several datasets for one mode,
# summarised per dataset pair by a percentile over all (layer, head) components.
from utils.analytics_utils import plot_transfer_acc_subplots
transfer_mode = "liar"
percentile = 95
# NOTE(review): every dataset listed here must already be present in act_dict; the loader
# cell above is configured for ["facts"] only, so a fresh Run All would raise KeyError below.
datasets = ["inventions", "elements", "animals", "facts", "companies", "capitals", "neg_facts", "neg_companies"]

train_acts = {dataset: act_dict[dataset][transfer_mode] for dataset in datasets}
# The same activations are used on both sides of the comparison.
test_acts = train_acts

# With cosine_sim=True the first returned value is used as the cosine-similarity tensor;
# the second returned value (a figure in the other calls in this notebook) is discarded.
cossim_tensors, _ = plot_transfer_acc_subplots(train_acts, test_acts, cosine_sim=True)

# Flatten the layer and head axes together, then take the percentile over that flattened axis.
cossim_tensor_percentiles = np.percentile(einops.rearrange(cossim_tensors, "dataset1 dataset2 layer head -> dataset1 dataset2 (layer head)"), percentile, axis=-1)


In [None]:
# Heatmap of the per-pair cosine-similarity percentiles, annotated with their values.
#
# Row i / column j of `cossim_tensor_percentiles` is drawn at y=datasets[i], x=datasets[j].
# Previously the *columns* of the matrix were reversed while the *row* labels were the ones
# reversed, so for more than two datasets values appeared under the wrong labels.
# NOTE(review): assumes the first two axes of cossim_tensor_percentiles follow the order of `datasets` — confirm.
import plotly.graph_objects as go

fig = go.Figure(data=go.Heatmap(
    z=cossim_tensor_percentiles,
    x=datasets,
    y=datasets,
    colorbar=dict(title='Cosine similarity'),
    colorscale='Viridis'
))
# Put the first dataset in the top row, as in a conventional matrix layout.
fig.update_yaxes(autorange="reversed")

# Write each value inside its cell; anchoring by category name keeps text and cell aligned.
for i, row_dataset in enumerate(datasets):
    for j, col_dataset in enumerate(datasets):
        fig.add_annotation(
            x=col_dataset,
            y=row_dataset,
            text=str(round(cossim_tensor_percentiles[i, j], 2)),
            showarrow=False,
            font=dict(
                color="black",
                size=12
            )
        )

fig.update_layout(width=600, height=600, title=f"Cosine Similarities Between Datasets, {transfer_mode} {percentile} Percentile")
# Show the plot
fig.show()

### Multiple Sequence Positions

In [None]:
# Probes at several sequence positions (run 33), honest mode only.
# datasets = ['cities', 'capitals', 'companies', 'animals', 'elements', 'inventions', 'facts', 'neg_facts', 'conj_neg_facts', 'neg_facts', 'conj_neg_companies']
datasets = ['facts']
modes = ["honest"]
seq_positions = [-3, -5, -10]

# act_dict_seq_pos[seq_pos][dataset_name][mode] -> an empty ModelActsLargeSimple.
act_dict_seq_pos = {}
for seq_pos in seq_positions:
    act_dict_temp = {}
    act_dict_seq_pos[seq_pos] = act_dict_temp
    for dataset_name in datasets:
        act_dict_temp[dataset_name] = {}
        for mode in modes:
            act_dict_temp[dataset_name][mode] = ModelActsLargeSimple()

# Loader configuration; identical for every sequence position.
act_type = "z"
# act_types = ["logits", "resid_mid", "z", "mlp_out"]
act_types = ["z"]
dont_include = None
run_id = 33
formatted_data_folder = f"/home/phillipguo/formatted_runs/run_{run_id}"
# formatted_data_folder = f"/mnt/ssd-2/jamescampbell3/data/large_run_{run_id}/activations/formatted"
train_probes = True

for seq_pos in seq_positions:
    act_dict_temp = act_dict_seq_pos[seq_pos]

    for dataset_name in tqdm(datasets):
        for mode in modes:
            print(f"Loading activations from {mode} model on {dataset_name}")
            model_acts = act_dict_temp[dataset_name][mode]
            for act_type in act_types:
                # Activation files are named run_{run_id}_{mode}[_{seq_pos}]_{act_type}[_{dataset_name}].
                seq_suffix = "" if seq_pos is None else f"_{seq_pos}"
                dataset_suffix = "" if dataset_name is None else f"_{dataset_name}"
                file_prefix = f"{formatted_data_folder}/run_{run_id}_{mode}{seq_suffix}_{act_type}{dataset_suffix}"

                with open(f"{formatted_data_folder}/labels_{run_id}_{mode}_{seq_pos}_z_{dataset_name}.pt", "rb") as handle:
                    labels = torch.load(handle)

                model_acts.load_acts(file_prefix, n_layers, n_heads=n_heads, labels=labels, exclude_points=dont_include, act_type=act_type, verbose=False)

                if train_probes and act_type != "logits":
                    model_acts.train_probes(act_type, verbose=False, max_iter=10000)
        print(f"{dataset_name} dataset Size: {labels.shape[0]}")

In [None]:
# Same datasets and modes at the last sequence position (run 6), stored separately in act_dict_1.
run_id = 6
seq_pos = -1
act_type = "z"
act_types = ["z"]
dont_include = None
train_probes = True
formatted_data_folder = f"/home/phillipguo/formatted_runs/run_{run_id}"
# formatted_data_folder = f"/mnt/ssd-2/jamescampbell3/data/large_run_{run_id}/activations/formatted"

datasets_1 = datasets
# act_dict_1[dataset_name][mode] -> an empty ModelActsLargeSimple.
act_dict_1 = {}
for dataset_name in datasets_1:
    acts_by_mode = {}
    for mode in modes:
        acts_by_mode[mode] = ModelActsLargeSimple()
    act_dict_1[dataset_name] = acts_by_mode

for dataset_name in tqdm(datasets_1):
    for mode in modes:
        print(f"Loading activations from {mode} model on {dataset_name}")
        model_acts = act_dict_1[dataset_name][mode]
        for act_type in act_types:
            # Activation files are named run_{run_id}_{mode}[_{seq_pos}]_{act_type}[_{dataset_name}].
            seq_suffix = "" if seq_pos is None else f"_{seq_pos}"
            dataset_suffix = "" if dataset_name is None else f"_{dataset_name}"
            file_prefix = f"{formatted_data_folder}/run_{run_id}_{mode}{seq_suffix}_{act_type}{dataset_suffix}"

            with open(f"{formatted_data_folder}/labels_{run_id}_{mode}_{seq_pos}_z_{dataset_name}.pt", "rb") as handle:
                labels = torch.load(handle)

            model_acts.load_acts(file_prefix, n_layers, n_heads=n_heads, labels=labels, exclude_points=dont_include, act_type=act_type, verbose=False)

            if train_probes and act_type != "logits":
                model_acts.train_probes(act_type, verbose=False, max_iter=10000)
    print(f"{dataset_name} dataset Size: {labels.shape[0]}")

In [None]:
# Transfer accuracy between honest "facts" probes trained at different sequence positions.
train_acts = {"facts_-1": act_dict_1["facts"]["honest"]}
# The loop variable is deliberately not called `seq_pos`: that name is the notebook-wide
# configuration value read by later loader cells, and looping over it here used to leave
# it at the last entry of seq_positions.
for position in seq_positions:
    train_acts[f"facts_{position}"] = act_dict_seq_pos[position]["facts"]["honest"]

# The same probes are used on both the train and the test side.
_, fig = plot_transfer_acc_subplots(train_acts, train_acts)
fig.update_layout(
    width=1000,
    height=1000,
    title="Transfer probes at different sequence positions, Azaria Mitchell Facts Honest",
)
fig.show()

In [None]:
# Same comparison as the previous cell, with cosine_sim=True requested instead of the default.
# Relies on `train_acts` as built by the previous cell.
_, fig = plot_transfer_acc_subplots(train_acts, train_acts, cosine_sim=True)
fig.update_layout(
    width=1000,
    height=1000,
    title=f"Cosine Similarities of Probes at different sequence positions, Azaria Mitchell Facts Honest",
)
fig.show()

### Next, transfer to Neg and Conj

In [None]:
# Pool several Azaria-Mitchell splits into one "all_am" pseudo-dataset and probe it.
# Reuses formatted_data_folder, run_id, seq_pos, dont_include and train_probes from earlier cells.
dataset_name = "all_am"
act_dict[dataset_name] = {"honest": ModelActsLargeSimple(), "liar": ModelActsLargeSimple(), "neutral": ModelActsLargeSimple()}
sub_datasets_long = ["azaria_mitchell_inventions", "azaria_mitchell_elements", "azaria_mitchell_animals", "azaria_mitchell_facts", "azaria_mitchell_companies", "azaria_mitchell_capitals", "azaria_mitchell_cities"]
# Drop the leading "azaria_mitchell_" (16 characters) to obtain the short split names.
am_prefix_len = len("azaria_mitchell_")
sub_datasets_short = [long_name[am_prefix_len:] for long_name in sub_datasets_long]
act_types = ["z"]

# Only "honest" and "liar" are loaded; the "neutral" entry created above stays empty.
for mode in ["honest", "liar"]:
    for act_type in act_types:
        seq_suffix = "" if seq_pos is None else f"_{seq_pos}"
        file_prefix = f"{formatted_data_folder}/run_{run_id}_{mode}{seq_suffix}_{act_type}"

        # One activation-file prefix and one label tensor per split, in matching order.
        file_prefixes = [f"{file_prefix}_{short_name}" for short_name in sub_datasets_short]
        all_labels = []
        for sub_name in sub_datasets_short:
            with open(f"{formatted_data_folder}/labels_{run_id}_{mode}_{seq_pos}_z_{sub_name}.pt", "rb") as handle:
                all_labels.append(torch.load(handle))
        labels = torch.cat(all_labels, dim=0)
        print(f"{labels.shape=}")

        pooled_acts = act_dict[dataset_name][mode]
        pooled_acts.load_acts(None, n_layers, n_heads=n_heads, labels=labels, exclude_points=dont_include, act_type=act_type, verbose=True, file_prefixes=file_prefixes)

        if train_probes and act_type != "logits":
            pooled_acts.train_probes(act_type, verbose=True, max_iter=10000, in_order=False)

In [None]:
from utils.analytics_utils import plot_z_probe_accuracies, plot_resid_probe_accuracies, plot_transfer_acc_subplots

# Probes from the pooled "all_am" activations (honest and liar) ...
train_acts = {"all_honest": act_dict["all_am"]["honest"], "all_liar": act_dict["all_am"]["liar"]}
# ... are evaluated on the negated splits.
# NOTE(review): "neg_facts" and "neg_companies" are not loaded into act_dict by any cell visible
# above, so this line needs them to have been loaded separately — confirm.
test_acts = {"neg_facts_honest": act_dict["neg_facts"]["honest"], "neg_facts_liar": act_dict["neg_facts"]["liar"], "neg_companies_honest": act_dict["neg_companies"]["honest"], "neg_companies_liar": act_dict["neg_companies"]["liar"]}#, "conj_neg_companies_honest": act_dict["conj_neg_companies"]["honest"], "conj_neg_companies_liar": act_dict["conj_neg_companies"]["liar"]}

transfer_acc_tensors, fig = plot_transfer_acc_subplots(train_acts, test_acts)


In [None]:
# Size and title the transfer figure built in the previous cell, then render it.
fig.update_layout(width=800, height=500, title="Transfer from Probing on All Datasets to Negated Datasets")
fig.show()

### Misaligned Activations

In [None]:
# Load the "decept" dataset for the "misaligned" mode (run 101) and train z probes on it.
# Reuses seq_pos, dont_include and train_probes from earlier cells.
dataset_name = "decept"
act_types = ["z", "logits"]
act_dict[dataset_name] = {"misaligned": ModelActsLargeSimple()}
run_id = 101
formatted_data_folder = f"/home/phillipguo/formatted_runs/run_{run_id}"

for mode in ["misaligned"]:
    for act_type in act_types:
        seq_suffix = "" if seq_pos is None else f"_{seq_pos}"
        file_prefix = f"{formatted_data_folder}/run_{run_id}_{mode}{seq_suffix}_{act_type}"

        # A single prefix and a single label file, passed through the list-based interface.
        file_prefixes = [f"{file_prefix}_{dataset_name}"]
        all_labels = []
        with open(f"{formatted_data_folder}/labels_{run_id}_{mode}_{seq_pos}_z_{dataset_name}.pt", "rb") as handle:
            all_labels.append(torch.load(handle))
        labels = torch.cat(all_labels, dim=0)
        print(f"{labels.shape=}")

        misaligned_acts = act_dict[dataset_name][mode]
        misaligned_acts.load_acts(None, n_layers, n_heads=n_heads, labels=labels, exclude_points=dont_include, act_type=act_type, verbose=True, file_prefixes=file_prefixes)

        # Probes are trained for every activation type except the logits.
        if train_probes and act_type != "logits":
            misaligned_acts.train_probes(act_type, verbose=True, max_iter=10000, in_order=False)

In [None]:
# Mean of each of the two arrays returned by get_inference_accuracy for the misaligned model.
# The inference is run once and both means are shown; previously it ran twice and only
# the second mean was displayed, because a cell renders just its last expression.
decept_correct_probs, decept_incorrect_probs = act_dict["decept"]["misaligned"].get_inference_accuracy(tokenizer, scale_relative=True)
decept_correct_probs.mean(), decept_incorrect_probs.mean()

In [None]:
# Accuracy of the misaligned-deception probes on their own data (train side == test side).
# (A commented-out alternative used the neg_facts / neg_companies activations as test sets.)
misaligned_acts = act_dict["decept"]["misaligned"]
train_acts = {"decept_misaligned": misaligned_acts}

transfer_acc_tensors, fig = plot_transfer_acc_subplots(train_acts, train_acts)

fig.update_layout(width=500, height=500, title="Misaligned Deception Probe Accuracy")
fig.show()

In [None]:
# Transfer: probes trained on the pooled Azaria-Mitchell data, tested on the misaligned-deception data.
train_acts = {"all_am_honest": act_dict["all_am"]["honest"], "all_am_liar": act_dict["all_am"]["liar"]}
test_acts = {"decept_misaligned": act_dict["decept"]["misaligned"]}

transfer_acc_tensors, fig = plot_transfer_acc_subplots(train_acts, test_acts)

# The title names the transfer being shown; it previously repeated the title of the
# in-distribution misaligned plot, which made the two figures indistinguishable.
fig.update_layout(
    width=400,
    height=1000,
    title="All Azaria-Mitchell Probes Transferred to Misaligned Deception",
)
fig.show()

### Test deception onto harmful

In [None]:
# Load the "harmful" dataset activations (run 9) for the honest and liar modes.
dataset_name = "harmful"
run_id = 9
seq_pos = -1
act_types = ["z", "logits"]
modes = ["honest", "liar"]
dont_include = None
data_folder = f"/mnt/ssd-2/jamescampbell3/data/large_run_{run_id}"

harmful_act_dict = {}
for mode in tqdm(modes):
    harmful_act_dict[mode] = ModelActsLargeSimple()
    for act_type in act_types:
        # Activation files are named run_{run_id}_{mode}[_{seq_pos}]_{act_type}[_{dataset_name}].
        seq_suffix = "" if seq_pos is None else f"_{seq_pos}"
        dataset_suffix = "" if dataset_name is None else f"_{dataset_name}"
        file_prefix = f"{data_folder}/activations/formatted/run_{run_id}_{mode}{seq_suffix}_{act_type}{dataset_suffix}"

        with open(f"{data_folder}/activations/formatted/labels_{run_id}_{mode}_{seq_pos}_z_{dataset_name}.pt", "rb") as handle:
            labels = torch.load(handle)

        # Keep only the points with a non-zero label; the zero-labelled points are passed
        # to load_acts as exclusions.
        nonzero_mask = labels != 0
        true_indices = torch.where(nonzero_mask)[0]
        false_indices = torch.where(~nonzero_mask)[0]
        labels = labels[true_indices]

        harmful_act_dict[mode].load_acts(file_prefix, n_layers, n_heads=n_heads, labels=labels, exclude_points=false_indices, act_type=act_type)

In [None]:
# Transfer: misaligned-deception probes tested on the harmful dataset, for both modes.
train_acts = {"decept_misaligned": act_dict["decept"]["misaligned"]}
test_acts = {
    "harmful_honest": harmful_act_dict["honest"],
    "harmful_liar": harmful_act_dict["liar"],
}

transfer_acc_tensors, fig = plot_transfer_acc_subplots(train_acts, test_acts)

fig.update_layout(width=800, height=500, title="Misaligned Deception Probe Transferred to Harmful")
fig.show()

### Standard z transfer accuracy on azaria mitchell

In [None]:
# Honest <-> liar probe transfer on the pooled "all_am" activations.
from utils.analytics_utils import plot_z_probe_accuracies, plot_resid_probe_accuracies, plot_transfer_acc_subplots

dataset_name = "all_am"
train_acts = {}
for mode in ["honest", "liar"]:
    train_acts[mode] = act_dict[dataset_name][mode]

# The same two sets of activations serve as both train and test sides.
_, fig = plot_transfer_acc_subplots(train_acts, train_acts)
fig.update_layout(width=400, height=500, title=f"Transfer z Probe Accuracies on {dataset_name}")
fig.show()

In [None]:
# Same comparison as the previous cell, with cosine_sim=True requested instead of the default.
# Relies on `train_acts` and `dataset_name` as set by the previous cell.
_, fig = plot_transfer_acc_subplots(train_acts, train_acts, cosine_sim=True)
fig.update_layout(
    width=600,
    height=800,
    # NOTE(review): with dataset_name == "all_am" this renders as "azaria_mitchell_all_am".
    title=f"Cosine Similarities of Probe Coefficients on azaria_mitchell_{dataset_name}"
)
fig.show()

### Different Models
LLama 1 vs LLama 2

In [37]:
# Activations and probes stored under the "llama_1" name (run 7).
datasets = ["facts", "companies"]
# datasets = ["misaligned"]
# modes = ["honest", "neutral", "liar"]
modes = ["honest", "neutral", "liar"]

# act_dict_llama_1[dataset_name][mode] -> an empty ModelActsLargeSimple.
act_dict_llama_1 = {}
for dataset_name in datasets:
    acts_by_mode = {}
    for mode in modes:
        acts_by_mode[mode] = ModelActsLargeSimple()
    act_dict_llama_1[dataset_name] = acts_by_mode

# Loader configuration.
run_id = 7
seq_pos = -1
act_type = "z"
# act_types = ["logits", "resid_mid", "z", "mlp_out"]
act_types = ["z", "logits"]
# modes = ["honest", "liar"]
dont_include = None
train_probes = True
formatted_data_folder = f"/home/phillipguo/formatted_runs/run_{run_id}"
# formatted_data_folder = f"/mnt/ssd-2/jamescampbell3/data/large_run_{run_id}/activations/formatted"

for dataset_name in tqdm(datasets):
    for mode in modes:
        print(f"Loading activations from {mode} model on {dataset_name}")
        model_acts = act_dict_llama_1[dataset_name][mode]
        for act_type in act_types:
            # Activation files are named run_{run_id}_{mode}[_{seq_pos}]_{act_type}[_{dataset_name}].
            seq_suffix = "" if seq_pos is None else f"_{seq_pos}"
            dataset_suffix = "" if dataset_name is None else f"_{dataset_name}"
            file_prefix = f"{formatted_data_folder}/run_{run_id}_{mode}{seq_suffix}_{act_type}{dataset_suffix}"

            with open(f"{formatted_data_folder}/labels_{run_id}_{mode}_{seq_pos}_z_{dataset_name}.pt", "rb") as handle:
                labels = torch.load(handle)

            model_acts.load_acts(file_prefix, n_layers, n_heads=n_heads, labels=labels, exclude_points=dont_include, act_type=act_type, verbose=False)

            # Probes are trained for every activation type except the logits.
            if train_probes and act_type != "logits":
                model_acts.train_probes(act_type, verbose=False, max_iter=10000)
    print(f"{dataset_name} dataset Size: {labels.shape[0]}")

  0%|          | 0/2 [00:00<?, ?it/s]

Loading activations from honest model on facts
Loading activations from neutral model on facts
Loading activations from liar model on facts


 50%|█████     | 1/2 [01:28<01:28, 88.21s/it]

facts dataset Size: 599
Loading activations from honest model on companies
Loading activations from neutral model on companies
Loading activations from liar model on companies


100%|██████████| 2/2 [03:48<00:00, 114.11s/it]

companies dataset Size: 1200





In [38]:
# Activations and probes stored under the "llama_2" name (run 6).
# Containers are created for every mode of the previous cell, but only "honest" is loaded below.
act_dict_llama_2 = {}
for dataset_name in datasets:
    acts_by_mode = {}
    for mode in modes:
        acts_by_mode[mode] = ModelActsLargeSimple()
    act_dict_llama_2[dataset_name] = acts_by_mode

# Loader configuration.
modes = ["honest"]
run_id = 6
seq_pos = -1
act_type = "z"
act_types = ["z"]
dont_include = None
train_probes = True
formatted_data_folder = f"/home/phillipguo/formatted_runs/run_{run_id}"

for dataset_name in tqdm(datasets):
    for mode in modes:
        print(f"Loading activations from {mode} model on {dataset_name}")
        model_acts = act_dict_llama_2[dataset_name][mode]
        for act_type in act_types:
            # Activation files are named run_{run_id}_{mode}[_{seq_pos}]_{act_type}[_{dataset_name}].
            seq_suffix = "" if seq_pos is None else f"_{seq_pos}"
            dataset_suffix = "" if dataset_name is None else f"_{dataset_name}"
            file_prefix = f"{formatted_data_folder}/run_{run_id}_{mode}{seq_suffix}_{act_type}{dataset_suffix}"

            with open(f"{formatted_data_folder}/labels_{run_id}_{mode}_{seq_pos}_z_{dataset_name}.pt", "rb") as handle:
                labels = torch.load(handle)

            model_acts.load_acts(file_prefix, n_layers, n_heads=n_heads, labels=labels, exclude_points=dont_include, act_type=act_type, verbose=False)

            if train_probes and act_type != "logits":
                model_acts.train_probes(act_type, verbose=False, max_iter=10000)
    print(f"{dataset_name} dataset Size: {labels.shape[0]}")

  0%|          | 0/2 [00:00<?, ?it/s]

Loading activations from honest model on facts


 50%|█████     | 1/2 [00:27<00:27, 27.64s/it]

facts dataset Size: 599
Loading activations from honest model on companies


100%|██████████| 2/2 [01:09<00:00, 34.77s/it]

companies dataset Size: 1200





In [35]:
# Transfer between the honest "facts" probes of the two activation sets loaded above.
# The commented-out alternative trained on llama_2 (facts, companies) and tested on llama_1.
# train_acts = {"llama_2_facts": act_dict_llama_2["facts"]["honest"], "llama_2_companies": act_dict_llama_2["companies"]["honest"]}
# test_acts = {"llama_1_facts": act_dict_llama_1["facts"]["honest"], "llama_1_companies": act_dict_llama_1["companies"]["honest"]}
train_acts = {"llama_1": act_dict_llama_1["facts"]["honest"], "llama_2": act_dict_llama_2["facts"]["honest"]}
# The same dictionary is used as train and test side, giving all four train -> test pairings.
_, fig = plot_transfer_acc_subplots(train_acts, train_acts)

llama_1 -> llama_1
llama_1 -> llama_2
llama_2 -> llama_1
llama_2 -> llama_2


In [36]:
# Size and title the Llama 1 / Llama 2 transfer figure from the previous cell, then render it.
fig.update_layout(width=600, height=800, title="Transfer Accuracies Between Llama 1 and Llama 2, Honest")
fig.show()

## Analyze Probe Accuracy vs Inference Accuracy
Make plot of probe accuracies vs inference accuracy

In [None]:
# Get indices of rows of each dataset in the big dataset
from datasets import load_dataset
dataset_name = "notrichardren/truthfulness_high_quality"
dataset = load_dataset(dataset_name)

# dataset_indices[name] lists the 'ind' values of the "combined" rows whose 'dataset'
# column equals `name`, in row order, for every name in `datasets`.
# The rows are scanned once in total instead of once per name.
# NOTE(review): the comparison is an exact string match; verify that the 'dataset' column
# holds the short names used in `datasets` (e.g. "facts") rather than "azaria_mitchell_facts".
dataset_indices = {wanted_name: [] for wanted_name in datasets}
for row in tqdm(dataset["combined"]):
    matching_indices = dataset_indices.get(row['dataset'])
    if matching_indices is not None:
        matching_indices.append(row['ind'])

In [None]:
# Format logits into formatted style: run_{run_id}_{mode}_{seq_pos}_logits_{dataset_name}.pt
# Uses `run_id` as left by an earlier cell and `dataset_indices` from the previous cell.
# Both the input and the output paths are relative to the current working directory.
seq_pos = -1
for dataset_name in tqdm(datasets):
    # One output file per (dataset, mode).
    for mode in ["honest", "liar"]:
        logits = []
        # Read the per-row logit files in the order of the dataset's row indices.
        for data_index in dataset_indices[dataset_name]:
            with open(f"activations/inference_outputs/logits_{run_id}_{mode}_{data_index}.pt", "rb") as handle:
                logits.append(torch.load(handle))
        # Concatenate along the first dimension and save as a single tensor.
        logits = torch.cat(logits, dim=0)
        with open(f"activations/formatted/run_{run_id}_{mode}_{seq_pos}_logits_{dataset_name}.pt", "wb") as handle:
            torch.save(logits, handle)

In [None]:
# Scatter plot of mean model inference accuracy (x) against the 95th-percentile probe
# accuracy (y): one point per (dataset, mode), colour = dataset, marker shape = mode.
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

act_type = "z"

# Initialize plot
plt.figure(figsize=(10, 8))

# Colour per dataset and marker per mode. Both lists are cycled with a modulo so that
# having more datasets or modes than entries no longer raises IndexError.
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
markers = ['o', 's', '^', 'D', 'v', 'P', 'X']

dataset_legend_handles = []

for dataset_idx, dataset_name in enumerate(tqdm(datasets)):
    dataset_color = colors[dataset_idx % len(colors)]
    for mode_idx, mode in enumerate(modes):
        mode_marker = markers[mode_idx % len(markers)]
        correct_probs, incorrect_probs = act_dict[dataset_name][mode].get_inference_accuracy(tokenizer, scale_relative=True)

        correct_prob = np.mean(correct_probs)
        probe_accs = acc_tensor_from_dict(act_dict[dataset_name][mode].probe_accs[act_type], n_layers=n_layers, n_heads=n_heads)
        probe_acc = np.percentile(probe_accs, 95)
        sc = plt.scatter(correct_prob, probe_acc, color=dataset_color, label=f"{dataset_name}-{mode}", marker=mode_marker, s=100)

        # One legend handle per dataset is enough: its colour is the same for every mode.
        if mode_idx == 0:
            dataset_legend_handles.append(sc)

# One black proxy marker per mode, so the mode legend always has as many entries as `modes`.
mode_legend_handles = []
for mode_idx, mode in enumerate(modes):
    mode_legend_handles.append(mlines.Line2D([], [], color='black', marker=markers[mode_idx % len(markers)], linestyle='None', markersize=10))


plt.xlabel("Model Inference Accuracy", fontsize=13)
plt.ylabel("Probe Accuracy (95th Percentile)", fontsize=13)
plt.title("Scatter Plot of Model Inference Accuracy vs Probe Accuracy", fontsize=15)

# Add legend using only the handles we've collected
first_legend = plt.legend(dataset_legend_handles, datasets, title='Datasets', loc='lower right')
# Re-attach the first legend by hand: the second plt.legend call below would otherwise replace it.
plt.gca().add_artist(first_legend)

plt.legend(mode_legend_handles, modes, title="Modes", loc="center right")
plt.show()

In [None]:
# Build the formatted "liar" z-activation probe datasets of run 6, one split at a time.
from utils.cache_utils import create_probe_dataset
data_dir = "/mnt/ssd-2/jamescampbell3"
act_type = "z"
run_id = 6
save_dir = f"/home/phillipguo/formatted_runs/run_{run_id}"

# Short split names: the base splits followed by their negated and conjunction-negated variants.
azaria_mitchell_splits = [
    'cities', 'capitals', 'companies', 'animals', 'elements', 'inventions', 'facts',
    'neg_companies', 'neg_facts',
    'conj_neg_companies', 'conj_neg_facts',
]

# Arguments shared by every call; only `splits` changes from one call to the next.
shared_probe_kwargs = dict(run_id=run_id, seq_pos=-1, prompt_tag="liar", act_type=act_type, data_dir=data_dir, save_path=save_dir)
for split in azaria_mitchell_splits:
    create_probe_dataset(splits=[split], **shared_probe_kwargs)
