In [1]:
import itertools
from datetime import datetime

import submitit 

from auto_circuit.tasks import (
    IOI_COMPONENT_CIRCUIT_TASK, 
    IOI_GPT2_AUTOENCODER_COMPONENT_CIRCUIT_TASK, 
    DOCSTRING_TOKEN_CIRCUIT_TASK, 
    DOCSTRING_COMPONENT_CIRCUIT_TASK, 
    GREATERTHAN_COMPONENT_CIRCUIT_TASK, 
    GREATERTHAN_GPT2_AUTOENCODER_COMPONENT_CIRCUIT_TASK, 
)
from elk_experiments.utils import repo_path_to_abs_path, OUTPUT_DIR
from elk_experiments.auto_circuit.tasks import IOI_TOKEN_CIRCUIT_TASK
from auto_circuit.types import AblationType
from elk_experiments.auto_circuit.score_funcs import GradFunc, AnswerFunc

# Specify Settings / Hyperparameters

In [12]:
# Hyperparameter grid for the hypothesis-test sweep. Each list below is one axis
# of the Cartesian product that is expanded into `combinations`.
tasks = [
    IOI_TOKEN_CIRCUIT_TASK, 
    IOI_COMPONENT_CIRCUIT_TASK, 
    DOCSTRING_TOKEN_CIRCUIT_TASK, 
    DOCSTRING_COMPONENT_CIRCUIT_TASK, 
    GREATERTHAN_COMPONENT_CIRCUIT_TASK, 
    # Autoencoder variants are currently excluded from the sweep.
    # IOI_GPT2_AUTOENCODER_COMPONENT_CIRCUIT_TASK, 
    # GREATERTHAN_GPT2_AUTOENCODER_COMPONENT_CIRCUIT_TASK, 
    # NOTE(review): the task below is not imported in the first cell; add the
    # import before re-enabling it.
    # ANIMAL_DIET_GPT2_AUTOENCODER_COMPONENT_CIRCUIT_TASK
]
# values swept for the script's `use_abs` option
use_abs = [True, False]
# ablation types to sweep; their enum names are forwarded to the script
ablation_types = [
    AblationType.RESAMPLE, 
    AblationType.TOKENWISE_MEAN_CLEAN,
    AblationType.TOKENWISE_MEAN_CORRUPT, 
    AblationType.TOKENWISE_MEAN_CLEAN_AND_CORRUPT,
    # AblationType.ZERO
]
# gradient (score) functions to sweep
grad_funcs = [
    GradFunc.LOGIT, 
    GradFunc.LOGPROB
]
# answer functions to sweep
answer_funcs = [
    AnswerFunc.MAX_DIFF, 
    AnswerFunc.AVG_VAL
]

# epsilon values to sweep for each `use_abs` setting; epsilon is therefore not an
# independent axis but is looked up from the chosen `use_abs` value
use_abs_to_epsilon = {
    True: [0.1], 
    False: [0.0, -0.4]
}

# Expand the grid into one override dict per job. The task key is wrapped in
# single quotes because it is later emitted as a `task='...'` command-line
# argument. Epsilon is drawn from the list registered for the selected
# `use_abs` value rather than being a free axis of the product.
combinations = [
    dict(
        task=f"'{task.key}'",
        use_abs=abs_flag,
        ablation_type=ablation.name,
        grad_func=grad.name,
        answer_func=answer.name,
        epsilon=eps,
    )
    for task, abs_flag, ablation, grad, answer in itertools.product(
        tasks, use_abs, ablation_types, grad_funcs, answer_funcs
    )
    for eps in use_abs_to_epsilon[abs_flag]
]

# combinations = [
#     {
#         "task": f"'{IOI_TOKEN_CIRCUIT_TASK.key}'",
#         "use_abs": False,
#         "ablation_type": AblationType.TOKENWISE_MEAN_CORRUPT.name,
#         "grad_func": GradFunc.LOGPROB.name,
#         "answer_func": AnswerFunc.AVG_VAL.name,
#         "epsilon": 0.0
#     }
# ]

# Setup Executor and Run

In [3]:
# Configure the submitit executor. Logs go to a fresh, timestamped directory
# under OUTPUT_DIR / "hypo_test_out_logs" so successive sweeps never share a folder.
num_jobs_parallel = 8  # upper bound on array jobs running at the same time
out_dir = repo_path_to_abs_path(
    OUTPUT_DIR / "hypo_test_out_logs" / f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
)
out_dir.mkdir(parents=True, exist_ok=True)
executor = submitit.AutoExecutor(folder=out_dir)
executor.update_parameters(
    timeout_min=24 * 60,  # one day, expressed in minutes
    mem_gb=40,
    gres="gpu:1",
    cpus_per_task=8,
    nodes=1,
    slurm_qos="high",
    slurm_array_parallelism=num_jobs_parallel,
)




In [4]:
# Submit one job per hyperparameter combination. All submissions happen inside
# `executor.batch()`; each job runs scripts/hypothesis_tests.py from the repo
# root with the combination rendered as `key=value` command-line arguments.
with executor.batch():
    jobs = []
    for combo in combinations:
        function = submitit.helpers.CommandFunction(
            [
                "python",
                "scripts/hypothesis_tests.py",
                *(f"{key}={value}" for key, value in combo.items()),
            ],
            cwd=repo_path_to_abs_path("."),
        )
        jobs.append(executor.submit(function))

In [8]:
# sanity check: number of jobs that were submitted
len(jobs)

48

In [9]:
# Inspect the log of a single job. The index 47 is hard-coded, so this raises
# IndexError whenever fewer than 48 jobs were submitted.
job = jobs[47]
# read stdout and stderr
print(job.stdout())
# print(job.stderr())

submitit INFO (2024-08-14 15:22:36,045) - Starting with JobEnvironment(job_id=432143_47, hostname=ppo.ist.berkeley.edu, local_rank=0(1), node=0(1), global_rank=0(1))
submitit INFO (2024-08-14 15:22:36,045) - Loading pickle: /nas/ucb/oliveradk/elk-experiments/output/hypo_test_out_logs/2024-08-14_14-45-35/432143_47_submitted.pkl
The following command is sent: "python scripts/hypothesis_tests.py task='Indirect Object Identification Component Circuit' use_abs=False ablation_type=TOKENWISE_MEAN_CLEAN_AND_CORRUPT grad_func=LOGPROB answer_func=AVG_VAL epsilon=-0.4"
Loaded pretrained model gpt2-small into HookedTransformer
seq_len before divergence None
seq_len after divergence None
Saving cache to /nas/ucb/oliveradk/elk-experiments/output/hypo_test_results/Indirect_Object_Identification_Component_Circuit_TOKENWISE_MEAN_CLEAN_AND_CORRUPT_LOGPROB_AVG_VAL_10/False_0.05_-0.4_0.9/prune_scores-14-08-2024_15-23-13.pkl
interval: 0 - 32491
width 1000
interval: 32000 - 32491
width 100
interval: 32400 -

In [14]:
# peek at the first generated combination to check the override format
combinations[0]

{'task': "'Indirect Object Identification Token Circuit'",
 'use_abs': True,
 'ablation_type': 'RESAMPLE',
 'grad_func': 'LOGIT',
 'answer_func': 'MAX_DIFF',
 'epsilon': 0.1}

In [12]:
# Rough wall-clock estimate in hours: (jobs / parallel slots) * 7 / 60.
# NOTE(review): the 7 looks like an assumed minutes-per-job figure — confirm.
# The "roughly 4 hours" remark only holds for the grid size it was computed with.
len(combinations) / num_jobs_parallel * 7 / 60 # roughly 4 hours

4.375

# Analyze Results

## Pathologies of Ablations and Score Functions

- Explore some IOI examples in depth; show how, e.g., mean-clean ablation just attends from source to test for avg val, and similarly for max value.
- Zero ablation requires almost every edge; it is too far out of distribution.
- Mean-corrupt ablation leads to more "task identification" mechanisms.
- Resample ablation allows patching edges not reached by the input and is in this sense more natural (someone has said this before), but it is also higher variance and less general (constructing contrast pairs is expensive).

## Synergies Between Score Functions and Ablation Methodologies

In [None]:
# I want to plot the fraction of edges required for faithfulness, as a bar plot separated by ablation methodology and score func
    # (generate a different plot per answer function)

In [14]:
from auto_circuit.tasks import Task
def get_exp_dir(
    task_key: str,
    ablation_type: AblationType,
    grad_func: GradFunc,
    answer_func: AnswerFunc,
    ig_samples: int, 
    use_abs: bool,
    alpha: float,
    epsilon: float,
    q_star: float
):
    """Return the absolute results directory for one experiment configuration.

    The path has two levels below ``OUTPUT_DIR / "hypo_test_results"``:

    * ``<task>_<ablation>_<grad>_<answer>_<ig_samples>``, where ``<task>`` is
      ``task_key`` with spaces replaced by underscores and the enum arguments
      contribute their ``.name``;
    * ``<use_abs>_<alpha>_<epsilon>_<q_star>``.

    The combined relative path is resolved with ``repo_path_to_abs_path``.
    """
    task_slug = task_key.replace(' ', '_')
    score_dir_name = f"{task_slug}_{ablation_type.name}_{grad_func.name}_{answer_func.name}_{ig_samples}"
    test_dir_name = f"{use_abs}_{alpha}_{epsilon}_{q_star}"
    results_root = OUTPUT_DIR / "hypo_test_results"
    return repo_path_to_abs_path(results_root / score_dir_name / test_dir_name)

In [15]:
import json
from collections import defaultdict
# Fixed settings used to locate the result directories of the sweep.
ig_default = 10
alpha_default = 0.05
q_star_default = 0.9
# The result key below does not include the gradient function, so results for
# different gradient functions would silently overwrite each other (the last one
# in `grad_funcs` order, LOGPROB, used to win). Pin the gradient function
# explicitly and skip the others instead of loading files that are then discarded.
grad_func_default = GradFunc.LOGPROB

# Fraction of edges needed for equivalence, stored as
# (task_key, use_abs, epsilon) -> {(ablation_type, answer_func): (fraction, min_equiv, total_edges)}
frac_total_edges = defaultdict(dict)
for combo in combinations:
    if GradFunc[combo["grad_func"]] != grad_func_default:
        continue
    task_key = combo["task"][1:-1] # strip the quotes added for the command line
    ablation_type = AblationType[combo["ablation_type"]]
    answer_func = AnswerFunc[combo["answer_func"]]
    use_ab = combo["use_abs"]
    epsilon = combo["epsilon"]
    # get path to results
    exp_path = get_exp_dir(
        task_key, ablation_type, grad_func_default, answer_func,
        ig_default, use_ab, alpha_default, epsilon, q_star_default
    )
    # load equiv test results; JSON object keys are strings holding edge counts
    with open(exp_path / "equiv_results.json", "r") as f:
        equiv_test_results: dict[str, dict] = json.load(f)
    # the largest tested edge count is used as the denominator
    total_edges = max(int(edge_count) for edge_count in equiv_test_results)
    # edge counts whose result is not flagged "not_equiv"
    equivs = [
        int(edge_count)
        for edge_count, result in equiv_test_results.items()
        if not result["not_equiv"]
    ]
    # fall back to the full edge count when no tested size passed
    min_equiv = min(equivs) if equivs else total_edges
    frac_total_edges[(task_key, use_ab, epsilon)][(ablation_type, answer_func)] = (
        min_equiv / total_edges, min_equiv, total_edges
    )


In [17]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

# y-axis scale handed to ax.set_yscale() when the figures are drawn
scale = 'log' 
# scale = 'linear'

def limit_image_size(fig, ax, max_size=65000):  # 2^16 - 1 = 65535, we use 65000 for safety
    """Shrink ``fig`` until its rendered canvas fits within ``max_size`` pixels.

    The figure is drawn, the canvas width/height are read, and while either
    exceeds ``max_size`` both figure dimensions (in inches) are scaled by 0.9
    and the check is repeated.

    Returns the same ``(fig, ax)`` pair that was passed in; ``ax`` is not touched.
    """
    def _fits():
        # Render first so the canvas reports the current size.
        fig.canvas.draw()
        canvas_w, canvas_h = fig.canvas.get_width_height()
        return max(canvas_w, canvas_h) <= max_size

    while not _fits():
        inches_w, inches_h = fig.get_size_inches()
        fig.set_size_inches(inches_w * 0.9, inches_h * 0.9)
    return fig, ax

# Preparing data for plotting: one bar per (ablation type, answer function) pair
ablation_answer_combinations = list(itertools.product(ablation_types, answer_funcs))

# Create one figure for each (use_ab, epsilon) setting; tasks are on the x-axis
for use_ab in use_abs:
    for epsilon in use_abs_to_epsilon[use_ab]:
        # Keys into frac_total_edges for the current use_ab and epsilon
        task_combos = [
            (task.key, use_ab, epsilon)
            for task in tasks
        ]
        n_tasks = len(task_combos)
        n_combinations = len(ablation_answer_combinations)

        fig, ax = plt.subplots(figsize=(16, 9), dpi=100)
        index = np.arange(n_tasks)
        # the bars of one task share 80% of the slot width
        bar_width = 0.8 / n_combinations
        opacity = 0.8

        for i, combination in enumerate(ablation_answer_combinations):
            # element 0 of each stored tuple is the fraction of edges
            values = [frac_total_edges[task_combo][combination][0] for task_combo in task_combos]

            # Centre the group of bars on the task's tick position
            position = index + (i - n_combinations/2 + 0.5) * bar_width

            ax.bar(position, values, bar_width,
                   alpha=opacity,
                   label=f'{combination[0]} & {combination[1]}')

        # Labels and titles
        ax.set_xlabel('Tasks')
        ax.set_ylabel('Fraction of Total Edges')
        ax.set_title(f'Fraction of Total Edges by Task (use_ab={use_ab}, epsilon={epsilon})')
        ax.set_xticks(index)
        ax.set_xticklabels([task_combo[0] for task_combo in task_combos], rotation=90, ha='center', fontsize=8)

        # Move the legend outside the plot
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., fontsize=8)

        ax.set_yscale(scale)

        # Adjust the layout to make room for the legend. The figure's own
        # methods are used so nothing depends on pyplot's "current figure".
        fig.tight_layout()
        fig.subplots_adjust(right=0.75)

        # Limit the image size
        fig, ax = limit_image_size(fig, ax)

        # Save the figure with controlled size, then release it
        fig.savefig(f'fraction_edges_use_ab_{use_ab}_epsilon_{epsilon}.png',
                    bbox_inches='tight',
                    dpi=100)
        plt.close(fig)

print("All figures have been generated and saved.")

All figures have been generated and saved.


## Minimality / Completeness Tradeoff