In [1]:
import torch as t
from llama_wrapper import LlamaWrapper
import os
from dotenv import load_dotenv
from matplotlib import pyplot as plt
from IPython.display import display, HTML
import matplotlib
from utils.tokenize import tokenize_llama_chat
from behaviors import SYCOPHANCY, get_steering_vector

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file (python-dotenv) before reading them below.
load_dotenv()

# Hugging Face access token; this is None when HF_TOKEN is not set in the environment.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN")
# Behavior whose steering vector is fetched via get_steering_vector later in the notebook.
BEHAVIOR = SYCOPHANCY

In [3]:
# Build the project's LlamaWrapper, passing the Hugging Face token read above.
# NOTE(review): the recorded run of this cell ended in a CUDA OutOfMemoryError
# (see the output below) — make sure the GPU has free memory before re-running.
model = LlamaWrapper(HUGGINGFACE_TOKEN)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB (GPU 0; 39.39 GiB total capacity; 12.11 GiB already allocated; 140.81 MiB free; 12.13 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Calculating dot products between steering vectors and activations

In [None]:
def value_to_color(value, cmap=plt.cm.RdBu, vmin=-25, vmax=25):
    """Translate a scalar into a hex colour string via a matplotlib colormap.

    Args:
        value: Number to colour.
        cmap: Colormap applied to the normalised value (RdBu by default).
        vmin: Value that is normalised to 0 before the colormap lookup.
        vmax: Value that is normalised to 1 before the colormap lookup.

    Returns:
        The colour as a hex string, as produced by ``matplotlib.colors.to_hex``.
    """
    # Linear rescaling of [vmin, vmax] onto [0, 1], followed by the colormap lookup.
    scale = plt.Normalize(vmin=vmin, vmax=vmax)
    return matplotlib.colors.to_hex(cmap(scale(value)))


def display_token_dot_products(data):
    """Render ``(token, value)`` pairs as colour-coded HTML spans.

    The colour scale is symmetric around zero and sized by the largest
    absolute value found in ``data``.

    Args:
        data: Sequence of ``(token, value)`` pairs. ``token`` is shown as text
            (HTML-escaped); ``value`` must support ``abs`` and ``:.4f`` formatting.
    """
    # Local import so this cell does not depend on edits to the import cell.
    from html import escape

    # `default=0` tolerates empty input. `or 1.0` avoids a zero-width range:
    # matplotlib's Normalize maps everything to 0 when vmin == vmax, which would
    # paint all-zero data with the extreme colour instead of the neutral midpoint.
    bound = max((abs(value) for _, value in data), default=0) or 1.0

    spans = []
    for token, value in data:
        color = value_to_color(value, vmin=-bound, vmax=bound)
        # Escape the token: text such as "<s>" would otherwise be parsed as an
        # HTML element (strike-through) instead of being displayed literally.
        spans.append(
            f"<span style='background-color: {color}; padding: 2px 5px; margin: 2px; border-radius: 3px;'>{escape(str(token))} ({value:.4f})</span>"
        )
    display(HTML("".join(spans)))
    
def display_token_dot_products_final_text(data, text, tokenizer):
    """Render dot-product values against the tokens of ``text`` as coloured HTML.

    ``text`` is re-tokenized with ``tokenizer`` and each token is decoded on its
    own; the i-th entry of ``data`` is paired with the i-th decoded token. The
    colour scale is symmetric around zero and sized by the largest absolute
    value in ``data``.

    Args:
        data: Sequence of ``(token, value)`` pairs; only ``value`` is used here.
        text: Text whose tokens label the values.
        tokenizer: Object providing ``encode(text)`` and ``batch_decode(tensor)``.

    Raises:
        IndexError: If ``data`` has more entries than ``text`` has tokens.
    """
    # Local import so this cell does not depend on edits to the import cell.
    from html import escape

    # `default=0` tolerates empty input. `or 1.0` avoids a zero-width range:
    # matplotlib's Normalize maps everything to 0 when vmin == vmax, which would
    # paint all-zero data with the extreme colour instead of the neutral midpoint.
    bound = max((abs(value) for _, value in data), default=0) or 1.0

    token_ids = tokenizer.encode(text)
    # unsqueeze(-1) turns the ids into one-token rows so each is decoded separately.
    tokens = tokenizer.batch_decode(t.tensor(token_ids).unsqueeze(-1))

    pieces = ["<div>"]
    for idx, (_, value) in enumerate(data):
        color = value_to_color(value, vmin=-bound, vmax=bound)
        # Escape the token: text such as "<s>" would otherwise be parsed as an
        # HTML element (strike-through) instead of being displayed literally.
        label = escape(tokens[idx].strip())
        pieces.append(
            f"<span style='background-color: {color}; padding: 2px 5px; margin: 2px; border-radius: 3px;'>{label} ({value:.4f})</span>"
        )
    pieces.append("</div>")
    display(HTML("".join(pieces)))

In [None]:
# Generate a completion while recording, at `layer`, the dot product between the
# behavior steering vector and the model's activations, then visualise it per token.
layer = 16
model.reset_all()  # presumably clears state configured by earlier cells — confirm in LlamaWrapper
max_new_tokens = 20
vec = get_steering_vector(BEHAVIOR, layer, model.model_name_path)
model.set_save_internal_decodings(False)
# The vector is moved to the GPU with .cuda() before being handed to the model.
model.set_calc_dot_product_with(layer, vec.cuda())
model_input = "If you disagree with a friend, should you tell them or stay quiet and keep the peace?"
m_out = model.generate_text(model_input, max_new_tokens=max_new_tokens)
# Keep only the text after the last "[/INST]" marker.
m_out_answer = m_out.split("[/INST]")[-1].strip()
print("Output:", m_out_answer)
dp = model.get_dot_products(layer)
# Disable strike-through styling for <del>/<s>/<strike> elements in the notebook output;
# presumably needed because tokens such as "<s>" are inserted into the HTML unescaped — TODO confirm.
display(HTML('<style>del, s, strike, .line-through { text-decoration: none !important; }</style>'))
display_token_dot_products(dp)

In [None]:
# Show the same dot products, labelled with the tokens of the generated answer text
# (the answer is re-tokenized inside the helper and paired with `dp` by position).
display_token_dot_products_final_text(dp, m_out_answer, model.tokenizer)

# Intermediate decoding ("logit lens")

In [None]:
def prompt_to_tokens(instruction, model_output):
    """Tokenize an instruction/response pair and return it with a leading batch axis.

    Uses the notebook-level ``model``'s tokenizer together with the project's
    ``tokenize_llama_chat`` helper.

    Args:
        instruction: Instruction text passed to ``tokenize_llama_chat``.
        model_output: Model-response text passed to ``tokenize_llama_chat``.

    Returns:
        A tensor built from the helper's result with a new dimension 0 prepended.
    """
    token_ids = tokenize_llama_chat(model.tokenizer, instruction, model_output)
    return t.unsqueeze(t.tensor(token_ids), 0)

In [None]:
# Decode the intermediate activations of every layer for a single instruction.
layer = 15
# A multiplier of 0 makes the vector added below all zeros.
multiplier = 0
# Pass BEHAVIOR first, matching the (behavior, layer, model_name_path) call used in
# the dot-product cell; the previous call omitted it, shifting every argument by one.
vec = get_steering_vector(BEHAVIOR, layer, model.model_name_path)
model.set_save_internal_decodings(True)
model.reset_all()  # presumably clears steering state left by earlier cells — confirm in LlamaWrapper
model.set_add_activations(layer, multiplier * vec.cuda())
instruction = "Is it more important to be nice or to be honest?"
# An empty model_output means only the instruction is tokenized into the chat prompt.
tokens = prompt_to_tokens(
    instruction=instruction,
    model_output="",
)
model.decode_all_layers(tokens)

In [None]:
# Steer with a negative multiple of the steering vector at `layer`, then plot decodings.
multiplier = -1.5
model.reset_all()  # presumably drops the previously added activations — confirm in LlamaWrapper
model.set_add_activations(layer, multiplier * vec.cuda())
# NOTE(review): 25 and 10 look like the layer to plot and the number of top tokens —
# confirm against LlamaWrapper.plot_decoded_activations_for_layer.
model.plot_decoded_activations_for_layer(25, tokens, 10)

In [None]:
# Steer with a positive multiple of the steering vector at `layer`, then plot decodings.
multiplier = 1.5
model.reset_all()  # presumably drops the previously added activations — confirm in LlamaWrapper
model.set_add_activations(layer, multiplier * vec.cuda())
# NOTE(review): 25 and 10 look like the layer to plot and the number of top tokens —
# confirm against LlamaWrapper.plot_decoded_activations_for_layer.
model.plot_decoded_activations_for_layer(25, tokens, 10)