# Demo (Assessing GCG Suffixes' Hijacking)

## Setup

In [None]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
# Default renderer used when figures are shown in this notebook
# ('html' is the alternative value the author noted).
pio.renderers.default = "svg"  # 'html'
# Default plotly template for figures in this notebook.
pio.templates.default = "plotly_white"

In [None]:
from src.interp.utils import load_model
from src.evaluate.utils import load_data

# Identifier of the model under analysis; the commented-out lines are the
# alternatives the author listed.
model_name = "google/gemma-2-2b-it"
# model_name = "qwen/qwen2.5-1.5b-instruct"
# model_name = "meta-llama/llama-3-8b-instruct"


# Data for the selected model. The cells below read it as a table with columns
# such as `suffix_id`, `message_id`, `univ_score` and `suffix_str`.
data_df = load_data(model_name=model_name)

# Model object used by the cells below (tokenizer access, `run_with_cache`,
# fine-grained attention hooks).
model = load_model(model_name)

## Load GCG Suffixes Dataset

In [None]:
# Quick summary of the loaded dataset.
print(f">> dataset size: {len(data_df)} rows.")
print(
    f">> {data_df['suffix_id'].nunique()} different GCG suffixes, "
    f"evaluated on {data_df['message_id'].nunique()} messages."
)

# One row per suffix: the first `univ_score` recorded for it, highest score first.
univ_scores = (
    data_df.groupby('suffix_id')['univ_score']
    .first()
    .sort_values(ascending=False)
    .reset_index()
)
# TODO fix
# NOTE(review): the `labels` key below ('universal_score') does not match the
# plotted column ('univ_score') -- possibly part of what needs fixing.
# px.bar(
#     univ_scores,
#     x='univ_score',
#     title=f"GCG suffixes' Universal Scores ({model_name})",
#     labels={'universal_score': 'Universal Score'},
# ).show()

## Extract Transformed Vectors
We extract the fine-grained components of the attention computation in order to inspect the information flow.

In [None]:
from src.interp.utils import to_toks


# Tokenize an example instruction.
# NOTE(review): assumes `to_toks` returns a pair whose first element is the
# token tensor with a leading batch dimension -- confirm against
# src.interp.utils. The squeeze is applied to that tensor after unpacking,
# not to the returned pair itself.
toks, _ = to_toks("Write a parody song based on The Beatles' 'Strawberry Fields'", model)
toks = toks.squeeze(0)  # drop the leading dimension before decoding
print(model.tokenizer.decode(toks))

NameError: name 'model' is not defined

In [None]:
# Temporarily enable the fine-grained attention hooks (this is memory
# consuming, but required for our analysis).
model.set_use_attn_fine_grained(True)
try:
    _, cache = model.run_with_cache(toks)
    cache.to('cpu')  # presumably moves the cached activations to the CPU -- confirm
finally:
    # Always switch the hooks back off, even if the forward pass fails, so
    # later cells are not left running in the memory-hungry mode.
    model.set_use_attn_fine_grained(False)

# The decomposed attention outputs (= transformed vectors) tensor:
cache['Y'].shape  # (n_layers, n_head, dst_seq, src_seq, d_model)

### Dominance score

Quantifying the dominance score across the layers (reproducing Fig. 5 in the paper).

In [None]:
def show_dominance_area_plot(dom_scores, name=None):
    """Show an area chart of per-layer dominance scores, one band per source.

    Args:
        dom_scores: Mapping from a source-subsequence name (e.g. ``'instr'``,
            ``'adv'``) to a sequence of per-layer scores. Each score may be a
            scalar exposing ``.item()`` (such as a 0-d tensor) or a plain number.
        name: Optional label appended in parentheses to the figure title.

    Returns:
        None. The figure is displayed via ``.show()``.
    """
    # Long-format table: one row per (source subsequence, layer) pair.
    rows = [
        {
            'src_name': src_name,
            'layer': layer,
            # Unwrap scalar tensors/arrays; pass plain numbers through as-is.
            'val': val.item() if hasattr(val, 'item') else val,
        }
        for src_name, values in dom_scores.items()
        for layer, val in enumerate(values)
    ]
    df = pd.DataFrame(rows)

    # Fixed colour per source subsequence.
    colors = {
        'bos': '#A9A9A9',        # Dark grey
        'chat_pre': '#D3D3D3',   # Light grey
        'instr': '#4B77BE',      # Blue
        'adv': "#E8362D",        # Red
        'chat[:-1]': '#FFA500',  # Orange
        'chat[-1]': '#FFB84D',   # Light orange (subtler)
    }

    title = "Dominance Scores to 'chat[-1]'"
    if name:
        title += f' ({name})'

    px.area(
        df,
        x='layer',
        y='val',
        color='src_name',
        labels={'val': 'Dominance Score', 'layer': 'Layer', 'src_name': 'Source Subseq.'},
        color_discrete_map=colors,
        width=500, height=350,
        template='plotly_white',
        title=title,
    ).show()

In [None]:
# from src.interp.dominance_tools import get_adv_hijack_score
import src.interp.dominance_tools as dominance_tools
from src.interp.dominance_tools import get_dominance_scores

# Example inputs: the message with id 619, the first suffix in the 'init'
# category (plotted as the "random suffix"), and the first suffix with rank 0
# (plotted as the "GCG suffix").
message = data_df.loc[data_df.message_id == 619, 'message_str'].iloc[0]
suffix_rand = data_df.loc[data_df.suffix_category == 'init', 'suffix_str'].iloc[0]
suffix_gcg = data_df.loc[data_df.suffix_rank == 0, 'suffix_str'].iloc[0]

for suffix_name, suffix_str in {'random suffix': suffix_rand, 'GCG suffix': suffix_gcg}.items():
    # `get_dominance_scores` wraps the extraction of 'Y' and the calculation of dominance.
    dom_scores = get_dominance_scores(
        model,
        message,
        suffix_str,
        dst_slc_name='chat[-1]',
        # Other metrics listed by the author: 'Y@resid', 'X@WVO@attn',
        # 'Y@dcmp_resid', 'Y@dir' (the original list appears to be cut off here).
        hijacking_metric='Y@attn',
        hijacking_metric_flavor='sum',  # alternative: 'sum-top0.1'
    )

    show_dominance_area_plot(dom_scores, name=suffix_name)