# Setup

In [1]:
# Setup
from neel.imports import *
from neel_plotly import *

In IPython
In IPython
Set autoreload
Imported everything!


In [2]:
import neel.utils as nutils

torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f833942c7c0>

In [3]:
n_ctx = 256
model = HookedTransformer.from_pretrained("pythia-70m")
dataset = load_dataset("NeelNanda/pile-10k", split="train")
token_dataset = utils.tokenize_and_concatenate(
    dataset, model.tokenizer, max_length=n_ctx
).shuffle(42)
pile_tokens = token_dataset["tokens"].cuda()
print(f"pile_tokens.shape: {pile_tokens.shape}")
print(f"pile_tokens first: {model.to_string(pile_tokens[0, :30])}")

W_OU = einops.einsum(
    model.W_O,
    model.W_U,
    "layer head d_head d_model, d_model d_vocab -> layer head d_head d_vocab",
)
print("W_OU.shape:", W_OU.shape)

n_layers = model.cfg.n_layers
# n_ctx = model.cfg.n_ctx
n_heads = model.cfg.n_heads
d_model = model.cfg.d_model
d_vocab = model.cfg.d_vocab
d_head = model.cfg.d_head
d_mlp = model.cfg.d_mlp

Using pad_token, but it is not set yet.


Loaded pretrained model pythia-70m into HookedTransformer


Found cached dataset parquet (/workspace/cache/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /workspace/cache/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-7cf6e702038e0709_*_of_00010.arrow
Loading cached shuffled indices for dataset at /workspace/cache/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-fa8114ce633c3f97.arrow


pile_tokens.shape: torch.Size([60194, 256])
pile_tokens first: <|endoftext|>
            next = new CssToken(this.index, this.column, this.line, CssTokenType.EOF, '
W_OU.shape: torch.Size([6, 8, 64, 50304])


## Utils

In [4]:
# Utils

SPACE = "·"
NEWLINE = "↩"
TAB = "→"


def process_token(s):
    if isinstance(s, torch.Tensor):
        s = s.item()
    if isinstance(s, np.int64):
        s = s.item()
    if isinstance(s, int):
        s = model.to_string(s)
    s = s.replace(" ", SPACE)
    s = s.replace("\n", NEWLINE + "\n")
    s = s.replace("\t", TAB)
    return s


process_tokens = lambda l: [process_token(s) for s in l]
process_tokens_index = lambda l: [f"{process_token(s)}/{i}" for i, s in enumerate(l)]


def create_vocab_df(logit_vec, make_probs=False, full_vocab=None):
    if full_vocab is None:
        full_vocab = process_tokens(
            model.to_str_tokens(torch.arange(model.cfg.d_vocab))
        )
    vocab_df = pd.DataFrame({"token": full_vocab, "logit": to_numpy(logit_vec)})
    if make_probs:
        vocab_df["log_prob"] = to_numpy(logit_vec.log_softmax(dim=-1))
        vocab_df["prob"] = to_numpy(logit_vec.softmax(dim=-1))
    return vocab_df.sort_values("logit", ascending=False)


from html import escape
import colorsys

from IPython.display import display


def create_html(strings, values, saturation=0.5):
    # escape strings to deal with tabs, newlines, etc.
    escaped_strings = [escape(s, quote=True) for s in strings]
    processed_strings = [
        s.replace("\n", "<br/>").replace("\t", "&emsp;").replace(" ", "&nbsp;")
        for s in escaped_strings
    ]

    # scale values
    max_value = max(max(values), -min(values))
    scaled_values = [v / max_value * saturation for v in values]

    # create html
    html = ""
    for s, v in zip(processed_strings, scaled_values):
        if v < 0:
            hue = 0  # hue for red in HSV
        else:
            hue = 0.66  # hue for blue in HSV
        rgb_color = colorsys.hsv_to_rgb(
            hue, v, 1
        )  # hsv color with hue 0.66 (blue), saturation as v, value 1
        hex_color = "#%02x%02x%02x" % (
            int(rgb_color[0] * 255),
            int(rgb_color[1] * 255),
            int(rgb_color[2] * 255),
        )
        html += f'<span style="background-color: {hex_color}; border: 1px solid lightgray; font-size: 16px; border-radius: 3px;">{s}</span>'

    display(HTML(html))


s = create_html(["a", "b\nd", "c        d"], [1, -2, -3])

def add_to_df(df, name, tensor):
    df[name] = to_numpy(tensor.flatten())
    return df

## Metrics

In [5]:
# Mover metrics
def get_head_dla(cache: ActivationCache, tokens: torch.Tensor):
    z = cache.stack_activation("z")
    z = z[:, :, :-1, :, :]
    # print("z.shape", z.shape) # [layer, batch, pos, head, d_head]
    W_OU_tokens = W_OU[:, :, :, tokens[:, 1:]]
    W_OU_tokens_scaled = W_OU_tokens / cache["scale"][:, :-1, 0]

    # print("W_OU_tokens_scaled.shape", W_OU_tokens_scaled.shape) # [layer head d_head batch pos]
    head_dla = einops.einsum(
        z,
        W_OU_tokens_scaled,
        "layer batch pos head d_head, layer head d_head batch pos -> layer head batch pos",
    )
    return head_dla


def max_prev_attended_to_token(cache):
    patterns = cache.stack_activation("pattern")
    # print("patterns.shape", patterns.shape) #[layer, batch head dest_pos src_pos]
    max_with_bos, argmax_with_bos = patterns.max(dim=-1)
    argmax_with_bos = einops.rearrange(
        argmax_with_bos, "layer batch head dest_pos -> layer head batch dest_pos"
    )[..., :-1]
    max_with_bos = einops.rearrange(
        max_with_bos, "layer batch head dest_pos -> layer head batch dest_pos"
    )[..., :-1]
    patterns[:, :, :, :, 0] = 0.0
    max_without_bos, argmax_without_bos = patterns.max(dim=-1)
    argmax_without_bos = einops.rearrange(
        argmax_without_bos, "layer batch head dest_pos -> layer head batch dest_pos"
    )[..., :-1]
    max_without_bos = einops.rearrange(
        max_without_bos, "layer batch head dest_pos -> layer head batch dest_pos"
    )[..., :-1]

    return argmax_with_bos, argmax_without_bos, max_with_bos, max_without_bos


def argmax_attn_to_token(argmax_token_index, tokens):
    predicted_token = torch.stack(
        [tokens[i][argmax_token_index[:, :, i, :]] for i in range(len(tokens))]
    )
    return predicted_token


def get_mover_attn_score(cache: ActivationCache, tokens: torch.Tensor):
    """
    Return the average attention paid to copies of the next token
    """
    if len(tokens.shape) == 1:
        tokens = tokens[None, :]

    patterns = cache.stack_activation("pattern")
    is_next_token = tokens[:, 1:, None] == tokens[:, None, :-1]
    filtered_pattern = patterns[:, :, :, :-1, :-1] * is_next_token[None, :, None, :, :]
    return einops.reduce(
        filtered_pattern,
        "layer batch head dest_pos src_pos -> layer head batch dest_pos",
        "sum",
    )


def get_mover_dla_score(cache: ActivationCache, tokens: torch.Tensor):
    """
    Return the average attention paid to copies of the next token
    """
    if len(tokens.shape) == 1:
        tokens = tokens[None, :]

    patterns = cache.stack_activation("pattern")
    is_next_token = tokens[:, 1:, None] == tokens[:, None, :-1]
    filtered_pattern = patterns[:, :, :, :-1, :-1] * is_next_token[None, :, None, :, :]
    v = cache.stack_activation("v")[:, :, :-1, :, :]
    filtered_z = einops.einsum(
        v,
        filtered_pattern,
        "layer batch src_pos head d_head, layer batch head dest_pos src_pos -> layer batch dest_pos head d_head",
    )

    # print("z.shape", z.shape) # [layer, batch, pos, head, d_head]
    W_OU_tokens = W_OU[:, :, :, tokens[:, 1:]]
    W_OU_tokens_scaled = W_OU_tokens / cache["scale"][:, :-1, 0]

    # print("W_OU_tokens_scaled.shape", W_OU_tokens_scaled.shape) # [layer head d_head batch pos]
    # print(filtered_z.shape)
    # print(W_OU_tokens_scaled.shape)
    mover_head_dla = einops.einsum(
        filtered_z,
        W_OU_tokens_scaled,
        "layer batch pos head d_head, layer head d_head batch pos -> layer head batch pos",
    )
    return mover_head_dla

# Gathering Data

## Running the Model

In [6]:
# Actually running the model on the data distribution
torch.set_grad_enabled(False)
num_prompts = 288
batch_size = 32
head_dla_list = []
mover_attn_score_list = []
mover_dla_score_list = []
plps_list = []

for i in tqdm.tqdm(range(0, num_prompts, batch_size)):
    tokens = pile_tokens[i : i + batch_size]
    logits, cache = model.run_with_cache(tokens)
    plps_list.append(model.loss_fn(logits, tokens, per_token=True))
    head_dla = get_head_dla(cache, tokens)
    head_dla_list.append(head_dla)
    mover_attn_score_list.append(get_mover_attn_score(cache, tokens))
    mover_dla_score_list.append(get_mover_dla_score(cache, tokens))

  0%|          | 0/9 [00:00<?, ?it/s]

In [7]:
plps = torch.cat(plps_list, dim=0)
head_dla = torch.cat(head_dla_list, dim=2)
print(head_dla.shape)
mover_attn_score = torch.cat(mover_attn_score_list, dim=2)
mover_dla_score = torch.cat(mover_dla_score_list, dim=2)

head_dla_flat = einops.rearrange(
    head_dla, "layer head batch pos -> (layer head) (batch pos)"
)
mover_attn_flat = einops.rearrange(
    mover_attn_score, "layer head batch pos -> (layer head) (batch pos)"
)
mover_dla_flat = einops.rearrange(
    mover_dla_score, "layer head batch pos -> (layer head) (batch pos)"
)
ratio_dla_flat = mover_dla_flat/head_dla_flat

torch.Size([6, 8, 288, 255])


## Mover Scores

In [8]:
head_df = pd.DataFrame(
    {
        "L": [l for l in range(n_layers) for h in range(n_heads)],
        "H": [h for l in range(n_layers) for h in range(n_heads)],
        "label": model.all_head_labels(),
    }
)

add_to_df(head_df, "head_dla", head_dla_flat.quantile(0.9, dim=-1))
add_to_df(head_df, "mover_attn", mover_attn_flat.quantile(0.9, dim=-1))
add_to_df(head_df, "mover_dla", mover_dla_flat.quantile(0.9, dim=-1))
add_to_df(head_df, "dla_ratio", (mover_dla_flat / head_dla_flat).quantile(0.9, dim=-1))
head_df.style.background_gradient("coolwarm")

Unnamed: 0,L,H,label,head_dla,mover_attn,mover_dla,dla_ratio
0,0,0,L0H0,0.605073,0.050991,0.001529,0.085803
1,0,1,L0H1,0.276486,0.014121,1e-06,0.009107
2,0,2,L0H2,0.204037,0.033527,0.00183,0.060175
3,0,3,L0H3,0.459578,0.057393,0.000526,0.081572
4,0,4,L0H4,0.274876,0.051292,0.002235,0.036185
5,0,5,L0H5,0.323892,0.048746,6e-06,0.064998
6,0,6,L0H6,0.802821,0.034919,0.023494,0.06464
7,0,7,L0H7,0.375892,0.043864,0.00102,0.044175
8,1,0,L1H0,0.121158,0.060407,0.002893,0.074615
9,1,1,L1H1,0.231586,0.019694,0.000751,0.005958


## Induction

In [9]:
rand_tokens_vocab = torch.tensor([i for i in range(1000, 10000) if "  " not in model.to_string(i)]).cuda()

batch_size = 16
ind_seq_len = 200
random_tokens = rand_tokens_vocab[torch.randint(0, len(rand_tokens_vocab), (batch_size, ind_seq_len))]
bos_tokens = torch.full(
    (batch_size, 1), model.tokenizer.bos_token_id, dtype=torch.long
).cuda()
ind_tokens = torch.cat([bos_tokens, random_tokens, random_tokens], dim=1)
print("ind_tokens.shape", ind_tokens.shape)
_, ind_cache = model.run_with_cache(ind_tokens)

ind_head_scores = einops.reduce(
    ind_cache.stack_activation("pattern").diagonal(ind_seq_len-1, -1, -2),
    "layer batch head diag_pos -> layer head", "mean")
imshow(ind_head_scores, xaxis="Head", yaxis="Layer", title="Induction Head Scores")
_ = add_to_df(head_df, "induction", ind_head_scores)

ind_tokens.shape torch.Size([16, 401])


## Copying

In [10]:
embed = model.W_E
# line(embed[15])
post_mlp_embed = model.blocks[0].mlp(model.blocks[0].ln2(embed[None])).squeeze(0) + embed

In [11]:
eigenvals_resid = model.OV.eigenvalues
eigenvals_resid_score = ((eigenvals_resid.sum(-1)/eigenvals_resid.abs().sum(-1)).real)
imshow(eigenvals_resid_score)
eigenvals_vocab = (post_mlp_embed @ model.OV @ model.W_U).eigenvalues
eigenvals_vocab_score = ((eigenvals_vocab.sum(-1)/eigenvals_vocab.abs().sum(-1)).real)
imshow(eigenvals_vocab_score)

add_to_df(head_df, "eigenvals_vocab", eigenvals_vocab_score)
_ = add_to_df(head_df, "eigenvals_resid", eigenvals_resid_score)

### Making full matrix

In [12]:
factored_full_OV = post_mlp_embed @ model.OV @ model.W_U
def get_head_argmax_value(layer, head, factored_full_OV):
    copy_scores = factored_full_OV[layer, head].AB
    return (torch.arange(d_vocab).cuda() == copy_scores.argmax(dim=-1)).float().mean()
head_df["frac_copy_argmax"] = [get_head_argmax_value(l, h, factored_full_OV).item() for l in range(n_layers) for h in range(n_heads)]


## Dataset Filtering

In [13]:
token_subset = pile_tokens[:num_prompts]
num_copies = torch.tril(token_subset[:, :, None] == token_subset[:, None, :], -1).sum(-1)[:, 1:]
is_movable = num_copies > 0
imshow(is_movable[:20], title="Is Movable", xaxis="Pos", yaxis="Prompt")
imshow(num_copies[:20], title="Num Copies", xaxis="Pos", yaxis="Prompt")
print("Frac Movable:", is_movable.float().mean().item())

Frac Movable: 0.45683553814888


### Induction Mask

In [14]:
# head_df.to_csv("head_df.csv")
def make_induction_mask(tokens):
    equality_check = tokens[:-1, None] == tokens[None, :-1]
    next_equality_check = tokens[1:, None] == tokens[None, 1:]
    return torch.tril(equality_check * next_equality_check, diagonal=-1).any(dim=-1)
make_induction_mask = torch.vmap(make_induction_mask)
induction_mask = make_induction_mask(token_subset)
print(induction_mask.shape, induction_mask.size())

torch.Size([288, 255]) torch.Size([288, 255])


## No Induction Metrics

In [57]:
q = 0.95
add_to_df(head_df, "head_dla_no_ind", head_dla_flat[:, ~induction_mask.flatten()].quantile(q, dim=-1))
add_to_df(head_df, "mover_attn_no_ind", mover_attn_flat[:, ~induction_mask.flatten()].quantile(q, dim=-1))
add_to_df(head_df, "mover_dla_no_ind", mover_dla_flat[:, ~induction_mask.flatten()].quantile(q, dim=-1))
_ = add_to_df(head_df, "ratio_dla_no_ind", ratio_dla_flat[:, ~induction_mask.flatten()].quantile(q, dim=-1))
nutils.show_df(head_df.sort_values("mover_dla_no_ind", ascending=False))

Unnamed: 0,L,H,label,head_dla,mover_attn,mover_dla,dla_ratio,induction,eigenvals_vocab,eigenvals_resid,frac_copy_argmax,head_dla_no_ind,mover_attn_no_ind,mover_dla_no_ind,ratio_dla_no_ind
27,3,3,L3H3,1.398755,0.192295,0.700423,0.909874,0.086132,0.997116,0.499663,0.510715,1.381732,0.109121,0.28555,0.719796
34,4,2,L4H2,0.263833,0.279648,0.095778,0.87242,0.152611,0.975877,-0.095355,0.061645,0.332436,0.241478,0.087197,0.876902
21,2,5,L2H5,0.503476,0.086518,0.061412,0.312323,0.005849,0.88984,0.41471,0.007216,0.633607,0.075858,0.053294,0.317218
24,3,0,L3H0,0.422878,0.419968,0.221624,0.986917,0.394634,0.926991,-0.230723,0.014015,0.410476,0.179359,0.050435,0.551374
6,0,6,L0H6,0.802821,0.034919,0.023494,0.06464,0.002228,0.082615,0.047044,8e-05,0.953515,0.043432,0.032151,0.090982
22,2,6,L2H6,0.152204,0.054904,0.004447,0.13019,0.004179,-0.331185,-0.915511,0.0,0.204665,0.100016,0.020452,0.361296
35,4,3,L4H3,0.308438,0.147615,0.025071,0.461939,0.044634,0.953155,-0.000366,0.012265,0.414444,0.073297,0.016718,0.25461
31,3,7,L3H7,0.316495,0.064495,0.005555,0.086161,0.00233,-0.181161,0.50841,9.9e-05,0.472619,0.087856,0.012291,0.157003
3,0,3,L0H3,0.459578,0.057393,0.000526,0.081572,0.000753,-0.042021,-0.03609,4e-05,0.578659,0.119415,0.011898,0.17022
26,3,2,L3H2,0.723264,0.06484,0.00461,0.040847,0.000163,0.193213,0.380332,0.000318,1.190833,0.104875,0.010873,0.084354


In [59]:
# head_df.to_csv("head_df.csv")

# Data Exploration

In [15]:
# Exploration!
head_labels = model.all_head_labels()
quantiles = torch.tensor(
    [0.0, 1e-3, 1e-2, 5e-2, 1e-1, 0.25, 0.5, 0.75, 9e-1, 95e-2, 99e-2, 999e-3, 1.0]
).cuda()
quantile_labels = [
    "0.0",
    "1e-3",
    "1e-2",
    "5e-2",
    "1e-1",
    "0.25",
    "0.5",
    "0.75",
    "9e-1",
    "95e-2",
    "99e-2",
    "999e-3",
    "1.0",
]

line(
    head_dla_flat.quantile(q=quantiles, dim=-1),
    x=head_labels,
    line_labels=quantile_labels,
    range_y=(-5, 8),
    title="Head DLA",
)

line(
    mover_attn_flat.quantile(q=quantiles, dim=-1),
    x=head_labels,
    line_labels=quantile_labels,
    # range_y=(-5, 8),
    title="Mover Attn Scores",
)

line(
    mover_dla_flat.quantile(q=quantiles, dim=-1),
    x=head_labels,
    line_labels=quantile_labels,
    range_y=(-5, 8),
    title="Mover dla Scores",
)

line(
    (mover_dla_flat/head_dla_flat).quantile(q=quantiles, dim=-1),
    x=head_labels,
    line_labels=quantile_labels,
    range_y=(-5, 8),
    title="Ratio dla Scores",
)
line(
    [mover_dla_flat.quantile(0.9, dim=-1)/head_dla_flat.quantile(0.9, dim=-1),
     (mover_dla_flat/head_dla_flat).quantile(0.9, -1)],
    x=head_labels,
    range_y=(-5, 8),
    title="Mover DLA / Head DLA",
)

In [16]:
scatter(x=mover_dla_flat.quantile(0.9, dim=-1), y=mover_attn_flat.quantile(0.9, dim=-1), color=head_dla_flat.quantile(0.9, dim=-1), hover=head_labels, xaxis="Mover DLA", yaxis="Mover Attn")
scatter(x=mover_dla_flat.quantile(0.9, dim=-1), y=head_dla_flat.quantile(0.9, dim=-1), color=mover_attn_flat.quantile(0.9, dim=-1), hover=head_labels, xaxis="Mover DLA", yaxis="Head DLA", include_diag=True)

In [17]:
# Debugging specific inputs:
def decompose_token_index(token_index):
    if not isinstance(token_index, int):
        token_index = token_index.item()
    return token_index//n_ctx, token_index % n_ctx

layer = 5
head = 1
full_token_index = head_dla[5, 1].flatten().argmin()
batch_index, pos = decompose_token_index(full_token_index)

tokens = pile_tokens[batch_index, :pos+1]
print(model.to_string(tokens))
print(tokens[-8:])
print(model.to_str_tokens(tokens[-8:]))
print(model.to_string(tokens[-8:]))

logits, cache = model.run_with_cache(tokens)
print(logits.log_softmax(-1)[0, -5:, 138])

<|endoftext|> diastereomeric pairs i.e. a mixture of compounds (9 A) and its mirror image (9 B); (9 C) and its mirror image (9 D). The diastereomeric pair is separated and resolved as described in U.S. Pat. No. 4,873,356 to give the pure isomer (9 B).
U.S. Pat. No. 5,008,399 claims that by utilising the organic bases mentioned therein an increase in diastereoselectivity is achieved affording the racemic mixture of compounds (9 A
tensor([ 7393, 11060,  7802,   273,  7006,   313,    26,   329],
       device='cuda:0')
[' rac', 'emic', ' mixture', ' of', ' compounds', ' (', '9', ' A']
 racemic mixture of compounds (9 A
tensor([-17.8893, -17.1315, -12.6377, -16.9155, -15.3675], device='cuda:0')


In [18]:
print(logits.log_softmax(-1)[0, -5:, 138])

tensor([-17.8893, -17.1315, -12.6377, -16.9155, -15.3675], device='cuda:0')


In [19]:
px.scatter(head_df, x="induction", y="mover_dla", hover_data=["L", "H"], color="head_dla", trendline="ols", color_continuous_scale="Portland", title="How Induction-y are my found heads?").show()
px.scatter(head_df, x="induction", y="mover_attn", hover_data=["L", "H"], color="mover_dla", color_continuous_scale="Portland", title="How Induction-y are my found heads?").show()

In [20]:
# Analysing specific heads
def plot_subsample(x, y, hover=None, num=1000, **kwargs):
    indices = torch.randint(0, len(x), (num,))
    if hover is not None:
        hover = [hover[i.item()] for i in indices]
    return scatter(x[indices], y[indices], hover=hover, **kwargs)
    

traces = []
titles = []
filt_df = head_df[head_df.mover_dla>0.05]
for row in filt_df.iterrows():
    row = row[1]
    layer = row.L
    head = row.H
    label = f"L{layer}H{head}"
    traces.append(plot_subsample(
        x=head_dla[layer, head].flatten(),
        y=mover_dla_score[layer, head].flatten(),
        include_diag=True,
        hover=[f"{p}/{b}" for b in range(num_prompts) for p in range(n_ctx-1)],
        title=f"{label} Mover DLA vs Head DLA",
        xaxis="Head DLA",
        yaxis="Mover DLA",
        opacity=0.5,
        return_fig=True
            ).data[0])
    titles.append(label)
xaxis = "Head DLA"
yaxis = "Mover DLA"
fig = make_subplots(rows=1, cols=len(filt_df), subplot_titles=titles)
fig.update_layout(title="Scatter plot of head DLA vs mover DLA", height=300, xaxis_title="St", **{f"xaxis{x}_title": xaxis for x in range(1, 1+len(filt_df))}, **{f"yaxis{y}_title": yaxis for y in range(1, 2)})
for i, trace in enumerate(traces):
    fig.add_trace(trace, row=1, col=i+1)
fig

In [21]:
utils.test_prompt("When John and Mary went to the store, John gave the bag to his friend", "Mary", model)

Tokenized prompt: ['<|endoftext|>', 'When', ' John', ' and', ' Mary', ' went', ' to', ' the', ' store', ',', ' John', ' gave', ' the', ' bag', ' to', ' his', ' friend']
Tokenized answer: [' Mary']


Top 0th token. Logit: 21.60 Prob: 27.10% Token: |,|
Top 1th token. Logit: 20.92 Prob: 13.78% Token: | and|
Top 2th token. Logit: 20.90 Prob: 13.50% Token: |.|
Top 3th token. Logit: 19.63 Prob:  3.80% Token: | to|
Top 4th token. Logit: 19.32 Prob:  2.79% Token: | who|
Top 5th token. Logit: 19.19 Prob:  2.44% Token: | in|
Top 6th token. Logit: 18.93 Prob:  1.88% Token: |'s|
Top 7th token. Logit: 18.69 Prob:  1.48% Token: | for|
Top 8th token. Logit: 18.32 Prob:  1.02% Token: |’|
Top 9th token. Logit: 18.12 Prob:  0.84% Token: | at|


In [30]:
# Exploration!
head_labels = model.all_head_labels()
quantiles = torch.tensor(
    [0.0, 1e-3, 1e-2, 5e-2, 1e-1, 0.25, 0.5, 0.75, 9e-1, 95e-2, 99e-2, 999e-3, 1.0]
).cuda()
quantile_labels = [
    "0.0",
    "1e-3",
    "1e-2",
    "5e-2",
    "1e-1",
    "0.25",
    "0.5",
    "0.75",
    "9e-1",
    "95e-2",
    "99e-2",
    "999e-3",
    "1.0",
]
head_dla_flat_no_ind = einops.rearrange(
    head_dla, "layer head batch pos -> (layer head) (batch pos)"
)[:, ~induction_mask.flatten()]
line(
    head_dla_flat_no_ind.quantile(q=quantiles, dim=-1),
    x=head_labels,
    line_labels=quantile_labels,
    range_y=(-5, 8),
    title="Head DLA",
)
mover_attn_flat_no_ind = einops.rearrange(
    mover_attn_score, "layer head batch pos -> (layer head) (batch pos)"
)[:, ~induction_mask.flatten()]
line(
    mover_attn_flat_no_ind.quantile(q=quantiles, dim=-1),
    x=head_labels,
    line_labels=quantile_labels,
    # range_y=(-5, 8),
    title="Mover Attn Scores",
)
mover_dla_flat_no_ind = einops.rearrange(
    mover_dla_score, "layer head batch pos -> (layer head) (batch pos)"
)[:, ~induction_mask.flatten()]
line(
    mover_dla_flat_no_ind.quantile(q=quantiles, dim=-1),
    x=head_labels,
    line_labels=quantile_labels,
    range_y=(-5, 8),
    title="Mover dla Scores",
)
ratio_dla_flat_no_ind = mover_dla_flat_no_ind/head_dla_flat_no_ind

line(
    (ratio_dla_flat_no_ind).quantile(q=quantiles, dim=-1),
    x=head_labels,
    line_labels=quantile_labels,
    range_y=(-1, 2),
    title="Ratio dla Scores",
)

In [53]:
q = torch.tensor([0.8, 0.9, 0.95, 0.99]).cuda()
scatter(height=400,y=head_dla_flat_no_ind.quantile(q=q, dim=-1), x=head_dla_flat.quantile(q=q, dim=-1), hover=head_labels, title=f"Head DLA at quantiles", yaxis="head_dla w/o Ind", xaxis="head_dla w/ Ind", color=head_df.induction.values, include_diag=True, facet_col=0, facet_labels=[f"{i.item():.3f}" for i in q])
scatter(height=400,y=mover_attn_flat_no_ind.quantile(q=q, dim=-1), x=mover_attn_flat.quantile(q=q, dim=-1), hover=head_labels, title=f"Mover Attn at quantiles", yaxis="mover_attn_score w/o Ind", xaxis="mover_attn_score w/ Ind", color=head_df.induction.values, include_diag=True, facet_col=0, facet_labels=[f"{i.item():.3f}" for i in q])
scatter(height=400,y=mover_dla_flat_no_ind.quantile(q=q, dim=-1), x=mover_dla_flat.quantile(q=q, dim=-1), hover=head_labels, title=f"Mover DLA at quantiles", yaxis="mover_dla_score w/o Ind", xaxis="mover_dla_score w/ Ind", color=head_df.induction.values, include_diag=True, facet_col=0, facet_labels=[f"{i.item():.3f}" for i in q])
scatter(height=400,y=ratio_dla_flat_no_ind.quantile(q=q, dim=-1), x=ratio_dla_flat.quantile(q=q, dim=-1), hover=head_labels, title=f"Ratio DLA at quantiles", yaxis="ratio_dla w/o Ind", xaxis="ratio_dla w/ Ind", color=head_df.induction.values, include_diag=True, facet_col=0, facet_labels=[f"{i.item():.3f}" for i in q])

# Exploring Dataset Examples

In [66]:
layer = 3
head = 3
head_mover_dla = mover_dla_flat[3*n_heads+3] * induction_mask.flatten()
ind = head_mover_dla.argsort(descending=True)
line(head_mover_dla[ind], x=[f"{i.item() % n_ctx}/{i.item()//n_ctx}" for i in ind])

In [72]:
batch_index = 117
pos = 196
tokens = pile_tokens[batch_index, :pos+1]
utils.test_prompt(model.to_string(tokens[:-1]), model.to_string(tokens[-1]), model, prepend_space_to_answer=False, prepend_bos=False)


Tokenized prompt: ['<|endoftext|>', '2', '}', '3', '$,', ' $', '1', '^{', '7', '}', '23', '$,', ' $', '1', '^{', '6', '}', '2', '^{', '2', '}', '3', '$,', ' $', '1', '^{', '7', '}', '2', '^{', '2', '}', '3', '$,', ' $', '1', '^{', '8', '}', '2', '^{', '2', '}', '3', '$,', ' $', '1', '^{', '9', '}', '2', '^{', '2', '}', '3', '$,', ' $', '1', '^{', '10', '}', '2', '^{', '2', '}', '3', '$,', ' $', '1', '^{', '9', '}', '2', '^{', '3', '}', '3', '$,', ' $', '1', '^{', '10', '}', '2', '^{', '3', '}', '3', '$,', ' $', '1', '^{', '11', '}', '2', '^{', '3', '}', '3', '$,', ' $', '1', '^{', '10', '}', '2', '^{', '3', '}', '3', '^{', '2', '}$,', ' $', '1', '^{', '11', '}', '2', '^{', '3', '}', '3', '^{', '2', '}$,', ' $', '1', '^{', '12', '}', '2', '^{', '3', '}', '3', '^{', '2', '}$\\', '\n', 'N', 'r', '.', ' $', '53', '$', ' with', ' $', '31', '$', ' positive', ' roots', ':\\', '\n', '$', '1', '$,', ' $', '2', '$,', ' $', '3', '$,', ' $', '12', '$,', ' $', '13', '$,', ' $', '1', '^{', '2', '}',

Top 0th token. Logit: 24.26 Prob: 14.00% Token: |12|
Top 1th token. Logit: 24.14 Prob: 12.45% Token: |10|
Top 2th token. Logit: 23.86 Prob:  9.36% Token: |11|
Top 3th token. Logit: 23.84 Prob:  9.20% Token: |9|
Top 4th token. Logit: 23.47 Prob:  6.38% Token: |8|
Top 5th token. Logit: 23.18 Prob:  4.77% Token: |3|
Top 6th token. Logit: 23.12 Prob:  4.47% Token: |7|
Top 7th token. Logit: 23.11 Prob:  4.45% Token: |13|
Top 8th token. Logit: 23.08 Prob:  4.29% Token: |4|
Top 9th token. Logit: 23.03 Prob:  4.10% Token: |6|


In [71]:
logits, cache = model.run_with_cache(tokens)
plps = model.loss_fn(logits, tokens[None], True)
create_html(model.to_str_tokens(tokens[1:]), plps[0])

In [79]:
print(mover_dla_score[layer, head, batch_index, pos])

tensor(1.3946, device='cuda:0')


In [75]:
create_vocab_df(cache["z", layer][0, -1, head, :] @ W_OU[layer, head] / cache["scale"][0, -1, 0])

Unnamed: 0,token,logit
35973,��,3.587261
225,�,3.325662
220,�,3.287472
19885,·Ferm,3.276619
19675,��,3.184900
...,...,...
42322,·elephants,-2.480189
35171,·convict,-2.589158
38581,·Zimbabwe,-2.637004
32290,·hunter,-2.750641


# Scratch

In [25]:
(~induction_mask.flatten()).float().mean()

tensor(0.8219, device='cuda:0')

In [46]:
ratio_dla_flat = mover_dla_flat/head_dla_flat