In [1]:
# Detect whether the notebook is running on Google Colab by probing for the
# Colab-only package. Only a failed import means "not on Colab"; any other
# exception (e.g. KeyboardInterrupt) should propagate instead of being hidden.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

import os, sys, gc

# Colab-only environment setup; skipped entirely when IN_COLAB is False.
if IN_COLAB:
    # %pip (rather than !pip) targets the running kernel's environment.
    # NOTE(review): the package version is not pinned — consider pinning for reproducibility.
    %pip install transformer_lens

    from google.colab import drive
    # Mount Google Drive under /content/gdrive (see the "Mounted at" output of this cell).
    drive.mount("/content/gdrive", force_remount=True)
    # Work from the project folder on Drive so the relative paths used in later
    # cells (e.g. "prompts_short/...", "<model_name>/activations_short/...") resolve there.
    %cd /content/gdrive/MyDrive/mres/model_harvesting

Mounted at /content/gdrive
/content/gdrive/MyDrive/mres/model_harvesting


In [2]:
import torch as t
from torch import nn, Tensor
import torch.nn.functional as F

import numpy as np
import pandas as pd

from transformer_lens import HookedTransformer

from tqdm.notebook import tqdm, trange
from IPython.display import clear_output

In [3]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if t.cuda.is_available():
    device = t.device("cuda")
else:
    device = t.device("cpu")

def free_mem(vars=None):
    """Run a garbage-collection pass and release PyTorch's cached CUDA memory.

    Args:
        vars: Optional; accepted only for backward compatibility with existing
            ``free_mem([a, b, c])`` call sites. It is not inspected or modified.

    Note:
        A function cannot remove the *caller's* references to an object, so the
        previous ``for v in vars: del v`` loop only unbound its own loop variable
        and freed nothing. To actually release tensors, the caller must drop its
        own names first (``del a, b, c``) and then call ``free_mem()``.
    """
    # Collect unreachable objects (including reference cycles) so that any
    # tensors they own are destroyed before the CUDA cache is flushed.
    gc.collect()
    # Hand cached-but-unused blocks of the CUDA caching allocator back to the driver.
    t.cuda.empty_cache()

In [4]:
# Short keys mapped to the model identifiers passed to HookedTransformer.from_pretrained.
models = dict(
    llama2="Llama-2-7b-chat",
    llama3="meta-llama/Meta-Llama-3-8B-Instruct",
    mistral="mistral-7b-instruct",
)

# Key of the model harvested in this run; also used as the output folder name below.
model_name = "mistral"

# Load the pretrained weights, move them to the selected device, and switch to eval mode.
model = HookedTransformer.from_pretrained(models[model_name])
model = model.to(device)
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loaded pretrained model mistral-7b-instruct into HookedTransformer
Moving model to device:  cuda


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (blocks): ModuleList(
    (0-31): 32 x TransformerBlock(
      (ln1): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): GroupedQueryAttention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
        (hook_rot_k): HookPoint()
        (hook_rot_q): HookPoint()
      )
      (mlp): GatedMLP(
        (hook_pre): HookPoint()
        (hook_pre_linear): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_att

In [5]:
# Read the architecture sizes from the model config. `model.W_in` is a property
# that stacks every layer's MLP input weights into one new tensor, so using it
# only for `.shape` allocates a very large temporary on a 7B-parameter model.
# This also avoids assigning to `_`, which IPython uses for the last cell output.
n_layer, d_model = model.cfg.n_layers, model.cfg.d_model

# Residual stream after the last transformer block (blocks are 0-indexed).
hook_pt = f"blocks.{n_layer - 1}.hook_resid_post"

# Evaluation aspects; each one selects a "prompt-<aspect>" column in the prompt files.
aspects = ["coherence", "consistency", "fluency", "relevance"]

### newsroom

In [6]:
# For every Newsroom comparison prompt, record the activation at `hook_pt` for the
# final position of the prompt and save one tensor per (aspect, choice) pair.

# Create the output folder up front so a missing directory cannot make t.save
# fail only after a full pass over the prompts has already been computed.
os.makedirs(f"{model_name}/activations_short", exist_ok=True)

for choice in [1, 2]:
    path = f"prompts_short/newsroom-mine-compare-{choice}.jsonl"
    prompts = pd.read_json(path, orient="records", lines=True)
    for aspect in aspects:
        # One row per prompt, accumulated on the CPU.
        activations = t.zeros(len(prompts), d_model)
        for i in trange(len(prompts), desc=f"{aspect}-{choice}"):
            prompt = prompts.at[i, f"prompt-{aspect}"]
            tks = model.to_tokens(prompt)
            with t.no_grad():
                logits, cache = model.run_with_cache(
                    tks, names_filter=[hook_pt], remove_batch_dim=True
                )
            # With the batch axis removed, [-1] indexes the leading axis —
            # presumably the token position (TODO confirm) — i.e. the last token.
            activations[i] = cache[hook_pt][-1].cpu()
            # Passing these objects to free_mem cannot release them while the
            # names below still reference them, so drop the names first.
            del tks, logits, cache
            free_mem([])
        t.save(activations, f"{model_name}/activations_short/newsroom_{aspect}_{choice}.pt")
        del activations
        free_mem([])

coherence-1:   0%|          | 0/1260 [00:00<?, ?it/s]

consistency-1:   0%|          | 0/1260 [00:00<?, ?it/s]

fluency-1:   0%|          | 0/1260 [00:00<?, ?it/s]

relevance-1:   0%|          | 0/1260 [00:00<?, ?it/s]

coherence-2:   0%|          | 0/1260 [00:00<?, ?it/s]

consistency-2:   0%|          | 0/1260 [00:00<?, ?it/s]

fluency-2:   0%|          | 0/1260 [00:00<?, ?it/s]

relevance-2:   0%|          | 0/1260 [00:00<?, ?it/s]

### summeval

In [None]:
# For every SummEval comparison prompt, record the activation at `hook_pt` for the
# final position of the prompt and save one tensor per (aspect, choice) pair.

# Create the output folder up front so a missing directory cannot make t.save
# fail only after a full pass over the prompts has already been computed.
os.makedirs(f"{model_name}/activations_short", exist_ok=True)

for choice in [1, 2]:
    path = f"prompts_short/summeval-mine-compare-{choice}.jsonl"
    prompts = pd.read_json(path, orient="records", lines=True)
    for aspect in aspects:
        # One row per prompt, accumulated on the CPU.
        activations = t.zeros(len(prompts), d_model)
        for i in trange(len(prompts), desc=f"{aspect}-{choice}"):
            prompt = prompts.at[i, f"prompt-{aspect}"]
            tks = model.to_tokens(prompt)
            with t.no_grad():
                logits, cache = model.run_with_cache(
                    tks, names_filter=[hook_pt], remove_batch_dim=True
                )
            # With the batch axis removed, [-1] indexes the leading axis —
            # presumably the token position (TODO confirm) — i.e. the last token.
            activations[i] = cache[hook_pt][-1].cpu()
            # Passing these objects to free_mem cannot release them while the
            # names below still reference them, so drop the names first.
            del tks, logits, cache
            free_mem([])
        t.save(activations, f"{model_name}/activations_short/summeval_{aspect}_{choice}.pt")
        del activations
        free_mem([])