In [None]:
!git clone https://github.com/nikov7/llm-instruction-generalization.git

In [None]:
import pandas as pd
import json
import os
import numpy as np
import nltk.data
import sys

import torch
import torch as t
import pickle
from sklearn.metrics import roc_auc_score

import nltk
nltk.download('punkt_tab')

## Helper functions

In [6]:
def readjsonl(datapath):
    res = []
    with open(datapath, "r", encoding="utf-8") as f:
        for line in f.readlines():
            res.append(json.loads(line))
    return res

# // Get all detailed instructions
def get_inst_list(task_path_ifeval):
    ifeval_eval_df = pd.DataFrame(readjsonl(task_path_ifeval))
    instruction_id_list = ifeval_eval_df['instruction_id_list']
    inst_list=[]
    for i in instruction_id_list:
        for j in i:
            if j not in inst_list:
                inst_list.append(j)
    return inst_list

# // Get all high level instructions
def get_high_inst_list(task_path_ifeval):
    ifeval_eval_df = pd.DataFrame(readjsonl(task_path_ifeval))
    instruction_id_list = ifeval_eval_df['instruction_id_list']
    inst_list=[]
    for i in instruction_id_list:
        for j in i:
            j = j.split(':')[0]
            if j not in inst_list:
                inst_list.append(j)
    return inst_list

# // Get all task type
def get_task_list(task_path_ifeval):
    ifeval_eval_df = pd.DataFrame(readjsonl(task_path_ifeval))
    prompt_df = ifeval_eval_df['prompt']
    task_list=[]
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    for prompt in prompt_df:
        task = tokenizer.tokenize(prompt)[0]
        if task not in task_list:
            task_list.append(task)
    return task_list

# Linear Probes

In [7]:
class DataModuleActIfevalSimple:
    def __init__(self,
                 ifeval_data_path,
                 ifeval_eval_path,
                 inst_list,
                 task_list,
                 layer=13,
                 target_token='last',
                 center=True,
                 scale=False,
                 ):
        self.layer=layer

        # // Load data
        self.ifeval_data = self.load_response_df(ifeval_data_path)
        ifeval_eval_df = self.load_response_df(os.path.join(ifeval_eval_path, 'eval_results_loose.jsonl'))


        # // Select index by inst
        inst_ind = []
        for i in range(len(ifeval_eval_df)):
            if ifeval_eval_df.iloc[i]['instruction_id_list'][0] in inst_list:
                inst_ind.append(i)

        # // Select index by task
        task_ind = []
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        for i in range(len(ifeval_eval_df)):
            prompt = ifeval_eval_df.iloc[i]['prompt']
            task = tokenizer.tokenize(prompt)[0]
            if task in task_list:
                task_ind.append(i)

        # // Select index intersection
        select_ind = list(set(inst_ind) & set(task_ind))

        # // Load acts and labels
        self.labels = torch.tensor(ifeval_eval_df['follow_all_instructions'])[select_ind]
        self.labels = self.labels.float()
        self.acts = self.collect_acts(ifeval_eval_path, layer=self.layer, target_token=target_token, device='cuda', center=center, scale=scale, index_list=select_ind)
        self.acts = self.acts.float()
        self.data={}
        self.data = self.acts, self.labels
        print('Saved layers: ', self.saved_layers)

    def load_response_df(self, task_path, type='loose'):
        response_df = pd.DataFrame(self.readjsonl(task_path))
        return response_df

    def readjsonl(self, datapath):
        res = []
        with open(datapath, "r", encoding="utf-8") as f:
            for line in f.readlines():
                res.append(json.loads(line))
        return res

    def load_pickle(self, filename: str):
        with open(filename, "rb") as f:
            return pickle.load(f)

    def collect_acts(self, task_path, layer=13, target_token='last', device='cuda', center=True, scale=False, index_list=None):
        """
        Collects activations from a dataset of statements, returns as a tensor of shape [n_activations, activation_dimension].
        First token: [1, len_input, hidden_emb]
        Last token: [1, 1, hidden_emb]
        """
        act_path = os.path.join(task_path, "activations")
        _num_act = len(os.listdir(act_path))
        acts = []
        print('num_act: ', _num_act)
        for _idx in range(_num_act):
            if index_list is not None and _idx in index_list:
                act_file_name = os.path.join(act_path, f"sample_{_idx}.pkl")
                act = self.load_pickle(act_file_name)
                self.saved_layers = act[f'output_token_{target_token}'].keys()
                act = act[f'output_token_{target_token}'][f'layer_{layer}']
                act = act[:,-1] # <-- last of the first token, no problem for last token --> [1, hidden_emb]
                acts.append(act)
        acts = torch.cat(acts, dim=0).to(device)
        if center:
            acts = acts - torch.mean(acts, dim=0)
        if scale:
            acts = acts / torch.std(acts, dim=0)
        return acts

In [8]:

class LRProbe(t.nn.Module):
    def __init__(self, d_in, binary_threshold=0.5, **kwargs):
        super().__init__()
        self.net = t.nn.Sequential(
            t.nn.Linear(d_in, 1, bias=False),
            t.nn.Sigmoid()
        )
        self.binary_threshold = binary_threshold

    def forward(self, x, iid=None):
        return self.net(x).squeeze(-1)

    def pred(self, x, iid=None, binary_threshold=None):
        binary_threshold = binary_threshold if binary_threshold is not None else self.binary_threshold
        return (self(x)>binary_threshold).float()

    def probability(self, x, iid=None):
        return self(x)

    def from_data(acts, labels, lr=0.001, weight_decay=0.1, epochs=1000, device='cpu', class_weight_one=None, **kwargs):
        acts, labels = acts.to(device), labels.to(device)
        probe = LRProbe(acts.shape[-1]).to(device)

        opt = t.optim.AdamW(probe.parameters(), lr=lr, weight_decay=weight_decay)
        for _ in range(epochs):
            opt.zero_grad()
            if class_weight_one is not None:
                class_weight = torch.ones_like(labels)
                class_weight[labels>0] = class_weight_one
                loss = t.nn.BCELoss(weight=class_weight)(probe(acts), labels)
            else:
                loss = t.nn.BCELoss()(probe(acts), labels)
            loss.backward()
            opt.step()

        return probe

    @property
    def direction(self):
        return self.net[0].weight.data[0]

# 1. Task generalization

In [9]:
def evaluate_task_generalization(ifeval_data_path, task_path_ifeval):
    roc_list=[]
    m_roc_list=[]
    seed_list = np.random.randint(0, 10000, 5)
    for seed in seed_list:
        print(seed)
    
        # // Select train and test task
        task_list = np.array(get_task_list(ifeval_data_path))
        torch.manual_seed(seed)
        split=0.8
        train_ind_list = torch.randperm(len(task_list)) < int(split * len(task_list))
        test_ind_list = ~train_ind_list
        train_task_list = task_list[train_ind_list]
        test_task_list = task_list[test_ind_list]
    
        # // Use all instructions
        inst_list = get_inst_list(ifeval_data_path)
    
        # // Get train data
        train_dm = DataModuleActIfevalSimple(ifeval_data_path, task_path_ifeval, inst_list, train_task_list, layer=LAYER, target_token=TOKEN, center=True, scale=True)
        test_dm = DataModuleActIfevalSimple(ifeval_data_path, task_path_ifeval, inst_list, test_task_list, layer=LAYER, target_token=TOKEN, center=True, scale=True)
        train_acts, train_labels = train_dm.data
        test_acts, test_labels = test_dm.data
    
        # // Scale and Center
        all_acts = torch.cat((train_acts, test_acts))
        print(all_acts.shape)
        train_acts = train_acts - torch.mean(train_acts, dim=0)
        train_acts = train_acts / torch.std(train_acts, dim=0)
        test_acts = test_acts - torch.mean(train_acts, dim=0)
        test_acts = test_acts / torch.std(train_acts, dim=0)
    
        # // Stat of test
        succ = (test_labels==1).sum()
        fail = (test_labels==0).sum()
        print('succ: ', succ)
        print('fail: ', fail)
    
    
        # // Train probe
        max_roc=0
        probe = LRProbe.from_data(train_acts, train_labels, device='cuda', epochs=1000, binary_threshold=0.5)
    
        # // Test
        test_prob = probe.probability(test_acts).detach().cpu()
        auroc = roc_auc_score(test_labels, test_prob)
        roc_list.append(auroc)
    
        print(LRProbe, ': ', auroc)
        print()

In [10]:
LAYER=14
MODEL='TinyLlama-1.1B-Chat-v1.0'
TOKEN='first'

### 1.1 ifeval_simple (v1, original)

In [11]:
DATA_PATH = 'data/ifeval_simple_v1.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v1"
evaluate_task_generalization(DATA_PATH, TASK_PATH)

3167
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
succ:  tensor(33)
fail:  tensor(72)
<class '__main__.LRProbe'> :  0.6593013468013468

3857
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
succ:  tensor(31)
fail:  tensor(74)
<class '__main__.LRProbe'> :  0.6556233653007847

2416
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
succ:  tensor(38)
fail:  tensor(67)
<class '__main__.LRProbe'> :  0.6441476826394344

7642
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
succ:  ten

### 1.2 ifeval_simple_v2 (3 new tasks)

In [12]:
DATA_PATH = 'data/ifeval_simple_v2.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v2"
evaluate_task_generalization(DATA_PATH, TASK_PATH)

3729
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([816, 2048])
succ:  tensor(44)
fail:  tensor(124)
<class '__main__.LRProbe'> :  0.5706561583577713

6878
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([816, 2048])
succ:  tensor(57)
fail:  tensor(111)
<class '__main__.LRProbe'> :  0.5730994152046784

7122
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([816, 2048])
succ:  tensor(48)
fail:  tensor(120)
<class '__main__.LRProbe'> :  0.5078125

2143
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([816, 2048])
succ:  tensor(47

### 1.3 ifeval_simple_v3 (3 new tasks+combination)

In [36]:
DATA_PATH = 'data/ifeval_simple_v3.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v3"
evaluate_task_generalization(DATA_PATH, TASK_PATH)

4806
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([918, 2048])
succ:  tensor(46)
fail:  tensor(143)
<class '__main__.LRProbe'> :  0.5799635147461235

5161
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([918, 2048])
succ:  tensor(57)
fail:  tensor(132)
<class '__main__.LRProbe'> :  0.583599149388623

2413
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([918, 2048])
succ:  tensor(52)
fail:  tensor(137)
<class '__main__.LRProbe'> :  0.592363840539023

1951
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([918, 2048])
succ:  te

# 2. Intruction generalization

In [14]:
def evaluate_inst_generalization(ifeval_data_path, task_path_ifeval, keyword_list):
    # // Make a dict for result
    inst_list = np.array(get_inst_list(ifeval_data_path))
    re={}
    all_label={}
    all_pred={}
    for i in inst_list:
        if i not in re.keys():
            re[i]=[]
            all_label[i]=[]
            all_pred[i]=[]
    roc_list=[]
    total_pred=[]
    total_label=[]
    
    # // Use all task
    task_list = get_task_list(ifeval_data_path)
    
    # // Select train and test inst
    inst_list = np.array(get_inst_list(ifeval_data_path))
    
    final={}
    for inst in inst_list:
        final[inst]=[]
    
    for inst in inst_list:
    
        # // Leave one out
        train_inst_list = [i for i in keyword_list if i != inst]
        test_inst_list = [inst]
        print(train_inst_list)
        print(test_inst_list)
    
        # // Get train data
        train_dm = DataModuleActIfevalSimple(ifeval_data_path, task_path_ifeval, train_inst_list, task_list, layer=LAYER, target_token=TOKEN, center=True, scale=True)
        test_dm = DataModuleActIfevalSimple(ifeval_data_path, task_path_ifeval, test_inst_list, task_list, layer=LAYER, target_token=TOKEN, center=True, scale=True)
        train_acts, train_labels = train_dm.data
        test_acts, test_labels = test_dm.data
    
        # // Scale and Center
        all_acts = torch.cat((train_acts, test_acts))
        print(all_acts.shape)
        train_acts = train_acts - torch.mean(train_acts, dim=0)
        train_acts = train_acts / torch.std(train_acts, dim=0)
        test_acts = test_acts - torch.mean(train_acts, dim=0)
        test_acts = test_acts / torch.std(train_acts, dim=0)
    
        # // Stat of test
        succ = (test_labels==1).sum()
        fail = (test_labels==0).sum()
        print('te_succ: ', succ)
        print('te_fail: ', fail)
    
        # // Stat of train
        tr_succ = (train_labels==1).sum()
        tr_fail = (train_labels==0).sum()
        print('tr_succ: ', tr_succ)
        print('tr_fail: ', tr_fail)
        tr_class_weight = tr_succ/tr_fail
    
        # // exception
        if succ<1 or fail<1:
            continue
    
    
        # // Train probe
        probe = LRProbe.from_data(train_acts, train_labels, device='cuda', epochs=1000, binary_threshold=0.5, class_weight_one=None)
    
        # // Test
        test_prob = probe.probability(test_acts).detach().cpu()
        auroc = roc_auc_score(test_labels, test_prob)
    
        print(LRProbe, ': ', auroc)
        print()
    
        # // save
        roc_list.append(auroc)
        re[inst].append(auroc)
        all_label[inst].append(test_labels)
        all_pred[inst].append(test_prob)
        total_label.append(test_labels)
        total_pred.append(test_prob)
    
    for key in all_pred.keys():
        if len(all_pred[key])>0:
            print(key)
            label = np.concatenate(all_label[key])
            pred = np.concatenate(all_pred[key])
            final[key].append(roc_auc_score(label, pred ))
    
    # // Compute all auc total
    label = np.concatenate(total_label)
    pred = np.concatenate(total_pred)
    total_auroc = roc_auc_score(label, pred)
    print(f"Total AUROC: {total_auroc}")

### 2.1 ifeval_simple (v1, original)

In [15]:
KEYWORDS = [\
    'keywords:frequency',
    'keywords:forbidden_words',
    'keywords:existence',
    'detectable_content:number_placeholders',
    "startend:end_checker"
    ]

DATA_PATH = 'data/ifeval_simple_v1.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v1"
evaluate_inst_generalization(DATA_PATH, TASK_PATH, KEYWORDS)

['keywords:frequency', 'keywords:existence', 'detectable_content:number_placeholders', 'startend:end_checker']
['keywords:forbidden_words']
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
te_succ:  tensor(40)
te_fail:  tensor(62)
tr_succ:  tensor(122)
tr_fail:  tensor(286)
<class '__main__.LRProbe'> :  0.4959677419354839

['keywords:frequency', 'keywords:forbidden_words', 'detectable_content:number_placeholders', 'startend:end_checker']
['keywords:existence']
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
te_succ:  tensor(27)
te_fail:  tensor(75)
tr_succ:  tensor(135)
tr_fail:  tensor(273)
<class '__main__.LRProbe'> :  0.5520987654320988

['keywords:forbidden_words', 'keywords:existence', 'detectable_content:number_placeholders',

### 2.2 ifeval_simple_v2 (3 new tasks)

In [16]:
KEYWORDS = [\
    'keywords:frequency',
    'keywords:forbidden_words',
    'keywords:existence',
    'detectable_content:number_placeholders',
    "startend:end_checker",
    "detectable_format:number_bullet_lists",
    "length_constraints:number_words",
    "change_case:english_lowercase"
    ]

DATA_PATH = 'data/ifeval_simple_v2.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v2"
evaluate_inst_generalization(DATA_PATH, TASK_PATH, KEYWORDS)

['keywords:frequency', 'keywords:existence', 'detectable_content:number_placeholders', 'startend:end_checker', 'detectable_format:number_bullet_lists', 'length_constraints:number_words', 'change_case:english_lowercase']
['keywords:forbidden_words']
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([816, 2048])
te_succ:  tensor(40)
te_fail:  tensor(62)
tr_succ:  tensor(183)
tr_fail:  tensor(531)
<class '__main__.LRProbe'> :  0.47298387096774186

['keywords:frequency', 'keywords:forbidden_words', 'detectable_content:number_placeholders', 'startend:end_checker', 'detectable_format:number_bullet_lists', 'length_constraints:number_words', 'change_case:english_lowercase']
['keywords:existence']
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([816, 2048])
te_succ: 

### 2.3 ifeval_simple_v3 (3 new tasks+combination)

In [37]:
KEYWORDS = [\
    'keywords:frequency',
    'keywords:forbidden_words',
    'keywords:existence',
    'detectable_content:number_placeholders',
    "startend:end_checker",
    "detectable_format:number_bullet_lists",
    "length_constraints:number_words",
    "change_case:english_lowercase",
    ]

DATA_PATH = 'data/ifeval_simple_v3.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v3"
evaluate_inst_generalization(DATA_PATH, TASK_PATH, KEYWORDS)

['keywords:frequency', 'keywords:existence', 'detectable_content:number_placeholders', 'startend:end_checker', 'detectable_format:number_bullet_lists', 'length_constraints:number_words', 'change_case:english_lowercase']
['keywords:forbidden_words']
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([918, 2048])
te_succ:  tensor(40)
te_fail:  tensor(62)
tr_succ:  tensor(191)
tr_fail:  tensor(625)
<class '__main__.LRProbe'> :  0.5044354838709677

['keywords:frequency', 'keywords:forbidden_words', 'detectable_content:number_placeholders', 'startend:end_checker', 'detectable_format:number_bullet_lists', 'length_constraints:number_words', 'change_case:english_lowercase']
['keywords:existence']
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([918, 2048])
te_succ:  

# 3. MLP Implementation

In [17]:
class MLPProbe(t.nn.Module):
    """
    A small multi‑layer perceptron probe:
      - input dim → hidden_dim → hidden_dim → 1 → sigmoid
    """
    def __init__(self, d_in, hidden_dim=512, n_hidden=2, binary_threshold=0.5, dropout=0.1):
        super().__init__()
        layers = []
        in_dim = d_in
        for _ in range(n_hidden):
            layers += [
                t.nn.Linear(in_dim, hidden_dim),
                t.nn.ReLU(),
                t.nn.Dropout(dropout),
            ]
            in_dim = hidden_dim
        layers += [t.nn.Linear(in_dim, 1, bias=False), t.nn.Sigmoid()]
        self.net = t.nn.Sequential(*layers)
        self.binary_threshold = binary_threshold

    def forward(self, x, iid=None):
        return self.net(x).squeeze(-1)

    def pred(self, x, iid=None, binary_threshold=None):
        thresh = binary_threshold if binary_threshold is not None else self.binary_threshold
        return (self(x) > thresh).float()

    def probability(self, x, iid=None):
        return self(x)

    @staticmethod
    def from_data(acts, labels,
                  lr=1e-3,
                  weight_decay=1e-2,
                  epochs=500,
                  device='cpu',
                  hidden_dim=512,
                  n_hidden=2,
                  dropout=0.1):
        """
        Train an MLPProbe on (acts, labels) and return the fitted probe.
        """
        acts, labels = acts.to(device), labels.to(device)
        probe = MLPProbe(acts.shape[-1], hidden_dim, n_hidden, dropout=dropout).to(device)
        optimizer = t.optim.AdamW(probe.parameters(), lr=lr, weight_decay=weight_decay)
        loss_fn = t.nn.BCELoss()

        for epoch in range(epochs):
            optimizer.zero_grad()
            preds = probe(acts)
            loss = loss_fn(preds, labels)
            loss.backward()
            optimizer.step()
            # optional: print every 100 iters
            if (epoch + 1) % 100 == 0:
                with torch.no_grad():
                    auc = roc_auc_score(labels.cpu().numpy(), preds.detach().cpu().numpy())
                print(f"epoch {epoch+1}/{epochs}, loss {loss.item():.4f}, auroc {auc:.3f}")
        return probe

In [18]:
def evaluate_task_generalization_mlp(ifeval_data_path, task_path_ifeval):
    roc_list=[]
    m_roc_list=[]
    seed_list = np.random.randint(0, 10000, 5)
    for seed in seed_list:
        print(seed)
    
        # // Select train and test task
        task_list = np.array(get_task_list(ifeval_data_path))
        torch.manual_seed(seed)
        split=0.8
        train_ind_list = torch.randperm(len(task_list)) < int(split * len(task_list))
        test_ind_list = ~train_ind_list
        train_task_list = task_list[train_ind_list]
        test_task_list = task_list[test_ind_list]
    
        # // Use all instructions
        inst_list = get_inst_list(ifeval_data_path)
    
        # // Get train data
        train_dm = DataModuleActIfevalSimple(ifeval_data_path, task_path_ifeval, inst_list, train_task_list, layer=LAYER, target_token=TOKEN, center=True, scale=True)
        test_dm = DataModuleActIfevalSimple(ifeval_data_path, task_path_ifeval, inst_list, test_task_list, layer=LAYER, target_token=TOKEN, center=True, scale=True)
        train_acts, train_labels = train_dm.data
        test_acts, test_labels = test_dm.data
    
        # // Scale and Center
        all_acts = torch.cat((train_acts, test_acts))
        print(all_acts.shape)
        train_acts = train_acts - torch.mean(train_acts, dim=0)
        train_acts = train_acts / torch.std(train_acts, dim=0)
        test_acts = test_acts - torch.mean(train_acts, dim=0)
        test_acts = test_acts / torch.std(train_acts, dim=0)
    
        # // Stat of test
        succ = (test_labels==1).sum()
        fail = (test_labels==0).sum()
        print('succ: ', succ)
        print('fail: ', fail)
    
    
        # // Train probe
        max_roc=0

        probe = MLPProbe.from_data(
            train_acts, train_labels,
            device='cuda',
            lr=0.001,
            weight_decay=0.01,
            epochs=500,
            hidden_dim=512,
            n_hidden=2,
            dropout=0.1
        )
    
        # // Test
        test_prob = probe.probability(test_acts).detach().cpu()
        auroc = roc_auc_score(test_labels, test_prob)
        roc_list.append(auroc)
    
        print(LRProbe, ': ', auroc)
        print()

In [19]:
def evaluate_inst_generalization_mlp(ifeval_data_path, task_path_ifeval, keyword_list):
    # // Make a dict for result
    inst_list = np.array(get_inst_list(ifeval_data_path))
    re={}
    all_label={}
    all_pred={}
    for i in inst_list:
        if i not in re.keys():
            re[i]=[]
            all_label[i]=[]
            all_pred[i]=[]
    roc_list=[]
    total_pred=[]
    total_label=[]
    
    # // Use all task
    task_list = get_task_list(ifeval_data_path)
    
    # // Select train and test inst
    inst_list = np.array(get_inst_list(ifeval_data_path))
    
    final={}
    for inst in inst_list:
        final[inst]=[]
    
    for inst in inst_list:
    
        # // Leave one out
        train_inst_list = [i for i in keyword_list if i != inst]
        test_inst_list = [inst]
        print(train_inst_list)
        print(test_inst_list)
    
        # // Get train data
        train_dm = DataModuleActIfevalSimple(ifeval_data_path, task_path_ifeval, train_inst_list, task_list, layer=LAYER, target_token=TOKEN, center=True, scale=True)
        test_dm = DataModuleActIfevalSimple(ifeval_data_path, task_path_ifeval, test_inst_list, task_list, layer=LAYER, target_token=TOKEN, center=True, scale=True)
        train_acts, train_labels = train_dm.data
        test_acts, test_labels = test_dm.data
    
        # // Scale and Center
        all_acts = torch.cat((train_acts, test_acts))
        print(all_acts.shape)
        train_acts = train_acts - torch.mean(train_acts, dim=0)
        train_acts = train_acts / torch.std(train_acts, dim=0)
        test_acts = test_acts - torch.mean(train_acts, dim=0)
        test_acts = test_acts / torch.std(train_acts, dim=0)
    
        # // Stat of test
        succ = (test_labels==1).sum()
        fail = (test_labels==0).sum()
        print('te_succ: ', succ)
        print('te_fail: ', fail)
    
        # // Stat of train
        tr_succ = (train_labels==1).sum()
        tr_fail = (train_labels==0).sum()
        print('tr_succ: ', tr_succ)
        print('tr_fail: ', tr_fail)
        tr_class_weight = tr_succ/tr_fail
    
        # // exception
        if succ<1 or fail<1:
            continue
    
    
        # // Train probe
        probe = MLPProbe.from_data(
            train_acts, train_labels,
            device='cuda',
            lr=0.001,
            weight_decay=0.01,
            epochs=500,           
            hidden_dim=512,
            n_hidden=4,
            dropout=0.1
        )
        
        # // Test
        test_prob = probe.probability(test_acts).detach().cpu()
        auroc = roc_auc_score(test_labels, test_prob)
    
        print(LRProbe, ': ', auroc)
        print()
    
        # // save
        roc_list.append(auroc)
        re[inst].append(auroc)
        all_label[inst].append(test_labels)
        all_pred[inst].append(test_prob)
        total_label.append(test_labels)
        total_pred.append(test_prob)
    
    for key in all_pred.keys():
        if len(all_pred[key])>0:
            print(key)
            label = np.concatenate(all_label[key])
            pred = np.concatenate(all_pred[key])
            final[key].append(roc_auc_score(label, pred ))
    
    # // Compute all auc total
    label = np.concatenate(total_label)
    pred = np.concatenate(total_pred)
    total_auroc = roc_auc_score(label, pred)
    print(f"Total AUROC: {total_auroc}")

### 3.1 ifeval_simple (v1, original)

In [20]:
DATA_PATH = 'data/ifeval_simple_v1.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v1"
evaluate_task_generalization_mlp(DATA_PATH, TASK_PATH)

8845
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
succ:  tensor(28)
fail:  tensor(77)
epoch 100/500, loss 0.0015, auroc 1.000
epoch 200/500, loss 0.0002, auroc 1.000
epoch 300/500, loss 0.0001, auroc 1.000
epoch 400/500, loss 0.0001, auroc 1.000
epoch 500/500, loss 0.0000, auroc 1.000
<class '__main__.LRProbe'> :  0.7500000000000001

7829
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
succ:  tensor(43)
fail:  tensor(62)
epoch 100/500, loss 0.0006, auroc 1.000
epoch 200/500, loss 0.0002, auroc 1.000
epoch 300/500, loss 0.0001, auroc 1.000
epoch 400/500, loss 0.0001, auroc 1.000
epoch 500/500, loss 0.0000, auroc 1.000
<class '__main__.LRProbe'> :  0.689047261815454

6161
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer

In [22]:
KEYWORDS = [\
    'keywords:frequency',
    'keywords:forbidden_words',
    'keywords:existence',
    'detectable_content:number_placeholders',
    "startend:end_checker"
    ]

DATA_PATH = 'data/ifeval_simple_v1.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v1"
evaluate_inst_generalization_mlp(DATA_PATH, TASK_PATH, KEYWORDS)

['keywords:frequency', 'keywords:existence', 'detectable_content:number_placeholders', 'startend:end_checker']
['keywords:forbidden_words']
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
te_succ:  tensor(40)
te_fail:  tensor(62)
tr_succ:  tensor(122)
tr_fail:  tensor(286)
epoch 100/500, loss 0.0037, auroc 1.000
epoch 200/500, loss 0.0001, auroc 1.000
epoch 300/500, loss 0.2509, auroc 0.967
epoch 400/500, loss 0.0486, auroc 0.998
epoch 500/500, loss 0.2300, auroc 0.962
<class '__main__.LRProbe'> :  0.43306451612903224

['keywords:frequency', 'keywords:forbidden_words', 'detectable_content:number_placeholders', 'startend:end_checker']
['keywords:existence']
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  510
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([510, 2048])
te_succ:  tensor(27)
te_fai

### 3.2 ifeval_simple_v2 (3 new tasks)

In [24]:
DATA_PATH = 'data/ifeval_simple_v2.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v2"
evaluate_task_generalization_mlp(DATA_PATH, TASK_PATH)

7866
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([816, 2048])
succ:  tensor(51)
fail:  tensor(117)
epoch 100/500, loss 0.0013, auroc 1.000
epoch 200/500, loss 0.0913, auroc 0.992
epoch 300/500, loss 0.0013, auroc 1.000
epoch 400/500, loss 0.0003, auroc 1.000
epoch 500/500, loss 0.0001, auroc 1.000
<class '__main__.LRProbe'> :  0.5760013407072231

5527
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([816, 2048])
succ:  tensor(51)
fail:  tensor(117)
epoch 100/500, loss 0.0027, auroc 1.000
epoch 200/500, loss 0.0002, auroc 1.000
epoch 300/500, loss 0.0001, auroc 1.000
epoch 400/500, loss 0.0001, auroc 1.000
epoch 500/500, loss 0.0001, auroc 1.000
<class '__main__.LRProbe'> :  0.583375230434054

9141
num_act:  816
Saved layers:  dict_keys(['layer_22', 'lay

In [25]:
KEYWORDS = [\
    'keywords:frequency',
    'keywords:forbidden_words',
    'keywords:existence',
    'detectable_content:number_placeholders',
    "startend:end_checker",
    "detectable_format:number_bullet_lists",
    "length_constraints:number_words",
    "change_case:english_lowercase"
    ]

DATA_PATH = 'data/ifeval_simple_v2.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v2"
evaluate_inst_generalization_mlp(DATA_PATH, TASK_PATH, KEYWORDS)

['keywords:frequency', 'keywords:existence', 'detectable_content:number_placeholders', 'startend:end_checker', 'detectable_format:number_bullet_lists', 'length_constraints:number_words', 'change_case:english_lowercase']
['keywords:forbidden_words']
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  816
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([816, 2048])
te_succ:  tensor(40)
te_fail:  tensor(62)
tr_succ:  tensor(183)
tr_fail:  tensor(531)
epoch 100/500, loss 0.0885, auroc 0.997
epoch 200/500, loss 0.0007, auroc 1.000
epoch 300/500, loss 0.0033, auroc 1.000
epoch 400/500, loss 0.0002, auroc 1.000
epoch 500/500, loss 0.0000, auroc 1.000
<class '__main__.LRProbe'> :  0.4721774193548387

['keywords:frequency', 'keywords:forbidden_words', 'detectable_content:number_placeholders', 'startend:end_checker', 'detectable_format:number_bullet_lists', 'length_constraints:number_words', 'change_case:english_lowercase']
['keywords:

### 3.3 ifeval_simple_v3 (3 new tasks+combination)

In [26]:
DATA_PATH = 'data/ifeval_simple_v3.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v3"
evaluate_task_generalization_mlp(DATA_PATH, TASK_PATH)

2116
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([918, 2048])
succ:  tensor(51)
fail:  tensor(138)
epoch 100/500, loss 0.0043, auroc 1.000
epoch 200/500, loss 0.0357, auroc 1.000
epoch 300/500, loss 0.0026, auroc 1.000
epoch 400/500, loss 0.0006, auroc 1.000
epoch 500/500, loss 0.0003, auroc 1.000
<class '__main__.LRProbe'> :  0.6445012787723785

9502
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([918, 2048])
succ:  tensor(43)
fail:  tensor(146)
epoch 100/500, loss 0.0030, auroc 1.000
epoch 200/500, loss 0.0003, auroc 1.000
epoch 300/500, loss 0.1338, auroc 0.987
epoch 400/500, loss 0.0016, auroc 1.000
epoch 500/500, loss 0.0004, auroc 1.000
<class '__main__.LRProbe'> :  0.5297865562280981

3720
num_act:  918
Saved layers:  dict_keys(['layer_22', 'la

In [27]:
KEYWORDS = [\
    'keywords:frequency',
    'keywords:forbidden_words',
    'keywords:existence',
    'detectable_content:number_placeholders',
    "startend:end_checker",
    "detectable_format:number_bullet_lists",
    "length_constraints:number_words",
    "change_case:english_lowercase",
    ]

DATA_PATH = 'data/ifeval_simple_v3.jsonl'
TASK_PATH = f"data/{MODEL}/ifeval_simple_v3"
evaluate_inst_generalization_mlp(DATA_PATH, TASK_PATH, KEYWORDS)

['keywords:frequency', 'keywords:existence', 'detectable_content:number_placeholders', 'startend:end_checker', 'detectable_format:number_bullet_lists', 'length_constraints:number_words', 'change_case:english_lowercase']
['keywords:forbidden_words']
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
num_act:  918
Saved layers:  dict_keys(['layer_22', 'layer_18', 'layer_14'])
torch.Size([918, 2048])
te_succ:  tensor(40)
te_fail:  tensor(62)
tr_succ:  tensor(191)
tr_fail:  tensor(625)
epoch 100/500, loss 0.0202, auroc 1.000
epoch 200/500, loss 0.0691, auroc 0.998
epoch 300/500, loss 0.0001, auroc 1.000
epoch 400/500, loss 0.0036, auroc 1.000
epoch 500/500, loss 0.0241, auroc 1.000
<class '__main__.LRProbe'> :  0.2923387096774193

['keywords:frequency', 'keywords:forbidden_words', 'detectable_content:number_placeholders', 'startend:end_checker', 'detectable_format:number_bullet_lists', 'length_constraints:number_words', 'change_case:english_lowercase']
['keywords: