In [1]:
import os
import sys
import pickle
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer

sys.path.append(os.path.dirname(os.path.abspath('.')))
import configs
from datasets import TruthfulQADataset, DishonestQADataset, AmongUsDataset, RolePlayingDataset, RepEngDataset
from probes import LinearProbe

from configs import config_phi4

In [2]:
# Name of the dataset class to train the probe on; must be a key of DATASET_CLASSES.
dataset_name = 'RepEngDataset'
config = config_phi4

# No model/tokenizer is supplied and everything runs on CPU.
# NOTE(review): presumably this works because the activations are read from
# cached chunks on disk rather than recomputed -- confirm in the dataset class.
model, tokenizer, device = None, None, 'cpu'

# Explicit name -> class registry. This replaces eval(dataset_name), which would
# execute arbitrary code if the string were ever anything but a class name.
DATASET_CLASSES = {
    'TruthfulQADataset': TruthfulQADataset,
    'DishonestQADataset': DishonestQADataset,
    'AmongUsDataset': AmongUsDataset,
    'RolePlayingDataset': RolePlayingDataset,
    'RepEngDataset': RepEngDataset,
}
if dataset_name not in DATASET_CLASSES:
    raise ValueError(
        f"Unknown dataset_name {dataset_name!r}; expected one of {sorted(DATASET_CLASSES)}"
    )

# test_split=0.2 is forwarded to the dataset constructor.
# NOTE(review): presumably the fraction of samples held out for testing -- confirm.
dataset = DATASET_CLASSES[dataset_name](
    config, model=model, tokenizer=tokenizer, device=device, test_split=0.2
)

In [3]:
# Populate the dataset without forcing a redo. In the recorded run this printed
# "Loading existing chunk from ./data/RepEng_phi4_acts/chunk_0.pkl", i.e. an
# existing chunk on disk was loaded.
# NOTE(review): presumably force_redo=True would regenerate the chunks -- confirm.
dataset.populate_dataset(force_redo=False)

Loading existing chunk from ./data/RepEng_phi4_acts/chunk_0.pkl


In [4]:
# Training loader over the dataset's train split: batches of 32 samples,
# taking the activations of the last `num_tokens` tokens.
train_loader = dataset.get_train(
    batch_size=32,
    num_tokens=10,
)

In [5]:
# Linear probe whose input dimension is the dataset's reported activation size,
# placed on the same device chosen earlier in the notebook.
probe = LinearProbe(
    input_dim=dataset.activation_size,
    device=device,
)

# Report the amount of training data before fitting.
print(f'Training probe on {len(train_loader)} batches and {len(train_loader.dataset)} samples.')

# Fit for 10 epochs; the recorded output shows a per-epoch train loss/accuracy
# line followed by a final train accuracy.
probe.fit(
    train_loader,
    epochs=10,
)

Training probe on 153 batches and 4890 samples.
Epoch 1: Train Loss = 0.0823, Train Acc = 0.9683
Epoch 2: Train Loss = 0.0165, Train Acc = 0.9961
Epoch 3: Train Loss = 0.0135, Train Acc = 0.9967
Epoch 4: Train Loss = 0.0116, Train Acc = 0.9967
Epoch 5: Train Loss = 0.0073, Train Acc = 0.9967
Epoch 6: Train Loss = 0.0074, Train Acc = 0.9973
Epoch 7: Train Loss = 0.0078, Train Acc = 0.9980
Epoch 8: Train Loss = 0.0084, Train Acc = 0.9971
Epoch 9: Train Loss = 0.0123, Train Acc = 0.9975
Epoch 10: Train Loss = 0.0053, Train Acc = 0.9984
Final Train Acc: 0.9984


In [6]:
# Persist the trained probe. The file name encodes the dataset class name and
# the model's short name taken from the config.
checkpoint_path = f'checkpoints/{dataset_name}_probe_{config["short_name"]}.pkl'

# Create the target directory if needed; otherwise open(..., 'wb') raises
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# The whole probe object is pickled.
# NOTE(review): loading it later requires the same LinearProbe class to be
# importable, and pickle files should only be loaded from trusted sources.
with open(checkpoint_path, 'wb') as f:
    pickle.dump(probe, f)

# Report success only after the file has been closed (and therefore flushed).
print(f"Probe saved to {checkpoint_path}")

Probe saved to checkpoints/RepEngDataset_probe_phi4.pkl
