In [16]:
import os
import json
import random

In [10]:
def _load_jsonl_folder(folder):
    """Parse every ``*.jsonl`` file in ``folder``.

    Args:
        folder: Path of the directory to scan (non-recursive).

    Returns:
        A dict mapping each file's name without the ``.jsonl`` extension to the
        list of JSON values parsed from its non-blank lines, in file order.
    """
    suffix = ".jsonl"
    datasets = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # dictionary order stable across runs and machines.
    for file in sorted(os.listdir(folder)):
        if not file.endswith(suffix):
            continue
        # Strip only the extension so names containing dots keep their full
        # stem and cannot collide (e.g. "a.v1.jsonl" vs "a.v2.jsonl").
        name = file[:-len(suffix)]
        with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
            # Blank lines (e.g. an extra newline at the end of the file) are
            # skipped because json.loads rejects empty input.
            datasets[name] = [json.loads(line) for line in f if line.strip()]
    return datasets


base_dir = "./reward-hack-generalization/datasets"
# Load all jsonl files in the datasets/reward_hack and datasets/unhackable folders.
hackable_datasets = _load_jsonl_folder(os.path.join(base_dir, "reward_hack"))
unhackable_datasets = _load_jsonl_folder(os.path.join(base_dir, "unhackable"))

In [13]:
# Summary of the hackable datasets: how many there are, their names, and the
# total number of examples across all of them.
hackable_example_count = sum(map(len, hackable_datasets.values()))
print("Total hackable datasets: ", len(hackable_datasets))
print("Keys: ", hackable_datasets.keys())
print("Total hackable examples: ", hackable_example_count)

# Same summary for the unhackable datasets.
unhackable_example_count = sum(map(len, unhackable_datasets.values()))
print("Total unhackable datasets: ", len(unhackable_datasets))
print("Keys: ", unhackable_datasets.keys())
print("Total unhackable examples: ", unhackable_example_count)

Total hackable datasets:  10
Keys:  dict_keys(['sycophancy_fact', 'revealing_score', 'sycophancy_opinion_nlp', 'email_assistant', 'sycophancy_opinion_political', 'theory_of_mind_mirroring_expanded', 'world_affecting_reward', 'world_affecting_approval', 'code_selection', 'theory_of_mind_mirroring'])
Total hackable examples:  26949
Total unhackable datasets:  5
Keys:  dict_keys(['code_selection_unhackable', 'revealing_score_unhackable', 'theory_of_mind_mirroring_unhackable', 'sycophancy_fact_unhackable', 'email_assistant_unhackable'])
Total unhackable examples:  15378


In [24]:
class Dataset:
    """Load JSONL datasets from a folder and split each one into train/test lists.

    Attributes:
        folder_path: Directory that was scanned for ``*.jsonl`` files.
        datasets: Maps dataset name (file name without ``.jsonl``) to the list
            of parsed records, in file order.
        train_split: Fraction of each dataset assigned to the train split.
        seed: Seed used for shuffling, or None to use the global ``random`` state.
        train: Maps dataset name to its shuffled train records.
        test: Maps dataset name to its shuffled test records.
    """

    def __init__(self, folder_path = "./reward-hack-generalization/datasets/reward_hack", keys = None, train_split = 0.8, seed = None):
        """
        Initialize the dataset
        Args:
            folder_path: Directory containing one ``<name>.jsonl`` file per dataset
            keys: A list of keys to load from the datasets; use None to load all datasets
            train_split: Train split fraction (expected to lie in [0, 1])
            seed: Optional seed making the shuffle reproducible; None (default)
                shuffles with the global ``random`` module state
        Raises:
            KeyError: If a requested key has no matching ``.jsonl`` file
        """
        self.folder_path = folder_path
        # Materialize once so a one-shot iterable can be checked and re-used.
        wanted = None if keys is None else list(keys)

        suffix = ".jsonl"
        available = []
        loaded = {}
        # Sorted so dataset order (and therefore RNG consumption) is stable.
        for file in sorted(os.listdir(self.folder_path)):
            if not file.endswith(suffix):
                continue
            # Strip only the extension; splitting on the first "." would
            # truncate dotted names and let different files collide.
            name = file[:-len(suffix)]
            available.append(name)
            if wanted is not None and name not in wanted:
                continue  # avoid parsing files the caller did not ask for
            with open(os.path.join(self.folder_path, file), "r", encoding="utf-8") as f:
                # Skip blank lines, which json.loads would reject.
                loaded[name] = [json.loads(line) for line in f if line.strip()]

        if wanted is not None:
            missing = [key for key in wanted if key not in loaded]
            if missing:
                raise KeyError(
                    f"Datasets {missing} not found in {self.folder_path!r}; "
                    f"available: {available}"
                )
            # Preserve the order in which the caller listed the keys.
            loaded = {key: loaded[key] for key in wanted}
        self.datasets = loaded

        self.train_split = train_split
        self.seed = seed
        self.train = {}
        self.test = {}
        self.split_data()

    def split_data(self):
        """
        Shuffle and split the data into train and test sets

        Each dataset is shuffled as a copy, so ``self.datasets`` keeps its file
        order and, when ``self.seed`` is set, calling this method again
        reproduces the same split.
        """
        # A private generator keeps a seeded split independent of (and from
        # disturbing) the global random state; without a seed, fall back to
        # the module-level generator as before.
        rng = random if self.seed is None else random.Random(self.seed)
        train, test = {}, {}
        for key, examples in self.datasets.items():
            shuffled = list(examples)
            rng.shuffle(shuffled)
            cut = int(len(shuffled) * self.train_split)
            train[key] = shuffled[:cut]
            test[key] = shuffled[cut:]
        self.train = train
        self.test = test

In [25]:
# Load only the sycophancy_fact dataset from the reward_hack folder; the
# train/test fraction is left at the constructor's default.
data = Dataset(
    folder_path="./reward-hack-generalization/datasets/reward_hack",
    keys=["sycophancy_fact"],
)

In [29]:
# Report how many sycophancy_fact examples landed in each split (train, then test).
train_examples = data.train["sycophancy_fact"]
print(len(train_examples))
test_examples = data.test["sycophancy_fact"]
print(len(test_examples))

320
81
