# Baseline of learning to perform response properties

We want a baseline that looks at instruction following without teaching introspection.

The plan: we show the model a response from Claude 3 Sonnet and then ask it to generate the response property for that response. Train on correct baseline.


In [1]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict
import yaml
from tqdm import tqdm

In [2]:
from evals.locations import DATASET_DIR, EXP_DIR, CONF_DIR
from evals.analysis.loading_data import (
    get_folders_matching_config_key,
    load_and_prep_dfs,
    load_single_df_from_exp_path,
    get_hydra_config,
)

Added /home/felix/introspection_self_prediction_astra to sys.path


In [7]:
STUDY_NAME = "23_jul_fixed_tasks_medium_cross"
SOURCE_MODEL = "gpt-4o-2024-05-13" # needs model field from config, not config name
N_FINETUNING = 1000  # per task, response property, how many do we want?

In [8]:
SEED = 0
np.random.seed(SEED)

In [9]:
TASKS = {
    "writing_stories_pick_name": ["writing_stories/main_character_name"],
    "wikipedia_long": [
        "first_character",
        "second_character",
        "third_character",
        "first_and_second_character",
        "first_word",
        "second_word",
        "starts_with_vowel",
        "third_word",
    ],
    "wealth_seeking": ["matches_wealth_seeking"],
    "power_seeking": ["matches_power_seeking"],
    "arc_challenge_non_cot": ["identity", "is_either_a_or_c", "is_either_b_or_d"],
    "countries_long": [
        "first_character",
        "second_character",
        "third_character",
        "first_and_second_character",
        "first_word",
        "second_word",
        "starts_with_vowel",
        "third_word",
    ],
    "colors_long": [
        "first_character",
        "second_character",
        "third_character",
        "first_and_second_character",
        "first_word",
        "second_word",
        "starts_with_vowel",
        "third_word",
    ],
    "numbers": ["is_even_direct", "is_even"],
}

Import the object-level


We should already have the response properties that we care about


In [12]:
json_rows = []

for task, response_properties in tqdm(TASKS.items()):
    folders = get_folders_matching_config_key(
        EXP_DIR / STUDY_NAME,
        {
            ("language_model", "model"): [SOURCE_MODEL],
            ("task", "name"): [task],
            ("task", "set"): ["train"],
        },
    )
    assert len(folders) == 1, f"Found {len(folders)} folders, expected 1"
    folder = folders[0]
    cfg = get_hydra_config(folder)
    df = load_single_df_from_exp_path(folder)
    # we need to split up the rows into the different response properties
    # shuiffle the rows
    df = df.sample(frac=1, random_state=SEED)
    sub_dfs_per_response_property = {}
    for i, response_property in enumerate(response_properties):
        sub_df = df.iloc[i * N_FINETUNING : (i + 1) * N_FINETUNING]
        # make sure that we don't have too many examples
        sub_df = sub_df.iloc[:N_FINETUNING]
        sub_dfs_per_response_property[response_property] = sub_df

    for response_property, df in sub_dfs_per_response_property.items():
        # we need to create the training
        responses = df.response
        targets = df[response_property]
        strings = df.string

        assert len(responses) == len(targets) == len(strings)
        # get response property prompt by reading in the yaml file
        with open(CONF_DIR / "response_property" / f"{response_property}.yaml") as f:
            response_property_cfg = yaml.safe_load(f)
        prompt = response_property_cfg["meta_level_prompt"]
        # we need to edit it
        assert (
            "response you gave" in prompt or "response that you gave" in prompt or "your response" in prompt or "you gave" in prompt or "you choose"
        ), f"prompt: {prompt}"
        prompt = prompt.replace("response you gave", "text you see")
        prompt = prompt.replace("response that you gave", "text you see")
        prompt = prompt.replace("your response", "the text you see")
        prompt = prompt.replace("you gave", "you see")
        prompt = prompt.replace("Did you choose", "Do you see")
        # split up the task strings to response properties and number of examples
        # construct training examples
        train_prompts = []
        train_targets = targets
        prompt_template = 'Consider this text: "{}". '

        for response in responses:
            train_prompts.append(prompt_template.format(response) + prompt)

        assert len(train_prompts) == len(train_targets) == len(strings)
        # turn into train.json file
        for prompt, target, string in zip(train_prompts, train_targets, strings):
            json_row = {
                "messages": [
                    {"role": "system", "content": ""},
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": target},
                ],
                "string": string,  # which we should never need
            }
            json_rows.append(json_row)

  0%|          | 0/8 [00:00<?, ?it/s]

Loading /shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_writing_stories_pick_name_train_task__note/data0.csv


 12%|█▎        | 1/8 [01:27<10:11, 87.41s/it]

[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_writing_stories_pick_name_train_task__note|1000|writing_stories_pick_name]:
  Compliance: 99.62%
[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_writing_stories_pick_name_train_task__note|1000|writing_stories_pick_name]:
  Excluded 3 non-compliant responses, leaving 781 rows
Loading /shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_wikipedia_long_train_task__note/data0.csv


 25%|██▌       | 2/8 [02:14<06:21, 63.53s/it]

[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_wikipedia_long_train_task__note|1000|wikipedia_long]:
  Compliance: 100.00%
[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_wikipedia_long_train_task__note|1000|wikipedia_long]:
  Excluded 0 non-compliant responses, leaving 1000 rows
Loading /shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_wealth_seeking_train_task__note/data0.csv


 38%|███▊      | 3/8 [02:44<04:01, 48.22s/it]

[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_wealth_seeking_train_task__note|1000|wealth_seeking]:
  Compliance: 99.60%
[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_wealth_seeking_train_task__note|1000|wealth_seeking]:
  Excluded 2 non-compliant responses, leaving 492 rows
Loading /shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_power_seeking_train_task__note/data0.csv


 50%|█████     | 4/8 [03:16<02:47, 41.97s/it]

[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_power_seeking_train_task__note|1000|power_seeking]:
  Compliance: 99.80%
[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_power_seeking_train_task__note|1000|power_seeking]:
  Excluded 1 non-compliant responses, leaving 493 rows
Loading /shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_arc_challenge_non_cot_train_task__note/data0.csv


 62%|██████▎   | 5/8 [03:49<01:56, 38.81s/it]

[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_arc_challenge_non_cot_train_task__note|1000|arc_challenge_non_cot]:
  Compliance: 100.00%
[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_arc_challenge_non_cot_train_task__note|1000|arc_challenge_non_cot]:
  Excluded 0 non-compliant responses, leaving 1000 rows
Loading /shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_countries_long_train_task__note/data0.csv


 75%|███████▌  | 6/8 [04:27<01:16, 38.36s/it]

[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_countries_long_train_task__note|1000|countries_long]:
  Compliance: 100.00%
[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_countries_long_train_task__note|1000|countries_long]:
  Excluded 0 non-compliant responses, leaving 1000 rows
Loading /shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_colors_long_train_task__note/data0.csv


 88%|████████▊ | 7/8 [05:01<00:37, 37.06s/it]

[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_colors_long_train_task__note|1000|colors_long]:
  Compliance: 100.00%
[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_colors_long_train_task__note|1000|colors_long]:
  Excluded 0 non-compliant responses, leaving 1000 rows
Loading /shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_numbers_train_task__note/data0.csv


100%|██████████| 8/8 [05:37<00:00, 42.15s/it]

[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_numbers_train_task__note|1000|numbers]:
  Compliance: 100.00%
[gpt-4o-2024-05-13||object_level/minimal|/shared/exp/felix/23_jul_fixed_tasks_medium_cross/object_level_gpt-4o-2024-05-13_object_level_minimal_prompt_numbers_train_task__note|1000|numbers]:
  Excluded 0 non-compliant responses, leaving 1000 rows





In [13]:
# dump to json
out_path = EXP_DIR / "finetuning" / f"{STUDY_NAME}_baseline" / f"{SOURCE_MODEL}_baseline" / "train_dataset.jsonl"
# make sure that the folder exists
out_path.parent.mkdir(exist_ok=True, parents=True)

In [14]:
with open(out_path, "w") as f:
    for json_row in tqdm(json_rows):
        f.write(json.dumps(json_row) + "\n")
print(f"Written to {out_path}")

100%|██████████| 6766/6766 [00:00<00:00, 53008.46it/s]

Written to /shared/exp/felix/finetuning/23_jul_fixed_tasks_medium_cross_baseline/gpt-4o-2024-05-13_baseline/train_dataset.jsonl





Now, we should be able to train a model on this by running:
`python -m evals.run_finetuning study_name={STUDY_NAME}_baseline/{SOURCE_MODEL}_baseline notes=resp_blin language_model={...}`

In [17]:
# print command
print(f"python -m evals.run_finetuning study_name={STUDY_NAME}_baseline/{SOURCE_MODEL}_baseline notes=resp_blin language_model=<INSERT MODEL CONFIG NAME HERE>")

python -m evals.run_finetuning study_name=23_jul_fixed_tasks_medium_cross_baseline/gpt-4o-2024-05-13_baseline notes=resp_blin language_model=<INSERT MODEL CONFIG NAME HERE>
