# Baseline of learning to perform response properties

We want a baseline that looks at instruction following without teaching introspection.

The plan: we show the model a response from Claude 3 Sonnet and then ask it to generate the response property for that response. We then fine-tune on these examples, using the correct response property as the target.


In [None]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict
import yaml
from tqdm import tqdm

In [None]:
from evals.locations import DATASET_DIR, EXP_DIR, CONF_DIR
from evals.analysis.loading_data import (
    get_folders_matching_config_key,
    load_and_prep_dfs,
    load_single_df_from_exp_path,
    get_hydra_config,
)

In [None]:
# Model whose object-level responses are shown as text in the training prompts.
SOURCE_MODEL = "claude-3-sonnet-20240229"
# Study folder under EXP_DIR that holds the source runs; also used to name the
# output folder of the generated fine-tuning dataset.
STUDY_NAME = "may20_thrifty_sweep"
# Number of training rows to take per (task, response property) pair.
N_FINETUNING = 200

In [None]:
# Single seed for the notebook: it seeds numpy's global RNG here and is also
# passed explicitly as `random_state` when the rows are shuffled.
SEED = 0
np.random.seed(seed=SEED)

In [None]:
# Property set shared by several of the tasks below.
_TEXT_PROPERTIES = ("identity", "syllable_count", "first_character", "last_character")

# Maps each task name to the response properties to build training data for.
# The order inside each list matters: the i-th property of a task receives the
# i-th slice of that task's shuffled rows.
TASKS = {
    "wikipedia": list(_TEXT_PROPERTIES),
    "dear_abbie": ["identity", "sentiment"],
    "number_triplets": ["identity", "is_even", "last_character", "first_character"],
    "daily_dialog": list(_TEXT_PROPERTIES),
    "personal_preferences": list(_TEXT_PROPERTIES),
    "self_referential": list(_TEXT_PROPERTIES),
    "writing_stories": ["identity", "first_word", "writing_stories/main_character_name"],
}

Load the object-level results.


We should already have the response properties that we care about, stored as columns of the loaded dataframe.


In [None]:
# Scratch values for a single task / response property pair.
# NOTE(review): both names are rebound by the loops in the generation cell
# below, so these look like leftovers from interactive debugging — confirm.
task, response_property = "wikipedia", "first_character"

In [None]:
# Build the fine-tuning rows.
#
# For every task we load the object-level results of SOURCE_MODEL, shuffle
# them, and hand each response property its own disjoint slice of up to
# N_FINETUNING rows. Every row becomes one chat example: the user message
# quotes the response as text and asks for the property, the assistant message
# is the value stored in the dataframe column named after the property.
json_rows = []

# Phrases in the meta-level prompt that refer to the model's own answer, and
# what they are rewritten to so the prompt talks about the quoted text instead.
# Order matters: the specific phrases must be replaced before the generic
# "you gave", which is a substring of the first two.
PROMPT_REWRITES = (
    ("response you gave", "text you see"),
    ("response that you gave", "text you see"),
    ("your response", "the text you see"),
    ("you gave", "you see"),
)
prompt_template = 'Consider this text: "{}". '

for task, response_properties in tqdm(TASKS.items()):
    folders = get_folders_matching_config_key(
        EXP_DIR / STUDY_NAME,
        {
            ("language_model", "model"): [SOURCE_MODEL],
            ("task", "name"): [task],
            ("task", "set"): ["train"],
        },
    )
    assert len(folders) == 1, f"Found {len(folders)} folders, expected 1"
    folder = folders[0]
    # cfg is not read below. NOTE(review): the call is kept in case loading the
    # config doubles as a sanity check on the folder — confirm and drop.
    cfg = get_hydra_config(folder)
    # shuffle the rows once per task so the per-property slices are random
    task_df = load_single_df_from_exp_path(folder).sample(frac=1, random_state=SEED)

    for i, response_property in enumerate(response_properties):
        # disjoint window of the shuffled rows for this property; iloc clips
        # out-of-range slices, so the window can be short or even empty
        property_df = task_df.iloc[i * N_FINETUNING : (i + 1) * N_FINETUNING]
        if len(property_df) < N_FINETUNING:
            print(
                f"Warning: only {len(property_df)} of {N_FINETUNING} rows "
                f"available for {task}/{response_property}"
            )

        # get the response property prompt by reading in the yaml file
        property_cfg_path = CONF_DIR / "response_property" / f"{response_property}.yaml"
        with open(property_cfg_path, encoding="utf-8") as f:
            response_property_cfg = yaml.safe_load(f)
        meta_prompt = response_property_cfg["meta_level_prompt"]

        # the prompt must mention the model's own answer in one of the known
        # ways, otherwise the rewrite below would silently leave it unchanged
        assert any(old in meta_prompt for old, _ in PROMPT_REWRITES), f"prompt: {meta_prompt}"
        for old, new in PROMPT_REWRITES:
            meta_prompt = meta_prompt.replace(old, new)

        # one chat-format training example per row
        # NOTE(review): `target` is written as stored in the dataframe; if a
        # property column is not string-typed the assistant content will not be
        # a string either — confirm against what the fine-tuning code expects.
        for response, target, string in zip(
            property_df.response, property_df[response_property], property_df.string
        ):
            json_rows.append(
                {
                    "messages": [
                        {"role": "system", "content": ""},
                        {"role": "user", "content": prompt_template.format(response) + meta_prompt},
                        {"role": "assistant", "content": target},
                    ],
                    "string": string,  # which we should never need
                }
            )

In [None]:
# dump to json
# Destination folder of the generated dataset; created (with parents) if missing.
out_dir = EXP_DIR / "finetuning" / f"{STUDY_NAME}_baseline" / f"{SOURCE_MODEL}_baseline"
out_dir.mkdir(parents=True, exist_ok=True)
# The training rows are dumped here as JSON lines.
out_path = out_dir / "train_dataset.jsonl"

In [None]:
# Write one JSON object per line (JSONL), overwriting any existing file.
with open(out_path, "w") as f:
    f.writelines(json.dumps(row) + "\n" for row in tqdm(json_rows))
print(f"Written to {out_path}")

Now, we should be able to train a model on this by running:
`python -m evals.run_finetuning study_name={STUDY_NAME}_baseline/{SOURCE_MODEL}_baseline notes=resp_blin language_model={...}`