In [31]:
from pathlib import Path

In [32]:
from tqdm.auto import tqdm

In [62]:
# Path to one run's log, used to explore the log format.
# NOTE: despite the `_dir` suffix, this points at a single `.log` file.
log_dir = Path("logs/default/generation.direct.gpt-4-1106-preview_corrset_default_1_tree.log")

In [63]:
from evals.utils import log_utils

In [64]:
# Read the run specification recorded in the example log.
spec = log_utils.extract_spec(log_dir)

In [78]:
def handle_cot_double_sampling(sampling_entries, solver):
    """Keep only the answer-extraction samplings for chain-of-thought solvers.

    When ``"cot"`` occurs in ``solver``, entries are kept only if their prompt
    carries the "Given the above reasoning" marker:

    * list prompts (chat models): the last message's ``"content"`` must start
      with the marker;
    * string prompts (base models): the marker must appear anywhere in the
      prompt.

    Entries whose prompt is neither a list nor a string are dropped. For any
    other solver the input is returned unchanged (same object).
    """
    if "cot" not in solver:
        return sampling_entries

    marker = "Given the above reasoning"

    def _is_answer_sampling(entry):
        prompt = entry["prompt"]
        if isinstance(prompt, list):
            # chat models: look at the final message only
            return prompt[-1]["content"].startswith(marker)
        if isinstance(prompt, str):
            # base models: the whole prompt is one string
            return marker in prompt
        return False

    return [entry for entry in sampling_entries if _is_answer_sampling(entry)]

In [65]:
# Display the extracted spec for inspection.
spec

{'completion_fns': ['generation/direct/gpt-4-1106-preview'],
 'eval_name': 'identifying_variables.corrset.default',
 'base_eval': 'identifying_variables',
 'split': 'corrset',
 'run_config': {'completion_fns': ['generation/direct/gpt-4-1106-preview'],
  'eval_spec': {'cls': 'evals.elsuite.identifying_variables.eval:IdentifyingVariables',
   'registry_path': '/Users/thesofakillers/repos/dangerous-capability-evaluations/evals/registry',
   'args': {'samples_jsonl': 'identifying_variables/500.jsonl',
    'renderer': 'corrset',
    'group_metrics': True},
   'key': 'identifying_variables.corrset.default',
   'group': 'identifying_variables'},
  'seed': 1,
  'max_samples': None,
  'command': '/Users/thesofakillers/miniconda3/envs/evals/bin/oaieval generation/direct/gpt-4-1106-preview identifying_variables.corrset.default --extra_eval_param show_tree=True --record_path ./logs/20240112_182258/generation.direct.gpt-4-1106-preview_corrset_default_1_tree.log --seed 1',
  'initial_settings': {'vi

In [66]:
# Final results of the example run, as returned by log_utils.extract_final_results.
final_res = log_utils.extract_final_results(log_dir)

In [67]:
# Individual result entries of the example run, requested with the "sampling" argument.
ind_res = log_utils.extract_individual_results(log_dir, "sampling")

In [68]:
# Number of sampling entries extracted from the example log.
len(ind_res)

500

In [39]:
# Inspect the token-usage record attached to the first sampling entry.
ind_res[0]['usage']

{'completion_tokens': 248, 'prompt_tokens': 541, 'total_tokens': 789}

In [80]:
# Evals whose logs are analysed below.
eval_names = [
    "identifying_variables.corrset.default",
    "identifying_variables.language-tabular.default",
]
# Solvers (completion functions) whose token usage is compared.
solver_names = [
    "generation/hhh/gpt-4-base",
    "generation/direct/gpt-3.5-turbo",
    "generation/direct/gpt-4-1106-preview",
    "generation/cot_hhh/gpt-4-base",
    "generation/cot/gpt-3.5-turbo",
    "generation/cot/gpt-4-1106-preview",
]
# Solvers with "cot" in their name map to the second eval, all others to the first.
solver_to_eval = {
    solver: eval_names[1] if "cot" in solver else eval_names[0]
    for solver in solver_names
}
# Boolean flag per solver: True exactly when the solver name contains "cot".
solver_to_tree = {solver: "cot" in solver for solver in solver_names}

In [72]:
import pandas as pd

In [85]:
# Empty summary table: one row per solver, one column per token statistic.
# Rows are filled in once the per-solver means have been computed.
tokens_per_sample_df = pd.DataFrame(
    index=solver_to_eval.keys(),
    columns=["input tokens/sample", "output tokens/sample", "total tokens/sample"],
)

In [74]:
# Directory that is scanned for `*.log` files when aggregating token usage.
results_dir = Path("logs/default/")

In [94]:
def np_nan_if_none(input_num):
    """Return ``np.nan`` in place of ``None``; pass any other value through unchanged."""
    return np.nan if input_num is None else input_num

In [95]:
# Collect per-sample token usage (input / output / total) for every solver of
# interest, reading one matching log per solver from `results_dir`.
solver_to_tokens = {
    solver: {"input": [], "output": [], "total": []} for solver in solver_names
}
# Materialise the glob so tqdm can infer the real number of logs, instead of
# relying on a hard-coded total that goes stale whenever the directory changes.
log_paths = list(results_dir.glob("*.log"))
for log in tqdm(log_paths):
    spec = log_utils.extract_spec(log)
    solver = spec["completion_fns"][0]
    eval_name = spec["eval_name"]
    seed = spec["run_config"]["seed"]
    tree = "show_tree=True" in spec["run_config"]["command"]
    # NOTE(review): a log is kept when its show_tree flag is the *opposite* of
    # solver_to_tree[solver] -- confirm this inversion is intended.
    if not (
        solver in solver_to_eval
        and eval_name == solver_to_eval[solver]
        and seed == 1
        and tree != solver_to_tree[solver]
    ):
        # not a solver of interest
        continue
    print(solver)
    samplings = log_utils.extract_individual_results(log, "sampling")
    # For CoT solvers, keep only the samplings carrying the answer-extraction prompt.
    samplings = handle_cot_double_sampling(samplings, solver)
    for sampling in samplings:
        usage = sampling["usage"]
        # Missing counts (None) are stored as NaN so the means can skip them.
        for bucket, usage_key in (
            ("input", "prompt_tokens"),
            ("output", "completion_tokens"),
            ("total", "total_tokens"),
        ):
            solver_to_tokens[solver][bucket].append(np_nan_if_none(usage[usage_key]))

  0%|          | 0/222 [00:00<?, ?it/s]

generation/cot/gpt-4-1106-preview
generation/hhh/gpt-4-base
generation/direct/gpt-3.5-turbo
generation/cot_hhh/gpt-4-base
generation/direct/gpt-4-1106-preview
generation/cot/gpt-3.5-turbo


In [83]:
# Sanity check: number of output-token samples collected for one solver.
len(solver_to_tokens['generation/direct/gpt-3.5-turbo']['output'])

500

In [86]:
import numpy as np

In [96]:
# Fill one row per solver with the NaN-ignoring mean of each token series.
for solver, token_lists in solver_to_tokens.items():
    input_mean, output_mean, total_mean = (
        np.nanmean(token_lists[kind]) for kind in ("input", "output", "total")
    )
    tokens_per_sample_df.loc[solver] = [input_mean, output_mean, total_mean]

[850.766, 175.506, 1026.272]
[850.766, 106.1, 956.866]
[2293.766, 496.382, 2790.148]
[1560.354, 27.096, 1587.45]
[1710.938, 26.722, 1737.66]
[2849.714, 475.9899799599198, 3324.752]


In [97]:
# Display the per-solver mean token counts.
tokens_per_sample_df

Unnamed: 0,input tokens/sample,output tokens/sample,total tokens/sample
generation/direct/gpt-3.5-turbo,850.766,175.506,1026.272
generation/direct/gpt-4-1106-preview,850.766,106.1,956.866
generation/hhh/gpt-4-base,2293.766,496.382,2790.148
generation/cot/gpt-3.5-turbo,1560.354,27.096,1587.45
generation/cot/gpt-4-1106-preview,1710.938,26.722,1737.66
generation/cot_hhh/gpt-4-base,2849.714,475.98998,3324.752
