In [1]:
from pathlib import Path

In [2]:
import numpy as np

In [3]:
from tqdm.auto import tqdm

In [4]:
log_dir = Path("logs/default/generation.direct.gpt-4-1106-preview_corrset_default_1_tree.log")

In [5]:
from evals.utils import log_utils

In [6]:
spec = log_utils.extract_spec(log_dir)

In [31]:
def np_nan_if_none(input_num):
    if input_num is None:
        return np.nan
    else:
        return input_num
    
def zero_if_none(input_num):
    if input_num is None:
        return 0
    else:
        return input_num

In [8]:
def handle_cot_double_sampling(sampling_entries, solver):
    if "cot" in solver:
        sampling_entries = [
            entry
            for entry in sampling_entries
            if (
                # for chat models we filter like this
                isinstance(entry["prompt"], list)
                and entry["prompt"][-1]["content"].startswith(
                    "Given the above reasoning"
                )
                or (
                    # for base models we need to filter like this
                    isinstance(entry["prompt"], str)
                    and "Given the above reasoning" in entry["prompt"]
                )
            )
        ]
    return sampling_entries

In [9]:
spec

{'completion_fns': ['generation/direct/gpt-4-1106-preview'],
 'eval_name': 'identifying_variables.corrset.default',
 'base_eval': 'identifying_variables',
 'split': 'corrset',
 'run_config': {'completion_fns': ['generation/direct/gpt-4-1106-preview'],
  'eval_spec': {'cls': 'evals.elsuite.identifying_variables.eval:IdentifyingVariables',
   'registry_path': '/Users/thesofakillers/repos/dangerous-capability-evaluations/evals/registry',
   'args': {'samples_jsonl': 'identifying_variables/500.jsonl',
    'renderer': 'corrset',
    'group_metrics': True},
   'key': 'identifying_variables.corrset.default',
   'group': 'identifying_variables'},
  'seed': 1,
  'max_samples': None,
  'command': '/Users/thesofakillers/miniconda3/envs/evals/bin/oaieval generation/direct/gpt-4-1106-preview identifying_variables.corrset.default --extra_eval_param show_tree=True --record_path ./logs/20240112_182258/generation.direct.gpt-4-1106-preview_corrset_default_1_tree.log --seed 1',
  'initial_settings': {'vi

In [10]:
final_res = log_utils.extract_final_results(log_dir)

In [11]:
ind_res = log_utils.extract_individual_results(log_dir, "sampling")

In [12]:
len(ind_res)

500

In [13]:
ind_res[0]['usage']

{'completion_tokens': 9, 'prompt_tokens': 869, 'total_tokens': 878}

In [14]:
eval_names = [
    "identifying_variables.corrset.default",
    "identifying_variables.language-tabular.default",
]
solver_names = [
    "generation/hhh/gpt-4-base",
    "generation/direct/gpt-3.5-turbo",
    "generation/direct/gpt-4-1106-preview",
    "generation/cot_hhh/gpt-4-base",
    "generation/cot/gpt-3.5-turbo",
    "generation/cot/gpt-4-1106-preview",
]
solver_to_eval = {
    solver: eval_names[0] if "cot" not in solver else eval_names[1]
    for solver in solver_names
}
solver_to_tree = {
    solver: False if "cot" not in solver else True for solver in solver_names
}

In [15]:
import pandas as pd

In [16]:
tokens_per_sample_df = pd.DataFrame(
    index=solver_to_eval.keys(),
    columns=["input tokens/sample", "output tokens/sample", "total tokens/sample"],
)

In [17]:
results_dir = Path("logs/default/")

In [32]:
solver_to_tokens = {
    solver: {"input": [], "output": [], "total": []} for solver in solver_names
}
total_input = 0
total_output = 0
for log in tqdm(results_dir.glob("*.log"), total=222):
    spec = log_utils.extract_spec(log)
    solver = spec["completion_fns"][0]
    eval_name = spec["eval_name"]
    seed = spec["run_config"]["seed"]
    tree = "show_tree=True" in spec["run_config"]["command"]
    samplings = log_utils.extract_individual_results(log, "sampling")
    samplings = handle_cot_double_sampling(samplings, solver)
    for sampling in samplings:
        usage = sampling["usage"]
        if (
            solver in solver_to_eval
            and eval_name == solver_to_eval[solver]
            and seed == 1
            and tree != solver_to_tree[solver]
        ):
            solver_to_tokens[solver]["input"].append(
                np_nan_if_none(usage["prompt_tokens"])
            )
            solver_to_tokens[solver]["output"].append(
                np_nan_if_none(usage["completion_tokens"])
            )
            solver_to_tokens[solver]["total"].append(
                np_nan_if_none(usage["total_tokens"])
            )
        total_input += zero_if_none(usage["prompt_tokens"])
        total_output += zero_if_none(usage["completion_tokens"])

  0%|          | 0/222 [00:00<?, ?it/s]

In [33]:
total_input

179460552

In [34]:
total_output

24819644

In [19]:
len(solver_to_tokens['generation/direct/gpt-3.5-turbo']['output'])

500

In [22]:
for solver in solver_to_tokens.keys():
    # print(solver_to_tokens[solver])
    input_mean = np.nanmean(solver_to_tokens[solver]["input"])
    output_mean = np.nanmean(solver_to_tokens[solver]["output"])
    total_mean = np.nanmean(solver_to_tokens[solver]["total"])
    # print([input_mean, output_mean, total_mean])
    tokens_per_sample_df.loc[solver] = [
        round(input_mean),
        round(output_mean),
        round(total_mean),
    ]

In [24]:
solver_to_index = {
    "generation/hhh/gpt-4-base": "HHH GPT-4-base (corrset, no tree)",
    "generation/direct/gpt-3.5-turbo": "Direct GPT-3.5-turbo (corrset, no tree)",
    "generation/direct/gpt-4-1106-preview": "Direct GPT-4-1106-preview (corrset, no tree)",
    "generation/cot_hhh/gpt-4-base": "CoT HHH GPT-4-base (language-tabular, with tree)",
    "generation/cot/gpt-3.5-turbo": "CoT GPT-3.5-turbo (language-tabular, with tree)",
    "generation/cot/gpt-4-1106-preview": "CoT GPT-4-1106-preview (language-tabular, with tree)",
}

In [25]:
tokens_per_sample_df = tokens_per_sample_df.rename(index=solver_to_index)

In [26]:
tokens_per_sample_df

Unnamed: 0,input tokens/sample,output tokens/sample,total tokens/sample
"HHH GPT-4-base (corrset, no tree)",2294,496,2790
"Direct GPT-3.5-turbo (corrset, no tree)",851,176,1026
"Direct GPT-4-1106-preview (corrset, no tree)",851,106,957
"CoT HHH GPT-4-base (language-tabular, with tree)",2850,476,3325
"CoT GPT-3.5-turbo (language-tabular, with tree)",1560,27,1587
"CoT GPT-4-1106-preview (language-tabular, with tree)",1711,27,1738
