# Computing the ceiling performance for a model on the sweep

In [None]:
import ast
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
from evals.locations import REPO_DIR, EXP_DIR
from evals.utils import run_command
from evals.analysis.loading_data import get_hydra_config
from evals.analysis.loading_data import load_single_df_from_exp_path

## Which models, and which tasks?
Using the format from `scripts/sweep_full_study.py`.

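`TASKS` is a dict written out as a string (mapping each task name to its list of response properties); it is parsed into an actual dict a couple of cells below.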

In [None]:
# Name of the sweep. It is used below to locate the divergent-string CSVs under
# EXP_DIR and to name the "nondeterminism_ceiling/<STUDY_NAME>" study and output files.
STUDY_NAME = "may20_thrifty_sweep"
# Models to evaluate; each entry is passed verbatim as `language_model=` to
# evals/run_object_level.py. Commented-out entries are excluded from the run.
MODELS = [
    "claude-3-sonnet",
    "gpt-3.5-turbo",
    "gpt-4",
    "gemini-1.0-pro-002",
    "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9R9Lqsm2", #ft:gpt-3.5-turbo-1106:dcevals-kokotajlo:sweep:9R9Lqsm2",
    "finetuned/may20_thrifty_sweep/claude-3-sonnet/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9R9L0Ddt",
    "finetuned/may20_thrifty_sweep/gpt-4/ft_gpt-4-0613_dcevals-kokotajlo_sweep_9RSQ9BDP",
    "finetuned/may20_thrifty_sweep/claude-3-sonnet/ft_gpt-4-0613_dcevals-kokotajlo_sweep_9RSQHCmp",
    "finetuned/may20_thrifty_sweep/gpt-4/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9RSPteWA",
    "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-4-0613_dcevals-kokotajlo_sweep_9RSPjTJF",
    # "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_lr2_9RW1QKsf",
    # "finetuned/may20_thrifty_sweep/gpt-3.5-turbo-0125/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9Th6cCBF",
    # "finetuned/may20_thrifty_sweep/gpt-4-turbo/ft_gpt-3.5-turbo-0125_dcevals-kokotajlo_sweep_9ThUFr7R",
    # "finetuned/may20_thrifty_sweep/gpt-4-turbo/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_sweep_9ThBY0oK",
    # "finetuned/may20_thrifty_sweep/gpt-3.5-turbo-0125/ft_gpt-3.5-turbo-0125_dcevals-kokotajlo_sweep_9Th7D4TK",
    "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-3.5-turbo-0125_dcevals-kokotajlo_sweep_9ThVmSp2",
    "finetuned/may20_thrifty_sweep/claude-3-sonnet/ft_gpt-3.5-turbo-0125_dcevals-kokotajlo_sweep_9Th9i5Mf",
    # "finetuned/may20_thrifty_sweep/gpt-3.5-turbo/ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_scramble_9TfFZ0nD",
]
# String form of a dict mapping task name -> list of response properties.
# It is converted into a real dict in a later cell before being indexed by task.
TASKS = '{"number_triplets": ["identity", "is_even", "last_character", "first_character"], "wikipedia": ["identity", "syllable_count", "first_character", "last_character"], "writing_stories": ["identity", "first_word", "writing_stories/main_character_name"], "personal_preferences": ["identity", "syllable_count", "first_character", "last_character"], }'

In [None]:
# other hyperparameters
N_PER_TASK = 10  # passed to run_object_level.py as task.num
SEED = 42
# SAMPLES_PER_INPUT = 100
SAMPLES_PER_INPUT = 50  # passed to run_object_level.py as n_samples

# Seed NumPy's global RNG so the permutation- and bootstrap-based estimates
# computed further down are reproducible after a kernel restart. SEED was
# previously defined but never applied anywhere in the notebook.
np.random.seed(SEED)

In [None]:
# TASKS is written above as the string form of a dict. Parse it with
# ast.literal_eval, which only accepts Python literals (eval would execute
# arbitrary code). The isinstance guard keeps this cell safe to re-run once
# TASKS has already been converted to a dict.
if isinstance(TASKS, str):
    TASKS = ast.literal_eval(TASKS)

## Run the ceiling calculation

In [None]:
# Launch one object-level run per (model, task). When a CSV of divergent strings
# exists for the task, it is passed to the run via `strings_path=`.
ceiling_study_name = 'nondeterminism_ceiling/' + STUDY_NAME
for model in tqdm(MODELS):
    for task in TASKS.keys():
        # can we get the model divergent strings?
        model_divergent_string_path = EXP_DIR / STUDY_NAME / f"divergent_strings_{task}.csv"
        # Part of the command that is shared by both cases (note the trailing space).
        command = (
            f"cd {REPO_DIR} && python3 {REPO_DIR}/evals/run_object_level.py "
            f"study_name={ceiling_study_name} task={task} language_model={model} "
            f"task.set=val n_samples={SAMPLES_PER_INPUT} task.num={N_PER_TASK} "
        )
        if os.path.exists(model_divergent_string_path):
            print(f"üîç Found divergent strings for {model} on {task}")
            command += f"strings_path={model_divergent_string_path} "
        else:
            print(f"üîç‚ö†Ô∏è Could not find divergent strings for {model} on {task}‚ÄîRunning without")
        print(f"üèÉ‚Äç‚û°Ô∏è Running {model} on {task}: {command}")
        run_command(command)

## Extract the response properties

In [None]:
# Collect the immediate subdirectories of the ceiling study's results folder.
results_folder = EXP_DIR / "nondeterminism_ceiling" / STUDY_NAME
top_level_listing = next(os.walk(results_folder))  # (dirpath, dirnames, filenames) of the folder itself
subfolders = [results_folder / dirname for dirname in top_level_listing[1]]
print(f"Got {len(subfolders)} subfolders")

In [None]:
# Run property extraction for every response property configured for the task
# of each result folder.
for folder in tqdm(subfolders):
    # load config
    try:
        cfg = get_hydra_config(folder)
    except ValueError:
        print(f"Skipping {folder}")
        continue
    task = cfg.task.name
    # Take the model from this folder's own config. Previously the log line used
    # whatever `model` was left over from the run loop above (always the last
    # entry of MODELS), or failed with NameError if that cell had not been run.
    model = cfg.language_model.model
    response_properties = TASKS[task]
    for response_property in response_properties:
        command = f"cd {REPO_DIR} && python3 {REPO_DIR}/evals/run_property_extraction.py dir={folder} response_property={response_property}"
        print(f"üõ∏ Extracting {response_property} on {model} on {task}: {command}")
        try:
            run_command(command)
        except Exception as e:
            # Best effort: report the failure and carry on with the remaining extractions.
            print(f"Error: {e}\nwhile running {command}")

## Compute the Ceiling

In [None]:
# Number of resampling iterations used by the resampling loops in the cells below.
BOOTSTRAP_N = 1000

In [None]:
def compute_pairwise_match(df_subset, response_property='identity'):
    """Fraction of samples whose response equals that of a randomly paired sample.

    The responses in column `response_property` are compared position by position
    against one random permutation of themselves (drawn from NumPy's global RNG).

    Args:
        df_subset: DataFrame whose rows must all share a single `string` value.
        response_property: Name of the column holding the responses to compare.

    Returns:
        The mean of the element-wise equality between the responses and their
        permutation.

    Raises:
        AssertionError: If `df_subset['string']` does not have exactly one unique value.
    """
    n_distinct_strings = df_subset['string'].nunique()
    assert n_distinct_strings == 1, "Expected all samples to be from the same string"
    observed = df_subset[response_property].values
    reshuffled = np.random.permutation(observed)
    agreement = observed == reshuffled
    return np.mean(agreement)

We'd have to make them be the same distribution, but with different levels of noise. Seems harder. 

The way to do this would be: 
- for each of the two sets of responses
    - find the most common response, rename 'A'
    - find the second most common response, rename 'B'
    - ...
- see how often a response drawn at random from one set matches a response drawn at random from the other

In [None]:
def compute_pairwise_match_across_sets(df_subsetA, df_subsetB, response_property='identity'):
    """Probability that rank-aligned responses drawn from two sample sets match.

    Within each set the responses are replaced by their frequency rank (most
    common -> 0, second most common -> 1, ...). The return value is the
    probability that a response drawn uniformly at random from set A has the same
    rank as one drawn uniformly at random from set B, i.e. sum_i pA(i) * pB(i)
    over the ranks present in both sets.

    This is computed exactly. The previous implementation estimated the same
    quantity by drawing BOOTSTRAP_N random pairs one element at a time, which was
    slow and added sampling noise to every value.

    Args:
        df_subsetA: DataFrame whose rows all share a single `string` value.
        df_subsetB: DataFrame whose rows all share a single `string` value.
        response_property: Name of the column holding the responses.

    Returns:
        The match probability as a float in [0, 1].

    Raises:
        AssertionError: If either subset does not have exactly one unique
            `string` value (this includes empty subsets).
    """
    assert df_subsetA['string'].nunique() == 1, "Expected all samples to be from the same string"
    assert df_subsetB['string'].nunique() == 1, "Expected all samples to be from the same string"

    def _ranked_frequencies(responses):
        # Relative frequency of each distinct response, most common first.
        counts = {}
        for response in responses:
            counts[response] = counts.get(response, 0) + 1
        ranked_counts = np.array(sorted(counts.values(), reverse=True), dtype=float)
        return ranked_counts / ranked_counts.sum()

    freqsA = _ranked_frequencies(df_subsetA[response_property].values)
    freqsB = _ranked_frequencies(df_subsetB[response_property].values)
    # Ranks that exist in only one of the sets can never match, so only the
    # overlapping ranks contribute.
    n_shared_ranks = min(len(freqsA), len(freqsB))
    return np.dot(freqsA[:n_shared_ranks], freqsB[:n_shared_ranks])

In [None]:
def bootstrap_95_CI(samples):
    """Percentile-bootstrap 95% interval for the mean of `samples`.

    Draws BOOTSTRAP_N resamples (with replacement, same length as `samples`) from
    NumPy's global RNG, takes the mean of each, and returns the 2.5th and 97.5th
    percentiles of those means.

    Args:
        samples: 1-D sequence of numeric values.

    Returns:
        Array of two values: the lower and upper percentile bounds.
    """
    n_samples = len(samples)
    resampled_means = [
        np.mean(np.random.choice(samples, n_samples, replace=True))
        for _ in range(BOOTSTRAP_N)
    ]
    return np.percentile(resampled_means, [2.5, 97.5])

In [None]:
def compute_ceiling(folder, response_property):
    """Estimate the self-agreement ceiling of one run for one response property.

    For every distinct input string, the pairwise match rate of the samples is
    computed BOOTSTRAP_N times (each time against a fresh random permutation) and
    averaged. The ceiling is the mean of those per-string averages.

    The confidence interval is bootstrapped over the per-string averages, the
    same way compute_ceiling_for_model_pair does it. It was previously
    bootstrapped over the pooled permutation replicates of all strings, which
    treats repeated permutations of the same samples as independent observations
    and therefore yields an artificially narrow interval.

    Args:
        folder: Experiment folder passed to load_single_df_from_exp_path.
        response_property: Name of the response column to evaluate.

    Returns:
        Tuple of (mean ceiling, 2-element array with the 95% CI bounds).
    """
    # load df
    df = load_single_df_from_exp_path(folder, exclude_noncompliant=False) # TODO Should this be true? That might increase the ceiling.
    means_across_strings = []

    for string in tqdm(df.string.unique()):
        # Filter once per string instead of once per permutation replicate.
        df_string_subset = df[df.string == string]
        samples_across_iters = [compute_pairwise_match(df_string_subset, response_property) for _ in range(BOOTSTRAP_N)]
        means_across_strings.append(np.mean(samples_across_iters))

    # return mean and 95%CI of mean
    return np.mean(means_across_strings), bootstrap_95_CI(means_across_strings)

In [None]:
def compute_ceiling_for_model_pair(folderA, folderB, response_property):
    """Estimate how well the responses of one run match those of another run.

    For every input string present in both runs, the rank-aligned match rate
    between the two sets of samples is computed with
    compute_pairwise_match_across_sets; the result is the mean over strings and a
    bootstrapped 95% CI of that mean.

    Strings are visited in the order they first appear in run A. Iterating over a
    `set` of strings (as before) gives an order that varies between interpreter
    sessions, which makes the seeded random estimates irreproducible.

    Args:
        folderA: Experiment folder of the first run.
        folderB: Experiment folder of the second run.
        response_property: Name of the response column to evaluate.

    Returns:
        Tuple of (mean match rate, 2-element array with the 95% CI bounds). Both
        are NaN when the two runs share no strings.
    """
    # load df
    dfA = load_single_df_from_exp_path(folderA, exclude_noncompliant=False) # TODO Should this be true? That might increase the ceiling.
    dfB = load_single_df_from_exp_path(folderB, exclude_noncompliant=False) # TODO Should this be true? That might increase the ceiling.

    # Only strings present in both runs can be compared.
    stringsB = set(dfB.string.unique())
    shared_strings = [s for s in dfA.string.unique() if s in stringsB]

    means_across_strings = []
    for current_string in tqdm(shared_strings):
        dfA_string_subset = dfA[dfA.string == current_string]
        dfB_string_subset = dfB[dfB.string == current_string]
        # Guard against values that never compare equal to themselves (e.g. NaN).
        if len(dfA_string_subset) == 0 or len(dfB_string_subset) == 0:
            continue
        mean_acc = compute_pairwise_match_across_sets(dfA_string_subset, dfB_string_subset, response_property)
        means_across_strings.append(mean_acc)

    if not means_across_strings:
        # Nothing to compare: return NaNs explicitly instead of relying on NumPy's
        # empty-input warnings.
        return np.nan, np.array([np.nan, np.nan])

    # return mean and 95%CI of mean
    return np.mean(means_across_strings), bootstrap_95_CI(means_across_strings)

In [None]:
# Ceiling (mean, CI) for every run folder and response property, keyed by
# (model, task, response_property).
ceiling_results = {}

for folder in tqdm(subfolders):
    try:
        cfg = get_hydra_config(folder)
    except ValueError:
        # Same handling as in the extraction loop: report the folder and move on.
        print(f"Skipping {folder}")
        continue
    task, model = cfg.task.name, cfg.language_model.model
    for response_property in TASKS[task]:
        mean, ci = compute_ceiling(folder, response_property)
        result_key = (model, task, response_property)
        ceiling_results[result_key] = (mean, ci)

# One row per key, with the two tuple entries as columns.
ceiling_results_df = pd.DataFrame(ceiling_results).T
ceiling_results_df.columns = ['mean', 'ci']


In [None]:
# Show the ceiling table (one row per model / task / response property).
ceiling_results_df

In [None]:
# aggregated mean by model (level 0 of the index is the model)
mean_ceiling_by_model = ceiling_results_df['mean'].groupby(level=0).mean()
display(mean_ceiling_by_model)

We also want to calculate how well each model predicts every other model.

We'd have to make them be the same distribution, but with different levels of noise. Seems harder. 

The way to do this would be: 
- for each of the two sets of responses
    - find the most common response, rename 'A'
    - find the second most common response, rename 'B'
    - ...
- see how often a response drawn at random from one set matches a response drawn at random from the other

In [None]:
# Index the run folders as {model: {task: folder}}.
# Despite the name, the values stored here are folders, not DataFrames.
dfs_per_model_task = {}

for folder in tqdm(subfolders):
    try:
        cfg = get_hydra_config(folder)
    except ValueError:
        print(f"Skipping {folder}")
        continue
    task = cfg.task.name
    model = cfg.language_model.model
    # Create the per-model dict on first sight; a later folder for the same
    # (model, task) overwrites the earlier one.
    dfs_per_model_task.setdefault(model, {})[task] = folder

In [None]:
# Cross-model ceiling for every ordered model pair, task and response property,
# keyed by (modelA, modelB, task, response_property).
ceiling_pair_results = {}

for modelA in tqdm(dfs_per_model_task.keys()):
    for task, folderA in dfs_per_model_task[modelA].items():
        for modelB in dfs_per_model_task.keys():
            # Not every model necessarily has a folder for every task (e.g. a run
            # that failed or was skipped). Skip such pairs instead of raising
            # KeyError and losing the whole computation.
            folderB = dfs_per_model_task[modelB].get(task)
            if folderB is None:
                print(f"Skipping {modelA} vs {modelB} on {task}: no results folder for {modelB}")
                continue
            for response_property in TASKS[task]:
                mean, ci = compute_ceiling_for_model_pair(folderA, folderB, response_property)
                ceiling_pair_results[(modelA, modelB, task, response_property)] = {'mean': mean, 'ci': ci}

ceiling_paired_results_df = pd.DataFrame(ceiling_pair_results).T

In [None]:
# Display the pair results as a table with one row per result key.
pd.DataFrame(ceiling_pair_results).transpose()

In [None]:
# grouped by model pair (index levels 0 and 1 are modelA and modelB)
mean_ceiling_by_pair = ceiling_paired_results_df['mean'].groupby(level=[0, 1]).mean()
display(mean_ceiling_by_pair)

In [None]:
# Bind a second name to the same DataFrame object (no copy is made here).
ceiling_pair_results_df = ceiling_paired_results_df

In [None]:
# Flatten the (modelA, modelB, task, response_property) index into columns.
# Build a new frame from the MultiIndexed one instead of resetting in place:
# the in-place version also modified `ceiling_paired_results_df` (same object)
# and added yet another index column every time this cell was re-run.
ceiling_pair_results_df = ceiling_paired_results_df.reset_index()

In [None]:
# Name the four former index levels and the two value columns.
ceiling_pair_results_df.columns = [
    'modelA',
    'modelB',
    'task',
    'response_property',
    'ceiling',
    'ci',
]

In [None]:
# Show the flattened pair-results table.
ceiling_pair_results_df

## Save results
as .csv

In [None]:
# Write the per-model ceilings next to the study's results folder.
ceiling_results_csv_path = EXP_DIR / "nondeterminism_ceiling" / f"{STUDY_NAME}_ceiling_results.csv"
ceiling_results_df.to_csv(ceiling_results_csv_path)

In [None]:
# Write the model-pair ceilings next to the study's results folder.
ceiling_pair_results_csv_path = EXP_DIR / "nondeterminism_ceiling" / f"{STUDY_NAME}_ceiling_pair_results.csv"
ceiling_pair_results_df.to_csv(ceiling_pair_results_csv_path)