# Pretty Plots
These are meant for pretty plots. Use the structured input to load the data and to assign plotting functions. For a more detailed/sweeping analysis, see the other notebooks.

## Load in dataframes

```
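EXPERIMENTS = {
    "Experiment Name": {
        "study": "study_folder_name",  # folder inside the exp/ directory
        "exp": "experiment_folder_name",  # folder inside exp/<study>/
        "style_args": {
            "color": "red",
            "linestyle": "-",
            "linewidth": 2,
            "marker": "o",
            "markersize": 6
        }
    },
    ...
}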

```

Copying the few cells below into the experiment log files might be a good idea.

In [1]:
# load in the data for random_numbers_bergenia (include base dfs here)
# Each entry maps a display name to:
#   "study":      folder inside the exp/ directory
#   "exp":        experiment folder inside that study folder
#   "style_args": keyword arguments that the single plots pass on to matplotlib
# Every display name must be unique: a repeated key in a dict literal silently
# replaces the earlier value.
EXPERIMENTS = {
    # non-finetuned object levels
    "GPT3.5 object-level": {
        "study": "number_triplets_bergenia",
        "exp": "base_gpt-3.5-turbo-1106_base-completion-bergenia_prompt_number_triplets_dataset",  # experiment folder name
        "style_args": {
            "color": "red",
        },
    },
    "GPT4 object-level": {
        "study": "number_triplets_bergenia",
        "exp": "base_gpt-4-0613_base-completion-bergenia_prompt_number_triplets_dataset",  # experiment folder name
        "style_args": {
            "color": "blue",
        },
    },
    # non-finetuned meta levels
    "GPT3.5 meta-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "self_gpt-3.5-turbo-1106_number_triplets_dataset_0_shot_True_seed_self-prediction-bergenia-nontechnical_prompt_nonft35_reverse_note",  # experiment folder name
        "style_args": {
            "color": "coral",
        },
    },
    "GPT4 meta-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "self_gpt-4-0613_number_triplets_dataset_0_shot_True_seed_self-prediction-bergenia-nontechnical_prompt_nonft4_reverse_note",
        "style_args": {
            "color": "lightblue",
        },
    },
    # finetuned object levels
    "GPT3.5 finetuned on GPT3.5 object-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "base_ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_35on35onnum_8x4lehAb_base-completion-bergenia_prompt_number_triplets_dataset",
        "style_args": {
            "color": "indianred",
        },
    },
    "GPT4 finetuned on GPT4 object-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "base_ft_gpt-4-0613_dcevals-kokotajlo_4on4onnum_8x8dNwL1_base-completion-bergenia_prompt_number_triplets_dataset",
        "style_args": {
            "color": "cornflowerblue",
        },
    },
    "GPT3.5 finetuned on GPT4 object-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "base_ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_35on4onnum_8xMcmGZM_base-completion-bergenia_prompt_number_triplets_dataset",
        "style_args": {
            "color": "lightcoral",
        },
    },
    "GPT3.5 finetuned on GPT3.5 scrambled object-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "base_ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_35on35onnumscram_8x6QzXiQ_base-completion-bergenia_prompt_number_triplets_dataset",
        "style_args": {
            "color": "lightsalmon",
        },
    },
    # meta levels
    "GPT3.5 finetuned on GPT3.5 meta-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "self_ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_35on35onnum_8x4lehAb_number_triplets_dataset_0_shot_True_seed_self-prediction-bergenia-nontechnical_prompt_ft35_reverse_note",
        "style_args": {
            "color": "lightcoral",
        },
    },
    # NOTE(review): this name used to be defined a second time at the end of the
    # dict with the same "exp" but color "darkred", which silently overrode the
    # "purple" below and collided with the scrambled meta-level color. The
    # duplicate was removed and "purple" kept - confirm this is the intended color.
    "GPT3.5 finetuned on GPT4 meta-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "self_ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_35on4onnum_8xMcmGZM_number_triplets_dataset_0_shot_True_seed_self-prediction-bergenia-nontechnical_prompt_ft35on4_reverse_note",
        "style_args": {
            "color": "purple",
        },
    },
    "GPT3.5 finetuned on GPT3.5 scrambled meta-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "self_ft_gpt-3.5-turbo-1106_dcevals-kokotajlo_35on35onnumscram_8x6QzXiQ_number_triplets_dataset_0_shot_True_seed_self-prediction-bergenia-nontechnical_prompt_ft35scrambled_reverse_note",
        "style_args": {
            "color": "darkred",
        },
    },
    "GPT4 finetuned on GPT4 meta-level": {
        "study": "number_triplets_bergenia_ft_self_pred",
        "exp": "self_ft_gpt-4-0613_dcevals-kokotajlo_4on4onnum_8x8dNwL1_number_triplets_dataset_0_shot_True_seed_self-prediction-bergenia-nontechnical_prompt_ft4_reverse_note",
        "style_args": {
            "color": "darkblue",
        },
    },
}

In [None]:
# # if we want to, load in few shot data at scale
# Ns = [0, 1, 2, 3, 5, 10, 15, 20, 30, 50]

# gpt35_few_shots = {
#     f"GPT3.5 {n}-shot": {
#         "study": "number_triplets_bergenia",
#         "exp": f"self_gpt-3.5-turbo-1106_number_triplets_dataset_{n}_shot_True_seed_self-prediction-bergenia-technical_prompt__note",
#         "style_args": {
#             "color": "red",
#         },
#     } for n in Ns}
# EXPERIMENTS.update(gpt35_few_shots)

# # gpt_4_few_shots = {
# #     f"GPT4 {n}-shot": {
# #         "study": "number_triplets_bergenia",
# #         "exp": f"self_gpt-4-0613_number_triplets_dataset_{n}_shot_True_seed_self-prediction-bergenia-nontechnical_prompt__note",
# #         "style_args": {
# #             "color": "blue",
# #         },
# #     } for n in Ns}
# # EXPERIMENTS.update(gpt_4_few_shots)

# # gpt35_other_model_few_shots = {
# #     f"GPT3.5 {n}-shot with GPT4 examples": {
# #         "study": "number_triplets_bergenia",
# #         "exp": f"self_gpt-3.5-turbo-1106_number_triplets_dataset_{n}_shot_other_model_seed_self-prediction-bergenia-nontechnical_prompt__note",
# #         "style_args": {
# #             "color": "purple",
# #         },
# #     } for n in Ns}
# # EXPERIMENTS.update(gpt35_other_model_few_shots)

# gpt35_other_task_few_shots = {
#     f"GPT3.5 {n}-shot with random word examples": {
#         "study": "number_triplets_bergenia_seeded_random_words",
#         "exp": f"self_gpt-3.5-turbo-1106_number_triplets_dataset_{n}_shot_other_task_seed_self-prediction-bergenia-nontechnical_prompt__note",
#         "style_args": {
#             "color": "yellowgreen",
#         },
#     } for n in Ns}
# EXPERIMENTS.update(gpt35_other_task_few_shots)
        

In [2]:
# Which names from EXPERIMENTS should be merged into (object-level, meta-level) pairs.
# Each row below is: (pair name, object-level experiment, meta-level experiment, color).
# Every pair gets its own fresh "names" list and "style_args" dict.
PAIRS = {
    pair_name: {
        "names": [object_level_name, meta_level_name],
        "style_args": {
            "color": pair_color,
        },
    }
    for pair_name, object_level_name, meta_level_name, pair_color in [
        (
            "GPT3.5 non-finetuned\nagainst self on reversed input",
            "GPT3.5 object-level",
            "GPT3.5 meta-level",
            "red",
        ),
        (
            "GPT4 non-finetuned\nagainst self on reversed input",
            "GPT4 object-level",
            "GPT4 meta-level",
            "blue",
        ),
        (
            "GPT3.5 finetuned\nagainst self on reversed input",
            "GPT3.5 finetuned on GPT3.5 object-level",
            "GPT3.5 finetuned on GPT3.5 meta-level",
            "darkred",
        ),
        (
            "GPT4 finetuned\nagainst self on reversed input",
            "GPT4 finetuned on GPT4 object-level",
            "GPT4 finetuned on GPT4 meta-level",
            "darkblue",
        ),
        (
            "GPT3.5 finetuned\nagainst nonft GPT3.5 on reversed input",
            "GPT3.5 object-level",
            "GPT3.5 finetuned on GPT3.5 meta-level",
            "lightcoral",
        ),
        (
            "GPT4 finetuned\nagainst nonft GPT4 on reversed input",
            "GPT4 object-level",
            "GPT4 finetuned on GPT4 meta-level",
            "lightblue",
        ),
        (
            "GPT3.5 scrambled finetuned\nagainst self on reversed input",
            "GPT3.5 finetuned on GPT3.5 scrambled object-level",
            "GPT3.5 finetuned on GPT3.5 scrambled meta-level",
            "khaki",
        ),
        (
            "GPT3.5 scrambled finetuned\nagainst nonft GPT3.5 on reversed input",
            "GPT3.5 object-level",
            "GPT3.5 finetuned on GPT3.5 scrambled meta-level",
            "darkkhaki",
        ),
        (
            "GPT3.5 finetuned on GPT4\nagainst self on reversed input",
            "GPT3.5 finetuned on GPT4 object-level",
            "GPT3.5 finetuned on GPT4 meta-level",
            "violet",
        ),
        (
            "GPT3.5 finetuned on GPT4\nagainst nonft GPT4 on reversed input",
            "GPT4 object-level",
            "GPT3.5 finetuned on GPT4 meta-level",
            "darkviolet",
        ),
    ]
}

In [None]:
# # merge few-shot series
# for n in Ns:
#     PAIRS[f"GPT3.5 {n}-shot"] = {
#         "names": ["GPT3.5 object-level", f"GPT3.5 {n}-shot"],
#         # "type_name": "GPT3.5 on GPT3.5\ncompared against GPT3.5", # name without eg. the number of shots
#         "type_name": "GPT3.5 with number examples", # name without eg. the number of shots
#         "style_args": {
#             "color": "red",
#             "linestyle": "--",
#         }
#     }
#     # PAIRS[f"GPT4 {n}-shot"] = {
#     #     "names": ["GPT4 object-level", f"GPT4 {n}-shot"],
#     #     "type_name": "GPT 4",
#     #     "style_args": {
#     #         "color": "blue",
#     #         "linestyle": "--",
#     #     }
#     # }

#     # PAIRS[f"GPT3.5 {n}-shot with GPT4 examples\ncompared against GPT3.5"] = {
#     #     "names": ["GPT3.5 object-level", f"GPT3.5 {n}-shot with GPT4 examples"],
#     #     "type_name": "GPT3.5 on GPT4\ncompared against GPT3.5", 
#     #     "style_args": {
#     #         "color": "gold",
#     #         "linestyle": "-",
#     #     }
#     # }

#     # PAIRS[f"GPT3.5 {n}-shot with GPT4 examples\ncompared against GPT4"] = {
#     #     "names": ["GPT4 object-level", f"GPT3.5 {n}-shot with GPT4 examples"],
#     #     "type_name": "GPT3.5 on GPT4\ncompared against GPT4",
#     #     "style_args": {
#     #         "color": "purple",
#     #         "linestyle": "-",
#     #     }
#     # }
#     PAIRS[f"GPT3.5 {n}-shot with random word examples"] = {
#         "names": ["GPT3.5 object-level", f"GPT3.5 {n}-shot with random word examples"],
#         "type_name": "GPT3.5 with word examples",
#         "style_args": {
#             "color": "yellowgreen",
#             "linestyle": "-",
#         }
#     }

In [3]:
# process EXPERIMENTS
# Use the experiment's name as its plot label unless a label was given explicitly.
for exp_name, exp_info in EXPERIMENTS.items():
    exp_info["style_args"].setdefault("label", exp_name)

In [4]:
# process PAIRS
# Give every pair a plot label and a type name, both defaulting to the pair's own name.
# The label check has to look at the pair's style_args: checking the experiments'
# style_args instead never fires once those have been given labels, and would
# store an experiment name rather than the pair name.
for pair, pair_info in PAIRS.items():
    # fail early (and readably) if a pair points at an experiment that is not defined
    unknown_names = [name for name in pair_info["names"] if name not in EXPERIMENTS]
    if unknown_names:
        raise KeyError(f"Pair {pair!r} references unknown experiments: {unknown_names}")
    # add label to style_args if it's not already there
    pair_info["style_args"].setdefault("label", pair)
    # type_name is the key under which the few-shot plots group pairs into one series
    pair_info.setdefault("type_name", pair)


## Imports

In [5]:
from pathlib import Path
import subprocess
import sys
import random
import logging

In [6]:
# set log level: configure the root logger to emit WARNING and above only
logging.basicConfig(level=logging.WARNING)

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [9]:
from evals.analysis.analysis_helpers import merge_object_and_meta_dfs, create_df_from_configs, fill_df_with_function, get_pretty_name, filter_configs_by_conditions, pretty_print_config, fill_df_with_function_bootstrap, bootstrap_ci
from evals.analysis.loading_data import load_dfs_with_filter, load_base_df_from_config, get_hydra_config, load_single_df, get_data_path, load_single_df_from_exp_path
from evals.utils import get_maybe_nested_from_dict
from evals.analysis.analysis_functions import *

In [10]:
# Show up to 200 characters of each cell before pandas truncates the content
pd.set_option('display.max_colwidth', 200)
# show all columns
pd.set_option('display.max_columns', None)

In [11]:
# set color palette
# `palette` is kept as a variable because the pair plots index into it
# (palette[i % len(palette)]) as a fallback when a pair has no "color" in style_args
palette = sns.color_palette("Set1")
sns.set_palette(palette)

In [12]:
# Silence seaborn's FutureWarnings so that plot cells stay readable
import warnings
# Ignore only FutureWarning, and only for modules matching "seaborn"; all other warnings still surface
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

In [13]:
# set font for plots
# NOTE(review): "Univers Next Pro" is presumably only installed on the author's machine;
# on other machines matplotlib is expected to warn and fall back to its default font - confirm.
plt.rcParams["font.family"] = "Univers Next Pro"

# retina plots: render figures inline, at high resolution
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [16]:
from evals.locations import REPO_DIR
REPO_DIR

PosixPath('/Users/pbu5262/Documents/python_scripts/introspection_self_prediction')

In [17]:
# Set the directory for the data: experiments are read from <REPO_DIR>/exp/<study>/<exp>
EXPDIR = Path(REPO_DIR) / "exp" 

Load in the data

In [18]:
# Load the config and the results dataframe of every experiment from
# EXPDIR/<study>/<exp> and store them on the experiment's entry.
for exp_info in EXPERIMENTS.values():
    exp_path = EXPDIR / exp_info["study"] / exp_info["exp"]
    exp_info["config"] = get_hydra_config(exp_path)
    # non-compliant rows are kept (exclude_noncompliant=False)
    exp_info["df"] = load_single_df_from_exp_path(exp_path, exclude_noncompliant=False)
print(f"Loaded {len(EXPERIMENTS)} experiments")

ValueError: No logs found in /Users/pbu5262/Documents/python_scripts/introspection_self_prediction/exp/number_triplets_bergenia/base_gpt-3.5-turbo-1106_base-completion-bergenia_prompt_number_triplets_dataset

In [None]:
# merge the pairs
# For each pair, merge the dataframe of the first name (object level) with the dataframe
# of the second name (meta level), passing along the string modifier that the meta-level
# experiment's config declares for its dataset.
# The helper this notebook imports from evals.analysis.analysis_helpers is
# `merge_object_and_meta_dfs`; the name `merge_base_and_meta_dfs` is not imported explicitly.
# NOTE(review): assumes `merge_object_and_meta_dfs` takes (object_df, meta_df, string_modifier=...)
# exactly like the old `merge_base_and_meta_dfs` call did - confirm against analysis_helpers.
for pair, pair_info in PAIRS.items():
    object_name, meta_name = pair_info["names"][0], pair_info["names"][1]
    print(f"Merging {pair}: {pair_info['names']}")
    string_modifier = EXPERIMENTS[meta_name]["config"]["dataset"]["string_modifier"]
    pair_info["df"] = merge_object_and_meta_dfs(
        EXPERIMENTS[object_name]["df"],
        EXPERIMENTS[meta_name]["df"],
        string_modifier=string_modifier,
    )
    print("---------------------------------")
print(f"Merged {len(PAIRS)} pairs")

Dataset properties

In [None]:
# Number of distinct items a response can take. 1 / N_POSSIBLE_ITEMS is drawn as the
# "chance" line in the pair plots. Pick the value matching the dataset by (un)commenting
# one of the alternatives below.
# N_POSSIBLE_ITEMS = len(words.words()) # what is the number of possible items in the string? 🔵
N_POSSIBLE_ITEMS = 1000 # 🔵
# N_POSSIBLE_ITEMS = 2 # 🔵
print(f"Number of possible items in the string: {N_POSSIBLE_ITEMS},\nwhich gives us a probability of {1/N_POSSIBLE_ITEMS:.6%} for a random guess")

## Single Plots

In [None]:
def measure(df):
    """Return the fraction of rows whose `compliance` column equals True."""
    return (df["compliance"] == True).mean()


# One bar per experiment, with the bootstrapped confidence interval as error bar.
plt.figure(figsize=(10, 6))
for i, (exp, exp_info) in tqdm(enumerate(EXPERIMENTS.items()), total=len(EXPERIMENTS)):
    df = exp_info["df"]
    val = measure(df)
    ci = bootstrap_ci(df, measure)
    # Asymmetric error bar: distance from the point estimate to each CI bound.
    # matplotlib rejects negative error-bar lengths, which would occur if the point
    # estimate fell outside the bootstrap interval, so clamp at 0.
    yerr = np.maximum(np.array([[val - ci[0]], [ci[1] - val]]), 0)
    plt.bar(i, val, yerr=yerr, **exp_info["style_args"])
    plt.text(i, val / 2, exp, ha="center", va="center", color="grey", rotation=90)

plt.ylabel("Compliance")
plt.title("Compliance")
# yticks in percentage: set tick positions and labels together so the labels
# cannot drift away from the ticks they belong to
ticks = plt.yticks()[0]
plt.yticks(ticks, [f"{tick * 100:.0f}%" for tick in ticks])

# Move the legend to the side
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()


In [None]:
def measure(df):
    """Shannon entropy of the relative frequencies of the values in `response`."""
    response_shares = df['response'].value_counts(normalize=True)
    return stats.entropy(response_shares)


# One bar per experiment; the error bar spans the bootstrapped confidence interval.
plt.figure(figsize=(10, 6))
for position, (exp_name, exp_info) in tqdm(enumerate(EXPERIMENTS.items()), total=len(EXPERIMENTS)):
    responses_df = exp_info["df"]
    entropy = measure(responses_df)
    bounds = bootstrap_ci(responses_df, measure)
    # absolute distances from the point estimate to the lower / upper bound
    below = abs(entropy - bounds[0])
    above = abs(bounds[1] - entropy)
    error_bar = np.array([below, above]).reshape(2, 1)
    plt.bar(position, entropy, yerr=error_bar, **exp_info["style_args"])
    plt.text(position, entropy / 2, exp_name, ha="center", va="center", color="grey", rotation=90)

plt.ylabel("Shannon Entropy over responses")
plt.title("Shannon Entropy")

# Legend goes outside the axes, to the right
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()


## Pair Plots

Accuracy plot

In [None]:
# Bar plot of self-prediction accuracy per pair, with bootstrapped confidence
# intervals, a dotted per-pair baseline and a dotted chance line.
measure = calc_accuracy
baseline = baseline_accuracy_under_mode

plt.figure(figsize=(12, 6))  # Set the figsize to (12, 6) for a larger figure
for i, (name, info) in tqdm(enumerate(PAIRS.items()), total=len(PAIRS)):
    df = info["df"]
    c = info["style_args"].get("color", palette[i % len(palette)])
    shading = info["style_args"].get("shading", 0.33)
    val = measure(df)
    ci = bootstrap_ci(df, measure)
    # Asymmetric error bar from the point estimate to each CI bound (as in the
    # single plots); clamped at 0 because matplotlib rejects negative lengths.
    yerr = np.maximum(np.array([[val - ci[0]], [ci[1] - val]]), 0)
    plt.bar(i, val, yerr=yerr, label=name, color=c, alpha=shading)
    # baseline, computed against the dataframe of the pair's first (object-level) experiment
    baseline_val = baseline(df, EXPERIMENTS[info["names"][0]]["df"])
    plt.plot([i-.5, i+.5], [baseline_val, baseline_val], color=c, linestyle="dotted", alpha=shading)
    plt.text(i, val / 2, name, ha="center", va="center", color="grey", rotation=90)

# add chance line; the label sits at the right edge of the last bar
# (computed from len(PAIRS) so it does not depend on the leftover loop index)
plt.axhline(1/N_POSSIBLE_ITEMS, color="grey", linestyle="dotted", alpha=0.5)
plt.text(len(PAIRS) - 0.5, 1/N_POSSIBLE_ITEMS / 2, "chance", ha="center", va="bottom", color="grey")

plt.title("Self-prediction accuracy")
# no xticks
plt.xticks([])
plt.xlabel("Model")
# Scale y labels by 100 to get percent. ":g" formatting instead of int(): int()
# truncates, so e.g. 0.57 * 100 = 56.99999999999999 was labelled "56%" and 0.025 "2%".
ticks = plt.yticks()[0]
plt.yticks(ticks, [f"{tick * 100:g}%" for tick in ticks])

# labels
plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')

plt.ylabel("Accuracy")
plt.show()

In [None]:
# Bar plot of self-prediction accuracy per pair where non-compliant responses
# count as incorrect, with bootstrapped confidence intervals, a dotted per-pair
# baseline and a dotted chance line.
measure = calc_accuracy_with_excluded
baseline = baseline_accuracy_under_mode

plt.figure(figsize=(12, 6))  # Set the figsize to (12, 6) for a larger figure
for i, (name, info) in tqdm(enumerate(PAIRS.items()), total=len(PAIRS)):
    df = info["df"]
    c = info["style_args"].get("color", palette[i % len(palette)])
    shading = info["style_args"].get("shading", 0.33)
    val = measure(df)
    ci = bootstrap_ci(df, measure)
    # Asymmetric error bar from the point estimate to each CI bound (as in the
    # single plots); clamped at 0 because matplotlib rejects negative lengths.
    yerr = np.maximum(np.array([[val - ci[0]], [ci[1] - val]]), 0)
    plt.bar(i, val, yerr=yerr, label=name, color=c, alpha=shading)
    # baseline, computed against the dataframe of the pair's first (object-level) experiment
    baseline_val = baseline(df, EXPERIMENTS[info["names"][0]]["df"])
    plt.plot([i-.5, i+.5], [baseline_val, baseline_val], color=c, linestyle="dotted", alpha=shading)
    # the name is drawn once per bar (it used to be drawn twice on top of itself)
    plt.text(i, val / 2, name, ha="center", va="center", color="grey", rotation=90)

# add chance line; the label sits at the right edge of the last bar
# (computed from len(PAIRS) so it does not depend on the leftover loop index)
plt.axhline(1/N_POSSIBLE_ITEMS, color="grey", linestyle="dotted", alpha=0.5)
plt.text(len(PAIRS) - 0.5, 1/N_POSSIBLE_ITEMS / 2, "chance", ha="center", va="bottom", color="grey")

plt.title("Self-prediction accuracy")
# no xticks
plt.xticks([])
plt.xlabel("Model")
# Scale y labels by 100 to get percent. ":g" formatting instead of int(): int()
# truncates, so e.g. 0.57 * 100 = 56.99999999999999 was labelled "56%" and 0.025 "2%".
ticks = plt.yticks()[0]
plt.yticks(ticks, [f"{tick * 100:g}%" for tick in ticks])

# Move the legend to the side
plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')

plt.ylabel("Accuracy counting non-compliant responses as incorrect")
plt.show()

In [None]:
# animate!
# Draws one figure per "frame": frame k shows the first k bars, the remaining bars
# are drawn fully transparent so that the axes stay identical across frames.

# Set the measure and baseline explicitly so the cell does not depend on whichever
# plotting cell happened to run last (the y label below is the with-excluded accuracy).
measure = calc_accuracy_with_excluded
baseline = baseline_accuracy_under_mode

# Set the figure size once, used for all plots
fig_size = (12, 6)

# Determine the total number of items to plot
total_items = len(PAIRS)

# Compute every bar's statistics once, up front. Bootstrapping inside the frame loop
# repeats the same work for every frame and lets the error bars jitter between frames.
bars = []
for i, (name, info) in enumerate(PAIRS.items()):
    df = info["df"]
    val = measure(df)
    ci = bootstrap_ci(df, measure)
    bars.append({
        "name": name,
        "color": info["style_args"].get("color", palette[i % len(palette)]),
        "shading": info["style_args"].get("shading", 0.33),
        "val": val,
        # asymmetric error bar, clamped at 0 because matplotlib rejects negative lengths
        "yerr": np.maximum(np.array([[val - ci[0]], [ci[1] - val]]), 0),
        "baseline": baseline(df, EXPERIMENTS[info["names"][0]]["df"]),
    })

# Loop over the number of bars to display, increasing by one each time
for num_bars in range(total_items + 1):  # Start from 0 to total_items
    plt.figure(figsize=fig_size)
    for i, bar in enumerate(bars):
        revealed = i < num_bars

        # Set alpha to 0 (fully transparent) for bars not yet "revealed"
        alpha = bar["shading"] if revealed else 0
        # hide error bars of bars not yet "revealed"
        yerr = bar["yerr"] if revealed else np.zeros((2, 1))

        plt.bar(i, bar["val"], yerr=yerr, label=bar["name"] if revealed else '', color=bar["color"], alpha=alpha)
        plt.plot([i-.5, i+.5], [bar["baseline"], bar["baseline"]], color=bar["color"], linestyle="dotted", alpha=alpha)

        # Set text color to fully transparent for names not yet "revealed"
        text_color = "grey" if revealed else (1,1,1,0)  # Last element is alpha
        plt.text(i, bar["val"], bar["name"], ha="center", va="center", color=text_color, rotation=90)

    # Add chance line and text once per figure
    plt.axhline(1/N_POSSIBLE_ITEMS, color="grey", linestyle="dotted", alpha=0.5)
    plt.text(total_items, 1/N_POSSIBLE_ITEMS, "chance", ha="right", va="bottom", color="grey")

    plt.title("Self-prediction accuracy")
    plt.xticks([])
    plt.xlabel("Model")
    # percent labels; ":g" instead of int() because int() truncates (0.57 -> "56%")
    ticks = plt.yticks()[0]
    plt.yticks(ticks, [f"{tick * 100:g}%" for tick in ticks])
    plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left', framealpha=0)
    plt.ylabel("Accuracy counting non-compliant responses as incorrect")

    # Save each figure with a unique name, including the fully transparent "initial" one
    # plt.savefig(f"/mnt/data/plot_{num_bars}.png", bbox_inches='tight')
    plt.show()
    plt.close()

## Few-shot over n plots

In [None]:
# Accuracy as a function of the number of few-shot examples. Pairs sharing a
# "type_name" form one series: its points are connected by a solid line and its
# baseline values by a dotted line of the same color.
measure = calc_accuracy
baseline = baseline_accuracy_under_mode

plt.figure(figsize=(12, 6))

# Data accumulation structure
# key: type_name, value: dict with the series' points ('x', 'y', 'baseline') and its style
type_data = {}

for i, (name, info) in tqdm(iterable=enumerate(PAIRS.items()), total=len(PAIRS)):
    df = info["df"]
    c = info["style_args"].get("color", palette[i % len(palette)])
    shading = info["style_args"].get("shading", 0.33)
    marker = info["style_args"].get("marker", "o")
    # the x position is the n_shot value from the config of the pair's second (meta-level) experiment
    few_shot_n = EXPERIMENTS[info["names"][1]]["config"]["dataset"]["n_shot"]
    val = measure(df)
    ci = bootstrap_ci(df, measure)
    baseline_val = baseline(df, EXPERIMENTS[info["names"][0]]["df"])

    # Store data for connecting dots; the style of a series is taken from its first pair
    series = type_data.setdefault(
        info["type_name"],
        {'x': [], 'y': [], 'baseline': [], 'color': c, 'marker': marker, 'shading': shading},
    )
    series['x'].append(few_shot_n)
    series['y'].append(val)
    series['baseline'].append(baseline_val)

    # Plot individual point; error-bar lengths are clamped at 0 because matplotlib
    # rejects negative values
    yerr = np.maximum(np.array([[val - ci[0]], [ci[1] - val]]), 0)
    plt.errorbar(few_shot_n, val, yerr=yerr, fmt=marker, color=c, alpha=shading)

# Plot lines to connect dots and collect one legend entry per series
handles = []
labels = []
for type_name, data in type_data.items():
    sorted_indices = np.argsort(data['x'])
    sorted_x = np.array(data['x'])[sorted_indices]
    plt.plot(sorted_x, np.array(data['y'])[sorted_indices], color=data['color'], alpha=data['shading'])
    # baseline of the same series
    plt.plot(sorted_x, np.array(data['baseline'])[sorted_indices], color=data['color'], alpha=0.33, linestyle='dotted', marker='')

    # Add to legend
    handles.append(plt.Line2D([0], [0], marker=data['marker'], color=data['color'], alpha=data['shading'], linestyle='-'))
    labels.append(type_name)

plt.title("Self-prediction accuracy")
plt.xlabel("Number of few-shot examples")
# percent labels; ":g" instead of int() because int() truncates (0.57 -> "56%", 0.025 -> "2%")
ticks = plt.yticks()[0]
plt.yticks(ticks, [f"{tick * 100:g}%" for tick in ticks])
plt.legend(handles, labels)
plt.ylabel("Accuracy")
plt.ylim(0)
plt.show()

In [None]:
# Accuracy (counting non-compliant responses as incorrect) as a function of the
# number of few-shot examples. Pairs sharing a "type_name" form one series: its
# points are connected by a solid line and its baseline values by a dotted line
# of the same color.
measure = calc_accuracy_with_excluded
baseline = baseline_accuracy_under_mode

plt.figure(figsize=(12, 6))

# Data accumulation structure
# key: type_name, value: dict with the series' points ('x', 'y', 'baseline') and its style
type_data = {}

for i, (name, info) in tqdm(iterable=enumerate(PAIRS.items()), total=len(PAIRS)):
    df = info["df"]
    c = info["style_args"].get("color", palette[i % len(palette)])
    shading = info["style_args"].get("shading", 0.33)
    marker = info["style_args"].get("marker", "o")
    # the x position is the n_shot value from the config of the pair's second (meta-level) experiment
    few_shot_n = EXPERIMENTS[info["names"][1]]["config"]["dataset"]["n_shot"]
    val = measure(df)
    ci = bootstrap_ci(df, measure)
    baseline_val = baseline(df, EXPERIMENTS[info["names"][0]]["df"])

    # Store data for connecting dots; the style of a series is taken from its first pair
    series = type_data.setdefault(
        info["type_name"],
        {'x': [], 'y': [], 'baseline': [], 'color': c, 'marker': marker, 'shading': shading},
    )
    series['x'].append(few_shot_n)
    series['y'].append(val)
    series['baseline'].append(baseline_val)

    # Plot individual point; error-bar lengths are clamped at 0 because matplotlib
    # rejects negative values
    yerr = np.maximum(np.array([[val - ci[0]], [ci[1] - val]]), 0)
    plt.errorbar(few_shot_n, val, yerr=yerr, fmt=marker, color=c, alpha=shading)

# Plot lines to connect dots and collect one legend entry per series
handles = []
labels = []
for type_name, data in type_data.items():
    sorted_indices = np.argsort(data['x'])
    sorted_x = np.array(data['x'])[sorted_indices]
    plt.plot(sorted_x, np.array(data['y'])[sorted_indices], color=data['color'], alpha=data['shading'])
    # baseline of the same series
    plt.plot(sorted_x, np.array(data['baseline'])[sorted_indices], color=data['color'], alpha=0.33, linestyle='dotted', marker='')

    # Add to legend
    handles.append(plt.Line2D([0], [0], marker=data['marker'], color=data['color'], alpha=data['shading'], linestyle='-'))
    labels.append(type_name)

plt.title("Self-prediction accuracy")
plt.xlabel("Number of few-shot examples")
# percent labels; ":g" instead of int() because int() truncates (0.57 -> "56%", 0.025 -> "2%")
ticks = plt.yticks()[0]
plt.yticks(ticks, [f"{tick * 100:g}%" for tick in ticks])
plt.legend(handles, labels)
plt.ylabel("Accuracy counting non-compliant responses as incorrect")
plt.ylim(0)
plt.show()