# Comparing base completions

In [None]:
# Experiment folders to load; each name is resolved inside of exp/
EXP_FOLDERS = ["how_different_are_GPT35_versions"]

In [None]:
from pathlib import Path
import subprocess
import sys
import random
import logging

In [None]:
# Configure the root logger so that only records at WARNING level or above are emitted
logging.basicConfig(level=logging.WARNING)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import words
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from evals.analysis.analysis_helpers import merge_object_and_meta_dfs, create_df_from_configs, fill_df_with_function, get_pretty_name, filter_configs_by_conditions, pretty_print_config, get_pretty_name_w_labels
from evals.analysis.loading_data import load_dfs_with_filter, load_base_df_from_config, get_hydra_config, load_single_df, get_data_path
from evals.utils import get_maybe_nested_from_dict

In [None]:
# Show up to 200 characters per cell before pandas truncates the content
pd.set_option('display.max_colwidth', 200)
# show all columns (None removes the limit on displayed columns)
pd.set_option('display.max_columns', None)

In [None]:
# Use seaborn's "Set1" palette as the default color palette for the plots
palette = sns.color_palette("Set1")
sns.set_palette(palette)

In [None]:
# Keep seaborn's FutureWarning messages out of the cell output
import warnings
# Ignore FutureWarning only for modules whose name starts with "seaborn"
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

In [None]:
from evals.locations import EXP_DIR

In [None]:
# Root directory of the experiment data; the folders in EXP_FOLDERS are joined onto it
EXPDIR = EXP_DIR

In [None]:
# Gather the dataframes of every experiment folder into one dict keyed by config.
# No filter conditions are applied and non-compliant rows are kept.
dfs = {}
for folder_name in EXP_FOLDERS:
    folder_dfs = load_dfs_with_filter(
        EXPDIR / folder_name,
        conditions={},
        exclude_noncompliant=False,
    )
    print(f"Loaded {len(folder_dfs)} dataframes from {folder_name}")
    dfs.update(folder_dfs)
print(f"Loaded {len(dfs)} dataframes in total")

## Analyses
Create results dataframe

In [None]:
# Build one results row per config, ordered by model name
results = create_df_from_configs(dfs.keys()).sort_values(by=["language_model_model"])

In [None]:
# add a grouping column

# Config entries that make up the grouping label of each result.
# Presumably a list entry is a path into a nested config section — confirm
# against get_pretty_name_w_labels. Commented entries are currently excluded.
config_vals_of_interest = [
    ["language_model", "model"],
    "note",
    # ["prompt", "method"],
    # "base_dir",
    # "exp_dir",
    # "limit",
    # "dataset",
    # ["dataset", "topic"],
    # ["dataset", "n_shot"],
    # ["dataset", "n_shot_seeding"],
    ["dataset", "string_modifier"],
    ["dataset", "response_property"],
    "prediction_target",
]

# One label per row, computed in row order from that row's config
results["grouping"] = [
    get_pretty_name_w_labels(row_config, config_vals_of_interest)
    for row_config in results["config"]
]

n_groupings = results.grouping.nunique()
print(f"Got {n_groupings} unique groupings for {len(results)} results")

Ideally, we would like to know how likely the model is to give the correct answer. However, the Chat API does not allow us to get the likelihood of a given response, so we use the likelihood of the first token as a proxy. If the correct response is not in the list of top logprobs, we assume the remaining probability mass is spread evenly over all other tokens, one of which is our token.

How many strings are correctly produced by the model?

Let's run the analyses

In [None]:
def exclude_noncompliant(df):
    """Return the rows of ``df`` where both compliance columns equal True.

    The input frame is left untouched; the selection is made on a copy.
    Requires the columns ``compliance_self`` and ``compliance_base``.
    """
    snapshot = df.copy()
    self_ok = snapshot['compliance_self'].eq(True)
    base_ok = snapshot['compliance_base'].eq(True)
    return snapshot.loc[self_ok & base_ok]

In [None]:
# Show the results dataframe as the output of this cell
results

In [None]:
# how much variance is there in the responses? Calculate Shannon entropy over responses
def calc_entropy(df, col):
    """Return the Shannon entropy of the value distribution in ``df[col]``.

    The relative frequency of each distinct value is handed to
    ``scipy.stats.entropy`` with its default settings.
    """
    relative_freqs = df[col].value_counts(normalize=True)
    return stats.entropy(relative_freqs)

In [None]:
# let's also check if the model is following some cheap strategy:
# store the mean of each repetition flag column under the same name
for repeat_flag in ("last_word_repeated", "last_char_repeated", "any_word_repeated"):
    # bind the column name as a default argument so each lambda keeps its own column
    fill_df_with_function(dfs, lambda df, flag=repeat_flag: df[flag].mean(), repeat_flag, results)


def _response_entropy(df):
    # Shannon entropy over the distinct responses
    return calc_entropy(df, "response")


def _case_insensitive_match_rate(df):
    # share of rows whose response equals the target, ignoring case
    responses = df['response'].str.lower()
    targets = df['target'].str.lower()
    return np.mean(responses == targets)


def _compliance_rate(df):
    # share of rows whose compliance value equals True
    return np.mean(df['compliance'] == True)


fill_df_with_function(dfs, _response_entropy, "entropy", results)
fill_df_with_function(dfs, _case_insensitive_match_rate, "correct", results)
fill_df_with_function(dfs, _compliance_rate, "compliance", results)

In [None]:
# Result columns that get one bar plot each
COLS_OF_INTEREST = [
    "correct",
    "compliance",
    "entropy",
    "last_word_repeated",
    "last_char_repeated",
    "any_word_repeated",
]

In [None]:
# One bar plot per metric: models on the x-axis, one bar color per grouping label
for col in COLS_OF_INTEREST:
    metric_label = col.capitalize()
    sns.barplot(data=results, x="language_model_model", y=col, hue="grouping")
    plt.title(metric_label)
    # The x-axis shows the model name, so label it as such (it used to say "Few-shot n")
    plt.xlabel("Language model")
    plt.ylabel(metric_label)
    # Put the legend outside the axes so it does not cover the bars
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

## Pairwise comparisons between dataframes

In [None]:
from itertools import combinations

# Merge every unordered pair of dataframes exactly once. combinations() yields the
# pairs in the same order as a nested loop restricted to i < j, so the key order of
# merged_pair_dfs matches the insertion order of dfs.
merged_pair_dfs = {
    (first_key, second_key): merge_object_and_meta_dfs(first_df, second_df)
    for (first_key, first_df), (second_key, second_df) in combinations(dfs.items(), 2)
}

print(f"Merged {len(merged_pair_dfs)} dataframes")

In [None]:
# how often do the models give the same response?
separator = "-" * 80
for config_pair, pair_df in merged_pair_dfs.items():
    first_config, second_config = config_pair
    # share of merged rows where both response columns hold the same value
    same_share = np.mean(pair_df['response_object'] == pair_df['response_meta'])
    pretty_print_config(first_config)
    print("vs")
    pretty_print_config(second_config)
    print(f"Same response: {same_share * 100:.2f}%")
    print(separator)

### Detailed analysis

Specify filter conditions below, and the cells that follow will pull the matching dataframes.

In [None]:
# Conditions handed to filter_configs_by_conditions to select configs.
# All entries are commented out, so the dict is currently empty; uncomment a line
# to enable that condition.
# NOTE: several entries share the key ("language_model","model"). In a dict literal
# only the last value of a repeated key is kept, so enable at most one line per key.
# NOTE(review): ("prediction_target") is a plain string, not a 1-tuple — confirm that
# filter_configs_by_conditions accepts string keys.
filter_conditions = { 
    # ("language_model","model"): ["gpt-3.5-turbo-0125"],
    # ("language_model","model"): ["claude-2.1"],
    # ("language_model","model"): ["davinci-002"],
    # ("language_model","model"): ["gpt-4-1106-preview"],
    # ("dataset","n_shot"): [9], 
    # ("prediction_target"): ["self"],
    # ("dataset","n_shot_seeding"): [True]
}

In [None]:
# Apply filter_conditions to the configs (the keys of dfs) and report how many remain
filtered_configs = filter_configs_by_conditions(dfs.keys(), filter_conditions)
print(f"Got {len(filtered_configs)}, down from {len(dfs)}")

In [None]:
# Print every config that passed the filter
for selected_config in filtered_configs:
    pretty_print_config(selected_config)

In [None]:
# Keep only the dataframes whose config passed the filter
filtered_merged_dfs = {
    config: dfs[config]
    for config in dfs
    if config in filtered_configs
}
print(f"Got {len(filtered_merged_dfs)}, down from {len(dfs)}")

In [None]:
# Show a few random rows of every selected dataframe
for config, detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # DataFrame.sample raises ValueError when asked for more rows than the frame
    # holds, so cap the sample size at the frame length.
    display(detail_df.sample(min(5, len(detail_df))))
    # display(detail_df[["string", "response_base", "response_self", "raw_response_self",  'few-shot_string', 'few-shot_response']].sample(5))

In [None]:
# For every selected dataframe: list the most common responses, then plot the
# distribution of the responses that are whole numbers.
for config, detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # what are the most common responses (as a percentage of all rows)?
    display(detail_df[["response"]].value_counts(normalize=True).head(10) * 100)

    # Filter out non-numeric values. The numeric values live in their own Series so
    # the dataframe shared through dfs does not gain an extra column. Non-string
    # values (e.g. NaN) are skipped instead of raising, and isdecimal() is used
    # because isnumeric() also accepts characters such as "²" that int() rejects.
    responses = detail_df['response']
    is_whole_number = responses.apply(lambda x: isinstance(x, str) and x.isdecimal())
    response_numeric = responses[is_whole_number].apply(int)
    if response_numeric.empty:
        # nothing to plot for this config
        print("No numeric responses to plot")
        continue

    response_numeric.hist(bins=100)
    plt.title("Distribution of base predictions")
    plt.xlabel("Prediction")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Show a few random rows of every selected dataframe
for config, detail_df in filtered_merged_dfs.items():
    pretty_print_config(config=config)
    # DataFrame.sample raises ValueError when asked for more rows than the frame
    # holds, so cap the sample size at the frame length.
    display(detail_df.sample(min(5, len(detail_df))))