In [1]:
%cd ~/src/second-best-bench/
from src.plots.common import models, model_order
import numpy as np
import pandas as pd
import altair as alt

/home/yimingz3/src/second-best-bench


In [2]:
# Each entry pairs the top-level directory holding a run's score files with the
# sampling-method label attached to the rows loaded from it.
score_sources = [
    ("eval", "Resampling"),
    ("eval-ic", "In-context regeneration"),
]

score_dfs = []

for subset in ["curated"]:
    for eval_dir, sampling_method in score_sources:
        for model_family, model_path, model_alias in models:
            df = pd.read_json(f"{eval_dir}/{subset}/{model_path}/scores.jsonl", lines=True)
            # Tag every row with its provenance so the per-file frames can be
            # concatenated and grouped later.
            df["subset"] = subset
            df["model_family"] = model_family
            df["model_alias"] = model_alias
            df["sampling_method"] = sampling_method

            score_dfs.append(df)

# ignore_index=True gives the combined frame one unique row index instead of
# repeating each file's own row labels.
model_scores = pd.concat(score_dfs, ignore_index=True)
# "distinct" is the number of entries in each row's partition_scores value.
model_scores["distinct"] = model_scores["partition_scores"].map(len)


In [6]:
def compute_summary(df):
    """Average the per-row scores for each model and sampling method.

    Rows of ``df`` are grouped by ``model_family``, ``model_alias`` and
    ``sampling_method``; within each group the ``distinct`` and ``utility``
    columns are averaged.

    Returns a new DataFrame with one row per group and the columns
    ``model_family``, ``model_alias``, ``distinct``, ``mean_utility`` and
    ``sampling_method``, where ``distinct`` is the group's mean ``distinct``
    plus one.
    """
    group_keys = ['model_family', 'model_alias', 'sampling_method']
    output_columns = ['model_family', 'model_alias', 'distinct', 'mean_utility', 'sampling_method']

    per_group = (
        df.groupby(group_keys)
        .agg(
            mean_distinct=('distinct', 'mean'),
            mean_utility=('utility', 'mean'),
        )
        .reset_index()
    )

    # NOTE(review): the +1 offset is applied to the averaged count; the reason
    # is not visible in this notebook -- confirm against how partition_scores
    # is produced.
    per_group['distinct'] = per_group['mean_distinct'] + 1

    return per_group[output_columns]

# Summarise the scores per model and sampling method, exposing the averaged
# utility under the shorter column name "utility".
eval_data = compute_summary(model_scores).rename(columns={'mean_utility': 'utility'})

# Keep only the rows belonging to the OpenAI and Gemini model families.
is_selected_family = eval_data["model_family"].isin(["OpenAI", "Gemini"])
eval_data = eval_data[is_selected_family]

In [12]:
# Grouped bar chart of utility: one cluster of bars per model alias, with one
# bar per sampling method inside each cluster.
sampling_method_order = ["Resampling", "In-context regeneration"]

utility_encodings = dict(
    x=alt.X('model_alias:N', title='', sort=model_order),
    y=alt.Y('utility:Q', title='Utility'),
    color=alt.Color('sampling_method:N', title='Sampling method', sort=sampling_method_order),
    # The x-offset channel places the sampling-method bars side by side within each model.
    xOffset=alt.XOffset('sampling_method:N', sort=sampling_method_order),
)

utility_grouped = (
    alt.Chart(eval_data)
    .mark_bar()
    .encode(**utility_encodings)
    .properties(width=200, height=200)
)

# Write the chart to disk, then leave it as the cell's last expression so it renders inline.
utility_grouped.save("plots/alternative-prompting-utility.json")
utility_grouped

In [13]:
# Grouped bar chart of the "distinct" column: one cluster of bars per model
# alias, with one bar per sampling method inside each cluster.
sampling_method_order = ["Resampling", "In-context regeneration"]

distinct_encodings = dict(
    x=alt.X('model_alias:N', title='', sort=model_order),
    y=alt.Y('distinct:Q', title='Distinct generations (out of 10)'),
    color=alt.Color('sampling_method:N', title='Sampling method', sort=sampling_method_order),
    # The x-offset channel places the sampling-method bars side by side within each model.
    xOffset=alt.XOffset('sampling_method:N', sort=sampling_method_order),
)

distinct_grouped = (
    alt.Chart(eval_data)
    .mark_bar()
    .encode(**distinct_encodings)
    .properties(width=200, height=200)
)

# Write the chart to disk, then leave it as the cell's last expression so it renders inline.
distinct_grouped.save("plots/alternative-prompting-distinct.json")
distinct_grouped