In [1]:
%cd ~/src/second-best-bench/
from src.plots.common import models, model_order
import numpy as np
import pandas as pd
import altair as alt

/home/yimingz3/src/second-best-bench


In [2]:
def process_row(row, generation_count=8, discount=0.8):
    """Truncate a score row to its leading generations and derive summary fields.

    Args:
        row: Mapping-like record (a dict or a DataFrame row) holding
            list-valued "partition", "generation_scores" and
            "partition_scores" entries.
        generation_count: Maximum number of leading generations to keep.
        discount: Base of the geometric weights; the k-th kept generation
            (0-based) is weighted by ``discount ** k`` in the utility average.

    Returns:
        The same ``row`` object, modified in place: "partition" and
        "generation_scores" are cut to ``generation_count`` entries,
        "partition_scores" is cut to the first "distinct" entries, and
        "distinct" and "utility" are set.

    Raises:
        ValueError: if the truncated "partition" is an empty list.
    """
    partition = row["partition"][:generation_count]
    scores = row["generation_scores"][:generation_count]

    # Largest label + 1.
    # NOTE(review): this equals the number of distinct classes only if labels
    # are 0-based and contiguous -- confirm against the producer of "partition".
    distinct = max(partition) + 1

    # Size the weights from the scores that were actually kept, so rows with
    # fewer than `generation_count` generations still average correctly
    # instead of failing on a weights/values length mismatch.
    weights = discount ** np.arange(len(scores))

    row["partition"] = partition
    row["distinct"] = distinct
    row["generation_scores"] = scores
    row["utility"] = np.average(scores, weights=weights)
    row["partition_scores"] = row["partition_scores"][:distinct]
    return row


# Sampling-method label -> top-level directory holding that method's scores.
# Insertion order is the order in which frames are appended for each model.
SAMPLING_METHOD_DIRS = {
    "Resampling": "eval",
    "In-context regeneration": "eval-ic",
    "Paraphrasing": "eval-paraphrase",
    "System prompt": "eval-system-prompt",
}

# Only these entries of `models` are loaded.
SELECTED_MODEL_PATHS = [
    "anthropic/claude-3-opus@20240229",
    "openai/gpt-4o-2024-11-20",
    "gemini/gemini-2.0-pro-exp-02-05",
]

SUBSETS = ["curated"]


def load_scores(path, subset, model_family, model_alias, sampling_method):
    """Read one scores.jsonl file and tag every row with its provenance.

    Args:
        path: Path to a JSON-lines scores file.
        subset, model_family, model_alias, sampling_method: Constant values
            written into the columns of the same name.

    Returns:
        A DataFrame with the file's rows plus the four tag columns.
    """
    scores = pd.read_json(path, lines=True)
    scores["subset"] = subset
    scores["model_family"] = model_family
    scores["model_alias"] = model_alias
    scores["sampling_method"] = sampling_method
    return scores


score_dfs = []

for subset in SUBSETS:
    for model_family, model_path, model_alias in models:
        if model_path not in SELECTED_MODEL_PATHS:
            continue
        for sampling_method, eval_dir in SAMPLING_METHOD_DIRS.items():
            score_dfs.append(
                load_scores(
                    f"{eval_dir}/{subset}/{model_path}/scores.jsonl",
                    subset,
                    model_family,
                    model_alias,
                    sampling_method,
                )
            )

# The human baseline is a single file. It is tagged with the last subset
# explicitly rather than through whatever value the loop variable was left
# holding after the loop above finished.
score_dfs.append(
    load_scores("eval-human/scores.jsonl", SUBSETS[-1], "Human", "Human", "Human")
)

# Row indices are kept as loaded (each source file restarts at 0).
model_scores = pd.concat(score_dfs)
model_scores = model_scores.apply(process_row, axis=1)

In [3]:
# Show the human baseline rows as a sanity check on the processed columns.
human_mask = model_scores["model_family"] == "Human"
model_scores[human_mask]

Unnamed: 0,id,prompt,model,generations,partition,distinct,generation_scores,partition_scores,utility,subset,model_family,model_alias,sampling_method
0,curated-17,What is the top item you would add to a grocer...,human,"[Durian, Onions, Ice Cream, Hot sauce, Fresh f...","[0, 1, 2, 3, 4, 0, 5, 6]",7,"[1, 1, 2, 1, 1, 0, 1, 1]","[1, 1, 2, 1, 1, 1, 1]",1.075056,curated,Human,Human,Human
1,curated-45,What is the main reason people don't get hired...,human,"[Their resume got filtered out by AI, They don...","[0, 1, 0, 2, 3, 4, 0, 0]",5,"[1, 1, 0, 6, 1, 1, 0, 0]","[1, 1, 6, 1, 1]",1.348015,curated,Human,Human,Human
2,curated-77,Name a star other than the Sun.,human,"[Antares, Neptune, Orion , Rho Cassiopeiae, Ca...","[0, 1, 2, 3, 4, 5, 6, 6]",7,"[10, 1, 1, 9, 1, 9, 10, 0]","[10, 1, 1, 9, 1, 9, 10]",5.293781,curated,Human,Human,Human
3,curated-3,Write a haiku about a whale and a walnut tree.,human,"[Vast life overflows -\nWhale's mighty form, t...","[0, 1, 2, 3, 4, 5, 6, 7]",8,"[9, 2, 7, 10, 9, 8, 2, 6]","[9, 2, 7, 10, 9, 8, 2, 6]",6.798720,curated,Human,Human,Human
4,curated-46,What's a saying that English teachers hate?,human,"[Grammar doesn't matter, I don't care, I didn'...","[0, 1, 2, 3, 4, 1, 1, 1]",5,"[2, 1, 1, 1, 8, 0, 0, 0]","[2, 1, 1, 1, 8]",1.737217,curated,Human,Human,Human
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,curated-75,Name one person who was involved in the Americ...,human,"[Alexander Hamilton, George Washington, George...","[0, 1, 1, 2, 3, 4, 1, 5]",6,"[10, 10, 0, 6, 7, 9, 0, 9]","[10, 10, 6, 7, 9, 9]",6.915355,curated,Human,Human,Human
96,curated-15,What would you invent to make the world a more...,human,"[Flying vehicles , flying cars, Teleportation ...","[0, 0, 1, 2, 3, 4, 5, 6]",7,"[2, 0, 2, 1, 1, 2, 1, 1]","[2, 2, 1, 1, 2, 1, 1]",1.280615,curated,Human,Human,Human
97,curated-44,What is the first thing to check when your car...,human,"[Battery, the Engine, Fuel Levels , If the bat...","[0, 1, 2, 3, 4, 0, 5, 0]",6,"[10, 1, 2, 7, 7, 0, 1, 0]","[10, 1, 2, 7, 7, 1]",4.516394,curated,Human,Human,Human
98,curated-16,One thing you might find in a magical forest.,human,"[Flowers that can talk, Unicorn, Talking Tree,...","[0, 1, 2, 3, 4, 5, 6, 7]",8,"[8, 9, 9, 2, 2, 10, 1, 1]","[8, 9, 9, 2, 2, 10, 1, 1]",6.380911,curated,Human,Human,Human


In [4]:
def compute_summary(df):
    """Average distinct count and utility per (family, alias, sampling method).

    Args:
        df: DataFrame with "model_family", "model_alias", "sampling_method",
            "distinct" and "utility" columns.

    Returns:
        DataFrame with one row per group and the columns "model_family",
        "model_alias", "distinct" (group mean), "mean_utility" (group mean)
        and "sampling_method", in that order.
    """
    group_keys = ['model_family', 'model_alias', 'sampling_method']

    # The per-row "distinct" value is averaged as-is. An earlier version added
    # 1 to this mean, but process_row already stores max(partition) + 1, so
    # that reported one more than the data supports (e.g. 9 for 8 generations).
    # NOTE(review): confirm no other producer feeds a 0-based "distinct" here.
    summary = df.groupby(group_keys).agg(
        distinct=('distinct', 'mean'),
        mean_utility=('utility', 'mean')
    ).reset_index()

    return summary[['model_family', 'model_alias', 'distinct', 'mean_utility', 'sampling_method']]

# Per-group means, with the utility column renamed to the name the charts use.
summary_all = compute_summary(model_scores).rename(columns={'mean_utility': 'utility'})

# Keep only the families that are plotted.
plotted_families = ["OpenAI", "Anthropic", "Gemini", "Human"]
eval_data = summary_all[summary_all["model_family"].isin(plotted_families)]

In [11]:
# Utility per model and sampling method, with the human utility drawn as a
# dashed horizontal reference line.

# One ordering shared by the colour scale and the within-group bar offset.
sampling_method_order = ["Resampling", "In-context regeneration", "Paraphrasing", "System prompt"]

is_human = eval_data["model_family"] == "Human"

# Grouped bars: one group per model alias, one bar per sampling method.
utility_grouped = (
    alt.Chart(eval_data[~is_human])
    .mark_bar()
    .encode(
        x=alt.X('model_alias:N', title='', sort=model_order, axis=alt.Axis(labelAngle=0)),
        y=alt.Y('utility:Q', title='Utility'),
        color=alt.Color('sampling_method:N', title='Sampling method', sort=sampling_method_order),
        xOffset=alt.XOffset('sampling_method:N', sort=sampling_method_order),
    )
    .properties(width=260, height=200)
)

# Utility of the first human row in the summary.
human_utility = eval_data.loc[is_human, "utility"].values[0]

# Dashed black line at the human utility.
human_rule = (
    alt.Chart(pd.DataFrame({'y': [human_utility]}))
    .mark_rule(strokeDash=[6, 4], size=2, color='black')
    .encode(y='y:Q')
)

# "Human" text label anchored at the same y value.
# Note: it is defined here but not layered into final_chart below.
human_text = (
    alt.Chart(pd.DataFrame({'y': [human_utility], 'text': ['Human']}))
    .mark_text(align='left', baseline='bottom', dx=-90, dy=-5, fontSize=10)
    .encode(y='y:Q', text='text:N')
)

# Layer the bars with the human reference line, save the spec, and display it.
final_chart = utility_grouped + human_rule

final_chart.save("plots/alternative-prompting-utility.json")
final_chart

In [12]:
# Distinct generations per model and sampling method, with the human value
# drawn as a dashed horizontal reference line.

# One ordering shared by the colour scale and the within-group bar offset.
sampling_method_order = ["Resampling", "In-context regeneration", "Paraphrasing", "System prompt"]

is_human = eval_data["model_family"] == "Human"

# Grouped bars: one group per model alias, one bar per sampling method.
# The colour legend is suppressed for this chart (legend=None).
distinct_grouped = alt.Chart(eval_data[~is_human]).mark_bar().encode(
    x=alt.X('model_alias:N', title='', sort=model_order, axis=alt.Axis(labelAngle=0)),
    y=alt.Y('distinct:Q', title='Distinct generations'),
    color=alt.Color('sampling_method:N', title='Sampling method', sort=sampling_method_order, legend=None),
    xOffset=alt.XOffset('sampling_method:N', sort=sampling_method_order)
).properties(
    width=260,
    height=200
)

# Distinct value of the first human row in the summary.
human_distinct = eval_data.loc[is_human, "distinct"].values[0]

# Dashed black line at the human distinct value.
human_rule = alt.Chart(pd.DataFrame({'y': [human_distinct]})).mark_rule(
    strokeDash=[6, 4],
    size=2,
    color='black'
).encode(
    y='y:Q'
)

# "Human" text label, anchored at the human *distinct* value so it sits on
# this chart's scale and does not depend on a variable from the utility cell.
# It is defined here but not layered into final_distinct_chart below.
# NOTE(review): dy=-75 was presumably tuned while y still pointed at the
# utility value; re-tune the offset before layering this label.
human_text = alt.Chart(pd.DataFrame({'y': [human_distinct], 'text': ['Human']})).mark_text(
    align='left',
    baseline='bottom',
    dx=-90,
    dy=-75,
    fontSize=10
).encode(
    y='y:Q',
    text='text:N'
)

# Layer the bars with the human reference line, save the spec, and display it.
final_distinct_chart = distinct_grouped + human_rule

final_distinct_chart.save("plots/alternative-prompting-distinct.json")
final_distinct_chart