In [1]:
%cd ~/src/second-best-bench/
from src.plots.common import *
import numpy as np

/home/yimingz3/src/second-best-bench


In [2]:
def compute_summary(df):
    """Collapse per-example scores to one row per (model_family, model_alias).

    NOTE: a later cell in this notebook defines ``compute_summary`` again
    (grouping by ``patience`` as well); once that cell runs, this version is
    shadowed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``model_family``, ``model_alias``, ``distinct`` and
        ``utility`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_family``, ``model_alias``, ``distinct`` (group mean of
        the input ``distinct`` plus one) and ``mean_utility`` (group mean of
        ``utility``).
    """
    group_keys = ['model_family', 'model_alias']

    per_model = (
        df.groupby(group_keys)
        .agg(
            mean_distinct=('distinct', 'mean'),
            mean_utility=('utility', 'mean'),
        )
        .reset_index()
    )

    # The reported value is the mean shifted up by one; the reason for the
    # offset is not visible in this notebook -- TODO confirm with the author.
    per_model = per_model.assign(distinct=per_model['mean_distinct'] + 1)

    return per_model[group_keys + ['distinct', 'mean_utility']]

def model_scores_at_patience(patience):
    """Load every model's score file and attach a patience-weighted utility.

    For each subset ("wildchat", "curated") and each entry of the notebook-level
    ``models`` list, reads ``eval/<subset>/<model_path>/scores.jsonl``, replaces
    the stored ``utility`` column with one recomputed from
    ``generation_scores``, and tags the rows with subset / model metadata.

    Parameters
    ----------
    patience : float
        Discount base. The k-th entry of ``generation_scores`` is weighted by
        ``patience ** k``; with ``patience == 0.0`` only the first entry is used.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all per-model frames, with added columns ``subset``,
        ``utility``, ``model_family``, ``model_alias``, ``distinct``
        (``len(partition_scores)``) and ``patience``.
    """
    def discounted_utility(scores):
        # Zero patience: only the first generation counts.
        if patience == 0.0:
            return scores[0]
        discounts = patience ** np.arange(len(scores))
        return np.average(scores, weights=discounts)

    frames = []
    for subset in ("wildchat", "curated"):
        for model_family, model_path, model_alias in models:
            raw = pd.read_json(f"eval/{subset}/{model_path}/scores.jsonl", lines=True)
            # Drop the stored utility first so the recomputed one is appended
            # after `subset`, matching the column order downstream code sees.
            frames.append(
                raw.drop(columns="utility").assign(
                    subset=subset,
                    utility=raw["generation_scores"].map(discounted_utility),
                    model_family=model_family,
                    model_alias=model_alias,
                )
            )

    combined = pd.concat(frames)
    combined["distinct"] = combined["partition_scores"].map(len)
    combined["patience"] = patience
    return combined

In [3]:
# Recompute the score table at 11 evenly spaced patience values in [0.0, 1.0]
# and stack the results into one long frame.
all_model_scores = pd.concat(
    [
        model_scores_at_patience(patience)
        for patience in np.linspace(0.0, 1.0, num=11)
    ]
)

In [4]:
def compute_summary(df):
    """Aggregate per-example scores into one row per model (and patience level).

    ``compute_summary`` is defined twice in this notebook and this later
    definition is the one in effect once the cell has run. To make that
    shadowing harmless, ``patience`` is treated as an optional grouping key:
    frames that carry it are summarised per patience level, frames that do not
    are summarised per model only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``model_family``, ``model_alias``, ``distinct`` and
        ``utility`` columns; ``patience`` is optional.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_family``, ``model_alias``, ``distinct`` (group mean of
        the input ``distinct`` plus one), ``mean_utility`` (group mean of
        ``utility``) and, when present in ``df``, ``patience`` as the last
        column.
    """
    model_keys = ['model_family', 'model_alias']
    # Group on patience only when the caller's frame actually has it, so the
    # same function works for a single-setting frame and for a patience sweep.
    patience_keys = ['patience'] if 'patience' in df.columns else []

    summary = (
        df.groupby(model_keys + patience_keys)
        .agg(
            mean_distinct=('distinct', 'mean'),
            mean_utility=('utility', 'mean'),
        )
        .reset_index()
    )

    # The +1 offset is carried over unchanged; its rationale is not visible in
    # this notebook -- TODO confirm with the author.
    summary['distinct'] = summary['mean_distinct'] + 1

    return summary[model_keys + ['distinct', 'mean_utility'] + patience_keys]

# Summarise the patience sweep and expose the averaged utility under the
# plain column name `utility`, which the chart cells below read.
eval_data = compute_summary(all_model_scores).rename(
    columns={'mean_utility': 'utility'}
)


In [None]:
# Show the summary rows for the OpenAI and Anthropic model families.
eval_data.loc[eval_data["model_family"].isin(["OpenAI", "Anthropic"])]

Unnamed: 0,model_family,model_alias,distinct,utility,patience
0,Anthropic,Claude 3 Opus,2.943636,63.034545,0.0
1,Anthropic,Claude 3 Opus,2.943636,57.681165,0.1
2,Anthropic,Claude 3 Opus,2.943636,52.251302,0.2
3,Anthropic,Claude 3 Opus,2.943636,46.74057,0.3
4,Anthropic,Claude 3 Opus,2.943636,41.14435,0.4
5,Anthropic,Claude 3 Opus,2.943636,35.468366,0.5
6,Anthropic,Claude 3 Opus,2.943636,29.763208,0.6
7,Anthropic,Claude 3 Opus,2.943636,24.18078,0.7
8,Anthropic,Claude 3 Opus,2.943636,19.001774,0.8
9,Anthropic,Claude 3 Opus,2.943636,14.558737,0.9


In [6]:
# Utility vs. patience for the Llama family: one line (with point markers)
# per model alias, coloured in `model_order`.
llama_patience_chart = (
    alt.Chart(eval_data.loc[eval_data["model_family"].isin(["Llama"])])
    .mark_line(point=True)
    .encode(
        alt.X('patience:Q', title='Patience'),
        alt.Y('mean(utility):Q', title='Utility'),
        alt.Color('model_alias:O', title="Model", sort=model_order),
    )
    .properties(width=200, height=200)
)

# Write the chart to disk, then leave it as the cell's displayed value.
llama_patience_chart.save("plots/llama-patience.json")
llama_patience_chart

In [7]:
# Utility vs. patience for the Gemma family: one line (with point markers)
# per model alias, coloured in `model_order`.
gemma_patience_chart = (
    alt.Chart(eval_data.loc[eval_data["model_family"].isin(["Gemma"])])
    .mark_line(point=True)
    .encode(
        alt.X('patience:Q', title='Patience'),
        alt.Y('mean(utility):Q', title='Utility'),
        alt.Color('model_alias:O', title="Model", sort=model_order),
    )
    .properties(width=200, height=200)
)

# Write the chart to disk, then leave it as the cell's displayed value.
gemma_patience_chart.save("plots/gemma-patience.json")
gemma_patience_chart