In [1]:
%cd ~/src/second-best-bench/
from src.plots.common import *
import numpy as np

/home/yimingz3/src/second-best-bench


In [15]:
def compute_summary(df):
    """Aggregate per-row scores into one summary row per model.

    Groups ``df`` by ``model_family`` and ``model_alias`` and averages the
    ``distinct`` and ``utility`` columns within each group.

    NOTE: a later cell in this notebook redefines ``compute_summary`` with an
    additional ``patience`` grouping key; once that cell has run, it shadows
    this version.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``model_family``, ``model_alias``,
        ``distinct`` and ``utility``.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_family``, ``model_alias``, ``distinct`` (group mean
        of ``distinct`` plus one) and ``mean_utility`` (group mean of
        ``utility``), with a fresh integer index.
    """
    group_keys = ['model_family', 'model_alias']
    per_model = (
        df.groupby(group_keys)
        .agg(distinct=('distinct', 'mean'), mean_utility=('utility', 'mean'))
        .reset_index()
    )
    # The reported 'distinct' is the group mean shifted up by one.
    # NOTE(review): the reason for the +1 offset is not visible here — confirm.
    per_model['distinct'] += 1
    return per_model[group_keys + ['distinct', 'mean_utility']]

def model_scores_at_patience(patience):
    """Load every model's score file and attach a patience-weighted utility.

    For each subset (``"wildchat"``, ``"curated"``) and each
    ``(model_family, model_path, model_alias)`` entry of the ``models`` list
    whose path is non-empty, reads ``eval/{subset}/{model_path}/scores.jsonl``
    and replaces its ``utility`` column with a value derived from
    ``generation_scores``:

    * ``patience == 0.0``: the first generation score;
    * otherwise: the average of the scores, where the score at position
      ``i`` carries weight ``patience ** i``.

    Parameters
    ----------
    patience : float
        Base of the geometric weights applied to ``generation_scores``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all loaded files with the added columns ``subset``,
        ``utility``, ``model_family``, ``model_alias``, ``distinct`` (length
        of each row's ``partition_scores``) and ``patience``.
    """
    # Choose the per-row utility rule once instead of branching per file.
    if patience == 0.0:
        def utility_of(scores):
            return scores[0]
    else:
        def utility_of(scores):
            discounts = patience ** np.arange(len(scores))
            return np.average(scores, weights=discounts)

    frames = []
    for subset in ("wildchat", "curated"):
        for model_family, model_path, model_alias in models:
            # Entries without a path have no score file to read.
            if model_path:
                scores_path = f"eval/{subset}/{model_path}/scores.jsonl"
                frame = pd.read_json(scores_path, lines=True).drop(columns="utility")
                frame["subset"] = subset
                frame["utility"] = frame["generation_scores"].map(utility_of)
                frame["model_family"] = model_family
                frame["model_alias"] = model_alias
                frames.append(frame)

    combined = pd.concat(frames)
    combined["distinct"] = combined["partition_scores"].map(len)
    combined["patience"] = patience
    return combined

In [16]:
# Score every model at 11 evenly spaced patience levels from 0.0 to 1.0 and
# stack the per-level frames into one long table.
all_model_scores = pd.concat(
    [model_scores_at_patience(level) for level in np.linspace(0.0, 1.0, num=11)]
)

eval/wildchat/anthropic/claude-3-5-haiku@20241022/scores.jsonl
eval/wildchat/anthropic/claude-3-5-sonnet-v2@20241022/scores.jsonl
eval/wildchat/anthropic/claude-3-opus@20240229/scores.jsonl
eval/wildchat/cohere/command-r7b-12-2024/scores.jsonl
eval/wildchat/cohere/command-r-08-2024/scores.jsonl
eval/wildchat/cohere/command-r-plus-08-2024/scores.jsonl
eval/wildchat/gemini/gemini-1.5-pro/scores.jsonl
eval/wildchat/gemini/gemini-2.0-flash-lite-preview-02-05/scores.jsonl
eval/wildchat/gemini/gemini-2.0-flash/scores.jsonl
eval/wildchat/gemini/gemini-2.0-pro-exp-02-05/scores.jsonl
eval/wildchat/google/gemma-2-2b-it/scores.jsonl
eval/wildchat/google/gemma-2-9b-it/scores.jsonl
eval/wildchat/google/gemma-2-27b-it/scores.jsonl
eval/wildchat/meta-llama/Llama-3.2-1B-Instruct/scores.jsonl
eval/wildchat/meta-llama/Llama-3.2-3B-Instruct/scores.jsonl
eval/wildchat/meta-llama/Llama-3.1-8B-Instruct/scores.jsonl
eval/wildchat/meta-llama/Llama-3.3-70B-Instruct/scores.jsonl
eval/wildchat/meta-llama/Llama-3

In [17]:
def compute_summary(df):
    """Aggregate per-row scores into one row per model and patience level.

    Groups ``df`` by ``model_family``, ``model_alias`` and ``patience`` and
    averages the ``distinct`` and ``utility`` columns within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``model_family``, ``model_alias``,
        ``patience``, ``distinct`` and ``utility``.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_family``, ``model_alias``, ``distinct`` (group mean
        of ``distinct`` plus one), ``mean_utility`` (group mean of
        ``utility``) and ``patience``, with a fresh integer index.
    """
    group_keys = ['model_family', 'model_alias', 'patience']
    per_model = (
        df.groupby(group_keys)
        .agg(distinct=('distinct', 'mean'), mean_utility=('utility', 'mean'))
        .reset_index()
    )
    # The reported 'distinct' is the group mean shifted up by one.
    # NOTE(review): the reason for the +1 offset is not visible here — confirm.
    per_model['distinct'] += 1
    return per_model[['model_family', 'model_alias', 'distinct', 'mean_utility', 'patience']]

# Summarise per (model, patience) and expose the mean under the plain
# 'utility' name that the charts below encode.
eval_data = compute_summary(all_model_scores).rename(
    columns={'mean_utility': 'utility'}
)


In [30]:
# Utility as a function of patience for the Llama family, one line per model.
llama_rows = eval_data[eval_data["model_family"].isin(["Llama"])]

llama_patience_chart = (
    alt.Chart(llama_rows)
    .mark_line(point=True)
    .encode(
        x=alt.X('patience:Q', title='Patience'),
        y=alt.Y('mean(utility):Q', title='Utility'),
        color=alt.Color('model_alias:O', title="Model", sort=model_order),
    )
    .properties(width=250, height=200)
)

# Persist the chart spec, then leave the chart as the cell's displayed value.
llama_patience_chart.save("plots/llama-patience.json")
llama_patience_chart

In [31]:
# Utility as a function of patience for the Gemma family, one line per model.
gemma_rows = eval_data[eval_data["model_family"].isin(["Gemma"])]

gemma_patience_chart = (
    alt.Chart(gemma_rows)
    .mark_line(point=True)
    .encode(
        x=alt.X('patience:Q', title='Patience'),
        y=alt.Y('mean(utility):Q', title='Utility'),
        color=alt.Color('model_alias:O', title="Model", sort=model_order),
    )
    .properties(width=250, height=200)
)

# Persist the chart spec, then leave the chart as the cell's displayed value.
gemma_patience_chart.save("plots/gemma-patience.json")
gemma_patience_chart