In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from eagle import EagleRanker, ModelScore
from eval import compute_accuracy

# Fetch the evaluation dataset, then draw a fixed-seed random subsample of each
# split as a pandas DataFrame (1000 train rows, 100 validation, 100 test).
ds = load_dataset("notdiamond/repliqa_gpt4o_gpt4omini_evals")
ds_train, ds_val, ds_test = (
    ds[split_name].to_pandas().sample(n=n_rows, random_state=42)
    for split_name, n_rows in (("train", 1000), ("val", 100), ("test", 100))
)


In [2]:
# Build the ranker from the training subsample: the prompts plus, for each of
# the two models, that model's per-prompt score column.
eagle_ranker = EagleRanker(
    k=32,
    embedding_filepath="embeddings.json",
    prompts=ds_train["prompt"].tolist(),
    model_scores={
        "gpt-4o-mini-2024-07-18": ds_train["gpt-4o-mini-2024-07-18/score"].tolist(),
        "gpt-4o-2024-08-06": ds_train["gpt-4o-2024-08-06/score"].tolist(),
    },
)

In [5]:
def evaluate(
    eagle_ranker: EagleRanker,
    dataset: pd.DataFrame,
    p: float,
    print_flags: bool = False,
    n: int = 20,
):
    """Rank every prompt in ``dataset`` and score the rankings against ground truth.

    Args:
        eagle_ranker: Ranker queried once per prompt through ``rank``.
        dataset: Frame providing a ``prompt`` column plus the
            ``gpt-4o-mini-2024-07-18/score`` and ``gpt-4o-2024-08-06/score``
            ground-truth columns.
        p: Passed through unchanged to ``eagle_ranker.rank``.
        print_flags: Passed through unchanged to ``compute_accuracy``.
        n: Passed through to ``eagle_ranker.rank``. Defaults to 20, the value
            that used to be hard-coded here.

    Returns:
        The value returned by ``compute_accuracy`` (callers format it as a
        float).
    """
    # Only the prompt column is read, so iterate it directly instead of
    # building a full row Series per record with iterrows().
    ranked_scores: list[list[ModelScore]] = [
        eagle_ranker.rank(text=prompt, n=n, p=p) for prompt in dataset["prompt"]
    ]
    return compute_accuracy(
        model_ground_truth_scores={
            "gpt-4o-mini-2024-07-18": list(dataset["gpt-4o-mini-2024-07-18/score"]),
            "gpt-4o-2024-08-06": list(dataset["gpt-4o-2024-08-06/score"]),
        },
        ranker_scores=ranked_scores,
        print_flags=print_flags,
    )

In [9]:
# Find the optimal value for p using the validation set.
# Every result is recorded so the best p is selected programmatically instead
# of being read off the printout by eye.
val_accuracy_by_p = {}
for p in np.linspace(0, 1, 11):
    accuracy = evaluate(eagle_ranker, ds_val, p, print_flags=False)
    val_accuracy_by_p[float(p)] = accuracy
    print(f"p={p:.2f}, accuracy={accuracy:.4f}")

# max() returns the first maximum it meets and the dict is filled in ascending
# p order, so ties resolve to the smallest p.
best_p = max(val_accuracy_by_p, key=val_accuracy_by_p.get)
print(f"best p={best_p:.2f}, accuracy={val_accuracy_by_p[best_p]:.4f}")

p=0.00, accuracy=0.1400
p=0.10, accuracy=0.1600
p=0.20, accuracy=0.1500
p=0.30, accuracy=0.1400
p=0.40, accuracy=0.1600
p=0.50, accuracy=0.1600
p=0.60, accuracy=0.1600
p=0.70, accuracy=0.1600
p=0.80, accuracy=0.1600
p=0.90, accuracy=0.1600
p=1.00, accuracy=0.1600


In [8]:
# Baseline: test-set accuracy at the original setting p=0.5, with the
# per-example diagnostic flags printed by compute_accuracy turned on.
accuracy = evaluate(
    eagle_ranker=eagle_ranker,
    dataset=ds_test,
    p=0.5,
    print_flags=True,
)
print(f"accuracy={accuracy:.4f}")

gpt-4o-mini-2024-07-18 gpt-4o-2024-08-06
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
gpt-4o-mini-2024-07-18 gpt-4o-mini-2024-07-18
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
gpt-4o-mini-2024-07-18 gpt-4o-mini-2024-07-18
Same scores but different rank
Same scores but different rank
Same scores but different rank
gpt-4o-mini-2024-07-18 gpt-4o-mini-2024-07-18
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
Same scores but different rank
gpt-4o-mini-2024-07-18 gpt-4o-mini-2024-07-18


In [11]:
# Test-set accuracy at the p chosen after the validation sweep.
# p=1 is used because the sweep found nothing better than the default, and
# p=1 reportedly needs only the global component, which means less
# computation (NOTE(review): confirm against EagleRanker.rank).
accuracy = evaluate(
    eagle_ranker=eagle_ranker,
    dataset=ds_test,
    p=1,
    print_flags=False,
)
print(f"accuracy={accuracy:.4f}")

accuracy=0.0800
