# Cost-of-Pass: An Economic Framework for Evaluating Language Models

This notebook provides the code to fully reproduce our results. Each section below Setup corresponds to an experiment in the paper.

## Setup

In [None]:
from cost_of_pass import *
from cost_of_pass.models.litellm import make_litellm_client
from collections import defaultdict
from datetime import datetime
from cost_of_pass.evaluation.utils import expected_value

### Tasks & Metrics

In [None]:
class AnalysisTask:
    """Bundle a benchmark task with its metric and human baseline cost.

    The constructor stores the raw configuration values and resolves the
    task and metric objects by name via ``get_task`` and ``get_metric``.

    Args:
        task_name: Name passed to ``get_task``.
        n_samples: Forwarded to ``get_task`` as ``n_samples``.
        metric_name: Name passed to ``get_metric``.
        human_baseline_cost: Baseline cost for the task; also exposed as
            ``baseline_cost``.
        task_type: Category label used when grouping results.
    """

    def __init__(self, task_name, n_samples, metric_name, human_baseline_cost, task_type):
        # Raw configuration, kept verbatim for later reporting.
        self.task_name, self.n_samples = task_name, n_samples
        self.metric_name, self.task_type = metric_name, task_type
        self.human_baseline_cost = human_baseline_cost

        # Resolved project objects (task first, then metric).
        self.task = get_task(task_name, n_samples=n_samples)
        self.metric = get_metric(metric_name)

        # Alias read by the analysis cells as `task.baseline_cost`.
        self.baseline_cost = human_baseline_cost
        

# Tasks are grouped by category; every task uses 128 samples.
# NOTE(review): human_baseline_cost is presumably a per-problem cost in USD —
# confirm against the paper.
basic_quantitative_tasks = [
    AnalysisTask(
        task_name="TwoDigitAddition",
        n_samples=128,
        metric_name="MathExpressionMatch",
        human_baseline_cost=0.02,
        task_type="Basic Quantitative",
    ),
    AnalysisTask(
        task_name="GSM8K",
        n_samples=128,
        metric_name="MathExpressionMatch",
        human_baseline_cost=3.50,
        task_type="Basic Quantitative",
    ),
]

knowledge_based_tasks = [
    AnalysisTask(
        task_name="BBQ",
        n_samples=128,
        metric_name="MultipleChoiceMatch",
        human_baseline_cost=0.10,
        task_type="Knowledge Based",
    ),
    AnalysisTask(
        task_name="GPQA_Diamond",
        n_samples=128,
        metric_name="MultipleChoiceMatch",
        human_baseline_cost=58.0,
        task_type="Knowledge Based",
    ),
]

complex_quantitative_tasks = [
    AnalysisTask(
        task_name="MATH500",
        n_samples=128,
        metric_name="MathExpressionMatch",
        human_baseline_cost=12.0,
        task_type="Complex Quantitative",
    ),
    AnalysisTask(
        task_name="AIME_2024",
        n_samples=128,
        metric_name="MathExpressionMatch",
        human_baseline_cost=20.0,
        task_type="Complex Quantitative",
    ),
]

# List of categories (each itself a list of AnalysisTask), iterated in this order.
all_tasks = [
    basic_quantitative_tasks,
    knowledge_based_tasks,
    complex_quantitative_tasks,
]

### Models

In [3]:
class AnalysisModel:
    """Describe one model under analysis and resolve its client.

    Args:
        model_name: Identifier passed to ``get_client`` (and, when
            ``should_register`` is true, to ``make_litellm_client``).
        model_type: Category label used when grouping results.
        release_date: Release date string; the analysis cells parse it with
            the ``"%Y-%m-%d"`` format. Defaults to ``None``.
        should_register: When true, register a LiteLLM-backed client under
            ``model_name`` before looking it up.
        runtime_kwargs: Optional mapping of extra runtime options. It is
            copied, so the caller's mapping is never modified.
    """

    def __init__(self, model_name, model_type, release_date=None, should_register=False, runtime_kwargs=None):
        self.model_name = model_name
        self.model_type = model_type
        self.release_date = release_date
        self.should_register = should_register

        # Copy instead of aliasing: a shared `{}` default (or a dict owned by
        # the caller) must not be mutated by the assignment below.
        self.runtime_kwargs = dict(runtime_kwargs) if runtime_kwargs is not None else {}
        self.runtime_kwargs["max_attempts"] = 5 # How many times to retry querying the API if it fails

        if should_register:
            register_client(model_name)(make_litellm_client(model_name))
        self.client = get_client(model_name)

# NOTE: We manually register the models from togetherai, as they are not directly retrievable through LiteLLM (but are available there)
# NOTE: We pass different sampling args to the OpenAI reasoning models, as these models require them.

# Models are grouped by category. Entries with should_register=True are
# registered through LiteLLM when the AnalysisModel is constructed.
lightweight_models = [
    AnalysisModel(
        model_name="together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        model_type="Lightweight",
        release_date="2024-07-23",
        should_register=True,
    ),
    AnalysisModel(
        model_name="gpt-4o-mini-2024-07-18",
        model_type="Lightweight",
        release_date="2024-07-18",
    ),
    AnalysisModel(
        model_name="together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo",
        model_type="Lightweight",
        release_date="2024-12-06",
        should_register=True,
    ),
]

large_models = [
    AnalysisModel(
        model_name="together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
        model_type="Large",
        release_date="2024-07-23",
        should_register=True,
    ),
    AnalysisModel(
        model_name="claude-3-5-sonnet-20240620",
        model_type="Large",
        release_date="2024-06-20",
    ),
    AnalysisModel(
        model_name="gpt-4o-2024-05-13",
        model_type="Large",
        release_date="2024-05-13",
    ),
]

# Each OpenAI reasoning model gets its own runtime_kwargs dict and its own
# SamplingArgs instance (top_p=None, temperature=1.0).
reasoning_models = [
    AnalysisModel(
        model_name="o1-mini-2024-09-12",
        model_type="Reasoning",
        release_date="2024-09-12",
        runtime_kwargs={'sampling_args': SamplingArgs(top_p=None, temperature=1.0)},
    ),
    AnalysisModel(
        model_name="o1-2024-12-17",
        model_type="Reasoning",
        release_date="2024-12-05",
        runtime_kwargs={'sampling_args': SamplingArgs(top_p=None, temperature=1.0)},
    ),
    AnalysisModel(
        model_name="together_ai/deepseek-ai/DeepSeek-R1",
        model_type="Reasoning",
        release_date="2025-01-20",
        should_register=True,
    ),
    AnalysisModel(
        model_name="o3-mini-2025-01-31",
        model_type="Reasoning",
        release_date="2025-01-31",
        runtime_kwargs={'sampling_args': SamplingArgs(top_p=None, temperature=1.0)},
    ),
]

# Category order matters: later cells use all_models[:-1] to drop the last
# (reasoning) category.
all_models = [
    lightweight_models,
    large_models,
    reasoning_models,
]

### Inference Time Techniques

In [4]:
# Method used for all single-call experiments in the sections below.
vanilla_prompting = get_test_time_method("VanillaPromptMethod")

# Techniques compared in section 3.5: self-refinement first, followed by
# majority voting with 3 and then 4 votes.
inference_time_techniques = [get_test_time_method("SelfRefinementMethod")]
inference_time_techniques += [
    get_test_time_method("MajorityVotingMethod", n_votes=vote_count)
    for vote_count in (3, 4)
]

## 3.2. Frontier Cost-of-Pass with a Single Model

In [None]:
# Frontier cost-of-pass of each model on its own, keyed by
# (task type, model type, task name, model name).
single_model_fcop = defaultdict(float)

# Flatten the category groupings; iteration order matches the nested lists.
tasks_in_order = [task for task_category in all_tasks for task in task_category]
models_in_order = [model for model_category in all_models for model in model_category]

for task in tasks_in_order:
    frontier_cop_estimator = FrontierCostofPass(
        task=task.task,
        baseline_cop=task.baseline_cost,
        metric=task.metric,
    )
    for model in models_in_order:
        # A family holding exactly one (model, vanilla prompting) entry.
        model_family = ModelFamily(f"Single Model: {model.model_name}").add_model(
            model.client, vanilla_prompting, model.runtime_kwargs
        )
        frontier_cop = frontier_cop_estimator.estimate_task(model_family, n_runs=8)
        result_key = (task.task_type, model.model_type, task.task.task_name, model.model_name)
        single_model_fcop[result_key] = frontier_cop

single_model_fcop

## 3.3. Tracking Frontier Cost-of-Pass with New Releases

In [None]:
# Frontier cost-of-pass as models are added in release order, divided by the
# estimate obtained from the empty family.
track_norm_fcop = defaultdict(float)

# All models across categories, earliest release first (the sort is stable, so
# models sharing a release date keep their listing order).
models_sorted_by_release_date = [model for model_category in all_models for model in model_category]
models_sorted_by_release_date.sort(key=lambda x: datetime.strptime(x.release_date, "%Y-%m-%d"))

tasks_in_order = [task for task_category in all_tasks for task in task_category]

for task in tasks_in_order:
    frontier_cop_estimator = FrontierCostofPass(
        task=task.task,
        baseline_cop=task.baseline_cost,
        metric=task.metric,
    )

    # Estimate with no models in the family; recorded un-normalized under the
    # "Human Expert Baseline" label.
    model_family_t = ModelFamily("Models over time")
    base_frontier_cop = frontier_cop_estimator.estimate_task(model_family_t, n_runs=8)
    baseline_key = (task.task_type, task.task.task_name, "-", "Human Expert Baseline")
    track_norm_fcop[baseline_key] = base_frontier_cop

    # Grow the family one release at a time and record the normalized value.
    for model in models_sorted_by_release_date:
        model_family_t = model_family_t.add_model(model.client, vanilla_prompting, model.runtime_kwargs)
        frontier_cop = frontier_cop_estimator.estimate_task(model_family_t, n_runs=8)
        release_key = (task.task_type, task.task.task_name, model.release_date, model.model_name)
        track_norm_fcop[release_key] = frontier_cop / base_frontier_cop

track_norm_fcop

## 3.4. Essentialness of Model Families: Counterfactual Frontier Cost-of-Pass

In [None]:
# Relative increase in frontier cost-of-pass when a whole model category is
# removed, keyed by (task type, task name, removed model type).
essentialness = defaultdict(float)

# Family containing every model with vanilla prompting.
model_family_T = ModelFamily("All Models")
for model in [model for model_category in all_models for model in model_category]:
    model_family_T = model_family_T.add_model(model.client, vanilla_prompting, model.runtime_kwargs)

tasks_in_order = [task for task_category in all_tasks for task in task_category]

for task in tasks_in_order:
    frontier_cop_estimator = FrontierCostofPass(
        task=task.task,
        baseline_cop=task.baseline_cost,
        metric=task.metric,
    )
    frontier_cop = frontier_cop_estimator.estimate_task(model_family_T, n_runs=8)

    for model_category in all_models:
        # Start from the full family, then drop every model of this category.
        family_label = f"Counterfactual of {model_category[0].model_type}"
        model_family_g_rem = ModelFamily(family_label, model_family_T)
        for model in model_category:
            model_family_g_rem = model_family_g_rem.del_model(model.client, vanilla_prompting)
        frontier_cop_g_rem = frontier_cop_estimator.estimate_task(model_family_g_rem, n_runs=8)

        # `model` is the last model removed above; its model_type labels the category.
        category_key = (task.task_type, task.task.task_name, model.model_type)
        essentialness[category_key] = (frontier_cop_g_rem - frontier_cop) / frontier_cop_g_rem

essentialness

## 3.5. Impact of Inference Time Techniques on Frontier Cost-of-Pass

In [None]:
# Relative reduction in frontier cost-of-pass when each inference-time
# technique is added on top of the vanilla non-reasoning family, keyed by
# (task type, task name, technique name).
impact_of_inf_tech = defaultdict(float)

# Every category except the last one in all_models (the reasoning models).
non_reasoning_models = [model for model_category in all_models[:-1] for model in model_category]

model_family_L = ModelFamily("All non-reasoning models")
for model in non_reasoning_models:
    model_family_L = model_family_L.add_model(model.client, vanilla_prompting, model.runtime_kwargs)

tasks_in_order = [task for task_category in all_tasks for task in task_category]

for task in tasks_in_order:
    frontier_cop_estimator = FrontierCostofPass(
        task=task.task,
        baseline_cop=task.baseline_cost,
        metric=task.metric,
    )
    frontier_cop_L = frontier_cop_estimator.estimate_task(model_family_L, n_runs=8)

    for inf_tech in inference_time_techniques:
        # Extend the vanilla family with the same models using this technique.
        model_family_Ls = ModelFamily(f"Models augmented with {inf_tech.method_name}", model_family_L)
        for model in non_reasoning_models:
            model_family_Ls = model_family_Ls.add_model(model.client, inf_tech, model.runtime_kwargs)
        frontier_cop_Ls = frontier_cop_estimator.estimate_task(model_family_Ls, n_runs=8)

        # NOTE(review): if method_name does not encode n_votes, the two
        # MajorityVotingMethod entries share a key and the later one wins — confirm.
        technique_key = (task.task_type, task.task.task_name, inf_tech.method_name)
        impact_of_inf_tech[technique_key] = (frontier_cop_L - frontier_cop_Ls) / frontier_cop_L

impact_of_inf_tech

## C.1. Expected Accuracy and Inference Costs

In [None]:
# Expected performance and expected cost for every (task, model) pair with
# vanilla prompting, keyed by (task type, task name, model type, model name).
exp_acc = defaultdict(float)
exp_cost = defaultdict(float)

tasks_in_order = [task for task_category in all_tasks for task in task_category]
models_in_order = [model for model_category in all_models for model in model_category]

for task in tasks_in_order:
    for model in models_in_order:
        evaluator = Evaluator(
            model=model.client,
            task=task.task,
            tt_method=vanilla_prompting,
        )
        records = evaluator.evaluate(task.metric, n_runs=8)

        # Split the records into their performance and cost components.
        performances = [record.performance for record in records]
        costs = [record.cost for record in records]

        key = (task.task_type, task.task.task_name, model.model_type, model.model_name)
        exp_acc[key] = expected_value(performances)
        exp_cost[key] = expected_value(costs)

exp_acc, exp_cost

## C.2. Relative Gain per Model Release

In [None]:
# Relative drop in frontier cost-of-pass contributed by each model release,
# measured against the frontier just before that release.
rel_gain_per_model = defaultdict(float)

# All models across categories, earliest release first (stable sort keeps the
# listing order for models sharing a release date).
models_sorted_by_release_date = [model for model_category in all_models for model in model_category]
models_sorted_by_release_date.sort(key=lambda x: datetime.strptime(x.release_date, "%Y-%m-%d"))

tasks_in_order = [task for task_category in all_tasks for task in task_category]

for task in tasks_in_order:
    frontier_cop_estimator = FrontierCostofPass(
        task=task.task,
        baseline_cop=task.baseline_cost,
        metric=task.metric,
    )

    # The starting reference is the estimate for the family with no models.
    model_family_t = ModelFamily("Models over time")
    prev_frontier_cop = frontier_cop_estimator.estimate_task(model_family_t, n_runs=8)

    for model in models_sorted_by_release_date:
        model_family_t = model_family_t.add_model(model.client, vanilla_prompting, model.runtime_kwargs)
        frontier_cop = frontier_cop_estimator.estimate_task(model_family_t, n_runs=8)
        release_key = (task.task_type, task.task.task_name, model.release_date, model.model_name)
        rel_gain_per_model[release_key] = (prev_frontier_cop - frontier_cop) / prev_frontier_cop
        # The next release is compared against the frontier including this one.
        prev_frontier_cop = frontier_cop

rel_gain_per_model

## C.3. Counterfactual Frontier Cost-of-Pass in the Absence of a Single Model

In [None]:
# Relative increase in frontier cost-of-pass when a single model is removed
# from the full family, keyed by (task type, task name, model type, model name).
counter_model = defaultdict(float)

# Family containing every model with vanilla prompting.
model_family_T = ModelFamily("All Models")
for model_category in all_models:
    for model in model_category:
        model_family_T = model_family_T.add_model(model.client, vanilla_prompting, model.runtime_kwargs)


for task_category in all_tasks:
    for task in task_category:
        frontier_cop_estimator = FrontierCostofPass(
            task=task.task,
            baseline_cop=task.baseline_cost,
            metric=task.metric,
        )
        frontier_cop = frontier_cop_estimator.estimate_task(model_family_T, n_runs=8)
        for model_category in all_models:
            for model in model_category:
                # Name the family after the removed model, not its category:
                # each single-model counterfactual has different membership,
                # so it must not share a name with its siblings or with the
                # per-category counterfactual families.
                model_family_m_rem = ModelFamily(f"Counterfactual of {model.model_name}", model_family_T)
                model_family_m_rem = model_family_m_rem.del_model(model.client, vanilla_prompting)
                frontier_cop_m_rem = frontier_cop_estimator.estimate_task(model_family_m_rem, n_runs=8)
                counter_model[(task.task_type, task.task.task_name, model.model_type, model.model_name)] = (frontier_cop_m_rem - frontier_cop) / frontier_cop_m_rem

counter_model