In [None]:
# install verdict
!uv pip install verdict --system

# data
!wget https://raw.githubusercontent.com/i-Eval/FairEval/refs/heads/main/question.jsonl --no-clobber
!wget https://raw.githubusercontent.com/i-Eval/FairEval/refs/heads/main/review/review_gpt35_vicuna-13b_human.txt --no-clobber
!wget https://raw.githubusercontent.com/i-Eval/FairEval/refs/heads/main/answer/answer_gpt35.jsonl --no-clobber
!wget https://raw.githubusercontent.com/i-Eval/FairEval/refs/heads/main/answer/answer_vicuna-13b.jsonl --no-clobber

> [**Large Language Models are not Fair Evaluators**](https://arxiv.org/abs/2305.17926)  
> Peiyi Wang, Lei Li, Liang Chen, Dawei Zhu, Binghuai Lin, Yunbo Cao, Qi Liu, Tianyu Liu, Zhifang Sui  
> ACL 2024

## Load Dataset

In [8]:
import pandas as pd
from verdict.dataset import DatasetWrapper
from verdict.schema import Schema

# The questions and the human verdicts are joined on their default row index,
# and only afterwards re-indexed by question_id.
questions = pd.read_json("./question.jsonl", lines=True)
human_labels = pd.read_csv("./review_gpt35_vicuna-13b_human.txt", names=["judgement"])
labelled_questions = (
    questions.join(human_labels)
    .set_index("question_id")
    .rename(columns={"text": "question"})
)

# Each answer file contributes a single text column, keyed by question_id.
answers_gpt35 = (
    pd.read_json("./answer_gpt35.jsonl", lines=True)
    .set_index("question_id")[["text"]]
    .rename(columns={"text": "answer_gpt35"})
)
answers_vicuna = (
    pd.read_json("./answer_vicuna-13b.jsonl", lines=True)
    .set_index("question_id")[["text"]]
    .rename(columns={"text": "answer_vicuna-13b"})
)

df = labelled_questions.join(answers_gpt35).join(answers_vicuna)


def _row_to_schema(row):
    """Build the per-sample Schema: gpt35 is Assistant 1, vicuna-13b is Assistant 2."""
    return Schema.of(
        question=row["question"],
        answer_assistant_1=row["answer_gpt35"],
        answer_assistant_2=row["answer_vicuna-13b"],
    )


dataset = DatasetWrapper.from_pandas(df, _row_to_schema)

## Custom Judge Unit + Prompt

In [4]:
from verdict import Pipeline, Unit
from verdict.prompt import Prompt
from verdict.schema import Schema

# Judge unit that takes a question plus two candidate answers and returns a
# written evaluation followed by one score per assistant.
# NOTE(review): documented with `#` comments rather than docstrings in case the
# Schema classes expose their docstrings to the model — confirm before converting.
class PairwiseEvidenceCalibrationJudgeUnit(Unit):
    # Input fields; the prompt templates in this notebook reference them as
    # `{source.question}`, `{source.answer_assistant_1}` and `{source.answer_assistant_2}`.
    class InputSchema(Schema):
        question: str
        answer_assistant_1: str
        answer_assistant_2: str

    # Response fields. The free-text evidence is declared before the two scores;
    # presumably this makes the model write its explanation first — TODO confirm
    # that field order is preserved in the generated output.
    class ResponseSchema(Schema):
        evaluation_evidence: str # Evidence Calibration (Section 3.1)
        score_assistant_1: float
        score_assistant_2: float

In [5]:
# Judge prompt in the original presentation order: the "Assistant 1" slot is
# filled from `source.answer_assistant_1` and the "Assistant 2" slot from
# `source.answer_assistant_2`. The instructions ask for a written explanation
# first and the two scores (1 to 10) afterwards.
# NOTE(review): the `{source.*}` placeholders and the `@system` marker are
# presumably verdict prompt-template syntax — confirm against the verdict docs.
PROMPT = """
    [Question]
    {source.question}
    [The Start of Assistant 1's response]
    {source.answer_assistant_1}
    [The End of Assistant 1's response]
    [The Start of Assistant 2's response]
    {source.answer_assistant_2}
    [The End of Assistant 2's response]

    @system
    We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.
    Please rate the helpfulness, relevance, accuracy, and level of detail of their responses.
    Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.
    Please first provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.
    Then, output scores for Assistant 1 and 2, respectively.
"""

# Vanilla baseline: a single judge unit per sample, using the original answer order.
pipeline = Pipeline("Pairwise")
pipeline = pipeline >> PairwiseEvidenceCalibrationJudgeUnit().prompt(PROMPT).via("gpt-4", retries=3)

# Run the judge over the 'all' split of the dataset; the second return value is unused.
df, _ = pipeline.run_from_dataset(dataset['all'], max_workers=1024)

## Methods

### Vanilla

In [13]:
from verdict.util.experiment import ExperimentConfig, display_stats

import numpy as np
def add_judgement_column(
    df: "pd.DataFrame",
    score_assistant_1_col: str,
    score_assistant_2_col: str,
) -> "pd.DataFrame":
    """Label each row with the winner implied by two score columns.

    Args:
        df: Frame containing both score columns.
        score_assistant_1_col: Name of the column holding Assistant 1's score.
        score_assistant_2_col: Name of the column holding Assistant 2's score.

    Returns:
        A new DataFrame with every column of ``df`` plus ``_judgement``, set to
        ``'CHATGPT'`` where Assistant 1 scores higher, ``'TIE'`` where the scores
        are equal, and ``'VICUNA13B'`` otherwise. The input frame is not modified,
        so re-running a cell never changes a frame displayed earlier.

    Raises:
        KeyError: If either score column is missing from ``df``.

    Note:
        Comparisons involving NaN are False, so a row with a missing score
        falls through to the ``'VICUNA13B'`` default.
    """
    score_1 = df[score_assistant_1_col]
    score_2 = df[score_assistant_2_col]
    labels = np.select(
        [score_1 > score_2, score_1 == score_2],
        ['CHATGPT', 'TIE'],
        default='VICUNA13B',
    )
    # assign() returns a copy, leaving the caller's frame untouched.
    return df.assign(_judgement=labels)

# Turn the single judge's two scores into a predicted winner, then report
# stats for the prediction against the human 'judgement' column.
score_cols = (
    'Pairwise_root.block.unit[Unit]_score_assistant_1',
    'Pairwise_root.block.unit[Unit]_score_assistant_2',
)
df = add_judgement_column(df, *score_cols)
stats_config = ExperimentConfig(
    ground_truth_cols=['judgement'],
    prediction_cols=['_judgement'],
)
display_stats(df, stats_config);

### MEC (Multiple Evidence Calibration, Section 3.1)

In [6]:
from verdict import Layer
from verdict.transform import MeanPoolUnit

# MEC: a layer of 3 judge units on the same prompt, whose two scores are then mean-pooled.
pipeline = Pipeline()
pipeline = pipeline >> Layer(
    PairwiseEvidenceCalibrationJudgeUnit().prompt(PROMPT).via("gpt-4", retries=3),
    3,
)
pipeline = pipeline >> MeanPoolUnit(["score_assistant_1", "score_assistant_2"])

# Run over the 'all' split of the dataset; the second return value is unused.
df, _ = pipeline.run_from_dataset(dataset['all'], max_workers=1024)

In [21]:
# Turn the mean-pooled MEC scores into a predicted winner, then report
# stats for the prediction against the human 'judgement' column.
score_cols = (
    'Pipeline_root.block.block.unit[Map MeanPool]_score_assistant_1',
    'Pipeline_root.block.block.unit[Map MeanPool]_score_assistant_2',
)
df = add_judgement_column(df, *score_cols)
stats_config = ExperimentConfig(
    ground_truth_cols=['judgement'],
    prediction_cols=['_judgement'],
)
display_stats(df, stats_config);

### MEC + BPC (Balanced Position Calibration, Section 3.2)

In [17]:
from verdict.transform import MapUnit

# Position-swapped variant of the judge prompt: the "Assistant 1" slot is filled
# from `source.answer_assistant_2` and the "Assistant 2" slot from
# `source.answer_assistant_1`. The instruction text is the same as in PROMPT.
# Scores produced with this prompt are therefore in swapped order relative to
# the dataset's assistant numbering.
PROMPT_REVERSED = """
    [Question]
    {source.question}
    [The Start of Assistant 1's response]
    {source.answer_assistant_2}
    [The End of Assistant 1's response]
    [The Start of Assistant 2's response]
    {source.answer_assistant_1}
    [The End of Assistant 2's response]

    @system
    We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.
    Please rate the helpfulness, relevance, accuracy, and level of detail of their responses.
    Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.
    Please first provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.
    Then, output scores for Assistant 1 and 2, respectively.
"""

def _swap_scores_back(unit, previous, input, output):
    """Re-map scores from the reversed prompt onto the original assistant numbering."""
    return Schema.of(
        score_assistant_1=output.score_assistant_2,
        score_assistant_2=output.score_assistant_1,
    )


# MEC + BPC: one judge per presentation order (the reversed judge's scores are
# swapped back), passed to Layer with a count of 3, then mean-pooled.
pipeline = Pipeline()
pipeline = pipeline >> Layer(
    [
        PairwiseEvidenceCalibrationJudgeUnit().prompt(PROMPT),
        PairwiseEvidenceCalibrationJudgeUnit().prompt(PROMPT_REVERSED).propagate(_swap_scores_back),
    ],
    3,
).via("gpt-4", retries=3)
pipeline = pipeline >> MeanPoolUnit(["score_assistant_1", "score_assistant_2"])

# Run over the 'all' split of the dataset; the second return value is unused.
df, _ = pipeline.run_from_dataset(dataset['all'], max_workers=1024)

In [14]:
# Turn the mean-pooled MEC + BPC scores into a predicted winner, then report
# stats for the prediction against the human 'judgement' column.
score_cols = (
    'Pipeline_root.block.block.unit[Map MeanPool]_score_assistant_1',
    'Pipeline_root.block.block.unit[Map MeanPool]_score_assistant_2',
)
df = add_judgement_column(df, *score_cols)
stats_config = ExperimentConfig(
    ground_truth_cols=['judgement'],
    prediction_cols=['_judgement'],
)
display_stats(df, stats_config);