# Single Agent on JudgeBench with Position Swapping


## Imports

In [2]:
import pandas as pd
from datasets import load_dataset
import os
from dotenv import load_dotenv
from autogen import ConversableAgent
import re
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Data

In [3]:
JudgeBench_Claude = load_dataset("ScalerLab/JudgeBench", split="claude")

df = pd.DataFrame(JudgeBench_Claude)

df_sampled = df.iloc[:270].sample(n=100, random_state=42).reset_index(drop=True)

df_final = df_sampled[["question", "response_A", "response_B", "label"]]

print(df_final.info())
print(df_final.head(1))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   question    100 non-null    object
 1   response_A  100 non-null    object
 2   response_B  100 non-null    object
 3   label       100 non-null    object
dtypes: object(4)
memory usage: 3.3+ KB
None
                                            question  \
0  Under standard temperature and pressure condit...   

                                          response_A  \
0  Let's approach this step-by-step:\n\n1) The ra...   

                                          response_B label  
0  Let's approach this step-by-step:\n\n1) The ra...   B>A  


## Config

In [4]:
load_dotenv()

api_key = os.getenv("AZURE_OPENAI_API_KEY")
endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
deployment_name = os.getenv("AZURE_DEPLOYMENT_NAME")
api_version = os.getenv("AZURE_API_VERSION", "2023-12-01-preview")

config_list = [
    {
        "model": deployment_name,
        "api_key": api_key,
        "base_url": f"{endpoint}/openai/deployments/{deployment_name}/chat/completions?api-version={api_version}",
        "api_type": "azure",
        "api_version": api_version,  
        "temperature": 0,
        "cache_seed": 42
    }
]

## System Design

In [5]:
agent_system_message = f"""
You are an objective Evalutor-Agent.
Your task is to choose the more factually correct response to the question between Respone A and Response B.
Explain your decision in about 200 words.
Always begin your output with: "As an objective Evaluator-Agent I think ..."
Always end your output with stating if Response A or B is more factual correct by using a JSON-object with the follwing format: {{"response": A/B}}
"""

In [6]:
initializer = ConversableAgent(
    "initializer", 
    llm_config={"config_list": config_list},
    human_input_mode="NEVER",
    )

agent = ConversableAgent(
    "Evaluator-Agent", 
    llm_config={"config_list": config_list},
    system_message=agent_system_message,
    human_input_mode="NEVER",
    )

## Evaluation

In [7]:
def evaluate(question, response_A, response_B, label):

    message = f""" 
    Question: {question}

    Response A: {response_A}

    Response B: {response_B}
    """

    result = initializer.initiate_chat(agent, message=message, max_turns=1)
    result_str = str(result)
    print(result_str)

    pattern = r'"response"\s*:\s*"?([AB])"?'

    match = re.search(pattern, result_str)
    system_decision = match.group(1) if match else "X"
    print(system_decision)

    ground_truth = "A" if "A>" in label else "B"

    is_correct = system_decision == ground_truth

    return {
        "system_decision": system_decision,
        "ground_truth": ground_truth,
        "is_correct": is_correct
    }

In [8]:
num_rows = 10

df_final_subset = df_final.head(num_rows)

# normal position
results_1 = []

for _, row in tqdm(df_final_subset.iterrows(), total=num_rows, desc="Progress"):
    result = evaluate(
        question=row["question"],
        response_A=row["response_A"],
        response_B=row["response_B"],
        label=row["label"],
    )
    results_1.append(result)

results_1_df = pd.DataFrame(results_1)
results_1_df.to_csv('Results/single_1.csv', index=False)

accuracy_1 = results_1_df["is_correct"].mean()
print(f"Accuracy_1: {accuracy_1:.2%}")

position_distribution_1 = results_1_df["system_decision"].value_counts(normalize=True) * 100
print(f"Position_Distribution_1: {position_distribution_1}")

# swapped position
results_2 = []

for _, row in tqdm(df_final_subset.iterrows(), total=num_rows, desc="Progress"):
    result = evaluate(
        question=row["question"],
        response_A=row["response_B"], # positions swap 
        response_B=row["response_A"], # positions swap 
        label=row["label"],
    )

    # position swap back
    if result["system_decision"] == "A":
        result["system_decision"] = "B"
    elif result["system_decision"] == "B":
        result["system_decision"] = "A"

    if result["is_correct"] == True:
        result["is_correct"] = False
    elif result["is_correct"] == False:
        result["is_correct"] = True

    results_2.append(result)

results_2_df = pd.DataFrame(results_2)
results_2_df.to_csv('Results/single_2.csv', index=False)

accuracy_2 = results_2_df["is_correct"].mean()
print(f"Accuracy_2: {accuracy_2:.2%}")

position_distribution_2 = results_2_df["system_decision"].value_counts(normalize=True) * 100
print(f"Position_Distribution_2: {position_distribution_2}")

Progress:   0%|          | 0/10 [00:00<?, ?it/s]

[33minitializer[0m (to Evaluator-Agent):

 
    Question: Under standard temperature and pressure conditions, compare the relative rates at which inert gases,Ar, He, and Kr diffuse through a common orifice.
(A) .1002 : .3002 : .4002
(B) .3582 : .4582 : .0582
(C) .2582 : .4998 : .3092
(D) .1582 : .6008 : .2092
(E) .1582 : .4998 : .1092
(F) .2002 : .4002 : .1092
(G) .1582 : .3998 : .2592
(H) .2502 : .4502 : .1502
(I) .2082 : .5998 : .1592
(J) .1802 : .4802 : .2802
If you cannot determine the correct multiple-choice answer, take your best guess. Once you have your answer, please duplicate that letter five times in a single string. For example, if the answer is K, then write KKKKK.
Let's think step by step.

    Response A: Let's approach this step-by-step:

1) The rate of diffusion of gases is inversely proportional to the square root of their molecular masses. This is known as Graham's Law.

2) We need to find the relative rates, so we'll use the formula: 
   Rate ∝ 1/√(Molecular Mass)


--------------------------------------------------------------------------------
[33mEvaluator-Agent[0m (to initializer):

As an objective Evaluator-Agent I think both responses correctly apply Graham's Law of Diffusion to determine the relative rates of diffusion for Argon (Ar), Helium (He), and Krypton (Kr). They both accurately calculate the diffusion rates based on the molecular masses of the gases. However, the final conclusions drawn by each response differ.

Response A concludes that the closest match to the normalized rates is option D (0.1582 : 0.6008 : 0.2092), while Response B concludes that the closest match is option E (0.1582 : 0.4998 : 0.1092). 

Upon reviewing the calculations, Response A's normalization of the rates is slightly off, as it does not match the expected ratios based on the calculated values. Response B, while also making a slight error in the final matching, is closer to the expected ratios based on the calculations provided. 

Thus, while both response

Progress:  20%|██        | 2/10 [00:00<00:00, 16.79it/s]

ChatResult(chat_id=None, chat_history=[{'content': " \n    Question: The total cost of producing x cameras is C(x) = 2 + x^3. What is the average cost if 10 cameras are made? What is the marginal cost of producing 10 cameras?\n(A) Average Cost: $1000/unit, Marginal Cost: $100\n(B) Average Cost: $300/unit, Marginal Cost: $100\n(C) Average Cost: $300/unit, Marginal Cost: $1000\n(D) Average Cost: $102/unit, Marginal Cost: $301\n(E) Average Cost: $500/unit, Marginal Cost: $1000\n(F) Average Cost: $100/unit, Marginal Cost: $100\n(G) Average Cost: $200/unit, Marginal Cost: $300\n(H) Average Cost: $2/unit, Marginal Cost: $30\n(I) Average Cost: $1020/unit, Marginal Cost: $299\n(J) Average Cost: $100/unit, Marginal Cost: $300\nIf you cannot determine the correct multiple-choice answer, take your best guess. Once you have your answer, please duplicate that letter five times in a single string. For example, if the answer is K, then write KKKKK.\nLet's think step by step.\n\n    Response A: Let's 

Progress:  50%|█████     | 5/10 [00:00<00:00, 19.68it/s]

ChatResult(chat_id=None, chat_history=[{'content': " \n    Question: There are 3 people standing in a line numbered 1 through 3 in a left to right order.\nEach person has a set of attributes: Food, Nationality, Hobby.\nThe attributes have the following possible values:\n- Food: nectarine, garlic, cucumber\n- Nationality: chinese, japanese, thai\n- Hobby: magic-tricks, filmmaking, puzzles\nand exactly one person in the line has a given value for an attribute.\n\nGiven the following premises about the line of people:\n- the person that likes garlic is on the far left\n- the person who is thai is somewhere to the right of the person who likes magic-tricks\n- the person who is chinese is somewhere between the person that likes cucumber and the person who likes puzzles\n\nAnswer the following question:\nWhat is the hobby of the person who is thai? Return your answer as a single word, in the following format: ***X***, where X is the answer.\n\n    Response A: Let's solve this step by step:\n

Progress:  80%|████████  | 8/10 [00:00<00:00, 21.12it/s]

ChatResult(chat_id=None, chat_history=[{'content': " \n    Question: A radioactive sample contains two different isotopes, A and B. A has a half-life of 3 days, and B has a half-life of 6 days. Initially in the sample there are twice as many atoms of A as of B. In how many days will the ratio of the number of atoms of A to B be reversed?\n(A) 15.0\n(B) 18.0\n(C) 12.0\n(D) 24.0\n(E) 9.0\n(F) 6.0\n(G) 21.0\n(H) 27.0\n(I) 10.5\n(J) 3.0\nIf you cannot determine the correct multiple-choice answer, take your best guess. Once you have your answer, please duplicate that letter five times in a single string. For example, if the answer is K, then write KKKKK.\nLet's think step by step.\n\n    Response A: Let's approach this step-by-step:\n\n1) Let's say initially there are 100 atoms of A and 50 atoms of B.\n\n2) We need to find when the ratio of A to B becomes 1:2 (reversed from the initial 2:1).\n\n3) The decay of radioactive isotopes follows the formula: N(t) = N₀ * (1/2)^(t/t₁/₂)\n   Where N(

Progress: 100%|██████████| 10/10 [00:00<00:00, 20.77it/s]


ChatResult(chat_id=None, chat_history=[{'content': " \n    Question: You are given a 0-indexed integer array nums.\nA subsequence of nums having length k and consisting of indices i_0 < i_1 < ... < i_k-1 is balanced if the following holds:\n\nnums[i_j] - nums[i_j-1] >= i_j - i_j-1, for every j in the range [1, k - 1].\n\nA subsequence of nums having length 1 is considered balanced.\nReturn an integer denoting the maximum possible sum of elements in a balanced subsequence of nums.\nA subsequence of an array is a new non-empty array that is formed from the original array by deleting some (possibly none) of the elements without disturbing the relative positions of the remaining elements.\n \nExample 1:\n\nInput: nums = [3,3,5,6]\nOutput: 14\nExplanation: In this example, the subsequence [3,5,6] consisting of indices 0, 2, and 3 can be selected.\nnums[2] - nums[0] >= 2 - 0.\nnums[3] - nums[2] >= 3 - 2.\nHence, it is a balanced subsequence, and its sum is the maximum among the balanced subs

Progress:   0%|          | 0/10 [00:00<?, ?it/s]

[33minitializer[0m (to Evaluator-Agent):

 
    Question: Under standard temperature and pressure conditions, compare the relative rates at which inert gases,Ar, He, and Kr diffuse through a common orifice.
(A) .1002 : .3002 : .4002
(B) .3582 : .4582 : .0582
(C) .2582 : .4998 : .3092
(D) .1582 : .6008 : .2092
(E) .1582 : .4998 : .1092
(F) .2002 : .4002 : .1092
(G) .1582 : .3998 : .2592
(H) .2502 : .4502 : .1502
(I) .2082 : .5998 : .1592
(J) .1802 : .4802 : .2802
If you cannot determine the correct multiple-choice answer, take your best guess. Once you have your answer, please duplicate that letter five times in a single string. For example, if the answer is K, then write KKKKK.
Let's think step by step.

    Response A: Let's approach this step-by-step:

1) The rate of diffusion of gases is inversely proportional to the square root of their molecular masses. This is known as Graham's Law of Diffusion.

2) The atomic masses of the gases are:
   Ar (Argon): 39.95 g/mol
   He (Helium): 

Progress:  30%|███       | 3/10 [00:00<00:00, 20.10it/s]

[33minitializer[0m (to Evaluator-Agent):

 
    Question: What is a margin requirement?
(A) The financial reserve a trader must maintain to cover potential losses on leveraged positions
(B) Percentage of the value of a stock purchase required to be paid immediately
(C) The maximum percentage of a portfolio that can be invested in a single stock
(D) The total cost of a stock purchase
(E) The commission fee paid to a broker for executing a stock trade
(F) The minimum number of shares required to be purchased in a stock transaction
(G) The profit margin that a company must maintain on its stock price
(H) The amount of money required to open a stock account
(I) The interest rate on a stock purchase
(J) The insurance premium for protecting a stock portfolio against losses
If you cannot determine the correct multiple-choice answer, take your best guess. Once you have your answer, please duplicate that letter five times in a single string. For example, if the answer is K, then write KKKKK.


Progress:  60%|██████    | 6/10 [00:00<00:00, 20.19it/s]

[33minitializer[0m (to Evaluator-Agent):

 
    Question: There are 3 people standing in a line numbered 1 through 3 in a left to right order.
Each person has a set of attributes: Nationality, Hobby, Pet, Music-Genre.
The attributes have the following possible values:
- Nationality: malaysian, japanese, mexican
- Hobby: baking, collecting, hiking
- Pet: snake, guinea-pig, dog
- Music-Genre: funk, pop, electronic
and exactly one person in the line has a given value for an attribute.

Given the following premises about the line of people:
- the person that has a dog is somewhere to the left of the person who is mexican
- the person who listens to pop is not anywhere to the right of the person who listens to electronic
- the person that has a dog is not anywhere to the left of the person who listens to electronic
- the person who listens to electronic is somewhere to the right of the person who likes hiking
- the person who likes hiking does not own a snake
- the person who is malaysian

Progress:  90%|█████████ | 9/10 [00:00<00:00, 20.41it/s]

ChatResult(chat_id=None, chat_history=[{'content': " \n    Question: Compute the sample standard deviation of ${9, 14, 5, 4, -20, -13, -5, 13}$. Please put your final answer in a $\\\\boxed{}$.\n\n    Response A: Let's approach this step-by-step:\n\n1) First, we need to calculate the mean (average) of the dataset.\n   Mean = (9 + 14 + 5 + 4 + (-20) + (-13) + (-5) + 13) ÷ 8 = 7 ÷ 8 = 0.875\n\n2) Now, we subtract the mean from each data point and square the result:\n   (9 - 0.875)² = 66.015625\n   (14 - 0.875)² = 171.390625\n   (5 - 0.875)² = 16.890625\n   (4 - 0.875)² = 9.765625\n   (-20 - 0.875)² = 435.890625\n   (-13 - 0.875)² = 192.390625\n   (-5 - 0.875)² = 34.515625\n   (13 - 0.875)² = 147.015625\n\n3) We sum these squared differences:\n   66.015625 + 171.390625 + 16.890625 + 9.765625 + 435.890625 + 192.390625 + 34.515625 + 147.015625 = 1073.875\n\n4) We divide this sum by (n-1), where n is the number of data points. Here, n = 8.\n   1073.875 ÷ 7 = 153.410714\n\n5) Finally, we take

Progress: 100%|██████████| 10/10 [00:00<00:00, 19.92it/s]

Accuracy_2: 50.00%
Position_Distribution_2: system_decision
B    80.0
A    20.0
Name: proportion, dtype: float64



