# Parallel Interaction on JudgeBench with Position Swapping

Please see SOURCES.txt for further source information.

## Imports

In [None]:
import pandas as pd
from datasets import load_dataset
import os
from dotenv import load_dotenv
from autogen import ConversableAgent
import re
from tqdm import tqdm
from collections import Counter

## Data

In [None]:
# Fetch the "claude" split of JudgeBench and draw a reproducible 100-row sample
JudgeBench_Claude = load_dataset("ScalerLab/JudgeBench", split="claude")
df = pd.DataFrame(JudgeBench_Claude)

# A fixed random_state keeps the sampled rows identical across runs
df_sampled = df.sample(n=100, random_state=42)
df_sampled = df_sampled.reset_index(drop=True)

# Keep only the four columns selected for the evaluation
df_final = df_sampled[["question", "response_A", "response_B", "label"]]

# Quick sanity check of the resulting frame
print(df_final.info())
print(df_final.head(1))

## Config

In [None]:
# Read the Azure OpenAI settings from the environment (after loading the .env file)
load_dotenv()

api_key = os.environ.get("AZURE_OPENAI_API_KEY")
endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
deployment_name = os.environ.get("AZURE_DEPLOYMENT_NAME")
api_version = os.environ.get("AZURE_API_VERSION")

# Start of code citation [1]
# The following code is adapted from the source above.

# Single-entry model configuration used by every agent for Azure OpenAI access
config_list = [
    dict(
        model=deployment_name,
        api_key=api_key,
        base_url=(
            f"{endpoint}/openai/deployments/{deployment_name}"
            f"/chat/completions?api-version={api_version}"
        ),
        api_type="azure",
        api_version=api_version,
        temperature=0,
        cache_seed=42,
    )
]

# End of code citation [1]

## System Design

In [None]:
# System prompts for the three judging agents.
# Nothing is interpolated into these texts, so they are plain (non-f) strings
# and the JSON example is written with single braces.

# Scientific-Agent: judges factual correctness from a research perspective
agent_1_system_message = """
You are a Scientific-Agent with a background in academic research and critical thinking.
Your task is to assess which of Response A or Response B is more factually correct. 
Focus on whether the statements are logically sound, consistent with known facts and science.
Give an explanation for your decision using about 200 words.
Always begin your output with "As a Scientific-Agent I think ..."
Always end your output with stating if Response A or B is more factually correct by using a JSON-object with the following format: {"response": A/B}
"""

# Logical-Agent: judges internal consistency, clarity and coherence
agent_2_system_message = """
You are a Logical-Agent specializing in evaluating argument consistency, clarity and coherence.
Your task is to analyze Response A and Response B in terms of internal logical structure and factual plausibility.
Look for contradictions, fallacies, and misleading reasoning, even if subtle.
Give an explanation for your decision using about 200 words.
Always begin your output with: "As a Logical-Agent I think ..."
Always end your output with stating if Response A or B is more factually correct by using a JSON-object with the following format: {"response": A/B}
"""

# Domain-Expert-Agent: judges practical, real-world correctness
agent_3_system_message = """
You are a Domain-Expert-Agent with applied experience in the relevant field.
Your task is to judge which of Response A or Response B reflects real-world knowledge and practical correctness more accurately.
Focus on practical validity, typical use cases, and whether the information would hold in applied scenarios.
Give an explanation for your decision using about 200 words.
Always begin your output with: "As a Domain-Expert-Agent I think ..."
Always end your output with stating if Response A or B is more factually correct by using a JSON-object with the following format: {"response": A/B}
"""

In [None]:
# Start of code citation [1]
# The following code is adapted from the source above.

# Build the ConversableAgents: an "initializer" without a custom system
# prompt, plus the three judging agents with their own system prompts.
# None of them ever asks for human input.
initializer = ConversableAgent(
    name="initializer",
    human_input_mode="NEVER",
    llm_config={"config_list": config_list},
)

agent_1 = ConversableAgent(
    name="Scientific-Agent",
    system_message=agent_1_system_message,
    human_input_mode="NEVER",
    llm_config={"config_list": config_list},
)

agent_2 = ConversableAgent(
    name="Logical-Agent",
    system_message=agent_2_system_message,
    human_input_mode="NEVER",
    llm_config={"config_list": config_list},
)

agent_3 = ConversableAgent(
    name="Domain-Expert",
    system_message=agent_3_system_message,
    human_input_mode="NEVER",
    llm_config={"config_list": config_list},
)

# End of code citation [1]

In [None]:
# Define the evaluation function with majority voting
def evaluate(question, response_A, response_B, label):
    """Let the three agents judge one response pair and majority-vote the result.

    Each agent receives the same message (question plus both responses) in a
    separate single-turn chat started by ``initializer``. The verdict of each
    agent is parsed from the ``"response": A/B`` JSON fragment in its output.

    Args:
        question: The question both responses answer.
        response_A: Text presented to the agents as "Response A".
        response_B: Text presented to the agents as "Response B".
        label: Ground-truth label string. A label containing "A>" is read as
            "A is the better response"; any other label is read as "B".

    Returns:
        A dict with the keys
        ``"system_decision"`` ("A" or "B" by majority vote, or "X" when at
        least one agent produced no parsable verdict),
        ``"ground_truth"`` ("A" or "B") and
        ``"is_correct"`` (bool; always False for an "X" decision).
    """

    message = f""" 
    Question: {question}

    Response A: {response_A}

    Response B: {response_B}
    """

    agents = [agent_1, agent_2, agent_3]
    decisions = []

    pattern = r'"response"\s*:\s*"?([AB])"?'

    ground_truth = "A" if "A>" in label else "B"

    for agent in agents:

        # Start of code citation [1]
        # The following code is adapted from the source above.

        result = initializer.initiate_chat(agent, message=message, max_turns=1)

        # End of code citation [1]

        result_str = str(result)

        # The agents are instructed to END their output with the verdict, so
        # the last match is the decision. Taking the first match could pick
        # up an earlier mention of the JSON format (in the reply's own
        # explanation, or in prompt text if the stringified result includes
        # it -- NOTE(review): confirm the ChatResult layout).
        matches = re.findall(pattern, result_str)
        # If pattern not found, assign "X" to indicate invalid response
        decisions.append(matches[-1] if matches else "X")

    # A single unparsable agent invalidates the whole vote
    if "X" in decisions:
        return {
            "system_decision": "X",
            "ground_truth": ground_truth,
            "is_correct": False
        }

    # Three voters and two options: the most common verdict is the majority
    system_decision = Counter(decisions).most_common(1)[0][0]
    is_correct = system_decision == ground_truth

    return {
        "system_decision": system_decision,
        "ground_truth": ground_truth,
        "is_correct": is_correct
    }


In [None]:
# Prepare evaluation data
num_rows = 100
df_final_subset = df_final.head(num_rows)

# Create the output folder before any LLM call is made, so a missing
# directory cannot cause the results to be lost when they are written.
os.makedirs("Results", exist_ok=True)

# Evaluate responses in normal position
results_1 = []
# head() may return fewer than num_rows rows, so size the progress bar from the data
for _, row in tqdm(df_final_subset.iterrows(), total=len(df_final_subset), desc="Progress"):
    result = evaluate(
        question=row["question"],
        response_A=row["response_A"],
        response_B=row["response_B"],
        label=row["label"],
    )
    results_1.append(result)

results_1_df = pd.DataFrame(results_1)
results_1_df.to_csv('Results/parallel_1.csv', index=False)

# Evaluate responses in swapped positions
swap_back = {"A": "B", "B": "A"}
results_2 = []
for _, row in tqdm(df_final_subset.iterrows(), total=len(df_final_subset), desc="Progress"):
    result = evaluate(
        question=row["question"],
        response_A=row["response_B"],  # swap positions
        response_B=row["response_A"],  # swap positions
        label=row["label"],
    )

    # evaluate() judged the swapped order, while the label (and therefore
    # ground_truth) still refers to the original order. Map the decision back
    # to the original positions; an invalid "X" decision stays "X".
    decision = result["system_decision"]
    result["system_decision"] = swap_back.get(decision, decision)

    # Recompute correctness from the remapped decision instead of negating the
    # flag: negation would turn an invalid "X" result (is_correct=False) into
    # a correct one. "X" never equals the ground truth, so it stays False.
    result["is_correct"] = result["system_decision"] == result["ground_truth"]

    results_2.append(result)

results_2_df = pd.DataFrame(results_2)
results_2_df.to_csv('Results/parallel_2.csv', index=False)