In [None]:
%pip install groq requests pandas


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Pairwise Judgment Evaluator with Swap Test
import getpass
import os
import re
import time
from typing import Tuple, Optional

from groq import Groq

# Using Groq API
# Get API key at: https://console.groq.com/keys
# Do not paste the key into this notebook. Export GROQ_API_KEY before starting
# Jupyter, or type it at the hidden prompt below. A key that is already present
# in the environment is kept as-is instead of being overwritten.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Groq API key: ")

# Shared client used by judge_pair() for every judging request.
client = Groq(api_key=os.environ["GROQ_API_KEY"])

def _parse_verdict(raw_reply: str) -> str:
    """Map the judge model's free-text reply onto "A", "B", or "tie"."""
    text = raw_reply.strip()

    # An explicit tie takes precedence over any letter found in the reply.
    if re.search(r"\btie[ds]?\b", text, re.IGNORECASE):
        return "tie"

    # Only a standalone capital letter counts as a verdict. Word boundaries keep
    # "Answer: B" from being read as "A" and "Both ..." from being read as "B";
    # case-sensitivity keeps the article "a" from being read as a verdict.
    standalone = re.search(r"\b([AB])\b", text)
    if standalone:
        return standalone.group(1)

    # Accept a lone lowercase letter (e.g. "b." or "*a*") as a verdict too.
    bare = text.strip(" \t\r\n.,:;!\"'`*()[]").upper()
    if bare in ("A", "B"):
        return bare

    # Anything unrecognisable is treated as no preference.
    return "tie"


def judge_pair(response_a: str, response_b: str) -> str:
    """
    Judge which response is better (A or B) using Groq API.

    Args:
        response_a: Text shown to the judge under the label "Response A".
        response_b: Text shown to the judge under the label "Response B".

    Returns: "A", "B", or "tie"

    Any exception raised while calling the API is printed and reported as
    "tie", so callers cannot tell a failed request from a genuine tie.
    """

    prompt = f"""You are evaluating two responses. Which is better? Consider accuracy, clarity, and completeness.

Response A:
{response_a}

Response B:
{response_b}

Respond with ONLY: A, B, or tie"""

    try:
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=10
        )
        # Defensive: fall back to an empty reply if the content field is None.
        raw_reply = response.choices[0].message.content or ""
    except Exception as e:
        print(f"  Groq API Error: {e}")
        return "tie"

    return _parse_verdict(raw_reply)


In [9]:
# Swap Test Implementation
def swap_test(response_a: str, response_b: str) -> Tuple[str, bool, Tuple[str, str]]:
    """
    Judge the pair twice, once in each presentation order, and reconcile.

    The first call shows response_a under label "A"; the second call shows
    response_b under label "A". The labels returned by the two calls are then
    compared to see whether they point at the same underlying response.

    Returns: (final_judgment, is_consistent, (judgment_1, judgment_2))
    - final_judgment: "A", "B", or "tie", expressed in the original ordering;
      "tie" whenever the two calls disagree
    - is_consistent: True only for (A, B), (B, A) or (tie, tie)
    - (judgment_1, judgment_2): the raw labels from each call, for logging
    """
    # Label pairs that describe the same conclusion in both orderings, mapped
    # to the winner in the original (A vs B) ordering.
    agreeing_outcomes = {
        ("A", "B"): "A",      # response_a preferred both times
        ("B", "A"): "B",      # response_b preferred both times
        ("tie", "tie"): "tie",
    }

    forward_verdict = judge_pair(response_a, response_b)
    time.sleep(0.1)  # brief pause between the two API calls
    swapped_verdict = judge_pair(response_b, response_a)

    verdicts = (forward_verdict, swapped_verdict)
    winner = agreeing_outcomes.get(verdicts)

    if winner is None:
        # Every other combination is treated as inconsistent and collapsed to a tie.
        return "tie", False, verdicts
    return winner, True, verdicts

# Emit a confirmation so that running this definition-only cell shows visible output.
print("Swap test function defined.")


Swap test function defined.


In [10]:
# Test Cases
# Each case is a dict with a display "name" and the two candidate answers,
# "response_a" and "response_b", that are handed to swap_test().
test_cases = [
    # Detailed answer first, one-line answer second.
    dict(
        name="Longer vs Shorter (both correct)",
        response_a="Quantum computing is a computational paradigm that leverages quantum mechanical phenomena like superposition and entanglement to perform calculations. It uses qubits instead of classical bits, allowing for exponentially more computational states.",
        response_b="Quantum computing uses quantum mechanics for faster calculations.",
    ),
    # One answer names Paris, the other names London.
    dict(
        name="Correct vs Incorrect",
        response_a="The capital of France is Paris. It has been the capital since 987 AD.",
        response_b="The capital of France is London.",
    ),
    # Multi-sentence definition first, terse definition second.
    dict(
        name="Verbose vs Concise (both correct)",
        response_a="Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. It uses algorithms to analyze data, identify patterns, and make decisions or predictions.",
        response_b="Machine learning is AI that learns from data.",
    ),
    # Two one-sentence answers of comparable length.
    dict(
        name="Similar quality",
        response_a="Python is a high-level programming language known for its simplicity and readability.",
        response_b="Python is an easy-to-read programming language with powerful features.",
    ),
    # Short answer placed first, detailed answer second.
    dict(
        name="First bias test",
        response_a="Deep learning uses neural networks.",
        response_b="Deep learning utilizes artificial neural networks with multiple layers to learn complex patterns from data through backpropagation and gradient descent optimization.",
    ),
]


In [11]:

# Run the swap test on every case, log each outcome, and tally how often the
# judge's verdict survives swapping the presentation order.
results = []           # one summary dict per case
consistent_count = 0   # cases where both orderings agreed
bias_instances = []    # names of cases where the two orderings disagreed

for i, case in enumerate(test_cases, 1):
    print(f"\nTest {i}: {case['name']}")
    print(f"  Response A length: {len(case['response_a'])} chars")
    print(f"  Response B length: {len(case['response_b'])} chars")

    final_judgment, is_consistent, (judgment_1, judgment_2) = swap_test(
        case['response_a'], case['response_b']
    )

    results.append({
        'name': case['name'],
        'judgment_1': judgment_1,
        'judgment_2': judgment_2,
        'final': final_judgment,
        'consistent': is_consistent
    })

    if is_consistent:
        consistent_count += 1
        print(f"  Consistent: {judgment_1} (A vs B) → {judgment_2} (B vs A)")
        print(f"  Final judgment: {final_judgment}")
    else:
        bias_instances.append(case['name'])
        print(f"  INCONSISTENT: {judgment_1} (A vs B) but {judgment_2} (B vs A)")
        print("  Bias detected! Defaulting to TIE")

# An empty test_cases list would make the percentage divide by zero, so report
# "n/a" in that case instead of raising.
total_tests = len(test_cases)
if total_tests:
    agreement_rate = f"{consistent_count/total_tests*100:.1f}%"
else:
    agreement_rate = "n/a"

print(f"Total tests: {total_tests}")
print(f"Consistent judgments: {consistent_count}")
print(f"Inconsistent judgments (bias): {len(bias_instances)}")
print(f"Agreement rate: {agreement_rate}")
print(f"\nBias instances: {bias_instances if bias_instances else 'None'}")


Test 1: Longer vs Shorter (both correct)
  Response A length: 246 chars
  Response B length: 65 chars
  Consistent: A (A vs B) → B (B vs A)
  Final judgment: A

Test 2: Correct vs Incorrect
  Response A length: 69 chars
  Response B length: 32 chars
  Consistent: A (A vs B) → B (B vs A)
  Final judgment: A

Test 3: Verbose vs Concise (both correct)
  Response A length: 240 chars
  Response B length: 45 chars
  Consistent: A (A vs B) → B (B vs A)
  Final judgment: A

Test 4: Similar quality
  Response A length: 85 chars
  Response B length: 70 chars
  Consistent: A (A vs B) → B (B vs A)
  Final judgment: A

Test 5: First bias test
  Response A length: 35 chars
  Response B length: 165 chars
  Consistent: B (A vs B) → A (B vs A)
  Final judgment: B
Total tests: 5
Consistent judgments: 5
Inconsistent judgments (bias): 0
Agreement rate: 100.0%

Bias instances: None
