In [21]:
# 1. Setup config

import pandas as pd
import json
import random
import time
from typing import List, Dict, Tuple
from dataclasses import dataclass
from pathlib import Path
import logging
from datetime import datetime

# The root logger is held at WARNING so third-party libraries stay quiet;
# only the notebook's own logger (below) is opened up to INFO.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.WARNING,
)

# Dedicated logger shared by every cell of this notebook.
logger = logging.getLogger("SlopRankLogger")
logger.setLevel(logging.INFO)


In [22]:
# 2. Configuration (EvalConfig)

@dataclass
class EvalConfig:
    """Configuration for the evaluation system.

    Attributes:
        model_names: Identifiers of the models that answer the prompts and
            also act as judges of each other's answers.
        evaluation_method: Evaluation mode; must be 1 or 2 (1 => numeric rating).
        use_subset_evaluation: When True, each judge rates only a sample of
            the other models instead of all of them.
        evaluators_subset_size: Number of models sampled per judge; must be
            strictly smaller than ``len(model_names)``.
        output_dir: Directory for the generated output files. A plain string
            is accepted and converted to ``Path``. The directory is created
            when the config is constructed.
        request_delay: Seconds to sleep between model requests (0 disables).

    Raises:
        ValueError: If ``evaluation_method`` is not 1 or 2, or if
            ``evaluators_subset_size`` is not smaller than the number of models.
    """
    model_names: List[str]
    evaluation_method: int  # e.g., 1 => numeric rating
    use_subset_evaluation: bool
    evaluators_subset_size: int
    output_dir: Path
    request_delay: float = 0.0  # adjustable delay between requests if needed

    def __post_init__(self):
        # Validate before touching the filesystem so that a rejected config
        # does not leave an empty output directory behind.
        if self.evaluation_method not in {1, 2}:
            raise ValueError("evaluation_method must be 1 or 2")
        if self.evaluators_subset_size >= len(self.model_names):
            raise ValueError("evaluators_subset_size must be < number of models")

        # Accept str / os.PathLike as well as Path.
        self.output_dir = Path(self.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

# Default run: numeric ratings, each judge rates a sample of three other
# models, outputs go to ./results (created when this config is constructed).
DEFAULT_CONFIG = EvalConfig(
    output_dir=Path("results"),  # folder for CSV outputs
    evaluation_method=1,  # numeric
    use_subset_evaluation=True,
    evaluators_subset_size=3,
    request_delay=0.0,
    model_names=[
        "gemini-2.0-flash-thinking-exp-1219",
        "gemini-exp-1206",
        "claude-3-5-sonnet-latest",
        "o1-preview",
        "gpt-4o",
        "deepseek-chat",
        # "groq-llama-3.3-70b"
    ],
)


In [23]:
# 3. Read prompts
# Expects a local "prompts.xlsx" file with a "Questions" column and, optionally,
# an "Answer_key" column.

from dotenv import load_dotenv
load_dotenv()  # if you have .env credentials

logger.info("Reading prompts from prompts.xlsx ...")
prompts_df = pd.read_excel("prompts.xlsx", sheet_name=0)
prompts = prompts_df["Questions"].tolist()

# If "Answer_key" column exists, read it; otherwise fallback to None
if "Answer_key" in prompts_df.columns:
    # Blank cells are read as NaN, which is truthy and would later be rendered
    # as the literal answer key "nan". Normalise them to None so that
    # "no answer key" is represented the same way in both branches.
    answer_keys = [
        None if pd.isna(key) else key
        for key in prompts_df["Answer_key"].tolist()
    ]
else:
    logger.warning("No Answer_key column found; using None.")
    answer_keys = [None] * len(prompts_df)

# One (question, answer_key_or_None) tuple per spreadsheet row.
prompt_pairs = list(zip(prompts, answer_keys))
logger.info(f"Loaded {len(prompt_pairs)} prompts from Excel.")


2025-01-13 23:10:50,274 - INFO - Reading prompts from prompts.xlsx ...
2025-01-13 23:10:50,290 - INFO - Loaded 37 prompts from Excel.


In [24]:
# 4. Collecting the responses (with partial checks)

def collect_responses(prompt_pairs: List[Tuple[str, str]], config: "EvalConfig", llm_module) -> pd.DataFrame:
    """
    Query each model with each prompt, skipping any (prompt, model) pairs
    already found in the existing responses.csv.

    Args:
        prompt_pairs: (prompt, answer_key) tuples; answer_key may be None.
        config: Provides ``output_dir`` (where responses.csv is looked up),
            ``model_names`` and ``request_delay``.
        llm_module: Object exposing ``get_model(name)``; the returned model must
            support ``model.prompt(text).text()``.

    Returns:
        The combined DataFrame of previously stored and newly collected rows with
        columns (prompt, model, response, is_valid, response_time, Answer_key,
        token_count), one row per (prompt, model). The frame is NOT written to
        disk here; the caller decides when to save it.

    Notes:
        * A response counts as valid when it is a string of at least 10
          non-whitespace-trimmed characters; invalid or failed requests are
          recorded with ``response=None`` and ``is_valid=False``.
        * ``token_count`` is a whitespace word count, not a tokenizer count.
        * A prompt that occurs more than once in ``prompt_pairs`` is only
          queried once per model.
    """
    columns = ["prompt", "model", "response", "is_valid",
               "response_time", "Answer_key", "token_count"]

    logger.info("Collecting responses (with partial coverage check)...")

    # 1) Try to load existing responses and index them for O(1) lookups
    resp_path = config.output_dir / "responses.csv"
    existing_responses_df = None
    already_answered = set()
    if resp_path.exists():
        logger.info(f"Found existing responses at {resp_path}, will skip duplicates.")
        existing_responses_df = pd.read_csv(resp_path)
        already_answered = set(
            zip(existing_responses_df["prompt"], existing_responses_df["model"])
        )
    else:
        logger.info("No existing responses file found; we'll collect everything from scratch.")

    new_rows = []
    total_start = time.time()

    # 2) For each (prompt, answer_key) pair
    for i, (prompt, answer_key) in enumerate(prompt_pairs, 1):
        logger.info(f"Processing prompt {i}/{len(prompt_pairs)}: {prompt[:60]}...")
        for model_name in config.model_names:
            # Skip if we already have a row for (prompt, model_name)
            if (prompt, model_name) in already_answered:
                logger.info(f"Skipping existing response for model={model_name}, prompt={prompt[:40]}...")
                continue
            # Remember the pair so a repeated prompt is not queried twice.
            already_answered.add((prompt, model_name))

            # Otherwise, query the model now
            start_time = time.time()
            logger.info(f"Querying {model_name} for new response...")
            try:
                model = llm_module.get_model(model_name)
                raw_response = model.prompt(prompt).text()
            except Exception as e:
                # Best effort: one failing model must not abort the whole run.
                elapsed = time.time() - start_time
                logger.error(f"Error from {model_name} after {elapsed:.2f}s: {str(e)}")
                new_rows.append({
                    'prompt': prompt,
                    'model': model_name,
                    'response': None,
                    'is_valid': False,
                    'response_time': elapsed,
                    'Answer_key': answer_key,
                    'token_count': 0
                })
            else:
                elapsed = time.time() - start_time
                is_text = isinstance(raw_response, str)
                valid = is_text and len(raw_response.strip()) >= 10
                # A non-string reply has no words to count.
                tokens_used = len(raw_response.split()) if is_text else 0

                new_rows.append({
                    'prompt': prompt,
                    'model': model_name,
                    'response': raw_response if valid else None,
                    'is_valid': valid,
                    'response_time': elapsed,
                    'Answer_key': answer_key,
                    'token_count': tokens_used
                })
                logger.info(
                    f"{model_name} responded in {elapsed:.2f}s - {'Valid' if valid else 'Invalid'}"
                )

            if config.request_delay > 0.0:
                time.sleep(config.request_delay)

    total_time = time.time() - total_start
    logger.info(f"Response collection done in {total_time:.2f}s")

    # 3) Combine with existing responses if any. Passing `columns` keeps the
    #    schema stable even when nothing new was collected.
    new_df = pd.DataFrame(new_rows, columns=columns)
    if existing_responses_df is None:
        return new_df

    if new_df.empty:
        combined_df = existing_responses_df
    else:
        combined_df = pd.concat([existing_responses_df, new_df], ignore_index=True)

    # Stored rows win over new ones; re-number so the index has no gaps.
    return (
        combined_df
        .drop_duplicates(subset=["prompt", "model"], keep="first")
        .reset_index(drop=True)
    )


In [25]:
# 5. Collecting Raw Evaluations (Unparsed), with partial checks

def _build_judge_prompt(prompt, answers_section, answer_key_edited) -> str:
    """Render the instruction text sent to a judge model for one prompt."""
    return f"""
You are an expert evaluator tasked with assessing the quality of responses from different language models. Your goal is to provide accurate and unbiased ratings based on a given problem, answer key, and set of criteria.

First, carefully read the following information:

Here is the original problem or prompt:
<problem>
{prompt}
</problem>

Here are the answers provided by different models:
<answers_section>
{answers_section}
</answers_section>

Here is the answer key to guide your evaluation. It will tell you what could be considered GOOD and BAD, so you can rate appropriately:
<answer_key>
{answer_key_edited}
</answer_key>

Your task is to evaluate the answers provided by all the models (Model_1, Model_2, etc.) based on these criteria:
1. Accuracy: How well does the answer align with the information as per the answer key?
2. Completeness: Does the answer cover all necessary aspects of the problem?
3. Clarity: Is the answer easy to understand and well-structured?
4. Relevance: Does the answer directly address the given problem?

For each model, you will provide a rating on a scale of 1 to 10 for each criterion, where:
- 10: Exceptional, world-class, zero errors and all the relevant nuances.
- 8-9: Excellent, like a top professional in the field. Not perfect though.
- 6-7: Good, like a competent undergraduate student. Doesn't stand out. Average.
- 4-5: Fair, like an average high school student. Barely satisfactory.
- 1-3: Poor. Factually incorrect and wrong logic.

Please follow the following process to evaluate each model, in markdown format:
1. Read the problem, answer key, and the model's answer carefully.
2. For each criterion:
   a. Think through the key points from the answer that relate to this criterion.
   b. Consider strengths and weaknesses.
   c. Provide a score.
3. Calculate an overall score.

Wrap your detailed evaluation for each model in <detailed_evaluation> tags.

Provide your final ratings in a JSON object with the following structure:
{{"Model_1": X, "Model_2": Y}}
Where X and Y are integer values between 1 and 10.

Remember:
- Adhere strictly to the JSON format specified above, i.e., put the response inside a curly bracket.
- Provide neutral and accurate ratings based solely on the answer key and the given criteria.
- Ensure that your evaluation is thorough and justified.

Begin your evaluation now.""".strip()


def collect_raw_evaluations(responses_df: pd.DataFrame, config: "EvalConfig", llm_module, seed: int = 0) -> pd.DataFrame:
    """
    Each model in config.model_names evaluates (rates) the others' responses,
    skipping any (prompt, judge_model, model_mapping) that already has a
    successful judgment in raw_evaluations.csv.

    Args:
        responses_df: Frame with at least the columns ``prompt``, ``model`` and
            ``response``; ``Answer_key`` is optional.
        config: Provides ``output_dir``, ``model_names``,
            ``use_subset_evaluation`` and ``evaluators_subset_size``.
        llm_module: Object exposing ``get_model(name)``; the returned model must
            support ``model.prompt(text).text()``.
        seed: Mixed into the per-(judge, prompt) random generator that picks
            the subset of models to rate. Change it to draw different subsets.

    Returns:
        The combined DataFrame of stored + new rows with columns (prompt,
        judge_model, raw_judgment, model_mapping, raw_judgment_token_count) and,
        when a judge call failed, an ``error`` column. The frame is not written
        to disk here.

    Notes:
        * Missing responses / answer keys are detected with ``pd.isna`` because
          they are NaN (not None) once the data has been through a CSV file.
        * Subset sampling is deterministic for a given (seed, judge, prompt), so
          a re-run builds the same ``model_mapping`` and the skip check can
          actually match what is stored.
        * Stored rows whose ``raw_judgment`` is empty (earlier failures) are
          retried; the retry replaces the failed row in the returned frame.
    """

    logger.info("Collecting raw evaluations (unparsed, partial check)...")

    key_columns = ["prompt", "judge_model", "model_mapping"]

    # 1) Try loading existing raw evaluations; index the successful ones
    raw_eval_path = config.output_dir / "raw_evaluations.csv"
    existing_raw_eval_df = None
    already_judged = set()
    if raw_eval_path.exists():
        logger.info(f"Found existing raw evaluations at {raw_eval_path}, will skip duplicates.")
        existing_raw_eval_df = pd.read_csv(raw_eval_path)
        succeeded = existing_raw_eval_df[existing_raw_eval_df["raw_judgment"].notna()]
        already_judged = set(
            zip(succeeded["prompt"], succeeded["judge_model"], succeeded["model_mapping"])
        )
    else:
        logger.info("No existing raw_evaluations.csv found; collecting from scratch.")

    new_judgments = []

    for prompt in responses_df['prompt'].unique():
        if pd.isna(prompt):
            # A missing prompt cannot be matched back to its rows.
            continue

        prompt_subset = responses_df[responses_df['prompt'] == prompt]
        answer_key = prompt_subset['Answer_key'].iloc[0] if 'Answer_key' in prompt_subset.columns else None
        has_answer_key = (not pd.isna(answer_key)) and bool(answer_key)

        # Only keep models that actually produced a response for this prompt.
        prompt_responses = {
            model: response
            for model, response in zip(prompt_subset['model'], prompt_subset['response'])
            if not pd.isna(response)
        }

        # Evaluate with each model as judge
        for judge_model in config.model_names:
            # Exclude judge's own or missing responses
            other_models = [
                m for m in config.model_names
                if m != judge_model and m in prompt_responses
            ]
            if config.use_subset_evaluation and other_models:
                # String seeds are hashed deterministically by `random`, so the
                # same subset (in the same order) is drawn on every run.
                rng = random.Random(f"{seed}|{judge_model}|{prompt}")
                other_models = rng.sample(
                    other_models,
                    min(config.evaluators_subset_size, len(other_models))
                )

            if not other_models:
                continue

            # Build the anonymized mapping
            model_to_anon = {m: f"Model_{i+1}" for i, m in enumerate(other_models)}
            # sort_keys=True keeps the JSON string stable for the skip check
            model_mapping_str = json.dumps(model_to_anon, sort_keys=True)

            # 2) If we already have a successful row for this key, skip
            if (prompt, judge_model, model_mapping_str) in already_judged:
                logger.info(f"Skipping existing raw eval for judge={judge_model}, prompt={prompt[:40]}...")
                continue

            answers_section = "\n".join(
                f"{model_to_anon[m]}:\n{prompt_responses[m]}\n---"
                for m in other_models
            )
            if has_answer_key:
                answer_key_edited = f"The Answer Key here is:\n{answer_key}\n---\n"
            else:
                answer_key_edited = ""

            instructions = _build_judge_prompt(prompt, answers_section, answer_key_edited)

            # 3) Otherwise, run the LLM judge
            try:
                judge_llm = llm_module.get_model(judge_model)
                judge_result_obj = judge_llm.prompt(instructions)
                raw_judgment = judge_result_obj.text()
                raw_judgment_tokens = len(raw_judgment.split())

                new_judgments.append({
                    "prompt": prompt,
                    "judge_model": judge_model,
                    "raw_judgment": raw_judgment,
                    # store the same sorted string
                    "model_mapping": model_mapping_str,
                    "raw_judgment_token_count": raw_judgment_tokens
                })

            except Exception as e:
                # Best effort: record the failure and carry on with other judges.
                logger.error(f"Error collecting raw eval from judge={judge_model} on prompt='{prompt}': {str(e)}")
                new_judgments.append({
                    "prompt": prompt,
                    "judge_model": judge_model,
                    "raw_judgment": None,
                    "model_mapping": model_mapping_str,
                    "raw_judgment_token_count": 0,
                    "error": str(e)
                })

    logger.info("Finished collecting new raw evaluation outputs.")

    # 4) Combine with existing, if any
    if not new_judgments:
        if existing_raw_eval_df is not None:
            # No new data, just return old
            return existing_raw_eval_df
        # Nothing at all: still return the expected schema
        return pd.DataFrame(columns=[
            "prompt", "judge_model", "raw_judgment",
            "model_mapping", "raw_judgment_token_count",
        ])

    new_eval_df = pd.DataFrame(new_judgments)
    if existing_raw_eval_df is None:
        # Everything is new
        return new_eval_df

    combined_df = pd.concat([existing_raw_eval_df, new_eval_df], ignore_index=True)
    # keep="last": a retried judgment supersedes the stored failed attempt.
    return (
        combined_df
        .drop_duplicates(subset=key_columns, keep="last")
        .reset_index(drop=True)
    )

In [26]:
# 6. Parsing the raw evaluations

def _extract_ratings_object(raw_judgment, expected_ids):
    """Find the ratings JSON object embedded in a judge's free-form output.

    Every ``{`` is tried as the start of a JSON object. The last object that
    contains at least one of ``expected_ids`` as a key wins; if none does, the
    last object that parsed at all is returned.

    Raises:
        ValueError: If the text contains no parseable JSON object.
    """
    decoder = json.JSONDecoder()
    last_object = None
    last_match = None
    start = raw_judgment.find("{")
    while start != -1:
        next_start = start + 1
        try:
            candidate, end = decoder.raw_decode(raw_judgment, start)
        except ValueError:
            candidate = None
        if isinstance(candidate, dict):
            last_object = candidate
            if any(key in expected_ids for key in candidate):
                last_match = candidate
                # Do not descend into an object that already matched.
                next_start = end
        start = raw_judgment.find("{", next_start)

    chosen = last_match if last_match is not None else last_object
    if chosen is None:
        raise ValueError("No JSON object found in raw_judgment")
    return chosen


def _fallback_rows(prompt, judge_model, model_mapping, default_score, token_count):
    """Build one parse_failed row per rated model, all carrying default_score."""
    return [
        {
            "prompt": prompt,
            "judge_model": judge_model,
            "rated_model": real_model,
            "score": default_score,
            "parse_failed": True,
            "raw_judgment_token_count": token_count,
        }
        for real_model in model_mapping
    ]


def parse_evaluation_rows(raw_eval_df: pd.DataFrame, config: "EvalConfig", default_score: float = 4.1) -> pd.DataFrame:
    """
    Parse each row of the raw_eval_df, which contains judge's raw JSON-like output.
    If parsing fails, fallback to a default rating for each rated model.

    Args:
        raw_eval_df: Frame with the columns ``prompt``, ``judge_model``,
            ``raw_judgment`` and ``model_mapping`` (JSON string mapping real
            model name -> anonymous id); ``raw_judgment_token_count`` is optional.
        config: Not used by this function; kept so existing callers keep working.
        default_score: Score assigned to every rated model of a judgment that
            is missing or cannot be parsed (default 4.1).

    Returns:
        A DataFrame with columns (prompt, judge_model, rated_model, score,
        parse_failed, raw_judgment_token_count). Parsed scores are clamped to
        the range 1..10.

    Notes:
        A judgment is all-or-nothing: if any score in it cannot be converted to
        a number, no parsed rows are kept for that judgment and every rated
        model gets exactly one fallback row.
    """
    columns = ["prompt", "judge_model", "rated_model", "score",
               "parse_failed", "raw_judgment_token_count"]
    evaluations = []

    for _, row in raw_eval_df.iterrows():
        prompt = row["prompt"]
        judge_model = row["judge_model"]
        raw_judgment = row["raw_judgment"]
        raw_judgment_tokens = row.get("raw_judgment_token_count", 0)

        # Convert model_mapping from JSON string back to dict
        try:
            model_mapping = json.loads(row["model_mapping"])  # e.g. {"gemini-exp-1206":"Model_1"}
        except (TypeError, ValueError):
            # TypeError: cell is NaN/None; ValueError: malformed JSON
            model_mapping = {}
        if not isinstance(model_mapping, dict):
            model_mapping = {}

        # A missing judgment is None in memory but NaN after a CSV round-trip,
        # so test for "usable text" rather than truthiness.
        if not isinstance(raw_judgment, str) or not raw_judgment.strip():
            evaluations.extend(
                _fallback_rows(prompt, judge_model, model_mapping, default_score, raw_judgment_tokens)
            )
            logger.warning(f"No raw_judgment for prompt={prompt}, judge={judge_model}; skipping parse.")
            continue

        # Reverse mapping: "Model_1" => "gemini-exp-1206"
        anon_to_real = {v: k for k, v in model_mapping.items()}

        try:
            data = _extract_ratings_object(raw_judgment, anon_to_real)

            parsed_rows = []
            for anon_id, score in data.items():
                real_model = anon_to_real.get(anon_id)
                if not real_model:
                    # If we can't find the real model name, skip
                    continue
                numeric_score = float(score)
                if numeric_score != numeric_score:
                    # NaN would otherwise be clamped to 10 by min()/max()
                    raise ValueError(f"score for {anon_id} is NaN")
                numeric_score = max(1.0, min(10.0, numeric_score))  # clamp 1..10

                parsed_rows.append({
                    "prompt": prompt,
                    "judge_model": judge_model,
                    "rated_model": real_model,
                    "score": numeric_score,
                    "parse_failed": False,
                    "raw_judgment_token_count": raw_judgment_tokens
                })

        except (TypeError, ValueError) as e:
            logger.error(f"Parsing error for judge={judge_model}, prompt={prompt}: {str(e)}")
            # If parse fails, assign a default rating
            evaluations.extend(
                _fallback_rows(prompt, judge_model, model_mapping, default_score, raw_judgment_tokens)
            )
        else:
            # Only commit parsed rows once the whole judgment parsed cleanly.
            evaluations.extend(parsed_rows)

    return pd.DataFrame(evaluations, columns=columns)


In [27]:
# 7. Full workflow
import llm  # custom LLM module

# 1) Select the configuration for this run
config = DEFAULT_CONFIG
logger.info(f"Using config: {config}")

# 2) Responses: responses.csv acts as a cache. The models are only queried
#    when that file does not exist yet; otherwise it is loaded as-is.
resp_path = config.output_dir / "responses.csv"

if not resp_path.exists():
    logger.info("No responses.csv found; collecting now from each model.")
    responses_df = collect_responses(prompt_pairs, config, llm)
    responses_df.to_csv(resp_path, index=False)
    logger.info(f"Saved new responses to {resp_path}")
else:
    logger.info(f"Loading existing responses from {resp_path}")
    responses_df = pd.read_csv(resp_path)



2025-01-13 23:10:50,326 - INFO - Using config: EvalConfig(model_names=['gemini-2.0-flash-thinking-exp-1219', 'gemini-exp-1206', 'claude-3-5-sonnet-latest', 'o1-preview', 'gpt-4o', 'deepseek-chat'], evaluation_method=1, use_subset_evaluation=True, evaluators_subset_size=3, output_dir=PosixPath('results'), request_delay=0.0)
2025-01-13 23:10:50,327 - INFO - No responses.csv found; collecting now from each model.
2025-01-13 23:10:50,327 - INFO - Collecting responses (with partial coverage check)...
2025-01-13 23:10:50,327 - INFO - No existing responses file found; we'll collect everything from scratch.
2025-01-13 23:10:50,327 - INFO - Processing prompt 1/37: Analyze and compare the architectural styles of the Hagia So...
2025-01-13 23:10:50,328 - INFO - Querying gemini-2.0-flash-thinking-exp-1219 for new response...
2025-01-13 23:11:03,623 - INFO - gemini-2.0-flash-thinking-exp-1219 responded in 13.29s - Valid
2025-01-13 23:11:03,624 - INFO - Querying gemini-exp-1206 for new response...
2

In [28]:
# 8. Collect or load raw evaluations
# raw_evaluations.csv acts as a cache: judges are only queried when the file
# is missing; otherwise the stored judgments are loaded unchanged.
raw_eval_path = config.output_dir / "raw_evaluations.csv"

if not raw_eval_path.exists():
    logger.info("No raw_evaluations.csv found; collecting now (unparsed).")
    raw_eval_df = collect_raw_evaluations(responses_df, config, llm)
    raw_eval_df.to_csv(raw_eval_path, index=False)
    logger.info(f"Saved raw evaluations to {raw_eval_path}")
else:
    logger.info(f"Loading existing raw evaluations from {raw_eval_path}")
    raw_eval_df = pd.read_csv(raw_eval_path)


2025-01-14 00:03:34,699 - INFO - No raw_evaluations.csv found; collecting now (unparsed).
2025-01-14 00:03:34,700 - INFO - Collecting raw evaluations (unparsed, partial check)...
2025-01-14 00:03:34,700 - INFO - No existing raw_evaluations.csv found; collecting from scratch.
2025-01-14 00:05:48,697 - ERROR - Error collecting raw eval from judge=gemini-2.0-flash-thinking-exp-1219 on prompt='What are the characteristics of APOBEC-driven SGMs, particularly their association with YTCA motifs and APOBEC3A expression, especially cancer mutagenesis? ': The model is overloaded. Please try again later.
2025-01-14 00:28:57,021 - ERROR - Error collecting raw eval from judge=gemini-2.0-flash-thinking-exp-1219 on prompt='What are the core assumptions and basic mechanisms and results of the Harberger corporate tax model?
': An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting
2025-01-14 00:47:08,680 - ERROR - Error collecting raw eval

In [31]:
# 9. Parse or load final evaluations
# evaluations.csv acts as a cache: the raw judgments are only parsed when the
# file is missing; otherwise the stored scores are loaded unchanged.
eval_path = config.output_dir / "evaluations.csv"

if not eval_path.exists():
    logger.info("No evaluations.csv found; parsing raw evaluations now.")
    evaluations_df = parse_evaluation_rows(raw_eval_df, config)
    evaluations_df.to_csv(eval_path, index=False)
    logger.info(f"Saved parsed evaluations to {eval_path}")
else:
    logger.info(f"Loading parsed evaluations from {eval_path}")
    evaluations_df = pd.read_csv(eval_path)


# 10. Quick look at the parsed numeric scores
logger.info("Here are the first few rows of the parsed evaluations:")
display(evaluations_df.head())


2025-01-14 10:21:11,959 - INFO - Loading parsed evaluations from results/evaluations.csv
2025-01-14 10:21:11,982 - INFO - Here are the first few rows of the parsed evaluations:


Unnamed: 0,prompt,judge_model,rated_model,score,parse_failed,raw_judgment_token_count
0,Analyze and compare the architectural styles o...,gemini-2.0-flash-thinking-exp-1219,o1-preview,8.0,False,773
1,Analyze and compare the architectural styles o...,gemini-2.0-flash-thinking-exp-1219,deepseek-chat,8.0,False,773
2,Analyze and compare the architectural styles o...,gemini-2.0-flash-thinking-exp-1219,claude-3-5-sonnet-latest,8.0,False,773
3,Analyze and compare the architectural styles o...,gemini-exp-1206,claude-3-5-sonnet-latest,9.0,False,1090
4,Analyze and compare the architectural styles o...,gemini-exp-1206,gemini-2.0-flash-thinking-exp-1219,9.0,False,1090


In [32]:
# 11. Build a graph from evaluations, run PageRank, and display final rankings

import networkx as nx

def build_endorsement_graph(evaluations_df: pd.DataFrame, config: EvalConfig, skip_failed: bool = True) -> nx.DiGraph:
    """
    Builds a directed graph from the numeric evaluations.
    Edge: judge_model -> rated_model, weighted by 'score'.

    Args:
        evaluations_df: Frame with the columns ``judge_model``, ``rated_model``
            and ``score``; ``parse_failed`` is used when present.
        config: Supplies ``model_names``; every listed model becomes a node even
            if it has no edges.
        skip_failed: When True, rows whose ``parse_failed`` is not False are
            ignored. If the frame has no ``parse_failed`` column, all rows are
            used.

    Returns:
        A DiGraph in which the ``weight`` of edge (judge, rated) is the sum of
        all scores that judge gave to that model. An empty input yields a graph
        with nodes but no edges.
    """
    G = nx.DiGraph()
    G.add_nodes_from(config.model_names)

    # An empty frame may not even have the expected columns; return the
    # edgeless graph so the caller's "no edges" handling can take over.
    if evaluations_df.empty:
        return G

    if skip_failed and "parse_failed" in evaluations_df.columns:
        evaluations_df = evaluations_df[evaluations_df["parse_failed"].eq(False)]

    # Iterate the three columns directly; rows are visited in frame order, so
    # the per-edge sums are accumulated in the same order as before.
    for judge, rated, raw_score in zip(
        evaluations_df["judge_model"],
        evaluations_df["rated_model"],
        evaluations_df["score"],
    ):
        score = float(raw_score)
        if score != score:
            # NaN score: adding it would turn the whole edge weight into NaN.
            continue

        # Add or update edge
        if G.has_edge(judge, rated):
            G[judge][rated]["weight"] += score
        else:
            G.add_edge(judge, rated, weight=score)

    return G


# Build the judge -> rated-model graph from successfully parsed scores only.
G = build_endorsement_graph(evaluations_df, config, skip_failed=True)

if len(G.edges) > 0:
    # Weighted PageRank over the endorsement graph
    pagerank_scores = nx.pagerank(G, weight="weight")
    # Highest PageRank first
    ranked_models = sorted(
        pagerank_scores.items(),
        key=lambda x: x[1],
        reverse=True,
    )

    logger.info("=== PageRank Results ===")
    for model, score in ranked_models:
        logger.info(f"{model}: {score:.4f}")

    # Show the ranking as a table in the notebook
    display(pd.DataFrame(ranked_models, columns=["model", "pagerank_score"]))

    # Persist the graph itself (GML)
    nx.write_gml(G, config.output_dir / "endorsement_graph.gml")
    logger.info("Saved endorsement_graph.gml")

    # Persist the ranking together with run metadata (JSON)
    results = {
       "rankings": ranked_models,
       "metadata": {
           "evaluation_method": config.evaluation_method,
           "timestamp": datetime.now().isoformat()
       }
    }
    with open(config.output_dir / "rankings.json", "w") as f:
        json.dump(results, f, indent=4)
    logger.info("Saved rankings.json")
else:
    logger.warning("No edges in the endorsement graph. Nothing to PageRank.")



2025-01-14 10:21:14,424 - INFO - === PageRank Results ===
2025-01-14 10:21:14,425 - INFO - o1-preview: 0.1794
2025-01-14 10:21:14,425 - INFO - gpt-4o: 0.1783
2025-01-14 10:21:14,426 - INFO - deepseek-chat: 0.1671
2025-01-14 10:21:14,426 - INFO - gemini-2.0-flash-thinking-exp-1219: 0.1647
2025-01-14 10:21:14,427 - INFO - claude-3-5-sonnet-latest: 0.1556
2025-01-14 10:21:14,427 - INFO - gemini-exp-1206: 0.1549


Unnamed: 0,model,pagerank_score
0,o1-preview,0.179404
1,gpt-4o,0.178305
2,deepseek-chat,0.167105
3,gemini-2.0-flash-thinking-exp-1219,0.164732
4,claude-3-5-sonnet-latest,0.155571
5,gemini-exp-1206,0.154884


2025-01-14 10:21:14,432 - INFO - Saved endorsement_graph.gml
2025-01-14 10:21:14,433 - INFO - Saved rankings.json
