In [1]:
# SlopRank

import pandas as pd
import numpy as np
import networkx as nx
import json
import os
import random
import time
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from pathlib import Path
import logging
from datetime import datetime

# Configure logging
# Root-logger setup: INFO threshold, "timestamp - level - message" line format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the classes and main() defined in this cell.
logger = logging.getLogger(__name__)

# ## Configuration
@dataclass
class EvalConfig:
    """Configuration for the evaluation system.

    Attributes:
        model_names: Models that answer the prompts and judge each other.
        evaluation_method: 1 for numeric 1-10 ratings, 2 for rankings.
        use_subset_evaluation: When True, each judge rates only a random
            subset of the other models.
        evaluators_subset_size: Size of that subset.
        output_dir: Directory for result artifacts; created on construction.
            A plain string is accepted and converted to ``Path``.

    Raises:
        ValueError: If ``evaluation_method`` is not 1 or 2, if
            ``evaluators_subset_size`` is not smaller than the number of
            models, or if subset evaluation is enabled with a size below 1.
    """
    model_names: List[str]
    evaluation_method: int  # 1 for numeric, 2 for ranking
    use_subset_evaluation: bool
    evaluators_subset_size: int
    output_dir: Path

    def __post_init__(self):
        # Validate before touching the filesystem so that a rejected
        # configuration never leaves an output directory behind.
        if self.evaluation_method not in {1, 2}:
            raise ValueError("evaluation_method must be 1 or 2")
        if self.evaluators_subset_size >= len(self.model_names):
            raise ValueError("evaluators_subset_size must be less than number of models")
        # A size of 0 would make every judge skip its evaluation and a
        # negative size would make random.sample raise, both only after all
        # model responses have already been collected. Fail fast instead.
        if self.use_subset_evaluation and self.evaluators_subset_size < 1:
            raise ValueError(
                "evaluators_subset_size must be at least 1 when "
                "use_subset_evaluation is enabled"
            )

        # Accept str paths as well; downstream code relies on the `/` operator.
        self.output_dir = Path(self.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

# Default configuration
# Used as the default argument of SlopRank.__init__ and by main().
# NOTE: constructing an EvalConfig runs its __post_init__, which creates
# output_dir, so executing this statement creates a "results" directory
# relative to the current working directory.
DEFAULT_CONFIG = EvalConfig(
    # Every model listed here both answers the prompts and acts as a judge.
    model_names=[
        "gemini-2.0-flash-thinking-exp-1219",
        "gemini-exp-1206",
        "claude-3-5-sonnet-latest",
        "claude-3-opus-latest",
        "o1-preview",
        "gpt-4o",
        "deepseek-chat"
    ],
    evaluation_method=1,  # numeric 1-10 ratings
    use_subset_evaluation=True,  # each judge rates a random subset of the others
    evaluators_subset_size=3,  # must stay below len(model_names)
    output_dir=Path("results")
)

# ## Core Evaluation Classes
class ResponseManager:
    """Handles model response collection and validation."""

    # Responses shorter than this (after stripping whitespace) are rejected.
    MIN_RESPONSE_LENGTH = 10  # Arbitrary minimum length

    # Column layout of the frame built by collect_responses. Passing it
    # explicitly keeps the columns present even when nothing was collected.
    RESPONSE_COLUMNS = ['prompt', 'model', 'response', 'is_valid', 'response_time']

    def __init__(self, config: EvalConfig):
        self.config = config
        self.responses_df = None

    def _validate_response(self, response: str) -> bool:
        """Return True when *response* is a string of at least
        ``MIN_RESPONSE_LENGTH`` non-padding characters."""
        if not isinstance(response, str):
            return False
        return len(response.strip()) >= self.MIN_RESPONSE_LENGTH

    def collect_responses(self,
                          prompts: List[str],
                          llm_module,
                          request_delay: float = 0.5) -> pd.DataFrame:
        """Collect responses from all models for given prompts.

        Args:
            prompts: Prompts to send to every model in ``config.model_names``.
            llm_module: Object exposing ``get_model(name)``; the returned
                model must support ``.prompt(text).text()``.
            request_delay: Seconds to pause after each request to avoid rate
                limits. Values of 0 or below disable the pause.

        Returns:
            One row per (prompt, model) with the columns in
            ``RESPONSE_COLUMNS``. Failed or invalid responses are kept as
            rows with ``response`` set to None and ``is_valid`` False. The
            frame is also stored on ``self.responses_df``.
        """
        responses = []
        total_start = time.time()

        for i, prompt in enumerate(prompts, 1):
            logger.info(f"Processing prompt {i}/{len(prompts)}")
            for model_name in self.config.model_names:
                start_time = time.time()
                logger.info(f"Querying {model_name}...")
                try:
                    model = llm_module.get_model(model_name)
                    response = model.prompt(prompt).text()
                    is_valid = self._validate_response(response)
                    elapsed = time.time() - start_time

                    logger.info(f"{model_name} responded in {elapsed:.2f}s - " +
                                f"{'Valid' if is_valid else 'Invalid'} response")

                    responses.append({
                        'prompt': prompt,
                        'model': model_name,
                        'response': response if is_valid else None,
                        'is_valid': is_valid,
                        'response_time': elapsed
                    })

                except Exception as e:
                    # One failing model must not abort the whole collection;
                    # record the failure as an invalid row and move on.
                    elapsed = time.time() - start_time
                    logger.error(f"Error from {model_name} after {elapsed:.2f}s: {str(e)}")
                    responses.append({
                        'prompt': prompt,
                        'model': model_name,
                        'response': None,
                        'is_valid': False,
                        'response_time': elapsed
                    })

                # Add a small delay between requests to avoid rate limits
                if request_delay > 0:
                    time.sleep(request_delay)

        total_time = time.time() - total_start
        logger.info(f"All responses collected in {total_time:.2f}s")

        # Explicit columns: with an empty prompt list the frame would
        # otherwise have no 'prompt' column and break later lookups.
        self.responses_df = pd.DataFrame(responses, columns=self.RESPONSE_COLUMNS)
        return self.responses_df

class Evaluator:
    """Handles the evaluation of model responses.

    Every model in ``config.model_names`` acts as a judge for the other
    models' answers; the judgments are accumulated as weighted edges
    ``judge -> rated model`` of an endorsement graph.
    """

    def __init__(self, config: EvalConfig):
        self.config = config

    def _create_evaluation_prompt(self,
                                  prompt: str,
                                  responses: Dict[str, str]) -> Tuple[str, Dict[str, str]]:
        """Creates the evaluation prompt and model mappings.

        Args:
            prompt: The original question the responses answer.
            responses: Mapping of real model name to its response text.

        Returns:
            ``(instructions, model_to_anonymous)`` where the second item maps
            each real model name to the ``Model_N`` label used in the prompt,
            so the judge never sees real model names.
        """
        model_to_anonymous = {
            model: f"Model_{i+1}"
            for i, model in enumerate(responses.keys())
        }

        answers_section = "\n".join([
            f"{model_to_anonymous[model]}:\n{response}\n---"
            for model, response in responses.items()
        ])

        if self.config.evaluation_method == 1:
            instructions = f"""IMPORTANT: Return only a JSON object with ratings.
            
            Rate these responses to: "{prompt}"

            {answers_section}

            Rate 1-10 based on: accuracy, completeness, clarity, relevance, depth, usefulness
            10: Exceptional, 8-9: Excellent, 6-7: Good, 4-5: Fair, 1-3: Poor

            Format: {{"Model_1": 8, "Model_2": 7}}"""
        else:
            instructions = f"""IMPORTANT: Return only a JSON object with rankings.
            
            Rank these responses to: "{prompt}"

            {answers_section}

            Rank from best (1) to worst. No ties allowed.
            Consider: accuracy, completeness, clarity, relevance, depth, usefulness

            Format: {{"Model_1": 1, "Model_2": 2}}"""

        return instructions.strip(), model_to_anonymous

    def _parse_evaluation(self,
                          raw_judgment: str,
                          model_mapping: Dict[str, str]) -> Dict[str, float]:
        """Parse and validate evaluation responses.

        Args:
            raw_judgment: Raw text returned by the judge model.
            model_mapping: Real model name -> anonymous label, as returned by
                ``_create_evaluation_prompt``.

        Returns:
            Real model name -> score. For method 1 the score is a float
            clamped to [1, 10]; for method 2 it is the integer rank. Labels
            the judge invented are ignored. If anything in the judgment
            cannot be parsed, every model in ``model_mapping`` gets a
            fallback value (5.0 for ratings, the worst rank for rankings).
        """
        try:
            # Extract JSON object: judges often wrap it in prose or fences.
            start = raw_judgment.find("{")
            end = raw_judgment.rfind("}") + 1
            if start == -1 or end == 0:
                raise ValueError("No JSON object found")

            data = json.loads(raw_judgment[start:end])

            # Convert anonymous IDs back to real model names
            anonymous_to_model = {v: k for k, v in model_mapping.items()}
            results = {}

            for anon_id, score in data.items():
                model = anonymous_to_model.get(anon_id)
                if not model:
                    continue

                if self.config.evaluation_method == 1:
                    # Numeric scores
                    score = float(score)
                    # json.loads accepts the literal NaN, and min/max would
                    # silently turn it into a perfect 10. NaN is the only
                    # float that differs from itself.
                    if score != score:
                        raise ValueError("NaN score")
                    score = max(1.0, min(10.0, score))
                else:
                    # Rankings
                    score = int(score)

                results[model] = score

            return results

        except Exception as e:
            logger.error(f"Error parsing evaluation: {str(e)}")
            # Return neutral scores
            return {
                model: 5.0 if self.config.evaluation_method == 1 else len(model_mapping)
                for model in model_mapping.keys()
            }

    def _endorsement_weight(self, score: float, num_rated: int) -> float:
        """Convert a parsed score into a graph edge weight (higher = better).

        Ratings (method 1) are already "higher is better" and are used
        unchanged. Ranks (method 2) use 1 for the best answer, so they are
        inverted: with ``num_rated`` answers, rank 1 becomes ``num_rated``
        and the worst rank becomes 1. Out-of-range ranks are clamped first.
        """
        if self.config.evaluation_method == 1:
            return float(score)
        rank = min(max(int(score), 1), max(num_rated, 1))
        return float(max(num_rated, 1) - rank + 1)

    def evaluate_responses(self,
                           responses_df: pd.DataFrame,
                           llm_module) -> Tuple[nx.DiGraph, pd.DataFrame]:
        """Evaluate all responses and build the graph.

        Args:
            responses_df: Frame with at least ``prompt``, ``model`` and
                ``response`` columns; rows with a missing response are not
                offered to the judges.
            llm_module: Object exposing ``get_model(name)``; the returned
                model must support ``.prompt(text).text()``.

        Returns:
            ``(G, evaluations)``: ``G`` is a directed graph whose edge
            ``judge -> rated`` carries the accumulated endorsement weight;
            ``evaluations`` has one row per individual judgment with the raw
            score (rating or rank) as returned by the judge.
        """
        G = nx.DiGraph()
        G.add_nodes_from(self.config.model_names)

        evaluations = []

        for prompt in responses_df['prompt'].unique():
            prompt_responses = responses_df[
                responses_df['prompt'] == prompt
            ].set_index('model')['response'].to_dict()

            # For each judge model
            for judge_model in self.config.model_names:
                # Select models to evaluate. pd.notna also rejects NaN, which
                # is what a missing response becomes after a CSV round-trip.
                other_models = [
                    m for m in self.config.model_names
                    if m != judge_model and pd.notna(prompt_responses.get(m))
                ]

                if self.config.use_subset_evaluation:
                    other_models = random.sample(
                        other_models,
                        min(self.config.evaluators_subset_size, len(other_models))
                    )

                if not other_models:
                    continue

                # Create evaluation prompt
                eval_prompt, model_mapping = self._create_evaluation_prompt(
                    prompt,
                    {m: prompt_responses[m] for m in other_models}
                )

                try:
                    # Get evaluation from judge
                    raw_judgment = llm_module.get_model(judge_model).prompt(eval_prompt).text()
                    parsed_judgments = self._parse_evaluation(raw_judgment, model_mapping)

                    # Record evaluations
                    for rated_model, score in parsed_judgments.items():
                        evaluations.append({
                            'prompt': prompt,
                            'judge_model': judge_model,
                            'rated_model': rated_model,
                            'score': score
                        })

                        # Update graph. PageRank treats a larger weight as a
                        # stronger endorsement, so ranks must be inverted
                        # before being accumulated.
                        weight = self._endorsement_weight(score, len(other_models))
                        if G.has_edge(judge_model, rated_model):
                            G[judge_model][rated_model]['weight'] += weight
                        else:
                            G.add_edge(judge_model, rated_model, weight=weight)

                except Exception as e:
                    # A failing judge only loses its own vote.
                    logger.error(f"Error during evaluation: {str(e)}")
                    continue

        return G, pd.DataFrame(evaluations)

class SlopRank:
    """Orchestrates the pipeline: collect answers, judge them, rank models.

    Artifacts are written to ``config.output_dir``: ``responses.csv``,
    ``evaluations.csv``, ``endorsement_graph.gml`` and ``rankings.json``.
    """

    def __init__(self, config: EvalConfig = DEFAULT_CONFIG):
        self.config = config
        self.response_manager = ResponseManager(config)
        self.evaluator = Evaluator(config)

    def run(self, prompts: List[str], llm_module) -> Dict:
        """Execute the whole pipeline for *prompts*.

        Returns a dict with ``rankings`` (list of ``(model, pagerank)`` pairs,
        best first) and ``metadata``. Returns an empty dict, after writing
        only ``responses.csv``, when the judges produced no graph edges.
        """
        out_dir = self.config.output_dir

        # Step 1: gather every model's answers and persist them right away.
        logger.info("Collecting responses...")
        responses_df = self.response_manager.collect_responses(prompts, llm_module)
        responses_path = out_dir / "responses.csv"
        responses_df.to_csv(responses_path, index=False)
        logger.info(f"Saved responses to {responses_path}")

        # Step 2: let the models judge each other.
        logger.info("Evaluating responses...")
        G, evaluations_df = self.evaluator.evaluate_responses(responses_df, llm_module)

        # Step 3: PageRank over the endorsement graph (needs at least one edge).
        if G.number_of_edges() == 0:
            logger.error("No valid evaluations to compute PageRank")
            return {}

        pagerank_scores = nx.pagerank(G, weight="weight")
        ranked_models = sorted(
            pagerank_scores.items(), key=lambda pair: pair[1], reverse=True
        )

        metadata = {
            "evaluation_method": self.config.evaluation_method,
            "use_subset_evaluation": self.config.use_subset_evaluation,
            "evaluators_subset_size": self.config.evaluators_subset_size,
            "timestamp": datetime.now().isoformat()
        }
        results = {"rankings": ranked_models, "metadata": metadata}

        # Step 4: write the remaining artifacts.
        evaluations_df.to_csv(out_dir / "evaluations.csv", index=False)
        nx.write_gml(G, out_dir / "endorsement_graph.gml")
        with open(out_dir / "rankings.json", "w") as f:
            json.dump(results, f, indent=4)

        return results

# %%
def main():
    """Example usage of SlopRank.

    Reads the prompts from the ``Questions`` column of ``prompts.csv``, runs
    the pipeline with ``DEFAULT_CONFIG`` and prints the ranking together with
    scores normalized so that the best model gets 10.

    Failures inside the pipeline are logged with their traceback instead of
    being silently discarded; the function itself does not re-raise them.
    """
    import llm  # Your LLM module import
    from dotenv import load_dotenv
    load_dotenv()

    logger.info("Starting SlopRank evaluation")
    start_time = time.time()

    try:
        # Read prompts
        logger.info("Reading prompts from prompts.csv")
        prompts_df = pd.read_csv("prompts.csv")
        prompts = prompts_df["Questions"].tolist()
        # prompts = [prompts_df["Questions"].iloc[2]]  # Use only select prompts - for testing
        logger.info(f"Loaded {len(prompts)} prompts")

        # Initialize evaluation
        config = DEFAULT_CONFIG
        logger.info(f"Using configuration: {config}")
        evaluator = SlopRank(config)

        # Run evaluation
        results = evaluator.run(prompts, llm)

        # Print results
        if results:
            print("\n=== Model Rankings ===")
            max_score = max(score for _, score in results["rankings"])
            for model, score in results["rankings"]:
                normalized_score = score / max_score * 10  # Normalize to 0-10 scale
                print(f"{model:30} {score:.4f} (normalized: {normalized_score:.2f})")

            total_time = time.time() - start_time
            logger.info(f"Evaluation completed in {total_time:.2f}s")
        else:
            logger.error("No results generated")
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still interrupts a
        # long run, and log the traceback so a missing prompts.csv or an API
        # failure is visible instead of looking like a successful no-op.
        logger.exception("SlopRank evaluation failed")


if __name__ == "__main__":
    main()

2025-01-06 11:39:01,853 - INFO - Starting SlopRank evaluation
2025-01-06 11:39:01,854 - INFO - Reading prompts from prompts.csv
2025-01-06 11:39:01,866 - INFO - Loaded 29 prompts
2025-01-06 11:39:01,868 - INFO - Using configuration: EvalConfig(model_names=['gemini-2.0-flash-thinking-exp-1219', 'gemini-exp-1206', 'claude-3-5-sonnet-latest', 'claude-3-opus-latest', 'o1-preview', 'gpt-4o', 'deepseek-chat'], evaluation_method=1, use_subset_evaluation=True, evaluators_subset_size=3, output_dir=PosixPath('results'))
2025-01-06 11:39:01,868 - INFO - Collecting responses...
2025-01-06 11:39:01,869 - INFO - Processing prompt 1/29
2025-01-06 11:39:01,869 - INFO - Querying gemini-2.0-flash-thinking-exp-1219...
2025-01-06 11:39:02,989 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-thinking-exp-1219:streamGenerateContent "HTTP/1.1 200 OK"
2025-01-06 11:39:15,052 - INFO - gemini-2.0-flash-thinking-exp-1219 responded in 13.18s - Valid response
202


=== Model Rankings ===
o1-preview                     0.1592 (normalized: 10.00)
deepseek-chat                  0.1489 (normalized: 9.35)
gemini-exp-1206                0.1447 (normalized: 9.09)
gemini-2.0-flash-thinking-exp-1219 0.1397 (normalized: 8.77)
claude-3-opus-latest           0.1375 (normalized: 8.64)
gpt-4o                         0.1373 (normalized: 8.62)
claude-3-5-sonnet-latest       0.1326 (normalized: 8.33)


In [None]:
"""
Older implementation: SlopRank
"""

import pandas as pd
import time
from dotenv import load_dotenv
import llm
import networkx as nx
import json
import os
import random

load_dotenv()

##############################################################################
# 1. Configuration
##############################################################################

# Models under evaluation; each one answers the prompts and also judges the others.
MODEL_NAMES = [
    "gemini-2.0-flash-thinking-exp-1219",
    "gemini-exp-1206",
    "claude-3-5-sonnet-latest",
    "claude-3-opus-latest",
    "o1-preview",
    "gpt-4o",
    "deepseek-chat"
]

# Model handles resolved once via llm.get_model and looked up by query_model.
model_objects = {m: llm.get_model(m) for m in MODEL_NAMES}

EVALUATION_METHOD = 1  # 1 (numeric 1–10) or 2 (Best-to-Worst)
USE_SUBSET_EVALUATION = True  # If True, each judge rates only a random subset of the other models
EVALUATORS_SUBSET_SIZE = 3  # Number of models each judge rates when subset evaluation is enabled

##############################################################################
# 2. Prompting functions
##############################################################################

def query_model(prompt, model_name):
    """
    Run `prompt` against the pre-loaded model registered under `model_name`
    in `model_objects` and return the text of its reply.
    """
    model = model_objects[model_name]
    return model.prompt(prompt).text()

def query_model_all(df, model_name):
    """
    Ask `model_name` every prompt in `df` and store the answers.

    The answers are written into a new `response_<model_name>` column of `df`
    (the frame is modified in place and also returned) and that single column
    is saved to `responses/responses_<model_name>.csv`.

    Note: prompts are stripped and lower-cased before being sent to the model.
    """
    started = time.time()
    column = f"response_{model_name}"

    normalized_prompts = df["prompt"].str.strip().str.lower()
    df[column] = normalized_prompts.map(lambda text: query_model(text, model_name))

    elapsed = time.time() - started
    print(f"{model_name} processing time: {elapsed:.2f}s")

    # The output directory may not exist yet on the first model.
    os.makedirs("responses", exist_ok=True)

    # Persist only this model's column so partial progress survives a crash.
    out_path = f"responses/responses_{model_name}.csv"
    df[[column]].to_csv(out_path, index=False)
    print(f"Saved responses for {model_name} to {out_path}")

    return df

def gather_all_model_responses(raw_prompts):
    """
    Build a frame with one `prompt` row per entry of `raw_prompts` and let
    every model in MODEL_NAMES add its `response_<model>` column in turn
    (each model's answers are saved to disk as soon as it finishes).
    """
    responses = pd.DataFrame({"prompt": raw_prompts})
    for model_name in MODEL_NAMES:
        responses = query_model_all(responses, model_name)
    return responses

##############################################################################
# 3. Evaluate the responses
##############################################################################

def create_evaluation_mapping(other_models):
    """
    Assign each model in `other_models` the label "Model_<position>"
    (1-based, in iteration order).

    Returns a pair of dicts: model name -> label, and label -> model name.
    """
    label_for = {}
    for position, model in enumerate(other_models, start=1):
        label_for[model] = f"Model_{position}"

    # Derived from the forward map so both directions always agree.
    model_for = dict((label, model) for model, label in label_for.items())
    return label_for, model_for

def build_evaluation_prompt(method, prompt, judge_model, model_responses, other_models):
    """
    Assemble the meta-prompt a judge receives.

    The answers of `other_models` (taken from `model_responses`) are listed
    under anonymous "Model_N" labels. `method` 1 asks for 1-10 ratings, any
    other value asks for a best-to-worst ranking. `judge_model` is accepted
    but not used when building the text.

    Returns the stripped prompt text and the model name -> label mapping.
    """
    model_to_anonymous, _reverse_unused = create_evaluation_mapping(other_models)

    rendered_answers = (
        f"{model_to_anonymous[name]}:\n{model_responses[name]}\n---"
        for name in other_models
    )
    answers_section = "\n".join(rendered_answers)

    if method != 1:
        # Ranking variant (best-to-worst).
        instructions = f"""IMPORTANT: You must ONLY return a JSON object with rankings. No explanation or additional text.

        You are an impartial judge who will rank the given responses to a question objectvely.

        PROMPT TO EVALUATE:
        "{prompt}"

        RESPONSES TO RANK:
        {answers_section}

        RANKING INSTRUCTIONS:
        - Rank all responses from best to worst (1 = best, higher numbers = worse).
        - Consider: accuracy, completeness, clarity, relevance, depth, usefulness.
        - Do NOT assign the same rank to multiple responses (no ties).

        YOUR RESPONSE MUST BE EXACTLY IN THIS FORMAT:
        {{
            "Model_1": 1,
            "Model_2": 2,
            "Model_3": 3
        }}

        DO NOT include any other text, explanations, or analysis. ONLY the JSON object above."""
    else:
        # Numeric rating variant (1-10).
        instructions = f"""IMPORTANT: You must ONLY return a JSON object with ratings. No explanation or additional text.

        You are an impartial judge able to analyse multiple responses to a prompt and give it an objective rating.

        PROMPT TO EVALUATE:
        "{prompt}"

        RESPONSES TO RATE:
        {answers_section}

        RATING INSTRUCTIONS:
        - Rate each response from 1 to 10
        - Consider: accuracy, completeness, clarity, relevance, depth, usefulness
        - 10: Exceptional, 8-9: Excellent, 6-7: Good, 4-5: Fair, 1-3: Poor

        YOUR RESPONSE MUST BE EXACTLY IN THIS FORMAT:
        {{
            "Model_1": 8,
            "Model_2": 7
        }}

        DO NOT include any other text, explanations, or analysis. ONLY the JSON object above."""

    return instructions.strip(), model_to_anonymous

def parse_evaluation_output(method, raw_judgment, anonymous_mapping):
    """
    Parse a judge's raw reply into per-model scores.

    Args:
        method: 1 for numeric 1-10 ratings, anything else for rankings.
        raw_judgment: Text returned by the judge; the JSON object is taken
            from the first "{" to the last "}".
        anonymous_mapping: Anonymous label ("Model_N") -> real model name.

    Returns:
        Real model name -> value, with one entry per model in
        `anonymous_mapping`:
        - If no JSON object can be parsed, or a label is missing, the value
          is 5.0 (ratings) or 0 (rankings).
        - Ratings are floats clamped to [1, 10]; values that are not
          numbers (including NaN) become 5.0.
        - Ranks are ints; values that cannot be converted become the worst
          rank, `len(anonymous_mapping)`.
    """
    try:
        # Judges frequently wrap the JSON in prose or code fences.
        cleaned_text = raw_judgment.strip()
        # Find the first { and last }
        start = cleaned_text.find("{")
        end = cleaned_text.rfind("}") + 1

        if start == -1 or end == 0:
            raise ValueError("No JSON object found in response")

        data = json.loads(cleaned_text[start:end])

    except (json.JSONDecodeError, ValueError) as e:
        print(f"Warning: Failed to parse judgment. Error: {str(e)}")
        print(f"Raw response: {raw_judgment[:200]}...")
        # Return neutral fallback values
        return {
            real_model: (5.0 if method == 1 else 0)
            for real_model in anonymous_mapping.values()
        }

    worst_rank = len(anonymous_mapping)
    endorsement_map = {}
    for anonymous_id, real_model in anonymous_mapping.items():
        val = data.get(anonymous_id)

        # Covers both an absent key and an explicit JSON null.
        if val is None:
            print(f"Warning: Missing rating for {anonymous_id}")
            endorsement_map[real_model] = 5.0 if method == 1 else 0
            continue

        if method == 1:
            try:
                score = float(val)
            except (TypeError, ValueError, OverflowError):
                score = None
            # json.loads accepts the literal NaN; min/max would silently turn
            # it into a perfect 10. NaN is the only float unequal to itself.
            if score is None or score != score:
                print(f"Warning: Invalid numeric score for {anonymous_id}: {val}")
                endorsement_map[real_model] = 5.0
                continue
            if not (1 <= score <= 10):
                print(f"Warning: Score {score} for {anonymous_id} out of range, clamping to [1,10]")
            endorsement_map[real_model] = max(1.0, min(10.0, score))
        else:
            try:
                endorsement_map[real_model] = int(val)
            except (TypeError, ValueError, OverflowError):
                # TypeError: the judge returned a list/object instead of a
                # number; OverflowError: it returned Infinity.
                print(f"Warning: Invalid rank for {anonymous_id}: {val}")
                endorsement_map[real_model] = worst_rank  # Assign worst rank

    return endorsement_map

def evaluate_responses(df):
    """
    Have every model judge the others' answers and rank the models.

    Args:
        df: One row per prompt with a `prompt` column and one
            `response_<model>` column per model in MODEL_NAMES; a missing
            response column is replaced by the text "No response".

    Returns:
        `(G, ranked_models)`: the endorsement graph (edge judge -> rated
        model, weight = accumulated score) and a list of
        `(model, pagerank)` pairs, best first. `ranked_models` is an empty
        list when no edge could be created.

    Side effects:
        Writes `responses/endorsement_graph.gml`,
        `responses/evaluations_with_mapping.csv` and `rankings.json`.
    """
    G = nx.DiGraph()
    G.add_nodes_from(MODEL_NAMES)

    # Judgments are collected as plain records and turned into a DataFrame
    # once at the end, instead of re-concatenating a frame per judge.
    evaluation_columns = [
        'prompt',
        'judge_model',
        'rated_model_anonymous',
        'rated_model_real',
        'rating',
        'method',
    ]
    evaluation_records = []

    # NOTE: an earlier version filtered the still-empty evaluations frame
    # here and returned immediately, so no judge was ever queried. The
    # "nothing to rank" case is handled by the edge check after the loop.

    # Iterate through prompts and evaluate responses
    for _, row in df.iterrows():
        prompt = row["prompt"]
        model_responses = {m: row.get(f"response_{m}", "No response") for m in MODEL_NAMES}

        for judge_model in MODEL_NAMES:
            # A model never judges its own answer.
            other_models = [m for m in MODEL_NAMES if m != judge_model]

            # Use subset evaluation if enabled
            if USE_SUBSET_EVALUATION and len(other_models) > EVALUATORS_SUBSET_SIZE:
                other_models = random.sample(other_models, EVALUATORS_SUBSET_SIZE)

            # Skip if no valid models to evaluate
            if not other_models:
                print(f"Skipping evaluation for prompt: {prompt} (insufficient valid responses)")
                continue

            # Build evaluation prompt
            evaluation_prompt, model_to_anonymous = build_evaluation_prompt(
                EVALUATION_METHOD, prompt, judge_model, model_responses, other_models
            )
            anonymous_to_model = {v: k for k, v in model_to_anonymous.items()}

            # Query the judge model
            raw_judgment = query_model(evaluation_prompt, judge_model)
            parsed_judgments = parse_evaluation_output(
                EVALUATION_METHOD, raw_judgment, anonymous_to_model
            )

            for rated_model, endorsement_val in parsed_judgments.items():
                evaluation_records.append({
                    'prompt': prompt,
                    'judge_model': judge_model,
                    'rated_model_anonymous': model_to_anonymous[rated_model],
                    'rated_model_real': rated_model,
                    'rating': endorsement_val,
                    'method': EVALUATION_METHOD
                })

                # Accumulate the judgment on the judge -> rated edge.
                # NOTE(review): with EVALUATION_METHOD == 2 the raw rank
                # (1 = best) is used as the weight, so worse-ranked answers
                # receive more weight - confirm whether that is intended.
                if G.has_edge(judge_model, rated_model):
                    G[judge_model][rated_model]["weight"] += endorsement_val
                else:
                    G.add_edge(judge_model, rated_model, weight=endorsement_val)

    # Compute PageRank only if the graph has valid edges
    if len(G.edges) == 0:
        print("Graph has no valid edges. Cannot compute PageRank.")
        return G, []

    pagerank_scores = nx.pagerank(G, weight="weight")
    ranked_models = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)

    # Save all data. The directory is normally created while gathering
    # responses, but this function may also be called on a pre-built frame.
    os.makedirs("responses", exist_ok=True)

    nx.write_gml(G, "responses/endorsement_graph.gml")
    print("Saved endorsement graph to endorsement_graph.gml")

    evaluations_df = pd.DataFrame(evaluation_records, columns=evaluation_columns)
    evaluations_df.to_csv("responses/evaluations_with_mapping.csv", index=False)
    print("Saved detailed evaluations with mappings to evaluations_with_mapping.csv")

    with open("rankings.json", "w") as f:
        json.dump({
            "rankings": ranked_models,
            "metadata": {
                "evaluation_method": EVALUATION_METHOD,
                "use_subset_evaluation": USE_SUBSET_EVALUATION,
                "evaluators_subset_size": EVALUATORS_SUBSET_SIZE,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
            }
        }, f, indent=4)
    print("Saved rankings to rankings.json")

    return G, ranked_models

##############################################################################
# 4. Main
##############################################################################

# Script entry point: load the prompts, collect every model's answers, let the
# models judge each other, then print the resulting PageRank scores.
if __name__ == "__main__":
    prompts_df = pd.read_csv("prompts.csv")  # The CSV must contain a column named "Questions"
    raw_prompts = prompts_df["Questions"].tolist()
    # raw_prompts = [prompts_df["Questions"].iloc[2]]  # Use only select prompts - for testing

    # Gather responses for each model
    df_responses = gather_all_model_responses(raw_prompts)

    # Evaluate responses and compute rankings
    # `ranked` is a list of (model, score) pairs, or an empty list when
    # evaluate_responses could not build any graph edge.
    G, ranked = evaluate_responses(df_responses)

    print("\n=== PageRank Scores ===")
    for model, score in ranked:
        print(f"{model}: {score:.4f}")
