# In [None]:
"""
Introducing SlopRank
"""

import pandas as pd
import time
from dotenv import load_dotenv
import llm
import networkx as nx
import json
import os
import random

# Load variables from a local .env file into the process environment before
# any model is resolved (presumably the provider API keys -- confirm).
load_dotenv()

##############################################################################
# 1. Configuration
##############################################################################

# Every model listed here both answers the prompts and acts as a judge of the
# other models' answers.
MODEL_NAMES = [
    "gemini-2.0-flash-thinking-exp-1219",
    "gemini-exp-1206",
    "claude-3-5-sonnet-latest",
    "claude-3-opus-latest",
    "o1-preview",
    "gpt-4o",
    "deepseek-chat"
]

# Resolve each name to its `llm` model object once, at import time, so that
# later calls only need a dictionary lookup.
model_objects = {m: llm.get_model(m) for m in MODEL_NAMES}

EVALUATION_METHOD = 1  # 1 = numeric rating from 1 to 10, 2 = Upvote/Downvote
USE_SUBSET_EVALUATION = False  # When True, each judge rates only a random sample of the other models
EVALUATORS_SUBSET_SIZE = 2  # Number of models each judge rates when USE_SUBSET_EVALUATION is True

##############################################################################
# 2. Prompting functions
##############################################################################

def query_model(prompt, model_name):
    """
    Run `prompt` through the pre-loaded model registered under `model_name`.

    The model object is looked up in the module-level `model_objects` dict,
    so a name that is not registered there raises KeyError. Returns the text
    of the model's response.
    """
    model = model_objects[model_name]
    return model.prompt(prompt).text()

def query_model_all(df, model_name):
    """
    Ask `model_name` every prompt in `df` and persist its answers.

    The answers are stored in a new `response_<model_name>` column (the frame
    is modified in place and also returned) and that single column is written
    to `responses/responses_<model_name>.csv`.
    """
    response_column = f"response_{model_name}"
    output_dir = "responses"

    def ask(text):
        return query_model(text, model_name)

    started = time.time()
    # NOTE(review): prompts are stripped AND lower-cased before being sent,
    # while the judges later see the original text -- confirm this is intended.
    normalized_prompts = df["prompt"].str.strip().str.lower()
    df[response_column] = normalized_prompts.map(ask)
    elapsed = time.time() - started
    print(f"{model_name} processing time: {elapsed:.2f}s")

    # Create the output directory on first use, then write this model's answers.
    os.makedirs(output_dir, exist_ok=True)
    response_file_path = f"{output_dir}/responses_{model_name}.csv"
    df[[response_column]].to_csv(response_file_path, index=False)
    print(f"Saved responses for {model_name} to {response_file_path}")

    return df

def gather_all_model_responses(raw_prompts):
    """
    Build a prompt table and fill in one response column per configured model.

    Models are queried one after another in `MODEL_NAMES` order; each call to
    `query_model_all` also writes that model's answers to disk, so results are
    saved incrementally. Returns the DataFrame holding the prompts and all
    response columns.
    """
    responses = pd.DataFrame({"prompt": raw_prompts})
    for model_name in MODEL_NAMES:
        responses = query_model_all(responses, model_name)
    return responses

##############################################################################
# 3. Evaluate the responses
##############################################################################

def create_evaluation_mapping(other_models):
    """
    Assign each model an anonymous label ("Model_1", "Model_2", ...).

    Labels follow the order of `other_models`. Returns a tuple
    `(model_to_anonymous, anonymous_to_model)`; the second dict is the
    inverse of the first.
    """
    model_to_anonymous = {}
    for position, model in enumerate(other_models, start=1):
        model_to_anonymous[model] = f"Model_{position}"

    # Invert the forward mapping (not the input list) so both dicts always agree.
    anonymous_to_model = dict(
        zip(model_to_anonymous.values(), model_to_anonymous.keys())
    )
    return model_to_anonymous, anonymous_to_model

def build_evaluation_prompt(method, prompt, judge_model, model_responses, other_models):
    """
    Build a meta-prompt asking a judge to rate anonymized model responses.

    Args:
        method: 1 for numeric 1-10 ratings; any other value for
            Upvote/Downvote voting.
        prompt: The original prompt the responses were written for.
        judge_model: Name of the judging model. Currently unused; kept so the
            signature stays stable for callers.
        model_responses: Dict mapping real model name -> response text. Must
            contain every entry of `other_models` (KeyError otherwise).
        other_models: Real names of the models to be rated, in display order.

    Returns:
        Tuple `(instructions, model_to_anonymous)`: the stripped prompt text
        and the dict mapping each real model name to its anonymous label.
    """
    # Create anonymous mapping for this evaluation round
    model_to_anonymous, _ = create_evaluation_mapping(other_models)

    answers_section = "\n".join([
        f"{model_to_anonymous[om]}:\n{model_responses[om]}\n---"
        for om in other_models
    ])

    # The format example lists exactly the labels under evaluation, so the
    # judge is never shown a label that does not exist or too few keys.
    # The values are illustrative only and simply alternate.
    anonymous_ids = list(model_to_anonymous.values())
    sample_values = (8, 7) if method == 1 else ("Upvote", "Downvote")
    format_example = json.dumps(
        {anon: sample_values[i % 2] for i, anon in enumerate(anonymous_ids)},
        indent=4,
    )

    if method == 1:
        instructions = f"""You are evaluating several AI responses to a given prompt. Analyze each response carefully using these criteria:

ORIGINAL PROMPT:
"{prompt}"

RESPONSES TO EVALUATE:
{answers_section}

EVALUATION CRITERIA:
1. Accuracy & Correctness (if applicable)
2. Completeness of the answer
3. Clarity and coherence
4. Relevance to the prompt
5. Depth of analysis
6. Practical usefulness
7. Originality of insights (if applicable)

RATING SCALE:
- 10: Exceptional, exceeds expectations in all criteria
- 8-9: Excellent, minor room for improvement
- 6-7: Good, satisfactory response
- 4-5: Fair, notable shortcomings
- 1-3: Poor, significant issues

REQUIRED FORMAT:
Return ratings in strict JSON format with no additional text:
{format_example}

IMPORTANT:
- Use whole numbers only (1-10)
- Rate each response independently
- Focus on substance over style
- Consider the specific context of the prompt
"""
    else:
        instructions = f"""You are evaluating several AI responses to a given prompt. Analyze each response carefully.

ORIGINAL PROMPT:
"{prompt}"

RESPONSES TO EVALUATE:
{answers_section}

EVALUATION CRITERIA:
- Upvote: The response is helpful, accurate, and addresses the prompt effectively
- Downvote: The response is inadequate, incorrect, or fails to address the prompt properly

Consider:
1. Does the response directly answer the prompt?
2. Is the information accurate and reliable?
3. Is the response clear and well-structured?
4. Does it provide practical value?

REQUIRED FORMAT:
Return votes in strict JSON format with no additional text:
{format_example}

IMPORTANT:
- Be consistent in your evaluation criteria
- Focus on the actual value provided
- Consider the specific context of the prompt
"""

    return instructions.strip(), model_to_anonymous

def parse_evaluation_output(method, raw_judgment, anonymous_mapping):
    """
    Parse a judge's raw reply and map anonymous identifiers back to model names.

    Args:
        method: 1 for numeric 1-10 ratings; any other value for
            Upvote/Downvote voting.
        raw_judgment: Text returned by the judge; expected to contain a JSON
            object keyed by anonymous identifier.
        anonymous_mapping: Dict mapping anonymous identifier -> real model name.

    Returns:
        Dict mapping every real model name in `anonymous_mapping` to its
        rating. Numeric ratings are floats clamped to [1, 10]; votes are
        1 (upvote) or 0 (downvote). Anything unusable (unparseable reply,
        non-object JSON, missing key, NaN, wrong type) falls back to the
        neutral default: 5.0 for method 1, 0 otherwise.
    """
    fallback = 5.0 if method == 1 else 0

    # Judges often wrap the JSON in prose or code fences; keep only the span
    # from the first "{" to the last "}" when both are present.
    if "{" in raw_judgment and "}" in raw_judgment:
        start = raw_judgment.find("{")
        end = raw_judgment.rfind("}") + 1
        raw_judgment = raw_judgment[start:end]

    try:
        data = json.loads(raw_judgment)
    except json.JSONDecodeError:
        print(f"Warning: Failed to parse judgment: {raw_judgment[:100]}...")
        return {real_model: fallback for real_model in anonymous_mapping.values()}

    # Valid JSON that is not an object (a list, string, number, ...) has no
    # per-model entries to read; treat it like an unparseable reply.
    if not isinstance(data, dict):
        print(f"Warning: Judgment is not a JSON object: {raw_judgment[:100]}...")
        return {real_model: fallback for real_model in anonymous_mapping.values()}

    endorsement_map = {}
    for anonymous_id, real_model in anonymous_mapping.items():
        val = data.get(anonymous_id, None)

        if method == 1:
            try:
                score = float(val)
            except (TypeError, ValueError, OverflowError):
                print(f"Warning: Invalid numeric score for {anonymous_id}: {val}")
                endorsement_map[real_model] = 5.0
                continue

            # NaN is the only float that differs from itself. Without this
            # check the clamp below would silently turn it into 10.0.
            if score != score:
                print(f"Warning: Invalid numeric score for {anonymous_id}: {val}")
                endorsement_map[real_model] = 5.0
                continue

            if not (1 <= score <= 10):
                print(f"Warning: Score {score} for {anonymous_id} out of range, clamping to [1,10]")
            endorsement_map[real_model] = max(1.0, min(10.0, score))
        else:
            if isinstance(val, str):
                val = val.lower().strip()
                if val == "upvote":
                    endorsement_map[real_model] = 1
                elif val == "downvote":
                    endorsement_map[real_model] = 0
                else:
                    print(f"Warning: Invalid vote value for {anonymous_id}: {val}")
                    endorsement_map[real_model] = 0
            else:
                print(f"Warning: Invalid vote type for {anonymous_id}: {type(val)}")
                endorsement_map[real_model] = 0

    return endorsement_map

def evaluate_responses(df):
    """
    Have every model judge the others' responses and rank them with PageRank.

    For each row of `df` (a `prompt` column plus one `response_<model>` column
    per model) every model in `MODEL_NAMES` acts as judge for the remaining
    models, optionally a random subset of them. Responses are shown to the
    judge under anonymous labels. Each rating is added to the weight of the
    edge judge -> rated model in a directed endorsement graph.

    Side effects: writes `endorsement_graph.gml`,
    `evaluations_with_mapping.csv` and `rankings.json` to the working
    directory.

    Returns:
        Tuple `(G, ranked_models)`: the endorsement graph and a list of
        `(model_name, pagerank_score)` pairs sorted by descending score.
    """
    G = nx.DiGraph()
    G.add_nodes_from(MODEL_NAMES)

    # Ratings are collected as plain dicts and turned into a DataFrame once
    # at the end; growing a DataFrame with pd.concat per row is quadratic.
    evaluation_columns = [
        'prompt',
        'judge_model',
        'rated_model_anonymous',
        'rated_model_real',
        'rating',
        'method'
    ]
    evaluation_rows = []

    for _, row in df.iterrows():
        prompt = row["prompt"]
        model_responses = {m: row.get(f"response_{m}", "No response") for m in MODEL_NAMES}

        for judge_model in MODEL_NAMES:
            # A model never judges its own response.
            other_models = [m for m in MODEL_NAMES if m != judge_model]

            if USE_SUBSET_EVALUATION and len(other_models) > EVALUATORS_SUBSET_SIZE:
                other_models = random.sample(other_models, EVALUATORS_SUBSET_SIZE)

            if not other_models:
                continue

            evaluation_prompt, model_to_anonymous = build_evaluation_prompt(
                EVALUATION_METHOD, prompt, judge_model, model_responses, other_models
            )
            anonymous_to_model = {v: k for k, v in model_to_anonymous.items()}

            raw_judgment = query_model(evaluation_prompt, judge_model)
            parsed_judgments = parse_evaluation_output(
                EVALUATION_METHOD, raw_judgment, anonymous_to_model
            )

            # Store evaluations with both anonymous and real identifiers
            for rated_model, endorsement_val in parsed_judgments.items():
                evaluation_rows.append({
                    'prompt': prompt,
                    'judge_model': judge_model,
                    'rated_model_anonymous': model_to_anonymous[rated_model],
                    'rated_model_real': rated_model,
                    'rating': endorsement_val,
                    'method': EVALUATION_METHOD
                })

                # Accumulate endorsements across prompts on a single edge.
                if G.has_edge(judge_model, rated_model):
                    G[judge_model][rated_model]["weight"] += endorsement_val
                else:
                    G.add_edge(judge_model, rated_model, weight=endorsement_val)

    # Passing `columns` keeps the header intact even when no rating was collected.
    evaluations_df = pd.DataFrame(evaluation_rows, columns=evaluation_columns)

    # Compute PageRank
    pagerank_scores = nx.pagerank(G, weight="weight")
    ranked_models = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)

    # Save all data
    nx.write_gml(G, "endorsement_graph.gml")
    print("Saved endorsement graph to endorsement_graph.gml")

    evaluations_df.to_csv("evaluations_with_mapping.csv", index=False)
    print("Saved detailed evaluations with mappings to evaluations_with_mapping.csv")

    with open("rankings.json", "w", encoding="utf-8") as f:
        json.dump({
            "rankings": ranked_models,
            "metadata": {
                "evaluation_method": EVALUATION_METHOD,
                "use_subset_evaluation": USE_SUBSET_EVALUATION,
                "evaluators_subset_size": EVALUATORS_SUBSET_SIZE,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
            }
        }, f, indent=4)
    print("Saved rankings to rankings.json")

    return G, ranked_models
##############################################################################
# 4. Main
##############################################################################

if __name__ == "__main__":
    # prompts.csv must provide a "Questions" column holding the prompts.
    prompts_df = pd.read_csv("prompts.csv")
    if prompts_df.empty:
        # Fail with a readable message instead of an IndexError from .iloc[0].
        raise SystemExit("prompts.csv contains no prompts in its 'Questions' column")

    # Only the first prompt is evaluated. To run the whole file, use
    # prompts_df["Questions"].tolist() instead.
    raw_prompts = [prompts_df["Questions"].iloc[0]]

    # Gather responses for each model
    df_responses = gather_all_model_responses(raw_prompts)

    # Evaluate responses and compute rankings
    G, ranked = evaluate_responses(df_responses)

    print("\n=== PageRank Scores ===")
    for model, score in ranked:
        print(f"{model}: {score:.4f}")
