# In [None]:
"""
Introducing SlopRank
"""

import pandas as pd
import time
from dotenv import load_dotenv
import llm
import networkx as nx
import json
import random

# Load variables from a .env file into the environment before any model is
# looked up below (presumably provider API keys -- confirm against deployment).
load_dotenv()

##############################################################################
# 1. Configuration
##############################################################################

# Identifiers of the models that both answer the prompts and judge each other.
# Each name is also used to build the "response_<name>" column and the
# "responses_<name>.csv" file name.
MODEL_NAMES = [
    "gemini-2.0-flash-thinking-exp-1219",
    "gemini-exp-1206",
    "claude-3-5-sonnet-latest",
    "claude-3-opus-latest",
    "o1-preview",
    "gpt-4o",
    "deepseek-chat"
]

# Model handles resolved once at import time, keyed by model name.
model_objects = {m: llm.get_model(m) for m in MODEL_NAMES}

# Judging scheme: 1 = numeric rating from 1 to 10, any other value = Upvote/Downvote.
EVALUATION_METHOD = 1
# When True, each judge rates only a random sample of the other models.
USE_SUBSET_EVALUATION = False
# Number of other models sampled per judge when USE_SUBSET_EVALUATION is True.
EVALUATORS_SUBSET_SIZE = 2

##############################################################################
# 2. Prompting functions
##############################################################################

def query_model(prompt, model_name):
    """
    Run `prompt` through the preloaded model registered under `model_name`
    in `model_objects` and return the text of its reply.

    Raises KeyError if `model_name` is not a key of `model_objects`.
    """
    model = model_objects[model_name]
    reply = model.prompt(prompt)
    return reply.text()

def query_model_all(df, model_name):
    """
    Ask `model_name` every prompt in `df` and store the answers.

    The answers are written into a new "response_<model_name>" column of `df`
    (the frame is modified in place and also returned), and that single column
    is dumped to "responses_<model_name>.csv" in the working directory.

    NOTE(review): prompts are stripped and lowercased before being sent, while
    the "prompt" column itself keeps the original text -- confirm this
    normalisation is intended.
    """
    t0 = time.time()
    response_column = f"response_{model_name}"

    def _ask(text):
        return query_model(text, model_name)

    normalized = df["prompt"].str.strip().str.lower()
    df[response_column] = normalized.map(_ask)
    print(f"{model_name} processing time: {time.time() - t0:.2f}s")

    # Persist only this model's answers so a partial run still leaves output.
    only_responses = df[[response_column]]
    only_responses.to_csv(f"responses_{model_name}.csv", index=False)
    print(f"Saved responses for {model_name} to responses_{model_name}.csv")

    return df

def gather_all_model_responses(raw_prompts):
    """
    Build a DataFrame with one row per prompt and one response column per
    model in MODEL_NAMES.

    Models are queried one after another; each call to `query_model_all`
    writes that model's answers to disk before the next model starts.
    """
    responses = pd.DataFrame({"prompt": raw_prompts})
    for model_name in MODEL_NAMES:
        responses = query_model_all(responses, model_name)
    return responses

##############################################################################
# 3. Evaluate the responses
##############################################################################

def evaluate_responses(df):
    """
    Have every model judge the other models' answers and rank them with PageRank.

    For each row of `df` and each judge, an evaluation prompt is built, sent to
    the judge, and the parsed scores are accumulated as edge weights
    judge -> rated model in a directed graph. The graph is written to
    "endorsement_graph.gml" and the ranking to "rankings.json".

    Returns a tuple (graph, ranked_models) where ranked_models is a list of
    (model_name, pagerank_score) pairs sorted from highest to lowest score.
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(MODEL_NAMES)

    for _, record in df.iterrows():
        question = record["prompt"]
        # A missing response column falls back to a placeholder string.
        answers = {
            name: record.get(f"response_{name}", "No response")
            for name in MODEL_NAMES
        }

        for judge in MODEL_NAMES:
            # A judge never rates its own answer.
            candidates = [name for name in MODEL_NAMES if name != judge]

            too_many = len(candidates) > EVALUATORS_SUBSET_SIZE
            if USE_SUBSET_EVALUATION and too_many:
                candidates = random.sample(candidates, EVALUATORS_SUBSET_SIZE)

            if not candidates:
                continue

            meta_prompt = build_evaluation_prompt(
                EVALUATION_METHOD, question, judge, answers, candidates
            )
            verdict_text = query_model(meta_prompt, judge)
            scores = parse_evaluation_output(
                EVALUATION_METHOD, verdict_text, candidates
            )

            # Accumulate endorsements across prompts on a single edge per pair.
            for target, value in scores.items():
                if not graph.has_edge(judge, target):
                    graph.add_edge(judge, target, weight=value)
                    continue
                graph[judge][target]["weight"] += value

    # Rank models by weighted PageRank over the endorsement graph.
    scores_by_model = nx.pagerank(graph, weight="weight")
    ranking = list(scores_by_model.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)

    # Persist the graph and the ranking.
    nx.write_gml(graph, "endorsement_graph.gml")
    print("Saved endorsement graph to endorsement_graph.gml")

    with open("rankings.json", "w") as f:
        json.dump(ranking, f, indent=4)
    print("Saved rankings to rankings.json")

    return graph, ranking

def build_evaluation_prompt(method, prompt, judge_model, model_responses, other_models):
    """
    Compose the meta-prompt asking `judge_model` to assess other models' answers.

    `method` 1 requests a 1-10 rating per model; any other value requests an
    "Upvote"/"Downvote" per model. Only the answers of `other_models` (looked
    up in `model_responses`) are shown. Returns the prompt text with no
    leading or trailing whitespace.
    """
    answers_section = "\n".join(
        f"**Model {candidate}**: {model_responses[candidate]}"
        for candidate in other_models
    )

    # The task description is the only part that differs between the methods.
    if method == 1:
        task = """Give each model a rating from 1 to 10, strictly in JSON format with no extra text.
Example:
{
    "modelA": 8,
    "modelB": 5
}"""
    else:
        task = """Simply say "Upvote" or "Downvote" for each model, in JSON format, no extra text.
Example:
{
    "modelA": "Upvote",
    "modelB": "Downvote"
}"""

    preamble = f"""You are {judge_model}. You see the following question:
PROMPT: "{prompt}"

Here are other models' answers:

{answers_section}

"""
    return (preamble + task).strip()

def _extract_judgment_dict(raw_judgment):
    """
    Best-effort decode of a judge's reply into a dict; returns None on failure.

    The reply is first parsed as-is. If that does not yield a JSON object, the
    text between the first "{" and the last "}" is tried, which covers replies
    wrapped in Markdown code fences or surrounded by extra prose.
    """
    attempts = [raw_judgment]
    if isinstance(raw_judgment, str):
        start = raw_judgment.find("{")
        end = raw_judgment.rfind("}")
        if 0 <= start < end:
            attempts.append(raw_judgment[start:end + 1])

    for text in attempts:
        try:
            data = json.loads(text)
        except (TypeError, ValueError, RecursionError):
            # TypeError: reply is not str/bytes; ValueError: malformed JSON.
            continue
        if isinstance(data, dict):
            return data
    return None


def parse_evaluation_output(method, raw_judgment, other_models):
    """
    Parse the raw text from the judge model into a dict of endorsements.

    Parameters:
        method: 1 for numeric ratings; any other value for Upvote/Downvote.
        raw_judgment: the judge's reply, expected to contain a JSON object
            mapping model names to ratings.
        other_models: names of the models the judge was asked to rate.

    Returns:
        A dict with exactly one entry per name in `other_models`.
        For method 1 the value is a float clamped to [1.0, 10.0]; a missing,
        non-numeric or NaN rating becomes the neutral 5.0.
        Otherwise the value is 1 for "upvote" (case/whitespace-insensitive)
        and 0 for anything else.
        If no JSON object can be extracted from the reply, every model gets
        the neutral fallback (5.0 for method 1, 0 otherwise).
    """
    numeric = method == 1
    fallback = 5.0 if numeric else 0

    data = _extract_judgment_dict(raw_judgment)
    if data is None:
        return {m: fallback for m in other_models}

    endorsement_map = {}
    for m in other_models:
        val = data.get(m)
        if numeric:
            try:
                score = float(val)
            except (TypeError, ValueError, OverflowError):
                endorsement_map[m] = fallback
                continue
            # NaN compares unequal to itself; clamping it would silently
            # produce 10.0, so treat it as an unusable rating instead.
            if score != score:
                endorsement_map[m] = fallback
            else:
                endorsement_map[m] = max(1.0, min(10.0, score))
        else:
            is_upvote = isinstance(val, str) and val.strip().lower() == "upvote"
            endorsement_map[m] = 1 if is_upvote else 0

    return endorsement_map

##############################################################################
# 4. Main
##############################################################################

# Script entry point: read the prompts, collect every model's answers, have the
# models judge each other, then print the resulting ranking.
if __name__ == "__main__":
    # Input file "prompts.csv" is read from the working directory and must
    # contain a column named "Questions" (a KeyError is raised otherwise).
    prompts_df = pd.read_csv("prompts.csv")
    raw_prompts = prompts_df["Questions"].tolist()
    # Alternative for a quick smoke run, using only the first prompt:
    # raw_prompts = [prompts_df["Questions"].iloc[0]]

    # Gather responses for each model (also writes one responses_<model>.csv per model)
    df_responses = gather_all_model_responses(raw_prompts)

    # Evaluate responses and compute rankings (also writes endorsement_graph.gml
    # and rankings.json)
    G, ranked = evaluate_responses(df_responses)

    # `ranked` is already sorted from highest to lowest PageRank score.
    print("\n=== PageRank Scores ===")
    for model, score in ranked:
        print(f"{model}: {score:.4f}")
