# In [None]:
"""
Inimitable SlopRank
"""

import json
import math
import os
import random
import time

import llm
import networkx as nx
import pandas as pd
from dotenv import load_dotenv

# Runs before any model handle is created below.
load_dotenv()

##############################################################################
# 1. Configuration
##############################################################################

# Identifiers of every participating model; each one both answers the prompts
# and judges the answers of the others.
MODEL_NAMES = [
    "gemini-2.0-flash-thinking-exp-1219",
    "gemini-exp-1206",
    "claude-3-5-sonnet-latest",
    "claude-3-opus-latest",
    "o1-preview",
    "gpt-4o",
    "deepseek-chat",
]

# Model handles resolved once at import time, keyed by model identifier.
model_objects = dict(zip(MODEL_NAMES, map(llm.get_model, MODEL_NAMES)))

# Judging scheme: 1 = numeric score from 1 to 10, 2 = Upvote/Downvote.
EVALUATION_METHOD = 1
# When True, each judge rates only a random sample of the other models.
USE_SUBSET_EVALUATION = True
# How many models each judge rates when subset evaluation is enabled.
EVALUATORS_SUBSET_SIZE = 3

##############################################################################
# 2. Prompting functions
##############################################################################

def query_model(prompt, model_name):
    """
    Run `prompt` against the model registered under `model_name`.

    The model handle is looked up in the module-level `model_objects`
    registry (a `KeyError` propagates for an unknown name), its `prompt`
    method is called, and the text of the reply is returned.
    """
    model = model_objects[model_name]
    reply = model.prompt(prompt)
    return reply.text()

def query_model_all(df, model_name):
    """
    Ask one model every prompt in `df` and persist its answers.

    The prompts are stripped and lower-cased before being sent. The answers
    are stored in-place in a new `response_<model_name>` column of `df` and
    also written, as a single-column CSV, to
    `responses/responses_<model_name>.csv`. The same `df` is returned.
    """
    started = time.time()
    column = f"response_{model_name}"

    def ask(text):
        # Bind the model name so the Series can be mapped element by element.
        return query_model(text, model_name)

    normalized_prompts = df["prompt"].str.strip().str.lower()
    df[column] = normalized_prompts.map(ask)

    elapsed = time.time() - started
    print(f"{model_name} processing time: {elapsed:.2f}s")

    # The output directory is created on demand so the first run works too.
    os.makedirs("responses", exist_ok=True)

    # One CSV per model, holding only that model's response column.
    destination = f"responses/responses_{model_name}.csv"
    df[[column]].to_csv(destination, index=False)
    print(f"Saved responses for {model_name} to {destination}")

    return df

def gather_all_model_responses(raw_prompts):
    """
    Collect every model's answer to every prompt.

    Starts from a DataFrame with a single "prompt" column and lets
    `query_model_all` add one response column per entry of `MODEL_NAMES`
    (each call also saves that model's answers to disk). Returns the
    resulting DataFrame.
    """
    responses = pd.DataFrame({"prompt": raw_prompts})
    for model_name in MODEL_NAMES:
        responses = query_model_all(responses, model_name)
    return responses

##############################################################################
# 3. Evaluate the responses
##############################################################################

def create_evaluation_mapping(other_models):
    """
    Assign each model an anonymous label of the form "Model_<n>".

    Labels are numbered from 1 in the order the models appear in
    `other_models`. Returns a pair of dicts: real name -> label, and
    label -> real name.
    """
    model_to_anonymous = {}
    for position, model in enumerate(other_models, start=1):
        model_to_anonymous[model] = f"Model_{position}"

    # The reverse lookup is derived from the forward one so the two always agree.
    anonymous_to_model = dict(zip(model_to_anonymous.values(), model_to_anonymous))
    return model_to_anonymous, anonymous_to_model

def build_evaluation_prompt(method, prompt, judge_model, model_responses, other_models):
    """
    Build the meta-prompt that asks a judge to rate anonymized responses.

    Args:
        method: 1 asks for numeric 1-10 ratings; any other value asks for
            "Upvote"/"Downvote" votes.
        prompt: The original question the responses were written for.
        judge_model: Name of the judging model. Currently unused; kept so the
            call signature stays stable.
        model_responses: Dict mapping real model names to response text.
        other_models: Real names of the models whose responses are rated.

    Returns:
        Tuple `(instructions, model_to_anonymous)`: the stripped prompt text
        and the real-name -> "Model_<n>" mapping used inside it.
    """
    model_to_anonymous, _ = create_evaluation_mapping(other_models)

    answers_section = "\n".join(
        f"{model_to_anonymous[om]}:\n{model_responses[om]}\n---"
        for om in other_models
    )

    # The format example must list exactly the IDs being rated. A fixed
    # two-entry example combined with "EXACTLY IN THIS FORMAT" invites the
    # judge to drop every response beyond the second one.
    sample_values = ("8", "7") if method == 1 else ('"Upvote"', '"Downvote"')
    format_lines = ",\n".join(
        f'    "{anonymous_id}": {sample_values[i % len(sample_values)]}'
        for i, anonymous_id in enumerate(model_to_anonymous.values())
    )
    format_example = "{\n" + format_lines + "\n}"

    if method == 1:
        instructions = f"""IMPORTANT: You must ONLY return a JSON object with ratings. No explanation or additional text.

PROMPT TO EVALUATE:
"{prompt}"

RESPONSES TO RATE:
{answers_section}

RATING INSTRUCTIONS:
- Rate each response from 1 to 10
- Consider: accuracy, completeness, clarity, relevance, depth, usefulness
- 10: Exceptional, 8-9: Excellent, 6-7: Good, 4-5: Fair, 1-3: Poor

YOUR RESPONSE MUST BE EXACTLY IN THIS FORMAT:
{format_example}

DO NOT include any other text, explanations, or analysis. ONLY the JSON object above."""
    else:
        instructions = f"""IMPORTANT: You must ONLY return a JSON object with votes. No explanation or additional text.

PROMPT TO EVALUATE:
"{prompt}"

RESPONSES TO RATE:
{answers_section}

VOTING INSTRUCTIONS:
- "Upvote": helpful, accurate, effective response
- "Downvote": inadequate, incorrect, or poor response

YOUR RESPONSE MUST BE EXACTLY IN THIS FORMAT:
{format_example}

DO NOT include any other text, explanations, or analysis. ONLY the JSON object above."""

    return instructions.strip(), model_to_anonymous

def _coerce_score(anonymous_id, val):
    """Turn one raw JSON value into a score in [1.0, 10.0], or neutral 5.0 if it is not a number."""
    # bool is a subclass of int, so float(True) would silently become score 1.
    if isinstance(val, bool):
        print(f"Warning: Invalid numeric score for {anonymous_id}: {val}")
        return 5.0

    try:
        score = float(val)
    except OverflowError:
        # Raised for an integer too large for a float: it is a real number,
        # just far out of range, so let the clamping below handle it.
        score = math.inf if val > 0 else -math.inf
    except (TypeError, ValueError):
        print(f"Warning: Invalid numeric score for {anonymous_id}: {val}")
        return 5.0

    # json.loads accepts the literal NaN; min()/max() would turn it into 10.
    if math.isnan(score):
        print(f"Warning: Invalid numeric score for {anonymous_id}: {val}")
        return 5.0

    if not (1 <= score <= 10):
        print(f"Warning: Score {score} for {anonymous_id} out of range, clamping to [1,10]")
    return max(1.0, min(10.0, score))


def _coerce_vote(anonymous_id, val):
    """Turn one raw JSON value into 1 (upvote) or 0 (downvote or anything invalid)."""
    if not isinstance(val, str):
        print(f"Warning: Invalid vote type for {anonymous_id}: {type(val)}")
        return 0

    vote = val.lower().strip()
    if vote == "upvote":
        return 1
    if vote != "downvote":
        print(f"Warning: Invalid vote value for {anonymous_id}: {vote}, expected 'Upvote' or 'Downvote'")
    return 0


def parse_evaluation_output(method, raw_judgment, anonymous_mapping):
    """
    Parse a judge's raw reply into per-model endorsement values.

    Args:
        method: 1 for numeric 1-10 scores; any other value is treated as
            Upvote/Downvote voting.
        raw_judgment: Text returned by the judge. The span from the first
            "{" to the last "}" is parsed as JSON; a non-string value is
            handled like a reply without any JSON.
        anonymous_mapping: Dict mapping anonymous IDs (e.g. "Model_1") to
            real model names.

    Returns:
        Dict mapping every real model name in `anonymous_mapping` to its
        endorsement: a float in [1.0, 10.0] for method 1, otherwise 1 for an
        upvote and 0 for a downvote. Anything missing or unparseable gets the
        neutral fallback (5.0 for method 1, 0 otherwise); out-of-range numeric
        scores are clamped. Problems are reported with printed warnings,
        never raised.
    """
    neutral = 5.0 if method == 1 else 0
    text = raw_judgment.strip() if isinstance(raw_judgment, str) else ""

    try:
        # Judges often wrap the JSON in prose or code fences, so only the
        # outermost brace-delimited span is handed to the parser.
        start = text.find("{")
        end = text.rfind("}") + 1
        if start == -1 or end == 0:
            raise ValueError("No JSON object found in response")

        data = json.loads(text[start:end])
        if not isinstance(data, dict):
            raise ValueError("JSON payload is not an object")
    except (json.JSONDecodeError, ValueError) as e:
        print(f"Warning: Failed to parse judgment. Error: {str(e)}")
        print(f"Raw response: {str(raw_judgment)[:200]}...")
        # Return neutral fallback values for every model that was to be rated.
        return {real_model: neutral for real_model in anonymous_mapping.values()}

    endorsement_map = {}
    for anonymous_id, real_model in anonymous_mapping.items():
        val = data.get(anonymous_id)

        if val is None:
            print(f"Warning: Missing rating for {anonymous_id}")
            endorsement_map[real_model] = neutral
        elif method == 1:
            endorsement_map[real_model] = _coerce_score(anonymous_id, val)
        else:
            endorsement_map[real_model] = _coerce_vote(anonymous_id, val)

    return endorsement_map

def evaluate_responses(df):
    """
    Have every model judge the other models' responses and rank them with PageRank.

    For each row of `df` and each judge in `MODEL_NAMES`, the judge is asked
    to rate the other models' responses (a random sample of
    `EVALUATORS_SUBSET_SIZE` of them when `USE_SUBSET_EVALUATION` is on and
    there are more candidates than that). Every parsed rating, including the
    neutral fallbacks produced for unparseable replies, is added to the
    weight of the edge judge -> rated model, and `nx.pagerank` is run on the
    resulting weighted directed graph.

    Args:
        df: DataFrame with a "prompt" column and one "response_<model>"
            column per model. A missing response column is replaced by the
            text "No response".

    Returns:
        Tuple `(G, ranked_models)`: the endorsement graph and a list of
        `(model_name, pagerank_score)` pairs sorted by descending score.
        `ranked_models` is empty when no evaluation was collected.

    Side effects:
        Writes `responses/endorsement_graph.gml`,
        `responses/evaluations_with_mapping.csv` and `rankings.json`.
    """
    G = nx.DiGraph()
    G.add_nodes_from(MODEL_NAMES)

    evaluation_columns = [
        'prompt',
        'judge_model',
        'rated_model_anonymous',
        'rated_model_real',
        'rating',
        'method',
    ]
    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; growing a DataFrame with pd.concat per judge is quadratic.
    evaluation_rows = []

    for _, row in df.iterrows():
        prompt = row["prompt"]
        model_responses = {m: row.get(f"response_{m}", "No response") for m in MODEL_NAMES}

        for judge_model in MODEL_NAMES:
            # A model never judges its own response.
            other_models = [m for m in MODEL_NAMES if m != judge_model]

            # Use subset evaluation if enabled
            if USE_SUBSET_EVALUATION and len(other_models) > EVALUATORS_SUBSET_SIZE:
                other_models = random.sample(other_models, EVALUATORS_SUBSET_SIZE)

            # Nothing to rate (e.g. only one model configured).
            if not other_models:
                print(f"Skipping evaluation for prompt: {prompt} (insufficient valid responses)")
                continue

            evaluation_prompt, model_to_anonymous = build_evaluation_prompt(
                EVALUATION_METHOD, prompt, judge_model, model_responses, other_models
            )
            anonymous_to_model = {v: k for k, v in model_to_anonymous.items()}

            raw_judgment = query_model(evaluation_prompt, judge_model)
            parsed_judgments = parse_evaluation_output(
                EVALUATION_METHOD, raw_judgment, anonymous_to_model
            )

            for rated_model, endorsement_val in parsed_judgments.items():
                evaluation_rows.append({
                    'prompt': prompt,
                    'judge_model': judge_model,
                    'rated_model_anonymous': model_to_anonymous[rated_model],
                    'rated_model_real': rated_model,
                    'rating': endorsement_val,
                    'method': EVALUATION_METHOD,
                })

                # Accumulate endorsements across prompts on a single edge.
                if G.has_edge(judge_model, rated_model):
                    G[judge_model][rated_model]["weight"] += endorsement_val
                else:
                    G.add_edge(judge_model, rated_model, weight=endorsement_val)

    # This check has to come after the evaluation loop: performed on the
    # still-empty table it would make the function return before doing any work.
    if not evaluation_rows:
        print("No valid evaluations found. Skipping PageRank calculation.")
        return G, []

    evaluations_df = pd.DataFrame(evaluation_rows, columns=evaluation_columns)

    pagerank_scores = nx.pagerank(G, weight="weight")
    ranked_models = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)

    # The output directory may not exist when responses were not gathered in
    # the same run.
    os.makedirs("responses", exist_ok=True)

    graph_path = "responses/endorsement_graph.gml"
    nx.write_gml(G, graph_path)
    print(f"Saved endorsement graph to {graph_path}")

    evaluations_path = "responses/evaluations_with_mapping.csv"
    evaluations_df.to_csv(evaluations_path, index=False)
    print(f"Saved detailed evaluations with mappings to {evaluations_path}")

    with open("rankings.json", "w", encoding="utf-8") as f:
        json.dump({
            "rankings": ranked_models,
            "metadata": {
                "evaluation_method": EVALUATION_METHOD,
                "use_subset_evaluation": USE_SUBSET_EVALUATION,
                "evaluators_subset_size": EVALUATORS_SUBSET_SIZE,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
            }
        }, f, indent=4)
    print("Saved rankings to rankings.json")

    return G, ranked_models

##############################################################################
# 4. Main
##############################################################################

if __name__ == "__main__":
    # The prompt file is expected to expose its questions in a "Questions" column.
    prompts_df = pd.read_csv("prompts.csv")
    questions = prompts_df["Questions"]

    raw_prompts = questions.tolist()
    # Testing shortcut: the full list is immediately replaced by only the
    # second question (positional index 1).
    raw_prompts = [questions.iloc[1]]

    # Step 1: every model answers the selected prompts.
    df_responses = gather_all_model_responses(raw_prompts)

    # Step 2: the models judge each other and are ranked.
    G, ranked = evaluate_responses(df_responses)

    print("\n=== PageRank Scores ===")
    for model, score in ranked:
        print(f"{model}: {score:.4f}")
