In [None]:
# -*- coding: utf-8 -*-
"""
score_model.py — step-level scorer (OpenRouter, no CLI flags)

- 扫描 ROOT_DIR 中 {model}_{step}_{kind}.{ext} 命名的文件：
  kinds ∈ {global.png, local.png, prompt.txt, output.txt}（忽略 fpv）
- 解析 output.txt（从混合文本里抽取 JSON；不合法则跳过该 step）
- 调用 OpenRouter 上的 gpt-4o 按 6 个维度打分（0–10；未体现为 -1；collaboration 强制 -1）
- 输出到 OUT_DIR/{model}_scores.jsonl（每行一个 step 的记录）
- 任何解析失败会把原始 LLM 响应落盘 OUT_DIR/_raw/ 便于排查

依赖：
  - llm/score_model.py 里的 BaseModel（HTTP/重试/超时等）
  - pip: pillow numpy openai
环境变量：
  - OPENROUTER_KEY（推荐）
"""

from __future__ import annotations
import os, re, json, uuid, logging
from typing import Dict, Tuple, List, Any, Optional
from PIL import Image
import sys

# === Paths: adjust these to match your local project checkout ===
SIMWORLD_DIR      = r"D:\BaiduNetdiskDownload\Food-Delivery-Bench-2.0-iso\SimWorld"
LLM_DELIVERY_DIR  = r"D:\BaiduNetdiskDownload\Food-Delivery-Bench-2.0-iso\LLM-Delivery"
# Both directories are put at the front of sys.path so the project-local import below resolves.
sys.path.insert(0, SIMWORLD_DIR); sys.path.insert(0, LLM_DELIVERY_DIR)
from llm.score_model import BaseModel  # reuse the existing BaseModel (per the author: handles HTTP/retries/timeouts)

# =========================
# CONFIG（集中修改）
# =========================
ROOT_DIR = r"D:\BaiduNetdiskDownload\Food-Delivery-Bench-2.0-iso\LLM-Delivery\Scripts\debug_snaps\medium-20"
OUT_DIR  = r"D:\BaiduNetdiskDownload\Food-Delivery-Bench-2.0-iso\LLM-Delivery\Scripts\debug_snaps\medium-20-scores"

# OpenRouter settings (OpenAI-SDK-compatible endpoint)
BASE_URL = "https://openrouter.ai/api/v1"  # noted by the author as the more stable host
# SECURITY: the key must come from the environment only. A key literal used to
# be committed here as the getenv() default; that key has to be treated as
# leaked and revoked. With an empty default, main() fails fast when
# OPENROUTER_KEY is not set instead of silently using a baked-in secret.
API_KEY  = os.getenv("OPENROUTER_KEY", "")
SCORER_MODEL = "openai/gpt-4o"     # can be switched to e.g. openai/gpt-4o-mini
RATE_LIMIT_PER_MIN = 30            # simple request throttle (requests per minute)

# Filename parser for names shaped like {model}_{step}_{kind}.{ext}.
# - model: lazy match, i.e. the shortest prefix for which the rest of the name
#          still matches "_<digits>_<kind>.<ext>" (the pattern is anchored with ^ and $)
# - step:  one or more digits
# - kind:  one of fpv / global / local / prompt / output
# - ext:   png or txt
# NOTE: kind and ext are matched independently, so a name such as
# "m_1_global.txt" also matches. Matching is case-insensitive.
FNAME_RE = re.compile(
    r"^(?P<model>.+?)_(?P<step>\d+)_(?P<kind>fpv|global|local|prompt|output)\.(?P<ext>png|txt)$",
    re.IGNORECASE
)

# =========================
# 工具函数：宽松 JSON 提取/清洗
# =========================
def _strip_code_fences(s: str) -> str:
    s = s.strip()
    s = re.sub(r"^\s*```(?:json)?\s*", "", s, flags=re.IGNORECASE)
    s = re.sub(r"\s*```\s*$", "", s)
    return s.strip()

def _extract_json_loose(s: str) -> dict:
    """
    从任意 LLM 文本里尽力抽取一个 JSON 对象：
    - 去掉 ```json/``` 包裹
    - 先尝试整体 loads；失败再截取最外层 { ... } 子串；再失败枚举起始花括号
    - 失败则抛出 ValueError
    """
    if not isinstance(s, str):
        raise ValueError("LLM response is not a string")

    s = _strip_code_fences(s)

    # 快速路径：直接尝试 loads
    try:
        obj = json.loads(s)
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass

    # 宽松路径：找第一个 '{' 到最后一个 '}' 的子串再 loads
    try:
        i = s.index("{")
        j = s.rindex("}") + 1
        obj = json.loads(s[i:j])
        if isinstance(obj, dict):
            return obj
    except Exception:
        pass

    # 再宽松：找所有可能的 { ... } 片段，逐个尝试
    braces = [m.start() for m in re.finditer(r"\{", s)]
    for i in braces:
        try:
            sub = s[i:s.rindex("}")+1]
            obj = json.loads(sub)
            if isinstance(obj, dict):
                return obj
        except Exception:
            continue

    # 全部失败
    raise ValueError("Cannot extract valid JSON from LLM response")

# =========================
# 提示词
# =========================
def build_system_prompt() -> str:
    """
    Return the fixed system prompt used for the step-level scorer.

    The prompt is concept-first, gives non-exhaustive examples, invites the
    evaluator to look for additional signals on its own, asks for a
    counterfactual comparison, and asks for omission-focused reasons.
    It demands strict JSON output (no markdown / code fences).
    """
    return (
        "You are a meticulous step-level evaluator for a SINGLE-AGENT delivery task.\n"
        "You will receive for THIS step: (1) GLOBAL MAP image, (2) LOCAL MAP image, and "
        "(3) TEXT containing the agent's prompt (state/rules/history) plus the agent's JSON output for THIS step.\n\n"

        "GENERAL PRINCIPLES (read carefully):\n"
        "- Each dimension is a CONCEPT (e.g., risk = aggressiveness vs conservatism). The examples below are NON-EXHAUSTIVE.\n"
        "- First understand the concept definition, then look for ANY reasonable signals in THIS step that align with the concept—not only the examples listed.\n"
        "- If THIS step provides no clear evidence for a dimension, return -1 for that dimension (do NOT guess).\n"
        "- Reasons must be SHORT (1–2 concise clauses), SPECIFIC, and should emphasize OMISSIONS/RISKS as much as positives.\n"
        "- Always apply a light COUNTERFACTUAL check: what would a more aggressive/cautious, longer-term, more diverse, more meticulous, or more adaptive plan look like in THIS state?\n"
        "- Use the full 0–10 range (integers). Avoid only {0,5,10}. Penalize shallow planning and missing safety/constraint handling.\n"
        "- Do not hedge to the mid-range: if THIS step is clearly exemplary on a dimension with specific, step-grounded evidence, assign 9–10; if it clearly neglects or endangers the dimension, assign 0–2.\n"
        "- Use -1 only when there is truly no evidence in THIS step.\n"
        "- STRICT JSON ONLY in your final output (no markdown, no code fences, no extra prose).\n\n"

        "SCORING RUBRIC (concept → anchors → typical/non-exhaustive signals; consider ANY consistent signals in THIS step):\n"

        "1) risk (aggressiveness vs conservatism):\n"
        "   Concept: how bold vs cautious the decision is, given time windows, geography, agent energy/scooter battery, and payoff tradeoffs.\n"
        "   Anchors: 0=extremely cautious; 3=cautious; 5=balanced; 7=aggressive; 10=extremely aggressive.\n"
        "   Aggressive (high) signals (NON-EXHAUSTIVE):\n"
        "     • Accepts many orders at once, especially with tight/conflicting ETAs.\n"
        "     • Prefers far/high-payout routes even if sequencing/charging is risky.\n"
        "     • Proceeds while energy/battery is low, betting it will suffice; skips mitigations (e.g., charging) to save time.\n"
        "   Conservative (low) signals (NON-EXHAUSTIVE):\n"
        "     • Takes one safe order at a time; proactively charges/rests before urgent.\n"
        "   Counterfactual prompt: What clearly safer/bolder option was available right now?\n\n"

        "2) long_term (foresight & chaining):\n"
        "   Concept: multi-step foresight, temporal/geographic chaining, and strategic investments shown in THIS step.\n"
        "   Anchors: 0=myopic; 3=weak foresight; 5=moderate; 7=strong; 10=exceptional.\n"
        "   Positive signals (NON-EXHAUSTIVE):\n"
        "     • Sequences pickups/dropoffs by proximity and time windows to minimize detours.\n"
        "     • Buys/uses tools (energy drink, battery pack) as strategic investment; schedules a short charge near future POIs.\n"
        "     • Avoids routes/areas that will complicate later charging or sequencing.\n"
        "   Anti-signals (NON-EXHAUSTIVE):\n"
        "     • Selects orders without considering next-stop geography or charger availability; only maximizes immediate payout.\n"
        "   Counterfactual prompt: What chaining or investment now would reduce future risk/cost/delay?\n\n"

        "3) diversity (strategy variety beyond routine):\n"
        "   Concept: use of non-routine tools/transports/mechanisms beyond the standard charge→accept→pickup→deliver loop.\n"
        "   Anchors: 0=routine only; 3=minor variation; 5=some variety; 7=clear variety; 10=rich toolkit.\n"
        "   Signals (NON-EXHAUSTIVE):\n"
        "     • Takes a bus; rents/returns a car; visits store to buy/USE tools (ice/heat packs, battery pack, energy drink).\n"
        "     • Purposeful detours that unlock speed/reliability later (e.g., staging, temp storage allowed by rules, etc.).\n"
        "   Counterfactual prompt: Which extra mechanism would reduce time/risk/cost now?\n\n"

        "4) collaboration:\n"
        "   ALWAYS -1 in this single-agent setting (explain as such). Ignore incidental mentions.\n\n"

        "5) meticulousness (operational detail & constraints):\n"
        "   Concept: care for perishables/temperature, drop-off method, fragility, movement side-effects, and special instructions.\n"
        "   Anchors: 0=careless; 3=partial; 5=several but with gaps; 7=thorough; 10=exemplary.\n"
        "   Checks when relevant (NON-EXHAUSTIVE):\n"
        "     • Ice-cream melting urgency handled; hot/cold separation; pungent items not mixed with drinks/desserts.\n"
        "     • Fragile items avoid aggressive movement; respects specified drop-off method (e.g., hand_to_customer with egocentric search).\n"
        "     • Honors time windows and customer instructions; anticipates temperature packs.\n"
        "   Counterfactual prompt: Which concrete mitigation (separation, pack, speed choice) was missed right now?\n\n"

        "6) adaptability (updates plan given new state/errors):\n"
        "   Concept: responsiveness to updated info (energy/battery/time windows/recent_error), re-ordering or switching resources.\n"
        "   Anchors: 0=rigid; 3=slight; 5=reasonable for one issue; 7=solid; 10=excellent across multiple issues.\n"
        "   Signals (NON-EXHAUSTIVE):\n"
        "     • Deviates from prior plan after noticing conflicts; charge-first on low battery; re-sequences deliveries; switches transport.\n"
        "   Counterfactual prompt: Which change (charge first, reorder, different transport) would better fit the new state?\n\n"

        "EXEMPLAR ILLUSTRATION (do NOT copy; evaluate THIS step only):\n"
        "  scores.risk = 8 because the agent accepted three tight-window orders and kept going despite low energy/battery; a safer plan would drop one order or charge first.\n"
        "  scores.long_term = 3 because the delivery sequence ignored spatial chaining and charger scarcity near #4; a better plan would deliver #6 first or pre-charge nearby.\n"
        "  scores.diversity = 2 because it mainly followed the routine; taking a bus or buying a battery pack would diversify strategy.\n"
        "  scores.collaboration = -1 (single-agent setup).\n"
        "  scores.meticulousness = 6 because it noticed ice-cream melting and special instructions, but mixed a pungent item with a drink; separation/pack usage was only partial.\n"
        "  scores.adaptability = 6 because it adjusted after noticing low battery (some plan change), though not fully optimized.\n\n"

        # === Inserted format-aligned example (demonstrates the format only; its content must not be copied) ===
        "EXAMPLE_OUTPUT (FORMAT-ALIGNED; CONTENT IS ILLUSTRATIVE AND MUST NOT BE COPIED VERBATIM):\n"
        "{\n"
        "  \"scores\": {\n"
        "    \"risk\": 8,\n"
        "    \"long_term\": 3,\n"
        "    \"diversity\": 2,\n"
        "    \"collaboration\": -1,\n"
        "    \"meticulousness\": 6,\n"
        "    \"adaptability\": 6\n"
        "  },\n"
        "  \"reasons\": {\n"
        "    \"risk\": \"Accepted three tight-window orders and continued despite low energy/battery; map distances imply limited buffer—safer plan was dropping one or charging first for reliability.\",\n"
        "    \"long_term\": \"Sequence ignored spatial chaining and charger scarcity near #4; delivering #6 earlier or pre-charging en route would reduce later detours and timing risk.\",\n"
        "    \"diversity\": \"Mostly routine pipeline (accept→pickup→deliver); no deliberate tool choice like battery pack/ice/heat or purposeful transport switch beyond a one-off bus ride.\",\n"
        "    \"collaboration\": \"Single-agent setup; collaboration not applicable.\",\n"
        "    \"meticulousness\": \"Acknowledged ice-cream melting and special instructions but co-located a pungent item with a drink and did not mention packs/separation explicitly—partial care with gaps.\",\n"
        "    \"adaptability\": \"Departed from earlier plan after noticing low battery (some resequencing), yet did not fully re-optimize around nearby chargers or tighter windows—moderate adaptation.\"\n"
        "  }\n"
        "}\n\n"

        "EVIDENCE & REASONS:\n"
        "- Ground each reason in THIS step’s specifics (accepted orders, map-implied distances, time windows, energy/battery %, drop-off method, melting/fragile risks, recent_error, post_action_plan vs current action).\n"
        "- Prefer omission-focused critique: what was missed (e.g., pre-charge near #4; separation of pungent items; honoring hand_to_customer search).\n\n"

        "OUTPUT FORMAT — STRICT JSON ONLY (no markdown, no code fences):\n"
        "{\n"
        '  \"scores\": {\n'
        '    \"risk\": int, \"long_term\": int, \"diversity\": int, \"collaboration\": int, \"meticulousness\": int, \"adaptability\": int\n'
        "  },\n"
        '  \"reasons\": {\n'
        '    \"risk\": \"short omission-focused reason grounded in THIS step\",\n'
        '    \"long_term\": \"short omission-focused reason grounded in THIS step\",\n'
        '    \"diversity\": \"short omission-focused reason grounded in THIS step\",\n'
        '    \"collaboration\": \"single-agent setting; not applicable\",\n'
        '    \"meticulousness\": \"short omission-focused reason grounded in THIS step\",\n'
        '    \"adaptability\": \"short omission-focused reason grounded in THIS step\"\n'
        "  }\n"
        "}\n"
    )

def build_user_block(prompt_text: str, output_json_str: str) -> str:
    """
    Build the user message for one step.

    The message has two headed sections: the agent's prompt, then the agent's
    JSON output for the step. Both inputs are stripped of surrounding
    whitespace before being embedded.
    """
    sections = [
        "### PROMPT (observation/rules/context)\n",
        prompt_text.strip(),
        "\n\n",
        "### MODEL_OUTPUT_JSON (for THIS step)\n",
        output_json_str.strip(),
        "\n",
    ]
    return "".join(sections)

# =========================
# I/O
# =========================
def open_image(path: str) -> Image.Image:
    """Load the image at *path* and return it as an independent RGB image.

    ``Image.open`` is lazy and keeps the underlying file open. The source
    image is therefore opened in a ``with`` block so the file handle is
    released as soon as the pixel data has been converted, instead of lingering
    until garbage collection (which matters when many steps are scored in one
    run).
    """
    with Image.open(path) as src:
        # convert() loads the pixel data and returns a new in-memory image,
        # so the result stays usable after the source file is closed.
        return src.convert("RGB")

def load_text(path: str) -> str:
    """Read the file at *path* as UTF-8 text and return its full contents."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def group_files(root: str) -> Dict[Tuple[str, int], Dict[str, str]]:
    """
    Index the files directly inside *root* by (model, step).

    Every directory entry whose name matches FNAME_RE contributes one item to
    the mapping (model, step) -> {kind: filepath}, where kind is lower-cased.
    Names that do not match are ignored. Callers need the kinds global, local,
    prompt and output; fpv entries are collected too but are not used.
    """
    grouped: Dict[Tuple[str, int], Dict[str, str]] = {}
    for entry in os.listdir(root):
        match = FNAME_RE.match(entry)
        if match is None:
            continue
        bucket = grouped.setdefault(
            (match.group("model"), int(match.group("step"))), {}
        )
        bucket[match.group("kind").lower()] = os.path.join(root, entry)
    return grouped

def parse_output_json(text: str) -> Optional[dict]:
    """
    Locate and parse the JSON object inside the contents of an output.txt.

    The text may mix prose with JSON. Two strategies are tried:
      1. parse the slice from the first ``{`` to the last ``}``;
      2. if that fails, scan every ``{`` from left to right and decode exactly
         one JSON value there, ignoring whatever follows. This recovers
         outputs where extra braces appear before or after the real object.

    Args:
        text: raw file contents.

    Returns:
        The parsed dict, or None when no JSON object can be found.
    """
    text = text.strip()
    first = text.find("{")
    last = text.rfind("}")
    if first == -1 or last < first:
        # No brace pair at all: nothing that could be a JSON object.
        return None

    try:
        obj = json.loads(text[first:last + 1])
    except (ValueError, RecursionError):
        obj = None
    if isinstance(obj, dict):
        return obj

    decoder = json.JSONDecoder()
    start = first
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            obj = None
        if isinstance(obj, dict):
            return obj
        start = text.find("{", start + 1)
    return None

# =========================
# 核心：打分（带宽松解析与落盘）
# =========================
def score_one_step(
    scorer: BaseModel,
    system_prompt: str,
    global_img_path: str,
    local_img_path: str,
    prompt_text: str,
    output_json_text: str,
) -> Optional[Dict[str, Any]]:
    """
    Score a single step with the LLM scorer.

    Opens the global and local map images, builds the user message from the
    prompt and the step's output JSON, calls ``scorer.generate`` and normalizes
    the reply.

    Args:
        scorer: model wrapper exposing ``generate(system=, user=, images=, ...)``.
        system_prompt: system prompt sent with the request.
        global_img_path: path of the global map image.
        local_img_path: path of the local map image.
        prompt_text: the agent's prompt for this step.
        output_json_text: the agent's output for this step, as a JSON string.

    Returns:
        ``{"scores": {...}, "reasons": {...}}`` holding exactly the six
        dimensions (scores are ints clamped to [-1, 10]; collaboration is
        always -1), or None when the reply cannot be parsed. In the latter
        case the raw reply is dumped under OUT_DIR/_raw for inspection.
        Exceptions raised by ``scorer.generate`` or by image loading propagate.
    """
    want_keys = ["risk", "long_term", "diversity", "collaboration", "meticulousness", "adaptability"]

    images = [open_image(global_img_path), open_image(local_img_path)]
    user_text = build_user_block(prompt_text, output_json_text)

    resp = scorer.generate(
        system=system_prompt,
        user=user_text,
        images=images,
        max_tokens=350,
        temperature=0.0,
        n=1,
    )

    # On failure, persist the raw reply so it can be audited later.
    def dump_raw(reason: str) -> None:
        try:
            raw_dir = os.path.join(OUT_DIR, "_raw")
            os.makedirs(raw_dir, exist_ok=True)
            fname = os.path.join(raw_dir, f"resp_{uuid.uuid4().hex[:8]}_{reason}.txt")
            with open(fname, "w", encoding="utf-8") as f:
                f.write(resp if isinstance(resp, str) else repr(resp))
            logging.error(f"[debug] dumped raw LLM response to: {fname}")
        except Exception as dump_err:
            # Dumping is best-effort, but a failure should be visible in the log.
            logging.warning(f"[debug] could not dump raw LLM response: {dump_err}")

    try:
        data = _extract_json_loose(resp)  # tolerant parsing

        # Structural tolerance: accept capitalized keys and a missing "reasons".
        scores = data.get("scores") or data.get("Scores")
        reasons = data.get("reasons") or data.get("Reasons") or {}

        # If there is no usable "scores" object, the model may have put the
        # dimensions directly at the top level; pick up scalar values from
        # there (they are still filtered through want_keys below).
        # The original `... or {}` default made this fallback unreachable
        # whenever "scores" was simply missing.
        if not isinstance(scores, dict) or not scores:
            scores = {k: v for k, v in data.items() if isinstance(v, (int, float, str))}
        if not isinstance(reasons, dict):
            reasons = {}

        # Collaboration is pinned to -1 regardless of what the model returned.
        scores["collaboration"] = -1
        reasons["collaboration"] = reasons.get(
            "collaboration",
            "Single-agent setting: collaboration not applicable."
        )

        clean_scores: Dict[str, int] = {}
        clean_reasons: Dict[str, str] = {}

        for k in want_keys:
            raw_v = scores.get(k, -1)
            try:
                v = int(raw_v)
            except Exception:
                # Rescue values such as "7/10" or "8."
                try:
                    v = int(float(str(raw_v).replace("/10", "").strip()))
                except Exception:
                    v = -1
            v = max(-1, min(10, v))
            clean_scores[k] = v

            r = reasons.get(k, "")
            if not isinstance(r, str) or not r.strip():
                r = "No specific evidence found in this step." if v == -1 else "Reason not provided."
            r = _strip_code_fences(r).replace("\n", " ").strip()
            clean_reasons[k] = r

        return {"scores": clean_scores, "reasons": clean_reasons}

    except Exception as e:
        logging.error(f"Scorer response not parseable: {e}")
        dump_raw("parse_fail")
        return None

# =========================
# 主流程
# =========================
def main():
    """
    Score every complete step found in ROOT_DIR and write one JSONL per model.

    For each model, steps are processed in ascending order. A step is skipped
    (with a log line) when one of its global/local/prompt/output files is
    missing, when the prompt/output text cannot be read, when the output does
    not contain a JSON object, or when scoring returns None. Results go to
    OUT_DIR/{model}_scores.jsonl, one record per scored step; the file is
    opened in "w" mode, so an existing file is overwritten.

    Raises:
        RuntimeError: if no API key is configured.
    """
    if not API_KEY:
        raise RuntimeError("OPENROUTER_KEY not found. Please set it in your environment.")

    os.makedirs(OUT_DIR, exist_ok=True)
    logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s: %(message)s")
    # Never write key material to the log, not even a prefix.
    logging.info(f"base_url: {BASE_URL}, api_key: <set, {len(API_KEY)} chars>")

    # Build the LLM scorer (routed through OpenRouter).
    scorer = BaseModel(
        url=BASE_URL,
        api_key=API_KEY,
        model=SCORER_MODEL,
        max_tokens=256,
        temperature=0.0,
        top_p=1.0,
        rate_limit_per_min=RATE_LIMIT_PER_MIN,
        supports_vision=True,
        # If your BaseModel supports timeouts/headers they can be added here:
        # http_timeout_s=60.0, referer="https://your.site", app_title="DeliveryBench-Scorer",
    )
    system_prompt = build_system_prompt()

    groups = group_files(ROOT_DIR)
    per_model: Dict[str, List[Tuple[int, Dict[str, str]]]] = {}
    for (model, step), files in groups.items():
        per_model.setdefault(model, []).append((step, files))

    for model, items in per_model.items():
        items.sort(key=lambda x: x[0])
        out_path = os.path.join(OUT_DIR, f"{model}_scores.jsonl")
        written = 0
        with open(out_path, "w", encoding="utf-8") as fout:
            for step, files in items:
                # All four artifacts are required: global/local/prompt/output.
                if not all(k in files for k in ("global", "local", "prompt", "output")):
                    logging.info(f"[skip] {model} step {step}: missing one of global/local/prompt/output")
                    continue

                # An unreadable or non-UTF-8 text file should cost one step,
                # not the whole run.
                try:
                    prompt_text = load_text(files["prompt"])
                    output_text = load_text(files["output"])
                except (OSError, UnicodeDecodeError) as read_err:
                    logging.error(f"[skip] {model} step {step}: cannot read prompt/output text: {read_err}")
                    continue

                output_obj = parse_output_json(output_text)
                if output_obj is None:
                    logging.info(f"[skip] {model} step {step}: output not valid JSON")
                    continue

                result = score_one_step(
                    scorer=scorer,
                    system_prompt=system_prompt,
                    global_img_path=files["global"],
                    local_img_path=files["local"],
                    prompt_text=prompt_text,
                    output_json_text=json.dumps(output_obj, ensure_ascii=False),
                )
                if result is None:
                    logging.info(f"[skip] {model} step {step}: scoring failed")
                    continue

                scores = result["scores"]
                reasons = result["reasons"]

                # Short log line: the reason for each scored dimension.
                reasons_short = {k: reasons[k] for k in scores}
                logging.info(f"[score] {model} step {step}: {scores} | reasons: {reasons_short}")

                rec = {
                    "model": model,
                    "step": step,
                    "scores": scores,
                    "reasons": reasons,
                    "files": {
                        "global": os.path.basename(files["global"]),
                        "local": os.path.basename(files["local"]),
                        "prompt": os.path.basename(files["prompt"]),
                        "output": os.path.basename(files["output"]),
                    },
                }
                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
                written += 1

        logging.info(f"[done] {model}: wrote {written} lines -> {out_path}")

if __name__ == "__main__":
    main()


[2025-11-01 13:47:05,577] INFO: base_url: https://openrouter.ai/api/v1, api_key: sk-or-v1-87d...


base_url: https://openrouter.ai/api/v1, api_key: [REDACTED — the key printed here was exposed and must be revoked]


[2025-11-01 13:47:05,948] INFO: [skip] anthropic-claude-3.7-sonnet step 0: missing one of global/local/prompt/output
