In [1]:
!pip -q install gradio>=4.36.0 pandas reportlab

In [2]:
import os, io, json, traceback, datetime, re
import pathlib
from pathlib import Path
import pandas as pd
import gradio as gr

from openai import OpenAI
from typing import List, Dict, Any, Tuple

# --- PDF helpers (ReportLab) ---
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Preformatted

from xml.sax.saxutils import escape  # for safe Paragraph content

In [3]:
# Candidate generation
def generate_candidate_answer(task_prompt: str, *, candidate_model: str = "gpt-4o-mini") -> str:
    """Generate a sample response to ``task_prompt`` with the model under test.

    Relies on two notebook-level globals: ``client`` (must expose
    ``responses.create``) and, optionally, ``vs_id``. When ``vs_id`` is set
    and truthy, a ``file_search`` tool bound to that vector store is attached
    to the request; otherwise no ``tools`` argument is sent at all.

    Args:
        task_prompt: Task text appended after the fixed instruction preamble.
        candidate_model: Model name passed to the API.

    Returns:
        The response text: ``resp.output_text`` when the SDK object provides
        it, otherwise the concatenation of every text part found in
        ``resp.output`` (an empty string when there is none).
    """
    vs = globals().get("vs_id")

    # Only include 'tools' in the API call when a vector store is available.
    extra_kwargs = {"tools": [{"type": "file_search", "vector_store_ids": [vs]}]} if vs else {}

    resp = client.responses.create(
        model=candidate_model,
        input=[{
            "role": "user",
            "content": (
                "Follow the instructions carefully. Use attached files when noted. Include any code, values, files, or plots the task requires.\n\n"
                f"TASK:\n{task_prompt}"
            )
        }],
        **extra_kwargs,
    )

    # SDK compatibility: try the modern accessor, fall back if needed.
    try:
        return resp.output_text
    except AttributeError:
        pass

    # Fallback: gather text from every output item instead of assuming the
    # first item is a message. Items without a `content` list are skipped.
    chunks = []
    for item in getattr(resp, "output", None) or []:
        for part in getattr(item, "content", None) or []:
            text = getattr(part, "text", None)
            if isinstance(text, str):
                chunks.append(text)
    return "".join(chunks)

# Rubric preparation (ONLY criterion + score)
def _prepare_rubric_items(raw_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Prepare rubric items for grading. IMPORTANT: penalty is determined ONLY by the sign of `score`.
      - score > 0  => reward item
      - score < 0  => penalty item
    """
    prepped = []
    for it in raw_items:
        crit  = (it.get("criterion") or "").strip()
        score = float(it.get("score", 0))
        rid   = it.get("rubricItemId") or ""
        is_penalty = (score < 0)  # <-- CHANGED: sign-of-score, not wording
        prepped.append({
            "rubricItemId": rid,
            "criterion": crit,
            "score": score,          # positive or negative; used as-is
            "is_penalty": is_penalty
        })
    return prepped

def _build_rubric_prompts(sample, completion, items):
    # Only pass what the grader truly needs: id + criterion (hide score/is_penalty)
    compact = [{
        "rubricItemId": it["rubricItemId"],
        "criterion": it["criterion"]
    } for it in items]

    system_message = (
        "You are a meticulous grader that outputs STRICT JSON per schema.\n"
        "Evaluate each rubric item independently using ONLY its 'criterion' text. "
        "Ignore all other rubric metadata (scores, penalties, tags, sources) and any external facts.\n"
        "Unit handling: If a numeric value in the candidate is within the stated tolerance after proper unit conversion, "
        "consider the criterion satisfied even if the candidate does not restate the value in the rubric's unit.\n"
        "Consistency rule: The boolean 'met' must reflect the final verdict in your rationale.\n"
    )

    user_message = f"""
TASK PROMPT:
{sample.get('prompt','')}

CANDIDATE ANSWER:
{completion}

RUBRIC_ITEMS (evaluate ONLY against criterion; ignore any other metadata):
{json.dumps(compact, ensure_ascii=False)}

INSTRUCTIONS:
- Decide each item strictly from the candidate answer vs. the item's 'criterion'. Do NOT use outside knowledge.
- Treat values as equivalent across unit systems if, after correct conversion, the value is within the stated tolerance.
- Write a short rationale. The last sentence MUST be exactly one of:
  "Verdict: MET"  or  "Verdict: NOT MET".
- Set the boolean 'met' to True iff the final sentence is "Verdict: MET"; else set it to False.
- Never output a rationale that concludes "Verdict: MET" while setting met=false, or vice versa.
- If a tolerance is written in a criteria and the value in the response is within the tolerance, then "Verdict: MET"

OUTPUT (STRICT JSON ONLY):
{{
  "items": [
    {{
      "rubricItemId": "string",
      "met": true | false,
      "rationale": "string that ends with 'Verdict: MET' or 'Verdict: NOT MET'"
    }}
  ]
}}
""".strip()

    return system_message, user_message


def _rubric_bounds(items: List[Dict[str, Any]]) -> Tuple[float, float]:
    pos = sum(s["score"] for s in items if s["score"] > 0)
    neg = sum(s["score"] for s in items if s["score"] < 0)
    return pos, neg  # (max positive, min negative)


# Exact-match grader (optional)

def simple_string_grader(sample, completion) -> float:
    """Exact-match channel.

    Returns 1.0 when the whitespace-stripped ``completion`` equals the
    whitespace-stripped ``sample['reference']``, otherwise 0.0. A missing or
    blank reference always yields 0.0, which effectively disables this channel.
    """
    expected = (sample.get("reference") or "").strip()
    if expected and completion.strip() == expected:
        return 1.0
    return 0.0


def model_grader_with_rubric(sample, completion, raw_rubric_items, *, grader_model: str = "gpt-4o-mini"):
    """Ask the grader model for a met / not-met decision on every rubric item.

    Uses the notebook-level ``client`` (chat completions, JSON response
    format) with the prompts produced by `_build_rubric_prompts`.

    Args:
        sample: Mapping holding the task prompt.
        completion: Candidate answer to grade.
        raw_rubric_items: Rubric rows as loaded from the rubric file.
        grader_model: Model name for the grading call.

    Returns:
        ``(rubric_json, bounds)``: the parsed grader output (a dict with an
        ``"items"`` list) and ``{"pos_max": ..., "neg_min": ...}``, the sums
        of the positive and of the negative rubric scores.

    Raises:
        ValueError: if the reply is not valid JSON or lacks an ``items`` list.
    """
    items = _prepare_rubric_items(raw_rubric_items)
    pos_max = sum(max(0.0, float(it["score"])) for it in items)
    neg_min = sum(min(0.0, float(it["score"])) for it in items)
    bounds = {"pos_max": float(pos_max), "neg_min": float(neg_min)}

    system_message, user_message = _build_rubric_prompts(sample, completion, items)

    resp = client.chat.completions.create(
        model=grader_model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        response_format={"type": "json_object"},
    )

    # Treat a missing message body as empty text so the failure below is a
    # JSON ValueError rather than a TypeError/AttributeError.
    content = resp.choices[0].message.content or ""

    def _strip_fences(s: str) -> str:
        """Return the payload of a Markdown code fence, or ``s`` unchanged."""
        s = s.strip()
        if s.startswith("```"):
            # parts[0] is the empty text before the opening fence; parts[1] is
            # the fenced payload whether or not a closing fence is present.
            s = s.split("```")[1].lstrip()
            if s[:4].lower() == "json":
                s = s[4:].lstrip()
        return s

    try:
        rubric_json = json.loads(content)
    except ValueError:
        rubric_json = json.loads(_strip_fences(content))

    decisions = rubric_json.get("items") if isinstance(rubric_json, dict) else None
    if not isinstance(decisions, list):
        raise ValueError("Grader did not return JSON with an 'items' array per schema.")

    # Enforce, for EVERY item: the boolean must match the final "Verdict: ..."
    # line of its rationale. Whitespace is collapsed and trailing punctuation
    # dropped before comparing.
    for it in decisions:
        if not isinstance(it, dict):
            continue
        tail = re.sub(r"\s+", " ", str(it.get("rationale", ""))).strip().lower().rstrip(" .!;")
        if tail.endswith("verdict: met"):
            it["met"] = True
        elif tail.endswith("verdict: not met"):
            it["met"] = False

    return rubric_json, bounds


def _compute_signed_total_from_decisions(rubric_json: dict, raw_rubric_items):
    """
    Compute signed total strictly from rubric item scores and decisions.
    Also return enriched per-item rows including:
      - criterion_number (1-indexed position in the original rubric JSON)
      - point_value (the rubric score, positive or negative)
      - awarded (0 or point_value, unless a valid 'awarded' was provided)

    Rules:
    - If 'met' is True -> contribution = rubric score (can be negative for penalties)
    - If 'met' is False -> contribution = 0
    - Only trust 'awarded' if it is exactly 0 or exactly equal to the rubric score
    """
    # Build lookups from the rubric’s original order
    id_to_score = {}
    id_to_index1 = {}
    id_to_criterion = {}
    for idx, it in enumerate(raw_rubric_items, start=1):
        rid = it.get("rubricItemId") or it.get("id") or (it.get("criterion", "")[:48])
        sc = float(it.get("score", 0.0))
        id_to_score[rid] = sc
        id_to_index1[rid] = idx
        id_to_criterion[rid] = it.get("criterion", "")

    # Pull decisions list from the grader output (be tolerant of formats)
    if isinstance(rubric_json, dict):
        decisions = rubric_json.get("items") or rubric_json.get("results") or []
    elif isinstance(rubric_json, list):
        decisions = rubric_json
    else:
        decisions = []

    items_out = []
    total = 0.0

    for dec in decisions:
        rid = dec.get("rubricItemId") or dec.get("id") or (dec.get("criterion", "")[:48])
        score = id_to_score.get(rid, 0.0)
        met = bool(dec.get("met", False))

        # Only trust 'awarded' if it's exactly 0 or exactly equal to the rubric score
        aw = dec.get("awarded", None)
        if isinstance(aw, (int, float)) and (abs(float(aw) - score) < 1e-9 or abs(float(aw)) < 1e-9):
            awarded = float(aw)
        else:
            awarded = score if met else 0.0

        total += awarded
        items_out.append({
            "criterion_number": id_to_index1.get(rid, None),
            "rubricItemId": rid,
            "criterion": id_to_criterion.get(rid, ""),
            "point_value": score,
            "met": met,
            "awarded": awarded,
            "rationale": dec.get("rationale", "")
        })

    return total, items_out

# Normalize rubric total to [0,1] and blend with exact-match
def _normalize_total_score(total: float, pos_max: float, neg_min: float) -> float:
    """
    Map a signed total into [0,1] given bounds:
      neg_min <= total <= pos_max
    """
    lo = float(neg_min)
    hi = float(pos_max)
    span = hi - lo
    if span == 0:
        return 0.5
    norm = (float(total) - lo) / span
    # Clamp for safety
    return max(0.0, min(1.0, norm))


def multigrader(sample, completion, raw_rubric_items,
                *, grader_model: str = "gpt-4o-mini",
                exact_weight: float = 0.0, rubric_weight: float = 1.0):
    """
    Returns (final_score_0_1, details).
    This version recomputes score from decisions.
    - Penalties/rewards are determined solely by the sign of each rubric item's score.
    - 'rubric_details' returns enriched per-item rows (criterion number, point value, rationale, etc.).
    - The blended score is clamped to [0, 1]; without the clamp, weights that
      sum to more than 1 could push it above the documented range.

    Args:
        sample: Mapping with the task 'prompt' and optional 'reference'.
        completion: Candidate answer text.
        raw_rubric_items: Rubric rows as loaded from the rubric file.
        grader_model: Model used for the rubric decisions.
        exact_weight: Weight of the exact-match channel (0.0 or 1.0 signal).
        rubric_weight: Weight of the normalized rubric score.

    Returns:
        ``(final, details)``; ``details`` has the keys 'exact_match',
        'rubric_total' (signed), 'rubric_norm', 'rubric_bounds' and
        'rubric_details' (``{"items": [...]}``).
    """
    # 1) Exact-match channel (if you use it)
    exact = simple_string_grader(sample, completion)

    # 2) Ask the grader model for per-item decisions and bounds
    #    Expect: rubric_json = {"items":[...]} and bounds = {"pos_max": float, "neg_min": float}
    rubric_json, bounds = model_grader_with_rubric(
        sample, completion, raw_rubric_items, grader_model=grader_model
    )

    # 3) Recompute signed total from decisions (returns (signed_total, items_out))
    signed_total, items_out = _compute_signed_total_from_decisions(rubric_json, raw_rubric_items)

    # 4) Normalize and blend, keeping the result inside the documented range
    rubric_norm = _normalize_total_score(signed_total, bounds.get("pos_max", 0.0), bounds.get("neg_min", 0.0))
    blended = (exact_weight * float(exact)) + (rubric_weight * float(rubric_norm))
    final = max(0.0, min(1.0, blended))

    # 5) Return details
    return final, {
        "exact_match": float(exact),
        "rubric_total": float(signed_total),      # signed
        "rubric_norm": float(rubric_norm),
        "rubric_bounds": bounds,
        "rubric_details": {"items": items_out},   # enriched rows for your table/JSON panel
    }


In [4]:
# Final state grading UI helpers and pipelines
from pathlib import Path


def _fpath(obj):
    if obj is None:
        return None
    return getattr(obj, "name", None) or getattr(obj, "path", None) or str(obj)


def _read_text_file(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()


def _read_json_file(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return json.load(f)


def _ensure_client_from_key(api_key_text):
    if "client" in globals() and globals()["client"] is not None:
        return globals()["client"]
    api_key = (api_key_text or "").strip() or os.getenv("OPENAI_API_KEY", "")
    if not api_key:
        raise RuntimeError("No OpenAI client found and no API key provided.")
    os.environ["OPENAI_API_KEY"] = api_key
    try:
        from openai import OpenAI
    except Exception as e:
        raise RuntimeError("Please install openai>=1.40.0") from e
    globals()["client"] = OpenAI(api_key=api_key)
    return globals()["client"]


class _LooseJSONParser:
    """Parse the lightly formatted "final state" export into Python data."""

    __slots__ = ("s", "n", "i")

    def __init__(self, text: str):
        self.s = text
        self.n = len(text)
        self.i = 0

    def _error(self, msg: str):
        context = self.s[self.i:self.i + 60]
        raise ValueError(f"{msg} at pos {self.i}: {context!r}")

    def _skip_ws(self):
        while self.i < self.n and self.s[self.i] in " \t\r\n":
            self.i += 1

    def parse(self):
        self._skip_ws()
        value = self._parse_value(relaxed=True)
        self._skip_ws()
        return value

    def _parse_value(self, relaxed: bool = False):
        self._skip_ws()
        if self.i >= self.n:
            self._error("Unexpected end of input")
        ch = self.s[self.i]
        if ch == '"':
            return self._parse_string(relaxed=relaxed)
        if ch == '{':
            return self._parse_object()
        if ch == '[':
            return self._parse_array()
        if self.s.startswith("true", self.i):
            self.i += 4
            return True
        if self.s.startswith("false", self.i):
            self.i += 5
            return False
        if self.s.startswith("null", self.i):
            self.i += 4
            return None
        start = self.i
        while self.i < self.n and self.s[self.i] not in " \t\r\n,]}":
            self.i += 1
        token = self.s[start:self.i].strip()
        if not token:
            self._error("Empty token")
        try:
            if any(c in token for c in (".", "e", "E")):
                return float(token)
            return int(token)
        except ValueError:
            return token

    def _parse_string(self, relaxed: bool = False) -> str:
        if self.s[self.i] != '"':
            self._error("Expected string start")
        self.i += 1
        chars = []
        while self.i < self.n:
            ch = self.s[self.i]
            if ch == '\\':
                if self.i + 1 >= self.n:
                    self._error("Bad escape sequence")
                nxt = self.s[self.i + 1]
                esc_map = {'"': '"', '\\': '\\', '/': '/', 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}
                if nxt in esc_map:
                    chars.append(esc_map[nxt])
                    self.i += 2
                    continue
                if nxt == 'u':
                    hex_digits = self.s[self.i + 2:self.i + 6]
                    chars.append(chr(int(hex_digits, 16)))
                    self.i += 6
                    continue
                chars.append(nxt)
                self.i += 2
                continue
            if ch == '"':
                nxt = self.s[self.i + 1] if self.i + 1 < self.n else ''
                if not relaxed or nxt in ('', '\n', '\r', ',', '}', ']'):
                    self.i += 1
                    return ''.join(chars)
                chars.append(ch)
                self.i += 1
                continue
            chars.append(ch)
            self.i += 1
        self._error("Unterminated string")

    def _parse_object(self):
        if self.s[self.i] != '{':
            self._error("Expected '{'")
        self.i += 1
        obj = {}
        while True:
            self._skip_ws()
            if self.i < self.n and self.s[self.i] == '}':
                self.i += 1
                return obj
            key = self._parse_string(relaxed=False)
            self._skip_ws()
            if self.i >= self.n or self.s[self.i] != ':':
                self._error("Expected ':' after key")
            self.i += 1
            value = self._parse_value(relaxed=True)
            obj[key] = value
            self._skip_ws()
            if self.i < self.n and self.s[self.i] == ',':
                self.i += 1
                continue
            self._skip_ws()
            if self.i < self.n and self.s[self.i] == '}':
                continue

    def _parse_array(self):
        if self.s[self.i] != '[':
            self._error("Expected '['")
        self.i += 1
        arr = []
        while True:
            self._skip_ws()
            if self.i < self.n and self.s[self.i] == ']':
                self.i += 1
                return arr
            start = self.i
            while self.i < self.n and self.s[self.i].isdigit():
                self.i += 1
            if self.i > start and self.i < self.n and self.s[self.i] == ':':
                self.i += 1
            value = self._parse_value(relaxed=True)
            arr.append(value)
            self._skip_ws()
            if self.i < self.n and self.s[self.i] == ',':
                self.i += 1
                continue


def _load_final_state(path) -> Dict[str, Any]:
    raw = Path(path).read_text(encoding="utf-8", errors="ignore")
    text = raw.replace("\r\n", "\n").replace("\r", "\n").replace("NULL", "null")
    parser = _LooseJSONParser(text)
    return parser.parse()


def _iter_dicts(obj):
    if isinstance(obj, dict):
        yield obj
        for value in obj.values():
            yield from _iter_dicts(value)
    elif isinstance(obj, list):
        for item in obj:
            yield from _iter_dicts(item)


def _find_first_str(obj, keys):
    lowered = [k.lower() for k in keys]
    for mapping in _iter_dicts(obj):
        for key, value in mapping.items():
            if key.lower() in lowered and isinstance(value, str) and value.strip():
                return value.strip()
    return ""


def _extract_prompt_from_state(state):
    return _find_first_str(state, ["prompt", "task", "instructions", "question", "request"])


def _extract_candidate_answer(state):
    direct = _find_first_str(state, [
        "final_response", "final_output", "response", "answer", "output_text", "completion"
    ])
    if direct:
        return direct
    messages = state.get("messages") if isinstance(state, dict) else None
    if isinstance(messages, list):
        for msg in reversed(messages):
            if not isinstance(msg, dict):
                continue
            sender = str(msg.get("from", "")).lower()
            if sender.startswith("you") or "you@" in sender:
                text = msg.get("text") or msg.get("body") or msg.get("content")
                if isinstance(text, str) and text.strip():
                    return text.strip()
                html = msg.get("html")
                if isinstance(html, str) and html.strip():
                    return html.strip()
    fallback = _find_first_str(state, ["text", "content", "message"])
    if fallback:
        return fallback
    raise ValueError("Could not locate a candidate response inside the final state file.")


def _build_items_table(items_raw):
    rows = []
    for idx, it in enumerate(items_raw or [], start=1):
        criterion_number = it.get("criterion_number", idx)
        criterion = it.get("criterion") or it.get("rubricItemId") or ""
        point_value = it.get("point_value", it.get("score", 0.0))
        try:
            point_value = float(point_value)
        except Exception:
            point_value = 0.0
        awarded = it.get("awarded", point_value if it.get("met") else 0.0)
        try:
            awarded = float(awarded)
        except Exception:
            awarded = 0.0
        rationale = it.get("rationale", "")
        rows.append({
            "#": criterion_number,
            "criterion": criterion,
            "possible_score": point_value,
            "score_awarded": awarded,
            "explanation": rationale,
        })
    if not rows:
        return pd.DataFrame(columns=["#", "criterion", "possible_score", "score_awarded", "explanation"])
    df = pd.DataFrame(rows)
    if df["#"].notna().any():
        df = df.sort_values(by=["#"], kind="stable")
    return df[["#", "criterion", "possible_score", "score_awarded", "explanation"]]


def _out_dir():
    env = os.getenv("GRADING_REPORT_DIR", "")
    if env:
        base = Path(env).expanduser()
    else:
        candidates = [Path.home() / "Downloads", Path.home() / "OneDrive" / "Downloads"]
        base = next((p for p in candidates if p.exists()), Path.cwd() / "grading_reports")
    base.mkdir(parents=True, exist_ok=True)
    return base


def _make_pdf(output_path, context):
    """Render a grading report PDF and return its path as a string.

    Args:
        output_path: Destination file; parent directories are created.
        context: Mapping with the keys 'timestamp', 'grader_model',
            'items_df' (DataFrame with 'criterion', 'score_awarded' and
            'explanation' columns), 'signed_total' and 'normalized', plus the
            optional 'submission_name', 'prompt_name', 'rubric_name' and
            'candidate_answer'.

    The report contains a metadata header, the candidate response in a
    monospaced block, a per-criterion table and the two totals.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(
        name="Cell",
        parent=styles["Normal"],
        fontSize=9,
        leading=11,
        splitLongWords=True,
    ))
    mono = ParagraphStyle(
        name="Mono",
        parent=styles["Normal"],
        fontName="Courier",
        fontSize=9,
        leading=10,
    )

    doc = SimpleDocTemplate(
        str(output_path),
        pagesize=letter,
        rightMargin=36,
        leftMargin=36,
        topMargin=36,
        bottomMargin=36,
    )

    story = []
    story.append(Paragraph("Grading Report", styles["Title"]))
    story.append(Spacer(1, 6))
    meta_lines = [
        f"Generated: {context['timestamp']}",
        f"Submission file: {context.get('submission_name') or '—'}",
        f"Grader model: {context['grader_model']}",
        f"Prompt file: {context.get('prompt_name') or '—'}",
        f"Rubric file: {context.get('rubric_name') or '—'}",
    ]
    # File names and the (free-text) model name are user-controlled: escape
    # them like the table cells below so '&' or '<' cannot break the
    # Paragraph markup. Only the <br/> separators are real markup.
    story.append(Paragraph("<br/>".join(escape(str(line)) for line in meta_lines), styles["Normal"]))
    story.append(Spacer(1, 12))

    story.append(Paragraph("Candidate Response", styles["Heading2"]))
    story.append(Spacer(1, 4))
    story.append(Preformatted(context.get("candidate_answer") or "—", mono))
    story.append(Spacer(1, 12))

    story.append(Paragraph("Per-Criterion Scores", styles["Heading2"]))
    story.append(Spacer(1, 6))

    df = context["items_df"].copy()

    def P(x):
        # Wrap a cell value in a Paragraph so long text wraps inside its column.
        return Paragraph(escape(str(x)).replace("\n", "<br/>"), styles["Cell"])

    headers = ["Criterion", "Score Awarded", "Explanation"]
    table_data = [headers]
    for _, row in df.iterrows():
        table_data.append([
            P(row["criterion"]),
            f"{float(row['score_awarded']):g}",
            P(row["explanation"]),
        ])

    col_widths = [doc.width * f for f in (0.35, 0.15, 0.50)]
    tbl = Table(table_data, colWidths=col_widths, repeatRows=1)
    tbl.setStyle(TableStyle([
        ("FONT", (0, 0), (-1, 0), "Helvetica-Bold", 10),
        ("BACKGROUND", (0, 0), (-1, 0), colors.lightgrey),
        ("LINEABOVE", (0, 0), (-1, 0), 0.5, colors.black),
        ("LINEBELOW", (0, 0), (-1, 0), 0.5, colors.black),
        ("GRID", (0, 1), (-1, -1), 0.25, colors.grey),
        ("VALIGN", (0, 0), (-1, -1), "TOP"),
        ("LEFTPADDING", (0, 0), (-1, -1), 4),
        ("RIGHTPADDING", (0, 0), (-1, -1), 4),
        ("WORDWRAP", (0, 0), (-1, -1), True),
    ]))
    story.append(tbl)
    story.append(Spacer(1, 12))

    signed = float(context["signed_total"])
    norm = float(context["normalized"])
    story.append(Paragraph("Totals", styles["Heading2"]))
    story.append(Paragraph(f"Total (signed): <b>{signed:g}</b>", styles["Normal"]))
    story.append(Paragraph(f"Normalized (0..1): <b>{norm:.3f}</b>", styles["Normal"]))

    doc.build(story)
    return str(output_path)


def run_pipeline(prompt_file, rubric_file, final_state_file,
                 grader_model, api_key_text, exact_weight, rubric_weight):
    """Grade the response found in a final-state export against a rubric.

    Steps: ensure an OpenAI client, validate and load the rubric, load the
    final state, extract the candidate answer (and the prompt, when no prompt
    file was given), grade, and write a PDF report.

    Returns:
        A 10-tuple: (final_norm, signed_total, candidate_answer, items_df,
        pdf_path, log_text, prompt_text, prompt_name, submission_name,
        candidate_answer). On any failure the tuple is
        (None, None, "", empty DataFrame, None, log_text, "", "", "", "")
        with the reason at the end of ``log_text``; exceptions are not raised.
    """
    logs = []

    def log(msg):
        logs.append(str(msg))

    def failure(message):
        # The "nothing to show" result shared by every failure path.
        return (
            None,
            None,
            "",
            pd.DataFrame(),
            None,
            "\n".join(logs + [message]),
            "",
            "",
            "",
            "",
        )

    try:
        _ensure_client_from_key(api_key_text)
        log("OpenAI client ready.")

        if not rubric_file or not final_state_file:
            return failure("Please upload both a rubric JSON file and a final state JSON file.")

        rubric_path = _fpath(rubric_file)
        rubric_raw = _read_json_file(rubric_path)
        # Require real objects: `"criterion" in x` on a string would be a
        # substring test and let a list of strings through.
        rubric_ok = isinstance(rubric_raw, list) and all(
            isinstance(x, dict) and "criterion" in x and "score" in x for x in rubric_raw
        )
        if not rubric_ok:
            return failure("Rubric must be a JSON list of {criterion, score} objects.")

        prompt_path = _fpath(prompt_file) if prompt_file else None
        prompt_text = _read_text_file(prompt_path) if prompt_path else ""
        prompt_name = Path(prompt_path).name if prompt_path else ""

        submission_path = _fpath(final_state_file)
        submission_name = Path(submission_path).name if submission_path else ""
        final_state = _load_final_state(submission_path)
        log(f"Loaded final state: {submission_name}")

        candidate_answer = _extract_candidate_answer(final_state)
        log("Candidate response extracted from final state.")

        if not prompt_text:
            derived_prompt = _extract_prompt_from_state(final_state)
            if derived_prompt:
                prompt_text = derived_prompt
                log("Derived prompt from final state.")

        rubric_name = Path(rubric_path).name if rubric_path else ""
        log(f"Loaded rubric with {len(rubric_raw)} items.")

        sample = {"prompt": prompt_text, "reference": ""}
        final_norm, details = multigrader(
            sample,
            candidate_answer,
            rubric_raw,
            grader_model=grader_model,
            exact_weight=float(exact_weight),
            rubric_weight=float(rubric_weight),
        )
        log("Grading complete.")

        items_raw = (details or {}).get("rubric_details", {}).get("items", [])
        items_df = _build_items_table(items_raw)
        signed_total = float((details or {}).get("rubric_total", 0.0))
        final_norm = float(final_norm)

        out_dir = _out_dir()
        # One clock reading, so the stamp printed in the report always agrees
        # with the stamp in the file name.
        now = datetime.datetime.now()
        ts = now.strftime("%Y-%m-%d %H:%M:%S")
        pdf_path = out_dir / f"grading_report_{now.strftime('%Y%m%d_%H%M%S')}.pdf"
        _make_pdf(pdf_path, {
            "timestamp": ts,
            "submission_name": submission_name,
            "grader_model": grader_model,
            "prompt_name": prompt_name,
            "rubric_name": rubric_name,
            "candidate_answer": candidate_answer,
            "items_df": items_df,
            "signed_total": signed_total,
            "normalized": final_norm,
        })
        log(f"PDF created: {pdf_path}")

        return (
            final_norm,
            signed_total,
            candidate_answer,
            items_df,
            str(pdf_path),
            "\n".join(logs),
            prompt_text,
            prompt_name,
            submission_name,
            candidate_answer,
        )

    except Exception as exc:
        # Surface the error in the log panel instead of crashing the UI callback.
        tb = traceback.format_exc(limit=2)
        return failure(f"ERROR: {exc}\n{tb}")


def regrade_pipeline(rubric_file, grader_model, api_key_text,
                     exact_weight, rubric_weight,
                     saved_prompt_text, saved_prompt_name, saved_submission_name, saved_candidate_answer,
                     override_answer):
    """Grade again without re-uploading the final state.

    The answer graded is ``override_answer`` (stripped) when it is non-blank,
    otherwise ``saved_candidate_answer``. The prompt and file names come from
    the saved values of the previous run.

    Returns:
        A 10-tuple in the same layout as `run_pipeline`: (final_norm,
        signed_total, candidate_answer, items_df, pdf_path, log_text,
        prompt_text, prompt_name, submission_name, candidate_answer). On
        failure the first two and the PDF slot are None, the table is empty,
        the reason ends ``log_text`` and the last four slots echo the saved
        inputs unchanged; exceptions are not raised.
    """
    logs = []

    def log(msg):
        logs.append(str(msg))

    def failure(message, shown_answer):
        # Failure result; `shown_answer` is what the response box displays.
        return (
            None,
            None,
            shown_answer,
            pd.DataFrame(),
            None,
            "\n".join(logs + [message]),
            saved_prompt_text,
            saved_prompt_name,
            saved_submission_name,
            saved_candidate_answer,
        )

    try:
        _ensure_client_from_key(api_key_text)
        log("OpenAI client ready (regrade).")

        if not rubric_file:
            return failure("Please upload a rubric JSON file to regrade.", saved_candidate_answer or "")

        pasted = (override_answer or "").strip()
        candidate_answer = pasted if pasted else saved_candidate_answer
        if not candidate_answer:
            return failure(
                "No candidate response available to grade. Upload a final state or paste an override.",
                "",
            )

        rubric_path = _fpath(rubric_file)
        rubric_raw = _read_json_file(rubric_path)
        # Require real objects: `"criterion" in x` on a string would be a
        # substring test and let a list of strings through.
        rubric_ok = isinstance(rubric_raw, list) and all(
            isinstance(x, dict) and "criterion" in x and "score" in x for x in rubric_raw
        )
        if not rubric_ok:
            return failure("Rubric must be a JSON list of {criterion, score} objects.", candidate_answer)

        rubric_name = Path(rubric_path).name if rubric_path else ""

        sample = {"prompt": saved_prompt_text or "", "reference": ""}
        final_norm, details = multigrader(
            sample,
            candidate_answer,
            rubric_raw,
            grader_model=grader_model,
            exact_weight=float(exact_weight),
            rubric_weight=float(rubric_weight),
        )
        log("Regrade complete.")

        items_raw = (details or {}).get("rubric_details", {}).get("items", [])
        items_df = _build_items_table(items_raw)
        signed_total = float((details or {}).get("rubric_total", 0.0))
        final_norm = float(final_norm)

        out_dir = _out_dir()
        # One clock reading, so the stamp printed in the report always agrees
        # with the stamp in the file name.
        now = datetime.datetime.now()
        ts = now.strftime("%Y-%m-%d %H:%M:%S")
        pdf_path = out_dir / f"grading_regrade_{now.strftime('%Y%m%d_%H%M%S')}.pdf"
        _make_pdf(pdf_path, {
            "timestamp": ts,
            "submission_name": saved_submission_name,
            "grader_model": grader_model,
            "prompt_name": saved_prompt_name,
            "rubric_name": rubric_name,
            "candidate_answer": candidate_answer,
            "items_df": items_df,
            "signed_total": signed_total,
            "normalized": final_norm,
        })
        log(f"Regrade PDF created: {pdf_path}")

        return (
            final_norm,
            signed_total,
            candidate_answer,
            items_df,
            str(pdf_path),
            "\n".join(logs),
            saved_prompt_text,
            saved_prompt_name,
            saved_submission_name,
            candidate_answer,
        )

    except Exception as exc:
        # Surface the error in the log panel instead of crashing the UI callback.
        tb = traceback.format_exc(limit=2)
        return failure(f"ERROR: {exc}\n{tb}", saved_candidate_answer or "")


def _instruction_md():
    return (
        "## Rubric Grader\n"
        "Upload a rubric (JSON) and a final state export (JSON) produced by your workflow. "
        "Optionally include the original prompt text so it can be embedded in the grading context. "
        "Choose a grader model and click **Grade Final State**. You can optionally paste a different response "
        "and press **Regrade** to compare graders without uploading again."
    )


def build_ui():
    """Assemble the Rubric Grader Gradio app and wire up its two buttons.

    Layout, top to bottom: the instructions blurb, three file pickers
    (prompt, rubric, final state), the grader-model dropdown, a collapsed
    "Advanced / Auth" section (API key plus the two weight sliders), the two
    action buttons, the result widgets, and a collapsed "Regrade options"
    section holding the override textbox.

    Four ``gr.State`` holders (prompt text, prompt name, submission name,
    candidate answer) are written by both handlers and read back by the
    regrade handler.

    Returns:
        The constructed ``gr.Blocks`` app. It is not launched here.
    """
    model_choices = ["gpt-4.1", "gpt-4o-mini", "o4-mini-2025-04-16"]
    # Both weight sliders share the same range and granularity.
    weight_slider_opts = {"minimum": 0.0, "maximum": 1.0, "step": 0.05}

    with gr.Blocks(title="Rubric Grader") as app:
        gr.Markdown(_instruction_md())

        # --- Inputs: uploaded files ---
        with gr.Row():
            prompt_upload = gr.File(
                label="Prompt (.txt/.md, optional)",
                file_types=[".txt", ".md"],
                type="filepath",
            )
            rubric_upload = gr.File(
                label="Rubric (.json)",
                file_types=[".json"],
                type="filepath",
            )
            final_state_upload = gr.File(
                label="Final state (.json)",
                file_types=[".json"],
                type="filepath",
            )

        # --- Inputs: grader selection ---
        with gr.Row():
            model_picker = gr.Dropdown(
                choices=model_choices,
                label="Grader model",
                value=model_choices[0],
                allow_custom_value=True,
            )

        with gr.Accordion("Advanced / Auth", open=False):
            api_key_box = gr.Textbox(
                label="OpenAI API Key (optional if set in env)",
                type="password",
                placeholder="sk-...",
            )
            with gr.Row():
                exact_slider = gr.Slider(value=0.0, label="Exact-match weight", **weight_slider_opts)
                rubric_slider = gr.Slider(value=1.0, label="Rubric weight", **weight_slider_opts)

        # --- Actions ---
        with gr.Row():
            grade_btn = gr.Button("Grade Final State", variant="primary")
            regrade_btn = gr.Button("Regrade current response")

        # --- Results ---
        with gr.Row():
            normalized_out = gr.Number(label="Final score (normalized 0..1)")
            signed_out = gr.Number(label="Total score (signed)")
        response_out = gr.Textbox(label="Candidate response", lines=12, interactive=False)
        items_out = gr.Dataframe(label="Criterion scores", interactive=False, wrap=True)
        report_out = gr.File(label="Download report (PDF)")
        log_out = gr.Textbox(label="Run log", lines=6)

        with gr.Accordion("Regrade options", open=False):
            override_box = gr.Textbox(
                label="Override candidate response (optional)",
                lines=8,
                placeholder="Paste a response to grade without uploading a new final state.",
            )

        # Per-session values handed from one click to the next.
        prompt_text_state = gr.State("")
        prompt_name_state = gr.State("")
        submission_name_state = gr.State("")
        candidate_state = gr.State("")
        session_state = [
            prompt_text_state,
            prompt_name_state,
            submission_name_state,
            candidate_state,
        ]

        # Settings passed to both handlers, in the order each expects them.
        grading_settings = [model_picker, api_key_box, exact_slider, rubric_slider]

        # Both handlers fill the same ten outputs: six visible widgets
        # followed by the four state holders.
        result_targets = [
            normalized_out,
            signed_out,
            response_out,
            items_out,
            report_out,
            log_out,
            *session_state,
        ]

        grade_btn.click(
            fn=run_pipeline,
            inputs=[prompt_upload, rubric_upload, final_state_upload, *grading_settings],
            outputs=result_targets,
        )

        regrade_btn.click(
            fn=regrade_pipeline,
            inputs=[rubric_upload, *grading_settings, *session_state, override_box],
            outputs=result_targets,
        )

    return app


# Build the Gradio app and start serving it from this kernel.
# share=False keeps it on a local URL only; no public link is created.
demo = build_ui()
demo.launch(share=False)


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


