In [1]:
!pip -q install gradio>=4.36.0 pandas reportlab

In [2]:
import os, io, json, traceback, datetime, re
import pathlib
from pathlib import Path
import pandas as pd
import gradio as gr

from openai import OpenAI
from typing import List, Dict, Any, Tuple

# --- PDF helpers (ReportLab) ---
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Preformatted

from xml.sax.saxutils import escape  # for safe Paragraph content

In [3]:
# Candidate generation
def generate_candidate_answer(task_prompt: str, *, candidate_model: str = "gpt-4o-mini") -> str:
    """
    Generate a sample response to `task_prompt` with the model under test.

    Relies on two module-level globals:
      - `client`: an OpenAI client exposing `responses.create` (required).
      - `vs_id`: a vector store id (optional). When it is set, the request
        enables the `file_search` tool against that store; otherwise no
        `tools` argument is sent at all.

    Args:
        task_prompt: Task text appended after the fixed instruction preamble.
        candidate_model: Model name passed to the Responses API.

    Returns:
        The response text. `resp.output_text` is used when the SDK provides it;
        otherwise the text parts of every output item are concatenated. An
        empty string is returned when the response carries no text parts.
    """
    vs = globals().get("vs_id")

    # Only include 'tools' in the API call when a vector store is available.
    extra_kwargs = {"tools": [{"type": "file_search", "vector_store_ids": [vs]}]} if vs else {}

    resp = client.responses.create(
        model=candidate_model,
        input=[{
            "role": "user",
            "content": (
                "Follow the instructions carefully. Use attached files when noted. Include any code, values, files, or plots the task requires.\n\n"
                f"TASK:\n{task_prompt}"
            )
        }],
        **extra_kwargs,
    )

    # SDK compatibility: prefer the convenience accessor when it exists.
    text = getattr(resp, "output_text", None)
    if isinstance(text, str):
        return text

    # Fallback: scan every output item rather than assuming the first one is the
    # message. With file_search enabled the first item can be a tool call that
    # has no `content`, which made the old `output[0].content[0].text` fail.
    chunks = []
    for item in getattr(resp, "output", None) or []:
        for part in getattr(item, "content", None) or []:
            part_text = getattr(part, "text", None)
            if isinstance(part_text, str):
                chunks.append(part_text)
    return "".join(chunks)

# Rubric preparation (ONLY criterion + score)
def _prepare_rubric_items(raw_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Prepare rubric items for grading. IMPORTANT: penalty is determined ONLY by the sign of `score`.
      - score > 0  => reward item
      - score < 0  => penalty item
    """
    prepped = []
    for it in raw_items:
        crit  = (it.get("criterion") or "").strip()
        score = float(it.get("score", 0))
        rid   = it.get("rubricItemId") or ""
        is_penalty = (score < 0)  # <-- CHANGED: sign-of-score, not wording
        prepped.append({
            "rubricItemId": rid,
            "criterion": crit,
            "score": score,          # positive or negative; used as-is
            "is_penalty": is_penalty
        })
    return prepped

def _build_rubric_prompts(sample, completion, items):
    # Only pass what the grader truly needs: id + criterion (hide score/is_penalty)
    compact = [{
        "rubricItemId": it["rubricItemId"],
        "criterion": it["criterion"]
    } for it in items]

    system_message = (
        "You are a meticulous grader that outputs STRICT JSON per schema.\n"
        "Evaluate each rubric item independently using ONLY its 'criterion' text. "
        "Ignore all other rubric metadata (scores, penalties, tags, sources) and any external facts.\n"
        "Unit handling: If a numeric value in the candidate is within the stated tolerance after proper unit conversion, "
        "consider the criterion satisfied even if the candidate does not restate the value in the rubric's unit.\n"
        "Consistency rule: The boolean 'met' must reflect the final verdict in your rationale.\n"
    )

    user_message = f"""
TASK PROMPT:
{sample.get('prompt','')}

CANDIDATE ANSWER:
{completion}

RUBRIC_ITEMS (evaluate ONLY against criterion; ignore any other metadata):
{json.dumps(compact, ensure_ascii=False)}

INSTRUCTIONS:
- Decide each item strictly from the candidate answer vs. the item's 'criterion'. Do NOT use outside knowledge.
- Treat values as equivalent across unit systems if, after correct conversion, the value is within the stated tolerance.
- Write a short rationale. The last sentence MUST be exactly one of:
  "Verdict: MET"  or  "Verdict: NOT MET".
- Set the boolean 'met' to True iff the final sentence is "Verdict: MET"; else set it to False.
- Never output a rationale that concludes "Verdict: MET" while setting met=false, or vice versa.
- If a tolerance is written in a criteria and the value in the response is within the tolerance, then "Verdict: MET"

OUTPUT (STRICT JSON ONLY):
{{
  "items": [
    {{
      "rubricItemId": "string",
      "met": true | false,
      "rationale": "string that ends with 'Verdict: MET' or 'Verdict: NOT MET'"
    }}
  ]
}}
""".strip()

    return system_message, user_message


def _rubric_bounds(items: List[Dict[str, Any]]) -> Tuple[float, float]:
    pos = sum(s["score"] for s in items if s["score"] > 0)
    neg = sum(s["score"] for s in items if s["score"] < 0)
    return pos, neg  # (max positive, min negative)


# Exact-match grader (optional)

def simple_string_grader(sample, completion) -> float:
    """
    Exact-match channel: 1.0 when the stripped completion equals the stripped
    `sample['reference']`, otherwise 0.0.

    A missing or blank reference always scores 0.0, which effectively
    disables this channel (the completion is not inspected in that case).
    """
    expected = (sample.get("reference") or "").strip()
    if expected and completion.strip() == expected:
        return 1.0
    return 0.0


def model_grader_with_rubric(sample, completion, raw_rubric_items, *, grader_model: str = "gpt-4o-mini"):
    """
    Ask the grader model for a per-item met / not-met decision.

    Uses the module-level `client` (chat completions API, JSON response
    format) with the prompts produced by `_build_rubric_prompts`.

    Args:
        sample: Dict carrying the task under key 'prompt'.
        completion: Candidate answer text to grade.
        raw_rubric_items: Rubric entries as loaded from JSON.
        grader_model: Model name used for grading.

    Returns:
        (rubric_json, bounds) where `rubric_json` is the parsed grader output
        (a dict with an 'items' list) and `bounds` is
        {"pos_max": sum of positive scores, "neg_min": sum of negative scores}.
        For every item whose rationale ends with "Verdict: MET" or
        "Verdict: NOT MET", the boolean `met` is overwritten to agree with it.

    Raises:
        ValueError: if the grader output is not valid JSON (json.JSONDecodeError
            is a ValueError) or lacks an 'items' key.
    """
    items = _prepare_rubric_items(raw_rubric_items)
    pos_max = sum(max(0.0, float(it["score"])) for it in items)
    neg_min = sum(min(0.0, float(it["score"])) for it in items)
    bounds = {"pos_max": float(pos_max), "neg_min": float(neg_min)}

    system_message, user_message = _build_rubric_prompts(sample, completion, items)

    resp = client.chat.completions.create(
        model=grader_model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        response_format={"type": "json_object"},
    )

    # A missing message body becomes "" so it surfaces as a JSON decode error
    # instead of a TypeError/AttributeError further down.
    content = resp.choices[0].message.content or ""

    def _strip_fences(s: str) -> str:
        """Drop a surrounding ```...``` fence (and a leading 'json' tag) if present."""
        s = s.strip()
        if s.startswith("```"):
            parts = s.split("```")
            if len(parts) >= 3:
                s = parts[1]
            s = s.lstrip()
            if s.startswith("json"):
                s = s[4:].lstrip()
        return s

    try:
        rubric_json = json.loads(content)
    except ValueError:
        # Retry once after removing markdown code fences.
        rubric_json = json.loads(_strip_fences(content))

    if not isinstance(rubric_json, dict) or "items" not in rubric_json:
        raise ValueError("Grader did not return JSON with an 'items' array per schema.")

    # Enforce: boolean must match the final "Verdict: ..." line of the rationale.
    # This check runs for EVERY item; it used to sit outside the loop, so only
    # the last item was reconciled and an empty list raised UnboundLocalError.
    for it in rubric_json.get("items") or []:
        if not isinstance(it, dict):
            continue
        # Collapse whitespace, lowercase, and drop trailing punctuation so that
        # "Verdict: MET." or a verdict on its own line still matches.
        tail = re.sub(r"\s+", " ", str(it.get("rationale", ""))).strip().lower().rstrip(" .!;")
        if tail.endswith("verdict: met"):
            it["met"] = True
        elif tail.endswith("verdict: not met"):
            it["met"] = False

    return rubric_json, bounds


def _compute_signed_total_from_decisions(rubric_json: dict, raw_rubric_items):
    """
    Compute signed total strictly from rubric item scores and decisions.
    Also return enriched per-item rows including:
      - criterion_number (1-indexed position in the original rubric JSON)
      - point_value (the rubric score, positive or negative)
      - awarded (0 or point_value, unless a valid 'awarded' was provided)

    Rules:
    - If 'met' is True -> contribution = rubric score (can be negative for penalties)
    - If 'met' is False -> contribution = 0
    - Only trust 'awarded' if it is exactly 0 or exactly equal to the rubric score
    """
    # Build lookups from the rubric’s original order
    id_to_score = {}
    id_to_index1 = {}
    id_to_criterion = {}
    for idx, it in enumerate(raw_rubric_items, start=1):
        rid = it.get("rubricItemId") or it.get("id") or (it.get("criterion", "")[:48])
        sc = float(it.get("score", 0.0))
        id_to_score[rid] = sc
        id_to_index1[rid] = idx
        id_to_criterion[rid] = it.get("criterion", "")

    # Pull decisions list from the grader output (be tolerant of formats)
    if isinstance(rubric_json, dict):
        decisions = rubric_json.get("items") or rubric_json.get("results") or []
    elif isinstance(rubric_json, list):
        decisions = rubric_json
    else:
        decisions = []

    items_out = []
    total = 0.0

    for dec in decisions:
        rid = dec.get("rubricItemId") or dec.get("id") or (dec.get("criterion", "")[:48])
        score = id_to_score.get(rid, 0.0)
        met = bool(dec.get("met", False))

        # Only trust 'awarded' if it's exactly 0 or exactly equal to the rubric score
        aw = dec.get("awarded", None)
        if isinstance(aw, (int, float)) and (abs(float(aw) - score) < 1e-9 or abs(float(aw)) < 1e-9):
            awarded = float(aw)
        else:
            awarded = score if met else 0.0

        total += awarded
        items_out.append({
            "criterion_number": id_to_index1.get(rid, None),
            "rubricItemId": rid,
            "criterion": id_to_criterion.get(rid, ""),
            "point_value": score,
            "met": met,
            "awarded": awarded,
            "rationale": dec.get("rationale", "")
        })

    return total, items_out

# Normalize rubric total to [0,1] and blend with exact-match
def _normalize_total_score(total: float, pos_max: float, neg_min: float) -> float:
    """
    Map a signed total into [0,1] given bounds:
      neg_min <= total <= pos_max
    """
    lo = float(neg_min)
    hi = float(pos_max)
    span = hi - lo
    if span == 0:
        return 0.5
    norm = (float(total) - lo) / span
    # Clamp for safety
    return max(0.0, min(1.0, norm))


def multigrader(sample, completion, raw_rubric_items,
                *, grader_model: str = "gpt-4o-mini",
                exact_weight: float = 0.0, rubric_weight: float = 1.0):
    """
    Grade `completion` and return (final_score, details).

    The final score is `exact_weight * exact_match + rubric_weight * rubric_norm`,
    where `exact_match` comes from `simple_string_grader` and `rubric_norm` is
    the signed rubric total rescaled by `_normalize_total_score`. The signed
    total is recomputed here from the grader's per-item decisions and the
    rubric scores.

    `details` holds exact_match, rubric_total (signed), rubric_norm,
    rubric_bounds and rubric_details ({"items": enriched per-item rows}).
    """
    # Exact-match channel.
    exact_score = simple_string_grader(sample, completion)

    # Per-item decisions from the grader model, plus the reachable score range.
    decisions, bounds = model_grader_with_rubric(
        sample, completion, raw_rubric_items, grader_model=grader_model
    )

    # Signed total and enriched rows derived from those decisions.
    signed_total, item_rows = _compute_signed_total_from_decisions(decisions, raw_rubric_items)

    # Rescale, then blend the two channels.
    upper = bounds.get("pos_max", 0.0)
    lower = bounds.get("neg_min", 0.0)
    rubric_fraction = _normalize_total_score(signed_total, upper, lower)
    blended = (exact_weight * float(exact_score)) + (rubric_weight * float(rubric_fraction))

    details = {
        "exact_match": float(exact_score),
        "rubric_total": float(signed_total),
        "rubric_norm": float(rubric_fraction),
        "rubric_bounds": bounds,
        "rubric_details": {"items": item_rows},
    }
    return blended, details


In [4]:
# UPDATED GRADIO UI WITH REGRADE SUPPORT
# ---------- Helpers ----------
def _fpath(obj):
    if obj is None:
        return None
    return getattr(obj, "name", None) or getattr(obj, "path", None) or str(obj)

def _read_text_file(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def _read_json_file(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return json.load(f)

def _ensure_client_from_key(api_key_text):
    # Reuse existing client if present; otherwise create from textbox/env
    if "client" in globals() and globals()["client"] is not None:
        return globals()["client"]
    api_key = (api_key_text or "").strip() or os.getenv("OPENAI_API_KEY", "")
    if not api_key:
        raise RuntimeError("No OpenAI client found and no API key provided.")
    os.environ["OPENAI_API_KEY"] = api_key
    try:
        from openai import OpenAI
    except Exception as e:
        raise RuntimeError("Please install openai>=1.40.0") from e
    globals()["client"] = OpenAI(api_key=api_key)
    return globals()["client"]

# File types uploaded to File Search unchanged (kept conservative).
# NOTE(review): ".rtf" and ".htm" were removed from this set because they do not
# appear in OpenAI's published list of File Search formats — confirm against the
# current docs. They are converted to .txt instead, so such attachments still
# reach the vector store rather than risking a rejected upload.
SUPPORTED_FOR_RETRIEVAL = {
    ".pdf", ".txt", ".md", ".html", ".docx", ".pptx"
}

# Types transparently converted to .txt before upload
# (CSV/TSV are additionally truncated by _stage_for_retrieval).
CONVERT_TO_TXT = {
    ".csv", ".tsv", ".py", ".json", ".yaml", ".yml", ".ipynb",
    ".rtf", ".htm",
}

def _stage_for_retrieval(paths, *, max_csv_rows=2000):
    """
    Ensure every path is uploadable by File Search:
      - If extension is supported => keep as is
      - Else => convert to .txt (CSV/TSV truncated to first `max_csv_rows` rows)
    Returns (staged_paths, notes) where staged_paths are Paths safe to upload.
    """
    staging_dir = Path.cwd() / "_staged_refs"
    staging_dir.mkdir(parents=True, exist_ok=True)

    staged, notes = [], []
    for p in paths:
        p = Path(p)
        ext = p.suffix.lower()

        if ext in SUPPORTED_FOR_RETRIEVAL:
            staged.append(p)
            notes.append(f"Using as-is: {p.name}")
            continue

        if ext in CONVERT_TO_TXT:
            txt_name = f"{p.stem}{ext.replace('.', '_')}.txt"
            txt_path = staging_dir / txt_name
            try:
                if ext in {".csv", ".tsv"}:
                    # Load limited rows; write back as CSV text (UTF-8)
                    sep = "\t" if ext == ".tsv" else ","
                    df = pd.read_csv(p, sep=sep, nrows=max_csv_rows, dtype=str, encoding="utf-8", engine="python")
                    txt_path.write_text(df.to_csv(index=False), encoding="utf-8")
                    notes.append(f"Converted {p.name} → {txt_path.name} (first {len(df)} rows).")
                else:
                    # Treat as text-like: code, json, yaml, ipynb (raw)
                    raw = p.read_text(encoding="utf-8", errors="ignore")
                    header = f"### Original file: {p.name}\n\n"
                    txt_path.write_text(header + raw, encoding="utf-8")
                    notes.append(f"Converted {p.name} → {txt_path.name}.")
                staged.append(txt_path)
            except Exception as e:
                notes.append(f"Skipped {p.name}: {e}")
            continue

        # Unknown/unsupported: try raw text fallback; if fails, skip
        try:
            txt_path = staging_dir / f"{p.stem}{ext.replace('.', '_')}.txt"
            raw = p.read_text(encoding="utf-8", errors="ignore")
            txt_path.write_text(f"### Original file: {p.name}\n\n" + raw, encoding="utf-8")
            staged.append(txt_path)
            notes.append(f"Converted (fallback) {p.name} → {txt_path.name}.")
        except Exception as e:
            notes.append(f"Skipped {p.name}: unsupported and not convertible ({e}).")

    return staged, notes


def _create_vs_and_upload_refs(client, ref_file_objs):
    """
    Creates a vector store and uploads supporting refs.
    Unsupported types (e.g., CSV, PY) are auto-converted to .txt first.
    Sets global `vs_id` and `VS`.
    """
    VS = getattr(client, "vector_stores", None) or client.beta.vector_stores
    vs = VS.create(name="prompt_refs")
    vs_id = vs.id

    # Collect local paths
    ref_paths = []
    for rf in (ref_file_objs or []):
        p = _fpath(rf)
        if p and Path(p).exists():
            ref_paths.append(Path(p))

    # Stage for retrieval (convert unsupported → .txt)
    staged_paths, notes = _stage_for_retrieval(ref_paths)
    # Make these notes visible in the UI log if `log` exists in outer scope
    try:
        for n in notes:
            print(n)  # Gradio captures stdout; plus we append to logs in run_pipeline
    except:
        pass

    if staged_paths:
        file_batches = getattr(VS, "file_batches", None)
        if file_batches and hasattr(file_batches, "upload_and_poll"):
            streams = [open(str(p), "rb") for p in staged_paths]
            try:
                file_batches.upload_and_poll(vector_store_id=vs_id, files=streams)
            finally:
                for s in streams:
                    try: s.close()
                    except: pass
        else:
            for p in staged_paths:
                f = client.files.create(file=open(str(p), "rb"), purpose="assistants")
                VS.files.create(vector_store_id=vs_id, file_id=f.id)

    globals()["vs_id"] = vs_id
    globals()["VS"] = VS
    return vs_id

def _build_items_table(items_raw):
    """
    Normalize per-item rows into a tidy DataFrame:
    ['#','criterion','score','rationale','verdict','awarded']
    """
    rows = []
    for idx, it in enumerate(items_raw or [], start=1):
        criterion_number = it.get("criterion_number", idx)
        criterion = it.get("criterion") or it.get("rubricItemId") or ""
        point_value = it.get("point_value", it.get("score", 0.0))
        try: point_value = float(point_value)
        except: point_value = 0.0
        met = bool(it.get("met", False))
        verdict = "Met" if met else "Not Met"
        awarded = it.get("awarded", point_value if met else 0.0)
        try: awarded = float(awarded)
        except: awarded = 0.0
        rationale = it.get("rationale", "")
        rows.append({
            "#": criterion_number,
            "criterion": criterion,
            "score": point_value,
            "rationale": rationale,
            "verdict": verdict,
            "awarded": awarded,
        })
    if not rows:
        return pd.DataFrame(columns=["#", "criterion", "score", "rationale", "verdict", "awarded"])
    df = pd.DataFrame(rows)
    if df["#"].notna().any():
        df = df.sort_values(by=["#"], kind="stable")
    return df[["#", "criterion", "score", "rationale", "verdict", "awarded"]]

def _out_dir():
    """
    Cross-platform output dir:
      1) env GRADING_REPORT_DIR if set
      2) ~/Downloads if it exists
      3) ./grading_reports
    """
    env = os.getenv("GRADING_REPORT_DIR", "")
    if env:
        d = Path(env).expanduser()
    else:
        candidates = [Path.home()/ "Downloads", Path.home()/ "OneDrive"/ "Downloads"]
        d = next((p for p in candidates if p.exists()), Path.cwd()/ "grading_reports")
    d.mkdir(parents=True, exist_ok=True)
    return d

def _make_pdf(output_path, context):
    """
    Render the grading report as a letter-size PDF and return its path as a string.

    The report contains a metadata header, the candidate answer in a
    monospaced block, a per-criterion table whose text cells wrap, and the
    signed / normalized totals. Parent directories of `output_path` are
    created when missing.

    Args:
        output_path: Destination file path (str or Path).
        context: Dict with the keys
            "timestamp", "candidate_model", "grader_model" (required),
            "prompt_name", "rubric_name", "ref_names", "candidate_answer" (optional),
            "items_df" (DataFrame with columns '#', 'criterion', 'score',
            'verdict', 'awarded', 'rationale'), "signed_total", "normalized".

    Returns:
        `str(output_path)`.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(
        name="Cell",
        parent=styles["Normal"],
        fontSize=9,
        leading=11,
        splitLongWords=True,
    ))
    mono = ParagraphStyle(
        name="Mono",
        parent=styles["Normal"],
        fontName="Courier",
        fontSize=9,
        leading=10,
    )

    doc = SimpleDocTemplate(
        str(output_path), pagesize=letter,
        rightMargin=36, leftMargin=36, topMargin=36, bottomMargin=36
    )

    story = []
    # Title & meta
    story.append(Paragraph("Grading Report", styles["Title"]))
    story.append(Spacer(1, 6))
    meta_lines = [
        f"Generated: {context['timestamp']}",
        f"Candidate model: {context['candidate_model']}",
        f"Grader model: {context['grader_model']}",
        f"Prompt file: {context.get('prompt_name') or '—'}",
        f"Rubric file: {context.get('rubric_name') or '—'}",
    ]
    if context.get("ref_names"):
        meta_lines.append("Attachments: " + ", ".join(context["ref_names"]))
    # Paragraph text is parsed as markup, so file/model names containing
    # '&' or '<' must be escaped; only the <br/> separators are real markup.
    story.append(Paragraph("<br/>".join(escape(str(line)) for line in meta_lines), styles["Normal"]))
    story.append(Spacer(1, 12))

    # Candidate answer
    story.append(Paragraph("Sample Response", styles["Heading2"]))
    story.append(Spacer(1, 4))
    # Preformatted never wraps by itself; cap the line length at what fits the
    # frame so long answer lines are continued instead of running off the page.
    # Courier glyphs are 0.6 em wide.
    max_chars = max(20, int(doc.width // (0.6 * mono.fontSize)))
    story.append(Preformatted(context.get("candidate_answer") or "—", mono, maxLineLength=max_chars))
    story.append(Spacer(1, 12))

    # Per-criterion table
    story.append(Paragraph("Per-Criterion Decisions", styles["Heading2"]))
    story.append(Spacer(1, 6))

    df = context["items_df"].copy()

    def P(x):
        # Wrap-capable cell: escape markup characters, keep line breaks.
        return Paragraph(escape(str(x)).replace("\n", "<br/>"), styles["Cell"])

    headers = ["#", "Criterion", "Score", "Verdict", "Awarded", "Rationale"]
    table_data = [headers]
    for _, r in df.iterrows():
        table_data.append([
            str(r["#"]),
            P(r["criterion"]),
            f"{float(r['score']):g}",
            str(r["verdict"]),
            f"{float(r['awarded']):g}",
            P(r["rationale"]),
        ])

    # Column widths as fractions of the frame width; `scale` guards against
    # the fractions summing to more than 1.
    fractions = [0.07, 0.34, 0.09, 0.12, 0.11, 0.27]
    scale = min(1.0, 1.0 / sum(fractions))
    col_widths = [doc.width * f * scale for f in fractions]

    tbl = Table(table_data, colWidths=col_widths, repeatRows=1)
    tbl.setStyle(TableStyle([
        ("FONT", (0,0), (-1,0), "Helvetica-Bold", 10),
        ("BACKGROUND", (0,0), (-1,0), colors.lightgrey),
        ("LINEABOVE", (0,0), (-1,0), 0.5, colors.black),
        ("LINEBELOW", (0,0), (-1,0), 0.5, colors.black),
        ("GRID", (0,1), (-1,-1), 0.25, colors.grey),
        ("VALIGN", (0,0), (-1,-1), "TOP"),
        ("LEFTPADDING", (0,0), (-1,-1), 4),
        ("RIGHTPADDING", (0,0), (-1,-1), 4),
        ("WORDWRAP", (0,0), (-1,-1), True),
    ]))
    story.append(tbl)
    story.append(Spacer(1, 12))

    signed = float(context["signed_total"])
    norm = float(context["normalized"])
    story.append(Paragraph("Totals", styles["Heading2"]))
    story.append(Paragraph(f"Total (signed): <b>{signed:g}</b>", styles["Normal"]))
    story.append(Paragraph(f"Normalized (0..1): <b>{norm:.3f}</b>", styles["Normal"]))

    doc.build(story)
    return str(output_path)

# ---------- Main pipelines ----------
def run_pipeline(prompt_file, rubric_file, ref_files,
                 candidate_model, grader_model,
                 api_key_text, exact_weight, rubric_weight):
    """
    Full path: generate a candidate answer, grade it, and write a PDF report.

    Returns a 10-tuple in the order the Gradio outputs expect:
      (final score, signed total, candidate answer, per-item DataFrame,
       PDF path, log text,
       prompt text, prompt file name, attachment names, candidate answer)
    The last four values are the session state kept for a later regrade.
    On a validation problem or any exception, the score/PDF slots are None,
    the answer is "", the table is empty, the log carries the message, and
    the four state values are reset to empty.
    """
    run_log = []

    def note(message):
        run_log.append(str(message))

    def abort(message):
        # Failure shape shared by validation errors and exceptions.
        return (None, None, "", pd.DataFrame(), None, "\n".join(run_log + [message]),
                "", "", [], "")

    try:
        api_client = _ensure_client_from_key(api_key_text)
        note("OpenAI client ready.")

        if not (prompt_file and rubric_file):
            return abort("Please upload both a prompt and a rubric.")

        prompt_path = _fpath(prompt_file)
        rubric_path = _fpath(rubric_file)
        prompt_text = _read_text_file(prompt_path)
        rubric_raw = _read_json_file(rubric_path)

        rubric_ok = isinstance(rubric_raw, list) and all(
            ("criterion" in entry and "score" in entry) for entry in rubric_raw
        )
        if not rubric_ok:
            return abort("Rubric must be a JSON list of {criterion, score}.")

        prompt_name = Path(prompt_path).name if prompt_path else ""
        rubric_name = Path(rubric_path).name if rubric_path else ""
        ref_names = []
        for ref in (ref_files or []):
            if _fpath(ref):
                ref_names.append(Path(_fpath(ref)).name)

        note(f"Loaded prompt ({len(prompt_text)} chars) and rubric ({len(rubric_raw)} items).")

        # Attachments (optional) for candidate generation.
        store_id = _create_vs_and_upload_refs(api_client, ref_files)
        note(f"Vector store ready: {store_id}")
        note("Attachments processed (unsupported types auto-converted to .txt for retrieval).")

        # Candidate generation.
        candidate_answer = generate_candidate_answer(prompt_text, candidate_model=candidate_model)
        note("Candidate answer generated.")

        # Grading.
        score, details = multigrader(
            {"prompt": prompt_text, "reference": ""}, candidate_answer, rubric_raw,
            grader_model=grader_model,
            exact_weight=float(exact_weight), rubric_weight=float(rubric_weight),
        )
        note("Grading complete.")

        details = details or {}
        items_df = _build_items_table(details.get("rubric_details", {}).get("items", []))
        signed_total = float(details.get("rubric_total", 0.0))
        final_norm = float(score)

        # PDF report in the output directory chosen by _out_dir().
        report_dir = _out_dir()
        stamp_text = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        stamp_file = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        pdf_path = report_dir / f"grading_report_{stamp_file}.pdf"
        report_context = {
            "timestamp": stamp_text,
            "candidate_model": candidate_model,
            "grader_model": grader_model,
            "prompt_name": prompt_name,
            "rubric_name": rubric_name,
            "ref_names": ref_names,
            "candidate_answer": candidate_answer,
            "items_df": items_df,
            "signed_total": signed_total,
            "normalized": final_norm,
        }
        _make_pdf(pdf_path, report_context)
        note(f"PDF created: {pdf_path}")

        # Display outputs followed by the session state for regrading.
        return (final_norm, signed_total, candidate_answer, items_df, str(pdf_path), "\n".join(run_log),
                prompt_text, prompt_name, ref_names, candidate_answer)

    except Exception as exc:
        trace = traceback.format_exc(limit=2)
        return abort(f"ERROR: {exc}\n{trace}")

def regrade_pipeline(rubric_file, grader_model, api_key_text,
                     exact_weight, rubric_weight,
                     saved_prompt_text, saved_prompt_name, saved_ref_names, saved_candidate_answer,
                     override_answer):
    """
    Regrade an existing candidate answer without regenerating it.

    The answer graded is `override_answer` when it contains non-whitespace
    text, otherwise `saved_candidate_answer`. A whitespace-only override with
    no saved answer is reported as "no answer available" instead of silently
    grading an empty string. The rubric file may differ from the one used in
    the original run.

    Returns a 10-tuple in the order the Gradio outputs expect:
      (final score, signed total, graded answer, per-item DataFrame,
       PDF path, log text,
       prompt text, prompt file name, attachment names, saved answer)
    On success the saved answer becomes the answer that was graded. On a
    validation problem or exception the score/PDF slots are None, the table
    is empty, and the incoming session-state values are returned unchanged.
    """
    logs = []

    def log(x):
        logs.append(str(x))

    def fail(message, shown_answer):
        # Failure shape: nothing graded, session state passed through untouched.
        return (None, None, shown_answer, pd.DataFrame(), None, "\n".join(logs + [message]),
                saved_prompt_text, saved_prompt_name, saved_ref_names, saved_candidate_answer)

    try:
        _ensure_client_from_key(api_key_text)
        log("OpenAI client ready (regrade).")

        if not rubric_file:
            return fail("Please provide a rubric file to regrade.", saved_candidate_answer or "")

        # Resolve the answer BEFORE checking availability, using the same rule
        # that decides what gets graded (blank override => fall back to saved).
        has_override = bool(override_answer and override_answer.strip())
        candidate_answer = override_answer if has_override else saved_candidate_answer
        if not candidate_answer:
            return fail(
                "No candidate answer available to regrade. Run a full generation first or paste an override.",
                "",
            )

        rubric_path = _fpath(rubric_file)
        rubric_raw = _read_json_file(rubric_path)
        if not isinstance(rubric_raw, list) or not all(("criterion" in x and "score" in x) for x in rubric_raw):
            return fail("Rubric must be a JSON list of {criterion, score}.", saved_candidate_answer or "")

        rubric_name = Path(rubric_path).name if rubric_path else ""
        prompt_text = saved_prompt_text or ""

        # Grade (no generation)
        sample = {"prompt": prompt_text, "reference": ""}
        final_norm, details = multigrader(
            sample, candidate_answer, rubric_raw,
            grader_model=grader_model,
            exact_weight=float(exact_weight), rubric_weight=float(rubric_weight),
        )
        log("Regrade complete.")

        items_raw = (details or {}).get("rubric_details", {}).get("items", [])
        items_df = _build_items_table(items_raw)
        signed_total = float((details or {}).get("rubric_total", 0.0))
        final_norm = float(final_norm)

        # PDF (labelled as a regrade). One clock read feeds both the printed
        # timestamp and the file name so the two always agree.
        out_dir = _out_dir()
        now = datetime.datetime.now()
        pdf_path = out_dir / f"grading_regrade_{now.strftime('%Y%m%d_%H%M%S')}.pdf"
        _make_pdf(pdf_path, {
            "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
            "candidate_model": "(regrade - unchanged)",  # candidate model irrelevant on regrade
            "grader_model": grader_model,
            "prompt_name": saved_prompt_name,
            "rubric_name": rubric_name,
            "ref_names": saved_ref_names or [],
            "candidate_answer": candidate_answer,
            "items_df": items_df,
            "signed_total": signed_total,
            "normalized": final_norm,
        })
        log(f"Regrade PDF created: {pdf_path}")

        # Persist the possibly overridden answer as the new saved answer.
        return (final_norm, signed_total, candidate_answer, items_df, str(pdf_path), "\n".join(logs),
                saved_prompt_text, saved_prompt_name, saved_ref_names, candidate_answer)

    except Exception as e:
        tb = traceback.format_exc(limit=2)
        return fail(f"ERROR: {e}\n{tb}", saved_candidate_answer or "")

# ---------- Build the Gradio app ----------
# Choices pre-populated in the "Candidate model" dropdown. The first entry is
# the initial selection; the dropdown also accepts custom values.
default_candidate_models = [
    "gpt-4o-mini",
    "gpt-4.1",
    "o4-mini-2025-04-16",
    "o3-mini-2025-01-31",
]
# Choices pre-populated in the "Grader model" dropdown (first entry is the
# initial selection; custom values are accepted as well).
default_grader_models = [
    "gpt-4.1",
    "gpt-4o-mini",
    "o4-mini-2025-04-16",
]

# UI definition. Components are declared top to bottom: file inputs, model
# pickers, advanced/auth settings, action buttons, outputs, regrade options.
# Note that names such as `candidate_model` or `final_norm` are bound here to
# Gradio components at module level, not to plain values.
with gr.Blocks(title="Rubric Grader") as demo:
    gr.Markdown("## Rubric Grader\nUpload your **prompt** (.txt/.md), **rubric** (.json), and any **attachments**. Choose models and click **Generate & Grade**. Then you can **Regrade** the same answer with a different grader model.")

    # File inputs: prompt and rubric are single files; attachments may be several.
    with gr.Row():
        prompt_file = gr.File(label="Prompt (.txt / .md)", file_types=[".txt", ".md"], type="filepath")
        rubric_file = gr.File(label="Rubric (.json)", file_types=[".json"], type="filepath")
        ref_files   = gr.Files(label="Attachments for generation (optional)", file_count="multiple")

    # Model pickers (defaults come from the lists defined above).
    with gr.Row():
        candidate_model = gr.Dropdown(default_candidate_models, label="Candidate model", value=default_candidate_models[0], allow_custom_value=True)
        grader_model    = gr.Dropdown(default_grader_models,    label="Grader model",    value=default_grader_models[0], allow_custom_value=True)

    # Collapsed by default: API key entry and the two blend weights that are
    # passed to run_pipeline / regrade_pipeline as exact_weight / rubric_weight.
    with gr.Accordion("Advanced / Auth", open=False):
        api_key_text = gr.Textbox(label="OpenAI API Key (optional if set in env)", type="password", placeholder="sk-...")
        with gr.Row():
            exact_weight  = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Exact-match weight")
            rubric_weight = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Rubric weight")

    # Primary actions
    with gr.Row():
        run_btn = gr.Button("Generate & Grade", variant="primary")
        regrade_btn = gr.Button("Regrade current answer (skip generation)")

    # Outputs (filled by the first six values returned by either pipeline)
    with gr.Row():
        final_norm  = gr.Number(label="Final score (normalized 0..1)")
        total_signed = gr.Number(label="Total score (signed)")
    sample_response = gr.Textbox(label="Sample response", lines=12, interactive=False)
    items_table_out = gr.Dataframe(label="Per-criterion", interactive=False, wrap=True)
    pdf_out = gr.File(label="Download final deliverable (PDF)")
    logs_out = gr.Textbox(label="Run log", lines=6)

    # Optional override for regrading:
    with gr.Accordion("Regrade options", open=False):
        override_answer = gr.Textbox(label="Override candidate answer (optional)", lines=8, placeholder="Paste a candidate answer here to regrade it without regenerating.")

    # ---- Session state (hidden) ----
    # Both pipelines return new values for these four states as their last
    # four outputs; regrade_pipeline also receives them as inputs.
    saved_prompt_text = gr.State("")
    saved_prompt_name = gr.State("")
    saved_ref_names   = gr.State([])
    saved_candidate_answer = gr.State("")

    # Wiring: Generate & Grade
    # The `inputs` order must match run_pipeline's parameter order, and the
    # `outputs` order must match the 10-tuple it returns.
    run_btn.click(
        fn=run_pipeline,
        inputs=[prompt_file, rubric_file, ref_files,
                candidate_model, grader_model,
                api_key_text, exact_weight, rubric_weight],
        outputs=[final_norm, total_signed, sample_response, items_table_out, pdf_out, logs_out,
                 saved_prompt_text, saved_prompt_name, saved_ref_names, saved_candidate_answer],
    )

    # Wiring: Regrade (no generation)
    # Same output layout as above; inputs follow regrade_pipeline's parameter order.
    regrade_btn.click(
        fn=regrade_pipeline,
        inputs=[rubric_file, grader_model, api_key_text,
                exact_weight, rubric_weight,
                saved_prompt_text, saved_prompt_name, saved_ref_names, saved_candidate_answer,
                override_answer],
        outputs=[final_norm, total_signed, sample_response, items_table_out, pdf_out, logs_out,
                 saved_prompt_text, saved_prompt_name, saved_ref_names, saved_candidate_answer],
    )

# Start the app; share=False means no public share link is requested.
demo.launch(share=False)


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


