# Agentic Explainability Workflow – Usage

This notebook runs the agentic explainability pipeline: ask a natural-language question about optimization results, get a counterfactual run and a summary (trade-offs or infeasibility conflict).

**Prerequisites:**
- `config/secrets.env` with `OPENAI_API_KEY` and `GUROBI_LICENSE_FILE=config/WLS-dev-key.lic`
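- Run sections 2 and 3 once to create the baseline and RAG index. On later runs, still execute the cells in sections 1 and 2 (the baseline cell skips the Gurobi solve when its outputs already exist, but it defines `DATA_DIR`, `OUTPUTS_DIR`, `inputs`, and `env_kwargs`, which the workflow section uses), then continue with the workflow section.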

## 1. Setup paths and load secrets

In [20]:
# Debug aid: print this kernel's process ID and the arguments it was launched with.
# NOTE: the dict key below says "Kernel ID", but the value printed is sys.argv; in the
# recorded output that is the ipykernel launcher path plus the `--f=<connection file>` argument.
import sys, os
print("PID:", os.getpid())
import pprint
pprint.pprint({"Kernel ID": sys.argv})

PID: 38325
{'Kernel ID': ['/Users/Larry.Jin/miniconda3/envs/rag/lib/python3.11/site-packages/ipykernel_launcher.py',
               '--f=/Users/Larry.Jin/Library/Jupyter/runtime/kernel-v3492970f19098298ea80574fb0c222b563e691e6e.json']}


In [1]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Project root: directory that contains agentic_explain and use_case
def find_project_root():
    """Locate the project root by walking up from the current working directory.

    A directory qualifies when it contains both an ``agentic_explain`` and a
    ``use_case`` sub-directory. The current working directory is checked first,
    then each ancestor from nearest to farthest.

    Returns:
        Path: the first qualifying directory, or the current working directory
        when no candidate qualifies.
    """
    here = Path.cwd()
    required_dirs = ("agentic_explain", "use_case")
    for candidate in (here, *here.parents):
        if all((candidate / name).is_dir() for name in required_dirs):
            return candidate
    return here

# Resolve the project root once; the paths used in later cells are built from it.
PROJECT_ROOT = find_project_root()

# Put the project root at the front of sys.path (only once, so re-running the cell
# does not add duplicates).
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Deliberately placed after the sys.path update above.
# NOTE(review): presumably `config` is a package directly under PROJECT_ROOT
# (the prerequisites mention config/secrets.env) — confirm.
from config.load_secrets import load_secrets, get_gurobi_env_kwargs

load_secrets()  # load from config/secrets.env or project root
print("Project root:", PROJECT_ROOT)

Project root: /Users/Larry.Jin/Documents/research/agent_explain


## 2. Run baseline (once)

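Saves `outputs/baseline_result.json`, `outputs/model.lp`, `outputs/model.mps`. Run this cell even if those files already exist: with `SKIP_IF_BASELINE_EXISTS = True` the Gurobi solve is skipped automatically, and the cell still defines `DATA_DIR`, `OUTPUTS_DIR`, `inputs`, and `env_kwargs`, which later sections rely on.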

In [2]:
import json
from gurobipy import GRB

from use_case.staffing_model import load_raw_data, process_data, build_gurobi_model

from use_case.staffing_model import STAFFING_DATA_DIR, STAFFING_OUTPUTS_DIR
# Local aliases for the staffing use case's data and output directories.
DATA_DIR = STAFFING_DATA_DIR
OUTPUTS_DIR = STAFFING_OUTPUTS_DIR
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# Controller: if baseline outputs already exist, skip running Gurobi
SKIP_IF_BASELINE_EXISTS = True  # Set this to False to force rerun

# Files written by a baseline run (solution JSON plus LP and MPS exports of the model).
baseline_result_path = OUTPUTS_DIR / "baseline_result.json"
model_lp_path = OUTPUTS_DIR / "model.lp"
model_mps_path = OUTPUTS_DIR / "model.mps"

# Loaded unconditionally: `inputs` and `env_kwargs` are also passed to create_workflow
# in section 4, so they are needed even when the solve is skipped.
raw = load_raw_data(DATA_DIR)
inputs = process_data(
    raw["fte_mapping"],
    raw["concurrent_projects"],
    raw["oversight_ds_list"],
    raw["ds_list"],
    raw["project_list"],
)
env_kwargs = get_gurobi_env_kwargs()

def baseline_outputs_exist():
    """Report whether every baseline artifact (result JSON, LP file, MPS file) is on disk."""
    required_files = (baseline_result_path, model_lp_path, model_mps_path)
    return all(artifact.exists() for artifact in required_files)

# Solve the baseline model unless its outputs are already on disk.
if SKIP_IF_BASELINE_EXISTS and baseline_outputs_exist():
    print("Skipping Gurobi model run: baseline outputs already exist in", OUTPUTS_DIR)
else:
    model = build_gurobi_model(inputs, env_kwargs)
    model.setParam(GRB.Param.TimeLimit, 100)
    model.optimize()

    # A TIME_LIMIT status does not guarantee an incumbent: if the solver stopped before
    # finding any feasible point, ObjVal and v.X are unavailable and reading them raises.
    # SolCount tells us whether there is a solution to export.
    solved_or_timed_out = model.status in (GRB.OPTIMAL, GRB.TIME_LIMIT)
    if solved_or_timed_out and model.SolCount > 0:
        baseline_result = {
            "status": "optimal" if model.status == GRB.OPTIMAL else "time_limit",
            "objective_value": model.ObjVal,
            "decision_variables": {v.VarName: v.X for v in model.getVars()},
        }
        with open(baseline_result_path, "w", encoding="utf-8") as f:
            json.dump(baseline_result, f, indent=2)
        model.write(str(model_lp_path))
        model.write(str(model_mps_path))
        print("Baseline saved to", OUTPUTS_DIR)
    else:
        print("Model status:", model.status)
        print("No solution available (SolCount =", model.SolCount, "); baseline outputs were not written.")

Skipping Gurobi model run: baseline outputs already exist in /Users/Larry.Jin/Documents/research/agent_explain/use_case/staffing_model/outputs


## 3. Build RAG index (once)

Builds `outputs/rag_index/` from the formulation .py, .lp, .mps, and data. The build is skipped automatically when the index files already exist; set `REBUILD_RAG_INDEX = True` to force a rebuild.

In [3]:
from agentic_explain.rag.build_index import build_rag_index
import os

# Inputs handed to build_rag_index: the model formulation source and the exported LP/MPS files.
py_path = PROJECT_ROOT / "use_case" / "staffing_model" / "staffing_model.py"
lp_path = OUTPUTS_DIR / "model.lp"
mps_path = OUTPUTS_DIR / "model.mps"
# Directory passed as persist_dir to the index build.
rag_index_dir = OUTPUTS_DIR / "rag_index"

# Default: only rebuild if index files are missing
REBUILD_RAG_INDEX = False  # set to True to force rebuild

# To skip rebuilding if already built, check for the LlamaIndex SimpleVectorStore files
def rag_index_exists(rag_index_dir):
    """Check whether a persisted index is present in ``rag_index_dir``.

    Args:
        rag_index_dir: ``pathlib.Path`` of the index persistence directory.

    Returns:
        bool: True only when the directory exists and contains ``docstore.json``,
        ``default__vector_store.json`` and ``index_store.json``.
    """
    required_names = ("docstore.json", "default__vector_store.json", "index_store.json")
    return rag_index_dir.exists() and all(
        (rag_index_dir / name).exists() for name in required_names
    )

# Reuse the persisted index unless a rebuild is forced or files are missing.
if not REBUILD_RAG_INDEX and rag_index_exists(rag_index_dir):
    print(f"Skipping RAG build: index already exists at {rag_index_dir}")
else:
    # LP/MPS exports are optional inputs: pass None for whichever file is absent.
    lp_source = lp_path if lp_path.exists() else None
    mps_source = mps_path if mps_path.exists() else None
    build_rag_index(
        py_path=py_path,
        lp_path=lp_source,
        mps_path=mps_source,
        data_dir=DATA_DIR,
        persist_dir=rag_index_dir,
    )
    print("RAG index built at", rag_index_dir)


Skipping RAG build: index already exists at /Users/Larry.Jin/Documents/research/agent_explain/use_case/staffing_model/outputs/rag_index


### 3a. Inspect RAG Chunks

Visualize every chunk that was indexed, grouped by **source** (py, lp, mps, index_mapping).
Each chunk is shown with its metadata and text (word-wrapped for readability).

In [4]:
import textwrap
from collections import Counter
from agentic_explain.rag.build_index import collect_raw_chunks

# Same path expressions as in the index-build cell (section 3), re-declared here.
py_path  = PROJECT_ROOT / "use_case" / "staffing_model" / "staffing_model.py"
lp_path  = OUTPUTS_DIR / "model.lp"
mps_path = OUTPUTS_DIR / "model.mps"

# Collect the chunks for inspection; a missing LP/MPS file is passed as None.
# Each chunk is used below as a dict with "metadata" and "text" keys.
raw_chunks = collect_raw_chunks(
    py_path=py_path,
    lp_path=lp_path if lp_path.exists() else None,
    mps_path=mps_path if mps_path.exists() else None,
    data_dir=DATA_DIR,
)

# Summary table: tally chunks per source and per "source/section" pair in one pass
source_counts = Counter()
section_counts = Counter()
for c in raw_chunks:
    source_counts[c["metadata"].get("source", "?")] += 1
    section_counts[f"{c['metadata'].get('source','?')}/{c['metadata'].get('section','?')}"] += 1

print(f"Total chunks: {len(raw_chunks)}\n")

print("By source:")
for src in sorted(source_counts):
    cnt = source_counts[src]
    print(f"  {src:20s} {cnt:4d} chunks")

print("\nBy source/section:")
for key in sorted(section_counts):
    cnt = section_counts[key]
    print(f"  {key:40s} {cnt:4d} chunks")

Total chunks: 18298

By source:
  index_mapping           1 chunks
  lp                   18265 chunks
  py                     32 chunks

By source/section:
  index_mapping/index_mapping                 1 chunks
  lp/bounds                                   1 chunks
  lp/constraints                           18262 chunks
  lp/objective                                1 chunks
  lp/variables                                1 chunks
  py/constraints                              9 chunks
  py/functions                                3 chunks
  py/index_mapping                            4 chunks
  py/objectives                               8 chunks
  py/overview                                 1 chunks
  py/problem_overview                         1 chunks
  py/variables                                6 chunks


In [5]:
# === Browse chunks: pick a source to sample ===
# Change SOURCE_FILTER to inspect different sources: "py", "lp", "mps", "index_mapping", or None for all
SOURCE_FILTER = "index_mapping"       # <-- change me
MAX_DISPLAY   = 10         # how many chunks to show

# Keep chunks whose metadata "source" equals SOURCE_FILTER; None keeps every chunk.
filtered = [c for c in raw_chunks if SOURCE_FILTER is None or c["metadata"].get("source") == SOURCE_FILTER]
print(f"Showing {min(MAX_DISPLAY, len(filtered))} of {len(filtered)} chunks (source={SOURCE_FILTER or 'all'})\n")

for i, chunk in enumerate(filtered[:MAX_DISPLAY]):
    meta = chunk["metadata"]
    text = chunk["text"]
    # Header
    print(f"{'─' * 80}")
    print(f"Chunk {i}  |  source={meta.get('source')}  section={meta.get('section')}")
    # Remaining metadata: everything except the keys already shown and "path".
    extra_keys = {k: v for k, v in meta.items() if k not in ("source", "section", "path")}
    if extra_keys:
        print(f"         |  {extra_keys}")
    print(f"         |  chars={len(text)}")
    print(f"{'─' * 80}")
    # Word-wrapped text (first 1000 chars if very long)
    display_text = text if len(text) <= 1000 else text[:1000] + "\n... [truncated]"
    # NOTE: textwrap.fill replaces whitespace by default, so the "\n" before
    # "... [truncated]" is rendered as a space rather than a line break.
    print(textwrap.fill(display_text, width=100, subsequent_indent="  "))
    print()

Showing 1 of 1 chunks (source=index_mapping)

────────────────────────────────────────────────────────────────────────────────
Chunk 0  |  source=index_mapping  section=index_mapping
         |  chars=991
────────────────────────────────────────────────────────────────────────────────
Index mapping (same model in .py, .lp, .mps): j = employee index (0 to n_employees-1):   j=0: Josh
  (Lead)   j=1: Utsav (Junior)   j=2: Shivarjun (Junior)   j=3: Nancy (Senior)   j=4: Larry
  (Manager)   j=5: Yimin (Lead)   j=6: Minnie (Junior)   j=7: Jianchen (Senior)   j=8: Stefano
  (Senior)   j=9: Yash (Manager)   j=10: Bhavya (Senior)   j=11: Arpit (Junior)   j=12: Jason
  (Junior)   j=13: Sruti (Senior) d = project index (0 to n_projects-1):   d=0: PSO Base development
  d=1: PSO v8 migration   d=2: PSO product demo   d=3: Paa PSO COE   d=4: Cgg PSO COE   d=5: Foo PSO
  Pilot   d=6: Nuu PSO COE   d=7: Saa PSO Pilot   d=8: IO Base development   d=9: IO product demo
  d=10: Ipp IO Pilot   d=11: Moo I

### 3b. Inspect Persisted Index: Docstore Nodes & Embedding Vectors

After LlamaIndex builds the index, it sub-chunks your documents into smaller nodes and embeds each one.
The persisted files (`docstore.json`, `default__vector_store.json`) are single-line JSON and too large to open in an IDE.
This cell **samples** a few nodes and their embeddings so you can inspect them here.

> **Note**: `docstore.json` (31 MB, 18k+ nodes) and `default__vector_store.json` (646 MB, 18k embeddings × 1536 dims) are written by LlamaIndex in compact single-line JSON. The cell below loads and pretty-prints samples without modifying the files.

In [6]:
import json, textwrap, random

rag_dir = OUTPUTS_DIR / "rag_index"
NUM_SAMPLES = 5  # how many nodes/vectors to show
SAMPLE_SEED = None  # set to an int to draw the same samples on every run
sampler = random.Random(SAMPLE_SEED)  # private RNG for this cell's sampling

# ── 1. Docstore: sample text nodes ──────────────────────────────────────────
# Explicit UTF-8, matching how the other JSON files in this notebook are opened.
with open(rag_dir / "docstore.json", "r", encoding="utf-8") as f:
    docstore = json.load(f)

doc_data = docstore.get("docstore/data", {})
doc_ids = list(doc_data.keys())
print(f"Docstore: {len(doc_ids)} nodes total\n")

sample_ids = sampler.sample(doc_ids, min(NUM_SAMPLES, len(doc_ids)))
for i, nid in enumerate(sample_ids):
    entry = doc_data[nid]
    # Node fields may be nested under "__data__"; fall back to the entry itself.
    d = entry.get("__data__", entry)
    text = d.get("text", "")
    meta = d.get("metadata", {})
    source = meta.get("source", "?")
    section = meta.get("section", "?")
    print(f"{'─' * 90}")
    print(f"  Node {i}  id={nid[:12]}...  source={source}  section={section}")
    print(f"  chars={len(text)}  metadata_keys={list(meta.keys())}")
    print(f"{'─' * 90}")
    display = text if len(text) <= 400 else text[:400] + "\n  ... [truncated]"
    print(textwrap.fill(display, width=95, initial_indent="  ", subsequent_indent="  "))
    print()

# ── 2. Vector store: sample embeddings ──────────────────────────────────────
print(f"{'=' * 90}")
print("  Sampling from default__vector_store.json (embeddings)")
print(f"{'=' * 90}\n")

# NOTE: json.load reads the whole vector-store file into memory (this is not a
# streaming parse). Only a few entries are printed, and the large objects are
# deleted at the end of the cell to release that memory.
vs_path = rag_dir / "default__vector_store.json"
with open(vs_path, "r", encoding="utf-8") as f:
    vs_data = json.load(f)

emb_dict = vs_data.get("embedding_dict", {})
text_to_doc = vs_data.get("text_id_to_ref_doc_id", {})
meta_dict = vs_data.get("metadata_dict", {})
emb_ids = list(emb_dict.keys())
print(f"Total embeddings: {len(emb_ids)}")
if emb_ids:
    dim = len(emb_dict[emb_ids[0]])
    print(f"Embedding dimension: {dim}\n")

sample_emb_ids = sampler.sample(emb_ids, min(NUM_SAMPLES, len(emb_ids)))
for i, eid in enumerate(sample_emb_ids):
    vec = emb_dict[eid]
    # str() keeps the slice below safe even if the stored reference is not a string.
    ref_doc = str(text_to_doc.get(eid, "?"))
    meta = meta_dict.get(eid, {})
    # Look up the text from docstore
    doc_entry = doc_data.get(eid, {})
    doc_d = doc_entry.get("__data__", doc_entry) if doc_entry else {}
    node_text = doc_d.get("text", "(not in docstore)")

    print(f"{'─' * 90}")
    print(f"  Embedding {i}  id={eid[:12]}...")
    print(f"  ref_doc_id={ref_doc[:12]}...  metadata={meta}")
    print(f"  vector (first 8 dims): {[round(v, 6) for v in vec[:8]]}...")
    print(f"  vector (last  4 dims): ...{[round(v, 6) for v in vec[-4:]]}")
    text_preview = node_text if len(node_text) <= 200 else node_text[:200] + "..."
    print(f"  text: {text_preview}")
    print()

del vs_data, emb_dict  # free memory

Docstore: 18533 nodes total

──────────────────────────────────────────────────────────────────────────────────────────
  Node 0  id=a2b11031-293...  source=lp  section=constraints
  chars=77  metadata_keys=['source', 'section', 'constraint_name', 'path']
──────────────────────────────────────────────────────────────────────────────────────────
  Constraint: indicator_constraint_0_2_1_6 - 1e+08 x[2,1,6] + x_ind[2,1,6] <= 0

──────────────────────────────────────────────────────────────────────────────────────────
  Node 1  id=6ae5a987-820...  source=lp  section=constraints
  chars=77  metadata_keys=['source', 'section', 'constraint_name', 'path']
──────────────────────────────────────────────────────────────────────────────────────────
  Constraint: indicator_constraint_0_5_0_2 - 1e+08 x[5,0,2] + x_ind[5,0,2] <= 0

──────────────────────────────────────────────────────────────────────────────────────────
  Node 2  id=abae8f1e-4f7...  source=lp  section=constraints
  chars=77  metadata_

## 4. Run the workflow

Load baseline and RAG index, then run the agentic workflow for a natural-language query.

In [7]:
from openai import OpenAI

from use_case.staffing_model import build_gurobi_model
from agentic_explain.rag.plain_rag import build_plain_rag
from agentic_explain.workflow.graph import create_workflow, invoke_workflow

# Load baseline and build Plain RAG strategy
# NOTE: this cell relies on names defined by the baseline cell in section 2
# (`json`, `DATA_DIR`, `OUTPUTS_DIR`, `inputs`, `env_kwargs`), so that cell must have run first.
with open(OUTPUTS_DIR / "baseline_result.json", "r", encoding="utf-8") as f:
    baseline_result = json.load(f)

# Same source files and persist directory as used for the index build in section 3.
rag_strategy = build_plain_rag(
    py_path=PROJECT_ROOT / "use_case" / "staffing_model" / "staffing_model.py",
    lp_path=OUTPUTS_DIR / "model.lp",
    mps_path=OUTPUTS_DIR / "model.mps",
    data_dir=DATA_DIR,
    persist_dir=OUTPUTS_DIR / "rag_index",
)
# No API key is passed explicitly.
# NOTE(review): presumably the client picks up OPENAI_API_KEY from the environment
# populated by load_secrets() — confirm.
openai_client = OpenAI()

LLM_TEMPERATURE = 0  # 0 = deterministic; increase for more varied summaries

# Assemble the workflow object that invoke_workflow runs for each query.
workflow = create_workflow(
    openai_client=openai_client,
    rag_strategy=rag_strategy,
    baseline_result=baseline_result,
    data_dir=str(DATA_DIR),
    build_model_fn=build_gurobi_model,
    inputs=inputs,
    env_kwargs=env_kwargs,
    outputs_dir=str(OUTPUTS_DIR),
    temperature=LLM_TEMPERATURE,
)

In [8]:
# # Example query: counterfactual ("why not")
# user_query = "Why was Josh not staffed on Ipp IO Pilot in week 6?"

# final_state = invoke_workflow(
#     workflow,
#     user_query,
#     baseline_result=baseline_result,
# )

In [9]:
# print("Query:", user_query)
# print()
# print("Summary:")
# import textwrap
# print(textwrap.fill(
#     final_state.get("final_summary", "(no summary)"), 
#     width=80
#     ))

# # Optional: print full debug (retrieval, LLM messages, applied constraints, comparison)
# # from agentic_explain.workflow import debug
# # debug.print_workflow_summary(final_state)

## 5. Evaluation Dataset

Load the evaluation query dataset (`use_case/staffing_model/queries.json`) and run individual queries.
Each query has a **reference answer** for LLM-as-judge evaluation.

| Category | Count | Description |
|----------|-------|-------------|
| `objective / missing_demand` | 7 | Why is project X understaffed in week Y? |
| `objective / idle_time` | 3 | Why is employee X idle in week Y? |
| `objective / staffing_consistency` | 2 | Why is employee X on project Y? |
| `objective / out_of_cohort_penalty` | 2 | Why is employee X (cohort A) on project Y (cohort B)? |
| `constraint / max_concurrency` | 5 | Why is employee X not on project Y? (at concurrency limit) |
| `constraint / demand_balance_inactive` | 4 | Why is employee X not on project Y in week Z? (project inactive) |
| `constraint / oversight_requirement` | 2 | Oversight-related staffing questions |
| `constraint / employee_allocation` | 1 | Capacity (100%) constraint |
| `mixed` | 3 | Peak crunch, specific employee requirements |

In [10]:
from use_case.staffing_model import STAFFING_QUERIES_PATH
import json

# Read the evaluation dataset (a JSON list of query records) from the use case package path
eval_path = STAFFING_QUERIES_PATH
with open(eval_path, mode="r", encoding="utf-8") as queries_file:
    eval_queries = json.loads(queries_file.read())

print(f"Loaded {len(eval_queries)} evaluation queries\n")

# One line per query: index, expected path (F = feasible, I = anything else), category, text
for i, q in enumerate(eval_queries):
    if q["expected_path"] == "feasible":
        path_marker = "F"
    else:
        path_marker = "I"
    print(f"  [{i:2d}] [{path_marker}] {q['category']:10s} / {q['subcategory']:28s}  {q['query']}")

Loaded 29 evaluation queries

  [ 0] [F] objective  / missing_demand                Why is IO Base development understaffed in week 10?
  [ 1] [F] objective  / missing_demand                Why is PSO Base development missing demand in week 15?
  [ 2] [F] objective  / missing_demand                Why is Saa DF Pilot not fully staffed in week 14?
  [ 3] [F] objective  / missing_demand                Why is DF Base development understaffed in week 16?
  [ 4] [F] objective  / missing_demand                Why is Foo PSO Pilot understaffed in week 11?
  [ 5] [F] objective  / missing_demand                Why is Saa PSO Pilot not fully staffed in week 19?
  [ 6] [F] objective  / missing_demand                Why does IO Base development have unmet demand in week 20?
  [ 7] [F] objective  / idle_time                     Why is Yimin idle in week 0?
  [ 8] [F] objective  / idle_time                     Why is Sruti idle in week 3?
  [ 9] [F] objective  / idle_time                     Why is 

In [11]:
# === Pick a query by index and run the workflow ===
QUERY_INDEX = 0  # <-- change this to run a different query

# Show the selected query together with the expectations recorded in the dataset.
q = eval_queries[QUERY_INDEX]
print(f"[{q['id']}]  {q['category']} / {q['subcategory']}")
print(f"Query:    {q['query']}")
print(f"Expected: path={q['expected_path']}, expr={q['expected_constraint_expr']}")
print("=" * 80)

# Run the workflow for this query; the recorded output shows a Gurobi solve log here.
final_state = invoke_workflow(
    workflow,
    q["query"],
    baseline_result=baseline_result,
)

# Pull out the fields the following cells compare against the reference;
# each falls back to a placeholder when the key is absent from the final state.
actual_path = final_state.get("counterfactual_status", "unknown")
actual_answer = final_state.get("final_summary", "(no summary)")
actual_exprs = final_state.get("constraint_expressions", [])

[obj1_missing_demand_01]  objective / missing_demand
Query:    Why is IO Base development understaffed in week 10?
Expected: path=feasible, expr=d_miss[10,8] == 0
Set parameter WLSAccessID
Set parameter WLSSecret
Set parameter LicenseID to value 2678051
WLS license 2678051 - registered to C3.ai
Set parameter TimeLimit to value 100
Gurobi Optimizer version 13.0.1 build v13.0.1rc0 (mac64[arm] - Darwin 25.2.0 25C56)

CPU model: Apple M3 Max
Thread count: 16 physical cores, 16 logical processors, using up to 16 threads

Non-default parameters:
TimeLimit  100

WLS license 2678051 - registered to C3.ai
Optimize a model with 18263 rows, 17260 columns and 83921 nonzeros (Min)
Model fingerprint: 0xf48b1980
Model has 1244 linear objective coefficients
Variable types: 8944 continuous, 8316 integer (8316 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+08]
  Objective range  [6e-01, 2e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [5e-01, 6e+00]

Found heuristic solution: o

In [12]:
import textwrap

def _format_answer(text, label="Answer:", label_width=13, width=95):
    """Return ``text`` word-wrapped behind a left-aligned label with a hanging indent.

    The label is used as the first line's indent, so the first line carries the label
    once and continuation lines align under the answer text (previously the first line
    was indented twice: once by the label and once by ``initial_indent``).

    Args:
        text: answer string; may be empty or None, in which case it is shown as-is.
        label: label placed at the start of the first line.
        label_width: column at which the answer text starts.
        width: maximum line width, label/indent included.

    Returns:
        str: the formatted, possibly multi-line, string.
    """
    prefix = label.ljust(label_width)
    if not text:
        return f"{prefix}{text}"
    return textwrap.fill(text, width=width, initial_indent=prefix, subsequent_indent=" " * label_width)


print(f"\n--- ACTUAL ---")
print(f"Path:        {actual_path}  {'MATCH' if actual_path == q['expected_path'] else 'MISMATCH'}")
print(f"Constraints: {actual_exprs}")
print(_format_answer(actual_answer))
print(f"\n--- REFERENCE ---")
print(f"Path:        {q['expected_path']}")
print(f"Constraint:  {q['expected_constraint_expr']}")
print(f"Theme:       {q['expected_answer_theme']}")
# .get: a query without a reference answer prints "None" instead of raising KeyError.
print(_format_answer(q.get("reference_answer")))


--- ACTUAL ---
Path:        feasible  MATCH
Constraints: ['d_miss[10,8] == 0']
Answer:                   The user's change resulted in a slight worsening of the total objective by
             approximately 1.6%, increasing from 129.81 to 131.85. The most significant changes
             were observed in the cost of missing demand, which increased by 3.8%, and idle
             time, which rose by 3.1%. This indicates that while some staffing adjustments may
             have improved project allocations, they also led to higher unmet demand and
             increased idle time for employees. The trade-off here is that although the out-of-
             cohort penalty decreased significantly (by 50%), the overall impact was negative
             due to the increased costs associated with unmet staffing needs and idle
             resources.

--- REFERENCE ---
Path:        feasible
Constraint:  d_miss[10,8] == 0
Theme:       trade-off: low understaffing cost vs higher-cost projects
Answ

### Debug & Evaluation

For **quick debug**: after running a query, run `from agentic_explain.workflow import debug; debug.print_workflow_summary(final_state)` (the commented-out lines at the end of section 4 contain this call) to print retrieval, LLM messages, applied constraints, and comparison in one go.

For **comparing Plain RAG vs Graph RAG vs No-RAG**, use `notebooks/RAGComparison.ipynb`.

### 5a. Retrieval Debug: Retrieved Chunks & Scores

After running a query above, inspect which RAG chunks were retrieved and their relevance scores.

In [13]:
import textwrap

rag_debug = final_state.get("rag_retrieval_debug", {})
stage_rule = "=" * 90
hidden_meta_keys = ("source", "section", "path")

for stage_name, info in rag_debug.items():
    # Only dict-valued stage entries are printed; the "strategy" entry is skipped.
    if stage_name != "strategy" and isinstance(info, dict):
        print(stage_rule)
        print(f"  Stage: {stage_name}")
        print(f"  Retrieval query: {info.get('query', '?')}")
        print(f"  Top-k: {info.get('top_k', '?')}")
        if "iis_constraint_names" in info:
            print(f"  IIS constraints: {info['iis_constraint_names']}")
        print(stage_rule)

        for i, chunk in enumerate(info.get("chunks", [])):
            score = chunk.get("score")
            meta = chunk.get("metadata", {})
            text = chunk.get("text", "")

            # Score line: four decimals when a score is present, otherwise N/A
            score_label = "N/A" if score is None else f"{score:.4f}"
            print(f"\n  ── Chunk {i}  score={score_label}")
            print(f"     source={meta.get('source', '?')}  section={meta.get('section', '?')}")

            extra = {k: v for k, v in meta.items() if k not in hidden_meta_keys}
            if extra:
                print(f"     {extra}")

            # Show text (truncated to 500 chars & wrapped)
            if len(text) > 500:
                display = text[:500] + "\n     ... [truncated]"
            else:
                display = text
            print(textwrap.fill(display, width=95, initial_indent="     ", subsequent_indent="     "))
        print()

  Stage: constraint_generation
  Retrieval query: Force d=8 to be adequately staffed in t=10.
  Top-k: 5

  ── Chunk 0  score=0.8034
     source=lp  section=constraints
     {'constraint_name': 'staffed_indicator_0_8_10'}
     Constraint: staffed_indicator_0_8_10 - x[8,0,10] - x[8,1,10] - x[8,2,10] - x[8,3,10]

  ── Chunk 1  score=0.8032
     source=py  section=constraints
     {'constraint_name': 'demand_balance'}
     ### demand_balance Name: Demand Balance. For each project d and week t: sum_j x[j,t,d] *
     F_j + d_miss[t,d] = D[d,t]. Description: Total staffing (allocations weighted by employee
     FTE) plus unmet demand equals required demand. Business context: Core balance of supply
     and demand. If we cannot meet demand, d_miss captures the shortage. Variables: x, d_miss.

  ── Chunk 2  score=0.8030
     source=lp  section=constraints
     {'constraint_name': 'staffed_indicator_1_8_10'}
     Constraint: staffed_indicator_1_8_10 - x[8,0,10] - x[8,1,10] - x[8,2,10] - x[8,3,1

### 5b. LLM Messages Debug

The exact system + user messages sent to the LLM at each RAG-augmented stage, and the raw response.

In [14]:
CHARS_TO_SHOW = 3000  # the user message is cut to this many characters before printing
llm_debug = final_state.get("llm_messages_debug", {})

# One section per stage: system message, (truncated) user message, raw LLM response.
for stage_name, info in llm_debug.items():
    print(f"{'=' * 90}")
    print(f"  Stage: {stage_name}")
    print(f"{'=' * 90}")

    print(f"\n  [SYSTEM MESSAGE]")
    sys_msg = info.get("system", "")
    print(textwrap.fill(sys_msg, width=95, initial_indent="  ", subsequent_indent="  "))

    print(f"\n  [USER MESSAGE]  (first {CHARS_TO_SHOW} chars)")
    user_msg = info.get("user", "")
    display_user = user_msg if len(user_msg) <= CHARS_TO_SHOW else user_msg[:CHARS_TO_SHOW] + "\n  ... [truncated]"
    print(textwrap.fill(display_user, width=95, initial_indent="  ", subsequent_indent="  "))

    print(f"\n  [RAW LLM RESPONSE]")
    # Named `raw_response` rather than `raw`: the baseline cell in section 2 stores the
    # loaded raw-data dict in `raw`, and reusing that name here would overwrite it.
    raw_response = info.get("raw_response", "")
    print(textwrap.fill(raw_response, width=95, initial_indent="  ", subsequent_indent="  "))
    print()

  Stage: constraint_generation

  [SYSTEM MESSAGE]
  You translate a user request into one or more constraint expressions for a Gurobi
  optimization model. Available decision variables (use exact names): d_miss, x, x_idle, x_ind,
  x_p_ind. Format: variable_name[index1,index2,...] == value or >= value or <= value. Value can
  be a number (e.g. 0, 1) or a data parameter (e.g. D[t,d] for demand). You may use a single
  variable or a sum of variables (e.g. x[j,t,d] + x[j+1,t,d] + ... >= D[d,t]). Example format:
  d_miss[0,0] == 1, x[0,0] == 1, x_idle[0,0] == 1. For 'force no unmet demand': d_miss[t,d] ==
  0. Use the RAG context below to understand variable dimensions and index meanings. Output
  only the constraint line(s), one per line, no explanation.

  [USER MESSAGE]  (first 3000 chars)
  RAG context: Constraint: staffed_indicator_0_8_10 - x[8,0,10] - x[8,1,10] - x[8,2,10] -
  x[8,3,10] ### demand_balance Name: Demand Balance. For each project d and week t: sum_j
  x[j,t,d] * F_j + 

### 5c. Constraint Parsing Debug

Shows how the raw LLM response was parsed into constraint expressions:
regex matches, which passed/failed variable-name validation, and why.

In [15]:
# Drill down to the parsing record of the constraint-generation stage
llm_debug = final_state.get("llm_messages_debug", {})
cg_debug = llm_debug.get("constraint_generation", {})
parsing = cg_debug.get("parsing", {})

not_captured = "(not captured)"
rule = "=" * 90
print("\n".join([rule, "  Constraint Parsing Debug", rule]))

# Attempt 1: raw response and what the parser made of it
first_response = cg_debug.get("raw_response", not_captured)
print("\n  Raw LLM response (attempt 1):")
print(f"    {first_response}")

legacy_matches = parsing.get("regex_matches", not_captured)
first_matches = parsing.get("regex_matches_attempt1", legacy_matches)
print(f"\n  Regex matches (attempt 1): {first_matches}")
print(f"  Valid expressions: {parsing.get('valid_expressions', not_captured)}")
print(f"  Rejected expressions: {parsing.get('rejected_expressions', not_captured)}")
rejected_reason = parsing.get("rejected_reason")
if rejected_reason:
    print(f"  Rejection reason: {rejected_reason}")

# Retry details, shown only when a retry response was recorded
retry_response = parsing.get("retry_raw_response")
if retry_response is not None:
    print("\n  ** RETRY triggered (attempt 1 produced no valid expressions) **")
    print("  Retry LLM response:")
    print(f"    {retry_response}")
    print(f"  Regex matches (retry): {parsing.get('regex_matches_retry', [])}")

parse_failure = parsing.get("parse_failure")
if parse_failure:
    print("\n  ** PARSE FAILURE (even after retry) **")
    print(f"    {parse_failure}")

final_exprs = final_state.get("constraint_expressions", "(key missing)")
print(f"\n  Final constraint_expressions in state: {final_exprs}")

  Constraint Parsing Debug

  Raw LLM response (attempt 1):
    d_miss[10,8] == 0

  Regex matches (attempt 1): ['d_miss[10,8] == 0']
  Valid expressions: ['d_miss[10,8] == 0']
  Rejected expressions: []

  Final constraint_expressions in state: ['d_miss[10,8] == 0']


### 5d. Applied Constraints Debug

Which constraints were actually added to the Gurobi model, and what were the baseline values of those variables?

In [16]:
cf_result = final_state.get("counterfactual_result", {})
applied = cf_result.get("applied_constraints", [])

if not applied:
    print("No constraints were applied (check counterfactual_result for errors).")
    if cf_result.get("error"):
        print(f"  Error: {cf_result['error']}")
else:
    print(f"{'=' * 90}")
    print(f"  {len(applied)} constraint(s) added to the counterfactual Gurobi model")
    print(f"{'=' * 90}")
    for i, ac in enumerate(applied):
        baseline_val = ac.get("baseline_value")
        forced_val = ac.get("forced_value")
        bv_str = f"{baseline_val:.4f}" if baseline_val is not None else "N/A"
        # The direction note needs both values; either one may be missing from the
        # record, and subtracting None would raise TypeError.
        direction = ""
        if baseline_val is not None and forced_val is not None:
            if abs(forced_val - baseline_val) < 1e-8:
                direction = "(no change from baseline)"
            elif forced_val > baseline_val:
                direction = f"(forcing UP from {bv_str})"
            else:
                direction = f"(forcing DOWN from {bv_str})"

        print(f"\n  Constraint {i}:")
        print(f"    Expression:      {ac.get('expr')}")
        print(f"    Gurobi var:      {ac.get('gurobi_var_name')}")
        print(f"    Forced value:    {forced_val}  {direction}")
        print(f"    Baseline value:  {bv_str}")
        print(f"    Var type:        {ac.get('var_type')}  bounds=[{ac.get('var_lb')}, {ac.get('var_ub')}]")
        print(f"    Constr name:     {ac.get('constraint_name')}")

    # Objective comparison, printed only when both objective values are available.
    print(f"\n  Counterfactual status: {final_state.get('counterfactual_status')}")
    cf_obj = cf_result.get("objective_value")
    base_obj = final_state.get("baseline_result", {}).get("objective_value")
    if cf_obj is not None and base_obj is not None:
        print(f"  Baseline obj:          {base_obj:.4f}")
        print(f"  Counterfactual obj:    {cf_obj:.4f}")
        print(f"  Delta:                 {cf_obj - base_obj:+.4f}")

  1 constraint(s) added to the counterfactual Gurobi model

  Constraint 0:
    Expression:      d_miss[10,8] == 0
    Gurobi var:      d_miss[10,8]
    Forced value:    0.0  (forcing DOWN from 1.5000)
    Baseline value:  1.5000
    Var type:        C  bounds=[0.0, inf]
    Constr name:     user_constr_d_miss_10_8

  Counterfactual status: feasible
  Baseline obj:          129.8083
  Counterfactual obj:    131.8528
  Delta:                 +2.0445


### 5e. Detailed Objective Comparison & Variable Changes

The compare node produces a structured breakdown of all four objective terms (baseline vs counterfactual)
and highlights which variables changed to cause the differences.

In [17]:
# Print the full comparison summary (generated by the compare node).
# Falls back to the placeholder "(no comparison)" when the key is absent from final_state.
comparison = final_state.get("comparison_summary", "(no comparison)")
print(comparison)

=== OBJECTIVE COMPARISON ===
Term                                    Baseline     Counter.        Delta    %Change
─────────────────────────────────────────────────────────────────────────────────────
cost_of_missing_demand                   56.0750      58.2195      +2.1445      +3.8%
  (Weighted sum of unmet staffing demand across all projects and weeks)
idle_time                                28.7333      29.6333      +0.9000      +3.1%
  (Total employee idle time (FTE-weeks not assigned to any project))
staffing_consistency                     43.0000      43.0000      -0.0000      -0.0%
  (Number of unique employee-project pairings (fewer = less context switching))
out_of_cohort_penalty                     2.0000       1.0000      -1.0000     -50.0%
  (Penalty for assigning employees to projects outside their preferred cohort)
─────────────────────────────────────────────────────────────────────────────────────
TOTAL                                   129.8083     131.8528      +2

In [18]:
# === (Optional) Batch run: evaluate all queries and collect results ===
# Uncomment and run to evaluate the full dataset.
# Results are saved to outputs/eval_results.json for later analysis.

# eval_results = []
# for i, q in enumerate(eval_queries):
#     print(f"\n[{i}/{len(eval_queries)}] {q['id']}: {q['query'][:60]}...")
#     state = invoke_workflow(
#         workflow, q["query"],
#         baseline_result=baseline_result,
#     )
#     eval_results.append({
#         "query_id": q["id"],
#         "query": q["query"],
#         "expected_path": q["expected_path"],
#         "actual_path": state.get("counterfactual_status", "unknown"),
#         "path_match": state.get("counterfactual_status") == q["expected_path"],
#         "expected_constraint_expr": q["expected_constraint_expr"],
#         "actual_constraint_exprs": state.get("constraint_expressions", []),
#         "actual_answer": state.get("final_summary", ""),
#         "reference_answer": q["reference_answer"],
#         "expected_answer_theme": q["expected_answer_theme"],
#     })
#     print(f"  Path: {state.get('counterfactual_status')} "
#           f"{'MATCH' if state.get('counterfactual_status') == q['expected_path'] else 'MISMATCH'}")
#
# with open(OUTPUTS_DIR / "eval_results.json", "w", encoding="utf-8") as f:
#     json.dump(eval_results, f, indent=2)
# 
# n_match = sum(1 for r in eval_results if r["path_match"])
# print(f"\n=== Summary ===")
# print(f"Path match: {n_match}/{len(eval_results)} ({100*n_match/len(eval_results):.0f}%)")
# print(f"Results saved to {OUTPUTS_DIR / 'eval_results.json'}")