# RAG Strategy Comparison

Compare **Plain RAG**, **Graph RAG**, and **No-RAG** on the same queries. Use this notebook for systematic evaluation; for single-query exploration use `AgenticExplainability_Usage.ipynb`.

## 1. Setup

In [None]:
# Diagnostic cell: report the OS process id of this kernel and the command
# line it was launched with.
# NOTE(review): sys.argv is shown under the "Kernel ID" label; presumably the
# kernel connection-file path (which embeds the id) appears in it — confirm
# for your Jupyter setup.
import os
import pprint
import sys

kernel_pid = os.getpid()
print("PID:", kernel_pid)

launch_info = {"Kernel ID": sys.argv}
pprint.pprint(launch_info)

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import json
from pathlib import Path

def find_project_root():
    """Locate the repository root by walking up from the current working directory.

    A directory qualifies as the root when it contains both an
    ``agentic_explain`` and a ``use_case`` sub-directory. The current working
    directory is checked first, then each of its ancestors in order.

    Returns:
        Path: the first qualifying directory, or the current working
        directory when no candidate qualifies.
    """
    cwd = Path.cwd()
    marker_dirs = ("agentic_explain", "use_case")
    candidates = (cwd, *cwd.parents)
    return next(
        (
            candidate
            for candidate in candidates
            if all((candidate / marker).is_dir() for marker in marker_dirs)
        ),
        cwd,  # fallback: nothing matched on the way up
    )

# Make the repository importable no matter which directory the notebook was
# launched from. The project imports below rely on this sys.path entry.
PROJECT_ROOT = find_project_root()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from openai import OpenAI

from config.load_secrets import load_secrets, get_gurobi_env_kwargs
from use_case.staffing_model import STAFFING_DATA_DIR, STAFFING_OUTPUTS_DIR, load_raw_data, process_data, build_gurobi_model

# NOTE(review): presumably loads the credentials used by the OpenAI client and
# the Gurobi environment below — confirm against config/load_secrets.py.
load_secrets()
DATA_DIR = STAFFING_DATA_DIR
OUTPUTS_DIR = STAFFING_OUTPUTS_DIR
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# Raw inputs -> processed model inputs, plus the keyword arguments for the Gurobi environment.
raw = load_raw_data(DATA_DIR)
inputs = process_data(**raw)
env_kwargs = get_gurobi_env_kwargs()

# The baseline solve is a prerequisite of this notebook: fail with an
# actionable message instead of a bare open() error when it is missing.
baseline_path = OUTPUTS_DIR / "baseline_result.json"
if not baseline_path.is_file():
    raise FileNotFoundError(
        f"Baseline result not found at {baseline_path}. "
        "Produce the baseline solve (baseline_result.json) before running this notebook."
    )
with open(baseline_path, "r", encoding="utf-8") as f:
    baseline_result = json.load(f)

# Plain import instead of the __import__("openai") indirection.
openai_client = OpenAI()

## 2. Build all three RAG strategies

In [2]:
from agentic_explain.rag import build_plain_rag, build_no_rag, build_graph_rag
from agentic_explain.workflow.graph import create_workflow, invoke_workflow
from use_case.staffing_model import StaffingGraphRAGConfig

# --- Tunables ---------------------------------------------------------------
REBUILD_PLAIN = False  # passed as force_rebuild to build_plain_rag
REBUILD_GRAPH = False  # passed as force_rebuild to build_graph_rag
LLM_TEMPERATURE = 0  # 0 = deterministic; increase for more varied summaries

# --- Model artefacts and index locations ------------------------------------
py_path = PROJECT_ROOT.joinpath("use_case", "staffing_model", "staffing_model.py")
lp_path = OUTPUTS_DIR.joinpath("model.lp")
mps_path = OUTPUTS_DIR.joinpath("model.mps")
rag_dir = OUTPUTS_DIR.joinpath("rag_index")
graph_dir = OUTPUTS_DIR.joinpath("graph_rag_index")

# --- One strategy object per retrieval approach -----------------------------
plain_strategy = build_plain_rag(
    py_path,
    lp_path=lp_path,
    mps_path=mps_path,
    data_dir=DATA_DIR,
    persist_dir=rag_dir,
    force_rebuild=REBUILD_PLAIN,
)
no_rag_strategy = build_no_rag(py_path=py_path, data_dir=DATA_DIR)
graph_config = StaffingGraphRAGConfig(DATA_DIR)
graph_strategy = build_graph_rag(
    graph_config,
    lp_path=lp_path,
    persist_dir=graph_dir,
    force_rebuild=REBUILD_GRAPH,
)

# Name -> strategy; the comparison cells iterate over this mapping.
strategies = dict(
    plain_rag=plain_strategy,
    no_rag=no_rag_strategy,
    graph_rag=graph_strategy,
)
print("Strategies built:", list(strategies))

Strategies built: ['plain_rag', 'no_rag', 'graph_rag']


## 3. Single-query comparison

In [3]:
from use_case.staffing_model import STAFFING_QUERIES_PATH
import json

eval_path = STAFFING_QUERIES_PATH
with open(eval_path, "r", encoding="utf-8") as f:
    all_queries = json.load(f)

# Only use queries that have a reference answer
eval_queries = [item for item in all_queries if item.get("reference_answer")]
print(f"Loaded {len(eval_queries)} evaluation queries (with reference)\n")

# Preview: index, path marker, category, subcategory, query.
# The loop variable is deliberately not named `q`: other cells bind `q` to the
# currently selected query, and a leaked loop variable would silently replace
# it with the last query whenever this cell is re-run.
for i, eval_query in enumerate(eval_queries):
    # "F" = expected to stay feasible; anything else (including missing) is shown as "I".
    path_marker = "F" if eval_query.get("expected_path") == "feasible" else "I"
    # `or "?"` also covers keys that are present but null, which would
    # otherwise break the string format spec below.
    category = str(eval_query.get("category") or "?")
    subcategory = str(eval_query.get("subcategory") or "?")
    print(f"  [{i:2d}] [{path_marker}] {category:10s} / {subcategory:28s}  {eval_query['query']}")

Loaded 29 evaluation queries (with reference)

  [ 0] [F] objective  / missing_demand                Why is IO Base development understaffed in week 10?
  [ 1] [F] objective  / missing_demand                Why is PSO Base development missing demand in week 15?
  [ 2] [F] objective  / missing_demand                Why is Saa DF Pilot not fully staffed in week 14?
  [ 3] [F] objective  / missing_demand                Why is DF Base development understaffed in week 16?
  [ 4] [F] objective  / missing_demand                Why is Foo PSO Pilot understaffed in week 11?
  [ 5] [F] objective  / missing_demand                Why is Saa PSO Pilot not fully staffed in week 19?
  [ 6] [F] objective  / missing_demand                Why does IO Base development have unmet demand in week 20?
  [ 7] [F] objective  / idle_time                     Why is Yimin idle in week 0?
  [ 8] [F] objective  / idle_time                     Why is Sruti idle in week 3?
  [ 9] [F] objective  / idle_time           

In [4]:
# Choose which evaluation query to run: edit QUERY_INDEX (an index into
# eval_queries) and re-run this cell.
QUERY_INDEX = 0

q = eval_queries[QUERY_INDEX]
query = q["query"]

# Header describing the selected query and its ground-truth expectations.
_header_lines = [
    f"[{q.get('id', '?')}]  {q.get('category', '?')} / {q.get('subcategory', '?')}",
    f"Query:       {query}",
    f"Expected:    path={q.get('expected_path', '?')}, expr={q.get('expected_constraint_expr', '?')}",
    "=" * 80,
]
print("\n".join(_header_lines))

[obj1_missing_demand_01]  objective / missing_demand
Query:       Why is IO Base development understaffed in week 10?
Expected:    path=feasible, expr=d_miss[10,8] == 0


In [5]:
import textwrap
from agentic_explain.evaluation.run_storage import create_run_dir, save_result

# All results for this single-query comparison are stored under one new run directory.
run_dir = create_run_dir(OUTPUTS_DIR)
print(f"Run dir: {run_dir}\n")

# Arguments that are identical for every strategy; only `rag_strategy` varies.
_workflow_kwargs = dict(
    openai_client=openai_client,
    baseline_result=baseline_result,
    data_dir=str(DATA_DIR),
    build_model_fn=build_gurobi_model,
    inputs=inputs,
    env_kwargs=env_kwargs,
    outputs_dir=str(OUTPUTS_DIR),
    temperature=LLM_TEMPERATURE,
)

# Ground-truth fields saved next to each result.
_query_meta = {
    "query": query,
    "query_id": q.get("id"),
    "expected_path": q.get("expected_path"),
    "expected_constraint_expr": q.get("expected_constraint_expr"),
    "reference_answer": q.get("reference_answer"),
}

_banner = "=" * 40

# Strategy name -> final workflow state; read by the summary cell.
results = {}
for name, strategy in strategies.items():
    print(f"\n{_banner}\n>>> RUNNING STRATEGY: {name} <<<\n{_banner}\n")

    workflow = create_workflow(rag_strategy=strategy, **_workflow_kwargs)
    state = invoke_workflow(workflow, query, baseline_result=baseline_result)

    results[name] = state
    # A fresh copy of the metadata is handed over on every call.
    save_result(run_dir, name, QUERY_INDEX, state, query_meta=dict(_query_meta))

    print(f"{name}: status={state.get('counterfactual_status')}")
    print("-" * 40)


>>> RUNNING STRATEGY: plain_rag <<<

Set parameter WLSAccessID
Set parameter WLSSecret
Set parameter LicenseID to value 2678051
WLS license 2678051 - registered to C3.ai
Set parameter TimeLimit to value 100
Gurobi Optimizer version 13.0.1 build v13.0.1rc0 (mac64[arm] - Darwin 25.2.0 25C56)

CPU model: Apple M3 Max
Thread count: 16 physical cores, 16 logical processors, using up to 16 threads

Non-default parameters:
TimeLimit  100

WLS license 2678051 - registered to C3.ai
Optimize a model with 18263 rows, 17260 columns and 83921 nonzeros (Min)
Model fingerprint: 0xf48b1980
Model has 1244 linear objective coefficients
Variable types: 8944 continuous, 8316 integer (8316 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+08]
  Objective range  [6e-01, 2e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [5e-01, 6e+00]

Found heuristic solution: objective 781.6652000
Presolve removed 6419 rows and 6063 columns
Presolve time: 0.03s
Presolved: 11844 rows, 11197 columns, 4

In [6]:
import textwrap

ANSWER_WRAP_WIDTH = 95
LABEL_WIDTH = 13  # width of the "Path:        " / "Answer:      " label column


def _labelled_block(label, text, width=ANSWER_WRAP_WIDTH):
    """Wrap ``text`` to ``width`` columns, using ``label`` as the first-line prefix.

    The label is padded to LABEL_WIDTH and continuation lines are indented by
    the same amount, so the text forms one aligned column. Passing the label
    as ``initial_indent`` (rather than indenting and then prepending the label)
    avoids the first line being indented twice.

    Args:
        label: Short label such as ``"Answer:"``.
        text: Text to wrap; a missing/empty value is shown as ``"(none)"``.
        width: Maximum line width, including the label column.

    Returns:
        str: the wrapped, labelled text.
    """
    body = str(text) if text else "(none)"
    return textwrap.fill(
        body,
        width=width,
        initial_indent=f"{label:<{LABEL_WIDTH}}",
        subsequent_indent=" " * LABEL_WIDTH,
    )


print("\nFinal summaries (detailed):")
for name, state in results.items():
    print(f"\n--- {name} ---")
    actual_path = state.get('counterfactual_status', '(no status)')
    actual_exprs = state.get('constraint_expressions', [])
    constraints_str = ', '.join(map(str, actual_exprs)) if isinstance(actual_exprs, list) and actual_exprs else '(none)'
    actual_answer = state.get('final_summary', '(none)')
    # Compare the path the workflow took with the path the query expects.
    verdict = 'MATCH' if actual_path == q.get('expected_path') else 'MISMATCH'
    print(f"Path:        {actual_path}  {verdict}")
    print(f"Constraints: {constraints_str}")
    wrapped_actual_answer = _labelled_block("Answer:", actual_answer)
    print(wrapped_actual_answer)

# Reference (print once, at the end for consistency).
# `or` (instead of nested .get defaults) also covers keys that exist but hold None.
reference_text = q.get('reference_answer') or q.get('reference') or '(no reference found)'
wrapped_reference_answer = _labelled_block("Answer:", reference_text)
print("\n" + "=" * 80)
print("Reference (ground truth):")
print(f"Path:        {q.get('expected_path', '(no expected status/path found)')}")
print(f"Constraint:  {q.get('expected_constraint_expr', '(none)')}")
print(f"Theme:       {q.get('expected_answer_theme', '(none)')}")
print(wrapped_reference_answer)


Final summaries (detailed):

--- plain_rag ---
Path:        feasible  MATCH
Constraints: d_miss[10,8] == 0
Answer:                   The user's change resulted in a slight worsening of the total objective by
             approximately 2.04 units, or 1.6%. The most significant changes were observed in
             the cost of missing demand, which increased by 3.8%, and idle time, which rose by
             3.1%. This indicates that while some staffing adjustments may have improved
             project allocations, they also led to increased unmet demand and idle resources,
             highlighting a trade-off between optimizing project assignments and maintaining
             adequate staffing levels. The reduction in the out-of-cohort penalty by 50%
             suggests that while employees were better aligned with their preferred projects,
             the overall staffing strategy may have inadvertently left some projects
             understaffed, particularly in week 10.

--- n

## 4. Batch evaluation (optional)

In [7]:
# Control: set to True to run batch (all strategies x all queries); progress is printed and results saved per (strategy, query)
RUN_BATCH = False
# To resume a previous run that did not finish: set RESUME_BATCH = True and set BATCH_RUN_DIR to the run folder (e.g. from list below)
RESUME_BATCH = False
BATCH_RUN_DIR = None  # e.g. OUTPUTS_DIR / "runs" / "2025-02-09_a1b2c3d4"

from agentic_explain.evaluation.run_storage import list_run_dirs
from agentic_explain.evaluation.helpers import run_batch_multi_strategy
from agentic_explain.workflow.graph import create_workflow

# List recent run dirs (for resuming: pick one and set BATCH_RUN_DIR)
_recent = list_run_dirs(OUTPUTS_DIR)[:5]
print("Recent run dirs:", [str(p) for p in _recent])

if RUN_BATCH:
    if RESUME_BATCH and not BATCH_RUN_DIR:
        # Make the misconfiguration visible: resume was requested without a run folder.
        print("NOTE: RESUME_BATCH is True but BATCH_RUN_DIR is not set; no existing run directory is passed to the batch runner.")
    # Use a dedicated name here: the single-query cell stores its run folder in
    # `run_dir`, and the debug section reads that variable, so it must not be
    # overwritten (possibly with None) by the batch run.
    batch_run_dir = BATCH_RUN_DIR if RESUME_BATCH and BATCH_RUN_DIR else None
    run_batch_multi_strategy(
        create_workflow,
        strategies,
        eval_queries,
        baseline_result,
        OUTPUTS_DIR,
        run_dir=batch_run_dir,
        resume=RESUME_BATCH,
        openai_client=openai_client,
        data_dir=str(DATA_DIR),
        build_model_fn=build_gurobi_model,
        inputs=inputs,
        env_kwargs=env_kwargs,
        temperature=LLM_TEMPERATURE,
    )
else:
    print("RUN_BATCH is False; skipping batch. Set RUN_BATCH = True to run.")

## 5. Single-Strategy Debug

Run **one** strategy in isolation and inspect every intermediate step:
retrieval chunks & scores, LLM messages, applied constraints, and objective comparison.

Change `DEBUG_STRATEGY` below to `"plain_rag"`, `"no_rag"`, or `"graph_rag"`.

In [8]:
# === Load debug state: re-run one strategy now, or load a stored result from a run directory ===
LOAD_DEBUG_FROM = "memory"  # "memory" = re-run DEBUG_STRATEGY on the current query below; "file" = load from run_dir
DEBUG_STRATEGY = "plain_rag"  # <-- change to "no_rag" or "graph_rag"
# When LOAD_DEBUG_FROM == "file": set run dir (e.g. run_dir from Section 3, or a path from list_run_dirs) and query index
RUN_DIR_FOR_DEBUG = run_dir if "run_dir" in dir() else None  # use run_dir from Section 3, or set explicitly
DEBUG_QUERY_INDEX = QUERY_INDEX if "QUERY_INDEX" in dir() else 0

import textwrap
from agentic_explain.evaluation.run_storage import load_result

_file_requested = LOAD_DEBUG_FROM == "file"
if _file_requested and RUN_DIR_FOR_DEBUG is None:
    # Without this notice the cell would quietly start a full solve + LLM run
    # even though the user asked for a stored result.
    print("NOTE: LOAD_DEBUG_FROM is 'file' but RUN_DIR_FOR_DEBUG is None; re-running the strategy instead.")

if _file_requested and RUN_DIR_FOR_DEBUG is not None:
    data = load_result(RUN_DIR_FOR_DEBUG, DEBUG_STRATEGY, DEBUG_QUERY_INDEX)
    debug_state = data["state"]
    q_meta = data.get("query_meta", {})
    # Rebuild `q` from the stored metadata so it describes the loaded query.
    q = {"query": q_meta.get("query"), "expected_path": q_meta.get("expected_path"), "expected_constraint_expr": q_meta.get("expected_constraint_expr"), "reference_answer": q_meta.get("reference_answer"), "id": q_meta.get("query_id")}
    print(f"Loaded from: {RUN_DIR_FOR_DEBUG} ({DEBUG_STRATEGY} query {DEBUG_QUERY_INDEX})")
    print(f"Strategy:  {DEBUG_STRATEGY}")
    print(f"Query:     {q_meta.get('query', '?')}")
    print("=" * 90)
else:
    strategy = strategies[DEBUG_STRATEGY]
    print(f"Strategy:  {DEBUG_STRATEGY}")
    print(f"Query:     {query}")
    print(f"Expected:  path={q.get('expected_path', '?')}, expr={q.get('expected_constraint_expr', '?')}")
    print("=" * 90)
    debug_workflow = create_workflow(
        openai_client=openai_client,
        rag_strategy=strategy,
        baseline_result=baseline_result,
        data_dir=str(DATA_DIR),
        build_model_fn=build_gurobi_model,
        inputs=inputs,
        env_kwargs=env_kwargs,
        outputs_dir=str(OUTPUTS_DIR),
        temperature=LLM_TEMPERATURE,
    )
    debug_state = invoke_workflow(debug_workflow, query, baseline_result=baseline_result)

print(f"\nStatus:       {debug_state.get('counterfactual_status', '(none)')}")
exprs = debug_state.get("constraint_expressions", [])
print(f"Constraints:  {exprs if exprs else '(none)'}")
# `or` also covers a summary that is present but None/empty, which
# textwrap.fill cannot handle.
summary = debug_state.get('final_summary') or '(none)'
wrapped_summary = textwrap.fill(str(summary), width=100, initial_indent="Summary:      ", subsequent_indent="               ")
print(wrapped_summary)

Strategy:  plain_rag
Query:     Why is IO Base development understaffed in week 10?
Expected:  path=feasible, expr=d_miss[10,8] == 0
Set parameter WLSAccessID
Set parameter WLSSecret
Set parameter LicenseID to value 2678051
WLS license 2678051 - registered to C3.ai
Set parameter TimeLimit to value 100
Gurobi Optimizer version 13.0.1 build v13.0.1rc0 (mac64[arm] - Darwin 25.2.0 25C56)

CPU model: Apple M3 Max
Thread count: 16 physical cores, 16 logical processors, using up to 16 threads

Non-default parameters:
TimeLimit  100

WLS license 2678051 - registered to C3.ai
Optimize a model with 18263 rows, 17260 columns and 83921 nonzeros (Min)
Model fingerprint: 0xf48b1980
Model has 1244 linear objective coefficients
Variable types: 8944 continuous, 8316 integer (8316 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+08]
  Objective range  [6e-01, 2e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [5e-01, 6e+00]

Found heuristic solution: objective 781.6652000
Presolve 

### 5a. Retrieval Debug: Retrieved Chunks & Scores

Inspect which RAG chunks were retrieved and their relevance scores at each retrieval stage.

In [9]:
import textwrap


MAX_CHUNK_CHARS = 500  # characters of each chunk shown before truncating
CHUNK_WRAP_WIDTH = 95

rag_debug = debug_state.get("rag_retrieval_debug") or {}
# To inspect a strategy from the comparison run instead:
# rag_debug = results['plain_rag'].get("rag_retrieval_debug", {})

# Keep only per-stage dict entries; the "strategy" key and non-dict values are not stages.
retrieval_stages = {
    stage: details
    for stage, details in rag_debug.items()
    if stage != "strategy" and isinstance(details, dict)
}
if not retrieval_stages:
    # Say so explicitly rather than leaving the cell output empty.
    print("No retrieval stages recorded under 'rag_retrieval_debug' for this state.")

for stage_name, info in retrieval_stages.items():
    print(f"{'=' * 90}")
    print(f"  Stage: {stage_name}")
    print(f"  Retrieval query: {info.get('query', '?')}")
    print(f"  Top-k: {info.get('top_k', '?')}")
    if "iis_constraint_names" in info:
        print(f"  IIS constraints: {info['iis_constraint_names']}")
    print(f"{'=' * 90}")

    for i, chunk in enumerate(info.get("chunks") or []):
        score = chunk.get("score")
        meta = chunk.get("metadata") or {}
        text = chunk.get("text") or ""
        score_str = f"{score:.4f}" if score is not None else "N/A"
        print(f"\n  ── Chunk {i}  score={score_str}")
        print(f"     source={meta.get('source', '?')}  section={meta.get('section', '?')}")
        extra = {k: v for k, v in meta.items() if k not in ("source", "section", "path")}
        if extra:
            print(f"     {extra}")
        # Show text (truncated & wrapped). The truncation marker is printed
        # separately because textwrap.fill would flatten its newline into the
        # wrapped paragraph.
        wrapped = textwrap.fill(
            text[:MAX_CHUNK_CHARS],
            width=CHUNK_WRAP_WIDTH,
            initial_indent="     ",
            subsequent_indent="     ",
        )
        print(wrapped)
        if len(text) > MAX_CHUNK_CHARS:
            print("     ... [truncated]")
    print()

  Stage: constraint_generation
  Retrieval query: Force d=8 to be adequately staffed in t=10.
  Top-k: 5

  ── Chunk 0  score=0.8033
     source=py  section=constraints
     {'constraint_name': 'demand_balance'}
     ### demand_balance Name: Demand Balance. For each project d and week t: sum_j x[j,t,d] *
     F_j + d_miss[t,d] = D[d,t]. Description: Total staffing (allocations weighted by employee
     FTE) plus unmet demand equals required demand. Business context: Core balance of supply
     and demand. If we cannot meet demand, d_miss captures the shortage. Variables: x, d_miss.

  ── Chunk 1  score=0.8032
     source=lp  section=constraints
     {'constraint_name': 'staffed_indicator_0_8_10'}
     Constraint: staffed_indicator_0_8_10 - x[8,0,10] - x[8,1,10] - x[8,2,10] - x[8,3,10]

  ── Chunk 2  score=0.8028
     source=lp  section=constraints
     {'constraint_name': 'staffed_indicator_1_8_10'}
     Constraint: staffed_indicator_1_8_10 - x[8,0,10] - x[8,1,10] - x[8,2,10] - x[8,3,1

### 5b. LLM Messages Debug

The exact system + user messages sent to the LLM at each RAG-augmented stage, and the raw response.

In [10]:
CHARS_TO_SHOW = 3000  # maximum characters of the user message to display
llm_debug = debug_state.get("llm_messages_debug") or {}
# To inspect a strategy from the comparison run instead:
# llm_debug = results['plain_rag'].get("llm_messages_debug", {})


def _wrap_message(text):
    """Return ``text`` wrapped to 95 columns with a two-space indent (None -> empty)."""
    return textwrap.fill(str(text or ""), width=95, initial_indent="  ", subsequent_indent="  ")


for stage_name, info in llm_debug.items():
    if not isinstance(info, dict):
        # Same guard as the retrieval debug cell: skip entries that are not per-stage dicts.
        continue
    print(f"{'=' * 90}")
    print(f"  Stage: {stage_name}")
    print(f"{'=' * 90}")

    print("\n  [SYSTEM MESSAGE]")
    sys_msg = info.get("system") or ""
    print(_wrap_message(sys_msg))

    print(f"\n  [USER MESSAGE]  (first {CHARS_TO_SHOW} chars)")
    user_msg = str(info.get("user") or "")
    print(_wrap_message(user_msg[:CHARS_TO_SHOW]))
    if len(user_msg) > CHARS_TO_SHOW:
        # Printed on its own line: textwrap.fill would flatten an embedded newline.
        print("  ... [truncated]")

    print("\n  [RAW LLM RESPONSE]")
    # Named `raw_response`, not `raw`: the setup cell stores the raw input data in `raw`.
    raw_response = info.get("raw_response") or ""
    print(_wrap_message(raw_response))
    print()

  Stage: constraint_generation

  [SYSTEM MESSAGE]
  You translate a user request into one or more constraint expressions for a Gurobi
  optimization model. Available decision variables (use exact names): d_miss, x, x_idle, x_ind,
  x_p_ind. Format: variable_name[index1,index2,...] == value or >= value or <= value. Value can
  be a number (e.g. 0, 1) or a data parameter (e.g. D[t,d] for demand). You may use a single
  variable or a sum of variables (e.g. x[j,t,d] + x[j+1,t,d] + ... >= D[d,t]). Example format:
  d_miss[0,0] == 1, x[0,0] == 1, x_idle[0,0] == 1. For 'force no unmet demand': d_miss[t,d] ==
  0. Use the RAG context below to understand variable dimensions and index meanings. Output
  only the constraint line(s), one per line, no explanation.

  [USER MESSAGE]  (first 3000 chars)
  RAG context: ### demand_balance Name: Demand Balance. For each project d and week t: sum_j
  x[j,t,d] * F_j + d_miss[t,d] = D[d,t]. Description: Total staffing (allocations weighted by
  employee F

### 5c. Constraint Parsing Debug

Shows how the raw LLM response was parsed into constraint expressions:
regex matches, which passed/failed variable-name validation, and why.

In [11]:
# Drill down: all LLM debug info -> constraint-generation stage -> parsing details.
llm_debug = debug_state.get("llm_messages_debug", {})
cg_debug = llm_debug.get("constraint_generation", {})
parsing = cg_debug.get("parsing", {})

_rule = "=" * 90
print(_rule, "  Constraint Parsing Debug", _rule, sep="\n")

# --- First attempt -----------------------------------------------------------
_first_response = cg_debug.get("raw_response", "(not captured)")
print("\n  Raw LLM response (attempt 1):")
print(f"    {_first_response}")

# Prefer the attempt-specific key; otherwise use the generic one.
if "regex_matches_attempt1" in parsing:
    _first_matches = parsing["regex_matches_attempt1"]
else:
    _first_matches = parsing.get("regex_matches", "(not captured)")
print(f"\n  Regex matches (attempt 1): {_first_matches}")
print(f"  Valid expressions: {parsing.get('valid_expressions', '(not captured)')}")
print(f"  Rejected expressions: {parsing.get('rejected_expressions', '(not captured)')}")

_rejected_reason = parsing.get("rejected_reason")
if _rejected_reason:
    print(f"  Rejection reason: {_rejected_reason}")

# --- Retry (present only when one was recorded) ------------------------------
_retry_response = parsing.get("retry_raw_response")
if _retry_response is not None:
    print("\n  ** RETRY triggered (attempt 1 produced no valid expressions) **")
    print("  Retry LLM response:")
    print(f"    {_retry_response}")
    print(f"  Regex matches (retry): {parsing.get('regex_matches_retry', [])}")

# --- Final failure, if any ----------------------------------------------------
_parse_failure = parsing.get("parse_failure")
if _parse_failure:
    print("\n  ** PARSE FAILURE (even after retry) **")
    print(f"    {_parse_failure}")

_final_exprs = debug_state.get("constraint_expressions", "(key missing)")
print(f"\n  Final constraint_expressions in state: {_final_exprs}")

  Constraint Parsing Debug

  Raw LLM response (attempt 1):
    d_miss[10,8] == 0

  Regex matches (attempt 1): ['d_miss[10,8] == 0']
  Valid expressions: ['d_miss[10,8] == 0']
  Rejected expressions: []

  Final constraint_expressions in state: ['d_miss[10,8] == 0']


### 5d. Applied Constraints Debug

Which constraints were actually added to the Gurobi model, and what were the baseline values of those variables?

In [12]:
def _format_value(value):
    """Format a numeric value with four decimals; None -> "N/A"; anything else via str()."""
    if value is None:
        return "N/A"
    if isinstance(value, (int, float)):
        return f"{value:.4f}"
    return str(value)


def _forcing_direction(baseline_val, forced_val):
    """Describe how the forced value relates to the baseline value.

    Returns an empty string when either value is missing or the two cannot be
    subtracted (e.g. a non-numeric forced value), instead of raising.
    """
    if baseline_val is None or forced_val is None:
        return ""
    try:
        delta = forced_val - baseline_val
    except TypeError:
        return ""
    if abs(delta) < 1e-8:
        return "(no change from baseline)"
    if delta > 0:
        return f"(forcing UP from {_format_value(baseline_val)})"
    return f"(forcing DOWN from {_format_value(baseline_val)})"


# To inspect a strategy from the comparison run instead:
# cf_result = results['plain_rag'].get("counterfactual_result", {})
cf_result = debug_state.get("counterfactual_result") or {}
applied = cf_result.get("applied_constraints") or []

if not applied:
    print("No constraints were applied (check counterfactual_result for errors).")
    if cf_result.get("error"):
        print(f"  Error: {cf_result['error']}")
else:
    print(f"{'=' * 90}")
    print(f"  {len(applied)} constraint(s) added to the counterfactual Gurobi model")
    print(f"{'=' * 90}")
    for i, ac in enumerate(applied):
        baseline_val = ac.get("baseline_value")
        forced_val = ac.get("forced_value")
        bv_str = _format_value(baseline_val)
        direction = _forcing_direction(baseline_val, forced_val)

        print(f"\n  Constraint {i}:")
        print(f"    Expression:      {ac.get('expr')}")
        print(f"    Gurobi var:      {ac.get('gurobi_var_name')}")
        print(f"    Forced value:    {forced_val}  {direction}")
        print(f"    Baseline value:  {bv_str}")
        print(f"    Var type:        {ac.get('var_type')}  bounds=[{ac.get('var_lb')}, {ac.get('var_ub')}]")
        print(f"    Constr name:     {ac.get('constraint_name')}")

    print(f"\n  Counterfactual status: {debug_state.get('counterfactual_status')}")
    cf_obj = cf_result.get("objective_value")
    base_obj = (debug_state.get("baseline_result") or {}).get("objective_value")
    # The objective delta is shown only when both solves reported a value.
    if cf_obj is not None and base_obj is not None:
        print(f"  Baseline obj:          {base_obj:.4f}")
        print(f"  Counterfactual obj:    {cf_obj:.4f}")
        print(f"  Delta:                 {cf_obj - base_obj:+.4f}")

  1 constraint(s) added to the counterfactual Gurobi model

  Constraint 0:
    Expression:      d_miss[10,8] == 0
    Gurobi var:      d_miss[10,8]
    Forced value:    0.0  (forcing DOWN from 1.5000)
    Baseline value:  1.5000
    Var type:        C  bounds=[0.0, inf]
    Constr name:     user_constr_d_miss_10_8

  Counterfactual status: feasible
  Baseline obj:          129.8083
  Counterfactual obj:    132.3263
  Delta:                 +2.5180


### 5e. Detailed Objective Comparison & Variable Changes

The compare node produces a structured breakdown of all four objective terms (baseline vs counterfactual)
and highlights which variables changed to cause the differences.

In [13]:
# Show the comparison text stored in the debug state under "comparison_summary",
# or a placeholder when that key is absent.
# To inspect a strategy from the comparison run instead:
# comparison = results['plain_rag'].get("comparison_summary", "(no comparison)")
if "comparison_summary" in debug_state:
    comparison = debug_state["comparison_summary"]
else:
    comparison = "(no comparison)"

print(comparison)

=== OBJECTIVE COMPARISON ===
Term                                    Baseline     Counter.        Delta    %Change
─────────────────────────────────────────────────────────────────────────────────────
cost_of_missing_demand                   56.0750      60.7430      +4.6680      +8.3%
  (Weighted sum of unmet staffing demand across all projects and weeks)
idle_time                                28.7333      32.5833      +3.8500     +13.4%
  (Total employee idle time (FTE-weeks not assigned to any project))
staffing_consistency                     43.0000      38.0000      -5.0000     -11.6%
  (Number of unique employee-project pairings (fewer = less context switching))
out_of_cohort_penalty                     2.0000       1.0000      -1.0000     -50.0%
  (Penalty for assigning employees to projects outside their preferred cohort)
─────────────────────────────────────────────────────────────────────────────────────
TOTAL                                   129.8083     132.3263      +2