Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 50 additions & 7 deletions megaplan/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,17 +308,60 @@ def _dir_is_claimed(diff_path: str) -> bool:
f"Sense check {sense_check_id} acknowledgment is perfunctory: {note.strip()!r}."
)

pending_tasks: list[str] = []
skipped_without_reason: list[str] = []
blocked_without_reason: list[str] = []
hollow_done_tasks: list[str] = []
for task in finalize_data.get("tasks", []):
if task.get("status") != "done":
continue
task_id = task.get("id", "?")
status = task.get("status", "")
notes = task.get("executor_notes", "")
if not isinstance(notes, str) or not notes.strip():
notes_text = notes.strip() if isinstance(notes, str) else ""
if status == "pending":
pending_tasks.append(task_id)
continue
if is_rubber_stamp(notes, strict=True):
findings.append(
f"Task {task_id} executor_notes are perfunctory: {notes.strip()!r}."
)
if status == "skipped" and not notes_text:
skipped_without_reason.append(task_id)
continue
if status == "blocked" and not notes_text:
blocked_without_reason.append(task_id)
continue
if status == "done":
files = task.get("files_changed") or []
commands = task.get("commands_run") or []
if not files and not commands:
hollow_done_tasks.append(task_id)
continue
if notes_text and is_rubber_stamp(notes, strict=True):
findings.append(
f"Task {task_id} executor_notes are perfunctory: {notes_text!r}."
)

# Tasks left in `pending` after execute means the executor never started
# them — the most common cause of silent scope shrink (e.g., quota
# exhaustion mid-execute). Flag every one so the auto-driver retries
# execute or escalates instead of treating the partial run as clean.
if pending_tasks:
findings.append(
"Tasks left pending after execute (executor never started them): "
+ ", ".join(pending_tasks)
)
if skipped_without_reason:
findings.append(
"Tasks marked skipped without an executor_notes reason: "
+ ", ".join(skipped_without_reason)
)
if blocked_without_reason:
findings.append(
"Tasks marked blocked without an executor_notes reason: "
+ ", ".join(blocked_without_reason)
)
if hollow_done_tasks:
findings.append(
"Tasks marked done with neither files_changed nor commands_run "
"(suspicious — executor may have skipped without flagging): "
+ ", ".join(hollow_done_tasks)
)

return {
"findings": findings,
Expand Down
114 changes: 114 additions & 0 deletions tests/test_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,120 @@ def test_is_rubber_stamp_allows_short_specific_ack_only_in_loose_mode() -> None:
assert is_rubber_stamp(note, strict=True) is True


def _stub_clean_git_status(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(
"megaplan.evaluation.subprocess.run",
lambda *args, **kwargs: subprocess.CompletedProcess(
args=["git", "status", "--short"],
returncode=0,
stdout="",
stderr="",
),
)


def test_validate_execution_evidence_flags_pending_tasks(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
) -> None:
project_dir = tmp_path / "project"
(project_dir / ".git").mkdir(parents=True)
_stub_clean_git_status(monkeypatch)
finalize_data = {
"tasks": [
{"id": "T1", "status": "done", "files_changed": ["src/a.py"], "commands_run": [], "executor_notes": "implemented A"},
{"id": "T2", "status": "pending", "files_changed": [], "commands_run": [], "executor_notes": ""},
{"id": "T3", "status": "pending", "files_changed": [], "commands_run": [], "executor_notes": ""},
],
"sense_checks": [],
}
result = validate_execution_evidence(finalize_data, project_dir)
assert result["skipped"] is False
finding = next((f for f in result["findings"] if "pending" in f), None)
assert finding is not None
assert "T2" in finding and "T3" in finding
assert "T1" not in finding


def test_validate_execution_evidence_flags_skipped_without_reason(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
) -> None:
project_dir = tmp_path / "project"
(project_dir / ".git").mkdir(parents=True)
_stub_clean_git_status(monkeypatch)
finalize_data = {
"tasks": [
{"id": "T1", "status": "skipped", "files_changed": [], "commands_run": [], "executor_notes": ""},
{"id": "T2", "status": "skipped", "files_changed": [], "commands_run": [], "executor_notes": "Not needed because the helper was inlined into T1."},
],
"sense_checks": [],
}
result = validate_execution_evidence(finalize_data, project_dir)
finding = next((f for f in result["findings"] if "skipped without" in f), None)
assert finding is not None
assert "T1" in finding
assert "T2" not in finding


def test_validate_execution_evidence_flags_blocked_without_reason(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
) -> None:
project_dir = tmp_path / "project"
(project_dir / ".git").mkdir(parents=True)
_stub_clean_git_status(monkeypatch)
finalize_data = {
"tasks": [
{"id": "T1", "status": "blocked", "files_changed": [], "commands_run": [], "executor_notes": ""},
{"id": "T2", "status": "blocked", "files_changed": [], "commands_run": [], "executor_notes": "DB env var missing; awaiting U1."},
],
"sense_checks": [],
}
result = validate_execution_evidence(finalize_data, project_dir)
finding = next((f for f in result["findings"] if "blocked without" in f), None)
assert finding is not None
assert "T1" in finding
assert "T2" not in finding


def test_validate_execution_evidence_flags_hollow_done_tasks(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
) -> None:
project_dir = tmp_path / "project"
(project_dir / ".git").mkdir(parents=True)
_stub_clean_git_status(monkeypatch)
finalize_data = {
"tasks": [
{"id": "T1", "status": "done", "files_changed": [], "commands_run": [], "executor_notes": "all good"},
{"id": "T2", "status": "done", "files_changed": [], "commands_run": ["pytest -q"], "executor_notes": "ran the test suite"},
{"id": "T3", "status": "done", "files_changed": ["src/x.py"], "commands_run": [], "executor_notes": "patched x"},
],
"sense_checks": [],
}
result = validate_execution_evidence(finalize_data, project_dir)
finding = next((f for f in result["findings"] if "neither files_changed nor commands_run" in f), None)
assert finding is not None
assert "T1" in finding
assert "T2" not in finding
assert "T3" not in finding


def test_validate_execution_evidence_clean_run_produces_no_completion_findings(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
) -> None:
project_dir = tmp_path / "project"
(project_dir / ".git").mkdir(parents=True)
_stub_clean_git_status(monkeypatch)
finalize_data = {
"tasks": [
{"id": "T1", "status": "done", "files_changed": ["src/a.py"], "commands_run": [], "executor_notes": "added A in src/a.py"},
{"id": "T2", "status": "skipped", "files_changed": [], "commands_run": [], "executor_notes": "redundant after T1 covered the case"},
],
"sense_checks": [],
}
result = validate_execution_evidence(finalize_data, project_dir)
for keyword in ("pending", "skipped without", "blocked without", "neither files_changed"):
assert not any(keyword in f for f in result["findings"]), result["findings"]


def test_validate_execution_evidence_flags_diff_mismatches_and_weak_notes(
tmp_path: Path,
monkeypatch: pytest.MonkeyPatch,
Expand Down