diff --git a/megaplan/evaluation.py b/megaplan/evaluation.py index d3d5d962..1d072c7d 100644 --- a/megaplan/evaluation.py +++ b/megaplan/evaluation.py @@ -308,17 +308,60 @@ def _dir_is_claimed(diff_path: str) -> bool: f"Sense check {sense_check_id} acknowledgment is perfunctory: {note.strip()!r}." ) + pending_tasks: list[str] = [] + skipped_without_reason: list[str] = [] + blocked_without_reason: list[str] = [] + hollow_done_tasks: list[str] = [] for task in finalize_data.get("tasks", []): - if task.get("status") != "done": - continue task_id = task.get("id", "?") + status = task.get("status", "") notes = task.get("executor_notes", "") - if not isinstance(notes, str) or not notes.strip(): + notes_text = notes.strip() if isinstance(notes, str) else "" + if status == "pending": + pending_tasks.append(task_id) continue - if is_rubber_stamp(notes, strict=True): - findings.append( - f"Task {task_id} executor_notes are perfunctory: {notes.strip()!r}." - ) + if status == "skipped" and not notes_text: + skipped_without_reason.append(task_id) + continue + if status == "blocked" and not notes_text: + blocked_without_reason.append(task_id) + continue + if status == "done": + files = task.get("files_changed") or [] + commands = task.get("commands_run") or [] + if not files and not commands: + hollow_done_tasks.append(task_id) + continue + if notes_text and is_rubber_stamp(notes, strict=True): + findings.append( + f"Task {task_id} executor_notes are perfunctory: {notes_text!r}." + ) + + # Tasks left in `pending` after execute means the executor never started + # them — the most common cause of silent scope shrink (e.g., quota + # exhaustion mid-execute). Flag every one so the auto-driver retries + # execute or escalates instead of treating the partial run as clean. + if pending_tasks: + findings.append( + "Tasks left pending after execute (executor never started them): " + + ", ".join(pending_tasks) + ) + if skipped_without_reason: + findings.append( + "Tasks marked skipped without an executor_notes reason: " + + ", ".join(skipped_without_reason) + ) + if blocked_without_reason: + findings.append( + "Tasks marked blocked without an executor_notes reason: " + + ", ".join(blocked_without_reason) + ) + if hollow_done_tasks: + findings.append( + "Tasks marked done with neither files_changed nor commands_run " + "(suspicious — executor may have skipped without flagging): " + + ", ".join(hollow_done_tasks) + ) return { "findings": findings, diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index deffc10f..71e5a146 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -590,6 +590,120 @@ def test_is_rubber_stamp_allows_short_specific_ack_only_in_loose_mode() -> None: assert is_rubber_stamp(note, strict=True) is True +def _stub_clean_git_status(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "megaplan.evaluation.subprocess.run", + lambda *args, **kwargs: subprocess.CompletedProcess( + args=["git", "status", "--short"], + returncode=0, + stdout="", + stderr="", + ), + ) + + +def test_validate_execution_evidence_flags_pending_tasks( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "project" + (project_dir / ".git").mkdir(parents=True) + _stub_clean_git_status(monkeypatch) + finalize_data = { + "tasks": [ + {"id": "T1", "status": "done", "files_changed": ["src/a.py"], "commands_run": [], "executor_notes": "implemented A"}, + {"id": "T2", "status": "pending", "files_changed": [], "commands_run": [], "executor_notes": ""}, + {"id": "T3", "status": "pending", "files_changed": [], "commands_run": [], "executor_notes": ""}, + ], + "sense_checks": [], + } + result = validate_execution_evidence(finalize_data, project_dir) + assert result["skipped"] is False + finding = next((f for f in result["findings"] if "pending" in f), None) + assert finding is not None + assert "T2" in finding and "T3" in finding + assert "T1" not in finding + + +def test_validate_execution_evidence_flags_skipped_without_reason( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "project" + (project_dir / ".git").mkdir(parents=True) + _stub_clean_git_status(monkeypatch) + finalize_data = { + "tasks": [ + {"id": "T1", "status": "skipped", "files_changed": [], "commands_run": [], "executor_notes": ""}, + {"id": "T2", "status": "skipped", "files_changed": [], "commands_run": [], "executor_notes": "Not needed because the helper was inlined into T1."}, + ], + "sense_checks": [], + } + result = validate_execution_evidence(finalize_data, project_dir) + finding = next((f for f in result["findings"] if "skipped without" in f), None) + assert finding is not None + assert "T1" in finding + assert "T2" not in finding + + +def test_validate_execution_evidence_flags_blocked_without_reason( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "project" + (project_dir / ".git").mkdir(parents=True) + _stub_clean_git_status(monkeypatch) + finalize_data = { + "tasks": [ + {"id": "T1", "status": "blocked", "files_changed": [], "commands_run": [], "executor_notes": ""}, + {"id": "T2", "status": "blocked", "files_changed": [], "commands_run": [], "executor_notes": "DB env var missing; awaiting U1."}, + ], + "sense_checks": [], + } + result = validate_execution_evidence(finalize_data, project_dir) + finding = next((f for f in result["findings"] if "blocked without" in f), None) + assert finding is not None + assert "T1" in finding + assert "T2" not in finding + + +def test_validate_execution_evidence_flags_hollow_done_tasks( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "project" + (project_dir / ".git").mkdir(parents=True) + _stub_clean_git_status(monkeypatch) + finalize_data = { + "tasks": [ + {"id": "T1", "status": "done", "files_changed": [], "commands_run": [], "executor_notes": "all good"}, + {"id": "T2", "status": "done", "files_changed": [], "commands_run": ["pytest -q"], "executor_notes": "ran the test suite"}, + {"id": "T3", "status": "done", "files_changed": ["src/x.py"], "commands_run": [], "executor_notes": "patched x"}, + ], + "sense_checks": [], + } + result = validate_execution_evidence(finalize_data, project_dir) + finding = next((f for f in result["findings"] if "neither files_changed nor commands_run" in f), None) + assert finding is not None + assert "T1" in finding + assert "T2" not in finding + assert "T3" not in finding + + +def test_validate_execution_evidence_clean_run_produces_no_completion_findings( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + project_dir = tmp_path / "project" + (project_dir / ".git").mkdir(parents=True) + _stub_clean_git_status(monkeypatch) + finalize_data = { + "tasks": [ + {"id": "T1", "status": "done", "files_changed": ["src/a.py"], "commands_run": [], "executor_notes": "added A in src/a.py"}, + {"id": "T2", "status": "skipped", "files_changed": [], "commands_run": [], "executor_notes": "redundant after T1 covered the case"}, + ], + "sense_checks": [], + } + result = validate_execution_evidence(finalize_data, project_dir) + for keyword in ("pending", "skipped without", "blocked without", "neither files_changed"): + assert not any(keyword in f for f in result["findings"]), result["findings"] + + def test_validate_execution_evidence_flags_diff_mismatches_and_weak_notes( tmp_path: Path, monkeypatch: pytest.MonkeyPatch,