From b6f47989d807946a086c3bf6cff863c72d5d9af1 Mon Sep 17 00:00:00 2001
From: Rafael Richards
Date: Mon, 11 May 2026 10:35:41 -0400
Subject: [PATCH] phase5-B: link-check gate + fix validate-repo-meta
 binary-URL decode bug (TDD)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 Track B per phase5-plan.md §3. Gate 2 of 4. Two pieces:

### B0 — validate-repo-meta.py binary-URL fix (Phase-4 carry-over)

Pre-Track-B, ``resolve_one()`` blindly UTF-8-decoded every fetched
body. Any ``exposes.*`` URL pointing at a binary asset (e.g.
m-dev-tools-mcp's v0.1.0 wheel) crashed with UnicodeDecodeError; the
documented workaround was ``ARGS=--no-resolve`` on every call.

Fix: detect non-text extensions (``.whl`` / ``.tar.gz`` / ``.tgz`` /
``.zip`` / ``.gz``) and skip the body-read entirely — HEAD-equivalent
semantics. Text URLs unchanged.

* New _BINARY_EXTENSIONS module constant + _is_binary_url() helper.
* resolve_one() branches on binary-ness; binary URLs only check "did
  the server answer?".
* profile/build/test_validate_repo_meta.py — 5 cases pinning the
  behaviour: text-URL decodes + JSON-validates; invalid-JSON text URL
  reports; binary .whl URL does NOT decode (the bug-fix assertion);
  .tar.gz same defense; transport failure on a binary URL still
  surfaces.

Verified: ``make check-repo-meta META=…/m-dev-tools-mcp/dist/repo.meta.json``
in full-resolve mode now returns OK against the v0.1.0 wheel URL. The
``ARGS=--no-resolve`` workaround can be dropped from any call site
that wants the stricter check (out of scope here — just fix the bug,
leave the call sites alone).

### B1 + B2 — check-links.py

Walks every URL surfaced by the catalog:

* profile/llms.txt — Markdown link targets (one row per link, line
  number captured as the label)
* profile/tools.json — top-level $schema + every *_url value across
  every tools. entry
* profile/task_index.json — top-level $schema + every doc field on
  every category/row

Per-URL behaviour: HEAD with a 15s timeout. On 405 Method Not
Allowed, retry with GET. A 200 after redirect-following = OK (urllib
follows 3xx natively; --allow-redirects is on by default and is
informational). Anything else = FAIL with the HTTP status + reason
captured. A condensed sketch of this contract follows the B3 section
below.

Modes:

* --offline — inventory only. No network round-trip. Status column
  reads INVENTORIED. Used by per-PR CI.
* default — full HEAD walk. Used by the weekly cron firing.

Exit codes: 0 all OK (or all INVENTORIED), 1 any FAIL, 2 fixture
error.

13 new TDD cases (profile/build/test_check_links.py):

* Inventory (3): tools.json $schema + *_url surfacing; task_index doc
  fields; llms.txt Markdown link extraction.
* check_url (5): 200 OK, 404 FAIL, 405→GET fallback (HEAD then GET
  retry assertion), DNS failure FAIL, redirect-follow OK.
* Driver (2): worst-first aggregation; all-OK aggregator.
* CLI (3): --offline must not touch the network (urlopen patched to
  raise); a 404 surfaces as rc=1; smoke against the committed
  catalog.

### B3 — Make + CI wiring

* make check-links — invokes the script with --offline (PR-mode).
* .github/workflows/ci.yml: PR-mode adds a "Link inventory" step to
  the `check` job. The weekly cron firing on the `handshake` job adds
  a parallel step running the full live HEAD walk so a broken
  upstream URL surfaces within 7 days.
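The per-URL contract above, condensed into a standalone sketch for
reviewers. Illustrative only: the helper name ``url_answers`` is made
up for the sketch, and the error-message shaping plus row aggregation
are stripped out; the authoritative version is ``check_url()`` in the
check-links.py diff below.

```python
# Condensed sketch of the HEAD-first, GET-on-405 walk in check-links.py.
import urllib.error
import urllib.request


def url_answers(url: str, timeout: float = 15.0) -> bool:
    """True iff the URL answers at all: HEAD first, GET retry on 405."""
    head = urllib.request.Request(url, method="HEAD")
    try:
        with urllib.request.urlopen(head, timeout=timeout):
            return True  # urllib already followed any 3xx chain
    except urllib.error.HTTPError as exc:
        if exc.code != 405:
            return False  # 404 and friends -> FAIL row
        try:
            with urllib.request.urlopen(url, timeout=timeout):
                return True  # GET fallback succeeded
        except (urllib.error.URLError, TimeoutError):
            return False
    except (urllib.error.URLError, TimeoutError):
        return False  # DNS / transport failure -> FAIL row
```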
### Verification locally

* pytest profile/build/ — 84 / 84 (66 prior + 5 validate-repo-meta +
  13 check-links)
* make check-links — clean (58 URLs catalogued offline)
* make check-freshness — still clean (Phase-5 Track A unchanged)
* make check-repo-meta full mode (no --no-resolve) green against the
  v0.1.0 wheel URL

### What's NOT in this PR

* JSON-body validation in the cron firing — check-links only
  HEAD-checks; deeper "is the body well-formed" lives in
  build-catalog + validate-catalog. Adding a JSON-parse pass in
  check-links would duplicate that path.
* Retry-with-backoff for transient 5xx on the cron firing —
  phase5-plan.md §9 explicitly defers this until CI demonstrates
  flake.
---
 .github/workflows/ci.yml                 |  15 ++
 Makefile                                 |   9 +-
 profile/build/check-links.py             | 288 +++++++++++++++++++++++
 profile/build/test_check_links.py        | 288 +++++++++++++++++++++++
 profile/build/test_validate_repo_meta.py | 122 ++++++++++
 profile/build/validate-repo-meta.py      |  21 +-
 6 files changed, 740 insertions(+), 3 deletions(-)
 create mode 100644 profile/build/check-links.py
 create mode 100644 profile/build/test_check_links.py
 create mode 100644 profile/build/test_validate_repo_meta.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 72ecbbf..f76cbc2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -77,6 +77,13 @@ jobs:
         if: github.event_name != 'schedule'
         run: make check-freshness
 
+      - name: Link inventory (Phase-5 Track B — PR-mode)
+        # Per phase5-plan.md §3 B3: PR-mode runs --offline so CI is
+        # network-free + flake-free. The weekly cron firing (handshake
+        # job below) does the actual HEAD walk against the live URLs.
+        if: github.event_name != 'schedule'
+        run: make check-links
+
   handshake:
     # Phase-3 Track D — discovery-protocol handshake. Runs the 8-step
     # external-agent walk-through end-to-end. On push/PR uses bundled
@@ -108,3 +115,11 @@ jobs:
       # PR is opened in between.
       if: github.event_name == 'schedule'
       run: python3 profile/build/check-freshness.py --offline --strict
+
+      - name: Link-check (Phase-5 Track B — cron live URLs)
+        # Phase-5 Track B weekly fetch: HEAD every URL in llms.txt +
+        # tools.json + task_index.json against the live web. A 404 or
+        # transport failure surfaces here within 7 days even if no PR
+        # opens in between. Per-PR jobs use --offline (see above).
+        if: github.event_name == 'schedule'
+        run: python3 profile/build/check-links.py
diff --git a/Makefile b/Makefile
index c9b3a93..4d4e940 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: catalog validate-catalog check-catalog check-repo-meta phase0-smoke check-docs-prose recipes-check handshake check-freshness
+.PHONY: catalog validate-catalog check-catalog check-repo-meta phase0-smoke check-docs-prose recipes-check handshake check-freshness check-links
 
 # Phase-1 Track B's generator. Fetches each TIER_1+TIER_2+TIER_3 repo's
 # dist/repo.meta.json, validates it, translates it into a `tools.`
@@ -108,3 +108,10 @@ handshake:
 # failure so a freshness slip surfaces within 7 days.
 check-freshness:
 	python3 profile/build/check-freshness.py --offline
+
+# Phase-5 Track B: link-checking gate over llms.txt + tools.json's
+# *_url fields + task_index.json's doc fields. Per-PR runs --offline
+# (inventory only — no network); the weekly cron firing runs the
+# full HEAD walk so broken upstream URLs surface within 7 days.
+check-links:
+	python3 profile/build/check-links.py --offline
diff --git a/profile/build/check-links.py b/profile/build/check-links.py
new file mode 100644
index 0000000..09c0aba
--- /dev/null
+++ b/profile/build/check-links.py
@@ -0,0 +1,288 @@
+#!/usr/bin/env python3
+"""Phase 5 Track B — link-checking gate.
+
+Walks every URL surfaced by the catalog:
+
+* ``profile/llms.txt`` — Markdown link targets.
+* ``profile/tools.json`` — top-level ``$schema`` + every ``*_url``
+  field across every ``tools.`` entry.
+* ``profile/task_index.json`` — top-level ``$schema`` + every
+  ``doc`` field across every category/row.
+
+For each URL, issues a HEAD request (15s timeout). urllib follows
+3xx natively, so a redirect chain ending in 200 counts as OK
+(``--allow-redirects`` is on by default and informational). A 405
+Method Not Allowed retries with a GET. Anything else surfaces as
+``FAIL`` with the HTTP status + message.
+
+Binary URLs (``.whl`` / ``.tar.gz`` / ``.tgz`` / ``.zip`` / ``.gz``)
+work the same way — HEAD only, no body decode. Same defense as
+``validate-repo-meta.py`` after Phase 5 Track B0.
+
+Modes:
+
+* ``--offline`` — inventory only. No network. Status column reads
+  ``INVENTORIED``. Used by per-PR CI.
+* default — full HEAD walk. Used by the weekly cron firing.
+
+Exit codes:
+
+* ``0`` — all URLs OK (or all INVENTORIED in --offline mode).
+* ``1`` — any URL FAIL.
+* ``2`` — fixture/parse error.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+import urllib.error
+import urllib.request
+from pathlib import Path
+from typing import Any
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+PROFILE = REPO_ROOT / "profile"
+DEFAULT_TOOLS_JSON = PROFILE / "tools.json"
+DEFAULT_TASK_INDEX = PROFILE / "task_index.json"
+DEFAULT_LLMS_TXT = PROFILE / "llms.txt"
+
+MD_LINK_RE = re.compile(r"\[[^\]]*\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)")
+URL_RE = re.compile(r"^https?://")
+
+# Status priority for aggregator + sort (lower = better).
+_STATUS_PRIORITY = {
+    "OK": 0,
+    "INVENTORIED": 1,
+    "FAIL": 2,
+}
+
+
+# ---- inventory helpers -----------------------------------------------------
+
+
+def extract_urls_from_tools_json(tools: dict[str, Any]) -> list[tuple[str, str]]:
+    """Return ``(label, url)`` rows for every URL in ``tools.json``.
+
+    Surfaces:
+    * top-level ``$schema``
+    * every ``*_url`` value that's an http(s) URL
+    """
+    out: list[tuple[str, str]] = []
+    schema = tools.get("$schema")
+    if isinstance(schema, str) and URL_RE.match(schema):
+        out.append(("$schema", schema))
+    tools_map = tools.get("tools", {})
+    if isinstance(tools_map, dict):
+        for key, entry in tools_map.items():
+            if not isinstance(entry, dict):
+                continue
+            for field, value in entry.items():
+                if not isinstance(value, str):
+                    continue
+                if not field.endswith("_url"):
+                    continue
+                if not URL_RE.match(value):
+                    continue
+                out.append((f"tools.{key}.{field}", value))
+    return out
+
+
+def extract_urls_from_task_index(ti: dict[str, Any]) -> list[tuple[str, str]]:
+    """Return ``(label, url)`` rows for every URL in ``task_index.json``."""
+    out: list[tuple[str, str]] = []
+    schema = ti.get("$schema")
+    if isinstance(schema, str) and URL_RE.match(schema):
+        out.append(("$schema", schema))
+    categories = ti.get("categories", {})
+    if isinstance(categories, dict):
+        for cat_name, cat in categories.items():
+            if not isinstance(cat, dict):
+                continue
+            for row_name, row in cat.items():
+                if not isinstance(row, dict):
+                    continue
+                doc = row.get("doc")
+                if isinstance(doc, str) and URL_RE.match(doc):
+                    out.append((f"categories.{cat_name}.{row_name}.doc", doc))
+    return out
+
+
+def extract_urls_from_llms_txt(text: str) -> list[tuple[str, str]]:
+    """Return ``(label, url)`` rows for every Markdown-link URL in
+    ``llms.txt``. Labels are the line index where the link appeared."""
+    out: list[tuple[str, str]] = []
+    for line_idx, line in enumerate(text.splitlines(), start=1):
+        for m in MD_LINK_RE.finditer(line):
+            url = m.group(1)
+            if URL_RE.match(url):
+                out.append((f"llms.txt:L{line_idx}", url))
+    return out
+
+
+# ---- single-URL check ------------------------------------------------------
+
+
+def check_url(url: str, *, allow_redirects: bool = True, timeout: float = 15.0) -> dict[str, Any]:
+    """HEAD-check one URL. Returns
+    ``{"status": "OK"|"FAIL", "http_status": int|None, "message": str}``.
+
+    On 405, retries with a GET. ``allow_redirects`` is informational
+    here — urllib follows 3xx natively; the final 200 is what we see.
+ """ + req = urllib.request.Request(url, method="HEAD") + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: # noqa: S310 + status = getattr(resp, "status", 200) + return {"status": "OK", "http_status": status, "message": ""} + except urllib.error.HTTPError as exc: + if exc.code == 405: + try: + with urllib.request.urlopen(url, timeout=timeout) as resp: # noqa: S310 + status = getattr(resp, "status", 200) + return {"status": "OK", "http_status": status, "message": ""} + except urllib.error.HTTPError as exc2: + return { + "status": "FAIL", + "http_status": exc2.code, + "message": str(exc2.reason or exc2), + } + except (urllib.error.URLError, TimeoutError) as exc2: + return { + "status": "FAIL", + "http_status": None, + "message": str(exc2.reason if hasattr(exc2, "reason") else exc2), + } + return {"status": "FAIL", "http_status": exc.code, "message": str(exc.reason or exc)} + except (urllib.error.URLError, TimeoutError) as exc: + msg = str(exc.reason) if hasattr(exc, "reason") else str(exc) + return {"status": "FAIL", "http_status": None, "message": msg} + + +# ---- driver ---------------------------------------------------------------- + + +def check_links_impl( + urls: list[tuple[str, str]], + *, + allow_redirects: bool = True, +) -> tuple[list[dict[str, Any]], str]: + """Walk each ``(label, url)`` pair, return ``(rows, worst_status)`` + sorted worst-first.""" + rows: list[dict[str, Any]] = [] + for label, url in urls: + result = check_url(url, allow_redirects=allow_redirects) + rows.append( + { + "label": label, + "url": url, + "status": result["status"], + "http_status": result["http_status"], + "message": result["message"], + } + ) + rows.sort( + key=lambda r: ( + -_STATUS_PRIORITY.get(r["status"], 0), + r["label"], + ) + ) + worst = "OK" + for r in rows: + if _STATUS_PRIORITY[r["status"]] > _STATUS_PRIORITY[worst]: + worst = r["status"] + return rows, worst + + +# ---- table formatting ------------------------------------------------------ + + +def format_table(title: str, rows: list[dict[str, Any]]) -> str: + if not rows: + return f"### {title}\n\n(no entries)\n" + lines = [f"### {title}", "", "| status | label | url | http |", "|---|---|---|---|"] + for r in rows: + h = r.get("http_status") + h_text = "—" if h is None else str(h) + lines.append(f"| {r['status']} | `{r['label']}` | {r['url']} | {h_text} |") + lines.append("") + return "\n".join(lines) + + +# ---- CLI ------------------------------------------------------------------- + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Link-checking gate over llms.txt + tools.json + task_index.json." + ) + parser.add_argument("--tools-json", type=Path, default=DEFAULT_TOOLS_JSON) + parser.add_argument("--task-index", type=Path, default=DEFAULT_TASK_INDEX) + parser.add_argument("--llms-txt", type=Path, default=DEFAULT_LLMS_TXT) + parser.add_argument( + "--offline", + action="store_true", + help=( + "Inventory URLs only — don't fetch. Status column reads " + "INVENTORIED. Used by per-PR CI to avoid network flake." 
+        ),
+    )
+    parser.add_argument(
+        "--allow-redirects",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Follow 3xx redirects (default ON).",
+    )
+    args = parser.parse_args(argv)
+
+    inventory: list[tuple[str, str]] = []
+    try:
+        if args.tools_json.exists():
+            inventory.extend(
+                extract_urls_from_tools_json(
+                    json.loads(args.tools_json.read_text(encoding="utf-8"))
+                )
+            )
+        if args.task_index.exists():
+            inventory.extend(
+                extract_urls_from_task_index(
+                    json.loads(args.task_index.read_text(encoding="utf-8"))
+                )
+            )
+        if args.llms_txt.exists():
+            inventory.extend(
+                extract_urls_from_llms_txt(args.llms_txt.read_text(encoding="utf-8"))
+            )
+    except (OSError, json.JSONDecodeError) as exc:
+        print(f"ERROR: cannot read catalog input: {exc}", file=sys.stderr)
+        return 2
+
+    if args.offline:
+        rows = [
+            {
+                "label": label,
+                "url": url,
+                "status": "INVENTORIED",
+                "http_status": None,
+                "message": "",
+            }
+            for label, url in inventory
+        ]
+        print(format_table("Inventory (offline)", rows))
+        print(f"check-links: clean (offline; {len(rows)} URLs catalogued)")
+        return 0
+
+    rows, worst = check_links_impl(inventory, allow_redirects=args.allow_redirects)
+    print(format_table("Link check", rows))
+    if worst == "FAIL":
+        n_fail = sum(1 for r in rows if r["status"] == "FAIL")
+        print(f"check-links: FAIL ({n_fail} broken URL(s))", file=sys.stderr)
+        return 1
+    print(f"check-links: clean ({len(rows)} URL(s) OK)")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/profile/build/test_check_links.py b/profile/build/test_check_links.py
new file mode 100644
index 0000000..60337a2
--- /dev/null
+++ b/profile/build/test_check_links.py
@@ -0,0 +1,288 @@
+"""Tests for check-links.py — Phase 5 Track B1 (RED) before B2 (GREEN).
+
+Pure-function inventory helpers + the URL-walker, tested with
+``unittest.mock.patch`` over ``urllib.request.urlopen`` so the unit
+suite is network-free.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import urllib.error
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+BUILD = REPO_ROOT / "profile" / "build"
+
+_path = BUILD / "check-links.py"
+_spec = importlib.util.spec_from_file_location("_check_links", _path)
+_check_links = importlib.util.module_from_spec(_spec)
+assert _spec and _spec.loader
+_spec.loader.exec_module(_check_links)
+
+
+# ----------------------------------------------------------- inventory
+
+
+def test_extract_urls_from_tools_json_captures_star_url_fields() -> None:
+    """Every *_url field on every tools. entry surfaces, plus the
+    top-level $schema."""
+    tools = {
+        "$schema": "https://schema.example.invalid/tools.schema.json",
+        "tools": {
+            "m-cli": {
+                "id": "tool:m-cli",
+                "repo_meta_url": "https://example.invalid/m-cli/repo.meta.json",
+                "commands_url": "https://example.invalid/m-cli/commands.json",
+                "verified_on": "2026-05-11",
+            },
+            "m-stdlib": {
+                "id": "tool:m-stdlib",
+                "modules_url": "https://example.invalid/m-stdlib/manifest.json",
+            },
+        },
+    }
+    urls = _check_links.extract_urls_from_tools_json(tools)
+    by_label = {label: url for label, url in urls}
+    assert by_label["$schema"] == "https://schema.example.invalid/tools.schema.json"
+    assert (
+        by_label["tools.m-cli.repo_meta_url"]
+        == "https://example.invalid/m-cli/repo.meta.json"
+    )
+    assert (
+        by_label["tools.m-cli.commands_url"]
+        == "https://example.invalid/m-cli/commands.json"
+    )
+    assert by_label["tools.m-stdlib.modules_url"]
+    # verified_on must NOT surface — not a URL.
+    assert not any("verified_on" in label for label, _ in urls)
+
+
+def test_extract_urls_from_task_index_captures_doc_fields() -> None:
+    """Each row's optional ``doc`` field surfaces as a URL row."""
+    ti = {
+        "$schema": "https://schema.example.invalid/task_index.schema.json",
+        "categories": {
+            "lib": {
+                "json_parse": {
+                    "intent": "Parse JSON in M",
+                    "primary": "module:m-stdlib#STDJSON",
+                    # no doc — should not surface
+                },
+            },
+            "infra": {
+                "agent_integration": {
+                    "intent": "Point my MCP agent at the catalog",
+                    "primary": "tool:m-dev-tools-mcp",
+                    "doc": "https://example.invalid/agent-integration.md",
+                },
+            },
+        },
+    }
+    urls = _check_links.extract_urls_from_task_index(ti)
+    by_label = {label: url for label, url in urls}
+    assert by_label["$schema"] == "https://schema.example.invalid/task_index.schema.json"
+    assert (
+        by_label["categories.infra.agent_integration.doc"]
+        == "https://example.invalid/agent-integration.md"
+    )
+    # json_parse has no doc — must not surface
+    assert "categories.lib.json_parse.doc" not in by_label
+
+
+def test_extract_urls_from_llms_txt_captures_markdown_links() -> None:
+    text = (
+        "# org\n\n"
+        "Start: [tools.json](https://example.invalid/tools.json)\n"
+        "Schema: [tools.schema.json](https://example.invalid/tools.schema.json)\n"
+    )
+    urls = _check_links.extract_urls_from_llms_txt(text)
+    by_url = {url for _label, url in urls}
+    assert "https://example.invalid/tools.json" in by_url
+    assert "https://example.invalid/tools.schema.json" in by_url
+
+
+# ----------------------------------------------------------- check_url
+
+
+def _mock_resp(status: int = 200) -> MagicMock:
+    resp = MagicMock()
+    resp.__enter__ = MagicMock(return_value=resp)
+    resp.__exit__ = MagicMock(return_value=False)
+    resp.status = status
+    resp.read = MagicMock(return_value=b"")
+    return resp
+
+
+def test_check_url_200_is_ok() -> None:
+    with patch("urllib.request.urlopen", return_value=_mock_resp(200)):
+        result = _check_links.check_url("https://example.invalid/x")
+    assert result["status"] == "OK"
+    assert result["http_status"] == 200
+
+
+def test_check_url_404_is_fail() -> None:
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=urllib.error.HTTPError(
+            "https://example.invalid/x", 404, "Not Found", hdrs={}, fp=None
+        ),
+    ):
+        result = _check_links.check_url("https://example.invalid/x")
+    assert result["status"] == "FAIL"
+    assert result["http_status"] == 404
+
+
+def test_check_url_405_on_head_falls_back_to_get() -> None:
+    """If HEAD returns 405, retry with GET. The walker uses a Request
+    object with method='HEAD'; on 405 it falls back to a no-method GET."""
+    call_count = {"head": 0, "get": 0}
+
+    def _fake_urlopen(req_or_url, **_kwargs):
+        # urllib passes a Request object for the explicit-method call,
+        # a bare string for the GET fallback.
+        if hasattr(req_or_url, "get_method") and req_or_url.get_method() == "HEAD":
+            call_count["head"] += 1
+            raise urllib.error.HTTPError(
+                "https://example.invalid/x", 405, "Method Not Allowed", hdrs={}, fp=None
+            )
+        call_count["get"] += 1
+        return _mock_resp(200)
+
+    with patch("urllib.request.urlopen", side_effect=_fake_urlopen):
+        result = _check_links.check_url("https://example.invalid/x")
+    assert call_count["head"] == 1
+    assert call_count["get"] == 1
+    assert result["status"] == "OK"
+
+
+def test_check_url_dns_failure_is_fail() -> None:
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=urllib.error.URLError("nodename nor servname provided"),
+    ):
+        result = _check_links.check_url("https://nonexistent.invalid/")
+    assert result["status"] == "FAIL"
+    assert result["http_status"] is None
+    assert "nodename" in result["message"]
+
+
+def test_check_url_redirect_followed_by_default() -> None:
+    """A 302 followed by a 200 counts as OK when --allow-redirects is on
+    (the default). urllib.request.urlopen follows redirects natively,
+    so we just simulate the final 200 here."""
+    with patch("urllib.request.urlopen", return_value=_mock_resp(200)):
+        result = _check_links.check_url(
+            "https://example.invalid/redirect-to-x", allow_redirects=True
+        )
+    assert result["status"] == "OK"
+
+
+# ----------------------------------------------------------- driver / CLI
+
+
+def test_check_links_impl_aggregates_worst() -> None:
+    """Driver aggregates per-URL results. Worst-first ordering."""
+    urls = [
+        ("$schema", "https://example.invalid/ok-a"),
+        ("tools.x.repo_meta_url", "https://example.invalid/bad"),
+        ("tools.y.modules_url", "https://example.invalid/ok-b"),
+    ]
+
+    def _fake_check(url, **_kwargs):
+        if url.endswith("/bad"):
+            return {"status": "FAIL", "http_status": 404, "message": "Not Found"}
+        return {"status": "OK", "http_status": 200, "message": ""}
+
+    with patch.object(_check_links, "check_url", side_effect=_fake_check):
+        rows, worst = _check_links.check_links_impl(urls, allow_redirects=True)
+    assert worst == "FAIL"
+    # Worst-first
+    assert rows[0]["status"] == "FAIL"
+    assert rows[0]["url"].endswith("/bad")
+
+
+def test_check_links_impl_all_ok_returns_OK() -> None:
+    urls = [("a", "https://example.invalid/a"), ("b", "https://example.invalid/b")]
+    with patch.object(
+        _check_links,
+        "check_url",
+        return_value={"status": "OK", "http_status": 200, "message": ""},
+    ):
+        rows, worst = _check_links.check_links_impl(urls, allow_redirects=True)
+    assert worst == "OK"
+    assert all(r["status"] == "OK" for r in rows)
+
+
+def test_cli_offline_inventories_without_network(tmp_path, capsys) -> None:
+    """--offline must NOT touch the network. Patch urlopen to blow up
+    so an accidental fetch is caught."""
+    tools = {"tools": {"m-cli": {"repo_meta_url": "https://example.invalid/a.json"}}}
+    ti = {"categories": {}}
+    llms = "# org\n[a](https://example.invalid/b.json)\n"
+
+    tools_path = tmp_path / "tools.json"
+    ti_path = tmp_path / "task_index.json"
+    llms_path = tmp_path / "llms.txt"
+    tools_path.write_text(json.dumps(tools), encoding="utf-8")
+    ti_path.write_text(json.dumps(ti), encoding="utf-8")
+    llms_path.write_text(llms, encoding="utf-8")
+
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=AssertionError("network must not be touched in --offline mode"),
+    ):
+        rc = _check_links.main(
+            [
+                "--offline",
+                "--tools-json", str(tools_path),
+                "--task-index", str(ti_path),
+                "--llms-txt", str(llms_path),
+            ]
+        )
+    out = capsys.readouterr().out
+    assert rc == 0
+    # Both URLs got inventoried
+    assert "example.invalid/a.json" in out
+    assert "example.invalid/b.json" in out
+    # Status column should say INVENTORIED / SKIPPED — not OK (since
+    # nothing was actually fetched)
+    assert "INVENTORIED" in out or "SKIPPED" in out
+
+
+def test_cli_failure_returns_rc1(tmp_path) -> None:
+    tools = {"tools": {"x": {"repo_meta_url": "https://example.invalid/404"}}}
+    tools_path = tmp_path / "tools.json"
+    tools_path.write_text(json.dumps(tools), encoding="utf-8")
+    ti_path = tmp_path / "task_index.json"
+    ti_path.write_text(json.dumps({"categories": {}}), encoding="utf-8")
+    llms_path = tmp_path / "llms.txt"
+    llms_path.write_text("# x\n", encoding="utf-8")
+
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=urllib.error.HTTPError(
+            "https://example.invalid/404", 404, "Not Found", hdrs={}, fp=None
+        ),
+    ):
+        rc = _check_links.main(
+            [
+                "--tools-json", str(tools_path),
+                "--task-index", str(ti_path),
+                "--llms-txt", str(llms_path),
+            ]
+        )
+    assert rc == 1
+
+
+def test_cli_committed_inventory_runs_offline(capsys) -> None:
+    """Smoke against the real committed catalog. --offline mode never
+    hits the network; just sanity-checks inventory works against the
+    committed files."""
+    rc = _check_links.main(["--offline"])
+    out = capsys.readouterr().out
+    assert rc == 0
+    # At least the tools.json $schema URL should appear in the inventory
+    assert "tools.schema.json" in out
diff --git a/profile/build/test_validate_repo_meta.py b/profile/build/test_validate_repo_meta.py
new file mode 100644
index 0000000..b626096
--- /dev/null
+++ b/profile/build/test_validate_repo_meta.py
@@ -0,0 +1,122 @@
+"""Tests for validate-repo-meta.py.
+
+Phase 5 Track B0 — pinning the binary-URL behaviour. Pre-Track-B,
+``resolve_one()`` blindly UTF-8-decoded every fetched body, which
+made full-resolve validation fail against any `exposes.*` URL that
+pointed at a binary asset (e.g. m-dev-tools-mcp's v0.1.0 wheel).
+The fix uses HEAD-only for non-text extensions; this test pins the
+behaviour so a future regression surfaces immediately.
+
+Test strategy: monkey-patch ``urllib.request.urlopen`` so we control
+the server's response shape. No real network round-trip.
+""" + +from __future__ import annotations + +import importlib.util +from pathlib import Path +from unittest.mock import MagicMock, patch + +REPO_ROOT = Path(__file__).resolve().parents[2] +BUILD = REPO_ROOT / "profile" / "build" + +_path = BUILD / "validate-repo-meta.py" +_spec = importlib.util.spec_from_file_location("_validate_repo_meta", _path) +_validate_repo_meta = importlib.util.module_from_spec(_spec) +assert _spec and _spec.loader +_spec.loader.exec_module(_validate_repo_meta) + + +def _mock_response(*, body: bytes = b"", status: int = 200) -> MagicMock: + resp = MagicMock() + resp.__enter__ = MagicMock(return_value=resp) + resp.__exit__ = MagicMock(return_value=False) + resp.read = MagicMock(return_value=body) + resp.status = status + return resp + + +def test_resolve_one_text_url_decodes_body() -> None: + """Smoke: a .json URL still fetches + JSON-validates. Pinning the + pre-Track-B0 happy path so the fix doesn't regress text handling.""" + errors: list[str] = [] + valid_json = b'{"hello": "world"}' + with patch( + "urllib.request.urlopen", return_value=_mock_response(body=valid_json) + ): + _validate_repo_meta.resolve_one( + base="https://example.invalid/", + value="dist/repo.meta.json", + errors=errors, + label="repo_meta", + ) + assert errors == [] + + +def test_resolve_one_invalid_json_text_url_reports() -> None: + errors: list[str] = [] + with patch( + "urllib.request.urlopen", return_value=_mock_response(body=b"not json") + ): + _validate_repo_meta.resolve_one( + base="https://example.invalid/", + value="dist/something.json", + errors=errors, + label="bad_json", + ) + assert len(errors) == 1 + assert "invalid JSON" in errors[0] + + +def test_resolve_one_binary_wheel_url_no_decode() -> None: + """Track B0 contract: a .whl URL must NOT trigger UTF-8 decode of + the body. Validator returns success on HEAD-equivalent 200, no + errors raised. 
+    shape that crashed pre-Track-B0 validate-repo-meta."""
+    errors: list[str] = []
+    # Random non-UTF-8 bytes — would crash .decode("utf-8")
+    bogus_bytes = b"\xab\xcd\xef\x00\x01\x02"
+    with patch(
+        "urllib.request.urlopen", return_value=_mock_response(body=bogus_bytes)
+    ):
+        _validate_repo_meta.resolve_one(
+            base="https://github.com/m-dev-tools/m-dev-tools-mcp/releases/download/v0.1.0/",
+            value="m_dev_tools_mcp-0.1.0-py3-none-any.whl",
+            errors=errors,
+            label="release_wheel",
+        )
+    assert errors == [], f"binary URL should not raise decode errors; got {errors!r}"
+
+
+def test_resolve_one_binary_tarball_url_no_decode() -> None:
+    """Same contract for .tar.gz."""
+    errors: list[str] = []
+    with patch(
+        "urllib.request.urlopen", return_value=_mock_response(body=b"\x1f\x8b")
+    ):
+        _validate_repo_meta.resolve_one(
+            base="https://example.invalid/",
+            value="dist/payload.tar.gz",
+            errors=errors,
+            label="tarball",
+        )
+    assert errors == []
+
+
+def test_resolve_one_binary_url_fetch_failure_reports() -> None:
+    """HEAD failures (404, transport) still surface as errors."""
+    import urllib.error
+
+    errors: list[str] = []
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=urllib.error.URLError("simulated dns failure"),
+    ):
+        _validate_repo_meta.resolve_one(
+            base="https://nonexistent.invalid/",
+            value="dist/foo.whl",
+            errors=errors,
+            label="dead_link",
+        )
+    assert len(errors) == 1
+    assert "cannot fetch" in errors[0]
diff --git a/profile/build/validate-repo-meta.py b/profile/build/validate-repo-meta.py
index dacf640..033469a 100755
--- a/profile/build/validate-repo-meta.py
+++ b/profile/build/validate-repo-meta.py
@@ -169,6 +169,17 @@ def validate_schema(data: dict, errors: list[str]) -> None:
         errors.append(f"schema: at {path}: {exc.message}")
 
 
+# Extensions whose payloads are binary by design — never UTF-8-decode
+# them. The validator's job for these is "does the URL resolve?", not
+# "is the body valid text". Matches phase5-plan.md §3 B0.
+_BINARY_EXTENSIONS = (".whl", ".tar.gz", ".tgz", ".zip", ".gz")
+
+
+def _is_binary_url(target: str) -> bool:
+    lower = target.lower()
+    return any(lower.endswith(ext) for ext in _BINARY_EXTENSIONS)
+
+
 def resolve_one(base: str, value: str, errors: list[str], label: str) -> None:
     if URL_RE.match(value):
         target = value
@@ -185,13 +196,19 @@ def resolve_one(base: str, value: str, errors: list[str], label: str) -> None:
         except json.JSONDecodeError as exc:
             errors.append(f"exposes[{label}]: {local} is invalid JSON: {exc}")
             return
+
+    binary = _is_binary_url(target)
     try:
         with urllib.request.urlopen(target, timeout=15) as resp:
-            body = resp.read().decode("utf-8")
+            # For binary URLs we only need "did the server answer?".
+            # Skip the body read entirely so a non-UTF-8 payload (a wheel,
+            # a tarball, ...) doesn't trip a decode error. Text URLs still
+            # read + decode so the downstream JSON check has a body.
+            body = "" if binary else resp.read().decode("utf-8")
     except (urllib.error.URLError, TimeoutError) as exc:
         errors.append(f"exposes[{label}]: cannot fetch {target}: {exc}")
         return
-    if target.endswith(".json"):
+    if not binary and target.endswith(".json"):
         try:
             json.loads(body)
         except json.JSONDecodeError as exc:
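Postscript: the B0 extension gate in miniature. Illustrative
assertions only; the tuple and helper are copied from the
validate-repo-meta.py hunk above, and the wheel URL is the one the
new test pins.

```python
# Copied from the diff above: binary suffixes skip the body read entirely.
_BINARY_EXTENSIONS = (".whl", ".tar.gz", ".tgz", ".zip", ".gz")


def _is_binary_url(target: str) -> bool:
    lower = target.lower()
    return any(lower.endswith(ext) for ext in _BINARY_EXTENSIONS)


# Binary: HEAD-equivalent check only, body never UTF-8-decoded.
assert _is_binary_url(
    "https://github.com/m-dev-tools/m-dev-tools-mcp/releases/download/"
    "v0.1.0/m_dev_tools_mcp-0.1.0-py3-none-any.whl"
)
assert _is_binary_url("https://example.invalid/dist/payload.tar.gz")
assert _is_binary_url("https://example.invalid/dist/PAYLOAD.ZIP")  # case-insensitive
# Text: body still read, decoded, and JSON-validated downstream.
assert not _is_binary_url("https://example.invalid/dist/repo.meta.json")
```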