From b6f47989d807946a086c3bf6cff863c72d5d9af1 Mon Sep 17 00:00:00 2001
From: Rafael Richards
Date: Mon, 11 May 2026 10:35:41 -0400
Subject: [PATCH] phase5-B: link-check gate + fix validate-repo-meta
 binary-URL decode bug (TDD)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 Track B per phase5-plan.md §3. Gate 2 of 4. Two pieces:

### B0 — validate-repo-meta.py binary-URL fix (Phase-4 carry-over)

Pre-Track-B, ``resolve_one()`` blindly UTF-8-decoded every fetched
body. Any ``exposes.*`` URL pointing at a binary asset (e.g.
m-dev-tools-mcp's v0.1.0 wheel) crashed with UnicodeDecodeError; the
documented workaround was ``ARGS=--no-resolve`` on every call.

Fix: detect non-text extensions (``.whl`` / ``.tar.gz`` / ``.tgz`` /
``.zip`` / ``.gz``) and skip the body-read entirely — HEAD-equivalent
semantics. Text URLs unchanged.

* New _BINARY_EXTENSIONS module constant + _is_binary_url() helper.
* resolve_one() branches on binary-ness; binary URLs only check "did
  the server answer?".
* profile/build/test_validate_repo_meta.py — 5 cases pinning the
  behaviour: text-URL decodes + JSON-validates; invalid-JSON text URL
  reports; binary .whl URL does NOT decode (the bug-fix assertion);
  .tar.gz same defense; transport failure on a binary URL still
  surfaces.

Verified: ``make check-repo-meta META=…/m-dev-tools-mcp/dist/repo.meta.json``
in full-resolve mode now returns OK against the v0.1.0 wheel URL. The
``ARGS=--no-resolve`` workaround can be dropped from any call site
that wants the stricter check (out of scope here — just fix the bug,
leave the call sites alone).

### B1 + B2 — check-links.py

Walks every URL surfaced by the catalog:

* profile/llms.txt — Markdown link targets (one row per link, line
  number captured as the label)
* profile/tools.json — top-level $schema + every *_url value across
  every tools. entry
* profile/task_index.json — top-level $schema + every doc field on
  every category/row

Per-URL behaviour: HEAD with a 15s timeout. On 405 Method Not
Allowed, retry with GET. A 200 after redirect-following = OK (urllib
follows 3xx natively; --allow-redirects is on by default and is
informational). Anything else = FAIL with the HTTP status + reason
captured. A condensed sketch of this contract follows the B3 section
below.

Modes:

* --offline — inventory only. No network round-trip. Status column
  reads INVENTORIED. Used by per-PR CI.
* default — full HEAD walk. Used by the weekly cron firing.

Exit codes: 0 all OK (or all INVENTORIED), 1 any FAIL, 2 fixture
error.

13 new TDD cases (profile/build/test_check_links.py):

* Inventory (3): tools.json $schema + *_url surfacing; task_index doc
  fields; llms.txt Markdown link extraction.
* check_url (5): 200 OK, 404 FAIL, 405→GET fallback (HEAD then GET
  retry assertion), DNS failure FAIL, redirect-follow OK.
* Driver (2): worst-first aggregation; all-OK aggregator.
* CLI (3): --offline must not touch the network (urlopen patched to
  raise); a 404 surfaces as rc=1; smoke against the committed
  catalog.

### B3 — Make + CI wiring

* make check-links — invokes the script with --offline (PR-mode).
* .github/workflows/ci.yml: PR-mode adds a "Link inventory" step to
  the `check` job. The weekly cron firing on the `handshake` job adds
  a parallel step running the full live HEAD walk so a broken
  upstream URL surfaces within 7 days.
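The per-URL contract above, condensed into a standalone sketch for
reviewers. Illustrative only: the helper name ``url_answers`` is made
up for the sketch, and the error-message shaping plus row aggregation
are stripped out; the authoritative version is ``check_url()`` in the
check-links.py diff below.

```python
# Condensed sketch of the HEAD-first, GET-on-405 walk in check-links.py.
import urllib.error
import urllib.request


def url_answers(url: str, timeout: float = 15.0) -> bool:
    """True iff the URL answers at all: HEAD first, GET retry on 405."""
    head = urllib.request.Request(url, method="HEAD")
    try:
        with urllib.request.urlopen(head, timeout=timeout):
            return True  # urllib already followed any 3xx chain
    except urllib.error.HTTPError as exc:
        if exc.code != 405:
            return False  # 404 and friends -> FAIL row
        try:
            with urllib.request.urlopen(url, timeout=timeout):
                return True  # GET fallback succeeded
        except (urllib.error.URLError, TimeoutError):
            return False
    except (urllib.error.URLError, TimeoutError):
        return False  # DNS / transport failure -> FAIL row
```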
### Verification locally

* pytest profile/build/ — 84 / 84 (66 prior + 5 validate-repo-meta +
  13 check-links)
* make check-links — clean (58 URLs catalogued offline)
* make check-freshness — still clean (Phase-5 Track A unchanged)
* make check-repo-meta full mode (no --no-resolve) green against the
  v0.1.0 wheel URL

### What's NOT in this PR

* JSON-body validation in the cron firing — check-links only
  HEAD-checks; deeper "is the body well-formed" lives in
  build-catalog + validate-catalog. Adding a JSON-parse pass in
  check-links would duplicate that path.
* Retry-with-backoff for transient 5xx on the cron firing —
  phase5-plan.md §9 explicitly defers this until CI demonstrates
  flake.
---
 .github/workflows/ci.yml                 |  15 ++
 Makefile                                 |   9 +-
 profile/build/check-links.py             | 288 +++++++++++++++++++++++
 profile/build/test_check_links.py        | 288 +++++++++++++++++++++++
 profile/build/test_validate_repo_meta.py | 122 ++++++++++
 profile/build/validate-repo-meta.py      |  21 +-
 6 files changed, 740 insertions(+), 3 deletions(-)
 create mode 100644 profile/build/check-links.py
 create mode 100644 profile/build/test_check_links.py
 create mode 100644 profile/build/test_validate_repo_meta.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 72ecbbf..f76cbc2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -77,6 +77,13 @@ jobs:
         if: github.event_name != 'schedule'
         run: make check-freshness
 
+      - name: Link inventory (Phase-5 Track B — PR-mode)
+        # Per phase5-plan.md §3 B3: PR-mode runs --offline so CI is
+        # network-free + flake-free. The weekly cron firing (handshake
+        # job below) does the actual HEAD walk against the live URLs.
+        if: github.event_name != 'schedule'
+        run: make check-links
+
   handshake:
     # Phase-3 Track D — discovery-protocol handshake. Runs the 8-step
     # external-agent walk-through end-to-end. On push/PR uses bundled
@@ -108,3 +115,11 @@ jobs:
       # PR is opened in between.
       if: github.event_name == 'schedule'
       run: python3 profile/build/check-freshness.py --offline --strict
+
+      - name: Link-check (Phase-5 Track B — cron live URLs)
+        # Phase-5 Track B weekly fetch: HEAD every URL in llms.txt +
+        # tools.json + task_index.json against the live web. A 404 or
+        # transport failure surfaces here within 7 days even if no PR
+        # opens in between. Per-PR jobs use --offline (see above).
+        if: github.event_name == 'schedule'
+        run: python3 profile/build/check-links.py
diff --git a/Makefile b/Makefile
index c9b3a93..4d4e940 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: catalog validate-catalog check-catalog check-repo-meta phase0-smoke check-docs-prose recipes-check handshake check-freshness
+.PHONY: catalog validate-catalog check-catalog check-repo-meta phase0-smoke check-docs-prose recipes-check handshake check-freshness check-links
 
 # Phase-1 Track B's generator. Fetches each TIER_1+TIER_2+TIER_3 repo's
 # dist/repo.meta.json, validates it, translates it into a `tools.`
@@ -108,3 +108,10 @@ handshake:
 # failure so a freshness slip surfaces within 7 days.
 check-freshness:
 	python3 profile/build/check-freshness.py --offline
+
+# Phase-5 Track B: link-checking gate over llms.txt + tools.json's
+# *_url fields + task_index.json's doc fields. Per-PR runs --offline
+# (inventory only — no network); the weekly cron firing runs the
+# full HEAD walk so broken upstream URLs surface within 7 days.
+check-links:
+	python3 profile/build/check-links.py --offline
diff --git a/profile/build/check-links.py b/profile/build/check-links.py
new file mode 100644
index 0000000..09c0aba
--- /dev/null
+++ b/profile/build/check-links.py
@@ -0,0 +1,288 @@
+#!/usr/bin/env python3
+"""Phase 5 Track B — link-checking gate.
+
+Walks every URL surfaced by the catalog:
+
+* ``profile/llms.txt`` — Markdown link targets.
+* ``profile/tools.json`` — top-level ``$schema`` + every ``*_url``
+  field across every ``tools.`` entry.
+* ``profile/task_index.json`` — top-level ``$schema`` + every
+  ``doc`` field across every category/row.
+
+For each URL, issues a HEAD request (15s timeout). urllib follows
+3xx natively, so a redirect chain ending in 200 counts as OK
+(``--allow-redirects`` is on by default and informational). A 405
+Method Not Allowed retries with a GET. Anything else surfaces as
+``FAIL`` with the HTTP status + message.
+
+Binary URLs (``.whl`` / ``.tar.gz`` / ``.tgz`` / ``.zip`` / ``.gz``)
+work the same way — HEAD only, no body decode. Same defense as
+``validate-repo-meta.py`` after Phase 5 Track B0.
+
+Modes:
+
+* ``--offline`` — inventory only. No network. Status column reads
+  ``INVENTORIED``. Used by per-PR CI.
+* default — full HEAD walk. Used by the weekly cron firing.
+
+Exit codes:
+
+* ``0`` — all URLs OK (or all INVENTORIED in --offline mode).
+* ``1`` — any URL FAIL.
+* ``2`` — fixture/parse error.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+import urllib.error
+import urllib.request
+from pathlib import Path
+from typing import Any
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+PROFILE = REPO_ROOT / "profile"
+DEFAULT_TOOLS_JSON = PROFILE / "tools.json"
+DEFAULT_TASK_INDEX = PROFILE / "task_index.json"
+DEFAULT_LLMS_TXT = PROFILE / "llms.txt"
+
+MD_LINK_RE = re.compile(r"\[[^\]]*\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)")
+URL_RE = re.compile(r"^https?://")
+
+# Status priority for aggregator + sort (lower = better).
+_STATUS_PRIORITY = {
+    "OK": 0,
+    "INVENTORIED": 1,
+    "FAIL": 2,
+}
+
+
+# ---- inventory helpers -----------------------------------------------------
+
+
+def extract_urls_from_tools_json(tools: dict[str, Any]) -> list[tuple[str, str]]:
+    """Return ``(label, url)`` rows for every URL in ``tools.json``.
+
+    Surfaces:
+    * top-level ``$schema``
+    * every ``*_url`` value that's an http(s) URL
+    """
+    out: list[tuple[str, str]] = []
+    schema = tools.get("$schema")
+    if isinstance(schema, str) and URL_RE.match(schema):
+        out.append(("$schema", schema))
+    tools_map = tools.get("tools", {})
+    if isinstance(tools_map, dict):
+        for key, entry in tools_map.items():
+            if not isinstance(entry, dict):
+                continue
+            for field, value in entry.items():
+                if not isinstance(value, str):
+                    continue
+                if not field.endswith("_url"):
+                    continue
+                if not URL_RE.match(value):
+                    continue
+                out.append((f"tools.{key}.{field}", value))
+    return out
+
+
+def extract_urls_from_task_index(ti: dict[str, Any]) -> list[tuple[str, str]]:
+    """Return ``(label, url)`` rows for every URL in ``task_index.json``."""
+    out: list[tuple[str, str]] = []
+    schema = ti.get("$schema")
+    if isinstance(schema, str) and URL_RE.match(schema):
+        out.append(("$schema", schema))
+    categories = ti.get("categories", {})
+    if isinstance(categories, dict):
+        for cat_name, cat in categories.items():
+            if not isinstance(cat, dict):
+                continue
+            for row_name, row in cat.items():
+                if not isinstance(row, dict):
+                    continue
+                doc = row.get("doc")
+                if isinstance(doc, str) and URL_RE.match(doc):
+                    out.append((f"categories.{cat_name}.{row_name}.doc", doc))
+    return out
+
+
+def extract_urls_from_llms_txt(text: str) -> list[tuple[str, str]]:
+    """Return ``(label, url)`` rows for every Markdown-link URL in
+    ``llms.txt``. Labels are the line index where the link appeared."""
+    out: list[tuple[str, str]] = []
+    for line_idx, line in enumerate(text.splitlines(), start=1):
+        for m in MD_LINK_RE.finditer(line):
+            url = m.group(1)
+            if URL_RE.match(url):
+                out.append((f"llms.txt:L{line_idx}", url))
+    return out
+
+
+# ---- single-URL check ------------------------------------------------------
+
+
+def check_url(url: str, *, allow_redirects: bool = True, timeout: float = 15.0) -> dict[str, Any]:
+    """HEAD-check one URL. Returns
+    ``{"status": "OK"|"FAIL", "http_status": int|None, "message": str}``.
+
+    On 405, retries with a GET. ``allow_redirects`` is informational
+    here — urllib follows 3xx natively; the final 200 is what we see.
+ """ + req = urllib.request.Request(url, method="HEAD") + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: # noqa: S310 + status = getattr(resp, "status", 200) + return {"status": "OK", "http_status": status, "message": ""} + except urllib.error.HTTPError as exc: + if exc.code == 405: + try: + with urllib.request.urlopen(url, timeout=timeout) as resp: # noqa: S310 + status = getattr(resp, "status", 200) + return {"status": "OK", "http_status": status, "message": ""} + except urllib.error.HTTPError as exc2: + return { + "status": "FAIL", + "http_status": exc2.code, + "message": str(exc2.reason or exc2), + } + except (urllib.error.URLError, TimeoutError) as exc2: + return { + "status": "FAIL", + "http_status": None, + "message": str(exc2.reason if hasattr(exc2, "reason") else exc2), + } + return {"status": "FAIL", "http_status": exc.code, "message": str(exc.reason or exc)} + except (urllib.error.URLError, TimeoutError) as exc: + msg = str(exc.reason) if hasattr(exc, "reason") else str(exc) + return {"status": "FAIL", "http_status": None, "message": msg} + + +# ---- driver ---------------------------------------------------------------- + + +def check_links_impl( + urls: list[tuple[str, str]], + *, + allow_redirects: bool = True, +) -> tuple[list[dict[str, Any]], str]: + """Walk each ``(label, url)`` pair, return ``(rows, worst_status)`` + sorted worst-first.""" + rows: list[dict[str, Any]] = [] + for label, url in urls: + result = check_url(url, allow_redirects=allow_redirects) + rows.append( + { + "label": label, + "url": url, + "status": result["status"], + "http_status": result["http_status"], + "message": result["message"], + } + ) + rows.sort( + key=lambda r: ( + -_STATUS_PRIORITY.get(r["status"], 0), + r["label"], + ) + ) + worst = "OK" + for r in rows: + if _STATUS_PRIORITY[r["status"]] > _STATUS_PRIORITY[worst]: + worst = r["status"] + return rows, worst + + +# ---- table formatting ------------------------------------------------------ + + +def format_table(title: str, rows: list[dict[str, Any]]) -> str: + if not rows: + return f"### {title}\n\n(no entries)\n" + lines = [f"### {title}", "", "| status | label | url | http |", "|---|---|---|---|"] + for r in rows: + h = r.get("http_status") + h_text = "—" if h is None else str(h) + lines.append(f"| {r['status']} | `{r['label']}` | {r['url']} | {h_text} |") + lines.append("") + return "\n".join(lines) + + +# ---- CLI ------------------------------------------------------------------- + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Link-checking gate over llms.txt + tools.json + task_index.json." + ) + parser.add_argument("--tools-json", type=Path, default=DEFAULT_TOOLS_JSON) + parser.add_argument("--task-index", type=Path, default=DEFAULT_TASK_INDEX) + parser.add_argument("--llms-txt", type=Path, default=DEFAULT_LLMS_TXT) + parser.add_argument( + "--offline", + action="store_true", + help=( + "Inventory URLs only — don't fetch. Status column reads " + "INVENTORIED. Used by per-PR CI to avoid network flake." 
+        ),
+    )
+    parser.add_argument(
+        "--allow-redirects",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Follow 3xx redirects (default ON).",
+    )
+    args = parser.parse_args(argv)
+
+    inventory: list[tuple[str, str]] = []
+    try:
+        if args.tools_json.exists():
+            inventory.extend(
+                extract_urls_from_tools_json(
+                    json.loads(args.tools_json.read_text(encoding="utf-8"))
+                )
+            )
+        if args.task_index.exists():
+            inventory.extend(
+                extract_urls_from_task_index(
+                    json.loads(args.task_index.read_text(encoding="utf-8"))
+                )
+            )
+        if args.llms_txt.exists():
+            inventory.extend(
+                extract_urls_from_llms_txt(args.llms_txt.read_text(encoding="utf-8"))
+            )
+    except (OSError, json.JSONDecodeError) as exc:
+        print(f"ERROR: cannot read catalog input: {exc}", file=sys.stderr)
+        return 2
+
+    if args.offline:
+        rows = [
+            {
+                "label": label,
+                "url": url,
+                "status": "INVENTORIED",
+                "http_status": None,
+                "message": "",
+            }
+            for label, url in inventory
+        ]
+        print(format_table("Inventory (offline)", rows))
+        print(f"check-links: clean (offline; {len(rows)} URLs catalogued)")
+        return 0
+
+    rows, worst = check_links_impl(inventory, allow_redirects=args.allow_redirects)
+    print(format_table("Link check", rows))
+    if worst == "FAIL":
+        n_fail = sum(1 for r in rows if r["status"] == "FAIL")
+        print(f"check-links: FAIL ({n_fail} broken URL(s))", file=sys.stderr)
+        return 1
+    print(f"check-links: clean ({len(rows)} URL(s) OK)")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/profile/build/test_check_links.py b/profile/build/test_check_links.py
new file mode 100644
index 0000000..60337a2
--- /dev/null
+++ b/profile/build/test_check_links.py
@@ -0,0 +1,288 @@
+"""Tests for check-links.py — Phase 5 Track B1 (RED) before B2 (GREEN).
+
+Pure-function inventory helpers + the URL-walker, tested with
+``unittest.mock.patch`` over ``urllib.request.urlopen`` so the unit
+suite is network-free.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import urllib.error
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+BUILD = REPO_ROOT / "profile" / "build"
+
+_path = BUILD / "check-links.py"
+_spec = importlib.util.spec_from_file_location("_check_links", _path)
+_check_links = importlib.util.module_from_spec(_spec)
+assert _spec and _spec.loader
+_spec.loader.exec_module(_check_links)
+
+
+# ----------------------------------------------------------- inventory
+
+
+def test_extract_urls_from_tools_json_captures_star_url_fields() -> None:
+    """Every *_url field on every tools. entry surfaces, plus the
+    top-level $schema."""
+    tools = {
+        "$schema": "https://schema.example.invalid/tools.schema.json",
+        "tools": {
+            "m-cli": {
+                "id": "tool:m-cli",
+                "repo_meta_url": "https://example.invalid/m-cli/repo.meta.json",
+                "commands_url": "https://example.invalid/m-cli/commands.json",
+                "verified_on": "2026-05-11",
+            },
+            "m-stdlib": {
+                "id": "tool:m-stdlib",
+                "modules_url": "https://example.invalid/m-stdlib/manifest.json",
+            },
+        },
+    }
+    urls = _check_links.extract_urls_from_tools_json(tools)
+    by_label = {label: url for label, url in urls}
+    assert by_label["$schema"] == "https://schema.example.invalid/tools.schema.json"
+    assert (
+        by_label["tools.m-cli.repo_meta_url"]
+        == "https://example.invalid/m-cli/repo.meta.json"
+    )
+    assert (
+        by_label["tools.m-cli.commands_url"]
+        == "https://example.invalid/m-cli/commands.json"
+    )
+    assert by_label["tools.m-stdlib.modules_url"]
+    # verified_on must NOT surface — not a URL.
+    assert not any("verified_on" in label for label, _ in urls)
+
+
+def test_extract_urls_from_task_index_captures_doc_fields() -> None:
+    """Each row's optional ``doc`` field surfaces as a URL row."""
+    ti = {
+        "$schema": "https://schema.example.invalid/task_index.schema.json",
+        "categories": {
+            "lib": {
+                "json_parse": {
+                    "intent": "Parse JSON in M",
+                    "primary": "module:m-stdlib#STDJSON",
+                    # no doc — should not surface
+                },
+            },
+            "infra": {
+                "agent_integration": {
+                    "intent": "Point my MCP agent at the catalog",
+                    "primary": "tool:m-dev-tools-mcp",
+                    "doc": "https://example.invalid/agent-integration.md",
+                },
+            },
+        },
+    }
+    urls = _check_links.extract_urls_from_task_index(ti)
+    by_label = {label: url for label, url in urls}
+    assert by_label["$schema"] == "https://schema.example.invalid/task_index.schema.json"
+    assert (
+        by_label["categories.infra.agent_integration.doc"]
+        == "https://example.invalid/agent-integration.md"
+    )
+    # json_parse has no doc — must not surface
+    assert "categories.lib.json_parse.doc" not in by_label
+
+
+def test_extract_urls_from_llms_txt_captures_markdown_links() -> None:
+    text = (
+        "# org\n\n"
+        "Start: [tools.json](https://example.invalid/tools.json)\n"
+        "Schema: [tools.schema.json](https://example.invalid/tools.schema.json)\n"
+    )
+    urls = _check_links.extract_urls_from_llms_txt(text)
+    by_url = {url for _label, url in urls}
+    assert "https://example.invalid/tools.json" in by_url
+    assert "https://example.invalid/tools.schema.json" in by_url
+
+
+# ----------------------------------------------------------- check_url
+
+
+def _mock_resp(status: int = 200) -> MagicMock:
+    resp = MagicMock()
+    resp.__enter__ = MagicMock(return_value=resp)
+    resp.__exit__ = MagicMock(return_value=False)
+    resp.status = status
+    resp.read = MagicMock(return_value=b"")
+    return resp
+
+
+def test_check_url_200_is_ok() -> None:
+    with patch("urllib.request.urlopen", return_value=_mock_resp(200)):
+        result = _check_links.check_url("https://example.invalid/x")
+    assert result["status"] == "OK"
+    assert result["http_status"] == 200
+
+
+def test_check_url_404_is_fail() -> None:
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=urllib.error.HTTPError(
+            "https://example.invalid/x", 404, "Not Found", hdrs={}, fp=None
+        ),
+    ):
+        result = _check_links.check_url("https://example.invalid/x")
+    assert result["status"] == "FAIL"
+    assert result["http_status"] == 404
+
+
+def test_check_url_405_on_head_falls_back_to_get() -> None:
+    """If HEAD returns 405, retry with GET. The walker uses a Request
+    object with method='HEAD'; on 405 it falls back to a no-method GET."""
+    call_count = {"head": 0, "get": 0}
+
+    def _fake_urlopen(req_or_url, **_kwargs):
+        # urllib passes a Request object for the explicit-method call,
+        # a bare string for the GET fallback.
+        if hasattr(req_or_url, "get_method") and req_or_url.get_method() == "HEAD":
+            call_count["head"] += 1
+            raise urllib.error.HTTPError(
+                "https://example.invalid/x", 405, "Method Not Allowed", hdrs={}, fp=None
+            )
+        call_count["get"] += 1
+        return _mock_resp(200)
+
+    with patch("urllib.request.urlopen", side_effect=_fake_urlopen):
+        result = _check_links.check_url("https://example.invalid/x")
+    assert call_count["head"] == 1
+    assert call_count["get"] == 1
+    assert result["status"] == "OK"
+
+
+def test_check_url_dns_failure_is_fail() -> None:
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=urllib.error.URLError("nodename nor servname provided"),
+    ):
+        result = _check_links.check_url("https://nonexistent.invalid/")
+    assert result["status"] == "FAIL"
+    assert result["http_status"] is None
+    assert "nodename" in result["message"]
+
+
+def test_check_url_redirect_followed_by_default() -> None:
+    """A 302 followed by a 200 counts as OK when --allow-redirects is on
+    (the default). urllib.request.urlopen follows redirects natively,
+    so we just simulate the final 200 here."""
+    with patch("urllib.request.urlopen", return_value=_mock_resp(200)):
+        result = _check_links.check_url(
+            "https://example.invalid/redirect-to-x", allow_redirects=True
+        )
+    assert result["status"] == "OK"
+
+
+# ----------------------------------------------------------- driver / CLI
+
+
+def test_check_links_impl_aggregates_worst() -> None:
+    """Driver aggregates per-URL results. Worst-first ordering."""
+    urls = [
+        ("$schema", "https://example.invalid/ok-a"),
+        ("tools.x.repo_meta_url", "https://example.invalid/bad"),
+        ("tools.y.modules_url", "https://example.invalid/ok-b"),
+    ]
+
+    def _fake_check(url, **_kwargs):
+        if url.endswith("/bad"):
+            return {"status": "FAIL", "http_status": 404, "message": "Not Found"}
+        return {"status": "OK", "http_status": 200, "message": ""}
+
+    with patch.object(_check_links, "check_url", side_effect=_fake_check):
+        rows, worst = _check_links.check_links_impl(urls, allow_redirects=True)
+    assert worst == "FAIL"
+    # Worst-first
+    assert rows[0]["status"] == "FAIL"
+    assert rows[0]["url"].endswith("/bad")
+
+
+def test_check_links_impl_all_ok_returns_OK() -> None:
+    urls = [("a", "https://example.invalid/a"), ("b", "https://example.invalid/b")]
+    with patch.object(
+        _check_links,
+        "check_url",
+        return_value={"status": "OK", "http_status": 200, "message": ""},
+    ):
+        rows, worst = _check_links.check_links_impl(urls, allow_redirects=True)
+    assert worst == "OK"
+    assert all(r["status"] == "OK" for r in rows)
+
+
+def test_cli_offline_inventories_without_network(tmp_path, capsys) -> None:
+    """--offline must NOT touch the network. Patch urlopen to blow up
+    so an accidental fetch is caught."""
+    tools = {"tools": {"m-cli": {"repo_meta_url": "https://example.invalid/a.json"}}}
+    ti = {"categories": {}}
+    llms = "# org\n[a](https://example.invalid/b.json)\n"
+
+    tools_path = tmp_path / "tools.json"
+    ti_path = tmp_path / "task_index.json"
+    llms_path = tmp_path / "llms.txt"
+    tools_path.write_text(json.dumps(tools), encoding="utf-8")
+    ti_path.write_text(json.dumps(ti), encoding="utf-8")
+    llms_path.write_text(llms, encoding="utf-8")
+
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=AssertionError("network must not be touched in --offline mode"),
+    ):
+        rc = _check_links.main(
+            [
+                "--offline",
+                "--tools-json", str(tools_path),
+                "--task-index", str(ti_path),
+                "--llms-txt", str(llms_path),
+            ]
+        )
+    out = capsys.readouterr().out
+    assert rc == 0
+    # Both URLs got inventoried
+    assert "example.invalid/a.json" in out
+    assert "example.invalid/b.json" in out
+    # Status column should say INVENTORIED / SKIPPED — not OK (since
+    # nothing was actually fetched)
+    assert "INVENTORIED" in out or "SKIPPED" in out
+
+
+def test_cli_failure_returns_rc1(tmp_path) -> None:
+    tools = {"tools": {"x": {"repo_meta_url": "https://example.invalid/404"}}}
+    tools_path = tmp_path / "tools.json"
+    tools_path.write_text(json.dumps(tools), encoding="utf-8")
+    ti_path = tmp_path / "task_index.json"
+    ti_path.write_text(json.dumps({"categories": {}}), encoding="utf-8")
+    llms_path = tmp_path / "llms.txt"
+    llms_path.write_text("# x\n", encoding="utf-8")
+
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=urllib.error.HTTPError(
+            "https://example.invalid/404", 404, "Not Found", hdrs={}, fp=None
+        ),
+    ):
+        rc = _check_links.main(
+            [
+                "--tools-json", str(tools_path),
+                "--task-index", str(ti_path),
+                "--llms-txt", str(llms_path),
+            ]
+        )
+    assert rc == 1
+
+
+def test_cli_committed_inventory_runs_offline(capsys) -> None:
+    """Smoke against the real committed catalog. --offline mode never
+    hits the network; just sanity-checks inventory works against the
+    committed files."""
+    rc = _check_links.main(["--offline"])
+    out = capsys.readouterr().out
+    assert rc == 0
+    # At least the tools.json $schema URL should appear in the inventory
+    assert "tools.schema.json" in out
diff --git a/profile/build/test_validate_repo_meta.py b/profile/build/test_validate_repo_meta.py
new file mode 100644
index 0000000..b626096
--- /dev/null
+++ b/profile/build/test_validate_repo_meta.py
@@ -0,0 +1,122 @@
+"""Tests for validate-repo-meta.py.
+
+Phase 5 Track B0 — pinning the binary-URL behaviour. Pre-Track-B,
+``resolve_one()`` blindly UTF-8-decoded every fetched body, which
+made full-resolve validation fail against any `exposes.*` URL that
+pointed at a binary asset (e.g. m-dev-tools-mcp's v0.1.0 wheel).
+The fix uses HEAD-only for non-text extensions; this test pins the
+behaviour so a future regression surfaces immediately.
+
+Test strategy: monkey-patch ``urllib.request.urlopen`` so we control
+the server's response shape. No real network round-trip.
+""" + +from __future__ import annotations + +import importlib.util +from pathlib import Path +from unittest.mock import MagicMock, patch + +REPO_ROOT = Path(__file__).resolve().parents[2] +BUILD = REPO_ROOT / "profile" / "build" + +_path = BUILD / "validate-repo-meta.py" +_spec = importlib.util.spec_from_file_location("_validate_repo_meta", _path) +_validate_repo_meta = importlib.util.module_from_spec(_spec) +assert _spec and _spec.loader +_spec.loader.exec_module(_validate_repo_meta) + + +def _mock_response(*, body: bytes = b"", status: int = 200) -> MagicMock: + resp = MagicMock() + resp.__enter__ = MagicMock(return_value=resp) + resp.__exit__ = MagicMock(return_value=False) + resp.read = MagicMock(return_value=body) + resp.status = status + return resp + + +def test_resolve_one_text_url_decodes_body() -> None: + """Smoke: a .json URL still fetches + JSON-validates. Pinning the + pre-Track-B0 happy path so the fix doesn't regress text handling.""" + errors: list[str] = [] + valid_json = b'{"hello": "world"}' + with patch( + "urllib.request.urlopen", return_value=_mock_response(body=valid_json) + ): + _validate_repo_meta.resolve_one( + base="https://example.invalid/", + value="dist/repo.meta.json", + errors=errors, + label="repo_meta", + ) + assert errors == [] + + +def test_resolve_one_invalid_json_text_url_reports() -> None: + errors: list[str] = [] + with patch( + "urllib.request.urlopen", return_value=_mock_response(body=b"not json") + ): + _validate_repo_meta.resolve_one( + base="https://example.invalid/", + value="dist/something.json", + errors=errors, + label="bad_json", + ) + assert len(errors) == 1 + assert "invalid JSON" in errors[0] + + +def test_resolve_one_binary_wheel_url_no_decode() -> None: + """Track B0 contract: a .whl URL must NOT trigger UTF-8 decode of + the body. Validator returns success on HEAD-equivalent 200, no + errors raised. 
+    shape that crashed pre-Track-B0 validate-repo-meta."""
+    errors: list[str] = []
+    # Random non-UTF-8 bytes — would crash .decode("utf-8")
+    bogus_bytes = b"\xab\xcd\xef\x00\x01\x02"
+    with patch(
+        "urllib.request.urlopen", return_value=_mock_response(body=bogus_bytes)
+    ):
+        _validate_repo_meta.resolve_one(
+            base="https://github.com/m-dev-tools/m-dev-tools-mcp/releases/download/v0.1.0/",
+            value="m_dev_tools_mcp-0.1.0-py3-none-any.whl",
+            errors=errors,
+            label="release_wheel",
+        )
+    assert errors == [], f"binary URL should not raise decode errors; got {errors!r}"
+
+
+def test_resolve_one_binary_tarball_url_no_decode() -> None:
+    """Same contract for .tar.gz."""
+    errors: list[str] = []
+    with patch(
+        "urllib.request.urlopen", return_value=_mock_response(body=b"\x1f\x8b")
+    ):
+        _validate_repo_meta.resolve_one(
+            base="https://example.invalid/",
+            value="dist/payload.tar.gz",
+            errors=errors,
+            label="tarball",
+        )
+    assert errors == []
+
+
+def test_resolve_one_binary_url_fetch_failure_reports() -> None:
+    """HEAD failures (404, transport) still surface as errors."""
+    import urllib.error
+
+    errors: list[str] = []
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=urllib.error.URLError("simulated dns failure"),
+    ):
+        _validate_repo_meta.resolve_one(
+            base="https://nonexistent.invalid/",
+            value="dist/foo.whl",
+            errors=errors,
+            label="dead_link",
+        )
+    assert len(errors) == 1
+    assert "cannot fetch" in errors[0]
diff --git a/profile/build/validate-repo-meta.py b/profile/build/validate-repo-meta.py
index dacf640..033469a 100755
--- a/profile/build/validate-repo-meta.py
+++ b/profile/build/validate-repo-meta.py
@@ -169,6 +169,17 @@ def validate_schema(data: dict, errors: list[str]) -> None:
         errors.append(f"schema: at {path}: {exc.message}")
 
 
+# Extensions whose payloads are binary by design — never UTF-8-decode
+# them. The validator's job for these is "does the URL resolve?", not
+# "is the body valid text". Matches phase5-plan.md §3 B0.
+_BINARY_EXTENSIONS = (".whl", ".tar.gz", ".tgz", ".zip", ".gz")
+
+
+def _is_binary_url(target: str) -> bool:
+    lower = target.lower()
+    return any(lower.endswith(ext) for ext in _BINARY_EXTENSIONS)
+
+
 def resolve_one(base: str, value: str, errors: list[str], label: str) -> None:
     if URL_RE.match(value):
         target = value
@@ -185,13 +196,19 @@ def resolve_one(base: str, value: str, errors: list[str], label: str) -> None:
         except json.JSONDecodeError as exc:
             errors.append(f"exposes[{label}]: {local} is invalid JSON: {exc}")
             return
+
+    binary = _is_binary_url(target)
     try:
         with urllib.request.urlopen(target, timeout=15) as resp:
-            body = resp.read().decode("utf-8")
+            # For binary URLs we only need "did the server answer?".
+            # Skip the body read entirely so a non-UTF-8 payload (a wheel,
+            # a tarball, ...) doesn't trip a decode error. Text URLs still
+            # read + decode so the downstream JSON check has a body.
+            body = "" if binary else resp.read().decode("utf-8")
     except (urllib.error.URLError, TimeoutError) as exc:
         errors.append(f"exposes[{label}]: cannot fetch {target}: {exc}")
         return
-    if target.endswith(".json"):
+    if not binary and target.endswith(".json"):
         try:
             json.loads(body)
         except json.JSONDecodeError as exc:
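Postscript: the B0 extension gate in miniature. Illustrative
assertions only; the tuple and helper are copied from the
validate-repo-meta.py hunk above, and the wheel URL is the one the
new test pins.

```python
# Copied from the diff above: binary suffixes skip the body read entirely.
_BINARY_EXTENSIONS = (".whl", ".tar.gz", ".tgz", ".zip", ".gz")


def _is_binary_url(target: str) -> bool:
    lower = target.lower()
    return any(lower.endswith(ext) for ext in _BINARY_EXTENSIONS)


# Binary: HEAD-equivalent check only, body never UTF-8-decoded.
assert _is_binary_url(
    "https://github.com/m-dev-tools/m-dev-tools-mcp/releases/download/"
    "v0.1.0/m_dev_tools_mcp-0.1.0-py3-none-any.whl"
)
assert _is_binary_url("https://example.invalid/dist/payload.tar.gz")
assert _is_binary_url("https://example.invalid/dist/PAYLOAD.ZIP")  # case-insensitive
# Text: body still read, decoded, and JSON-validated downstream.
assert not _is_binary_url("https://example.invalid/dist/repo.meta.json")
```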