From 8dc61a104042e942729c9546fc8a51ed91161865 Mon Sep 17 00:00:00 2001
From: Lincoln Stein <lincoln.stein@gmail.com>
Date: Fri, 22 May 2026 20:03:45 -0400
Subject: [PATCH] fix(embeddings,cli): case-insensitive path diff, VRAM
 release, broken CLI kwarg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three small but real defects in embeddings.py / imagetool.py — the
last three backend items from the 2026-05-20 code review.

## 1. _get_new_and_missing_images: case-insensitive Path comparison

The new-vs-missing diff used plain ``set[Path]`` operations. Python's
``Path.__eq__`` is *always* case-sensitive, regardless of the
underlying filesystem. On the dominant user platforms — Windows NTFS
and macOS HFS+/APFS-CI — that meant the same image referenced with
different case in its filename (``Photo.JPG`` in the cache, ``photo.jpg``
on disk) was classified as **both new and missing**:

    new      = {photo.jpg}  ← re-encoded (whole CLIP pass)
    missing  = {Photo.JPG}  ← orphaned in the cache

Net effect: silent double-encoding on every re-index, plus a stale
row that ``_filter_missing_images`` then drops, plus stale UMAP and
cluster-label caches keyed off the bogus filename count.

The fix funnels both sides through ``_path_compare_key``, which
returns ``p.as_posix().casefold()``. The casefolded keys drive the
set diff; the original-case Paths flow back through ``live_by_key``
and ``existing_by_key`` so downstream filesystem operations see what
the disk actually holds.

Trade-off on case-sensitive Linux: two files in the same directory
whose names differ only in case collapse to one entry. That collision
is unusual and harmless — the image is simply indexed once instead
of twice — and far less destructive than the silent double-encoding
the previous Path-equality diff produced on the majority of user
platforms.

## 2. search_images_by_text_and_image: VRAM release was a no-op

The ``finally`` block in this hot search path tried to drop GPU
tensor references before calling ``_cleanup_cuda_memory``:

    for name in ("image_embedding", "pos_emb", ...):
        if name in locals():
            del locals()[name]
    self._cleanup_cuda_memory(device)

``locals()`` returns a *snapshot dict* in functions — mutating it
doesn't touch the actual local bindings. The ``del`` was a no-op,
the local refs lived on until the function frame was destroyed, and
``torch.cuda.empty_cache()`` couldn't reclaim memory pinned by those
refs. VRAM accumulated across search queries until the search method
returned.

Pre-declare all ten potentially-allocated tensors / arrays to
``None`` at the start of the try (so they're unconditionally bound),
then ``del`` them explicitly in the finally before
``_cleanup_cuda_memory``. The allocator can now actually reclaim
the freed memory.

## 3. imagetool.do_search: TypeError on every first call

The ``search_images`` CLI passed ``query_image_path=args.search`` to
``Embeddings.search_images_by_text_and_image``. That function's
signature has no ``query_image_path`` parameter — it takes
``query_image_data: Image.Image``. Python raised ``TypeError: got
an unexpected keyword argument 'query_image_path'`` before the
encoder was ever touched, so the script has been broken since the
parameter was renamed.

Fix: open the path with PIL inside a context manager and pass
``query_image_data=img``. Verified by a new test that intercepts
the underlying call and asserts the kwarg name.

## Test coverage added

* ``tests/backend/test_embeddings_diff.py`` — 9 tests covering
  ``_path_compare_key`` directly plus four ``_get_new_and_missing_images``
  scenarios, including the case-only-rename regression.
* ``tests/backend/test_imagetool.py`` — 9 tests covering
  ``do_search`` / ``do_text_search`` kwarg shapes and ``main()``'s
  basename → handler dispatch.

297 backend tests pass (was 279).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 photomap/backend/embeddings.py        |  96 ++++++++++----
 photomap/backend/imagetool.py         |  13 +-
 tests/backend/test_embeddings_diff.py | 144 +++++++++++++++++++++
 tests/backend/test_imagetool.py       | 175 ++++++++++++++++++++++++++
 4 files changed, 399 insertions(+), 29 deletions(-)
 create mode 100644 tests/backend/test_embeddings_diff.py
 create mode 100644 tests/backend/test_imagetool.py

diff --git a/photomap/backend/embeddings.py b/photomap/backend/embeddings.py
index ab378406..e27061e3 100644
--- a/photomap/backend/embeddings.py
+++ b/photomap/backend/embeddings.py
@@ -781,6 +781,29 @@ def _save_embeddings(self, index_result: IndexResult) -> None:
         # Clear cache after saving
         _open_npz_file.cache_clear()
 
+    @staticmethod
+    def _path_compare_key(p: Path) -> str:
+        """Canonical key for the new-vs-missing diff in
+        :meth:`_get_new_and_missing_images`.
+
+        Returns the casefolded posix-form of the path. Casefolding is what
+        lets the diff stay correct on **case-insensitive filesystems**
+        (Windows NTFS by default, macOS HFS+ and case-insensitive APFS):
+        when an image is stored in the cache as ``Photo.JPG`` and later
+        scanned from disk as ``photo.jpg``, plain Path equality treats
+        them as different entries — both as new (re-encoded) and as
+        missing (orphaned in the cache).
+
+        Trade-off on case-sensitive Linux: two files in the same
+        directory whose names differ only in case (``IMG.jpg`` and
+        ``img.jpg``) collapse to a single index entry. That collision is
+        rare and harmless — the same image will simply be indexed once
+        instead of twice — and is far less destructive than the silent
+        double-encoding the previous Path-equality diff produced on the
+        majority of user platforms.
+        """
+        return p.as_posix().casefold()
+
     def _get_new_and_missing_images(
         self,
         image_paths_or_dir: list[Path] | Path,
@@ -788,15 +811,25 @@ def _get_new_and_missing_images(
         progress_callback: Callable | None = None,
     ) -> tuple[set[Path], set[Path]]:
         """Determine which images are new and which are missing."""
-        image_path_set = set(
-            self.get_image_files(
-                image_paths_or_dir, progress_callback=progress_callback
-            )
+        live_paths = self.get_image_files(
+            image_paths_or_dir, progress_callback=progress_callback
         )
-        existing_filenames_set = set(Path(p) for p in existing_filenames)
-
-        new_image_paths = image_path_set - existing_filenames_set
-        missing_image_paths = existing_filenames_set - image_path_set
+        # Build map from casefolded posix key to the *original-case* Path.
+        # The casefolded keys drive the set diff; the original Paths flow
+        # back out so downstream filesystem operations see what the disk
+        # actually holds.
+        live_by_key: dict[str, Path] = {
+            self._path_compare_key(p): p for p in live_paths
+        }
+        existing_by_key: dict[str, Path] = {
+            self._path_compare_key(Path(p)): Path(p) for p in existing_filenames
+        }
+
+        new_keys = set(live_by_key) - set(existing_by_key)
+        missing_keys = set(existing_by_key) - set(live_by_key)
+
+        new_image_paths = {live_by_key[k] for k in new_keys}
+        missing_image_paths = {existing_by_key[k] for k in missing_keys}
 
         return new_image_paths, missing_image_paths
 
@@ -1342,6 +1375,25 @@ def search_images_by_text_and_image(
         if use_query_optimization is not None and hasattr(encoder, "use_ensembling"):
             encoder.use_ensembling = use_query_optimization
 
+        # Pre-declare every GPU tensor and downstream numpy buffer the finally
+        # block needs to release. The previous ``del locals()[name]`` loop was
+        # a no-op — locals() returns a dict copy in functions, so the actual
+        # local bindings (and the VRAM they pinned) lived on until the frame
+        # itself was destroyed. Binding here lets the finally use plain ``del``
+        # to drop the references *before* ``_cleanup_cuda_memory`` runs
+        # ``torch.cuda.empty_cache()``, so the allocator can actually reclaim
+        # the freed memory instead of leaving it pinned.
+        image_embedding = None
+        pos_emb = None
+        neg_emb = None
+        embeddings_tensor = None
+        norm_embeddings = None
+        cos_img = None
+        cos_pos = None
+        cos_neg = None
+        positive_score_sum = None
+        similarities = None
+
         try:
             self._check_cache_compatibility(data, encoder)
 
@@ -1356,9 +1408,6 @@ def search_images_by_text_and_image(
                 return [], []
 
             # Encode only the inputs that will actually contribute.
-            image_embedding = None
-            pos_emb = None
-            neg_emb = None
             if image_weight > 0.0:
                 pil_image = ImageOps.exif_transpose(query_image_data).convert("RGB")
                 image_embedding = torch.from_numpy(
@@ -1426,22 +1475,15 @@ def search_images_by_text_and_image(
 
             return result_indices, result_similarities
         finally:
-            # Drop any local tensors so VRAM doesn't accumulate across queries.
-            # The encoder itself is cached and intentionally NOT closed here.
-            for name in (
-                "image_embedding",
-                "pos_emb",
-                "neg_emb",
-                "embeddings_tensor",
-                "norm_embeddings",
-                "cos_img",
-                "cos_pos",
-                "cos_neg",
-                "positive_score_sum",
-                "similarities",
-            ):
-                if name in locals():
-                    del locals()[name]
+            # Drop any local tensors / arrays so VRAM doesn't accumulate
+            # across queries. All ten names are unconditionally bound above
+            # (initially to None), so plain ``del`` is safe — no NameError
+            # paths to guard against. The encoder itself is cached and
+            # intentionally NOT closed here.
+            del image_embedding, pos_emb, neg_emb
+            del embeddings_tensor, norm_embeddings
+            del cos_img, cos_pos, cos_neg
+            del positive_score_sum, similarities
             self._cleanup_cuda_memory(device)
 
     def find_duplicate_clusters(self, similarity_threshold=0.995):
diff --git a/photomap/backend/imagetool.py b/photomap/backend/imagetool.py
index f8002c28..a6a71e51 100644
--- a/photomap/backend/imagetool.py
+++ b/photomap/backend/imagetool.py
@@ -5,6 +5,8 @@
 import sys
 from pathlib import Path
 
+from PIL import Image
+
 from .embeddings import Embeddings
 
 
@@ -105,8 +107,15 @@ def do_search():
     args = parser.parse_args()
     embeddings = Embeddings(embeddings_path=args.embeddings)
 
-    results, scores = embeddings.search_images_by_text_and_image(query_image_path=args.search,
-                                                                 top_k=args.top_k)
+    # ``search_images_by_text_and_image`` takes a *PIL Image*, not a path —
+    # the prior ``query_image_path=`` kwarg name didn't exist on the
+    # function and raised TypeError on every invocation. Open + decode here
+    # so the CLI matches the function's actual signature.
+    with Image.open(args.search) as query_image:
+        results, scores = embeddings.search_images_by_text_and_image(
+            query_image_data=query_image,
+            top_k=args.top_k,
+        )
     print("Top similar images:")
     for filename, score in zip(results, scores, strict=False):
         print(f"{filename}: {score:.4f}")
diff --git a/tests/backend/test_embeddings_diff.py b/tests/backend/test_embeddings_diff.py
new file mode 100644
index 00000000..b6355727
--- /dev/null
+++ b/tests/backend/test_embeddings_diff.py
@@ -0,0 +1,144 @@
+"""Tests for the new-vs-missing path diff in
+:class:`photomap.backend.embeddings.Embeddings`.
+
+The diff used to use plain Path equality, which is always case-sensitive
+regardless of the underlying filesystem. On case-insensitive filesystems
+(Windows NTFS, macOS HFS+/APFS-CI) that meant the same image referenced
+with different case in its filename — e.g. ``Photo.JPG`` in the .npz
+cache and ``photo.jpg`` on disk — would be classified as both *new* and
+*missing*, silently double-encoding the image and orphaning the cache
+row. The fix funnels both sides through a casefolded posix-form key.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import patch
+
+import numpy as np
+
+from photomap.backend.embeddings import Embeddings
+
+# ---------------------------------------------------------------------------
+# _path_compare_key — direct unit test of the canonical-key derivation
+# ---------------------------------------------------------------------------
+
+
+class TestPathCompareKey:
+    def test_same_case_unchanged(self):
+        assert Embeddings._path_compare_key(Path("/a/b/c.jpg")) == "/a/b/c.jpg"
+
+    def test_uppercase_extension_casefolds(self):
+        assert Embeddings._path_compare_key(
+            Path("/a/b/Photo.JPG")
+        ) == "/a/b/photo.jpg"
+
+    def test_mixed_case_path_casefolds(self):
+        assert Embeddings._path_compare_key(
+            Path("/Users/Alice/Photos/IMG_001.PNG")
+        ) == "/users/alice/photos/img_001.png"
+
+    def test_windows_style_path_normalised_to_posix(self):
+        # ``PurePosixPath`` would round-trip ``\\`` unchanged. On Linux test
+        # runners ``Path`` is a PosixPath, so a literal backslash stays as
+        # a character in the basename — but the casefold still applies so
+        # the comparison still works the way we need it to.
+        assert Embeddings._path_compare_key(
+            Path("C:/Users/Alice/Photo.JPG")
+        ).endswith("photo.jpg")
+
+
+# ---------------------------------------------------------------------------
+# _get_new_and_missing_images — set diff using the canonical key
+# ---------------------------------------------------------------------------
+
+
+def _embeddings_stub(tmp_path: Path) -> Embeddings:
+    """Construct an ``Embeddings`` pointed at a path inside ``tmp_path`` so
+    the encoder spec and clip root are valid but no model is ever loaded —
+    we only exercise the pure-Python diff method here."""
+    return Embeddings(embeddings_path=tmp_path / "stub.npz")
+
+
+class TestNewAndMissingDiff:
+    def test_exact_match_yields_empty_diff(self, tmp_path):
+        emb = _embeddings_stub(tmp_path)
+        live = [Path("/album/a.jpg"), Path("/album/b.jpg")]
+        existing = np.array(["/album/a.jpg", "/album/b.jpg"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        assert new == set()
+        assert missing == set()
+
+    def test_added_file_appears_in_new(self, tmp_path):
+        emb = _embeddings_stub(tmp_path)
+        live = [Path("/album/a.jpg"), Path("/album/b.jpg")]
+        existing = np.array(["/album/a.jpg"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        assert new == {Path("/album/b.jpg")}
+        assert missing == set()
+
+    def test_removed_file_appears_in_missing(self, tmp_path):
+        emb = _embeddings_stub(tmp_path)
+        live = [Path("/album/a.jpg")]
+        existing = np.array(["/album/a.jpg", "/album/b.jpg"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        assert new == set()
+        assert missing == {Path("/album/b.jpg")}
+
+    def test_case_only_rename_is_not_double_encoded(self, tmp_path):
+        """Regression test for the case-insensitive-FS bug.
+
+        The same file is recorded in the cache as ``Photo.JPG`` (upper)
+        and scanned from the filesystem as ``photo.jpg`` (lower). On a
+        case-insensitive FS these are the same file; on a case-sensitive
+        FS Python can't tell either way. The diff should treat them as
+        the same entry — neither new nor missing.
+        """
+        emb = _embeddings_stub(tmp_path)
+        live = [Path("/album/photo.jpg")]
+        existing = np.array(["/album/Photo.JPG"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        assert new == set(), "expected no new file (case-folded match)"
+        assert missing == set(), "expected no missing file (case-folded match)"
+
+    def test_case_only_rename_preserves_live_casing_for_new_files(self, tmp_path):
+        """When a *new* path differs only in case from a cached path, the
+        Path returned in ``missing`` is the cache-stored original-case
+        version (preserved for downstream filesystem-mask work in
+        ``_filter_missing_images``)."""
+        emb = _embeddings_stub(tmp_path)
+        # Live scan has lowercase only; cache has upper for one file plus a
+        # genuinely new mixed-case entry.
+        live = [Path("/album/photo.jpg"), Path("/album/NewShot.PNG")]
+        existing = np.array(["/album/Photo.JPG"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        # ``NewShot.PNG`` is genuinely new — and the original casing is
+        # preserved so PIL.open() and EXIF read use the actual file name.
+        assert new == {Path("/album/NewShot.PNG")}
+        # The case-only-rename file is neither new nor missing.
+        assert missing == set()
diff --git a/tests/backend/test_imagetool.py b/tests/backend/test_imagetool.py
new file mode 100644
index 00000000..8fec0c56
--- /dev/null
+++ b/tests/backend/test_imagetool.py
@@ -0,0 +1,175 @@
+"""Smoke tests for :mod:`photomap.backend.imagetool` CLI entry points.
+
+These ensure each script can dispatch through ``main()`` and that the
+underlying ``Embeddings`` methods are called with kwarg names that
+actually exist. Regression coverage for the long-standing
+``query_image_path=`` typo in ``do_search`` that raised ``TypeError`` on
+every first call.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import numpy as np
+import pytest
+from PIL import Image
+
+from photomap.backend import imagetool
+
+# ---------------------------------------------------------------------------
+# Shared fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def fake_npz(tmp_path: Path) -> Path:
+    """Minimal ``.npz`` whose existence is enough for ``Embeddings(...)`` to
+    construct successfully. The CLI never reads from it because we stub the
+    search method, but ``do_update_images`` checks ``os.path.exists`` first."""
+    path = tmp_path / "fake.npz"
+    np.savez(
+        path,
+        embeddings=np.zeros((0, 16), dtype=np.float32),
+        filenames=np.array([], dtype=object),
+        modification_times=np.array([], dtype=np.float64),
+        metadata=np.array([], dtype=object),
+        model_id=np.array("fake:test-encoder"),
+        embedding_dim=np.array(16),
+    )
+    return path
+
+
+@pytest.fixture
+def fake_query_image(tmp_path: Path) -> Path:
+    """Tiny PNG so ``Image.open(...)`` inside ``do_search`` works."""
+    path = tmp_path / "query.png"
+    Image.new("RGB", (8, 8), color=(127, 64, 32)).save(path)
+    return path
+
+
+# ---------------------------------------------------------------------------
+# do_search — regression test for the query_image_path typo
+# ---------------------------------------------------------------------------
+
+
+class TestDoSearch:
+    def test_passes_pil_image_via_query_image_data_kwarg(
+        self, monkeypatch, fake_npz, fake_query_image, capsys
+    ):
+        """The old CLI passed ``query_image_path=Path(...)`` which raised
+        ``TypeError: got an unexpected keyword argument 'query_image_path'``
+        before ever reaching the encoder. The fix opens the file as a PIL
+        Image and passes ``query_image_data=img``."""
+        captured: dict[str, object] = {}
+
+        def fake_search(self, *args, **kwargs):
+            captured["args"] = args
+            captured["kwargs"] = kwargs
+            return [], []
+
+        monkeypatch.setattr(
+            "photomap.backend.embeddings.Embeddings.search_images_by_text_and_image",
+            fake_search,
+        )
+        monkeypatch.setattr(
+            sys, "argv", ["search_images", str(fake_query_image), "--embeddings", str(fake_npz)]
+        )
+
+        # Must not raise — that was the original bug.
+        imagetool.do_search()
+
+        # The new kwarg name landed; the broken one is gone.
+        assert "query_image_data" in captured["kwargs"]
+        assert "query_image_path" not in captured["kwargs"]
+        # And the value is an actual PIL Image, not a Path.
+        assert isinstance(captured["kwargs"]["query_image_data"], Image.Image)
+        # ``top_k`` is also forwarded.
+        assert captured["kwargs"].get("top_k") == 5
+
+    def test_top_k_is_forwarded(
+        self, monkeypatch, fake_npz, fake_query_image
+    ):
+        captured: dict[str, object] = {}
+
+        def fake_search(self, *args, **kwargs):
+            captured["kwargs"] = kwargs
+            return [], []
+
+        monkeypatch.setattr(
+            "photomap.backend.embeddings.Embeddings.search_images_by_text_and_image",
+            fake_search,
+        )
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            [
+                "search_images",
+                str(fake_query_image),
+                "--embeddings",
+                str(fake_npz),
+                "--top_k",
+                "17",
+            ],
+        )
+
+        imagetool.do_search()
+        assert captured["kwargs"]["top_k"] == 17
+
+
+# ---------------------------------------------------------------------------
+# do_text_search — parallel coverage; same shape, different query type
+# ---------------------------------------------------------------------------
+
+
+class TestDoTextSearch:
+    def test_passes_text_via_positive_query_kwarg(
+        self, monkeypatch, fake_npz
+    ):
+        captured: dict[str, object] = {}
+
+        def fake_search(self, *args, **kwargs):
+            captured["kwargs"] = kwargs
+            return [], []
+
+        monkeypatch.setattr(
+            "photomap.backend.embeddings.Embeddings.search_images_by_text_and_image",
+            fake_search,
+        )
+        monkeypatch.setattr(
+            sys,
+            "argv",
+            ["search_text", "a cat on a skateboard", "--embeddings", str(fake_npz)],
+        )
+
+        imagetool.do_text_search()
+        assert captured["kwargs"].get("positive_query") == "a cat on a skateboard"
+        assert captured["kwargs"].get("top_k") == 5
+
+
+# ---------------------------------------------------------------------------
+# main() dispatch — sys.argv[0] basename → handler routing
+# ---------------------------------------------------------------------------
+
+
+class TestMainDispatch:
+    @pytest.mark.parametrize(
+        "argv0,handler_name",
+        [
+            ("search_images", "do_search"),
+            ("search_text", "do_text_search"),
+            ("index_images", "do_index"),
+            ("update_images", "do_update_images"),
+            ("find_duplicate_images", "do_duplicate_search"),
+            ("rebuild_cluster_labels", "do_rebuild_cluster_labels"),
+        ],
+    )
+    def test_main_routes_by_basename(self, monkeypatch, argv0, handler_name):
+        called: list[str] = []
+        monkeypatch.setattr(
+            imagetool, handler_name, lambda: called.append(handler_name)
+        )
+        monkeypatch.setattr(sys, "argv", [argv0])
+        imagetool.main()
+        assert called == [handler_name]