From 8dc61a104042e942729c9546fc8a51ed91161865 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Fri, 22 May 2026 20:03:45 -0400 Subject: [PATCH] fix(embeddings,cli): case-insensitive path diff, VRAM release, broken CLI kwarg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small but real defects in embeddings.py / imagetool.py — the last three backend items from the 2026-05-20 code review. ## 1. _get_new_and_missing_images: case-insensitive Path comparison The new-vs-missing diff used plain ``set[Path]`` operations. Python's ``Path.__eq__`` is *always* case-sensitive, regardless of the underlying filesystem. On the dominant user platforms — Windows NTFS and macOS HFS+/APFS-CI — that meant the same image referenced with different case in its filename (``Photo.JPG`` in the cache, ``photo.jpg`` on disk) was classified as **both new and missing**: new = {photo.jpg} ← re-encoded (whole CLIP pass) missing = {Photo.JPG} ← orphaned in the cache Net effect: silent double-encoding on every re-index, plus a stale row that ``_filter_missing_images`` then drops, plus stale UMAP and cluster-label caches keyed off the bogus filename count. The fix funnels both sides through ``_path_compare_key``, which returns ``p.as_posix().casefold()``. The casefolded keys drive the set diff; the original-case Paths flow back through ``live_by_key`` and ``existing_by_key`` so downstream filesystem operations see what the disk actually holds. Trade-off on case-sensitive Linux: two files in the same directory whose names differ only in case collapse to one entry. That collision is unusual and harmless — the image is simply indexed once instead of twice — and far less destructive than the silent double-encoding the previous Path-equality diff produced on the majority of user platforms. ## 2. search_images_by_text_and_image: VRAM release was a no-op The ``finally`` block in this hot search path tried to drop GPU tensor references before calling ``_cleanup_cuda_memory``: for name in ("image_embedding", "pos_emb", ...): if name in locals(): del locals()[name] self._cleanup_cuda_memory(device) ``locals()`` returns a *snapshot dict* in functions — mutating it doesn't touch the actual local bindings. The ``del`` was a no-op, the local refs lived on until the function frame was destroyed, and ``torch.cuda.empty_cache()`` couldn't reclaim memory pinned by those refs. VRAM accumulated across search queries until the search method returned. Pre-declare all ten potentially-allocated tensors / arrays to ``None`` at the start of the try (so they're unconditionally bound), then ``del`` them explicitly in the finally before ``_cleanup_cuda_memory``. The allocator can now actually reclaim the freed memory. ## 3. imagetool.do_search: TypeError on every first call The ``search_images`` CLI passed ``query_image_path=args.search`` to ``Embeddings.search_images_by_text_and_image``. That function's signature has no ``query_image_path`` parameter — it takes ``query_image_data: Image.Image``. Python raised ``TypeError: got an unexpected keyword argument 'query_image_path'`` before the encoder was ever touched, so the script has been broken since the parameter was renamed. Fix: open the path with PIL inside a context manager and pass ``query_image_data=img``. Verified by a new test that intercepts the underlying call and asserts the kwarg name. ## Test coverage added * ``tests/backend/test_embeddings_diff.py`` — 9 tests covering ``_path_compare_key`` directly plus four ``_get_new_and_missing_images`` scenarios, including the case-only-rename regression. * ``tests/backend/test_imagetool.py`` — 9 tests covering ``do_search`` / ``do_text_search`` kwarg shapes and ``main()``'s basename → handler dispatch. 297 backend tests pass (was 279). Co-Authored-By: Claude Opus 4.7 (1M context) --- photomap/backend/embeddings.py | 96 ++++++++++---- photomap/backend/imagetool.py | 13 +- tests/backend/test_embeddings_diff.py | 144 +++++++++++++++++++++ tests/backend/test_imagetool.py | 175 ++++++++++++++++++++++++++ 4 files changed, 399 insertions(+), 29 deletions(-) create mode 100644 tests/backend/test_embeddings_diff.py create mode 100644 tests/backend/test_imagetool.py diff --git a/photomap/backend/embeddings.py b/photomap/backend/embeddings.py index ab378406..e27061e3 100644 --- a/photomap/backend/embeddings.py +++ b/photomap/backend/embeddings.py @@ -781,6 +781,29 @@ def _save_embeddings(self, index_result: IndexResult) -> None: # Clear cache after saving _open_npz_file.cache_clear() + @staticmethod + def _path_compare_key(p: Path) -> str: + """Canonical key for the new-vs-missing diff in + :meth:`_get_new_and_missing_images`. + + Returns the casefolded posix-form of the path. Casefolding is what + lets the diff stay correct on **case-insensitive filesystems** + (Windows NTFS by default, macOS HFS+ and case-insensitive APFS): + when an image is stored in the cache as ``Photo.JPG`` and later + scanned from disk as ``photo.jpg``, plain Path equality treats + them as different entries — both as new (re-encoded) and as + missing (orphaned in the cache). + + Trade-off on case-sensitive Linux: two files in the same + directory whose names differ only in case (``IMG.jpg`` and + ``img.jpg``) collapse to a single index entry. That collision is + rare and harmless — the same image will simply be indexed once + instead of twice — and is far less destructive than the silent + double-encoding the previous Path-equality diff produced on the + majority of user platforms. + """ + return p.as_posix().casefold() + def _get_new_and_missing_images( self, image_paths_or_dir: list[Path] | Path, @@ -788,15 +811,25 @@ def _get_new_and_missing_images( progress_callback: Callable | None = None, ) -> tuple[set[Path], set[Path]]: """Determine which images are new and which are missing.""" - image_path_set = set( - self.get_image_files( - image_paths_or_dir, progress_callback=progress_callback - ) + live_paths = self.get_image_files( + image_paths_or_dir, progress_callback=progress_callback ) - existing_filenames_set = set(Path(p) for p in existing_filenames) - - new_image_paths = image_path_set - existing_filenames_set - missing_image_paths = existing_filenames_set - image_path_set + # Build map from casefolded posix key to the *original-case* Path. + # The casefolded keys drive the set diff; the original Paths flow + # back out so downstream filesystem operations see what the disk + # actually holds. + live_by_key: dict[str, Path] = { + self._path_compare_key(p): p for p in live_paths + } + existing_by_key: dict[str, Path] = { + self._path_compare_key(Path(p)): Path(p) for p in existing_filenames + } + + new_keys = set(live_by_key) - set(existing_by_key) + missing_keys = set(existing_by_key) - set(live_by_key) + + new_image_paths = {live_by_key[k] for k in new_keys} + missing_image_paths = {existing_by_key[k] for k in missing_keys} return new_image_paths, missing_image_paths @@ -1342,6 +1375,25 @@ def search_images_by_text_and_image( if use_query_optimization is not None and hasattr(encoder, "use_ensembling"): encoder.use_ensembling = use_query_optimization + # Pre-declare every GPU tensor and downstream numpy buffer the finally + # block needs to release. The previous ``del locals()[name]`` loop was + # a no-op — locals() returns a dict copy in functions, so the actual + # local bindings (and the VRAM they pinned) lived on until the frame + # itself was destroyed. Binding here lets the finally use plain ``del`` + # to drop the references *before* ``_cleanup_cuda_memory`` runs + # ``torch.cuda.empty_cache()``, so the allocator can actually reclaim + # the freed memory instead of leaving it pinned. + image_embedding = None + pos_emb = None + neg_emb = None + embeddings_tensor = None + norm_embeddings = None + cos_img = None + cos_pos = None + cos_neg = None + positive_score_sum = None + similarities = None + try: self._check_cache_compatibility(data, encoder) @@ -1356,9 +1408,6 @@ def search_images_by_text_and_image( return [], [] # Encode only the inputs that will actually contribute. - image_embedding = None - pos_emb = None - neg_emb = None if image_weight > 0.0: pil_image = ImageOps.exif_transpose(query_image_data).convert("RGB") image_embedding = torch.from_numpy( @@ -1426,22 +1475,15 @@ def search_images_by_text_and_image( return result_indices, result_similarities finally: - # Drop any local tensors so VRAM doesn't accumulate across queries. - # The encoder itself is cached and intentionally NOT closed here. - for name in ( - "image_embedding", - "pos_emb", - "neg_emb", - "embeddings_tensor", - "norm_embeddings", - "cos_img", - "cos_pos", - "cos_neg", - "positive_score_sum", - "similarities", - ): - if name in locals(): - del locals()[name] + # Drop any local tensors / arrays so VRAM doesn't accumulate + # across queries. All ten names are unconditionally bound above + # (initially to None), so plain ``del`` is safe — no NameError + # paths to guard against. The encoder itself is cached and + # intentionally NOT closed here. + del image_embedding, pos_emb, neg_emb + del embeddings_tensor, norm_embeddings + del cos_img, cos_pos, cos_neg + del positive_score_sum, similarities self._cleanup_cuda_memory(device) def find_duplicate_clusters(self, similarity_threshold=0.995): diff --git a/photomap/backend/imagetool.py b/photomap/backend/imagetool.py index f8002c28..a6a71e51 100644 --- a/photomap/backend/imagetool.py +++ b/photomap/backend/imagetool.py @@ -5,6 +5,8 @@ import sys from pathlib import Path +from PIL import Image + from .embeddings import Embeddings @@ -105,8 +107,15 @@ def do_search(): args = parser.parse_args() embeddings = Embeddings(embeddings_path=args.embeddings) - results, scores = embeddings.search_images_by_text_and_image(query_image_path=args.search, - top_k=args.top_k) + # ``search_images_by_text_and_image`` takes a *PIL Image*, not a path — + # the prior ``query_image_path=`` kwarg name didn't exist on the + # function and raised TypeError on every invocation. Open + decode here + # so the CLI matches the function's actual signature. + with Image.open(args.search) as query_image: + results, scores = embeddings.search_images_by_text_and_image( + query_image_data=query_image, + top_k=args.top_k, + ) print("Top similar images:") for filename, score in zip(results, scores, strict=False): print(f"{filename}: {score:.4f}") diff --git a/tests/backend/test_embeddings_diff.py b/tests/backend/test_embeddings_diff.py new file mode 100644 index 00000000..b6355727 --- /dev/null +++ b/tests/backend/test_embeddings_diff.py @@ -0,0 +1,144 @@ +"""Tests for the new-vs-missing path diff in +:class:`photomap.backend.embeddings.Embeddings`. + +The diff used to use plain Path equality, which is always case-sensitive +regardless of the underlying filesystem. On case-insensitive filesystems +(Windows NTFS, macOS HFS+/APFS-CI) that meant the same image referenced +with different case in its filename — e.g. ``Photo.JPG`` in the .npz +cache and ``photo.jpg`` on disk — would be classified as both *new* and +*missing*, silently double-encoding the image and orphaning the cache +row. The fix funnels both sides through a casefolded posix-form key. +""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import patch + +import numpy as np + +from photomap.backend.embeddings import Embeddings + +# --------------------------------------------------------------------------- +# _path_compare_key — direct unit test of the canonical-key derivation +# --------------------------------------------------------------------------- + + +class TestPathCompareKey: + def test_same_case_unchanged(self): + assert Embeddings._path_compare_key(Path("/a/b/c.jpg")) == "/a/b/c.jpg" + + def test_uppercase_extension_casefolds(self): + assert Embeddings._path_compare_key( + Path("/a/b/Photo.JPG") + ) == "/a/b/photo.jpg" + + def test_mixed_case_path_casefolds(self): + assert Embeddings._path_compare_key( + Path("/Users/Alice/Photos/IMG_001.PNG") + ) == "/users/alice/photos/img_001.png" + + def test_windows_style_path_normalised_to_posix(self): + # ``PurePosixPath`` would round-trip ``\\`` unchanged. On Linux test + # runners ``Path`` is a PosixPath, so a literal backslash stays as + # a character in the basename — but the casefold still applies so + # the comparison still works the way we need it to. + assert Embeddings._path_compare_key( + Path("C:/Users/Alice/Photo.JPG") + ).endswith("photo.jpg") + + +# --------------------------------------------------------------------------- +# _get_new_and_missing_images — set diff using the canonical key +# --------------------------------------------------------------------------- + + +def _embeddings_stub(tmp_path: Path) -> Embeddings: + """Construct an ``Embeddings`` pointed at a path inside ``tmp_path`` so + the encoder spec and clip root are valid but no model is ever loaded — + we only exercise the pure-Python diff method here.""" + return Embeddings(embeddings_path=tmp_path / "stub.npz") + + +class TestNewAndMissingDiff: + def test_exact_match_yields_empty_diff(self, tmp_path): + emb = _embeddings_stub(tmp_path) + live = [Path("/album/a.jpg"), Path("/album/b.jpg")] + existing = np.array(["/album/a.jpg", "/album/b.jpg"]) + + with patch.object(Embeddings, "get_image_files", return_value=live): + new, missing = emb._get_new_and_missing_images( + image_paths_or_dir=live, existing_filenames=existing + ) + + assert new == set() + assert missing == set() + + def test_added_file_appears_in_new(self, tmp_path): + emb = _embeddings_stub(tmp_path) + live = [Path("/album/a.jpg"), Path("/album/b.jpg")] + existing = np.array(["/album/a.jpg"]) + + with patch.object(Embeddings, "get_image_files", return_value=live): + new, missing = emb._get_new_and_missing_images( + image_paths_or_dir=live, existing_filenames=existing + ) + + assert new == {Path("/album/b.jpg")} + assert missing == set() + + def test_removed_file_appears_in_missing(self, tmp_path): + emb = _embeddings_stub(tmp_path) + live = [Path("/album/a.jpg")] + existing = np.array(["/album/a.jpg", "/album/b.jpg"]) + + with patch.object(Embeddings, "get_image_files", return_value=live): + new, missing = emb._get_new_and_missing_images( + image_paths_or_dir=live, existing_filenames=existing + ) + + assert new == set() + assert missing == {Path("/album/b.jpg")} + + def test_case_only_rename_is_not_double_encoded(self, tmp_path): + """Regression test for the case-insensitive-FS bug. + + The same file is recorded in the cache as ``Photo.JPG`` (upper) + and scanned from the filesystem as ``photo.jpg`` (lower). On a + case-insensitive FS these are the same file; on a case-sensitive + FS Python can't tell either way. The diff should treat them as + the same entry — neither new nor missing. + """ + emb = _embeddings_stub(tmp_path) + live = [Path("/album/photo.jpg")] + existing = np.array(["/album/Photo.JPG"]) + + with patch.object(Embeddings, "get_image_files", return_value=live): + new, missing = emb._get_new_and_missing_images( + image_paths_or_dir=live, existing_filenames=existing + ) + + assert new == set(), "expected no new file (case-folded match)" + assert missing == set(), "expected no missing file (case-folded match)" + + def test_case_only_rename_preserves_live_casing_for_new_files(self, tmp_path): + """When a *new* path differs only in case from a cached path, the + Path returned in ``missing`` is the cache-stored original-case + version (preserved for downstream filesystem-mask work in + ``_filter_missing_images``).""" + emb = _embeddings_stub(tmp_path) + # Live scan has lowercase only; cache has upper for one file plus a + # genuinely new mixed-case entry. + live = [Path("/album/photo.jpg"), Path("/album/NewShot.PNG")] + existing = np.array(["/album/Photo.JPG"]) + + with patch.object(Embeddings, "get_image_files", return_value=live): + new, missing = emb._get_new_and_missing_images( + image_paths_or_dir=live, existing_filenames=existing + ) + + # ``NewShot.PNG`` is genuinely new — and the original casing is + # preserved so PIL.open() and EXIF read use the actual file name. + assert new == {Path("/album/NewShot.PNG")} + # The case-only-rename file is neither new nor missing. + assert missing == set() diff --git a/tests/backend/test_imagetool.py b/tests/backend/test_imagetool.py new file mode 100644 index 00000000..8fec0c56 --- /dev/null +++ b/tests/backend/test_imagetool.py @@ -0,0 +1,175 @@ +"""Smoke tests for :mod:`photomap.backend.imagetool` CLI entry points. + +These ensure each script can dispatch through ``main()`` and that the +underlying ``Embeddings`` methods are called with kwarg names that +actually exist. Regression coverage for the long-standing +``query_image_path=`` typo in ``do_search`` that raised ``TypeError`` on +every first call. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import numpy as np +import pytest +from PIL import Image + +from photomap.backend import imagetool + +# --------------------------------------------------------------------------- +# Shared fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def fake_npz(tmp_path: Path) -> Path: + """Minimal ``.npz`` whose existence is enough for ``Embeddings(...)`` to + construct successfully. The CLI never reads from it because we stub the + search method, but ``do_update_images`` checks ``os.path.exists`` first.""" + path = tmp_path / "fake.npz" + np.savez( + path, + embeddings=np.zeros((0, 16), dtype=np.float32), + filenames=np.array([], dtype=object), + modification_times=np.array([], dtype=np.float64), + metadata=np.array([], dtype=object), + model_id=np.array("fake:test-encoder"), + embedding_dim=np.array(16), + ) + return path + + +@pytest.fixture +def fake_query_image(tmp_path: Path) -> Path: + """Tiny PNG so ``Image.open(...)`` inside ``do_search`` works.""" + path = tmp_path / "query.png" + Image.new("RGB", (8, 8), color=(127, 64, 32)).save(path) + return path + + +# --------------------------------------------------------------------------- +# do_search — regression test for the query_image_path typo +# --------------------------------------------------------------------------- + + +class TestDoSearch: + def test_passes_pil_image_via_query_image_data_kwarg( + self, monkeypatch, fake_npz, fake_query_image, capsys + ): + """The old CLI passed ``query_image_path=Path(...)`` which raised + ``TypeError: got an unexpected keyword argument 'query_image_path'`` + before ever reaching the encoder. The fix opens the file as a PIL + Image and passes ``query_image_data=img``.""" + captured: dict[str, object] = {} + + def fake_search(self, *args, **kwargs): + captured["args"] = args + captured["kwargs"] = kwargs + return [], [] + + monkeypatch.setattr( + "photomap.backend.embeddings.Embeddings.search_images_by_text_and_image", + fake_search, + ) + monkeypatch.setattr( + sys, "argv", ["search_images", str(fake_query_image), "--embeddings", str(fake_npz)] + ) + + # Must not raise — that was the original bug. + imagetool.do_search() + + # The new kwarg name landed; the broken one is gone. + assert "query_image_data" in captured["kwargs"] + assert "query_image_path" not in captured["kwargs"] + # And the value is an actual PIL Image, not a Path. + assert isinstance(captured["kwargs"]["query_image_data"], Image.Image) + # ``top_k`` is also forwarded. + assert captured["kwargs"].get("top_k") == 5 + + def test_top_k_is_forwarded( + self, monkeypatch, fake_npz, fake_query_image + ): + captured: dict[str, object] = {} + + def fake_search(self, *args, **kwargs): + captured["kwargs"] = kwargs + return [], [] + + monkeypatch.setattr( + "photomap.backend.embeddings.Embeddings.search_images_by_text_and_image", + fake_search, + ) + monkeypatch.setattr( + sys, + "argv", + [ + "search_images", + str(fake_query_image), + "--embeddings", + str(fake_npz), + "--top_k", + "17", + ], + ) + + imagetool.do_search() + assert captured["kwargs"]["top_k"] == 17 + + +# --------------------------------------------------------------------------- +# do_text_search — parallel coverage; same shape, different query type +# --------------------------------------------------------------------------- + + +class TestDoTextSearch: + def test_passes_text_via_positive_query_kwarg( + self, monkeypatch, fake_npz + ): + captured: dict[str, object] = {} + + def fake_search(self, *args, **kwargs): + captured["kwargs"] = kwargs + return [], [] + + monkeypatch.setattr( + "photomap.backend.embeddings.Embeddings.search_images_by_text_and_image", + fake_search, + ) + monkeypatch.setattr( + sys, + "argv", + ["search_text", "a cat on a skateboard", "--embeddings", str(fake_npz)], + ) + + imagetool.do_text_search() + assert captured["kwargs"].get("positive_query") == "a cat on a skateboard" + assert captured["kwargs"].get("top_k") == 5 + + +# --------------------------------------------------------------------------- +# main() dispatch — sys.argv[0] basename → handler routing +# --------------------------------------------------------------------------- + + +class TestMainDispatch: + @pytest.mark.parametrize( + "argv0,handler_name", + [ + ("search_images", "do_search"), + ("search_text", "do_text_search"), + ("index_images", "do_index"), + ("update_images", "do_update_images"), + ("find_duplicate_images", "do_duplicate_search"), + ("rebuild_cluster_labels", "do_rebuild_cluster_labels"), + ], + ) + def test_main_routes_by_basename(self, monkeypatch, argv0, handler_name): + called: list[str] = [] + monkeypatch.setattr( + imagetool, handler_name, lambda: called.append(handler_name) + ) + monkeypatch.setattr(sys, "argv", [argv0]) + imagetool.main() + assert called == [handler_name]