lstein · lstein · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/photomap/backend/embeddings.py b/photomap/backend/embeddings.py
@@ -781,22 +781,55 @@ def _save_embeddings(self, index_result: IndexResult) -> None:
         # Clear cache after saving
         _open_npz_file.cache_clear()
 
+    @staticmethod
+    def _path_compare_key(p: Path) -> str:
+        """Canonical key for the new-vs-missing diff in
+        :meth:`_get_new_and_missing_images`.
+
+        Returns the casefolded posix-form of the path. Casefolding is what
+        lets the diff stay correct on **case-insensitive filesystems**
+        (Windows NTFS by default, macOS HFS+ and case-insensitive APFS):
+        when an image is stored in the cache as ``Photo.JPG`` and later
+        scanned from disk as ``photo.jpg``, plain Path equality treats
+        them as different entries — both as new (re-encoded) and as
+        missing (orphaned in the cache).
+
+        Trade-off on case-sensitive Linux: two files in the same
+        directory whose names differ only in case (``IMG.jpg`` and
+        ``img.jpg``) collapse to a single index entry. That collision is
+        rare and harmless — the same image will simply be indexed once
+        instead of twice — and is far less destructive than the silent
+        double-encoding the previous Path-equality diff produced on the
+        majority of user platforms.
+        """
+        return p.as_posix().casefold()
+
     def _get_new_and_missing_images(
         self,
         image_paths_or_dir: list[Path] | Path,
         existing_filenames: np.ndarray,
         progress_callback: Callable | None = None,
     ) -> tuple[set[Path], set[Path]]:
         """Determine which images are new and which are missing."""
-        image_path_set = set(
-            self.get_image_files(
-                image_paths_or_dir, progress_callback=progress_callback
-            )
+        live_paths = self.get_image_files(
+            image_paths_or_dir, progress_callback=progress_callback
         )
-        existing_filenames_set = set(Path(p) for p in existing_filenames)
-
-        new_image_paths = image_path_set - existing_filenames_set
-        missing_image_paths = existing_filenames_set - image_path_set
+        # Build map from casefolded posix key to the *original-case* Path.
+        # The casefolded keys drive the set diff; the original Paths flow
+        # back out so downstream filesystem operations see what the disk
+        # actually holds.
+        live_by_key: dict[str, Path] = {
+            self._path_compare_key(p): p for p in live_paths
+        }
+        existing_by_key: dict[str, Path] = {
+            self._path_compare_key(Path(p)): Path(p) for p in existing_filenames
+        }
+
+        new_keys = set(live_by_key) - set(existing_by_key)
+        missing_keys = set(existing_by_key) - set(live_by_key)
+
+        new_image_paths = {live_by_key[k] for k in new_keys}
+        missing_image_paths = {existing_by_key[k] for k in missing_keys}
 
         return new_image_paths, missing_image_paths
 
@@ -1342,6 +1375,25 @@ def search_images_by_text_and_image(
         if use_query_optimization is not None and hasattr(encoder, "use_ensembling"):
             encoder.use_ensembling = use_query_optimization
 
+        # Pre-declare every GPU tensor and downstream numpy buffer the finally
+        # block needs to release. The previous ``del locals()[name]`` loop was
+        # a no-op — locals() returns a dict copy in functions, so the actual
+        # local bindings (and the VRAM they pinned) lived on until the frame
+        # itself was destroyed. Binding here lets the finally use plain ``del``
+        # to drop the references *before* ``_cleanup_cuda_memory`` runs
+        # ``torch.cuda.empty_cache()``, so the allocator can actually reclaim
+        # the freed memory instead of leaving it pinned.
+        image_embedding = None
+        pos_emb = None
+        neg_emb = None
+        embeddings_tensor = None
+        norm_embeddings = None
+        cos_img = None
+        cos_pos = None
+        cos_neg = None
+        positive_score_sum = None
+        similarities = None
+
         try:
             self._check_cache_compatibility(data, encoder)
 
@@ -1356,9 +1408,6 @@ def search_images_by_text_and_image(
                 return [], []
 
             # Encode only the inputs that will actually contribute.
-            image_embedding = None
-            pos_emb = None
-            neg_emb = None
             if image_weight > 0.0:
                 pil_image = ImageOps.exif_transpose(query_image_data).convert("RGB")
                 image_embedding = torch.from_numpy(
@@ -1426,22 +1475,15 @@ def search_images_by_text_and_image(
 
             return result_indices, result_similarities
         finally:
-            # Drop any local tensors so VRAM doesn't accumulate across queries.
-            # The encoder itself is cached and intentionally NOT closed here.
-            for name in (
-                "image_embedding",
-                "pos_emb",
-                "neg_emb",
-                "embeddings_tensor",
-                "norm_embeddings",
-                "cos_img",
-                "cos_pos",
-                "cos_neg",
-                "positive_score_sum",
-                "similarities",
-            ):
-                if name in locals():
-                    del locals()[name]
+            # Drop any local tensors / arrays so VRAM doesn't accumulate
+            # across queries. All ten names are unconditionally bound above
+            # (initially to None), so plain ``del`` is safe — no NameError
+            # paths to guard against. The encoder itself is cached and
+            # intentionally NOT closed here.
+            del image_embedding, pos_emb, neg_emb
+            del embeddings_tensor, norm_embeddings
+            del cos_img, cos_pos, cos_neg
+            del positive_score_sum, similarities
             self._cleanup_cuda_memory(device)
 
     def find_duplicate_clusters(self, similarity_threshold=0.995):

diff --git a/photomap/backend/imagetool.py b/photomap/backend/imagetool.py
@@ -5,6 +5,8 @@
 import sys
 from pathlib import Path
 
+from PIL import Image
+
 from .embeddings import Embeddings
 
 
@@ -105,8 +107,15 @@ def do_search():
     args = parser.parse_args()
     embeddings = Embeddings(embeddings_path=args.embeddings)
 
-    results, scores = embeddings.search_images_by_text_and_image(query_image_path=args.search,
-                                                                 top_k=args.top_k)
+    # ``search_images_by_text_and_image`` takes a *PIL Image*, not a path —
+    # the prior ``query_image_path=`` kwarg name didn't exist on the
+    # function and raised TypeError on every invocation. Open + decode here
+    # so the CLI matches the function's actual signature.
+    with Image.open(args.search) as query_image:
+        results, scores = embeddings.search_images_by_text_and_image(
+            query_image_data=query_image,
+            top_k=args.top_k,
+        )
     print("Top similar images:")
     for filename, score in zip(results, scores, strict=False):
         print(f"{filename}: {score:.4f}")

diff --git a/tests/backend/test_embeddings_diff.py b/tests/backend/test_embeddings_diff.py
@@ -0,0 +1,144 @@
+"""Tests for the new-vs-missing path diff in
+:class:`photomap.backend.embeddings.Embeddings`.
+
+The diff used to use plain Path equality, which is always case-sensitive
+regardless of the underlying filesystem. On case-insensitive filesystems
+(Windows NTFS, macOS HFS+/APFS-CI) that meant the same image referenced
+with different case in its filename — e.g. ``Photo.JPG`` in the .npz
+cache and ``photo.jpg`` on disk — would be classified as both *new* and
+*missing*, silently double-encoding the image and orphaning the cache
+row. The fix funnels both sides through a casefolded posix-form key.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import patch
+
+import numpy as np
+
+from photomap.backend.embeddings import Embeddings
+
+# ---------------------------------------------------------------------------
+# _path_compare_key — direct unit test of the canonical-key derivation
+# ---------------------------------------------------------------------------
+
+
+class TestPathCompareKey:
+    def test_same_case_unchanged(self):
+        assert Embeddings._path_compare_key(Path("/a/b/c.jpg")) == "/a/b/c.jpg"
+
+    def test_uppercase_extension_casefolds(self):
+        assert Embeddings._path_compare_key(
+            Path("/a/b/Photo.JPG")
+        ) == "/a/b/photo.jpg"
+
+    def test_mixed_case_path_casefolds(self):
+        assert Embeddings._path_compare_key(
+            Path("/Users/Alice/Photos/IMG_001.PNG")
+        ) == "/users/alice/photos/img_001.png"
+
+    def test_windows_style_path_normalised_to_posix(self):
+        # ``PurePosixPath`` would round-trip ``\\`` unchanged. On Linux test
+        # runners ``Path`` is a PosixPath, so a literal backslash stays as
+        # a character in the basename — but the casefold still applies so
+        # the comparison still works the way we need it to.
+        assert Embeddings._path_compare_key(
+            Path("C:/Users/Alice/Photo.JPG")
+        ).endswith("photo.jpg")
+
+
+# ---------------------------------------------------------------------------
+# _get_new_and_missing_images — set diff using the canonical key
+# ---------------------------------------------------------------------------
+
+
+def _embeddings_stub(tmp_path: Path) -> Embeddings:
+    """Construct an ``Embeddings`` pointed at a path inside ``tmp_path`` so
+    the encoder spec and clip root are valid but no model is ever loaded —
+    we only exercise the pure-Python diff method here."""
+    return Embeddings(embeddings_path=tmp_path / "stub.npz")
+
+
+class TestNewAndMissingDiff:
+    def test_exact_match_yields_empty_diff(self, tmp_path):
+        emb = _embeddings_stub(tmp_path)
+        live = [Path("/album/a.jpg"), Path("/album/b.jpg")]
+        existing = np.array(["/album/a.jpg", "/album/b.jpg"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        assert new == set()
+        assert missing == set()
+
+    def test_added_file_appears_in_new(self, tmp_path):
+        emb = _embeddings_stub(tmp_path)
+        live = [Path("/album/a.jpg"), Path("/album/b.jpg")]
+        existing = np.array(["/album/a.jpg"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        assert new == {Path("/album/b.jpg")}
+        assert missing == set()
+
+    def test_removed_file_appears_in_missing(self, tmp_path):
+        emb = _embeddings_stub(tmp_path)
+        live = [Path("/album/a.jpg")]
+        existing = np.array(["/album/a.jpg", "/album/b.jpg"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        assert new == set()
+        assert missing == {Path("/album/b.jpg")}
+
+    def test_case_only_rename_is_not_double_encoded(self, tmp_path):
+        """Regression test for the case-insensitive-FS bug.
+
+        The same file is recorded in the cache as ``Photo.JPG`` (upper)
+        and scanned from the filesystem as ``photo.jpg`` (lower). On a
+        case-insensitive FS these are the same file; on a case-sensitive
+        FS Python can't tell either way. The diff should treat them as
+        the same entry — neither new nor missing.
+        """
+        emb = _embeddings_stub(tmp_path)
+        live = [Path("/album/photo.jpg")]
+        existing = np.array(["/album/Photo.JPG"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        assert new == set(), "expected no new file (case-folded match)"
+        assert missing == set(), "expected no missing file (case-folded match)"
+
+    def test_case_only_rename_preserves_live_casing_for_new_files(self, tmp_path):
+        """When a *new* path differs only in case from a cached path, the
+        Path returned in ``missing`` is the cache-stored original-case
+        version (preserved for downstream filesystem-mask work in
+        ``_filter_missing_images``)."""
+        emb = _embeddings_stub(tmp_path)
+        # Live scan has lowercase only; cache has upper for one file plus a
+        # genuinely new mixed-case entry.
+        live = [Path("/album/photo.jpg"), Path("/album/NewShot.PNG")]
+        existing = np.array(["/album/Photo.JPG"])
+
+        with patch.object(Embeddings, "get_image_files", return_value=live):
+            new, missing = emb._get_new_and_missing_images(
+                image_paths_or_dir=live, existing_filenames=existing
+            )
+
+        # ``NewShot.PNG`` is genuinely new — and the original casing is
+        # preserved so PIL.open() and EXIF read use the actual file name.
+        assert new == {Path("/album/NewShot.PNG")}
+        # The case-only-rename file is neither new nor missing.
+        assert missing == set()