Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 69 additions & 27 deletions photomap/backend/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,22 +781,55 @@ def _save_embeddings(self, index_result: IndexResult) -> None:
# Clear cache after saving
_open_npz_file.cache_clear()

@staticmethod
def _path_compare_key(p: Path) -> str:
"""Canonical key for the new-vs-missing diff in
:meth:`_get_new_and_missing_images`.

Returns the casefolded posix-form of the path. Casefolding is what
lets the diff stay correct on **case-insensitive filesystems**
(Windows NTFS by default, macOS HFS+ and case-insensitive APFS):
when an image is stored in the cache as ``Photo.JPG`` and later
scanned from disk as ``photo.jpg``, plain Path equality treats
them as different entries — both as new (re-encoded) and as
missing (orphaned in the cache).

Trade-off on case-sensitive Linux: two files in the same
directory whose names differ only in case (``IMG.jpg`` and
``img.jpg``) collapse to a single index entry. That collision is
rare and harmless — the same image will simply be indexed once
instead of twice — and is far less destructive than the silent
double-encoding the previous Path-equality diff produced on the
majority of user platforms.
"""
return p.as_posix().casefold()

def _get_new_and_missing_images(
self,
image_paths_or_dir: list[Path] | Path,
existing_filenames: np.ndarray,
progress_callback: Callable | None = None,
) -> tuple[set[Path], set[Path]]:
"""Determine which images are new and which are missing."""
image_path_set = set(
self.get_image_files(
image_paths_or_dir, progress_callback=progress_callback
)
live_paths = self.get_image_files(
image_paths_or_dir, progress_callback=progress_callback
)
existing_filenames_set = set(Path(p) for p in existing_filenames)

new_image_paths = image_path_set - existing_filenames_set
missing_image_paths = existing_filenames_set - image_path_set
# Build map from casefolded posix key to the *original-case* Path.
# The casefolded keys drive the set diff; the original Paths flow
# back out so downstream filesystem operations see what the disk
# actually holds.
live_by_key: dict[str, Path] = {
self._path_compare_key(p): p for p in live_paths
}
existing_by_key: dict[str, Path] = {
self._path_compare_key(Path(p)): Path(p) for p in existing_filenames
}

new_keys = set(live_by_key) - set(existing_by_key)
missing_keys = set(existing_by_key) - set(live_by_key)

new_image_paths = {live_by_key[k] for k in new_keys}
missing_image_paths = {existing_by_key[k] for k in missing_keys}

return new_image_paths, missing_image_paths

Expand Down Expand Up @@ -1342,6 +1375,25 @@ def search_images_by_text_and_image(
if use_query_optimization is not None and hasattr(encoder, "use_ensembling"):
encoder.use_ensembling = use_query_optimization

# Pre-declare every GPU tensor and downstream numpy buffer the finally
# block needs to release. The previous ``del locals()[name]`` loop was
# a no-op — locals() returns a dict copy in functions, so the actual
# local bindings (and the VRAM they pinned) lived on until the frame
# itself was destroyed. Binding here lets the finally use plain ``del``
# to drop the references *before* ``_cleanup_cuda_memory`` runs
# ``torch.cuda.empty_cache()``, so the allocator can actually reclaim
# the freed memory instead of leaving it pinned.
image_embedding = None
pos_emb = None
neg_emb = None
embeddings_tensor = None
norm_embeddings = None
cos_img = None
cos_pos = None
cos_neg = None
positive_score_sum = None
similarities = None

try:
self._check_cache_compatibility(data, encoder)

Expand All @@ -1356,9 +1408,6 @@ def search_images_by_text_and_image(
return [], []

# Encode only the inputs that will actually contribute.
image_embedding = None
pos_emb = None
neg_emb = None
if image_weight > 0.0:
pil_image = ImageOps.exif_transpose(query_image_data).convert("RGB")
image_embedding = torch.from_numpy(
Expand Down Expand Up @@ -1426,22 +1475,15 @@ def search_images_by_text_and_image(

return result_indices, result_similarities
finally:
# Drop any local tensors so VRAM doesn't accumulate across queries.
# The encoder itself is cached and intentionally NOT closed here.
for name in (
"image_embedding",
"pos_emb",
"neg_emb",
"embeddings_tensor",
"norm_embeddings",
"cos_img",
"cos_pos",
"cos_neg",
"positive_score_sum",
"similarities",
):
if name in locals():
del locals()[name]
# Drop any local tensors / arrays so VRAM doesn't accumulate
# across queries. All ten names are unconditionally bound above
# (initially to None), so plain ``del`` is safe — no NameError
# paths to guard against. The encoder itself is cached and
# intentionally NOT closed here.
del image_embedding, pos_emb, neg_emb
del embeddings_tensor, norm_embeddings
del cos_img, cos_pos, cos_neg
del positive_score_sum, similarities
self._cleanup_cuda_memory(device)

def find_duplicate_clusters(self, similarity_threshold=0.995):
Expand Down
13 changes: 11 additions & 2 deletions photomap/backend/imagetool.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import sys
from pathlib import Path

from PIL import Image

from .embeddings import Embeddings


Expand Down Expand Up @@ -105,8 +107,15 @@ def do_search():
args = parser.parse_args()
embeddings = Embeddings(embeddings_path=args.embeddings)

results, scores = embeddings.search_images_by_text_and_image(query_image_path=args.search,
top_k=args.top_k)
# ``search_images_by_text_and_image`` takes a *PIL Image*, not a path —
# the prior ``query_image_path=`` kwarg name didn't exist on the
# function and raised TypeError on every invocation. Open + decode here
# so the CLI matches the function's actual signature.
with Image.open(args.search) as query_image:
results, scores = embeddings.search_images_by_text_and_image(
query_image_data=query_image,
top_k=args.top_k,
)
print("Top similar images:")
for filename, score in zip(results, scores, strict=False):
print(f"{filename}: {score:.4f}")
Expand Down
144 changes: 144 additions & 0 deletions tests/backend/test_embeddings_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""Tests for the new-vs-missing path diff in
:class:`photomap.backend.embeddings.Embeddings`.

The diff used to use plain Path equality, which is always case-sensitive
regardless of the underlying filesystem. On case-insensitive filesystems
(Windows NTFS, macOS HFS+/APFS-CI) that meant the same image referenced
with different case in its filename — e.g. ``Photo.JPG`` in the .npz
cache and ``photo.jpg`` on disk — would be classified as both *new* and
*missing*, silently double-encoding the image and orphaning the cache
row. The fix funnels both sides through a casefolded posix-form key.
"""

from __future__ import annotations

from pathlib import Path
from unittest.mock import patch

import numpy as np

from photomap.backend.embeddings import Embeddings

# ---------------------------------------------------------------------------
# _path_compare_key — direct unit test of the canonical-key derivation
# ---------------------------------------------------------------------------


class TestPathCompareKey:
def test_same_case_unchanged(self):
assert Embeddings._path_compare_key(Path("/a/b/c.jpg")) == "/a/b/c.jpg"

def test_uppercase_extension_casefolds(self):
assert Embeddings._path_compare_key(
Path("/a/b/Photo.JPG")
) == "/a/b/photo.jpg"

def test_mixed_case_path_casefolds(self):
assert Embeddings._path_compare_key(
Path("/Users/Alice/Photos/IMG_001.PNG")
) == "/users/alice/photos/img_001.png"

def test_windows_style_path_normalised_to_posix(self):
# ``PurePosixPath`` would round-trip ``\\`` unchanged. On Linux test
# runners ``Path`` is a PosixPath, so a literal backslash stays as
# a character in the basename — but the casefold still applies so
# the comparison still works the way we need it to.
assert Embeddings._path_compare_key(
Path("C:/Users/Alice/Photo.JPG")
).endswith("photo.jpg")


# ---------------------------------------------------------------------------
# _get_new_and_missing_images — set diff using the canonical key
# ---------------------------------------------------------------------------


def _embeddings_stub(tmp_path: Path) -> Embeddings:
"""Construct an ``Embeddings`` pointed at a path inside ``tmp_path`` so
the encoder spec and clip root are valid but no model is ever loaded —
we only exercise the pure-Python diff method here."""
return Embeddings(embeddings_path=tmp_path / "stub.npz")


class TestNewAndMissingDiff:
def test_exact_match_yields_empty_diff(self, tmp_path):
emb = _embeddings_stub(tmp_path)
live = [Path("/album/a.jpg"), Path("/album/b.jpg")]
existing = np.array(["/album/a.jpg", "/album/b.jpg"])

with patch.object(Embeddings, "get_image_files", return_value=live):
new, missing = emb._get_new_and_missing_images(
image_paths_or_dir=live, existing_filenames=existing
)

assert new == set()
assert missing == set()

def test_added_file_appears_in_new(self, tmp_path):
emb = _embeddings_stub(tmp_path)
live = [Path("/album/a.jpg"), Path("/album/b.jpg")]
existing = np.array(["/album/a.jpg"])

with patch.object(Embeddings, "get_image_files", return_value=live):
new, missing = emb._get_new_and_missing_images(
image_paths_or_dir=live, existing_filenames=existing
)

assert new == {Path("/album/b.jpg")}
assert missing == set()

def test_removed_file_appears_in_missing(self, tmp_path):
emb = _embeddings_stub(tmp_path)
live = [Path("/album/a.jpg")]
existing = np.array(["/album/a.jpg", "/album/b.jpg"])

with patch.object(Embeddings, "get_image_files", return_value=live):
new, missing = emb._get_new_and_missing_images(
image_paths_or_dir=live, existing_filenames=existing
)

assert new == set()
assert missing == {Path("/album/b.jpg")}

def test_case_only_rename_is_not_double_encoded(self, tmp_path):
"""Regression test for the case-insensitive-FS bug.

The same file is recorded in the cache as ``Photo.JPG`` (upper)
and scanned from the filesystem as ``photo.jpg`` (lower). On a
case-insensitive FS these are the same file; on a case-sensitive
FS Python can't tell either way. The diff should treat them as
the same entry — neither new nor missing.
"""
emb = _embeddings_stub(tmp_path)
live = [Path("/album/photo.jpg")]
existing = np.array(["/album/Photo.JPG"])

with patch.object(Embeddings, "get_image_files", return_value=live):
new, missing = emb._get_new_and_missing_images(
image_paths_or_dir=live, existing_filenames=existing
)

assert new == set(), "expected no new file (case-folded match)"
assert missing == set(), "expected no missing file (case-folded match)"

def test_case_only_rename_preserves_live_casing_for_new_files(self, tmp_path):
"""When a *new* path differs only in case from a cached path, the
Path returned in ``missing`` is the cache-stored original-case
version (preserved for downstream filesystem-mask work in
``_filter_missing_images``)."""
emb = _embeddings_stub(tmp_path)
# Live scan has lowercase only; cache has upper for one file plus a
# genuinely new mixed-case entry.
live = [Path("/album/photo.jpg"), Path("/album/NewShot.PNG")]
existing = np.array(["/album/Photo.JPG"])

with patch.object(Embeddings, "get_image_files", return_value=live):
new, missing = emb._get_new_and_missing_images(
image_paths_or_dir=live, existing_filenames=existing
)

# ``NewShot.PNG`` is genuinely new — and the original casing is
# preserved so PIL.open() and EXIF read use the actual file name.
assert new == {Path("/album/NewShot.PNG")}
# The case-only-rename file is neither new nor missing.
assert missing == set()
Loading
Loading