Merge pull request #848 from lsst/tickets/DM-37499
DM-37499: Improve log message for datastore.mexists
timj committed Jun 6, 2023
2 parents 22011a4 + 65d95fd commit 9585fe8
Showing 2 changed files with 47 additions and 6 deletions.
50 changes: 45 additions & 5 deletions python/lsst/daf/butler/datastores/fileDatastore.py
@@ -1525,14 +1525,54 @@ def mexists(
         n_chunks = 0
         for chunk in chunk_iterable(refs, chunk_size=chunk_size):
             chunk_result = self._mexists(chunk, artifact_existence)
-            if log.isEnabledFor(VERBOSE):
-                n_results = len(chunk_result)
-                n_checked += n_results
+
+            # The log message level and content depend on how many
+            # datasets we are processing.
+            n_results = len(chunk_result)
+
+            # Use verbose logging to ensure that messages can be seen
+            # easily if many refs are being checked.
+            log_threshold = VERBOSE
+            n_checked += n_results
+
+            # This sum can take some time so only do it if we know the
+            # result is going to be used.
+            n_found = 0
+            if log.isEnabledFor(log_threshold):
                 # Can treat the booleans as 0, 1 integers and sum them.
                 n_found = sum(chunk_result.values())
                 n_found_total += n_found
-                log.verbose(
-                    "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)",
+
+            # We are deliberately not trying to count the number of refs
+            # provided in case it's in the millions. This means there is a
+            # situation where the number of refs exactly matches the chunk
+            # size and we will switch to the multi-chunk path even though
+            # we only have a single chunk.
+            if n_results < chunk_size and n_chunks == 0:
+                # Single chunk will be processed so we can provide more detail.
+                if n_results == 1:
+                    ref = list(chunk_result)[0]
+                    # Use debug logging to be consistent with `exists()`.
+                    log.debug(
+                        "Calling mexists() with single ref that does%s exist (%s).",
+                        "" if chunk_result[ref] else " not",
+                        ref,
+                    )
+                else:
+                    # Single chunk but multiple files. Summarize.
+                    log.log(
+                        log_threshold,
+                        "Number of datasets found in datastore: %d out of %d datasets checked.",
+                        n_found,
+                        n_checked,
+                    )
+
+            else:
+                # Use incremental verbose logging when we have multiple chunks.
+                log.log(
+                    log_threshold,
+                    "Number of datasets found in datastore for chunk %d: %d out of %d checked "
+                    "(running total from all chunks so far: %d found out of %d checked)",
                     n_chunks,
                     n_found,
                     n_results,
Expand Down
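The hunk above boils down to a logging pattern: compute the per-chunk summary only when the target log level is actually enabled (via log.isEnabledFor), then emit either a single overall summary (one small chunk) or an incremental running total (many chunks). The following standalone sketch illustrates that pattern; the VERBOSE value of 15, the stand-in chunk_iterable, and the mexists_with_logging/exists_fn names are illustrative assumptions rather than part of this commit, and the single-ref log.debug branch is omitted for brevity.

import logging
from itertools import islice

# Assumption: VERBOSE sits between DEBUG (10) and INFO (20); the exact value
# used by lsst.utils.logging is not shown in this diff.
VERBOSE = 15
logging.addLevelName(VERBOSE, "VERBOSE")

log = logging.getLogger("mexists_sketch")


def chunk_iterable(data, chunk_size):
    # Minimal stand-in for the chunk_iterable helper used in the diff above.
    iterator = iter(data)
    while chunk := list(islice(iterator, chunk_size)):
        yield chunk


def mexists_with_logging(refs, exists_fn, chunk_size=10_000):
    # Check existence in chunks, summarizing per chunk the way the diff does.
    existence = {}
    n_checked = n_found_total = n_chunks = 0
    for chunk in chunk_iterable(refs, chunk_size):
        chunk_result = {ref: exists_fn(ref) for ref in chunk}
        existence.update(chunk_result)
        n_results = len(chunk_result)
        n_checked += n_results

        # Only pay for the summary sum if a VERBOSE record would be emitted.
        n_found = 0
        if log.isEnabledFor(VERBOSE):
            n_found = sum(chunk_result.values())
            n_found_total += n_found

        if n_results < chunk_size and n_chunks == 0:
            # A single partial chunk: one overall summary is enough.
            log.log(VERBOSE, "Found %d out of %d datasets checked.", n_found, n_checked)
        else:
            # Multiple chunks: report incrementally with a running total.
            log.log(
                VERBOSE,
                "Chunk %d: %d/%d found (running total %d/%d).",
                n_chunks,
                n_found,
                n_results,
                n_found_total,
                n_checked,
            )
        n_chunks += 1
    return existence


if __name__ == "__main__":
    logging.basicConfig(level=VERBOSE)
    refs = [f"ref-{i:03d}" for i in range(25)]
    # Toy existence check: a ref "exists" if its last digit is even.
    result = mexists_with_logging(refs, exists_fn=lambda r: int(r[-1]) % 2 == 0, chunk_size=10)
    print(f"{sum(result.values())} of {len(result)} exist")

Running this with a chunk size of 10 exercises the multi-chunk branch; raising the chunk size above 25 switches to the single-summary branch, mirroring the behaviour described in the comments of the diff.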
3 changes: 2 additions & 1 deletion tests/test_datastore.py
@@ -276,8 +276,9 @@ def testBasicPutGet(self):
         self.assertTrue(datastore.knows(ref))
         multi = datastore.knows_these([ref])
         self.assertTrue(multi[ref])
-        multi = datastore.mexists([ref])
+        multi = datastore.mexists([ref, ref2])
         self.assertTrue(multi[ref])
+        self.assertFalse(multi[ref2])
 
         # Get
         metricsOut = datastore.get(ref, parameters=None)
Expand Down
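For reference, the call pattern this test now covers: mexists accepts an iterable of dataset refs and returns a mapping from each ref to whether its artifact is present in the datastore. A short usage sketch, assuming a datastore, a stored ref, and a never-stored ref2 set up as in the test above:

existence = datastore.mexists([ref, ref2])
for dataset_ref, found in existence.items():
    # Keys are the refs that were passed in; values report artifact existence.
    print(dataset_ref, "exists" if found else "is missing")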
