Few fixes from code review
andy-slac committed May 25, 2023
1 parent 408ec68 commit d3031a7
Showing 3 changed files with 44 additions and 49 deletions.
33 changes: 15 additions & 18 deletions python/lsst/daf/butler/_butler.py
@@ -1050,7 +1050,7 @@ def _findDatasetRef(
         ------
         LookupError
             Raised if no matching dataset exists in the `Registry` (and
-            ``predict is False``).
+            ``predict`` is `False`).
         ValueError
             Raised if a resolved `DatasetRef` was passed as an input, but it
             differs from the one found in the registry.
@@ -1170,14 +1170,14 @@ def put(
         if isinstance(datasetRefOrType, DatasetRef):
             # This is a direct put of predefined DatasetRef.
             log.debug("Butler put direct: %s", datasetRefOrType)
-            # _importDatasets ignores existing dataset ref and just returns an
-            # original ref.
+            # If the registry already has a dataset with the same dataset ID,
+            # dataset type, and data ID, then _importDatasets will do nothing
+            # and just return the original ref. We have to raise in this case;
+            # the datastore check below takes care of that.
             (imported_ref,) = self.registry._importDatasets(
                 [datasetRefOrType],
                 expand=True,
             )
+            if imported_ref.id != datasetRefOrType.id:
+                raise RuntimeError("This registry configuration does not support direct put of ref.")
             # Before trying to write to the datastore check that it does not
             # know this dataset. This is prone to races, of course.
             if self.datastore.knows(datasetRefOrType):
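For orientation, the direct-put path now behaves roughly like this. A minimal sketch, assuming a writeable butler and pre-built dataset_type and data_id objects (all names here are hypothetical, not part of the diff):

    import uuid
    # Hypothetical setup; DatasetRef signature as changed in this commit.
    ref = DatasetRef(dataset_type, data_id, run="my_run", id=uuid.uuid4())
    butler.put(my_object, ref)  # RuntimeError if the registry maps this
                                # ref to a different dataset ID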
@@ -1276,7 +1276,15 @@ def getDirectDeferred(
         -------
         obj : `DeferredDatasetHandle`
             A handle which can be used to retrieve a dataset at a later time.
+
+        Raises
+        ------
+        LookupError
+            Raised if no matching dataset exists in the `Registry`.
         """
+        # Check that the dataset actually exists.
+        if not self.datastore.exists(ref):
+            raise LookupError(f"Dataset reference {ref} does not exist.")
         return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
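With the new check, a missing dataset now fails when the handle is created rather than at first read. A rough sketch (butler and ref assumed to exist):

    handle = butler.getDirectDeferred(ref)  # LookupError if the datastore
                                            # does not know this ref
    obj = handle.get()                      # actual I/O happens here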

     def getDeferred(

@@ -1334,6 +1342,8 @@ def getDeferred(
         TypeError
             Raised if no collections were provided.
         """
+        if isinstance(datasetRefOrType, DatasetRef) and not self.datastore.exists(datasetRefOrType):
+            raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
         ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
         return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
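The unresolved path is unchanged; only a pre-resolved DatasetRef gets the early existence check. A sketch with made-up names:

    # Unresolved: still goes through _findDatasetRef.
    handle = butler.getDeferred("metric", data_id, collections="my_run")
    # Resolved: now raises LookupError up front if the datastore lacks it.
    handle = butler.getDeferred(ref)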

@@ -1832,9 +1842,6 @@ def ingest(
                 # execution butler.
                 existingRefs: List[DatasetRef] = []

-                # Any newly-resolved refs.
-                resolvedRefs: list[DatasetRef] = []
-
                 for ref in dataset.refs:
                     assert ref.run is not None  # For mypy
                     group_key = (ref.datasetType, ref.run)

@@ -1861,16 +1868,6 @@
                     # Store expanded form in the original FileDataset.
                     dataset.refs = existingRefs
-                elif resolvedRefs:
-                    if len(dataset.refs) != len(resolvedRefs):
-                        raise ConflictingDefinitionError(
-                            f"For dataset {dataset.path} some DatasetRef were "
-                            "resolved and others were not. This is not supported."
-                        )
-                    dataset.refs = resolvedRefs
-
-                    # These datasets have to be registered.
-                    self.registry._importDatasets(resolvedRefs)
                 else:
                     groupedData[group_key].append(dataset)
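With the resolvedRefs branch removed, ingest appears to assume every FileDataset arrives with already-resolved refs. A hedged sketch (path and ref invented):

    from lsst.daf.butler import FileDataset
    # Each ref must already carry its dataset ID and run.
    butler.ingest(FileDataset(path="data/metric.yaml", refs=[ref]), transfer="auto")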
54 changes: 26 additions & 28 deletions python/lsst/daf/butler/core/datasets/ref.py
@@ -149,7 +149,6 @@ def makeDatasetId(
 class SerializedDatasetRef(BaseModel):
     """Simplified model of a `DatasetRef` suitable for serialization."""

-    # DO NOT change order in the Union, pydantic is sensitive to that!
     id: uuid.UUID
     datasetType: Optional[SerializedDatasetType] = None
     dataId: Optional[SerializedDataCoordinate] = None

@@ -187,10 +186,15 @@ def direct(
     ) -> SerializedDatasetRef:
         """Construct a `SerializedDatasetRef` directly without validators.

+        Notes
+        -----
         This differs from the pydantic "construct" method in that the arguments
         are explicitly what the model requires, and it will recurse through
         members, constructing them from their corresponding `direct` methods.

+        The ``id`` parameter is a string representation of the dataset ID; it
+        is converted to a UUID by this method.
+
         This method should only be called when the inputs are trusted.
         """
         node = SerializedDatasetRef.__new__(cls)
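Per the new note, direct() coerces a string id into a UUID. A trusted-input sketch; the UUID string and run name are invented, and it assumes the optional fields may be omitted:

    sref = SerializedDatasetRef.direct(
        id="12345678-1234-5678-1234-567812345678",  # string, converted to UUID
        run="my_run",
    )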
@@ -226,12 +230,12 @@ class DatasetRef:
         The `DatasetType` for this Dataset.
     dataId : `DataCoordinate`
         A mapping of dimensions that labels the Dataset within a Collection.
-    id : `DatasetId`, optional
-        The unique identifier assigned when the dataset is created. If ``id``
-        is not specified, a new unique ID will be created.
     run : `str`
         The name of the run this dataset was associated with when it was
         created.
+    id : `DatasetId`, optional
+        The unique identifier assigned when the dataset is created. If ``id``
+        is not specified, a new unique ID will be created.
     conform : `bool`, optional
         If `True` (default), call `DataCoordinate.standardize` to ensure that
         the data ID's dimensions are consistent with the dataset type's.
@@ -270,8 +274,8 @@ def __init__(
         self,
         datasetType: DatasetType,
         dataId: DataCoordinate,
-        *,
         run: str,
+        *,
         id: Optional[DatasetId] = None,
         conform: bool = True,
         id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
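Moving run ahead of the keyword-only marker means it can now also be passed positionally. Sketch with assumed dataset_type and data_id objects:

    ref = DatasetRef(dataset_type, data_id, "my_run")      # run positional
    ref = DatasetRef(dataset_type, data_id, run="my_run")  # keyword still works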
@@ -308,12 +312,12 @@ def __repr__(self) -> str:
         # DataCoordinate's __repr__ - while adhering to the guidelines for
         # __repr__ - is much harder for users to read, while its __str__ just
         # produces a dict that can also be passed to DatasetRef's constructor.
-        return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
+        return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, run={self.run!r}, id={self.id})"

     def __str__(self) -> str:
         s = (
-            f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass_name}]"
-            f" (id={self.id})"
+            f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]"
+            f" (run={self.run} id={self.id})"
         )
         return s
@@ -349,30 +353,24 @@ def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
             The object converted to a dictionary.
         """
         if minimal:
-            # The only thing needed to uniquely define a DatasetRef
-            # is its id so that can be used directly if it is
-            # resolved and if it is not a component DatasetRef.
-            # Store is in a dict to allow us to easily add the planned
-            # origin information later without having to support
-            # an int and dict in simple form.
+            # The only thing needed to uniquely define a DatasetRef is its id,
+            # so that can be used directly if it is not a component DatasetRef.
+            # Store it in a dict to allow us to easily add the planned origin
+            # information later without having to support an int and dict in
+            # simple form.
             simple: Dict[str, Any] = {"id": self.id}
             if self.isComponent():
                 # We can still be a little minimalist with a component
                 # but we will also need to record the datasetType component
                 simple["component"] = self.datasetType.component()
             return SerializedDatasetRef(**simple)

-        # Convert to a dict form
-        as_dict: Dict[str, Any] = {
-            "datasetType": self.datasetType.to_simple(minimal=minimal),
-            "dataId": self.dataId.to_simple(),
-        }
-
-        # Only include the id entry if it is defined
-        as_dict["run"] = self.run
-        as_dict["id"] = self.id
-
-        return SerializedDatasetRef(**as_dict)
+        return SerializedDatasetRef(
+            datasetType=self.datasetType.to_simple(minimal=minimal),
+            dataId=self.dataId.to_simple(),
+            run=self.run,
+            id=self.id,
+        )
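A rough round-trip sketch of the two serialized forms (ref and universe assumed to exist; the minimal form needs a registry to rehydrate, so only the full form is shown round-tripping):

    minimal = ref.to_simple(minimal=True)   # just {"id": ...} for non-components
    full = ref.to_simple()                  # datasetType, dataId, run, id
    same_ref = DatasetRef.from_simple(full, universe=universe)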

     @classmethod
     def from_simple(

@@ -446,13 +444,13 @@ def from_simple(
         dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)

         # Check that simple ref is resolved.
-        if simple.id is None or simple.run is None:
+        if simple.run is None:
             dstr = ""
             if simple.datasetType is None:
                 dstr = f" (datasetType={datasetType.name!r})"
             raise ValueError(
-                "Attempting to create an unresolved ref from simple form is deprecated. "
-                f"Encountered with {simple!r}{dstr}.",
+                "Run collection name is missing from serialized representation. "
+                f"Encountered with {simple!r}{dstr}."
             )

         return cls(datasetType, dataId, id=simple.id, run=simple.run)
6 changes: 3 additions & 3 deletions tests/test_butler.py
@@ -2070,7 +2070,7 @@ def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "St
             if index < 2:
                 source_refs.append(ref)
             if ref not in deleted:
-                new_metric = butler.get(ref, collections=run)
+                new_metric = butler.get(ref)
                 self.assertEqual(new_metric, metric)

         # Create some bad dataset types to ensure we check for inconsistent
@@ -2144,8 +2144,8 @@ def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "St
         # Now try to get the same refs from the new butler.
         for ref in source_refs:
             if ref not in deleted:
-                new_metric = self.target_butler.get(ref, collections=ref.run)
-                old_metric = self.source_butler.get(ref, collections=ref.run)
+                new_metric = self.target_butler.get(ref)
+                old_metric = self.source_butler.get(ref)
                 self.assertEqual(new_metric, old_metric)

         # Now prune run2 collection and create instead a CHAINED collection.
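These test edits reflect that a resolved ref carries its own run, so get() no longer needs a collections argument. Sketch:

    metric = butler.get(ref)  # run comes from the resolved ref itself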
