Merge branch 'tickets/DM-30266'
natelust committed Dec 12, 2021
2 parents 1184064 + 08a9926 commit ad64451
Showing 9 changed files with 597 additions and 20 deletions.
38 changes: 35 additions & 3 deletions python/lsst/daf/butler/core/datasets/ref.py
@@ -89,6 +89,29 @@ def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
return v

@classmethod
def direct(cls, *, id: Optional[Union[str, int]] = None, datasetType: Optional[Dict[str, Any]] = None,
dataId: Optional[Dict[str, Any]] = None, run: Optional[str] = None, component: Optional[str] = None
) -> SerializedDatasetRef:
"""Construct a `SerializedDatasetRef` directly without validators.
This differs from the pydantic "construct" method in that the arguments
are explicitly what the model requires, and it will recurse through
members, constructing them from their corresponding `direct` methods.
This method should only be called when the inputs are trusted.
"""
node = SerializedDatasetRef.__new__(cls)
setter = object.__setattr__
setter(node, 'id', uuid.UUID(id) if isinstance(id, str) else id)
setter(node, 'datasetType',
datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType))
setter(node, 'dataId', dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
setter(node, 'run', run)
setter(node, 'component', component)
setter(node, '__fields_set__', {'id', 'datasetType', 'dataId', 'run', 'component'})
return node


DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for dataset ID which could be either integer or
@@ -249,7 +272,8 @@ def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
@classmethod
def from_simple(cls, simple: SerializedDatasetRef,
universe: Optional[DimensionUniverse] = None,
registry: Optional[Registry] = None) -> DatasetRef:
registry: Optional[Registry] = None,
datasetType: Optional[DatasetType] = None) -> DatasetRef:
"""Construct a new object from simplified form.
Generally this is data returned from the `to_simple` method.
@@ -265,6 +289,11 @@ def from_simple(cls, simple: SerializedDatasetRef,
Registry to use to convert simple form of a DatasetRef to
a full `DatasetRef`. Can be `None` if a full description of
the type is provided along with a universe.
datasetType : DatasetType, optional
If supplied, this is used as the datasetType of the resulting
DatasetRef instead of being read from the `SerializedDatasetRef`.
This saves memory when many refs share the same type. Defaults
to None.
Returns
-------
@@ -295,10 +324,13 @@ def from_simple(cls, simple: SerializedDatasetRef,
# this is for mypy
raise ValueError("Unable to determine a usable universe")

if simple.datasetType is None:
if simple.datasetType is None and datasetType is None:
# mypy
raise ValueError("The DatasetType must be specified to construct a DatasetRef")
datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)
if datasetType is None:
if simple.datasetType is None:
raise ValueError("Cannot determine Dataset type of this serialized class")
datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

if simple.dataId is None:
# mypy
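Taken together, the new `direct` constructor and the `datasetType` argument to `from_simple` let trusted, already-serialized refs be rebuilt quickly while sharing a single `DatasetType` across many refs. A minimal sketch of that usage (the helper name and the JSON source are assumptions, not part of the commit):

import json

from lsst.daf.butler import DatasetRef, DatasetType
from lsst.daf.butler.core.datasets.ref import SerializedDatasetRef

def refs_from_trusted_json(blobs, universe, shared_type: DatasetType):
    # `blobs` is assumed to hold JSON strings produced by serializing
    # DatasetRef.to_simple(); `direct` skips validation, so the inputs
    # must be trusted.
    for blob in blobs:
        simple = SerializedDatasetRef.direct(**json.loads(blob))
        # Supplying the shared DatasetType avoids rebuilding it from
        # every serialized ref, saving memory when many refs share one
        # type.
        yield DatasetRef.from_simple(simple, universe=universe,
                                     datasetType=shared_type)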
25 changes: 25 additions & 0 deletions python/lsst/daf/butler/core/datasets/type.py
@@ -69,6 +69,31 @@ class SerializedDatasetType(BaseModel):
parentStorageClass: Optional[StrictStr] = None
isCalibration: StrictBool = False

@classmethod
def direct(cls, *, name: str, storageClass: Optional[str] = None,
dimensions: Optional[Dict] = None,
parentStorageClass: Optional[str] = None, isCalibration: bool = False
) -> SerializedDatasetType:
"""Construct a `SerializedDatasetType` directly without validators.
This differs from the pydantic "construct" method in that the arguments are
explicitly what the model requires, and it will recurse through
members, constructing them from their corresponding `direct` methods.
This method should only be called when the inputs are trusted.
"""
node = SerializedDatasetType.__new__(cls)
setter = object.__setattr__
setter(node, 'name', name)
setter(node, 'storageClass', storageClass)
setter(node, 'dimensions',
dimensions if dimensions is None else SerializedDimensionGraph.direct(**dimensions))
setter(node, 'parentStorageClass', parentStorageClass)
setter(node, 'isCalibration', isCalibration)
setter(node, '__fields_set__', {'name', 'storageClass', 'dimensions', 'parentStorageClass',
'isCalibration'})
return node


class DatasetType:
r"""A named category of Datasets.
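As with the ref version, `SerializedDatasetType.direct` recurses into `SerializedDimensionGraph.direct` for the nested `dimensions` member. A sketch with a hypothetical trusted payload (the dataset type name and storage class are illustrative):

from lsst.daf.butler.core.datasets.type import SerializedDatasetType

# Shaped like the output of pydantic's .dict() on a SerializedDatasetType.
payload = {
    "name": "calexp",
    "storageClass": "ExposureF",
    "dimensions": {"names": ["instrument", "visit", "detector"]},
    "parentStorageClass": None,
    "isCalibration": False,
}
sdt = SerializedDatasetType.direct(**payload)  # no validators run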
23 changes: 22 additions & 1 deletion python/lsst/daf/butler/core/ddl.py
@@ -32,6 +32,8 @@
"""
from __future__ import annotations

from lsst import sphgeom

__all__ = ("TableSpec", "FieldSpec", "ForeignKeySpec", "Base64Bytes", "Base64Region",
"AstropyTimeNsecTai", "GUID")

@@ -127,6 +129,15 @@ def process_result_value(self, value: Optional[str], dialect: sqlalchemy.engine.
# native `bytes`.
return b64decode(value.encode("ascii")) if value is not None else None

@property
def python_type(self) -> Type[bytes]:
return bytes


# Create an alias, for use below, to disambiguate from the built-in
# sqlalchemy types.
LocalBase64Bytes = Base64Bytes


class Base64Region(Base64Bytes):
"""A SQLAlchemy custom type for Python `sphgeom.Region`.
@@ -146,6 +157,10 @@ def process_result_value(self, value: Optional[str], dialect: sqlalchemy.engine.
return None
return Region.decode(super().process_result_value(value, dialect))

@property
def python_type(self) -> Type[sphgeom.Region]:
return sphgeom.Region


class AstropyTimeNsecTai(sqlalchemy.TypeDecorator):
"""A SQLAlchemy custom type for Python `astropy.time.Time`.
@@ -370,7 +385,13 @@ def getPythonType(self) -> type:
type : `type`
Python type associated with this field's (SQL) `dtype`.
"""
return self.dtype().python_type
# To construct these objects, the nbytes keyword is needed.
if issubclass(self.dtype, LocalBase64Bytes):
# satisfy mypy for something that must be true
assert self.nbytes is not None
return self.dtype(nbytes=self.nbytes).python_type
else:
return self.dtype().python_type # type: ignore


@dataclass
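The new `python_type` properties exist so that `FieldSpec.getPythonType` works for the base64-backed types, which can only be instantiated with an `nbytes` argument. A sketch, assuming a `FieldSpec` built with the `name`, `dtype`, and `nbytes` attributes used above:

from lsst import sphgeom
from lsst.daf.butler.core.ddl import Base64Region, FieldSpec

spec = FieldSpec(name="region", dtype=Base64Region, nbytes=2048)
# getPythonType sees a Base64Bytes subclass, constructs it with nbytes,
# and reads python_type, yielding sphgeom.Region for this field.
assert spec.getPythonType() is sphgeom.Region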
19 changes: 19 additions & 0 deletions python/lsst/daf/butler/core/dimensions/_coordinate.py
@@ -73,6 +73,25 @@ class SerializedDataCoordinate(BaseModel):
dataId: Dict[str, DataIdValue]
records: Optional[Dict[str, SerializedDimensionRecord]] = None

@classmethod
def direct(cls, *, dataId: Dict[str, DataIdValue], records: Optional[Dict[str, Dict]]) -> SerializedDataCoordinate:
"""Construct a `SerializedDataCoordinate` directly without validators.
This differs from the pydantic "construct" method in that the arguments
are explicitly what the model requires, and it will recurse through
members, constructing them from their corresponding `direct` methods.
This method should only be called when the inputs are trusted.
"""
node = SerializedDataCoordinate.__new__(cls)
setter = object.__setattr__
setter(node, 'dataId', dataId)
setter(node, 'records',
records if records is None else
{k: SerializedDimensionRecord.direct(**v) for k, v in records.items()})
setter(node, '__fields_set__', {'dataId', 'records'})
return node


def _intersectRegions(*args: Region) -> Optional[Region]:
"""Return the intersection of several regions.
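A round trip under trusted conditions, sketched with a hypothetical data ID: serialize with pydantic, then rebuild without validation.

from lsst.daf.butler.core.dimensions._coordinate import SerializedDataCoordinate

original = SerializedDataCoordinate(dataId={"instrument": "HSC", "visit": 903334})
# .dict() yields {"dataId": {...}, "records": None}; direct() accepts
# that mapping verbatim and skips validation entirely.
rebuilt = SerializedDataCoordinate.direct(**original.dict())
assert rebuilt.dict() == original.dict()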
15 changes: 15 additions & 0 deletions python/lsst/daf/butler/core/dimensions/_graph.py
@@ -58,6 +58,21 @@ class SerializedDimensionGraph(BaseModel):

names: List[str]

@classmethod
def direct(cls, *, names: List[str]) -> SerializedDimensionGraph:
"""Construct a `SerializedDimensionGraph` directly without validators.
This differs from the pydantic "construct" method in that the arguments
are explicitly what the model requires, and it will recurse through
members, constructing them from their corresponding `direct` methods.
This method should only be called when the inputs are trusted.
"""
node = SerializedDimensionGraph.__new__(cls)
object.__setattr__(node, 'names', names)
object.__setattr__(node, '__fields_set__', {'names'})
return node


@immutable
class DimensionGraph:
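The graph model is the leaf of the `direct` recursion: it stores only a list of names, so construction is a one-liner (the dimension names are illustrative):

from lsst.daf.butler.core.dimensions._graph import SerializedDimensionGraph

sdg = SerializedDimensionGraph.direct(names=["instrument", "visit", "detector"])
assert sdg.names == ["instrument", "visit", "detector"]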
31 changes: 30 additions & 1 deletion python/lsst/daf/butler/core/dimensions/_records.py
@@ -159,6 +159,30 @@ class Config:
}
}

@classmethod
def direct(cls, *, definition: str, record: Dict[str, Union[None, StrictFloat, StrictStr, StrictBool,
StrictInt, Tuple[int, int]]]
) -> SerializedDimensionRecord:
"""Construct a `SerializedDimensionRecord` directly without validators.
This differs from the pydantic "construct" method in that the arguments
are explicitly what the model requires, and it will recurse through
members, constructing them from their corresponding `direct` methods.
This method should only be called when the inputs are trusted.
"""
node = SerializedDimensionRecord.__new__(cls)
setter = object.__setattr__
setter(node, 'definition', definition)
# This method requires tuples as values of the mapping, but JSON
# readers will read things in as lists. Be kind and transparently
# transform them to tuples.
setter(node, 'record', {k: tuple(v) if isinstance(v, list) else v  # type: ignore
for k, v in record.items()})
setter(node, '__fields_set__', {'definition', 'record'})
return node


@immutable
class DimensionRecord:
@@ -298,7 +322,10 @@ def to_simple(self, minimal: bool = False) -> SerializedDimensionRecord:
# and also history. Here use a different approach.
# This code needs to be migrated to sphgeom
mapping[k] = v.encode().hex()

if isinstance(v, bytes):
# We can't serialize raw bytes for hash objects, so encode
# them here as a hex string.
mapping[k] = v.hex()
definition = self.definition.to_simple(minimal=minimal)
return SerializedDimensionRecord(definition=definition, record=mapping)

@@ -353,6 +380,8 @@ def from_simple(cls, simple: SerializedDimensionRecord,
if (reg := "region") in rec:
encoded = bytes.fromhex(rec[reg])
rec[reg] = lsst.sphgeom.Region.decode(encoded)
if (hsh := "hash") in rec:
rec[hsh] = bytes.fromhex(rec[hsh])

return _reconstructDimensionRecord(definition, rec)

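Two details above are worth illustrating: `direct` coerces the lists that JSON readers produce back into the tuples the model requires, and `hash` values round-trip through hex strings. A sketch with hypothetical record fields:

from lsst.daf.butler.core.dimensions._records import SerializedDimensionRecord

rec = SerializedDimensionRecord.direct(
    definition="detector",
    # A JSON reader would hand back [0, 100] as a list; direct()
    # transparently turns it into the required tuple.
    record={"instrument": "HSC", "id": 42, "some_range": [0, 100]},
)
assert rec.record["some_range"] == (0, 100)

# to_simple() emits hash bytes as a hex string ...
digest = b"\x00\x01\x02"
encoded = digest.hex()  # "000102"
# ... and from_simple() recovers them with bytes.fromhex.
assert bytes.fromhex(encoded) == digest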
