Skip to content

Commit

Permalink
Merge pull request #709 from lsst/tickets/DM-33492
Browse files Browse the repository at this point in the history
Move dataset UUID generation code to a public class.
  • Loading branch information
andy-slac committed Aug 11, 2022
2 parents b0940af + d435dba commit 3aec807
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 62 deletions.
9 changes: 8 additions & 1 deletion python/lsst/daf/butler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@
from .core import *

# Import the registry subpackage directly for other symbols.
from .registry import CollectionSearch, CollectionType, DatasetIdGenEnum, Registry, RegistryConfig
from .registry import (
CollectionSearch,
CollectionType,
DatasetIdFactory,
DatasetIdGenEnum,
Registry,
RegistryConfig,
)
from .transfers import YamlRepoExportBackend, YamlRepoImportBackend
from .version import *
6 changes: 5 additions & 1 deletion python/lsst/daf/butler/registries/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
QueryDimensionRecordsModel,
)
from ..registry import CollectionSearch, CollectionType, Registry, RegistryConfig, RegistryDefaults
from ..registry.interfaces import DatasetIdGenEnum
from ..registry.interfaces import DatasetIdFactory, DatasetIdGenEnum
from ..registry.summaries import CollectionSummary

if TYPE_CHECKING:
Expand Down Expand Up @@ -120,6 +120,10 @@ def __init__(self, server_uri: ResourcePath, defaults: RegistryDefaults, writeab
self._db = server_uri
self._defaults = defaults

# In the future DatasetIdFactory may become configurable and this
# instance will need to be shared with datasets manager.
self.datasetIdFactory = DatasetIdFactory()

# All PUT calls should be short-circuited if not writeable.
self._writeable = writeable

Expand Down
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/registries/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
RegistryDefaults,
queries,
)
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
from ..registry.queries import Query
from ..registry.summaries import CollectionSummary
Expand Down Expand Up @@ -205,6 +205,9 @@ def __init__(self, database: Database, defaults: RegistryDefaults, managers: Reg
# can only be done after most of the rest of Registry has already been
# initialized, and must be done before the property getter is used.
self.defaults = defaults
# In the future DatasetIdFactory may become configurable and this
# instance will need to be shared with datasets manager.
self.datasetIdFactory = DatasetIdFactory()

def __str__(self) -> str:
return str(self._db)
Expand Down
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/registry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from ._defaults import *
from ._exceptions import *
from ._registry import *
from .interfaces import DatasetIdGenEnum
from .interfaces import DatasetIdFactory, DatasetIdGenEnum
from .wildcards import CollectionSearch

# Some modules intentionally not imported, either because they are purely
Expand Down
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/registry/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
from ._collectionType import CollectionType
from ._config import RegistryConfig
from ._defaults import RegistryDefaults
from .interfaces import DatasetIdGenEnum
from .interfaces import DatasetIdFactory, DatasetIdGenEnum
from .queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults
from .summaries import CollectionSummary
from .wildcards import CollectionSearch
Expand Down Expand Up @@ -1640,3 +1640,6 @@ def queryDatasetAssociations(
storageClasses: StorageClassFactory
"""All storage classes known to the registry (`StorageClassFactory`).
"""

datasetIdFactory: DatasetIdFactory
"""Factory for dataset IDs."""
64 changes: 9 additions & 55 deletions python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
__all__ = ("ByDimensionsDatasetRecordStorage",)

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set

import sqlalchemy
from lsst.daf.butler import (
Expand All @@ -22,7 +22,7 @@
ConflictingDefinitionError,
UnsupportedIdGeneratorError,
)
from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage
from lsst.daf.butler.registry.interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage

from ...summaries import GovernorDimensionRestriction
from .tables import makeTagTableSpec
Expand Down Expand Up @@ -645,10 +645,9 @@ class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
dataset IDs.
"""

NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
"""Namespace UUID used for UUID5 generation. Do not change. This was
produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
"""
idMaker = DatasetIdFactory()
"""Factory for dataset IDs. In the future this factory may be shared with
other classes (e.g. Registry)."""

def insert(
self,
Expand All @@ -669,7 +668,7 @@ def insert(
dataIdList.append(dataId)
rows.append(
{
"id": self._makeDatasetId(run, dataId, idMode),
"id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode),
"dataset_type_id": self._dataset_type_id,
self._runKeyColumn: run.key,
}
Expand Down Expand Up @@ -724,7 +723,9 @@ def import_(
# this code supports mixed types or missing IDs.
datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
if datasetId is None:
datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
datasetId = self.idMaker.makeDatasetId(
run.name, self.datasetType, dataset.dataId, idGenerationMode
)
dataIds[datasetId] = dataset.dataId
governorValues.update_extract(dataset.dataId)

Expand Down Expand Up @@ -895,50 +896,3 @@ def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) ->
raise ConflictingDefinitionError(
f"Existing dataset type and dataId does not match new dataset: {row._asdict()}"
)

def _makeDatasetId(
    self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum
) -> uuid.UUID:
    """Generate dataset ID for a dataset.

    Parameters
    ----------
    run : `RunRecord`
        The record object describing the RUN collection for the dataset.
    dataId : `DataCoordinate`
        Expanded data ID for the dataset.
    idGenerationMode : `DatasetIdGenEnum`
        ID generation option. `~DatasetIdGenEnum.UNIQUE` make a random
        UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
        deterministic UUID5-type ID based on a dataset type name and
        ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
        deterministic UUID5-type ID based on a dataset type name, run
        collection name, and ``dataId``.

    Returns
    -------
    datasetId : `uuid.UUID`
        Dataset identifier.
    """
    if idGenerationMode is DatasetIdGenEnum.UNIQUE:
        # Fully random ID; no deterministic payload needed.
        return uuid.uuid4()
    else:
        # WARNING: If you modify this code make sure that the order of
        # items in the `items` list below never changes.
        items: List[Tuple[str, str]] = []
        if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
            items = [
                ("dataset_type", self.datasetType.name),
            ]
        elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
            items = [
                ("dataset_type", self.datasetType.name),
                ("run", run.name),
            ]
        else:
            raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

        # Sort data ID keys so the payload (and hence the UUID5) is stable
        # regardless of the dict's insertion order.
        for name, value in sorted(dataId.byName().items()):
            items.append((name, str(value)))
        data = ",".join(f"{key}={value}" for key, value in items)
        # UUID5 is deterministic for a fixed namespace and payload string.
        return uuid.uuid5(self.NS_UUID, data)
72 changes: 70 additions & 2 deletions python/lsst/daf/butler/registry/interfaces/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,12 @@

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")
__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum")

import enum
import uuid
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional, Tuple
from typing import TYPE_CHECKING, Any, Iterable, Iterator, List, Optional, Tuple

import sqlalchemy.sql

Expand Down Expand Up @@ -60,6 +61,73 @@ class DatasetIdGenEnum(enum.Enum):
"""


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement a
    configurable logic that can guess `DatasetIdGenEnum` value from other
    parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        # Random IDs bypass the deterministic machinery entirely.
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()

        # WARNING: If you modify this code make sure that the order of
        # items in the payload below never changes.
        if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
            parts: List[Tuple[str, str]] = [("dataset_type", datasetType.name)]
        elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
            parts = [("dataset_type", datasetType.name), ("run", run)]
        else:
            raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

        # Data ID keys are sorted so the payload string (and hence the
        # resulting UUID5) does not depend on dict ordering.
        parts.extend((name, str(value)) for name, value in sorted(dataId.byName().items()))
        payload = ",".join(f"{key}={value}" for key, value in parts)
        return uuid.uuid5(self.NS_UUID, payload)


class DatasetRecordStorage(ABC):
"""An interface that manages the records associated with a particular
`DatasetType`.
Expand Down
27 changes: 27 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2932,3 +2932,30 @@ def testSkyPixDatasetQueries(self):
{data_id},
)
self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})

def testDatasetIdFactory(self):
    """Simple test for DatasetIdFactory, mostly to catch potential changes
    in its API.
    """
    registry = self.makeRegistry()
    factory = registry.datasetIdFactory
    dataset_type = DatasetType(
        "datasetType",
        dimensions=["detector", "instrument"],
        universe=registry.dimensions,
        storageClass="int",
    )
    run = "run"
    data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)

    # UNIQUE mode must yield a random (version 4) UUID.
    datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
    self.assertIsInstance(datasetId, uuid.UUID)
    self.assertEqual(datasetId.version, 4)

    # Deterministic modes must yield name-based (version 5) UUIDs.
    datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
    self.assertIsInstance(datasetId, uuid.UUID)
    self.assertEqual(datasetId.version, 5)

    datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
    self.assertIsInstance(datasetId, uuid.UUID)
    self.assertEqual(datasetId.version, 5)

0 comments on commit 3aec807

Please sign in to comment.