Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move dataset UUID generation code to a public class. #709

Merged
merged 1 commit into from
Aug 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 8 additions & 1 deletion python/lsst/daf/butler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@
from .core import *

# Import the registry subpackage directly for other symbols.
from .registry import CollectionSearch, CollectionType, DatasetIdGenEnum, Registry, RegistryConfig
from .registry import (
CollectionSearch,
CollectionType,
DatasetIdFactory,
DatasetIdGenEnum,
Registry,
RegistryConfig,
)
from .transfers import YamlRepoExportBackend, YamlRepoImportBackend
from .version import *
6 changes: 5 additions & 1 deletion python/lsst/daf/butler/registries/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
QueryDimensionRecordsModel,
)
from ..registry import CollectionSearch, CollectionType, Registry, RegistryConfig, RegistryDefaults
from ..registry.interfaces import DatasetIdGenEnum
from ..registry.interfaces import DatasetIdFactory, DatasetIdGenEnum
from ..registry.summaries import CollectionSummary

if TYPE_CHECKING:
Expand Down Expand Up @@ -120,6 +120,10 @@ def __init__(self, server_uri: ResourcePath, defaults: RegistryDefaults, writeab
self._db = server_uri
self._defaults = defaults

# In the future DatasetIdFactory may become configurable and this
# instance will need to be shared with datasets manager.
self.datasetIdFactory = DatasetIdFactory()

# All PUT calls should be short-circuited if not writeable.
self._writeable = writeable

Expand Down
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/registries/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
RegistryDefaults,
queries,
)
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
from ..registry.queries import Query
from ..registry.summaries import CollectionSummary
Expand Down Expand Up @@ -205,6 +205,9 @@ def __init__(self, database: Database, defaults: RegistryDefaults, managers: Reg
# can only be done after most of the rest of Registry has already been
# initialized, and must be done before the property getter is used.
self.defaults = defaults
# In the future DatasetIdFactory may become configurable and this
# instance will need to be shared with datasets manager.
self.datasetIdFactory = DatasetIdFactory()

def __str__(self) -> str:
    """Return the string representation of the underlying database."""
    return str(self._db)
Expand Down
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/registry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from ._defaults import *
from ._exceptions import *
from ._registry import *
from .interfaces import DatasetIdGenEnum
from .interfaces import DatasetIdFactory, DatasetIdGenEnum
from .wildcards import CollectionSearch

# Some modules intentionally not imported, either because they are purely
Expand Down
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/registry/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
from ._collectionType import CollectionType
from ._config import RegistryConfig
from ._defaults import RegistryDefaults
from .interfaces import DatasetIdGenEnum
from .interfaces import DatasetIdFactory, DatasetIdGenEnum
from .queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults
from .summaries import CollectionSummary
from .wildcards import CollectionSearch
Expand Down Expand Up @@ -1640,3 +1640,6 @@ def queryDatasetAssociations(
storageClasses: StorageClassFactory
"""All storage classes known to the registry (`StorageClassFactory`).
"""

datasetIdFactory: DatasetIdFactory
"""Factory for dataset IDs."""
64 changes: 9 additions & 55 deletions python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
__all__ = ("ByDimensionsDatasetRecordStorage",)

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set

import sqlalchemy
from lsst.daf.butler import (
Expand All @@ -22,7 +22,7 @@
ConflictingDefinitionError,
UnsupportedIdGeneratorError,
)
from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage
from lsst.daf.butler.registry.interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage

from ...summaries import GovernorDimensionRestriction
from .tables import makeTagTableSpec
Expand Down Expand Up @@ -645,10 +645,9 @@ class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
dataset IDs.
"""

NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
"""Namespace UUID used for UUID5 generation. Do not change. This was
produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
"""
idMaker = DatasetIdFactory()
"""Factory for dataset IDs. In the future this factory may be shared with
other classes (e.g. Registry)."""

def insert(
self,
Expand All @@ -669,7 +668,7 @@ def insert(
dataIdList.append(dataId)
rows.append(
{
"id": self._makeDatasetId(run, dataId, idMode),
"id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode),
"dataset_type_id": self._dataset_type_id,
self._runKeyColumn: run.key,
}
Expand Down Expand Up @@ -724,7 +723,9 @@ def import_(
# this code supports mixed types or missing IDs.
datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
if datasetId is None:
datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
datasetId = self.idMaker.makeDatasetId(
run.name, self.datasetType, dataset.dataId, idGenerationMode
)
dataIds[datasetId] = dataset.dataId
governorValues.update_extract(dataset.dataId)

Expand Down Expand Up @@ -895,50 +896,3 @@ def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) ->
raise ConflictingDefinitionError(
f"Existing dataset type and dataId does not match new dataset: {row._asdict()}"
)

def _makeDatasetId(
    self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum
) -> uuid.UUID:
    """Generate dataset ID for a dataset.

    Parameters
    ----------
    run : `RunRecord`
        The record object describing the RUN collection for the dataset.
    dataId : `DataCoordinate`
        Expanded data ID for the dataset.
    idGenerationMode : `DatasetIdGenEnum`
        ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
        UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
        deterministic UUID5-type ID based on a dataset type name and
        ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
        deterministic UUID5-type ID based on a dataset type name, run
        collection name, and ``dataId``.

    Returns
    -------
    datasetId : `uuid.UUID`
        Dataset identifier.
    """
    if idGenerationMode is DatasetIdGenEnum.UNIQUE:
        # Fully random ID; does not depend on run or dataId.
        return uuid.uuid4()
    else:
        # WARNING: If you modify this code make sure that the order of
        # items in the `items` list below never changes.
        items: List[Tuple[str, str]] = []
        if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
            items = [
                ("dataset_type", self.datasetType.name),
            ]
        elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
            items = [
                ("dataset_type", self.datasetType.name),
                ("run", run.name),
            ]
        else:
            raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

        # Sorted data ID key/value pairs keep the hashed payload
        # deterministic for a given data ID.
        for name, value in sorted(dataId.byName().items()):
            items.append((name, str(value)))
        data = ",".join(f"{key}={value}" for key, value in items)
        return uuid.uuid5(self.NS_UUID, data)
72 changes: 70 additions & 2 deletions python/lsst/daf/butler/registry/interfaces/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,12 @@

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")
__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum")

import enum
import uuid
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional, Tuple
from typing import TYPE_CHECKING, Any, Iterable, Iterator, List, Optional, Tuple

import sqlalchemy.sql

Expand Down Expand Up @@ -60,6 +61,73 @@ class DatasetIdGenEnum(enum.Enum):
"""


class DatasetIdFactory:
    """Factory producing dataset IDs (UUIDs).

    The generation logic is currently hard-coded and selected via the
    caller-supplied `DatasetIdGenEnum` value. A future version may grow
    configurable logic that infers the `DatasetIdGenEnum` value from other
    parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        # Random IDs need no further input.
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()

        # Deterministic IDs hash a serialized, ordered list of key/value
        # pairs. WARNING: If you modify this code make sure that the order
        # of pairs below never changes.
        pairs: List[Tuple[str, str]]
        if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
            pairs = [("dataset_type", datasetType.name)]
        elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
            pairs = [("dataset_type", datasetType.name), ("run", run)]
        else:
            raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

        # Sorted data ID entries keep the payload deterministic.
        pairs.extend((name, str(value)) for name, value in sorted(dataId.byName().items()))
        data = ",".join(f"{key}={value}" for key, value in pairs)
        return uuid.uuid5(self.NS_UUID, data)


class DatasetRecordStorage(ABC):
"""An interface that manages the records associated with a particular
`DatasetType`.
Expand Down
27 changes: 27 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2932,3 +2932,30 @@ def testSkyPixDatasetQueries(self):
{data_id},
)
self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})

def testDatasetIdFactory(self):
    """Simple test for DatasetIdFactory, mostly to catch potential changes
    in its API.
    """
    registry = self.makeRegistry()
    factory = registry.datasetIdFactory
    dataset_type = DatasetType(
        "datasetType",
        dimensions=["detector", "instrument"],
        universe=registry.dimensions,
        storageClass="int",
    )
    run = "run"
    data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)

    # UNIQUE mode produces a random (version 4) UUID.
    datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
    self.assertIsInstance(datasetId, uuid.UUID)
    # NOTE: assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual instead.
    self.assertEqual(datasetId.version, 4)

    # Deterministic modes produce name-based (version 5) UUIDs.
    datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
    self.assertIsInstance(datasetId, uuid.UUID)
    self.assertEqual(datasetId.version, 5)

    datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
    self.assertIsInstance(datasetId, uuid.UUID)
    self.assertEqual(datasetId.version, 5)