Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

DM-24288: Do disassembly inside datastore #269

Merged
merged 8 commits into from
Apr 30, 2020
1 change: 1 addition & 0 deletions config/datastores/posixDatastore.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ datastore:
# Gen2 has.
default: "{run:/}/{datasetType}.{component:?}/{tract:?}/{patch:?}/{label:?}/{abstract_filter:?}/{subfilter:?}/{physical_filter:?}/{visit:?}/{datasetType}_{component:?}_{tract:?}_{patch:?}_{label:?}_{abstract_filter:?}_{physical_filter:?}_{calibration_label:?}_{visit:?}_{exposure:?}_{detector:?}_{instrument:?}_{skymap:?}_{skypix:?}_{run}"
formatters: !include formatters.yaml
composites: !include ../composites.yaml
1 change: 1 addition & 0 deletions config/datastores/s3Datastore.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ datastore:
# Gen2 has.
default: "{collection:/}/{datasetType}.{component:?}/{tract:?}/{patch:?}/{label:?}/{abstract_filter:?}/{subfilter:?}/{physical_filter:?}/{visit:?}/{datasetType}_{component:?}_{tract:?}_{patch:?}_{label:?}_{abstract_filter:?}_{physical_filter:?}_{calibration_label:?}_{visit:?}_{exposure:?}_{detector:?}_{instrument:?}_{skymap:?}_{skypix:?}_{run}"
formatters: !include formatters.yaml
composites: !include ../composites.yaml
19 changes: 16 additions & 3 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,13 +644,17 @@ def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
f"{collectionType.name}.")

isVirtualComposite = self._composites.shouldBeDisassembled(datasetType)
# Disable all disassembly at the registry level for now
isVirtualComposite = False

# Add Registry Dataset entry. If not a virtual composite, add
# and attach components at the same time.
dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId],
producer=producer, recursive=not isVirtualComposite)
producer=producer,
# Never write components into
# registry
recursive=False)

# Check to see if this datasetType requires disassembly
if isVirtualComposite:
Expand Down Expand Up @@ -694,7 +698,9 @@ def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = N
# if the ref exists in the store we return it directly
if self.datastore.exists(ref):
return self.datastore.get(ref, parameters=parameters)
elif ref.isComposite():
elif ref.isComposite() and ref.components:
# The presence of components indicates that this dataset
# was disassembled at the registry level.
# Check that we haven't got any unknown parameters
ref.datasetType.storageClass.validateParameters(parameters)
# Reconstruct the composite
Expand Down Expand Up @@ -1047,6 +1053,13 @@ def pruneDatasets(self, refs: Iterable[DatasetRef], *,
if collectionType is not CollectionType.TAGGED:
raise TypeError(f"Cannot disassociate from collection '{tag}' "
f"of non-TAGGED type {collectionType.name}.")
        # Pruning a component of a DatasetRef makes no sense since the
        # registry doesn't always know about components and the datastore
        # might not store components in a separate file
for ref in refs:
if ref.datasetType.component():
raise ValueError(f"Can not prune a component of a dataset (ref={ref})")

if recursive:
refs = list(DatasetRef.flatten(refs))
# We don't need an unreliable Datastore transaction for this, because
Expand Down
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/core/composites.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class CompositesMap:
"""

def __init__(self, config, *, universe):
if not isinstance(config, type(self)):
if not isinstance(config, CompositesConfig):
config = CompositesConfig(config)
assert isinstance(config, CompositesConfig)
self.config = config
Expand Down
10 changes: 8 additions & 2 deletions python/lsst/daf/butler/core/datasets/ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ class DatasetRef:
not be created in new code, but are still supported for backwards
compatibility. New code should only pass `False` if it can guarantee
that the dimensions are already consistent.
    hasParentId : `bool`, optional
        If `True`, this `DatasetRef` is a component that has the ``id``
        of the composite parent. This is set if the registry does not
        know about individual components but does know about the composite.

Raises
------
Expand All @@ -80,16 +84,18 @@ class DatasetRef:
``id`` is provided but ``run`` is not.
"""

__slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components")
__slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components", "hasParentId")

def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
id: Optional[int] = None,
run: Optional[str] = None, hash: Optional[bytes] = None,
components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True) -> DatasetRef:
components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True,
hasParentId: bool = False) -> DatasetRef:
self = super().__new__(cls)
assert isinstance(datasetType, DatasetType)
self.id = id
self.datasetType = datasetType
self.hasParentId = hasParentId
if conform:
self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
else:
Expand Down
2 changes: 2 additions & 0 deletions python/lsst/daf/butler/core/datasets/type.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ def __repr__(self):
return "DatasetType({}, {}, {})".format(self.name, self.dimensions, self._storageClassName)

def __eq__(self, other):
if not isinstance(other, type(self)):
return False
if self._name != other._name:
return False
if self._dimensions != other._dimensions:
Expand Down
7 changes: 6 additions & 1 deletion python/lsst/daf/butler/core/storedFileInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ class StoredDatastoreItemInfo:
class StoredFileInfo(StoredDatastoreItemInfo):
"""Datastore-private metadata associated with a file stored in a Datastore.
"""
__slots__ = {"formatter", "path", "storageClass", "checksum", "file_size"}
__slots__ = {"formatter", "path", "storageClass", "component",
"checksum", "file_size"}

formatter: str
"""Fully-qualified name of Formatter."""
Expand All @@ -53,6 +54,10 @@ class StoredFileInfo(StoredDatastoreItemInfo):
storageClass: StorageClass
"""StorageClass associated with Dataset."""

component: Optional[str]
"""Component associated with this file. Can be None if the file does
not refer to a component of a composite."""

checksum: Optional[str]
"""Checksum of the serialized dataset."""

Expand Down