Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-26683: Make exporting dimension data friendlier #375

Merged
merged 10 commits into from
Sep 17, 2020
23 changes: 10 additions & 13 deletions python/lsst/daf/butler/transfers/_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,7 @@
Dict,
Iterable,
List,
Mapping,
MutableMapping,
Optional,
Set,
Tuple,
)
from collections import defaultdict
Expand All @@ -57,7 +54,7 @@ class RepoExportContext:

with butler.export(filename="export.yaml") as export:
export.saveDataIds(...)
export.saveDatasts(...)
export.saveDatasets(...)

Parameters
----------
Expand All @@ -80,7 +77,9 @@ def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExport
self._backend = backend
self._directory = directory
self._transfer = transfer
self._dataset_ids: Set[int] = set()
self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
self._dataset_ids = set()
self._datasets: Dict[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)

def saveDataIds(self, dataIds: Iterable[DataCoordinate], *,
elements: Optional[Iterable[DimensionElement]] = None) -> None:
Expand All @@ -99,13 +98,10 @@ def saveDataIds(self, dataIds: Iterable[DataCoordinate], *,
if element.hasTable() and element.viewOf is None)
else:
elements = frozenset(elements)
records: MutableMapping[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
for dataId in dataIds:
for record in dataId.records.values():
if record is not None and record.definition in elements:
records[record.definition].setdefault(record.dataId, record)
for element in self._registry.dimensions.sorted(records.keys()):
self._backend.saveDimensionData(element, *records[element].values())
self._records[record.definition].setdefault(record.dataId, record)

def saveDatasets(self, refs: Iterable[DatasetRef], *,
elements: Optional[Iterable[DimensionElement]] = None,
Expand Down Expand Up @@ -139,7 +135,6 @@ def saveDatasets(self, refs: Iterable[DatasetRef], *,
future (once `Registry` provides a way to look up that information).
"""
dataIds = set()
datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list)
for ref in refs:
# The query interfaces that are often used to generate the refs
# passed here often don't remove duplicates, so do that here for
Expand All @@ -155,14 +150,16 @@ def saveDatasets(self, refs: Iterable[DatasetRef], *,
exports = [rewrite(export) for export in exports]
self._dataset_ids.add(ref.getCheckedId())
assert ref.run is not None
datasets[ref.datasetType, ref.run].extend(exports)
self._datasets[ref.datasetType, ref.run].extend(exports)
self.saveDataIds(dataIds, elements=elements)
for (datasetType, run), records in datasets.items():
self._backend.saveDatasets(datasetType, run, *records)

def _finish(self) -> None:
"""Delegate to the backend to finish the export process.

For use by `Butler.export` only.
"""
for element in self._registry.dimensions.sorted(self._records.keys()):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this sorting help us out at all with DM-26324 ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a start, but we'd also need to sort the iterable provided on the line below, and that's harder, because we don't have a way to sort data IDs or dimension records (something I realized after my Jira post about trying it out).

self._backend.saveDimensionData(element, *self._records[element].values())
for (datasetType, run), records in self._datasets.items():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this benefit from a sort?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add one. As in the previous case, we'll need to add sorting in the line below as well to make the order fully deterministic, but we might as well start.

self._backend.saveDatasets(datasetType, run, *records)
self._backend.finish()
3 changes: 3 additions & 0 deletions tests/test_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,9 @@ def runImportExportTest(self, storageClass):
exportFile = os.path.join(exportDir, "exports.yaml")
with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
export.saveDatasets(datasets)
# Save one of the data IDs again; this should be harmless
# because of internal deduplication.
export.saveDataIds([datasets[0].dataId])
self.assertTrue(os.path.exists(exportFile))
with tempfile.TemporaryDirectory() as importDir:
# We always want this to be a local posix butler
Expand Down