-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DM-26683: Make exporting dimension data friendlier #375
Changes from 1 commit
0c3ddb0
5578084
d062e3e
2031534
0c3008f
b3ce605
35d518c
7baced3
b10c432
7eef36c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,10 +28,7 @@ | |
Dict, | ||
Iterable, | ||
List, | ||
Mapping, | ||
MutableMapping, | ||
Optional, | ||
Set, | ||
Tuple, | ||
) | ||
from collections import defaultdict | ||
|
@@ -57,7 +54,7 @@ class RepoExportContext: | |
|
||
with butler.export(filename="export.yaml") as export: | ||
export.saveDataIds(...) | ||
export.saveDatasts(...) | ||
export.saveDatasets(...) | ||
|
||
Parameters | ||
---------- | ||
|
@@ -80,7 +77,9 @@ def __init__(self, registry: Registry, datastore: Datastore, backend: RepoExport | |
self._backend = backend | ||
self._directory = directory | ||
self._transfer = transfer | ||
self._dataset_ids: Set[int] = set() | ||
self._records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) | ||
self._dataset_ids = set() | ||
self._datasets: Dict[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list) | ||
|
||
def saveDataIds(self, dataIds: Iterable[DataCoordinate], *, | ||
elements: Optional[Iterable[DimensionElement]] = None) -> None: | ||
|
@@ -99,13 +98,10 @@ def saveDataIds(self, dataIds: Iterable[DataCoordinate], *, | |
if element.hasTable() and element.viewOf is None) | ||
else: | ||
elements = frozenset(elements) | ||
records: MutableMapping[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) | ||
for dataId in dataIds: | ||
for record in dataId.records.values(): | ||
if record is not None and record.definition in elements: | ||
records[record.definition].setdefault(record.dataId, record) | ||
for element in self._registry.dimensions.sorted(records.keys()): | ||
self._backend.saveDimensionData(element, *records[element].values()) | ||
self._records[record.definition].setdefault(record.dataId, record) | ||
|
||
def saveDatasets(self, refs: Iterable[DatasetRef], *, | ||
elements: Optional[Iterable[DimensionElement]] = None, | ||
|
@@ -139,7 +135,6 @@ def saveDatasets(self, refs: Iterable[DatasetRef], *, | |
future (once `Registry` provides a way to look up that information). | ||
""" | ||
dataIds = set() | ||
datasets: Mapping[Tuple[DatasetType, str], List[FileDataset]] = defaultdict(list) | ||
for ref in refs: | ||
# The query interfaces that are often used to generate the refs | ||
# passed here often don't remove duplicates, so do that here for | ||
|
@@ -155,14 +150,16 @@ def saveDatasets(self, refs: Iterable[DatasetRef], *, | |
exports = [rewrite(export) for export in exports] | ||
self._dataset_ids.add(ref.getCheckedId()) | ||
assert ref.run is not None | ||
datasets[ref.datasetType, ref.run].extend(exports) | ||
self._datasets[ref.datasetType, ref.run].extend(exports) | ||
self.saveDataIds(dataIds, elements=elements) | ||
for (datasetType, run), records in datasets.items(): | ||
self._backend.saveDatasets(datasetType, run, *records) | ||
|
||
def _finish(self) -> None: | ||
"""Delegate to the backend to finish the export process. | ||
|
||
For use by `Butler.export` only. | ||
""" | ||
for element in self._registry.dimensions.sorted(self._records.keys()): | ||
self._backend.saveDimensionData(element, *self._records[element].values()) | ||
for (datasetType, run), records in self._datasets.items(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could this benefit from a sort? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll add one. As in the previous case, we'll need to add sorting in the line below as well to make the order fully deterministic, but we might as well start. |
||
self._backend.saveDatasets(datasetType, run, *records) | ||
self._backend.finish() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this sorting help us out at all with DM-26324 ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's a start, but we'd also need to sort the iterable provided on the line below, and that's harder, because we don't have a way to sort data IDs or dimension records (something I realized after my Jira post about trying it out).