DM-23129: Update gen3 ingest to allow for multiple datasets per file #198

Merged (6 commits) on Jan 23, 2020
9 changes: 4 additions & 5 deletions python/lsst/obs/base/cameraMapper.py
@@ -255,9 +255,8 @@ def __init__(self, policy, repositoryDir,
                     posixIfNoSql=False)  # NB never use posix for calibs
             else:
                 raise RuntimeError(
-                    "'needCalibRegistry' is true in Policy, but was unable to locate a repo at " +
-                    "calibRoot ivar:%s or policy['calibRoot']:%s" %
-                    (calibRoot, policy.get('calibRoot', None)))
+                    "'needCalibRegistry' is true in Policy, but was unable to locate a repo at "
+                    f"calibRoot ivar:{calibRoot} or policy['calibRoot']:{policy.get('calibRoot', None)}")
         else:
             self.calibRegistry = None

@@ -1030,8 +1029,8 @@ def _setFilter(self, mapping, item, dataId):
             Dataset identifier.
         """
 
-        if not (isinstance(item, afwImage.ExposureU) or isinstance(item, afwImage.ExposureI) or
-                isinstance(item, afwImage.ExposureF) or isinstance(item, afwImage.ExposureD)):
+        if not (isinstance(item, afwImage.ExposureU) or isinstance(item, afwImage.ExposureI)
+                or isinstance(item, afwImage.ExposureF) or isinstance(item, afwImage.ExposureD)):
             return
 
         if item.getFilter().getId() != afwImage.Filter.UNKNOWN:
6 changes: 3 additions & 3 deletions python/lsst/obs/base/gen2to3/convertRepo.py
@@ -282,9 +282,9 @@ def isDatasetTypeIncluded(self, datasetTypeName: str):
         """
         return (
             any(fnmatch.fnmatchcase(datasetTypeName, pattern)
-                for pattern in self.config.datasetIncludePatterns) and
-            not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
-                    for pattern in self.config.datasetIgnorePatterns)
+                for pattern in self.config.datasetIncludePatterns)
+            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
+                        for pattern in self.config.datasetIgnorePatterns)
         )
 
     def useSkyMap(self, skyMap: BaseSkyMap) -> str:
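
For reference, the include/ignore test above composes two fnmatch passes: a dataset type is converted only when it matches at least one include pattern and no ignore pattern. A standalone illustration with made-up patterns (only fnmatch itself is real API here):

    # Sketch of the include-then-ignore semantics with made-up patterns.
    import fnmatch

    include_patterns = ["raw", "calexp*"]
    ignore_patterns = ["*_config"]
    name = "calexp_background"
    included = (any(fnmatch.fnmatchcase(name, p) for p in include_patterns)
                and not any(fnmatch.fnmatchcase(name, p) for p in ignore_patterns))
    # included is True: the name matches "calexp*" and no ignore pattern.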
4 changes: 2 additions & 2 deletions python/lsst/obs/base/gen2to3/repoConverter.py
@@ -324,8 +324,8 @@ class implementation at some point in their own logic.
         skip = False
         message = None
         storageClass = None
-        if (not self.task.isDatasetTypeIncluded(datasetTypeName) or
-                self.isDatasetTypeSpecial(datasetTypeName)):
+        if (not self.task.isDatasetTypeIncluded(datasetTypeName)
+                or self.isDatasetTypeSpecial(datasetTypeName)):
             # User indicated not to include this data, but we still want
             # to recognize files of that type to avoid warning about them.
             skip = True
6 changes: 3 additions & 3 deletions python/lsst/obs/base/gen2to3/rootRepoConverter.py
@@ -65,10 +65,10 @@ def __init__(self, **kwds):
     def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
         # Docstring inherited from RepoConverter.
         return (
-            super().isDatasetTypeSpecial(datasetTypeName) or
-            datasetTypeName in ("raw", "ref_cat", "ref_cat_config") or
+            super().isDatasetTypeSpecial(datasetTypeName)
+            or datasetTypeName in ("raw", "ref_cat", "ref_cat_config")
             # in Gen2, some of these are in the root repo, not a calib repo
-            datasetTypeName in CURATED_CALIBRATION_DATASET_TYPES
+            or datasetTypeName in CURATED_CALIBRATION_DATASET_TYPES
         )
 
     def getSpecialDirectories(self) -> List[str]:
136 changes: 102 additions & 34 deletions python/lsst/obs/base/ingest.py
@@ -50,9 +50,9 @@
 
 
 @dataclass
-class RawFileData:
-    """Structure that holds information about a single raw file, used during
-    ingest.
+class RawFileDatasetInfo:
+    """Structure that holds information about a single dataset within a
+    raw file.
     """
 
     dataId: DataCoordinate
@@ -72,6 +72,18 @@ class RawFileData:
     (`lsst.sphgeom.ConvexPolygon`).
     """
 
+
+@dataclass
+class RawFileData:
+    """Structure that holds information about a single raw file, used during
+    ingest.
+    """
+
+    datasets: List[RawFileDatasetInfo]
+    """The information describing each dataset within this raw file.
+    (`list` of `RawFileDatasetInfo`)
+    """
+
     filename: str
     """Name of the file this information was extracted from (`str`).
 
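
For orientation, a minimal sketch (not part of the diff) of how the two dataclasses nest after this change; the obsInfo, region, dataId, and formatter values are placeholders:

    # Sketch only: one RawFileData per physical file, one RawFileDatasetInfo
    # per dataset stored inside it. Placeholder values throughout.
    info_a = RawFileDatasetInfo(obsInfo=obs_a, region=region_a, dataId=data_id_a)
    info_b = RawFileDatasetInfo(obsInfo=obs_b, region=region_b, dataId=data_id_b)
    file_data = RawFileData(datasets=[info_a, info_b],
                            filename="raw_0001.fits",
                            FormatterClass=SomeRawFormatter)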
@@ -220,10 +232,42 @@ def extractMetadata(self, filename: str) -> RawFileData:
         as well as the original filename. All fields will be populated,
         but the `RawFileData.dataId` attribute will be a minimal
         (unexpanded) `DataCoordinate` instance.
+
+        Notes
+        -----
+        Assumes that there is a single dataset associated with the given
+        file. Instruments using a single file to store multiple datasets
+        must implement their own version of this method.
         """
         phdu = readMetadata(filename, 0)
         header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
         fix_header(header)
+        datasets = [self._calculate_dataset_info(header, filename)]
+
+        # The data model currently assumes that whilst multiple datasets
+        # can be associated with a single file, they must all share the
+        # same formatter.
+        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)
+
+        return RawFileData(datasets=datasets, filename=filename,
+                           FormatterClass=FormatterClass)
+
+    def _calculate_dataset_info(self, header, filename):
+        """Calculate a RawFileDatasetInfo from the supplied information.
+
+        Parameters
+        ----------
+        header : `Mapping`
+            Header from the dataset.
+        filename : `str`
+            Filename to use for error messages.
+
+        Returns
+        -------
+        dataset : `RawFileDatasetInfo`
+            The region, dataId, and observation information associated with
+            this dataset.
+        """
         obsInfo = ObservationInfo(header)
         dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                             exposure=obsInfo.exposure_id,
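
As the new Notes section says, extractMetadata still assumes one dataset per file, so an instrument that packs several datasets into one file would override it and reuse the helpers added here. A hedged sketch of such an override; the subclass name, the dataset count, and the one-dataset-per-HDU layout are assumptions, not obs_base API:

    from astro_metadata_translator import fix_header, merge_headers
    from lsst.afw.fits import readMetadata
    from lsst.obs.base import RawIngestTask
    from lsst.obs.base.ingest import RawFileData

    class MultiExtensionRawIngestTask(RawIngestTask):
        """Hypothetical task for an instrument with one dataset per HDU."""

        NUM_DATASETS = 2  # assumption for this sketch

        def extractMetadata(self, filename: str) -> RawFileData:
            phdu = readMetadata(filename, 0)
            datasets = []
            for hdu in range(1, self.NUM_DATASETS + 1):
                header = merge_headers([phdu, readMetadata(filename, hdu)],
                                       mode="overwrite")
                fix_header(header)
                datasets.append(self._calculate_dataset_info(header, filename))
            # The data model requires every dataset in a file to share one
            # formatter, so take it from the first dataset.
            FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)
            return RawFileData(datasets=datasets, filename=filename,
                               FormatterClass=FormatterClass)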
@@ -232,7 +276,29 @@ def extractMetadata(self, filename: str) -> RawFileData:
         if obsInfo.instrument != self.instrument.getName():
             raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                              f"got {obsInfo.instrument}) for file {filename}.")
 
+        FormatterClass = self.instrument.getRawFormatter(dataId)
+        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
+        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)
+
+    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
+        """Calculate the sky region covered by the supplied observation
+        information.
+
+        Parameters
+        ----------
+        obsInfo : `~astro_metadata_translator.ObservationInfo`
+            Summary information of this dataset.
+        header : `Mapping`
+            Header from the dataset.
+        FormatterClass: `type` as subclass of `FitsRawFormatterBase`
+            Formatter class that should be used to compute the spatial region.
+
+        Returns
+        -------
+        region : `lsst.sphgeom.ConvexPolygon`
+            Region of sky covered by this observation.
+        """
         if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
             formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
             visitInfo = formatter.makeVisitInfo()
@@ -246,8 +312,7 @@ def extractMetadata(self, filename: str) -> RawFileData:
             region = ConvexPolygon(sphCorners)
         else:
             region = None
-        return RawFileData(obsInfo=obsInfo, region=region, filename=filename,
-                           FormatterClass=FormatterClass, dataId=dataId)
+        return region
 
     def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
         """Group an iterable of `RawFileData` by exposure.
@@ -269,7 +334,8 @@ def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]
         exposureDimensions = self.universe["exposure"].graph
         byExposure = defaultdict(list)
         for f in files:
-            byExposure[f.dataId.subset(exposureDimensions)].append(f)
+            # Assume that the first dataset is representative for the file
+            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)
 
         return [RawExposureData(dataId=dataId, files=exposureFiles)
                 for dataId, exposureFiles in byExposure.items()]
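
The grouping key here is the exposure-dimension subset of the first dataset's data ID, which relies on every dataset in a file belonging to the same exposure; a short sketch of the key (placeholder value in the comment):

    # Sketch: the key keeps only the dimensions that identify an exposure,
    # so a multi-dataset file still lands in exactly one group.
    key = f.datasets[0].dataId.subset(self.universe["exposure"].graph)
    # e.g. roughly {instrument: 'HSC', exposure: 12345}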
@@ -291,40 +357,41 @@ def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
             `RawExposureData.records` populated.
         """
         firstFile = exposure.files[0]
+        firstDataset = firstFile.datasets[0]
         VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
         exposure.records = {
-            "exposure": [makeExposureRecordFromObsInfo(firstFile.obsInfo, self.universe)],
+            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
         }
-        if firstFile.obsInfo.visit_id is not None:
+        if firstDataset.obsInfo.visit_id is not None:
             exposure.records["visit_detector_region"] = []
             visitVertices = []
             for file in exposure.files:
-                if file.obsInfo.visit_id != firstFile.obsInfo.visit_id:
-                    raise ValueError(f"Inconsistent visit/exposure relationship for "
-                                     f"exposure {firstFile.obsInfo.exposure_id} between "
-                                     f"{file.filename} and {firstFile.filename}: "
-                                     f"{file.obsInfo.visit_id} != {firstFile.obsInfo.visit_id}.")
-                if file.region is None:
-                    self.log.warn("No region found for visit=%s, detector=%s.", file.obsInfo.visit_id,
-                                  file.obsInfo.detector_num)
-                    continue
-                visitVertices.extend(file.region.getVertices())
-                exposure.records["visit_detector_region"].append(
-                    VisitDetectorRegionRecordClass.fromDict({
-                        "instrument": file.obsInfo.instrument,
-                        "visit": file.obsInfo.visit_id,
-                        "detector": file.obsInfo.detector_num,
-                        "region": file.region,
-                    })
-                )
+                for dataset in file.datasets:
+                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
+                        raise ValueError(f"Inconsistent visit/exposure relationship for "
+                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
+                                         f"{file.filename} and {firstFile.filename}: "
+                                         f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
+                    if dataset.region is None:
+                        self.log.warn("No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
+                                      dataset.obsInfo.detector_num)
+                        continue
+                    visitVertices.extend(dataset.region.getVertices())
+                    exposure.records["visit_detector_region"].append(
+                        VisitDetectorRegionRecordClass.fromDict({
+                            "instrument": dataset.obsInfo.instrument,
+                            "visit": dataset.obsInfo.visit_id,
+                            "detector": dataset.obsInfo.detector_num,
+                            "region": dataset.region,
+                        })
+                    )
             if visitVertices:
                 visitRegion = ConvexPolygon(visitVertices)
             else:
-                self.log.warn("No region found for visit=%s.", file.obsInfo.visit_id,
-                              file.obsInfo.detector_num)
+                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                 visitRegion = None
             exposure.records["visit"] = [
-                makeVisitRecordFromObsInfo(firstFile.obsInfo, self.universe, region=visitRegion)
+                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
             ]
         return exposure
 
@@ -367,10 +434,11 @@ def expandDataIds(self, data: RawExposureData) -> RawExposureData:
         # one.
         vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
         for file, vdrRecord in zip(data.files, vdrRecords):
-            file.dataId = self.butler.registry.expandDataId(
-                file.dataId,
-                records=dict(data.dataId.records, visit_detector_region=vdrRecord)
-            )
+            for dataset in file.datasets:
+                dataset.dataId = self.butler.registry.expandDataId(
+                    dataset.dataId,
+                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
+                )
         return data
 
     def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
@@ -484,7 +552,7 @@ def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[But
         if butler is None:
             butler = self.butler
         datasets = [FileDataset(path=os.path.abspath(file.filename),
-                                refs=DatasetRef(self.datasetType, file.dataId),
+                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                 formatter=file.FormatterClass)
                     for file in exposure.files]
         butler.ingest(*datasets, transfer=self.config.transfer)
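
Each FileDataset now carries one DatasetRef per dataset in the file rather than a single ref. A sketch of the resulting structure, with a hypothetical path and data IDs:

    # Sketch only: one FileDataset per raw file, one DatasetRef per dataset
    # inside that file. Path and data IDs are hypothetical.
    FileDataset(path="/data/raw_0001.fits",
                refs=[DatasetRef(self.datasetType, data_id_1),
                      DatasetRef(self.datasetType, data_id_2)],
                formatter=file.FormatterClass)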
2 changes: 1 addition & 1 deletion python/lsst/obs/base/mapping.py
@@ -127,7 +127,7 @@ def template(self):
         if self._template:  # template must not be an empty string or None
             return self._template
         else:
-            raise RuntimeError("Template is not defined for the {} dataset type, ".format(self.datasetType) +
+            raise RuntimeError(f"Template is not defined for the {self.datasetType} dataset type, "
                               "it must be set before it can be used.")
 
     def keys(self):
4 changes: 2 additions & 2 deletions setup.cfg
@@ -1,8 +1,8 @@
 [flake8]
 max-line-length = 110
-ignore = E133, E226, E228, N802, N803, N806, N812, N815, N816, W504
+ignore = E133, E226, E228, N802, N803, N806, N812, N815, N816, W503
 exclude = __init__.py, tests/camera/camera.py
 
 [tool:pytest]
 addopts = --flake8
-flake8-ignore = E133 E226 E228 N802 N803 N806 N812 N815 N816 W504
+flake8-ignore = E133 E226 E228 N802 N803 N806 N812 N815 N816 W503
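
Swapping W504 for W503 in the ignore lists is what drives the mechanical reformatting in the Python files above: with W503 now ignored and W504 enforced, flake8 requires line breaks before binary operators rather than after them. An illustration with made-up names:

    # Flagged under the new config (W504: line break after binary operator):
    included = (matches_include and
                not matches_ignore)

    # Accepted under the new config (W503 is now ignored):
    included = (matches_include
                and not matches_ignore)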
8 changes: 4 additions & 4 deletions tests/test_cameraMapper.py
@@ -491,7 +491,7 @@ def testGetRaises(self):
         # (I hope) error message.
         self.assertEqual(
             str(contextManager.exception),
-            'Template is not defined for the raw dataset type, ' +
+            'Template is not defined for the raw dataset type, '
             'it must be set before it can be used.')
         with self.assertRaises(RuntimeError) as contextManager:
             butler.queryMetadata('raw', 'unused', {})
@@ -506,7 +506,7 @@ def testQueryMetadataRaises(self):
         # (I hope) error message.
         self.assertEqual(
             str(contextManager.exception),
-            'Template is not defined for the raw dataset type, ' +
+            'Template is not defined for the raw dataset type, '
             'it must be set before it can be used.')
 
     def testFilenameRaises(self):
@@ -519,7 +519,7 @@ def testFilenameRaises(self):
         # (I hope) error message.
         self.assertEqual(
             str(contextManager.exception),
-            'Template is not defined for the raw dataset type, ' +
+            'Template is not defined for the raw dataset type, '
             'it must be set before it can be used.')
 
     def testWcsRaises(self):
@@ -532,7 +532,7 @@ def testWcsRaises(self):
         # (I hope) error message.
         self.assertEqual(
             str(contextManager.exception),
-            'Template is not defined for the raw dataset type, ' +
+            'Template is not defined for the raw dataset type, '
             'it must be set before it can be used.')
 
     def testConflictRaises(self):