Skip to content

Commit

Permalink
Add subpackage for converting Gen2->Gen3.
Browse files Browse the repository at this point in the history
  • Loading branch information
TallJimbo committed Apr 26, 2018
1 parent 9438069 commit 72722ff
Show file tree
Hide file tree
Showing 8 changed files with 1,371 additions and 0 deletions.
51 changes: 51 additions & 0 deletions config/gen2convert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
mappers:
HscMapper:
# Gen3 Camera name associated with this Gen2 Mapper
camera: HSC
# VisitInfo objects are used to populate Gen3 Visit and Exposure DataUnit
# entries; options here say how to get those from a Gen2 repo.
VisitInfo:
# The Gen2 DatasetType to read when trying to create a VisitInfo.
# (we actually add a "_md" suffix, because we just read the metadata).
DatasetType: raw
# Gen2 Data ID keys for the above DatasetType that are used
# to construct Gen3 Exposure/Visit identifiers.
# At present, only one key can be provided and this is used directly
# as the Gen3 value.
uniqueKeys:
- visit
collections:
substitutions:
# Regular expression patterns and replacement strings (passed directly
# to Python's re.sub), applied sequentially to every Gen2 absolute
# repository path to construct the name of the Collection it goes into.
# This can be used to merge Gen2 repositories into a single Collection
# by making them reduce to the same name.
-
pattern: "^(.+)/rerun/private/"
repl: "u/"
-
pattern: "^(.+)/rerun/"
repl: "shared/"
overrides:
# A dictionary of DatasetType -> Collection mappings that force all
# Datasets with that DatasetType into a particular Collection.
# Collection names can be str.format patterns that utilize any of
# the Gen3 DataUnits or Gen2 DataIds associated with the Dataset.
raw: raw/{camera}
ref_cat: ref/{name}
ref_cat_config: ref/{name}
runs:
# Names of Collections (after processing via the above section)
# that should be assigned to a particular Run.
# Note that the *first* Collection a Dataset is added to determines
# its Run; this will be the one corresponding to the Gen2 repository
# that originally contained the file, unless that has been overridden.
raw/HSC: 1
ref/ps1_pv3_3pi_20170110: 2

skymaps: {} # dictionary mapping repository roots to Gen3 SkyMap names

storageClasses:
# dictionary mapping Gen2 Mapping.persistable to Gen3 StorageClass name
{}
24 changes: 24 additions & 0 deletions python/lsst/daf/butler/gen2convert/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from .walker import ConversionWalker
from .writer import ConversionWriter
from .translators import KeyHandler, Translator
159 changes: 159 additions & 0 deletions python/lsst/daf/butler/gen2convert/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import re
from collections import OrderedDict

from .structures import Gen2Dataset, Gen2DatasetType

__all__ = ("Extractor",)


# Matches one printf-style named substitution in a template string, e.g.
# ``%(visit)07d``.  The ``name`` group captures the key inside the
# parentheses; any characters that follow are skipped lazily up to the first
# of ``i``, ``d``, ``r`` or ``s``, which is captured as the ``type`` group.
TEMPLATE_RE = re.compile(r'\%\((?P<name>\w+)\).*?(?P<type>[idrs])')


class FilePathParser:
    """A callable object that extracts Gen2Dataset instances from filenames
    corresponding to a particular Gen2 DatasetType.

    External code should use the `fromMapping` method to construct instances.

    Parameters
    ----------
    datasetType : `Gen2DatasetType`
        Information about the DatasetType this parser processes.
    regex : regular expression object
        Regular expression pattern with named groups for all data ID keys.
    """

    @classmethod
    def fromMapping(cls, mapping):
        """Construct a FilePathParser instance from a Gen2
        `lsst.obs.base.Mapping` instance.

        Parameters
        ----------
        mapping : `lsst.obs.base.Mapping`
            Object providing ``template``, ``datasetType``, ``persistable``
            and ``python`` attributes and a ``keys()`` method returning a
            dict-like object indexed by data ID key name.

        Returns
        -------
        parser : `FilePathParser` or `None`
            A new parser, or `None` if accessing ``mapping.template`` raises
            `RuntimeError`.
        """
        try:
            template = mapping.template
        except RuntimeError:
            return None
        datasetType = Gen2DatasetType(name=mapping.datasetType,
                                      keys={},
                                      persistable=mapping.persistable,
                                      python=mapping.python)
        matches = list(TEMPLATE_RE.finditer(template))
        if not matches:
            # No substitutions at all: the template is a fixed path that
            # must be matched literally.
            regex = re.compile(re.escape(template))
            return cls(datasetType=datasetType, regex=regex)
        allKeys = mapping.keys()
        terms = []
        literalStart = 0
        for match in matches:
            # Literal text between substitutions (including the text before
            # the first one) must be escaped so that characters such as "."
            # or "+" in the template are not interpreted as regex syntax.
            terms.append(re.escape(template[literalStart:match.start()]))
            name = match.group("name")
            if name == "patch":
                pattern = r"\d+,\d+"
            elif match.group("type") in ("i", "d"):  # integers
                pattern = r"0*\d+"
            else:
                pattern = ".+"
            # only use named groups for the first occurrence of a key
            if name not in datasetType.keys:
                terms.append(r"(?P<%s>%s)" % (name, pattern))
                datasetType.keys[name] = allKeys[name]
            else:
                terms.append(r"(%s)" % pattern)
            literalStart = match.end()
        # Trailing literal text after the last substitution.
        terms.append(re.escape(template[literalStart:]))
        return cls(datasetType=datasetType, regex=re.compile("".join(terms)))

    def __init__(self, datasetType, regex):
        self.datasetType = datasetType
        self.regex = regex

    def __call__(self, filePath, root):
        """Extract a Gen2Dataset instance from the given path.

        Parameters
        ----------
        filePath : `str`
            Path and filename relative to `root`.
        root : `str`
            Absolute path to the root of the Gen2 data repository containing
            this file.

        Returns
        -------
        dataset : `Gen2Dataset` or `None`
            A new dataset description, or `None` if ``filePath`` does not
            fully match this parser's regular expression.
        """
        m = self.regex.fullmatch(filePath)
        if m is None:
            return None
        # Each value in ``keys`` is called on the matched substring to
        # convert it into the data ID value for that key.
        dataId = {k: v(m.group(k)) for k, v in self.datasetType.keys.items()}
        return Gen2Dataset(datasetType=self.datasetType, dataId=dataId,
                           filePath=filePath, root=root)


class Extractor:
    """Turn file paths from one Gen2 data repository into Gen2Dataset
    instances.

    Parameters
    ----------
    repo : `Gen2Repo`
        Structure describing the repository this Extractor will process.
    """

    def __init__(self, repo):
        self.repo = repo
        # One parser per mapping, keyed by DatasetType name; mappings for
        # which no parser can be built (fromMapping returns None) are skipped.
        candidates = (FilePathParser.fromMapping(mapping)
                      for mapping in self.repo.mapper.mappings.values())
        self.parsers = OrderedDict(
            (candidate.datasetType.name, candidate)
            for candidate in candidates
            if candidate is not None
        )

    def __call__(self, filePath):
        """Parse a file path and return a Gen2Dataset that represents it.

        Parameters
        ----------
        filePath : `str`
            A path relative to the root of the data repository.

        Returns
        -------
        dataset : `Gen2Dataset` or None
            A Gen2Dataset instance, or None if the file path is not recognized
            by this mapper.
        """
        for parse in self.parsers.values():
            found = parse(filePath, root=self.repo.root)
            if found is None:
                continue
            # Promote the parser that just succeeded to the front so parsers
            # are always tried in most-recently-used order.
            self.parsers.move_to_end(found.datasetType.name, last=False)
            return found
        return None

    def getDatasetTypes(self):
        """Return a dict mapping DatasetType name to Gen2DatasetType instance."""
        datasetTypes = {}
        for parser in self.parsers.values():
            datasetTypes[parser.datasetType.name] = parser.datasetType
        return datasetTypes

0 comments on commit 72722ff

Please sign in to comment.