Add subpackage for converting Gen2->Gen3.

lsst · Apr 26, 2018 · 72722ff · 72722ff
1 parent 9438069
commit 72722ff
Show file tree

Hide file tree

Showing 8 changed files with 1,371 additions and 0 deletions.
diff --git a/config/gen2convert.yaml b/config/gen2convert.yaml
@@ -0,0 +1,51 @@
+mappers:
+  HscMapper:
+    # Gen3 Camera name associated with this Gen2 Mapper
+    camera: HSC
+    # VisitInfo objects are used to populate Gen3 Visit and Exposure DataUnit
+    # entries; options here say how to get those from a Gen2 repo.
+    VisitInfo:
+      # The Gen2 DatasetType to read when trying to create a VisitInfo.
+      # (we actually add a "_md" suffix, because we just read the metadata).
+      DatasetType: raw
+      # Gen2 Data ID keys for the above DatasetType that are used
+      # to construct Gen3 Exposure/Visit identifiers.
+      # At present, only one key can be provided and this is used directly
+      # as the Gen3 value.
+      uniqueKeys:
+        - visit
+collections:
+  substitutions:
+    # Regular expression patterns and replacement strings (passed directly
+    # to Python's re.sub) that are applied, one after another in the order
+    # listed, to every absolute Gen2 repository path to construct the name
+    # of the Collection its Datasets go into.
+    # This can be used to merge Gen2 repositories into a single Collection
+    # by making them reduce to the same name.
+    -
+      pattern: "^(.+)/rerun/private/"
+      repl: "u/"
+    -
+      pattern: "^(.+)/rerun/"
+      repl: "shared/"
+  overrides:
+    # A dictionary of DatasetType -> Collection mappings that force all
+    # Datasets with that DatasetType into a particular Collection.
+    # Collection names can be str.format patterns that utilize any of
+    # the Gen3 DataUnits or Gen2 DataIds associated with the Dataset.
+    raw: raw/{camera}
+    ref_cat: ref/{name}
+    ref_cat_config: ref/{name}
+runs:
+  # Names of Collections (after processing via the above section)
+  # that should be assigned to a particular Run.
+  # Note that the *first* Collection a Dataset is added to determines
+  # its Run; this will be the one corresponding to the Gen2 repository
+  # that originally contained the file, unless that has been overridden.
+  raw/HSC: 1
+  ref/ps1_pv3_3pi_20170110: 2
+
+skymaps: {}  # dictionary mapping repository roots to Gen3 SkyMap names
+
+storageClasses:
+  # dictionary mapping Gen2 Mapping.persistable to Gen3 StorageClass name
+  {}
diff --git a/python/lsst/daf/butler/gen2convert/__init__.py b/python/lsst/daf/butler/gen2convert/__init__.py
@@ -0,0 +1,24 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from .walker import ConversionWalker
+from .writer import ConversionWriter
+from .translators import KeyHandler, Translator
diff --git a/python/lsst/daf/butler/gen2convert/extractor.py b/python/lsst/daf/butler/gen2convert/extractor.py
@@ -0,0 +1,159 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import re
+from collections import OrderedDict
+
+from .structures import Gen2Dataset, Gen2DatasetType
+
+__all__ = ("Extractor",)
+
+
+# Matches a single printf-style field in a mapping template, such as
+# ``%(visit)07d``: the ``name`` group captures the key inside the
+# parentheses, and the ``type`` group captures the first following character
+# that is one of the conversion types ``i``, ``d``, ``r`` or ``s`` (anything
+# in between, e.g. width/padding flags, is skipped non-greedily).
+TEMPLATE_RE = re.compile(r'\%\((?P<name>\w+)\).*?(?P<type>[idrs])')
+
+
+class FilePathParser:
+    """A callable object that extracts Gen2Dataset instances from filenames
+    corresponding to a particular Gen2 DatasetType.
+
+    External code should use the `fromMapping` method to construct instances.
+
+    Parameters
+    ----------
+    datasetType : `Gen2DatasetType`
+        Information about the DatasetType this parser processes.
+    regex : regular expression object
+        Regular expression pattern with named groups for all data ID keys.
+    """
+
+    @classmethod
+    def fromMapping(cls, mapping):
+        """Construct a FilePathParser instance from a Gen2
+        `lsst.obs.base.Mapping` instance.
+        """
+        try:
+            template = mapping.template
+        except RuntimeError:
+            return None
+        datasetType = Gen2DatasetType(name=mapping.datasetType,
+                                      keys={},
+                                      persistable=mapping.persistable,
+                                      python=mapping.python)
+        iterator = TEMPLATE_RE.finditer(template)
+        try:
+            m1 = iterator.__next__()
+        except StopIteration:
+            regex = re.compile(re.escape(template))
+            return cls(datasetType=datasetType, regex=regex)
+        terms = [template[:m1.start()]]
+        allKeys = mapping.keys()
+        try:
+            while True:
+                name = m1.group("name")
+                if name == "patch":
+                    pattern = r"\d+,\d+"
+                elif m1.group("type") in "id":  # integers
+                    pattern = r"0*\d+"
+                else:
+                    pattern = ".+"
+                # only use named groups for the first occurence of a key
+                if name not in datasetType.keys:
+                    terms.append(r"(?P<%s>%s)" % (name, pattern))
+                    datasetType.keys[name] = allKeys[name]
+                else:
+                    terms.append(r"(%s)" % pattern)
+                m2 = iterator.__next__()
+                terms.append(re.escape(template[m1.end():m2.start()]))
+                m1 = m2
+        except StopIteration:
+            pass
+        terms.append(re.escape(template[m1.end():]))
+        return cls(datasetType=datasetType, regex=re.compile("".join(terms)))
+
+    def __init__(self, datasetType, regex):
+        self.datasetType = datasetType
+        self.regex = regex
+
+    def __call__(self, filePath, root):
+        """Extract a Gen2Dataset instance from the given path.
+
+        Parameters
+        ----------
+        filePath : `str`
+            Path and filename relative to `root`.
+        root : `str`
+            Absolute path to the root of the Gen2 data repository containing
+            this file.
+        """
+        m = self.regex.fullmatch(filePath)
+        if m is None:
+            return None
+        dataId = {k: v(m.group(k)) for k, v in self.datasetType.keys.items()}
+        return Gen2Dataset(datasetType=self.datasetType, dataId=dataId,
+                           filePath=filePath, root=root)
+
+
+class Extractor:
+    """An object that parses Gen2 paths into Gen2Dataset instance for a
+    particular Gen2 data repository.
+
+    Parameters
+    ----------
+    repo : `Gen2Repo`
+        Structure describing the repository this Extractor will process.
+    """
+
+    def __init__(self, repo):
+        self.repo = repo
+        self.parsers = OrderedDict()
+        for mapping in self.repo.mapper.mappings.values():
+            parser = FilePathParser.fromMapping(mapping)
+            if parser is not None:
+                self.parsers[parser.datasetType.name] = parser
+
+    def __call__(self, filePath):
+        """Parse a file path and return a Gen2Dataset that represents it.
+
+        Parameters
+        ----------
+        filePath : `str`
+            A path relative to the root of the data repository.
+
+        Returns
+        -------
+        dataset : `Gen2Dataset` or None
+            A Gen2Dataset instance, or None if the file path is not recognized
+            by this mapper.
+        """
+        for parser in self.parsers.values():
+            dataset = parser(filePath, root=self.repo.root)
+            if dataset is not None:
+                break
+        else:
+            return None
+        # Move the parser we just used to the front of the OrderedDict so we
+        # always try them in MRU order.
+        self.parsers.move_to_end(dataset.datasetType.name, last=False)
+        return dataset
+
+    def getDatasetTypes(self):
+        """Return a dict mapping DatasetType name to Gen2DatasetType instance."""
+        return {parser.datasetType.name: parser.datasetType for parser in self.parsers.values()}