Reorganization to split file reading from indexing
SimonKrughoff committed Mar 31, 2016
1 parent ee000d0 commit bbd76ce
Showing 4 changed files with 139 additions and 96 deletions.
4 changes: 2 additions & 2 deletions bin.src/ingestReferenceCatalog.py
@@ -1,3 +1,3 @@
#!/usr/bin/env python
from lsst.pipe.tasks.htmIndexEngine import HtmIndexEngine
HtmIndexEngine.parseAndRun()
from lsst.pipe.tasks.indexReferenceTask import IngestIndexedReferenceTask
IngestIndexedReferenceTask.parseAndRun()
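
For context, the renamed task keeps the same parseAndRun entry point the bin script calls, so it can also be driven from Python. A hedged sketch follows (the repository path, output path, config file, and catalog file names are hypothetical); it mirrors the parseAndRun call in tests/testHtmIndex.py further down.

from lsst.pipe.tasks.indexReferenceTask import IngestIndexedReferenceTask

# Sketch only: an input butler repository, an output repository, a config override
# file, and one or more text catalog files (the new "files" positional argument).
IngestIndexedReferenceTask.parseAndRun(
    args=["/path/to/input_repo",
          "--output", "/path/to/output_repo",
          "--configfile", "ingest_config.py",
          "ref_catalog.txt"])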
58 changes: 58 additions & 0 deletions python/lsst/pipe/tasks/htmIndexer.py
@@ -0,0 +1,58 @@
#
# LSST Data Management System
# Copyright 2008-2016 AURA/LSST.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program. If not,
# see <https://www.lsstcorp.org/LegalNotices/>.
#
import esutil


class HtmIndexer(object):
def __init__(self, depth=8):
"""!Construct the indexer object
@param[in] depth depth of the hierarchy to construct
"""
self.htm = esutil.htm.HTM(depth)
# HACK need to call intersect first otherwise it segfaults
self.htm.intersect(1., 2., 0.00001)

def get_pixel_ids(self, ctrCoord, radius):
"""!Get all shards that touch a circular aperture
@param[in] ctrCoord afwCoord.Coord object of the center of the aperture
@param[in] radius afwGeom.Angle object of the aperture radius
@param[out] A tuple of the list of shard ids, pixel_id_list, and an iterable of booleans,
is_on_boundary, indicating whether each shard touches the boundary (True) or is fully contained (False).
"""
pixel_id_list = self.htm.intersect(ctrCoord.getRa().asDegrees(), ctrCoord.getDec().asDegrees(),
radius.asDegrees(), inclusive=True)
covered_pixel_id_list = self.htm.intersect(ctrCoord.getRa().asDegrees(),
ctrCoord.getDec().asDegrees(),
radius.asDegrees(), inclusive=False)
is_on_boundary = (pixel_id not in covered_pixel_id_list for pixel_id in pixel_id_list)
return pixel_id_list, is_on_boundary

def index_points(self, ra_list, dec_list):
"""!Generate trixel ids for each row in an input file
@param[in] ra_list List of RA coordinates in degrees
@param[in] dec_list List of Dec coordinates in degrees
@param[out] A list of pixel ids
"""
return self.htm.lookup_id(ra_list, dec_list)
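
A minimal usage sketch for the new indexer (not part of the diff), assuming the LSST afw coordinate and geometry primitives and esutil are set up; the center, radius, and coordinate values are illustrative.

import lsst.afw.coord as afwCoord
import lsst.afw.geom as afwGeom
from lsst.pipe.tasks.htmIndexer import HtmIndexer

indexer = HtmIndexer(depth=8)

# All shards overlapping a 0.2 degree aperture centered at (RA, Dec) = (10, -5) degrees.
center = afwCoord.Coord(afwGeom.Angle(10.0, afwGeom.degrees),
                        afwGeom.Angle(-5.0, afwGeom.degrees))
radius = afwGeom.Angle(0.2, afwGeom.degrees)
shard_ids, is_on_boundary = indexer.get_pixel_ids(center, radius)
boundary_shards = [s for s, on_edge in zip(shard_ids, is_on_boundary) if on_edge]

# Per-row trixel ids for arrays of RA/Dec in degrees, as used by create_indexed_catalog.
trixel_ids = indexer.index_points([10.0, 10.1], [-5.0, -5.05])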
165 changes: 75 additions & 90 deletions python/lsst/pipe/tasks/indexReferenceTask.py
@@ -25,17 +25,84 @@
import lsst.afw.coord as afwCoord
import lsst.afw.geom as afwGeom
from lsst.afw.image import fluxFromABMag, fluxErrFromABMagErr
from .htmIndexer import HtmIndexer as Indexer

import esutil
import numpy


class TextReaderConfig(pexConfig.Config):
header_lines = pexConfig.Field(
dtype=int,
default=0,
doc='Number of lines to skip when reading the text reference file.'
)
colnames = pexConfig.ListField(
dtype=str,
default=[],
doc="""An ordered list of column names to use in ingesting the catalog. With an empty
list, column names will be discovered from the first line after the skipped header lines."""
)
delimiter = pexConfig.Field(
dtype=str,
default=',',
doc='Delimiter to use when reading text reference files. Comma is default.'
)


class TextReaderTask(pipeBase.Task):
_DefaultName = 'TextReaderTask'
ConfigClass = TextReaderConfig

def readFile(self, filename):
names = True
if self.config.colnames:
names = self.config.colnames
arr = numpy.genfromtxt(filename, dtype=None, skip_header=self.config.header_lines,
delimiter=self.config.delimiter,
names=names)
# Just in case someone has only one line in the file.
return numpy.atleast_1d(arr)


class IngestReferenceRunner(pipeBase.TaskRunner):
"""!Task runner for the reference catalog ingester
Data IDs are ignored so the runner should just run the task on the parsed command.
"""
def run(self, parsedCmd):
"""!Run the task.
Several arguments need to be collected to send on to the task methods.
@param[in] parsedCmd Parsed command including command line arguments.
@param[out] Struct containing the result of the indexing.
"""
files = parsedCmd.files
butler = parsedCmd.butler
task = self.TaskClass(config=self.config, log=self.log, butler=butler)
task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

result = task.create_indexed_catalog(files)
if self.doReturnResults:
return pipeBase.Struct(
result = result,
)


class IngestIndexedReferenceConfig(pexConfig.Config):
ref_dataset_name = pexConfig.Field(
dtype=str,
default='cal_ref_cat',
doc='String to pass to the butler to retrieve persisted files.',
)
level = pexConfig.Field(
dtype=int,
default=8,
doc='Default HTM level. Level 8 gives ~0.08 sq deg per trixel.',
)
file_reader = pexConfig.ConfigurableField(
target=TextReaderTask,
doc='Task to use to read the files. Default is to expect text files.'
)
ra_name = pexConfig.Field(
dtype=str,
doc="Name of RA column",
@@ -80,27 +147,6 @@ class IngestIndexedReferenceConfig(pexConfig.Config):
default=[],
doc='Extra columns to add to the reference catalog.'
)
header_lines = pexConfig.Field(
dtype=int,
default=0,
doc='Number of lines to skip when reading the text reference file.'
)
colnames = pexConfig.ListField(
dtype=str,
default=[],
doc="""An ordered list of column names to use in ingesting the catalog. With an empty
list, column names will be discovered from the first line after the skipped header lines."""
)
delimiter = pexConfig.Field(
dtype=str,
default=',',
doc='Delimiter to use when reading text reference files. Comma is default.'
)
level = pexConfig.Field(
dtype=int,
default=8,
doc='Default HTM level. Level 8 gives ~0.08 sq deg per trixel.',
)

def validate(self):
pexConfig.Config.validate(self)
@@ -111,28 +157,6 @@ def validate(self):
raise ValueError("If magnitude errors are provided, all magnitudes must have an error column")


class IngestReferenceRunner(pipeBase.TaskRunner):
"""!Task runner for the reference catalog ingester
"""
def run(self, parsedCmd):
"""!Run the task.
Several arguments need to be collected to send on to the task methods.
@param[in] parsedCmd Parsed command including command line arguments.
@param[out] Struct containing the result of the indexing.
"""
files = parsedCmd.files
butler = parsedCmd.butler
task = self.TaskClass(config=self.config, log=self.log, butler=butler)
task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

result = task.create_indexed_catalog(files)
if self.doReturnResults:
return pipeBase.Struct(
result = result,
)


class IngestIndexedReferenceTask(pipeBase.CmdLineTask):
"""!Class for both producing indexed reference catalogs and for loading them.
@@ -141,6 +165,7 @@ class IngestIndexedReferenceTask(pipeBase.CmdLineTask):
shards. In this case each shard contains the entries from the catalog in a single
HTM trixel
"""
canMultiprocess = False
ConfigClass = IngestIndexedReferenceConfig
RunnerClass = IngestReferenceRunner
_DefaultName = 'IngestIndexedReferenceTask'
@@ -150,8 +175,10 @@ class IngestIndexedReferenceTask(pipeBase.CmdLineTask):
@classmethod
def _makeArgumentParser(cls):
"""Create an argument parser
This overrides the original because we need the file arguments
"""
parser = pipeBase.ArgumentParser(name=cls._DefaultName)
parser = pipeBase.InputOnlyArgumentParser(name=cls._DefaultName)
parser.add_argument("files", nargs="+", help="Names of files to index")
return parser

@@ -162,7 +189,8 @@ def __init__(self, *args, **kwargs):
"""
self.butler = kwargs.pop('butler')
pipeBase.Task.__init__(self, *args, **kwargs)
self.indexer = HtmIndexer(self.config.level)
self.indexer = Indexer(self.config.level)
self.makeSubtask('file_reader')

def create_indexed_catalog(self, files):
"""!Index a set of files comprising a reference catalog. Outputs are persisted in the
@@ -173,14 +201,7 @@ def create_indexed_catalog(self, files):
rec_num = 0
first = True
for filename in files:
names = True
if self.config.colnames:
names = self.config.colnames
arr = numpy.genfromtxt(filename, dtype=None, skip_header=self.config.header_lines,
delimiter=self.config.delimiter,
names=names)
# Just in case someone has only one line in the file.
arr = numpy.atleast1d(arr)
arr = self.file_reader.readFile(filename)
index_list = self.indexer.index_points(arr[self.config.ra_name], arr[self.config.dec_name])
if first:
schema, key_map = self.make_schema(arr.dtype)
@@ -325,39 +346,3 @@ def add_field(name):
for col in self.config.extra_col_names:
key_map[col] = add_field(col)
return schema, key_map


class HtmIndexer(object):
def __init__(self, depth=8):
"""!Construct the indexer object
@param[in] depth depth of the hierarchy to construct
"""
self.htm = esutil.htm.HTM(depth)
# HACK need to call intersect first otherwise it segfaults
self.htm.intersect(1., 2., 0.00001)

def get_pixel_ids(self, ctrCoord, radius):
"""!Get all shards that touch a circular aperture
@param[in] ctrCoord afwCoord.Coord object of the center of the aperture
@param[in] radius afwGeom.Angle object of the aperture radius
@param[out] A tuple of the list of shard ids, pixel_id_list, and an iterable of booleans,
is_on_boundary, indicating whether each shard touches the boundary (True) or is fully contained (False).
"""
pixel_id_list = self.htm.intersect(ctrCoord.getRa().asDegrees(), ctrCoord.getDec().asDegrees(),
radius.asDegrees(), inclusive=True)
covered_pixel_id_list = self.htm.intersect(ctrCoord.getRa().asDegrees(),
ctrCoord.getDec().asDegrees(),
radius.asDegrees(), inclusive=False)
is_on_boundary = (pixel_id not in covered_pixel_id_list for pixel_id in pixel_id_list)
return pixel_id_list, is_on_boundary

def index_points(self, ra_list, dec_list):
"""!Generate trixel ids for each row in an input file
@param[in] ra_list List of RA coordinates in degrees
@param[in] dec_list List of Dec coordinates in degrees
@param[out] A list of pixel ids
"""
return self.htm.lookup_id(ra_list, dec_list)
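
Because header_lines, colnames, and delimiter now belong to the file_reader subtask, overrides go through that subtask (as the test changes below show). Here is a hedged sketch of a --configfile using only fields visible in this diff; the column names are placeholders.

# ingest_config.py -- sketch of a config override file for IngestIndexedReferenceTask.
config.ref_dataset_name = "cal_ref_cat"
config.level = 8                                  # HTM depth handed to the indexer
config.ra_name = "ra"
config.dec_name = "dec"
config.id_name = "id"
config.extra_col_names = ["val1", "val2", "val3"]
config.file_reader.header_lines = 1               # skip one header line
config.file_reader.delimiter = "|"                # pipe-delimited text
config.file_reader.colnames = ["id", "ra", "dec", "val1", "val2", "val3"]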
8 changes: 4 additions & 4 deletions tests/testHtmIndex.py
@@ -34,7 +34,7 @@
import lsst.afw.geom as afwGeom
import lsst.afw.coord as afwCoord
import lsst.daf.persistence as dafPersist
from lsst.pipe.tasks.htmIndexEngine import IngestIndexedReferenceTask
from lsst.pipe.tasks.indexReferenceTask import IngestIndexedReferenceTask
from lsst.pipe.tasks.loadIndexedReferenceObjects import LoadIndexedReferenceObjectsTask

obs_test_dir = lsst.utils.getPackageDir('obs_test')
@@ -179,10 +179,10 @@ def testIngest(self):
default_config.is_variable_name = 'is_var'
default_config.id_name = 'id'
default_config.extra_col_names = ['val1', 'val2', 'val3']
default_config.header_lines = 1
default_config.colnames = ['id', 'ra', 'dec', 'a', 'a_err', 'b', 'b_err', 'is_phot', 'is_res',
default_config.file_reader.header_lines = 1
default_config.file_reader.colnames = ['id', 'ra', 'dec', 'a', 'a_err', 'b', 'b_err', 'is_phot', 'is_res',
'is_var', 'val1', 'val2', 'val3']
default_config.delimiter = '|'
default_config.file_reader.delimiter = '|'
# this also tests changing the delimiter
IngestIndexedReferenceTask.parseAndRun(
args=[input_dir, "--output", self.out_path+"/output_override",

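Reading the ingested shards back is not touched by this commit and goes through LoadIndexedReferenceObjectsTask, which the test imports. The sketch below is an assumption: the constructor and loadSkyCircle signature follow the standard reference-loader interface and are not shown in this diff; paths and the filter name are hypothetical.

import lsst.afw.coord as afwCoord
import lsst.afw.geom as afwGeom
import lsst.daf.persistence as dafPersist
from lsst.pipe.tasks.loadIndexedReferenceObjects import LoadIndexedReferenceObjectsTask

# Assumed interface, hedged: a butler pointing at the ingest output repository.
butler = dafPersist.Butler("/path/to/output_repo")
loader = LoadIndexedReferenceObjectsTask(butler=butler)
center = afwCoord.IcrsCoord(afwGeom.Angle(10.0, afwGeom.degrees),
                            afwGeom.Angle(-5.0, afwGeom.degrees))
result = loader.loadSkyCircle(center, afwGeom.Angle(0.2, afwGeom.degrees), filterName="a")
ref_cat = result.refCat  # afw table of reference sources inside the circle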