Merge pull request #24 from lsst/tickets/DM-22798

DM-22798: Update fgcmcal cookbook on proper usage of checkAllCcds config.
lsst · Jan 14, 2020 · 0f0e2d6 · 0f0e2d6
2 parents 261d5c6 + 48e97ad
commit 0f0e2d6
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 59 deletions.
diff --git a/cookbook/README.md b/cookbook/README.md
@@ -106,20 +106,25 @@ In order to make the fit cycles tractable without redoing processor-intensive
 steps, all data are collated and star observations are matched and indexed
 before the fit runs are started.  Depending on how many visits are being
 processed, and where the data are located, this step can take quite a while.
-(In the future, with a database backend for the butler, I believe this step
+(In the future, with a database backend for the Gen3 Butler, I believe this step
 will be much faster).
 
-The FGCM code can run on a list of visits specified by the `--id` parameter on
-the command line, or it can search the input repository for all visits with
-`src` catalogs generated from `calexps`.  Be aware that FGCM uses **entire
-visits** even if only one CCD is specified.  Also note that input processing
-will be much faster if you specify a single CCD with the visits if `--id` is
-used.  E.g., `--id visit=13376^13450 ccd=13`.  For HSC, using a reference CCD
-to scan for available `src` catalogs speeds things up by x100, which is
-necessary.  You can also specify by field or other valid dataId. A [sample
-config](fgcmBuildStarsHsc.py) for HSC is available.
-
-If `doReferenceCalibration = True` in the configuration (the new default), then
+The FGCM code runs on calexp source catalogs from visits constrained by the
+`--id` parameter on the command line.  Best results are obtained when FGCM is
+run with **full visits**.  Due to limitations in the Gen2 Butler (the only
+Butler currently supported by `fgcmcal`), optimal performance is obtained by
+specifying a single "reference" ccd on the command line (e.g. `ccd=13`) and
+setting the config variable `checkAllCcds = True` (which is the default).  The
+alternative is to specify all the desired CCDs (e.g., `ccd=0..8^10..103`) and
+set `checkAllCcds = False`.  However, this is slower than the first option, and
+the speed advantage of the first option grows with the number of visits
+specified.  If instead you want to process all the visits in a rerun selected
+by filter, field, or some other dataId key, then using a reference ccd and
+setting `checkAllCcds = True` speeds things up by a factor of
+approximately 100 relative to the alternative (naming CCDs explicitly).  For
+config settings, please see the [sample config](fgcmBuildStarsHsc.py).
+
+If `doReferenceCalibration = True` in the configuration (the default), then
 stars from a reference catalog (e.g. PS1) will be loaded and matched to the
 internal stars.  The signal-to-noise cut specified here should be the minimum
 one expects to use in the actual fit, and the fit may also be performed without
@@ -132,9 +137,9 @@ on `lsst-dev01`:
 
 ```bash
 fgcmBuildStars.py /datasets/hsc/repo --rerun \
-private/${USER}/${COOKBOOKRERUN}/lut:private/${USER}/${COOKBOOKRERUN}/wide \
---configfile $FGCMCAL_DIR/cookbook/fgcmBuildStarsHsc.py --id field=SSP_WIDE ccd=13 \
-filter=HSC-G^HSC-R^HSC-I^HSC-Z^HSC-Y
+private/${USER}/${COOKBOOKRERUN}/lut:private/${USER}/${COOKBOOKRERUN}/wide+deep \
+--configfile $FGCMCAL_DIR/cookbook/fgcmBuildStarsHsc.py \
+--id ccd=13 filter=HSC-G^HSC-R^HSC-I^HSC-Z^HSC-Y
 ```
 
 ## Running a Fit Cycle
@@ -176,7 +181,7 @@ run on `lsst-dev01`.
 
 ```bash
 fgcmFitCycle.py /datasets/hsc/repo --rerun \
-private/${USER}/${COOKBOOKRERUN}/wide:private/${USER}/${COOKBOOKRERUN}/fit1 \
+private/${USER}/${COOKBOOKRERUN}/wide+deep:private/${USER}/${COOKBOOKRERUN}/fit1 \
 --configfile $FGCMCAL_DIR/cookbook/fgcmFitCycleHscCookbook_cycle00_config.py \
 |& tee fgcmFitCycleHscCookbook_cycle00.log
 ```

diff --git a/cookbook/fgcmBuildStarsHsc.py b/cookbook/fgcmBuildStarsHsc.py
@@ -6,6 +6,8 @@
 
 from lsst.utils import getPackageDir
 
+# Check repo for all CCDs for each visit
+config.checkAllCcds = True
 # Minimum number of observations per band for a star to be considered for calibration
 config.minPerBand = 2
 # Match radius to associate stars from src catalogs (arcseconds)

diff --git a/python/lsst/fgcmcal/fgcmBuildStars.py b/python/lsst/fgcmcal/fgcmBuildStars.py
@@ -118,11 +118,12 @@ class FgcmBuildStarsConfig(pexConfig.Config):
         default=13,
     )
     checkAllCcds = pexConfig.Field(
-        doc=("Check repo for existence of all possible ccds, in case when we have "
-             "incomplete visits and the referenceCCD will be missing.  Primarily "
-             "used in testing."),
+        doc=("Check repo for all CCDs for each visit specified.  To be used when the "
+             "full set of ids (visit/ccd) are not specified on the command line.  For "
+             "Gen2, specifying one ccd and setting checkAllCcds=True is significantly "
+             "faster than the alternatives."),
         dtype=bool,
-        default=False,
+        default=True,
     )
     visitDataRefName = pexConfig.Field(
         doc="dataRef name for the 'visit' field",
@@ -313,7 +314,7 @@ def _makeArgumentParser(cls):
         """Create an argument parser"""
 
         parser = pipeBase.ArgumentParser(name=cls._DefaultName)
-        parser.add_id_argument("--id", "calexp", help="Data ID, e.g. --id visit=6789 (optional)")
+        parser.add_id_argument("--id", "calexp", help="Data ID, e.g. --id visit=6789")
 
         return parser
 
@@ -506,41 +507,16 @@ def findAndGroupDataRefs(self, butler, dataRefs):
         # will be unnecessary with Gen3 Butler.  This should be part of
         # DM-13730.
 
-        # Note that self.config.checkAllCcds means that we will check the repo
-        # for existence of all possible ccds for each (incomplete) visit, in cases
-        # when the referenceCCD will be missing (primarily during testing).
-
-        # scanAllCcds is a local flag that is set when the dataRefs do not include
-        # all relevant ccds.
-
-        scanAllCcds = False
-        if len(dataRefs) == 0:
-            if not self.config.checkAllCcds:
-                # These dataRefs include all possible visits, for each
-                # referenceCCD.  This is much faster than scanning through
-                # all CCDs when none of them have been processed in a given repo.
-                dataRefs = butler.subset('src',
-                                         dataId={self.config.ccdDataRefName:
-                                                 self.config.referenceCCD})
-                scanAllCcds = True
-            else:
-                # These dataRefs include all possible ccds
-                # Note that in the Gen2 era (at least) a valid dataRef in a subset
-                # only says that a given raw is available, and the src may not
-                # be accessible or processed in a specific repo.
-                dataRefs = butler.subset('src')
-        elif self.config.checkAllCcds:
-            scanAllCcds = True
-
         groupedDataRefs = {}
         for dataRef in dataRefs:
             visit = dataRef.dataId[self.config.visitDataRefName]
             # If we don't have the dataset, just continue
             if not dataRef.datasetExists(datasetType='src'):
                 continue
-            # If we need to scan all ccds, do it here
-            if scanAllCcds:
+            # If we need to check all ccds, do it here
+            if self.config.checkAllCcds:
                 dataId = dataRef.dataId.copy()
+                # For each ccd we must check that a valid source catalog exists.
                 for ccdId in ccdIds:
                     dataId[self.config.ccdDataRefName] = ccdId
                     if butler.datasetExists('src', dataId=dataId):
@@ -552,6 +528,8 @@ def findAndGroupDataRefs(self, butler, dataRefs):
                         else:
                             groupedDataRefs[visit] = [goodDataRef]
             else:
+                # We have already confirmed that the dataset exists, so no need
+                # to check here.
                 if visit in groupedDataRefs:
                     if (dataRef.dataId[self.config.ccdDataRefName] not in
                        [d.dataId[self.config.ccdDataRefName] for d in groupedDataRefs[visit]]):
@@ -567,8 +545,8 @@ def ccdSorter(dataRef):
             else:
                 return ccdId
 
-        # If we did not scan all ccds, put them in ccd order
-        if not scanAllCcds:
+        # If we did not check all ccds, put them in ccd order
+        if not self.config.checkAllCcds:
             for visit in groupedDataRefs:
                 groupedDataRefs[visit] = sorted(groupedDataRefs[visit], key=ccdSorter)
 

diff --git a/tests/fgcmcalTestBase.py b/tests/fgcmcalTestBase.py
@@ -144,14 +144,14 @@ def _testFgcmMakeLut(self, nBand, i0Std, i0Recon, i10Std, i10Recon):
 
         self.assertFloatsAlmostEqual(i10Recon, i1 / i0, msg='i10Recon', rtol=1e-5)
 
-    def _testFgcmBuildStars(self, nVisit, nStar, nObs):
+    def _testFgcmBuildStars(self, visits, nStar, nObs):
         """
         Test running of FgcmBuildStarsTask
 
         Parameters
         ----------
-        nVisit: `int`
-           Number of visits expected
+        visits: `list`
+           List of visits to calibrate
         nStar: `int`
            Number of stars expected
         nObs: `int`
@@ -163,6 +163,7 @@ def _testFgcmBuildStars(self, nVisit, nStar, nObs):
         """
 
         args = [self.inputDir, '--output', self.testDir,
+                '--id', 'visit='+'^'.join([str(visit) for visit in visits]),
                 '--doraise']
         args.extend(self.otherArgs)
 
@@ -172,7 +173,7 @@ def _testFgcmBuildStars(self, nVisit, nStar, nObs):
         butler = dafPersist.butler.Butler(self.testDir)
 
         visitCat = butler.get('fgcmVisitCatalog')
-        self.assertEqual(nVisit, len(visitCat))
+        self.assertEqual(len(visits), len(visitCat))
 
         starIds = butler.get('fgcmStarIds')
         self.assertEqual(nStar, len(starIds))

diff --git a/tests/test_fgcmcal_hsc.py b/tests/test_fgcmcal_hsc.py
@@ -88,11 +88,13 @@ def test_fgcmcalTasks(self):
         self.fillDefaultBuildStarsConfig(self.config, visitDataRefName, ccdDataRefName)
         self.otherArgs = []
 
-        nVisit = 11
+        visits = [903334, 903336, 903338, 903342, 903344, 903346,
+                  903986, 903988, 903990, 904010, 904014]
+
         nStar = 472
         nObs = 5431
 
-        self._testFgcmBuildStars(nVisit, nStar, nObs)
+        self._testFgcmBuildStars(visits, nStar, nObs)
 
         # Perform the fit cycle
         self.config = fgcmcal.FgcmFitCycleConfig()
@@ -170,7 +172,6 @@ def test_fgcmcalTract(self):
 
         self.config = fgcmcal.FgcmCalibrateTractConfig()
         self.fillDefaultBuildStarsConfig(self.config.fgcmBuildStars, visitDataRefName, ccdDataRefName)
-        self.config.fgcmBuildStars.checkAllCcds = False
         self.fillDefaultFitCycleConfig(self.config.fgcmFitCycle)
         self.config.maxFitCycles = 3
 
@@ -233,7 +234,7 @@ def fillDefaultBuildStarsConfig(self, config, visitDataRefName, ccdDataRefName):
         config.filterMap = {'r': 'r', 'i': 'i'}
         config.requiredBands = ['r', 'i']
         config.primaryBands = ['i']
-        config.checkAllCcds = True
+        config.checkAllCcds = False
         config.coarseNside = 64
         config.visitDataRefName = visitDataRefName
         config.ccdDataRefName = ccdDataRefName