Improve documentation. Rename functions to clarify intent.
Return pipeBase.Struct() for previous bare tuples from calcPA1 and calcPA2.
While technically a functional change, this is effectively a change
to improve the documentation and clarity of what's being passed around.

Document new files in the manifest in README.md
wmwv committed Feb 4, 2016
1 parent 80282be commit d7cc23f
Showing 4 changed files with 122 additions and 96 deletions.
15 changes: 11 additions & 4 deletions README.md
@@ -1,6 +1,6 @@
A set of utilities to run the processCcd task on some
CFHT data and DECam data
and validate the astrometry of the results.
and validate the astrometric and photometric repeatability of the results.

Pre-requisites: install and declare the following
1. `pipe_tasks` from the LSST DM stack (note that pipe_tasks is included with lsst_apps, which is the usual thing to install)
@@ -12,7 +12,7 @@ Pre-requisites: install and declare the following
The `obs_decam`, `obs_cfht`, `validation_data_cfht`, `validation_data_decam`, `validate_drp` products are also buildable by the standard LSST DM stack tools: `lsstsw` or `eups distrib`. But they (intentionally) aren't in the dependency tree of `lsst_apps`. If you have a stack already installed with `lsst_apps`, you can install these in the same manner. E.g.,

```
eups distrib obs_decam obs_cfht validation_data_decam validation_data_cfht validate_drp
eups distrib install obs_decam obs_cfht validation_data_decam validation_data_cfht validate_drp
```

XOR
@@ -100,8 +100,8 @@ Once these basic steps are completed, then you can run any of the following:

Note that the example validation test selects several of the CCDs and will fail if you just pass it a repository with 1 visit or just 1 CCD.

Files :
-------
Files of Interest:
------------------
* `examples/runCfhtTest.sh` : CFHT Run initialization, ingest, measurement, and astrometry validation.
* `examples/runDecamTest.sh` : DECam Run initialization, ingest, measurement, and astrometry validation.
* `examples/validateCfht.py` : CFHT run some analysis on the output data produced by processCcd.py
@@ -110,4 +110,11 @@ Files :
* `examples/runDecam.list` : DECam list of visits / ccd to be processed by processCcd
* `config/newAstrometryConfig.py` : configuration for running processCcd with the new AstrometryTask
* `config/anetAstrometryConfig.py` : configuration for running processCcd ANetAstrometryTask
* `bin.src/validateCfht.py` : Wrapper script to analyze CFHT data in validation_data_cfht
* `bin.src/validateDecam.py` : Wrapper script to analyze DECam data in validation_data_decam
* `bin.src/validateDecamCosmos.py` : Wrapper script to analyze larger set of DECam data taken in the COSMOS field.
* `python/lsst/validate/drp/calcSrd.py` : calculate metrics defined by the LSST SRD.
* `python/lsst/validate/drp/plotAstrometryPhotometry.py` : plotting routines
* `python/lsst/validate/drp/checkAstrometryPhotometry.py` : coordination and calculation routines.
* `README.md` : THIS FILE. Guide and examples.
107 changes: 84 additions & 23 deletions python/lsst/validate/drp/calcSrd.py
@@ -22,46 +22,107 @@

from __future__ import print_function, division

import math

import numpy as np
import scipy.stats

# To validate these width estimates, we can plot a single random realization (re-evaluate the cell or call the function again to get a new one).
def calcPA1(gv, magKey):
    diffs = gv.aggregate(getRandomDiff, field=magKey)
    means = gv.aggregate(np.mean, field=magKey)
import lsst.pipe.base as pipeBase

def calcPA1(groupView, magKey):
    """Calculate the photometric repeatability of measurements across a set of observations.

    @param[in] groupView -- A lsst.afw.table.GroupView of matched observations.
    @param[in] magKey -- The lookup key of the field storing the magnitude of interest.
       E.g., `magKey = allMatches.schema.find("base_PsfFlux_mag").key`
       where `allMatches` is the result of lsst.afw.table.MultiMatch.finish()
    @param[out] -- pipeBase.Struct containing
       the RMS, the inter-quartile range, the differences between pairs of observations,
       and the mean magnitude of each object.

    The LSST Science Requirements Document (LPM-17), commonly referred to as the SRD,
    characterizes the photometric repeatability by placing a requirement
    on the median RMS of measurements of non-variable bright stars.
    This quantity is PA1, with design, minimum, and stretch goals of (5, 8, 3) millimag
    following LPM-17 as of 2011-07-06, available at http://ls.st/LPM-17.

    This routine calculates PA1 in two different ways, as an RMS and as a scaled inter-quartile range,
    and also returns additional quantities of interest:
    the pair differences of observations of each star and the mean magnitude of each star.
    """

    diffs = groupView.aggregate(getRandomDiffRmsInMas, field=magKey)
    means = groupView.aggregate(np.mean, field=magKey)
    rmsPA1, iqrPA1 = computeWidths(diffs)
    return rmsPA1, iqrPA1, diffs, means
    return pipeBase.Struct(rms = rmsPA1, iqr = iqrPA1, diffs = diffs, means = means)
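
As a stack-free illustration of the PA1 calculation above, the same idea can be sketched with plain numpy: simulate repeat observations of many stars, draw one random pair of visits per star, and measure the width of the pair-difference distribution. The star counts, magnitude range, and scatter below are illustrative assumptions, not values from validate_drp:

```python
import numpy as np

rng = np.random.RandomState(42)
nStars, nVisits = 500, 10
trueMags = rng.uniform(17.0, 19.0, size=nStars)
# Assume 5 mmag (0.005 mag) of per-visit scatter, the PA1 design goal.
obs = trueMags[:, None] + rng.normal(0.0, 0.005, size=(nStars, nVisits))

def randomPairDiffMmag(mags):
    """Difference of one random pair of visits, scaled to RMS about the mean, in mmag."""
    a, b = rng.choice(len(mags), size=2, replace=False)
    return 1000.0 * (mags[a] - mags[b]) / np.sqrt(2.0)

diffs = np.array([randomPairDiffMmag(m) for m in obs])
print("PA1(RMS) ~ %4.2f mmag" % np.sqrt(np.mean(diffs ** 2)))  # should land near 5 mmag
```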


def calcPA2(groupView, magKey):
    """Calculate the fraction of outliers from the photometric repeatability of measurements.
def calcPA2(gv, magKey):
    """Calculate PA2 values for min, design, stretch values of PF1.

    @param[in] groupView -- A lsst.afw.table.GroupView of matched observations.
    @param[in] magKey -- The lookup key of the field storing the magnitude of interest.
       E.g., `magKey = allMatches.schema.find("base_PsfFlux_mag").key`
       where `allMatches` is the result of lsst.afw.table.MultiMatch.finish()
    The SRD requirements put a limit on the fraction of outliers plot of PA1.
    This is PA2. Below, we compute the values of PA2 for the minimum, design, and stretch specification values of PF1 in the SRD.
    @param[out] -- pipeBase.Struct containing PA2, the millimags of variation at the
       `design`, `minimum`, and `stretch` fractions of outliers.
       The specified fractions are also available as `PF1`.

    The LSST Science Requirements Document (LPM-17) is commonly referred to as the SRD.
    The SRD requires that no more than PF1 % of differences vary by more than PA2 millimag.
    The design, minimum, and stretch goals are PF1 = (10, 20, 5) % at PA2 = (15, 15, 10) millimag,
    following LPM-17 as of 2011-07-06, available at http://ls.st/LPM-17.
    """

    diffs = gv.aggregate(getRandomDiff, field=magKey)
    minPA2, designPA2, stretchPA2 = np.percentile(np.abs(diffs), [80, 90, 95])
    return minPA2, designPA2, stretchPA2
    diffs = groupView.aggregate(getRandomDiffRmsInMas, field=magKey)
    PF1 = {'minimum' : 20, 'design' : 10, 'stretch' : 5}
    PF1_percentiles = 100 - np.asarray([PF1['minimum'], PF1['design'], PF1['stretch']])
    minPA2, designPA2, stretchPA2 = np.percentile(np.abs(diffs), PF1_percentiles)
    return pipeBase.Struct(design = designPA2, minimum = minPA2, stretch = stretchPA2, PF1 = PF1)
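
The percentile logic follows directly from the wording above: if at most PF1 % of the differences may exceed PA2, then PA2 is the (100 - PF1)th percentile of the absolute differences. A minimal, self-contained check with synthetic Gaussian pair differences (illustrative numbers, not survey data):

```python
import numpy as np

diffs = np.random.normal(0.0, 5.0, size=100000)  # synthetic pair differences in mmag
PF1 = {'minimum': 20, 'design': 10, 'stretch': 5}  # allowed outlier fractions in percent
for spec in ('minimum', 'design', 'stretch'):
    # PA2 is the magnitude difference below which (100 - PF1)% of |diffs| fall.
    pa2 = np.percentile(np.abs(diffs), 100 - PF1[spec])
    print("%-8s PF1=%2d%% -> PA2 = %5.2f mmag" % (spec, PF1[spec], pa2))
```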

def getRandomDiffRmsInMas(array):
    """Get the RMS difference between two randomly selected elements of an array of magnitudes.

    The LSST SRD recommends computing repeatability from a histogram of magnitude differences
    for the same star measured on two visits (using a median over the diffs to reject outliers).
    Because we have N>=2 measurements for each star, we select a random pair of visits for each star.
    We divide each difference by sqrt(2) to obtain the RMS about the (unknown) mean magnitude,
    instead of obtaining just the RMS difference, and convert the result from mag to mmag.
    """
    # For scalars, math.sqrt is several times faster than numpy.sqrt.
    return (1000/math.sqrt(2)) * getRandomDiff(array)
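
The 1000/sqrt(2) scaling can be verified numerically: the difference of two independent measurements that each carry scatter sigma has scatter sigma*sqrt(2), so dividing by sqrt(2) recovers the per-measurement RMS, and the factor of 1000 converts mag to mmag. A quick sanity check with synthetic values (the 5 mmag scatter is an assumption for illustration):

```python
import math

import numpy as np

sigma = 0.005  # assumed per-measurement scatter: 5 mmag, in mag
a = np.random.normal(0.0, sigma, size=100000)
b = np.random.normal(0.0, sigma, size=100000)
# Raw differences are wider than sigma by a factor of sqrt(2) ...
print("std(a - b) = %.4f mag" % np.std(a - b))  # ~0.0071
# ... so scaling by 1000/sqrt(2) recovers sigma, expressed in mmag.
print("recovered RMS = %.2f mmag" % (1000 / math.sqrt(2) * np.std(a - b)))  # ~5.0
```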


# The SRD recommends computing repeatability from a histogram of magnitude differences for the same star measured on two visits (using a median over the diffs to reject outliers). Since our dataset includes N>2 measurements for each star, we select a random pair of visits for each star.
# We also divide each difference by sqrt(2), because we want the RMS about the (unknown) mean magnitude, not the RMS difference, and convert from mags to mmag.
# Note that this randomization still works for cases where we have only 2 observations of the star.
def getRandomDiff(array):
    # not the most efficient way to extract a pair, but it's the easiest to write
    """Get the difference between two randomly selected elements of an array.

    Notes:
    * This is not the most efficient way to extract a pair, but it's the easiest to write.
    * Shuffling works correctly for low N (even N=2), where a naive random generation of entries
      would result in duplicates.
    * In principle it might be more efficient to shuffle the indices, then extract the difference.
      But this probably only would make a difference for arrays whose elements were
      substantially larger than just scalars and had the subtraction operation defined.
    """
    copy = array.copy()
    np.random.shuffle(copy)
    return 1000*(copy[0] - copy[1])/(2**0.5)
    return copy[0] - copy[1]
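
The low-N point in the notes is easy to demonstrate: drawing two indices independently picks the same element half the time when N=2, yielding spurious zero differences, whereas shuffling always produces a genuine pair. A small illustration with a hypothetical two-element array:

```python
import numpy as np

array = np.array([1.0, 2.0])

# Naive approach: two independent random indices can coincide.
naive = np.array([array[np.random.randint(2)] - array[np.random.randint(2)]
                  for _ in range(10000)])
print("naive: zero-diff fraction = %.2f" % np.mean(naive == 0.0))  # ~0.50

# Shuffle approach: the first two elements of a shuffled copy are always distinct entries.
shuffled = []
for _ in range(10000):
    copy = array.copy()
    np.random.shuffle(copy)
    shuffled.append(copy[0] - copy[1])
print("shuffle: zero-diff fraction = %.2f" % np.mean(np.asarray(shuffled) == 0.0))  # 0.00
```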

def computeWidths(diffs):

def computeWidths(array):
"""Compute the RMS and the scaled inter-quartile range of an array.
We estimate the width of the histogram in two ways: using a simple RMS, and using the interquartile range (scaled by the IQR/RMS ratio for a Gaussian). We do this for 50 different random realizations of the measurement pairs, to provide some estimate of the uncertainty on our RMS estimates due to the random shuffling (we could probably turn this into a more formal estimate somehow, but I'm not going to bother with that at the moment).
While the SRD specifies that we should just compute the RMS directly, we haven't limited our sample to nonvariable stars as carefully as the SRD specifies, so using a more robust estimator like IQR will allow us to reject some outliers. It is also less sensitive some realistic sources of scatter the metric should include, however, such as bad zero points.
We estimate the width of the histogram in two ways: using a simple RMS,
and using the interquartile range (scaled by the IQR/RMS ratio for a Gaussian).
While the SRD specifies that we should just compute the RMS directly,
we haven't limited our sample to nonvariable stars as carefully as the SRD specifies,
so using a more robust estimator like IQR allows us to reject some outliers.
It is also less sensitive some realistic sources of scatter the metric should include,
such as bad zero points.
"""

    rmsSigma = np.mean(diffs**2)**0.5
    iqrSigma = np.subtract.reduce(np.percentile(diffs, [75, 25])) / (scipy.stats.norm.ppf(0.75)*2)
    rmsSigma = math.sqrt(np.mean(array**2))
    iqrSigma = np.subtract.reduce(np.percentile(array, [75, 25])) / (scipy.stats.norm.ppf(0.75)*2)
    return rmsSigma, iqrSigma
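
The robustness argument in the docstring can be checked numerically: for purely Gaussian data the scaled IQR and the RMS agree, while a small contaminating population inflates the RMS but barely moves the IQR estimate. The sample sizes and widths below are illustrative assumptions:

```python
import numpy as np
import scipy.stats

clean = np.random.normal(0.0, 5.0, size=10000)  # well-behaved 5 mmag scatter
dirty = np.concatenate([clean, np.random.normal(0.0, 50.0, size=200)])  # add 2% outliers

def widths(x):
    """RMS and Gaussian-scaled IQR, mirroring computeWidths above."""
    rmsSigma = np.sqrt(np.mean(x ** 2))
    iqrSigma = np.subtract.reduce(np.percentile(x, [75, 25])) / (scipy.stats.norm.ppf(0.75) * 2)
    return rmsSigma, iqrSigma

print("clean: RMS = %5.2f, IQR = %5.2f" % widths(clean))  # both near 5
print("dirty: RMS = %5.2f, IQR = %5.2f" % widths(dirty))  # RMS inflated; IQR still near 5
```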


74 changes: 16 additions & 58 deletions python/lsst/validate/drp/checkAstrometryPhotometry.py
@@ -72,12 +72,6 @@ def averageRaDec(cat):
    dec = np.mean(cat.get('coord_dec'))
    return ra, dec

# Some thoughts from Paul Price on how to do the coordinate differences correctly:
# mean = sum([afwGeom.Extent3D(coord.toVector())
# for coord in coordList, afwGeom.Point3D(0, 0, 0)])
# mean /= len(coordList)
# mean = afwCoord.IcrsCoord(mean)

def magNormDiff(cat):
    """Calculate the normalized mag/mag_err difference from the mean for a
    set of observations of an object.
@@ -94,29 +88,6 @@ def magNormDiff(cat):
    normDiff = (mag - mag_avg) / magerr



def positionDiff(cat):
    """Calculate the diff RA, Dec from the mean for a set of observations of an object for each observation.

    @param[in] cat -- Collection with a .get method
       for 'coord_ra', 'coord_dec' that returns radians.
    @param[out] pos_median -- median diff of positions in milliarcsec. Float.

    This is WRONG!
    Doesn't do wrap-around
    """
    ra_avg, dec_avg = averageRaDec(cat)
    ra, dec = cat.get('coord_ra'), cat.get('coord_dec')
    # Approximating that the cos(dec) term is roughly the same
    # for all observations of this object.
    ra_diff = (ra - ra_avg) * np.cos(dec)**2
    dec_diff = dec - dec_avg
    pos_diff = np.sqrt(ra_diff**2 + dec_diff**2)  # radians
    pos_diff = [afwGeom.radToMas(p) for p in pos_diff]  # milliarcsec

    return pos_diff

def positionRms(cat):
    """Calculate the RMS for RA, Dec for a set of observations of an object.
@@ -162,13 +133,11 @@ def loadAndMatchData(repo, visitDataIds,

    dataRefs = [dafPersist.ButlerDataRef(butler, vId) for vId in visitDataIds]

    # retrieve the schema of the source catalog and extend it in order to add a field to record the ccd number
    ccdKeyName = getCcdKeyName(visitDataIds[0])

    schema = butler.get(dataset + "_schema", immediate=True).schema
    mapper = SchemaMapper(schema)
    mapper.addMinimalSchema(schema)
    # mapper.addOutputField(Field[int](ccdKeyName, "CCD number"))
    mapper.addOutputField(Field[float]('base_PsfFlux_mag', "PSF magnitude"))
    mapper.addOutputField(Field[float]('base_PsfFlux_magerr', "PSF magnitude uncertainty"))
    newSchema = mapper.getOutputSchema()
@@ -179,17 +148,6 @@
                        radius=matchRadius,
                        RecordClass=SimpleRecord)

    if False:
        # Create visit catalogs by merging those from constituent CCDs. We also convert from BaseCatalog to SimpleCatalog, but in the future we'll probably want to have the source transformation tasks generate SimpleCatalogs (or SourceCatalogs) directly.
        byVisit = {}
        for dataRef in dataRefs:
            catalog = byVisit.setdefault(dataRef.dataId["visit"], SimpleRecord.Catalog(schema))
            catalog.extend(dataRef.get(dataset, immediate=True), deep=True)

        # Loop over visits, adding them to the match.
        for visit, catalog in byVisit.iteritems():
            mmatch.add(catalog, dict(visit=visit))

    # create the new extended source catalog
    srcVis = SourceCatalog(newSchema)

@@ -234,7 +192,6 @@ def analyzeData(allMatches, good_mag_limit=19.5):

    psfMagKey = allMatches.schema.find("base_PsfFlux_mag").key
    psfMagErrKey = allMatches.schema.find("base_PsfFlux_magerr").key
    # apMagKey = allMatches.schema.find("base_CircularApertureFlux_12_0_mag").key
    extendedKey = allMatches.schema.find("base_ClassificationExtendedness_value").key

    def goodFilter(cat):
@@ -269,17 +226,15 @@ def safeFilter(cat):
    # by going with the default `field=None`.
    dist = goodMatches.aggregate(positionRms)

    rmsPA1 = []
    iqrPA1 = []
    for i in range(50):
        # diffs = safeMatches.aggregate(getRandomDiff, field=apMagKey)
        diffs = safeMatches.aggregate(getRandomDiff, field=psfMagKey)
        rmsSigma, iqrSigma = computeWidths(diffs)
        rmsPA1.append(rmsSigma)
        iqrPA1.append(iqrSigma)

    rmsPA1 = np.array(rmsPA1)
    iqrPA1 = np.array(iqrPA1)
    # We calculate differences for 50 different random realizations of the measurement pairs,
    # to provide some estimate of the uncertainty on our RMS estimates due to the random shuffling.
    # This estimate could be derived and stated more formally,
    # but in practice 50 realizations should be sufficient.
    numRandomShuffles = 50
    pa1_samples = [calcPA1(safeMatches, psfMagKey) for n in range(numRandomShuffles)]
    rmsPA1 = np.array([pa1.rms for pa1 in pa1_samples])
    iqrPA1 = np.array([pa1.iqr for pa1 in pa1_samples])

    print("PA1(RMS) = %4.2f+-%4.2f mmag" % (rmsPA1.mean(), rmsPA1.std()))
    print("PA1(IQR) = %4.2f+-%4.2f mmag" % (iqrPA1.mean(), iqrPA1.std()))

@@ -367,10 +322,13 @@ def checkPhotometry(mag, mmagrms, dist, match,


def printPA2(gv, magKey):
    minPA2, designPA2, stretchPA2 = calcPA2(gv, magKey)
    print("minimum: PF1=20%% of diffs more than PA2 = %4.2f mmag (target is PA2 < 15 mmag)" % minPA2)
    print("design: PF1=10%% of diffs more than PA2 = %4.2f mmag (target is PA2 < 15 mmag)" % designPA2)
    print("stretch: PF1= 5%% of diffs more than PA2 = %4.2f mmag (target is PA2 < 10 mmag)" % stretchPA2)
    pa2 = calcPA2(gv, magKey)
    print("minimum: PF1=%2d%% of diffs more than PA2 = %4.2f mmag (target is PA2 < 15 mmag)" %
          (pa2.PF1['minimum'], pa2.minimum))
    print("design: PF1=%2d%% of diffs more than PA2 = %4.2f mmag (target is PA2 < 15 mmag)" %
          (pa2.PF1['design'], pa2.design))
    print("stretch: PF1=%2d%% of diffs more than PA2 = %4.2f mmag (target is PA2 < 10 mmag)" %
          (pa2.PF1['stretch'], pa2.stretch))

def repoNameToPrefix(repo):
    """Generate a base prefix for plots based on the repo name.
22 changes: 11 additions & 11 deletions python/lsst/validate/drp/plotAstrometryPhotometry.py
@@ -213,31 +213,31 @@ def plotPhotometry(mag, mmagerr, mmagrms, dist, match, good_mag_limit=19.5,


def plotPA1(gv, magKey, plotbase=""):
    rmsPA1, iqrPA1, diffs, means = calcPA1(gv, magKey)
    pa1 = calcPA1(gv, magKey)

    diff_range = (-100, +100)

    fig = plt.figure(figsize=(18,12))
    ax1 = fig.add_subplot(1,2,1)
    ax1.scatter(means, diffs, s=10, color=color['bright'], linewidth=0)
    ax1.axhline(+rmsPA1, color=color['rms'], linewidth=3)
    ax1.axhline(-rmsPA1, color=color['rms'], linewidth=3)
    ax1.axhline(+iqrPA1, color=color['iqr'], linewidth=3)
    ax1.axhline(-iqrPA1, color=color['iqr'], linewidth=3)
    ax1.scatter(pa1.means, pa1.diffs, s=10, color=color['bright'], linewidth=0)
    ax1.axhline(+pa1.rms, color=color['rms'], linewidth=3)
    ax1.axhline(-pa1.rms, color=color['rms'], linewidth=3)
    ax1.axhline(+pa1.iqr, color=color['iqr'], linewidth=3)
    ax1.axhline(-pa1.iqr, color=color['iqr'], linewidth=3)

    ax2 = fig.add_subplot(1,2,2, sharey=ax1)
    ax2.hist(diffs, bins=25, range=diff_range,
    ax2.hist(pa1.diffs, bins=25, range=diff_range,
             orientation='horizontal', histtype='stepfilled',
             normed=True, color=color['bright'])
    ax2.set_xlabel("relative # / bin")

    yv = np.linspace(diff_range[0], diff_range[1], 100)
    ax2.plot(scipy.stats.norm.pdf(yv, scale=rmsPA1), yv,
    ax2.plot(scipy.stats.norm.pdf(yv, scale=pa1.rms), yv,
             marker='', linestyle='-', linewidth=3, color=color['rms'],
             label="PA1(RMS) = %4.2f mmag" % rmsPA1)
    ax2.plot(scipy.stats.norm.pdf(yv, scale=iqrPA1), yv,
             label="PA1(RMS) = %4.2f mmag" % pa1.rms)
    ax2.plot(scipy.stats.norm.pdf(yv, scale=pa1.iqr), yv,
             marker='', linestyle='-', linewidth=3, color=color['iqr'],
             label="PA1(IQR) = %4.2f mmag" % iqrPA1)
             label="PA1(IQR) = %4.2f mmag" % pa1.iqr)
    ax2.set_ylim(*diff_range)
    ax2.legend()
    # ax1.set_ylabel(u"12-pixel aperture magnitude diff (mmag)")
