Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-39167: Improve outlier rejection in PTC fitting and add additional tests. #193

Merged
merged 11 commits into from
May 18, 2023
Merged
46 changes: 36 additions & 10 deletions python/lsst/cp/pipe/ptc/cpExtractPtcTask.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import numpy as np
from lmfit.models import GaussianModel
import scipy.stats
import warnings

import lsst.afw.math as afwMath
import lsst.pex.config as pexConfig
Expand Down Expand Up @@ -450,6 +451,23 @@ def run(self, inputExp, inputDims, taskMetadata):
im1Area, im2Area, imStatsCtrl, mu1, mu2 = self.getImageAreasMasksStats(exp1, exp2,
region=region)

# We demand that both mu1 and mu2 be finite and greater than 0.
if not np.isfinite(mu1) or not np.isfinite(mu2) \
or ((np.nan_to_num(mu1) + np.nan_to_num(mu2)/2.) <= 0.0):
self.log.warning(
"Illegal mean value(s) detected for amp %s on exposure pair %d/%d",
ampName,
expId1,
expId2,
)
partialPtcDataset.setAmpValuesPartialDataset(
ampName,
inputExpIdPair=(expId1, expId2),
rawExpTime=expTime,
expIdMask=False,
)
continue

# `measureMeanVarCov` is the function that measures
# the variance and covariances from a region of
# the difference image of two flats at the same
Expand Down Expand Up @@ -1000,7 +1018,19 @@ def computeGaussianHistogramParameters(self, im1Area, im2Area, imStatsCtrl, mu1,
errVals = np.sqrt(yVals)
errVals[(errVals == 0.0)] = 1.0
pars = model.guess(yVals, x=xVals)
out = model.fit(yVals, pars, x=xVals, weights=1./errVals, calc_covar=True, method="least_squares")
with warnings.catch_warnings():
warnings.simplefilter("ignore")
# The least-squares fitter sometimes spouts (spurious) warnings
# when the model is very bad. Swallow these warnings now and
# let the KS test check the model below.
out = model.fit(
yVals,
pars,
x=xVals,
weights=1./errVals,
calc_covar=True,
method="least_squares",
)

# Calculate chi2.
chiArr = out.residual
Expand All @@ -1009,16 +1039,12 @@ def computeGaussianHistogramParameters(self, im1Area, im2Area, imStatsCtrl, mu1,
sigmaFit = out.params["sigma"].value

# Calculate KS test p-value for the fit.
# Seed this with the mean value, so that the same data will get the
# same result.
randomSeed = int((mu1 + mu2)/2.)
gSample = scipy.stats.norm.rvs(
size=numOk,
scale=sigmaFit,
loc=out.params["center"].value,
random_state=randomSeed,
ksResult = scipy.stats.ks_1samp(
diffArr,
scipy.stats.norm.cdf,
(out.params["center"].value, sigmaFit),
)
ksResult = scipy.stats.ks_2samp(diffArr, gSample)

kspValue = ksResult.pvalue
if kspValue < 1e-15:
kspValue = 0.0
Expand Down
161 changes: 90 additions & 71 deletions python/lsst/cp/pipe/ptc/cpSolvePtcTask.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,45 +122,49 @@ class PhotonTransferCurveSolveConfig(pipeBase.PipelineTaskConfig,
doc="Degree of polynomial to fit the PTC, when 'ptcFitType'=POLYNOMIAL.",
default=3,
)
doLegacyTurnoffAndOutlierSelection = pexConfig.Field(
doLegacyTurnoffSelection = pexConfig.Field(
dtype=bool,
doc="Use 'legacy' computation for PTC turnoff and outlier selection. If set "
doc="Use 'legacy' computation for PTC turnoff selection. If set "
"to False, then the KS test p-value selection will be used instead.",
default=False,
)
sigmaCutPtcOutliers = pexConfig.Field(
dtype=float,
doc="Sigma cut for outlier rejection in PTC. Only used if "
"doLegacyTurnoffAndOutlierSelection is True.",
doc="Sigma cut for outlier rejection in PTC.",
default=5.0,
)
maxIterationsPtcOutliers = pexConfig.RangeField(
dtype=int,
doc="Maximum number of iterations for outlier rejection in PTC. Only used if "
"doLegacyTurnoffAndOutlierSelection is True.",
doc="Maximum number of iterations for outlier rejection in PTC.",
default=2,
min=0
)
maxSignalInitialPtcOutlierFit = pexConfig.Field(
dtype=float,
doc="Maximum signal considered for intial outlier fit. This should be below "
"the PTC turnoff to ensure accurate outlier rejection.",
default=30_000.,
)
minVarPivotSearch = pexConfig.Field(
dtype=float,
doc="The code looks for a pivot signal point after which the variance starts decreasing at high-flux"
" to exclude then from the PTC model fit. However, sometimes at low fluxes, the variance"
" decreases slightly. Set this variable for the variance value, in ADU^2, after which the pivot "
" should be sought. Only used if doLegacyTurnoffAndOutlierSelection is True.",
" should be sought. Only used if doLegacyTurnoffSelection is True.",
default=10000,
)
consecutivePointsVarDecreases = pexConfig.RangeField(
dtype=int,
doc="Required number of consecutive points/fluxes in the PTC where the variance "
"decreases in order to find a first estimate of the PTC turn-off. "
"Only used if doLegacyTurnoffAndOutlierSelection is True.",
"Only used if doLegacyTurnoffSelection is True.",
default=2,
min=2
)
ksTestMinPvalue = pexConfig.Field(
dtype=float,
doc="Minimum value of the Gaussian histogram KS test p-value to be used in PTC fit. "
"Only used if doLegacyTurnoffAndOutlierSelection is False.",
"Only used if doLegacyTurnoffSelection is False.",
default=0.01,
)
doFitBootstrap = pexConfig.Field(
Expand Down Expand Up @@ -328,7 +332,7 @@ def run(self, inputCovariances, camera=None, detId=0):
expIdMask = False

kspValue = partialPtcDataset.kspValues[ampName][0]
if not self.config.doLegacyTurnoffAndOutlierSelection and \
if not self.config.doLegacyTurnoffSelection and \
kspValue < self.config.ksTestMinPvalue:
expIdMask = False

Expand Down Expand Up @@ -357,6 +361,9 @@ def run(self, inputCovariances, camera=None, detId=0):
datasetPtc.rawExpTimes[ampName] = datasetPtc.rawExpTimes[ampName][index]
datasetPtc.rawMeans[ampName] = datasetPtc.rawMeans[ampName][index]
datasetPtc.rawVars[ampName] = datasetPtc.rawVars[ampName][index]
datasetPtc.histVars[ampName] = datasetPtc.histVars[ampName][index]
datasetPtc.histChi2Dofs[ampName] = datasetPtc.histChi2Dofs[ampName][index]
datasetPtc.kspValues[ampName] = datasetPtc.kspValues[ampName][index]
datasetPtc.expIdMask[ampName] = datasetPtc.expIdMask[ampName][index]
datasetPtc.covariances[ampName] = datasetPtc.covariances[ampName][index]
datasetPtc.covariancesSqrtWeights[ampName] = datasetPtc.covariancesSqrtWeights[ampName][index]
Expand Down Expand Up @@ -948,12 +955,11 @@ def errFunc(p, x, y):
maxIterationsPtcOutliers = self.config.maxIterationsPtcOutliers

for i, ampName in enumerate(dataset.ampNames):
timeVecOriginal = dataset.rawExpTimes[ampName]
meanVecOriginal = dataset.rawMeans[ampName]
varVecOriginal = dataset.rawVars[ampName]
meanVecOriginal = dataset.rawMeans[ampName].copy()
varVecOriginal = dataset.rawVars[ampName].copy()
varVecOriginal = self._makeZeroSafe(varVecOriginal)

if self.config.doLegacyTurnoffAndOutlierSelection:
if self.config.doLegacyTurnoffSelection:
# Discard points when the variance starts to decrease after two
# consecutive signal levels
goodPoints = self._getInitialGoodPoints(meanVecOriginal, varVecOriginal,
Expand All @@ -974,11 +980,6 @@ def errFunc(p, x, y):
self.fillBadAmp(dataset, ptcFitType, ampName)
continue

# Save the point where the variance starts decreasing as the
# PTC turnoff point
ptcTurnoff = meanVecOriginal[goodPoints][-1]
dataset.ptcTurnoff[ampName] = ptcTurnoff

mask = goodPoints

if ptcFitType == 'EXPAPPROXIMATION':
Expand All @@ -996,73 +997,91 @@ def errFunc(p, x, y):
parsIniPtc = self._initialParsForPolynomial(self.config.polynomialFitDegree + 1)
bounds = self._boundsForPolynomial(parsIniPtc)

if self.config.doLegacyTurnoffAndOutlierSelection:
# Before bootstrap fit, do an iterative fit to get rid of
# outliers. This further process of outlier rejection be
# skipped if self.config.maxIterationsPtcOutliers = 0.
# We already did some initial outlier rejection above in
# self._getInitialGoodPoints.
count = 1
newMask = np.ones_like(meanVecOriginal, dtype=bool)
pars = parsIniPtc
while count <= maxIterationsPtcOutliers:
# Note that application of the mask actually shrinks the
# array to size rather than setting elements to zero (as we
# want) so always update mask itself and re-apply to the
# original data.
meanTempVec = meanVecOriginal[mask]
varTempVec = varVecOriginal[mask]
res = least_squares(errFunc, parsIniPtc, bounds=bounds, args=(meanTempVec, varTempVec))
# We perform an initial (unweighted) fit of variance vs signal
# (after initial KS test or post-drop selection) to look for
# outliers, particularly at the high-flux end. The initial fit
# is performed only for points that are guaranteed to be below
# the PTC turnoff and then extrapolated to ensure that high
# flux points that have abnormal variance values can be properly
# rejected in this phase without biasing the initial fit.
# This algorithm was initially developed by Seth Digel for
# the EO Testing pipeline.

if maxIterationsPtcOutliers == 0:
# We are not doing any outlier rejection here, but we do want
# an initial fit.
res = least_squares(
errFunc,
parsIniPtc,
bounds=bounds,
args=(meanVecOriginal[mask], varVecOriginal[mask]),
)
pars = res.x
newMask = mask.copy()
else:
newMask = (mask & (meanVecOriginal <= self.config.maxSignalInitialPtcOutlierFit))

count = 0
lastMask = mask.copy()
while count < maxIterationsPtcOutliers:
res = least_squares(
errFunc,
parsIniPtc,
bounds=bounds,
args=(meanVecOriginal[newMask], varVecOriginal[newMask]),
)
pars = res.x

# change this to the original from the temp because
# the masks are ANDed meaning once a point is masked
# it's always masked, and the masks must always be the
# same length for broadcasting
sigResids = (varVecOriginal - ptcFunc(pars, meanVecOriginal))/np.sqrt(varVecOriginal)
newMask = np.array([True if np.abs(r) < sigmaCutPtcOutliers else False
for r in sigResids])
mask = mask & newMask
if not (mask.any() and newMask.any()):
msg = (f"SERIOUS: All points in either mask: {mask} or newMask: {newMask} are bad. "
# The new mask includes points where the residuals are
# finite, are less than the cut, and include the original
# mask of known points that should not be used.
newMask = (
np.isfinite(sigResids)
& (np.abs(np.nan_to_num(sigResids)) < sigmaCutPtcOutliers)
& mask
)
if np.count_nonzero(newMask) == 0:
msg = (f"SERIOUS: All points after outlier rejection are bad. "
f"Setting {ampName} to BAD.")
self.log.warning(msg)
# Fill entries with NaNs
self.fillBadAmp(dataset, ptcFitType, ampName)
break
nDroppedTotal = Counter(mask)[False]
self.log.debug("Iteration %d: discarded %d points in total for %s",
count, nDroppedTotal, ampName)

erykoff marked this conversation as resolved.
Show resolved Hide resolved
self.log.debug(
"Iteration %d: Removed %d points in total for %s.",
count,
np.count_nonzero(mask) - np.count_nonzero(newMask),
ampName,
)

# If the mask hasn't changed then break out.
if np.all(newMask == lastMask):
self.log.debug("Convergence at iteration %d; breaking loop for %s.", count, ampName)
break

lastMask = newMask.copy()

count += 1
# objects should never shrink
assert (len(mask) == len(timeVecOriginal) == len(meanVecOriginal) == len(varVecOriginal))
else:
pars = parsIniPtc
meanTempVec = meanVecOriginal[mask]
varTempVec = varVecOriginal[mask]
res = least_squares(
errFunc,
parsIniPtc,
bounds=bounds,
args=(meanVecOriginal[mask], varVecOriginal[mask]),
)
pars = res.x
newMask = mask

if not (mask.any() and newMask.any()):
# Set the mask to the new mask
mask = newMask.copy()

if not mask.any():
# We hae already filled the bad amp above, so continue.
continue
dataset.expIdMask[ampName] = np.array(dataset.expIdMask[ampName])
# store the final mask
if len(dataset.expIdMask[ampName]):
dataset.expIdMask[ampName] &= mask # bitwise_and if there is already a mask
else:
dataset.expIdMask[ampName] = mask
# In case there was a previous mask stored
mask = dataset.expIdMask[ampName]

dataset.expIdMask[ampName] = mask

parsIniPtc = pars
meanVecFinal = meanVecOriginal[mask]
varVecFinal = varVecOriginal[mask]

# Save the maximum point after outlier detection as the
# PTC turnoff point.
dataset.ptcTurnoff[ampName] = meanVecFinal[-1]

if Counter(mask)[False] > 0:
self.log.info("Number of points discarded in PTC of amplifier %s:"
" %d out of %d", ampName, Counter(mask)[False], len(meanVecOriginal))
Expand Down