Merge pull request #29 from jjbuchanan/tickets/DM-42458

DM-42458: Modify variance plane when injecting sources and optionally vary injected light
lsst · Mar 5, 2024 · f4d6588 · f4d6588
2 parents b5039a9 + 4d9478c
commit f4d6588
Show file tree

Hide file tree

Showing 4 changed files with 374 additions and 4 deletions.
diff --git a/python/lsst/source/injection/inject_base.py b/python/lsst/source/injection/inject_base.py
@@ -117,6 +117,16 @@ class BaseInjectConfig(PipelineTaskConfig, pipelineConnections=BaseInjectConnect
         doc="String to prefix to the entries in the *col_stamp* column, for example, a directory path.",
         default="",
     )
+    add_noise = Field[bool](
+        doc="Whether to randomly vary the injected flux in each pixel by an amount consistent with "
+        "the injected variance.",
+        default=True,
+    )
+    noise_seed = Field[int](
+        doc="Initial seed for random noise generation. This value increments by 1 for each injected "
+        "object, so each object has an independent noise realization.",
+        default=0,
+    )
 
     # Custom column names.
     col_ra = Field[str](
@@ -163,7 +173,7 @@ class BaseInjectTask(PipelineTask):
     _DefaultName = "baseInjectTask"
     ConfigClass = BaseInjectConfig
 
-    def run(self, injection_catalogs, input_exposure, psf, photo_calib, wcs):
+    def run(self, injection_catalogs, input_exposure, psf, photo_calib, wcs, variance_scale=0.0):
         """Inject sources into an image.
 
         Parameters
@@ -179,6 +189,9 @@ def run(self, injection_catalogs, input_exposure, psf, photo_calib, wcs):
             Photometric calibration used to calibrate injected sources.
         wcs : `lsst.afw.geom.SkyWcs`
             WCS used to calibrate injected sources.
+        variance_scale : `float`
+            Scale by which to multiply injected image flux to determine the
+            amount of variance to add.
 
         Returns
         -------
@@ -267,6 +280,9 @@ def run(self, injection_catalogs, input_exposure, psf, photo_calib, wcs):
                 mask_plane_name=self.config.mask_plane_name,
                 calib_flux_radius=self.config.calib_flux_radius,
                 draw_size_max=10000,  # TODO: replace draw_size logic with GS logic.
+                variance_scale=self.config.variance_scale,
+                add_noise=self.config.add_noise,
+                noise_seed=self.config.noise_seed,
                 logger=self.log,
             )
             # Add inject_galsim_objects_into_exposure outputs into output cat.

diff --git a/python/lsst/source/injection/inject_coadd.py b/python/lsst/source/injection/inject_coadd.py
@@ -23,7 +23,12 @@
 
 __all__ = ["CoaddInjectConnections", "CoaddInjectConfig", "CoaddInjectTask"]
 
+import numpy as np
+from lsst.pex.config import Field
 from lsst.pipe.base.connectionTypes import Input, Output
+from sklearn.cluster import KMeans
+from sklearn.linear_model import LinearRegression, RANSACRegressor
+from sklearn.metrics import mean_squared_error
 
 from .inject_base import BaseInjectConfig, BaseInjectConnections, BaseInjectTask
 
@@ -63,7 +68,58 @@ class CoaddInjectConfig(  # type: ignore [call-arg]
 ):
     """Coadd-level configuration for source injection tasks."""
 
-    pass
+    n_fits_1 = Field[int](
+        doc="Perform this many RANSAC fits in the first round, to get a sample "
+        "of different slopes based on the different random samples of points used "
+        "in the fit.",
+        default=20,
+    )
+    n_fits_2 = Field[int](
+        doc="Perform this many RANSAC fits in the second round, to get a sample "
+        "of different slopes based on the different random samples of points used "
+        "in the fit.",
+        default=20,
+    )
+    threshold_scale_1 = Field[float](
+        doc="An outlier in the first RANSAC fit is farther from the fit line, "
+        "in terms of squared error, than this multiple of the initial linear MSE.",
+        default=0.1,
+    )
+    threshold_scale_2 = Field[float](
+        doc="An outlier in the second RANSAC fit is farther from the fit line, "
+        "in terms of squared error, than this multiple of the initial linear MSE.",
+        default=0.1,
+    )
+    max_trials_1 = Field[int](
+        doc="Maximum number of trials the first RANSAC fit is allowed to run.",
+        default=1000,
+    )
+    max_trials_2 = Field[int](
+        doc="Maximum number of trials the second RANSAC fit is allowed to run.",
+        default=1000,
+    )
+    variance_fit_seed_1 = Field[int](doc="Seed for first RANSAC fit of flux vs. variance.", default=0)
+    variance_fit_seed_2 = Field[int](doc="Seed for second RANSAC fit of flux vs. variance.", default=0)
+    n_clusters_1 = Field[int](
+        doc="K-means cluster the first set of RANSAC fits using this many clusters, "
+        "in order to find the most stable slope (biggest cluster).",
+        default=4,
+    )
+    n_clusters_2 = Field[int](
+        doc="K-means cluster the second set of RANSAC fits using this many clusters, "
+        "in order to find the most stable slope (biggest cluster).",
+        default=3,
+    )
+    kmeans_seed_1 = Field[int](doc="Seed for first round of k-means clustering.", default=0)
+    kmeans_seed_2 = Field[int](doc="Seed for second round of k-means clustering.", default=0)
+    kmeans_n_init_1 = Field[int](
+        doc="Number of times the first k-means clustering is run with different initial centroids.",
+        default=10,
+    )
+    kmeans_n_init_2 = Field[int](
+        doc="Number of times the second k-means clustering is run with different initial centroids.",
+        default=10,
+    )
 
 
 class CoaddInjectTask(BaseInjectTask):
@@ -72,6 +128,13 @@ class CoaddInjectTask(BaseInjectTask):
     _DefaultName = "coaddInjectTask"
     ConfigClass = CoaddInjectConfig
 
+    def run(self, injection_catalogs, input_exposure, psf, photo_calib, wcs):
+        self.log.info("Fitting flux vs. variance in each pixel.")
+        variance_scale = self.get_variance_scale(input_exposure)
+        self.log.info("Variance scale factor: %.6f", variance_scale)
+
+        return super().run(injection_catalogs, input_exposure, psf, photo_calib, wcs, variance_scale)
+
     def runQuantum(self, butler_quantum_context, input_refs, output_refs):
         inputs = butler_quantum_context.get(input_refs)
 
@@ -82,3 +145,175 @@ def runQuantum(self, butler_quantum_context, input_refs, output_refs):
         input_keys = ["injection_catalogs", "input_exposure", "sky_map", "psf", "photo_calib", "wcs"]
         outputs = self.run(**{key: value for (key, value) in inputs.items() if key in input_keys})
         butler_quantum_context.put(outputs, output_refs)
+
+    """
+    Establish the variance scale by a linear fit of variance vs. flux.
+    In practice we see that most pixels in a coadd obey a consistent, simple
+    linear relationship between variance and flux, but a small sample do not.
+
+    To identify and hence ignore these odd pixels, we perform two rounds of
+    RANSAC fits. RANSAC iteratively finds a least-squares straight line fit on
+    random subsamples of points, while identifying outliers. The inlier pixels
+    from a first RANSAC fit tend to be regions of empty space and the extreme
+    outer edges of galaxies, while the outliers tend to be the inner regions of
+    galaxies.
+
+    We can pick up some more pixels in the galaxies by running a second RANSAC
+    fit on just the outliers of the first fit. In the second round, the inliers
+    tend to be the bulk of the galaxy, while the outliers are the innermost
+    cores of galaxies. In practice, the inliers from this second round tend to
+    have a qualitatively similar variance-vs-flux relationship to the outliers
+    from the first fit and do not strongly alter the final variance scale we
+    get; while the outliers from the second round are truly wild and should
+    clearly be omitted from a simple linear fit.
+
+    In both rounds, random variation of the points sampled by the RANSAC fit
+    causes the fitted slope and intercept to vary, and sometimes the fit can
+    settle on a pathological sample of points as its inliers. We try to avoid
+    such pathologies by running each fit multiple times with different seeds,
+    and using K-Means clustering on the resulting slopes to identify the most
+    stable value.
+    """
+
+    def get_variance_scale(self, exposure):
+        flux = exposure.image.array.ravel()
+        var = exposure.variance.array.ravel()
+
+        # Ignore pixels with nan or infinite values
+        good_pixels = np.isfinite(flux) & np.isfinite(var)
+        flux = flux[good_pixels]
+        var = var[good_pixels]
+
+        # Simple linear regression to establish MSE.
+        linear = LinearRegression()
+        linear.fit(flux.reshape(-1, 1), var)
+        linear_mse = mean_squared_error(var, linear.predict(flux.reshape(-1, 1)))
+
+        # First RANSAC fit
+        fit_results = []
+        for seed in range(
+            self.config.variance_fit_seed_1, self.config.variance_fit_seed_1 + self.config.n_fits_1
+        ):
+            ransac = RANSACRegressor(
+                loss="squared_error",
+                residual_threshold=self.config.threshold_scale_1 * linear_mse,
+                max_trials=self.config.max_trials_1,
+                random_state=seed,
+            )
+            ransac.fit(flux.reshape(-1, 1), var)
+            # Remember fit results
+            slope = ransac.estimator_.coef_[0]
+            fit_results.append((slope, seed))
+
+        # K-means cluster the first round of fits,
+        # to find the most stable results.
+        kmeans = KMeans(
+            n_clusters=self.config.n_clusters_1,
+            random_state=self.config.kmeans_seed_1,
+            n_init=self.config.kmeans_n_init_1,
+        )
+        kmeans.fit(np.log(np.array([f[0] for f in fit_results if f[0] > 0])).reshape(-1, 1))
+        label_counts = [np.sum(kmeans.labels_ == idx) for idx in range(self.config.n_clusters_1)]
+
+        # Recall one of the fits, chosen arbitrarily from those which are both
+        # stable, according to the first k-means fit, and positive-slope.
+        stable_fit_seeds = np.array([f[1] for f in fit_results if f[0] > 0])[
+            kmeans.labels_ == np.argmax(label_counts)
+        ]
+        if len(stable_fit_seeds == 0):
+            # No positive-slope fit found.
+            # Allow the fitted slope to be negative but throw a warning.
+            self.log.warning(
+                "No positive-slope result in the first round of "
+                "RANSAC fits. Proceeding with a negative-slope fit."
+            )
+            stable_fit_seeds = np.array([f[1] for f in fit_results])[
+                kmeans.labels_ == np.argmax(label_counts)
+            ]
+        seed = stable_fit_seeds[0]
+        ransac = RANSACRegressor(
+            loss="squared_error",
+            residual_threshold=self.config.threshold_scale_1 * linear_mse,
+            max_trials=self.config.max_trials_1,
+            random_state=seed,
+        )
+        ransac.fit(flux.reshape(-1, 1), var)
+
+        # Label the pixels with a "good" variance vs. flux relationship
+        # (the inliers), together with the ones that are further from a simple
+        # straight line (the outliers).
+        inlier_mask_1 = ransac.inlier_mask_
+        outlier_mask_1 = ~inlier_mask_1
+
+        # Second RANSAC fit,
+        # on just the outliers of the 1st fit.
+        fit_results = []
+        for seed in range(
+            self.config.variance_fit_seed_2, self.config.variance_fit_seed_2 + self.config.n_fits_2
+        ):
+            ransac = RANSACRegressor(
+                loss="squared_error",
+                residual_threshold=self.config.threshold_scale_2 * linear_mse,
+                max_trials=self.config.max_trials_2,
+                random_state=seed,
+            )
+            ransac.fit(flux[outlier_mask_1].reshape(-1, 1), var[outlier_mask_1])
+            # Remember fit results
+            slope = ransac.estimator_.coef_[0]
+            fit_results.append((slope, seed))
+
+        # K-Means cluster the second round of fits,
+        # to find the most stable result
+        kmeans = KMeans(
+            n_clusters=self.config.n_clusters_2,
+            random_state=self.config.kmeans_seed_2,
+            n_init=self.config.kmeans_n_init_2,
+        )
+        kmeans.fit(np.log(np.array([f[0] for f in fit_results if f[0] > 0])).reshape(-1, 1))
+        label_counts = [np.sum(kmeans.labels_ == idx) for idx in range(self.config.n_clusters_2)]
+
+        # Recall one of the stable fits
+        stable_fit_seeds = np.array([f[1] for f in fit_results if f[0] > 0])[
+            kmeans.labels_ == np.argmax(label_counts)
+        ]
+        if len(stable_fit_seeds == 0):
+            # No positive-slope fit found.
+            # Allow the fitted slope to be negative but throw a warning.
+            self.log.warning(
+                "No positive-slope result in the second round of "
+                "RANSAC fits. Proceeding with a negative-slope fit."
+            )
+            stable_fit_seeds = np.array([f[1] for f in fit_results])[
+                kmeans.labels_ == np.argmax(label_counts)
+            ]
+        seed = stable_fit_seeds[0]
+        ransac = RANSACRegressor(
+            loss="squared_error",
+            residual_threshold=self.config.threshold_scale_2 * linear_mse,
+            max_trials=self.config.max_trials_2,
+            random_state=seed,
+        )
+        ransac.fit(flux[outlier_mask_1].reshape(-1, 1), var[outlier_mask_1])
+
+        # Pixels with a "good" variance vs. flux relationship:
+        # Union of the inliers from the first fit
+        # together with the inliers from the second fit.
+        flux_good = np.concatenate(
+            (flux[inlier_mask_1], flux[outlier_mask_1][ransac.inlier_mask_]),
+            axis=None,
+        )
+        var_good = np.concatenate(
+            (var[inlier_mask_1], var[outlier_mask_1][ransac.inlier_mask_]),
+            axis=None,
+        )
+
+        # Fit all the good pixels with a simple least squares regression.
+        linear = LinearRegression()
+        linear.fit(flux_good.reshape(-1, 1), var_good)
+        variance_scale = float(linear.coef_[0])
+
+        if variance_scale < 0:
+            self.log.warning("Slope of final variance vs. flux fit is negative.")
+
+        # Return the slope of the final fit.
+        return variance_scale