In [None]:
# %pip install --quiet git+https://github.com/astronomy-commons/hats.git@main

In [None]:
# %pip install --quiet git+https://github.com/astronomy-commons/lsdb.git@main

In [None]:
import lsdb

# Display the installed lsdb version so the notebook output records which
# build of the library this run used.
lsdb.__version__

In [None]:
import astropy.units as u
import lsdb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import dask

from dask.distributed import Client
from nested_pandas.utils import count_nested
from dask.distributed import print as dask_print
from lsdb.core.search.pixel_search import PixelSearch
from lsdb.core.search.order_search import OrderSearch
from io import StringIO
from nested_pandas import NestedDtype
from pathlib import Path
from hats.pixel_math import HealpixPixel

This is pretty brute-force. I gave up on figuring out how to make the per-band results co-exist in the same table, so I'm just running the stages separately, once for each band (nominally 6 times; the loop at the end of this notebook currently covers only g, r, i and z).

I might need to do it differently for diaObject, but we'll see what happens tomorrow!!

In [None]:
def stable_decomposition(a):
    """Build a square factor ``B`` for the matrix ``M = diag(a) - c * J``.

    ``J`` is the all-ones matrix and ``c = 1 / sum(1 / a)``.  The factor is
    assembled from two pieces: the Rayleigh quotient of ``M`` along the
    normalised ones vector, and an eigen-decomposition of ``M`` restricted to
    the subspace orthogonal to that vector.

    Note that ``B @ B.T`` equals the part of ``M`` that is block-diagonal with
    respect to {ones direction, its orthogonal complement}; the coupling
    between the two subspaces is not represented.  It reproduces ``M`` itself
    only when the ones vector is an eigenvector of ``M``.

    Parameters
    ----------
    a : numpy.ndarray
        1-D array of length ``n`` (``whiten_data`` passes ``sigma**2``).  It
        must support ``len(a)`` and ``1 / a``.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` matrix whose first column is the scaled ones direction and
        whose remaining columns are the scaled eigenvectors of the restricted
        matrix.

    Raises
    ------
    AssertionError
        If the value along the ones direction, or any eigenvalue of the
        restricted matrix, is not strictly positive.
    """
    size = len(a)
    unit_ones = np.ones((size, 1)) / np.sqrt(size)

    # Rank-one term: scaled_ones @ scaled_ones.T == J / sum(1 / a).
    scaled_ones = np.ones((size, 1)) / np.sqrt(np.sum(1 / a))
    matrix = np.diag(a) - scaled_ones @ scaled_ones.T

    # Rayleigh quotient of the matrix along the normalised ones vector.
    ones_value = float((unit_ones.T @ matrix @ unit_ones).item())
    assert ones_value > 0

    # QR of the projector that removes the ones direction; its first n - 1
    # columns are an orthonormal basis of the complementary subspace.
    projector = np.eye(size) - unit_ones @ unit_ones.T
    complement_basis = np.linalg.qr(projector)[0][:, : size - 1]

    # Eigen-decomposition of the matrix restricted to that subspace.
    restricted = complement_basis.T @ matrix @ complement_basis
    complement_values, complement_vectors = np.linalg.eigh(restricted)
    assert np.all(complement_values > 0)

    # Stack the ones direction with the lifted eigenvectors, then scale every
    # column by the square root of its associated value.
    all_values = np.concatenate(([ones_value], complement_values))
    all_vectors = np.hstack((unit_ones, complement_basis @ complement_vectors))
    return all_vectors @ np.diag(np.sqrt(all_values))


def whiten_data(x, sigma):
    """Transform measurements into residuals scaled by their decomposition factor.

    The inverse-variance weighted mean of ``x`` is subtracted, and the
    residual vector is then multiplied by the inverse of the factor returned
    by ``stable_decomposition(sigma**2)``.

    Parameters
    ----------
    x : numpy.ndarray
        1-D array of measured values.
    sigma : numpy.ndarray
        1-D array of per-measurement uncertainties, same length as ``x``.

    Returns
    -------
    numpy.ndarray
        1-D array ``z`` with the same length as ``x`` satisfying
        ``decomposed @ z == x - mu``.

    Raises
    ------
    AssertionError
        Propagated from ``stable_decomposition`` when its positivity checks
        fail.
    """
    # Inverse-variance weighted mean of the measurements.
    mu = np.average(x, weights=1 / sigma**2)

    decomposed = stable_decomposition(sigma**2)
    residual = x - mu

    # Solve ``decomposed @ z = residual`` directly rather than forming the
    # explicit inverse and multiplying: same result up to rounding, but one
    # factorisation instead of an inversion plus a product, with smaller
    # numerical error.
    z = np.linalg.solve(decomposed, residual)

    return z

In [None]:
def filter_by_nested(nested_df, pixel, source_column, band):
    """Compute per-object variability statistics in one band for one partition.

    Written to be applied partition-by-partition (the notebook passes it to
    ``map_partitions(..., include_pixel=True)``).  The nested light curves in
    ``source_column`` are restricted to ``band`` and to observations whose
    ``psfFlux_flag`` is not set; objects with too few remaining observations
    are dropped; statistics are computed per object; and only objects whose
    largest absolute error-normalised deviation is below 10 are returned.

    Parameters
    ----------
    nested_df : nested frame
        Partition data containing the nested column ``source_column`` with
        ``band``, ``psfFlux``, ``psfFluxErr`` and ``psfFlux_flag`` sub-columns.
    pixel : object
        Identifier of the partition being processed; only used in log output.
    source_column : str
        Name of the nested column holding the light curves.
    band : str
        Band to select, interpolated into the query string.

    Returns
    -------
    pandas.DataFrame
        One row per retained object with the columns ``max_variability``,
        ``band``, ``wmean_flux``, ``rel_dev`` and ``whitened_data``.  When no
        object survives a filtering stage, a zero-row frame with those columns
        is returned.
    """
    dask_print("starting on", pixel)

    # Zero-row frame returned whenever nothing survives a filtering stage.
    # Every exit path has to hand back a frame: a plain dict of dtypes can act
    # as a ``meta`` specification, but it is not a valid partition and cannot
    # be written out.  Column names and dtypes are exactly those of the former
    # dict spec, and the (empty) index is taken from the input partition.
    # NOTE(review): rel_dev / whitened_data hold one array per object in the
    # populated result, so float64 may not be the best dtype here — confirm.
    empty_index = nested_df.index[:0]
    empty_result = pd.DataFrame(
        {
            "max_variability": pd.Series([], dtype=np.float64, index=empty_index),
            "band": pd.Series([], dtype="str", index=empty_index),
            "wmean_flux": pd.Series([], dtype=np.float64, index=empty_index),
            "rel_dev": pd.Series([], dtype=np.float64, index=empty_index),
            "whitened_data": pd.Series([], dtype=np.float64, index=empty_index),
        }
    )

    # Re-cast the source column to a NestedDtype derived from its current
    # dtype so the nested query/reduce operations below are available.
    nested_df = nested_df.assign(
        **{
            source_column: nested_df[source_column].astype(
                NestedDtype.from_pandas_arrow_dtype(nested_df.dtypes[source_column])
            )
        }
    )
    ## Consider only lightcurves in the target band (and unflagged fluxes),
    ## then drop objects whose nested column is left null.
    nested_df = nested_df.query(
        f"{source_column}.band == '{band}' and not {source_column}.psfFlux_flag"
    ).dropna(subset=source_column)
    if len(nested_df) == 0:
        dask_print("found nothing in band")
        return empty_result

    ## Needs to have more than 10 observations in that band
    counts = count_nested(nested_df, source_column)
    nested_df = nested_df.loc[counts[f"n_{source_column}"] > 10]
    if len(nested_df) == 0:
        dask_print("all lightcurves too short")
        return empty_result

    def rate_variability(flux_column, flux_err_columns):
        """Return the variability statistics for a single light curve."""
        # Inverse-variance weighted mean flux and the deviation of every
        # observation from it, in units of that observation's error.
        wmean_flux = np.average(flux_column, weights=1 / flux_err_columns**2)
        rel_dev = (flux_column - wmean_flux) / flux_err_columns

        return {
            "max_variability": np.abs(rel_dev).max(),
            "band": band,
            "wmean_flux": wmean_flux,
            "rel_dev": rel_dev,
            "whitened_data": whiten_data(flux_column, flux_err_columns)
        }

    ## Drop rows with obvious variability, just return stats per-object.
    z_scores = nested_df.reduce(
        rate_variability, f"{source_column}.psfFlux", f"{source_column}.psfFluxErr"
    )
    non_vars = z_scores.query("max_variability < 10")
    if len(non_vars) == 0:
        dask_print("all lightcurves too variable")
        return empty_result

    dask_print("worked on", pixel, "found", len(non_vars))
    return non_vars

In [None]:
# Release tag; selects which sub-directory of ``hats_dir`` is read.
drp_release = "w_2025_11"
# Name of the nested source column that the per-band filter operates on.
target_source_column = "objectForcedSource"

# Input location: <hats_dir>/<drp_release>.
hats_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/hats")
hats_path = hats_dir.joinpath(drp_release)

# Root directory under which one output catalog per band is written.
output_path = Path("/sdf/data/rubin/shared/lsdb_commissioning/science_projects/06_uncertainty/object_whiten")

In [None]:
# Run the whole filter once per band; each band's result is written to its own
# catalog directory, ``output_path / "<band>_band"``.
# NOTE(review): only four bands are listed here although the notebook text
# mentions six runs — confirm whether the remaining bands are skipped on purpose.
for target_band in ["g", "r", "i", "z"]:
    # Create a local cluster and connect to it
    with Client(n_workers=1, threads_per_worker=1, memory_limit="10GB") as client:
        # NOTE(review): this option is set after the Client has been created and
        # is re-applied on every iteration — confirm it takes effect where needed.
        dask.config.set({"dataframe.convert-string": False})
        
        # Load only the object id and the nested source column of the
        # "object_lc" catalog for the configured release.
        object_catalog = lsdb.read_hats(
            hats_path / "object_lc", columns=["objectId", target_source_column]
        )
        # Apply filter_by_nested to the catalog partitions for this band.
        # include_pixel=True presumably makes lsdb pass the partition's pixel as
        # the second positional argument (``pixel``) — TODO confirm in lsdb docs.
        obj_cat = object_catalog.map_partitions(
            filter_by_nested,
            source_column=target_source_column,
            band=target_band,
            include_pixel=True,
        )
        # Write this band's filtered statistics out as a catalog.
        obj_cat.to_hats(output_path / f"{target_band}_band")
        print("=======FINISHED with band", target_band, "========")