This notebook loads ComCam (DP1) object data and cross-matches it with the Euclid catalog and the spectroscopic redshift catalog.

Creator: John Franklin Crenshaw  
Last run: June 18, 2025

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from astropy.table import Table
import astropy.units as u
from astropy.coordinates import SkyCoord

from lsst.daf.butler import Butler
from dustmaps.sfd import SFDQuery

from pathlib import Path

import tables_io

In [None]:
# Directories used throughout the notebook
# catalog_dir: holds the redshift catalog that is read in and the catalogs written out
catalog_dir = Path("/sdf/data/rubin/shared/pz/users/crenshaw/comcam/data/catalogs")
# euclid_dir: holds the Euclid catalog file
euclid_dir = Path("/sdf/data/rubin/shared/pz/users/crenshaw/euclid")

#### First we will load and process the ComCam data:

In [None]:
# A few utility functions

def flux_to_mag(flux: np.ndarray) -> np.ndarray:
    """Return AB magnitudes for fluxes given in nJy.

    Negative fluxes are clipped to zero before the logarithm is taken, so
    every flux <= 0 maps to a magnitude of +inf. NaN fluxes stay NaN.

    Parameters
    ----------
    flux : np.ndarray
        Fluxes in nJy.

    Returns
    -------
    np.ndarray
        AB magnitudes
    """
    zero_point = 31.4  # constant added after -2.5 * log10(flux)
    non_negative = np.clip(flux, 0, None)

    # log10(0) = -inf is expected for the clipped entries; silence the warning
    with np.errstate(divide="ignore"):
        log_flux = np.log10(non_negative)

    return -2.5 * log_flux + zero_point


def fluxErr_to_magErr(flux: np.ndarray, fluxErr: np.ndarray) -> np.ndarray:
    """Return magnitude errors for the given fluxes and flux errors.

    The error is ``2.5 / ln(10) * fluxErr / flux`` with negative fluxes
    clipped to zero first, so a positive flux error on a flux <= 0 gives
    +inf.

    Parameters
    ----------
    flux : np.ndarray
        Fluxes in nJy.
    fluxErr : np.ndarray
        Flux errors in nJy.

    Returns
    -------
    np.ndarray
        Magnitude errors
    """
    scale = 2.5 / np.log(10)
    clipped_flux = np.clip(flux, 0, None)

    # Division by a clipped (zero) flux is expected; silence that warning
    with np.errstate(divide="ignore"):
        return scale * fluxErr / clipped_flux

def calc_ext_class(df: pd.DataFrame) -> np.ndarray:
    """Calculate EXT_CLASS defined by Alex DW.

    refExtendedness provides a very pure sample of galaxies, at the expense
    of losing a bunch of faint galaxies. Alex DW suggested a different
    (unoptimized) cut that returns a more complete but less pure galaxy sample.
    More info:
    https://rubin-obs.slack.com/archives/C07Q45NF8TZ/p1742103513020149

    For every object the PSF and cModel fluxes are taken from that object's
    reference band (``refBand``), and

        EXT_CLASS = (1 - psf / cModel) + 2.5 * (psf / cModel) * err

    where ``err`` is the quadrature sum of the two fractional flux errors,
    clipped to [0, 1].

    Parameters
    ----------
    df : pd.DataFrame
        Table of ComCam data with a ``refBand`` column plus
        ``{band}_cModelFlux``, ``{band}_cModelFluxErr``, ``{band}_psfFlux``
        and ``{band}_psfFluxErr`` columns for every band that occurs in
        ``refBand``. Only column access by name (``df[name]``) is used, so
        other table types that support it can be passed too.

    Returns
    -------
    np.ndarray
        EXT_CLASS values. Note a reasonable cut is EXT_CLASS > 0.1.

    Raises
    ------
    KeyError
        If a flux column for one of the reference bands is missing.
    """

    def _as_float(name: str) -> np.ndarray:
        # np.ma.filled replaces masked entries (if the column is masked) with
        # NaN and passes unmasked data through unchanged.
        return np.asarray(np.ma.filled(df[name], np.nan), dtype=float)

    # Reference band of every object, which is used to determine extendedness
    ref_band = np.asarray(df["refBand"], dtype=object)
    n_obj = len(ref_band)

    # Per-object fluxes in the reference band, filled in band by band
    cmodel_flux = np.full(n_obj, np.nan)
    cmodel_fluxerr = np.full(n_obj, np.nan)
    psf_flux = np.full(n_obj, np.nan)
    psf_fluxerr = np.full(n_obj, np.nan)

    for band in pd.unique(ref_band):
        in_band = ref_band == band
        cmodel_flux[in_band] = _as_float(f"{band}_cModelFlux")[in_band]
        cmodel_fluxerr[in_band] = _as_float(f"{band}_cModelFluxErr")[in_band]
        psf_flux[in_band] = _as_float(f"{band}_psfFlux")[in_band]
        psf_fluxerr[in_band] = _as_float(f"{band}_psfFluxErr")[in_band]

    # Zero / NaN fluxes are expected in the inputs; the resulting inf / NaN
    # values are propagated rather than warned about.
    with np.errstate(divide="ignore", invalid="ignore", over="ignore"):
        # Extendedness is determined by the ratio of PSF flux to cModel flux
        # (point sources should be consistent with PSF)
        ref_flux_ratio = psf_flux / cmodel_flux

        # Calculate error on the flux ratio
        # uses standard error propagation, assuming two flux types are
        # uncorrelated; bad assumption, but this matches what Alex DW did
        err_ratio = np.sqrt(
            (psf_fluxerr / psf_flux) ** 2 + (cmodel_fluxerr / cmodel_flux) ** 2
        )
        err_ratio = np.clip(err_ratio, 0, 1)
        ref_flux_ratio_err = ref_flux_ratio * err_ratio

        # Calculate quantity we will cut on
        ext_class = (1 - ref_flux_ratio) + 5 / 2 * ref_flux_ratio_err

    return ext_class

In [None]:
# Load and process ComCam data

# Bands and flux measurements to request (ALL flux types)
bands = ["u", "g", "r", "i", "z", "y"]
flux_types = ["cModel", "sersic", "gaap1p0", "gaap3p0", "psf", "kron"]

# Column list: the non-flux columns first, then a (flux, fluxErr) pair for
# every flux type / band combination
col_names = ["objectId", "coord_ra", "coord_dec", "refExtendedness", "refBand"]
col_names.extend(
    name
    for flux_type in flux_types
    for band in bands
    for name in (f"{band}_{flux_type}Flux", f"{band}_{flux_type}FluxErr")
)

# Butler with latest DP1 run
butler = Butler("/repo/dp1", collections="LSSTComCam/runs/DRP/DP1/v29_0_0/DM-50260")


# Load the ComCam objects tract by tract (ECDFS tracts) and collect one
# pandas frame per tract in df_list
df_list = []
ecdfs_tracts = [5063, 4849, 4848]
for tract in ecdfs_tracts:
    # Fetch only the requested columns of the object table for this tract
    data_id = {"band": "i", "tract": tract, "skymap": "lsst_cells_v1"}
    objects = butler.get("object", dataId=data_id, parameters={"columns": col_names})

    # Keep objects with i-band cModel SNR > 5
    snr_i = objects["i_cModelFlux"] / objects["i_cModelFluxErr"]
    objects = objects[snr_i > 5]

    # Add AB magnitudes and magnitude errors for every flux type and band
    for flux_type in flux_types:
        for band in bands:
            band_flux = objects[f"{band}_{flux_type}Flux"]
            band_flux_err = objects[f"{band}_{flux_type}FluxErr"]
            objects[f"{band}_{flux_type}Mag"] = flux_to_mag(band_flux)
            objects[f"{band}_{flux_type}MagErr"] = fluxErr_to_magErr(band_flux, band_flux_err)

    # Star/galaxy separation: store ext_class and keep ext_class >= 0.1.
    # This cut favors galaxy completeness rather than purity on the faint end
    objects["ext_class"] = calc_ext_class(objects)
    is_extended = objects["ext_class"] >= 0.1
    objects = objects[is_extended]

    # Store the i-band SNR for two different flux types
    for flux_type in ("cModel", "gaap1p0"):
        objects[f"snr_i_{flux_type}"] = (
            objects[f"i_{flux_type}Flux"] / objects[f"i_{flux_type}FluxErr"]
        )

    df_list.append(objects.to_pandas())


# Combine catalogs
# ignore_index=True gives the combined catalog a unique 0..N-1 index; without
# it the row labels of the individual per-tract frames are kept and repeat
comcam = pd.concat(df_list, ignore_index=True)

# De-redden magnitudes
# Per-band multipliers applied to E(B-V) below
# NOTE(review): presumably A_band / E(B-V) for the ugrizy bands — confirm source
band_a_ebv = dict(
    u=4.81,
    g=3.64,
    r=2.70,
    i=2.06,
    z=1.58,
    y=1.31,
)

comcam_coord = SkyCoord(
    ra=comcam.coord_ra.values,
    dec=comcam.coord_dec.values,
    unit="deg",
)

# Look up E(B-V) from the SFD dust map at every object position
sfd = SFDQuery()
ebv = sfd(comcam_coord)
comcam["ebv"] = ebv

# Subtract the per-band extinction from every magnitude column
for flux_type in flux_types:
    for band in bands:
        comcam[f"{band}_{flux_type}Mag"] -= comcam["ebv"] * band_a_ebv[band]


# Drop the flux columns (every column whose name contains "Flux")
comcam = comcam.loc[:, ~comcam.columns.str.contains("Flux")]

#### Now we will load and cross-match with our Euclid catalog:

In [None]:
# Load the Euclid data
euclid0 = Table.read(euclid_dir / "euclid_ecdfs.fits").to_pandas()

# Keep sources with VIS PSF-flux SNR >= 5 and flag_vis equal to 0
vis_snr = euclid0["flux_vis_psf"] / euclid0["fluxerr_vis_psf"]
keep_vis = (vis_snr >= 5) & (euclid0["flag_vis"] == 0)
euclid0 = euclid0[keep_vis].reset_index()

# Package the data we want to keep, starting with the sky coordinates
euclid = pd.DataFrame(
    np.column_stack([euclid0["right_ascension"], euclid0["declination"]]),
    columns=["coord_ra", "coord_dec"],
)

# VIS magnitudes from the PSF fluxes (scaled by 1e3 to nJy)
vis_flux = 1e3 * euclid0["flux_vis_psf"]  # nJy
vis_flux_err = 1e3 * euclid0["fluxerr_vis_psf"]  # nJy
euclid["euclid_vis_psfMag"] = flux_to_mag(vis_flux)
euclid["euclid_vis_psfMagErr"] = fluxErr_to_magErr(vis_flux, vis_flux_err)

# Now YJH, using the "unif" fluxes
for band in "yjh":
    mag_col = f"euclid_{band}_unifMag"
    err_col = f"euclid_{band}_unifMagErr"

    band_flux = 1e3 * euclid0[f"flux_{band}_unif"]  # nJy
    band_flux_err = 1e3 * euclid0[f"fluxerr_{band}_unif"]  # nJy
    euclid[mag_col] = flux_to_mag(band_flux)
    euclid[err_col] = fluxErr_to_magErr(band_flux, band_flux_err)

    # Blank out the measurements of sources whose flag for this band equals 1
    flagged = euclid0[f"flag_{band}"] == 1
    euclid.loc[flagged, [mag_col, err_col]] = np.nan

In [None]:
# Cross-match ComCam and Euclid
comcam_coord = SkyCoord(
    ra=comcam.coord_ra.values * u.degree,
    dec=comcam.coord_dec.values * u.degree,
)

euclid_coord = SkyCoord(
    ra=euclid.coord_ra.values * u.degree,
    dec=euclid.coord_dec.values * u.degree,
)

# Get nearest matches on the sky: for every ComCam object, the index of the
# closest Euclid source and the separation to it
idx, d2d, _ = comcam_coord.match_to_catalog_sky(euclid_coord)
d2d = d2d.to(u.arcsec).value

MAX_SEP = 0.8 # arcsecs

# Distribution of nearest-match separations, with the cut marked in red
plt.hist(d2d, bins=np.geomspace(1e-3, 20))
plt.xscale("log")
plt.title("ComCam-Euclid cross-match")
plt.xlabel("Distance to nearest match (arcsec)")
plt.axvline(MAX_SEP, c="r", ls="--")

# Replace large separations with idx=-1
idx[d2d > MAX_SEP] = -1

# Replace duplicates with -1: a Euclid source claimed by more than one ComCam
# object is treated as unmatched for all of them
elements, counts = np.unique(idx, return_counts=True)
idx[np.isin(idx, elements[counts > 1])] = -1

# Match Euclid into ComCam
# Row label -1 is an all-NaN sentinel, so unmatched objects (idx == -1)
# pick up NaN for every Euclid column
euclid_match = euclid.reset_index(drop=True)
euclid_match.loc[-1] = np.full(len(euclid.columns), np.nan)
euclid_match = euclid_match.loc[idx]
euclid_match = euclid_match.reset_index(drop=True)
euclid_match = euclid_match.drop(["coord_ra", "coord_dec"], axis=1)

# Join side by side on a shared 0..N-1 index. Concatenating the frames
# (instead of stacking their .values) keeps each column's dtype rather than
# coercing everything to object.
comcam_euclid = pd.concat(
    [comcam.reset_index(drop=True), euclid_match],
    axis=1,
)

#### Finally, load and match with redshift catalog

In [None]:
# Cross-match with the spectroscopic catalog
z_cat = pd.read_parquet(catalog_dir / "comcam_ecdfs_redshift_catalog_20250618.parquet")
z_cat = z_cat.query("redshift < 7")

spec_coord = SkyCoord(
    ra=z_cat.RA.values * u.degree,
    dec=z_cat.DEC.values * u.degree,
)

# Get nearest matches on the sky: for every ComCam object, the index of the
# closest spec-z source and the separation to it
idx, d2d, _ = comcam_coord.match_to_catalog_sky(spec_coord)
d2d = d2d.to(u.arcsec).value

MAX_SEP = 1 # arcsecs

# Distribution of nearest-match separations, with the cut marked in red
plt.hist(d2d, bins=np.geomspace(1e-2, 200))
plt.xscale("log")
plt.title("ComCam-Spec-z cross-match")
plt.xlabel("Distance to nearest match (arcsec)")
plt.axvline(MAX_SEP, c="r", ls="--")

# Replace large separations with idx=-1
idx[d2d > MAX_SEP] = -1

# Replace duplicates with -1: a spec-z source claimed by more than one ComCam
# object is treated as unmatched for all of them
elements, counts = np.unique(idx, return_counts=True)
idx[np.isin(idx, elements[counts > 1])] = -1

# Match the spec-z catalog into ComCam
# Row label -1 is a sentinel picked up by unmatched objects (idx == -1).
# NOTE(review): the sentinel hard-codes six values (four NaN, then two empty
# strings), so it assumes z_cat has exactly six columns in that layout —
# verify if the catalog schema changes
z_match = z_cat.reset_index(drop=True)
z_match.loc[-1] = [np.nan, np.nan, np.nan, np.nan, "", ""]
z_match = z_match.loc[idx].reset_index(drop=True)
z_match = z_match.drop(["RA", "DEC"], axis=1)

combined = pd.concat([comcam_euclid, z_match], axis=1)

# Count first, then format: reusing double quotes inside a double-quoted
# f-string is a SyntaxError before Python 3.12
n_specz = int(np.isfinite(combined["redshift"]).sum())
print(f"{n_specz} galaxies have spec-z's")

In [None]:
# Write the full cross-matched catalog to parquet
crossmatched_path = catalog_dir / "comcam_ecdfs_crossmatched_catalog_20250618.parquet"
combined.to_parquet(crossmatched_path)

#### Save train/test sets with stringent cuts

In [None]:
# Stringent cuts: select on type, confidence and gaap1p0 i-band SNR, then
# require every gaap1p0 column to be non-null
gold = combined.query("(type == 's') & (confidence >= 0.95) & (snr_i_gaap1p0 >= 10)")
gaap1p0_cols = [name for name in gold.columns if "gaap1p0" in name]
gold = gold.dropna(subset=gaap1p0_cols)

# Train/test split: a reproducible random 80% for training, the rest for testing
train = gold.sample(frac=0.80, random_state=42)
test = gold.drop(train.index)

# Save as hdf5
outputs = (
    (train, "dp1_ecdfs_matched_specgold_train.hdf5"),
    (test, "dp1_ecdfs_matched_specgold_test.hdf5"),
)
for subset, filename in outputs:
    tables_io.write(subset, catalog_dir / filename)

#### Some plots

In [None]:
# Four-panel summary of the objects that have a spec-z:
# sky footprint per survey, redshift histograms per survey (linear and log),
# and redshift histograms per redshift type
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(14, 2.5), dpi=200)

with_z = combined[np.isfinite(combined["redshift"])]

# Names and colors of surveys
# Names are sorted so they line up with the (sorted) pivot columns below
names = with_z.source.unique()
names.sort()
colors = list(sns.color_palette('tab20', n_colors=len(names)))
colors = colors[::-1]

for name, color in zip(names, colors):
    # Plot footprint of each survey
    # A boolean mask (rather than a query string) also works for names that
    # contain quote characters
    subset = with_z[with_z["source"] == name]
    ax1.scatter(subset.coord_ra, subset.coord_dec, color=color, s=0.01)

# Configure the footprint axis once, after all surveys are drawn.
# invert_xaxis() toggles the direction on every call, so calling it inside the
# loop left RA inverted only for an odd number of surveys.
ax1.set(xlabel="RA (deg)", ylabel="Dec (deg)", aspect="equal")
ax1.invert_xaxis()

# One redshift column per survey, shared by the two stacked histograms
z_by_source = with_z.pivot(columns="source", values="redshift")

z_by_source.plot.hist(stacked=True, bins=64, range=(0, 8), ax=ax2, color=colors)
ax2.set(
    xlabel="Redshift",
    xlim=(0, 7),
)
ax2.legend(fontsize=5, ncols=2, loc="upper right", frameon=False)

z_by_source.plot.hist(stacked=True, bins=64, range=(0, 8), ax=ax3, color=colors, legend=False)
ax3.set(
    xlabel="Redshift",
    yscale="log",
    xlim=(0, 7),
    ylabel=None,
)

# One redshift column per redshift type
z_by_type = with_z.pivot(columns="type", values="redshift")
z_by_type.plot.hist(stacked=False, bins=64, range=(0, 8), ax=ax4, histtype="step")
ax4.set(
    xlabel="Redshift",
    yscale="log",
    xlim=(0, 7),
    ylabel=None,
)
ax4.legend(fontsize=8, ncols=3, loc="upper right", handlelength=1, frameon=False, borderpad=0, columnspacing=1)

fig.subplots_adjust(wspace=0.25)

In [None]:
# 10 sigma depths in cModelMag

fig, axes = plt.subplots(2, 3, figsize=(10, 5), dpi=150)

for ax, band in zip(axes.flatten(), "ugrizy"):
    mag = combined[f"{band}_cModelMag"].to_numpy(dtype=float)
    mag_err = combined[f"{band}_cModelMagErr"].to_numpy(dtype=float)

    # SNR = flux / fluxErr follows directly from the magnitude error:
    #   magErr = 2.5 / ln(10) * fluxErr / flux  =>  SNR = 2.5 / (ln(10) * magErr)
    # so there is no need to invert the magnitude back into a flux.
    # Non-detections (infinite magnitude and error) come out as NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        snr = 2.5 / (np.log(10) * mag_err)
        # Magnitude at which the same noise would give SNR = 10
        depth = mag + 2.5 * np.log10(snr / 10)

    # Histogram and median over finite depths only
    finite_depth = depth[np.isfinite(depth)]
    median_depth = np.median(finite_depth) if finite_depth.size else np.nan

    ax.hist(finite_depth, bins="auto")
    ax.axvline(median_depth, ls="--", c="r")
    # Raw f-strings keep the LaTeX backslashes without invalid-escape warnings
    ax.set(
        xlabel=rf"${band}$ band 10$\sigma$ depth",
        title=rf"median $\sim$ {median_depth:.2f}",
        xlim=(19, 27),
    )

fig.subplots_adjust(hspace=0.6, wspace=0.5)

In [None]:
# 10 sigma depths in gaap1p0

fig, axes = plt.subplots(2, 3, figsize=(10, 5), dpi=150)

for ax, band in zip(axes.flatten(), "ugrizy"):
    mag = combined[f"{band}_gaap1p0Mag"].to_numpy(dtype=float)
    mag_err = combined[f"{band}_gaap1p0MagErr"].to_numpy(dtype=float)

    # SNR = flux / fluxErr follows directly from the magnitude error:
    #   magErr = 2.5 / ln(10) * fluxErr / flux  =>  SNR = 2.5 / (ln(10) * magErr)
    # so there is no need to invert the magnitude back into a flux.
    # Non-detections (infinite magnitude and error) come out as NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        snr = 2.5 / (np.log(10) * mag_err)
        # Magnitude at which the same noise would give SNR = 10
        depth = mag + 2.5 * np.log10(snr / 10)

    # Histogram and median over finite depths only
    finite_depth = depth[np.isfinite(depth)]
    median_depth = np.median(finite_depth) if finite_depth.size else np.nan

    ax.hist(finite_depth, bins="auto")
    ax.axvline(median_depth, ls="--", c="r")
    # Raw f-strings keep the LaTeX backslashes without invalid-escape warnings
    ax.set(
        xlabel=rf"${band}$ band 10$\sigma$ depth",
        title=rf"median $\sim$ {median_depth:.2f}",
        xlim=(19, 27),
    )

fig.subplots_adjust(hspace=0.6, wspace=0.5)