# Catalog Verification - Basic Statistics

Perform some basic verification on the datasets.

- confirm the number of nulls (NaNs) in the dataset is within expectations
- for fields with predictable limits, confirm min/max values in dataset

In [1]:
import os
import hats
import numpy as np
import pandas as pd

from hats.io.validation import is_valid_catalog
from pathlib import Path

In [2]:
# Version tag of the data to verify, read from the DRP_VERSION environment variable.
# The variable must be set before the kernel starts; a missing variable raises KeyError here.
DRP_VERSION = os.environ["DRP_VERSION"]
print(f"DRP_VERSION: {DRP_VERSION}")
# Root of the shared output area; the catalogs for one version live under hats/<DRP_VERSION>.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning")
hats_dir = base_output_dir / "hats" / DRP_VERSION

DRP_VERSION: w_2025_09


## Convenience methods

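We define a few convenience functions to load catalog metadata, check that columns of known value types (right ascension, declination, flux, flux error, MJD) stay within their expected ranges, and report which columns contain nulls, along with each column's null count and percentage.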

This ensures we're performing the same kinds of checks against each table type.

In [None]:
def min_max_floats_in_range(stats, columns, exp_min_value, exp_max_value, value_type):
    """Check that the recorded min/max of each listed column lies within the expected bounds.

    Prints one `****` line per problem column, or a single summary line when every
    column passes. Nothing is printed when `columns` is None or empty.

    Args:
        stats: statistics table indexed by column name, providing `min_value` and
            `max_value` entries for each column.
        columns: names of the columns (rows of `stats`) to check, or None to skip.
        exp_min_value: smallest acceptable value (inclusive).
        exp_max_value: largest acceptable value (inclusive).
        value_type: human-readable label used in the printed messages.

    Returns:
        None. Results are reported via `print`.
    """
    if not columns:
        return

    min_values = stats["min_value"]
    max_values = stats["max_value"]

    anything_bad = False
    for col in columns:
        # A requested column with no statistics row cannot be verified; report it
        # instead of aborting the whole report with a KeyError.
        if col not in min_values or col not in max_values:
            print(f"**** {col} has no statistics available, cannot verify range for {value_type}")
            anything_bad = True
            continue

        raw_min = min_values[col]
        raw_max = max_values[col]

        # NaN/None compares False against any bound, so it would otherwise slip
        # through the range test below and be reported as acceptable.
        if pd.isna(raw_min) or pd.isna(raw_max):
            print(
                f"**** {col} has missing min/max statistics, cannot verify range for {value_type} (min:{raw_min}, max:{raw_max})"
            )
            anything_bad = True
            continue

        if float(raw_min) < exp_min_value or float(raw_max) > exp_max_value:
            print(
                f"**** {col} has values outside acceptable range for {value_type} (min:{raw_min}, max:{raw_max})"
            )
            anything_bad = True

    if not anything_bad:
        print(f"  All {value_type} columns within acceptable ranges ({columns})")


def verify_catalog(
    catalog_name,
    ra_cols=None,
    dec_cols=None,
    flux_cols=None,
    flux_err_cols=None,
    mjd_cols=None,
    mjd_range=(60600.0, 60700.0),
):
    """Print a basic verification report for one catalog under `hats_dir`.

    The report contains the catalog validity flag, partition / row / column counts,
    min/max range checks for the listed columns, and a table of the columns that
    contain nulls (count and percentage of total rows, largest percentage first).

    Args:
        catalog_name: name of the catalog directory under `hats_dir`.
        ra_cols: columns checked against [0, 360].
        dec_cols: columns checked against [-90, 90].
        flux_cols: columns checked against [-1e8, 1e8].
        flux_err_cols: columns checked against [0, 1e8].
        mjd_cols: columns checked against `mjd_range`.
        mjd_range: inclusive `(min, max)` bounds accepted for `mjd_cols`. Override
            this for data taken outside the default window.

    Returns:
        None. Everything is reported via `print`.
    """
    catalog_path = hats_dir / catalog_name

    print(catalog_name)
    cat = hats.read_hats(catalog_path)
    print("  is valid catalog", is_valid_catalog(catalog_path, strict=True, verbose=False))
    print("  num partitions:", len(cat.get_healpix_pixels()))
    total_rows = cat.catalog_info.total_rows
    print("  num rows:", total_rows)
    stats = cat.aggregate_column_statistics()
    # Reported before the "Mag" filter below, so it counts every column with statistics.
    print("  num columns:", len(stats))
    # Remove columns with "Mag" as these are created for HATS
    stats = stats.loc[~stats.index.str.contains("Mag")]

    min_max_floats_in_range(stats, ra_cols, 0.0, 360.0, "RIGHT ASCENSION")
    min_max_floats_in_range(stats, dec_cols, -90.0, 90.0, "DECLINATION")
    min_max_floats_in_range(stats, flux_cols, -100_000_000.0, 100_000_000.0, "FLUX")
    min_max_floats_in_range(stats, flux_err_cols, 0.0, 100_000_000.0, "FLUX ERROR")
    mjd_min, mjd_max = mjd_range
    min_max_floats_in_range(stats, mjd_cols, mjd_min, mjd_max, "MJD")

    null_counts = stats["null_count"].to_numpy(dtype=np.int64)
    has_nulls = null_counts > 0

    if has_nulls.any():
        # Build the report as a fresh frame (.loc + .assign) rather than adding a
        # column to a slice of `stats`, which can trigger SettingWithCopyWarning.
        with_nulls = (
            stats.loc[has_nulls, ["null_count"]]
            .assign(percent=null_counts[has_nulls] / total_rows * 100)
            .sort_values(by="percent", ascending=False)
        )
        print(f"  columns with nulls: {len(with_nulls)}")
        print(with_nulls)
    else:
        print("  columns with nulls: 0")

In [4]:
# diaObject: only the position columns and the position epoch are range-checked;
# no flux or flux-error columns are passed for this table.
verify_catalog(
    "diaObject",
    ra_cols=["ra"],
    dec_cols=["dec"],
    mjd_cols=["radecMjdTai"],
)

diaObject
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_09/diaObject ... 
Found 4 partitions.
Approximate coverage is 33.33 % of the sky.
  is valid catalog True
  num partitions: 4
  num rows: 2276883
  num columns: 6
  All RIGHT ASCENSION columns within acceptable ranges (['ra'])
  All DECLINATION columns within acceptable ranges (['dec'])
  All MJD columns within acceptable ranges (['radecMjdTai'])
  columns with nulls: 0


In [5]:
# diaSource: positions, fluxes with their uncertainties, and the observation epoch.
# NOTE(review): ixxPSF / iyyPSF / ixyPSF are range-checked with the FLUX limits; by name
# they look like PSF moments rather than fluxes -- confirm this grouping is intended.
dia_source_flux_cols = [
    "apFlux", "psfFlux", "trailFlux",
    "dipoleMeanFlux", "dipoleFluxDiff", "scienceFlux",
    "ixxPSF", "iyyPSF", "ixyPSF",
]
# Only these fluxes have their "<name>Err" column checked.
dia_source_flux_err_cols = [
    f"{flux}Err"
    for flux in ("apFlux", "psfFlux", "dipoleMeanFlux", "dipoleFluxDiff", "scienceFlux")
]

verify_catalog(
    "diaSource",
    ra_cols=["ra", "coord_ra", "trailRa"],
    dec_cols=["dec", "coord_dec", "trailDec"],
    flux_cols=dia_source_flux_cols,
    flux_err_cols=dia_source_flux_err_cols,
    mjd_cols=["midpointMjdTai"],
)

diaSource
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_09/diaSource ... 
Found 6 partitions.
Approximate coverage is 31.25 % of the sky.
  is valid catalog True
  num partitions: 6
  num rows: 4159577
  num columns: 92
  All RIGHT ASCENSION columns within acceptable ranges (['ra', 'coord_ra', 'trailRa'])
  All DECLINATION columns within acceptable ranges (['dec', 'coord_dec', 'trailDec'])
  All FLUX columns within acceptable ranges (['apFlux', 'psfFlux', 'trailFlux', 'dipoleMeanFlux', 'dipoleFluxDiff', 'scienceFlux', 'ixxPSF', 'iyyPSF', 'ixyPSF'])
  All FLUX ERROR columns within acceptable ranges (['apFluxErr', 'psfFluxErr', 'dipoleMeanFluxErr', 'dipoleFluxDiffErr', 'scienceFluxErr'])
  All MJD columns within acceptable ranges (['midpointMjdTai'])
  columns with nulls: 31
                  null_count    percent
column_names                           
dipoleChi2           3082123  74.097030
dipoleAngle          3082123  74.097030
dipoleLength         

In [6]:
# diaForcedSource: every checked flux has a matching "<name>Err" column, so the
# error list is derived from the flux list.
dia_forced_flux_cols = ["psfFlux", "psfDiffFlux", "localBackground_instFlux"]

verify_catalog(
    "diaForcedSource",
    ra_cols=["coord_ra"],
    dec_cols=["coord_dec"],
    flux_cols=dia_forced_flux_cols,
    flux_err_cols=[f"{flux}Err" for flux in dia_forced_flux_cols],
    mjd_cols=["midpointMjdTai"],
)

diaForcedSource
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_09/diaForcedSource ... 
Found 245 partitions.
Approximate coverage is 17.54 % of the sky.
  is valid catalog True
  num partitions: 245
  num rows: 467053711
  num columns: 40
  All RIGHT ASCENSION columns within acceptable ranges (['coord_ra'])
  All DECLINATION columns within acceptable ranges (['coord_dec'])
  All FLUX columns within acceptable ranges (['psfFlux', 'psfDiffFlux', 'localBackground_instFlux'])
  All FLUX ERROR columns within acceptable ranges (['psfFluxErr', 'psfDiffFluxErr', 'localBackground_instFluxErr'])
  All MJD columns within acceptable ranges (['midpointMjdTai'])
  columns with nulls: 6
                            null_count   percent
column_names                                    
localBackground_instFluxErr    6101071  1.306289
localBackground_instFlux       6100739  1.306218
psfDiffFluxErr                  364594  0.078063
psfDiffFlux                     364594  

In [7]:
# object: psf and kron fluxes for each of the u, g, r, i, z, y bands, listed band by band
# (u_psfFlux, u_kronFlux, g_psfFlux, ...). No MJD columns are checked for this table.
object_flux_cols = [
    f"{band}_{measurement}Flux"
    for band in "ugrizy"
    for measurement in ("psf", "kron")
]

verify_catalog(
    "object",
    ra_cols=["coord_ra"],
    dec_cols=["coord_dec"],
    flux_cols=object_flux_cols,
    flux_err_cols=[f"{flux}Err" for flux in object_flux_cols],
)

object
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_09/object ... 
Found 80 partitions.
Approximate coverage is 9.74 % of the sky.
  is valid catalog True
  num partitions: 80
  num rows: 5366707
  num columns: 74
  All RIGHT ASCENSION columns within acceptable ranges (['coord_ra'])
  All DECLINATION columns within acceptable ranges (['coord_dec'])
**** r_kronFlux has values outside acceptable range for FLUX (min:-1871823.198986211, max:254366473.42652225)
**** u_kronFluxErr has values outside acceptable range for FLUX ERROR (min:10.335923840071157, max:inf)
**** g_kronFluxErr has values outside acceptable range for FLUX ERROR (min:12.028468044159634, max:inf)
**** r_kronFluxErr has values outside acceptable range for FLUX ERROR (min:10.141821398611642, max:inf)
**** i_kronFluxErr has values outside acceptable range for FLUX ERROR (min:25.241087956430487, max:inf)
**** z_kronFluxErr has values outside acceptable range for FLUX ERROR (min:53.313838526

In [8]:
# Uncomment to lift pandas' row-display limit when the null-count table is printed:
# pd.set_option('display.max_rows', None)

# source: calibration, aperture, PSF and Gaussian fluxes plus the instFlux measurements.
# Every checked flux has a matching "<name>Err" column.
source_flux_cols = (
    ["calibFlux"]
    + [f"ap{radius}Flux" for radius in ("03", "06", "09", "12", "17", "25", "35", "50", "70")]
    + ["psfFlux", "gaussianFlux"]
    + [f"apFlux_{radius}_0_instFlux" for radius in ("12", "17", "35", "50")]
    + ["normCompTophatFlux_instFlux", "localBackground_instFlux"]
)

verify_catalog(
    "source",
    ra_cols=["coord_ra", "ra"],
    dec_cols=["coord_dec", "dec"],
    flux_cols=source_flux_cols,
    flux_err_cols=[f"{flux}Err" for flux in source_flux_cols],
    mjd_cols=["midpointMjdTai"],
)

source
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_09/source ... 
Found 117 partitions.
Approximate coverage is 9.19 % of the sky.
  is valid catalog True
  num partitions: 117
  num rows: 44360726
  num columns: 169
  All RIGHT ASCENSION columns within acceptable ranges (['coord_ra', 'ra'])
  All DECLINATION columns within acceptable ranges (['coord_dec', 'dec'])
  All FLUX columns within acceptable ranges (['calibFlux', 'ap03Flux', 'ap06Flux', 'ap09Flux', 'ap12Flux', 'ap17Flux', 'ap25Flux', 'ap35Flux', 'ap50Flux', 'ap70Flux', 'psfFlux', 'gaussianFlux', 'apFlux_12_0_instFlux', 'apFlux_17_0_instFlux', 'apFlux_35_0_instFlux', 'apFlux_50_0_instFlux', 'normCompTophatFlux_instFlux', 'localBackground_instFlux'])
  All FLUX ERROR columns within acceptable ranges (['calibFluxErr', 'ap03FluxErr', 'ap06FluxErr', 'ap09FluxErr', 'ap12FluxErr', 'ap17FluxErr', 'ap25FluxErr', 'ap35FluxErr', 'ap50FluxErr', 'ap70FluxErr', 'psfFluxErr', 'gaussianFluxErr', 'apFlux_12

In [9]:
# forcedSource: every checked flux has a matching "<name>Err" column, so the
# error list is derived from the flux list.
forced_source_flux_cols = ["psfFlux", "psfDiffFlux", "localBackground_instFlux"]

verify_catalog(
    "forcedSource",
    ra_cols=["coord_ra"],
    dec_cols=["coord_dec"],
    flux_cols=forced_source_flux_cols,
    flux_err_cols=[f"{flux}Err" for flux in forced_source_flux_cols],
    mjd_cols=["midpointMjdTai"],
)

forcedSource
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_09/forcedSource ... 
Found 201 partitions.
Approximate coverage is 11.22 % of the sky.
  is valid catalog True
  num partitions: 201
  num rows: 576158429
  num columns: 43
  All RIGHT ASCENSION columns within acceptable ranges (['coord_ra'])
  All DECLINATION columns within acceptable ranges (['coord_dec'])
  All FLUX columns within acceptable ranges (['psfFlux', 'psfDiffFlux', 'localBackground_instFlux'])
  All FLUX ERROR columns within acceptable ranges (['psfFluxErr', 'psfDiffFluxErr', 'localBackground_instFluxErr'])
  All MJD columns within acceptable ranges (['midpointMjdTai'])
  columns with nulls: 6
                            null_count   percent
column_names                                    
localBackground_instFluxErr    5303257  0.920451
localBackground_instFlux       5303077  0.920420
psfDiffFluxErr                  500156  0.086809
psfDiffFlux                     500156  0.0868