# Catalog Verification - Basic Statistics

Perform some basic verification on the datasets.

- confirm the number of nulls (NaNs) in the dataset is within expectations
- for fields with predictable limits, confirm the min/max values in the dataset fall within those limits

In [12]:
import os
import hats
import numpy as np
import pandas as pd

from hats.io.validation import is_valid_catalog
from pathlib import Path

In [14]:
# The release version to verify is taken from the environment; indexing
# os.environ directly raises KeyError when the variable is unset, so the
# notebook stops here instead of continuing with an undefined version.
DRP_VERSION = os.environ["DRP_VERSION"]
print(f"DRP_VERSION: {DRP_VERSION}")

# Shared root directory; the "raw" and "hats" trees each hold one
# sub-directory per DRP version.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning")
raw_dir = base_output_dir / "raw" / DRP_VERSION
hats_dir = base_output_dir / "hats" / DRP_VERSION

DRP_VERSION: w_2025_07


## Convenience methods

We define a few convenience functions to load catalog metadata, check columns of known value types against their expected ranges, and report the number of nulls found in each column.

This ensures we're performing the same kinds of checks against each table type.

In [19]:
def min_max_floats_in_range(stats, columns, exp_min_value, exp_max_value, value_type):
    """Report whether the min/max statistics of `columns` lie inside an expected range.

    For every name in `columns`, the entries of the `min_value` and `max_value`
    columns of `stats` are converted to float and compared against
    [`exp_min_value`, `exp_max_value`]. A line starting with `****` is printed
    for each column that is not fully inside the range. A column whose
    statistic is NaN is reported too, because it cannot be confirmed to be in
    range. If no column is reported, one summary line is printed instead.

    Args:
        stats: dataframe indexed by column name, with `min_value` and
            `max_value` columns.
        columns: names to check; when None or empty nothing is checked and
            nothing is printed.
        exp_min_value: smallest acceptable value (inclusive).
        exp_max_value: largest acceptable value (inclusive).
        value_type: label used in the printed messages.

    Returns:
        None. All results are printed.
    """
    if not columns:
        return

    min_values = stats["min_value"]
    max_values = stats["max_value"]

    anything_bad = False
    for col in columns:
        # Phrased as a positive "is inside" test: every comparison against NaN
        # is False, so a NaN statistic fails this test and gets reported
        # rather than slipping through as "acceptable".
        in_range = (
            float(min_values[col]) >= exp_min_value
            and float(max_values[col]) <= exp_max_value
        )
        if not in_range:
            print(
                f"**** {col} has values outside acceptable range for {value_type} (min:{min_values[col]}, max:{max_values[col]})"
            )
            anything_bad = True

    if not anything_bad:
        print(f"  All {value_type} columns within acceptable ranges ({columns})")


def verify_catalog(
    catalog_name,
    ra_cols=None,
    dec_cols=None,
    flux_cols=None,
    flux_err_cols=None,
    mjd_cols=None,
):
    """Print a basic verification report for the catalog at `hats_dir / catalog_name`.

    The report contains, in order:

    - the result of a strict `is_valid_catalog` check,
    - the number of partitions, rows and columns,
    - a range check of the aggregated min/max statistics for each group of
      columns passed in,
    - the columns containing nulls, with their null count and that count as a
      percentage of the catalog's total rows, sorted by percentage descending.

    Args:
        catalog_name: name of the catalog directory under `hats_dir`.
        ra_cols: columns checked against [0, 360].
        dec_cols: columns checked against [-90, 90].
        flux_cols: columns checked against [-100_000_000, 100_000_000].
        flux_err_cols: columns checked against [0, 100_000_000].
        mjd_cols: columns checked against [60600, 60700].

    A column group left as None (or empty) is skipped.

    Returns:
        None. All results are printed.
    """
    catalog_path = hats_dir / catalog_name

    print(catalog_name)
    cat = hats.read_hats(catalog_path)
    total_rows = cat.catalog_info.total_rows
    print("  is valid catalog", is_valid_catalog(catalog_path, strict=True))
    print("  num partitions:", len(cat.get_healpix_pixels()))
    print("  num rows:", total_rows)
    stats = cat.aggregate_column_statistics()
    # Note: this count is taken before the "Mag" columns are dropped below.
    print("  num columns:", len(stats))
    ## Remove columns with "Mag" as these are created for HATS
    stats = stats.iloc[~stats.index.str.contains("Mag")]

    min_max_floats_in_range(stats, ra_cols, 0.0, 360.0, "RIGHT ASCENSION")
    min_max_floats_in_range(stats, dec_cols, -90.0, 90.0, "DECLINATION")
    min_max_floats_in_range(stats, flux_cols, -100_000_000.0, 100_000_000.0, "FLUX")
    min_max_floats_in_range(stats, flux_err_cols, 0.0, 100_000_000.0, "FLUX ERROR")
    # NOTE(review): the MJD window is hard-coded; presumably it matches the
    # observation dates of this data release - confirm before reusing elsewhere.
    min_max_floats_in_range(stats, mjd_cols, 60600.0, 60700.0, "MJD")

    np_null_count = stats["null_count"].to_numpy(copy=False, dtype=np.int64)
    with_nulls = stats.iloc[np.flatnonzero(np_null_count > 0)]

    if len(with_nulls):
        print(f"  columns with nulls: {len(with_nulls)}")
        null_counts = with_nulls["null_count"].to_numpy(copy=False, dtype=np.int64)
        # Build the report frame with .assign() instead of adding a column to a
        # column-subset of another frame, which can raise SettingWithCopyWarning.
        with_nulls = (
            with_nulls[["null_count"]]
            .assign(percent=null_counts / total_rows * 100)
            .sort_values(by="percent", ascending=False)
        )
        print(with_nulls)
    else:
        print("  columns with nulls: 0")

In [20]:
# diaObject: range-check the position and MJD columns (no flux columns passed).
verify_catalog(
    catalog_name="diaObject",
    ra_cols=["ra"],
    dec_cols=["dec"],
    mjd_cols=["radecMjdTai"],
)

diaObject
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_07/diaObject ... 
Found 4 partitions.
Approximate coverage is 33.33 % of the sky.
  is valid catalog True
  num partitions: 4
  num rows: 1914697
  md 1914697
  num columns: 6
  All RIGHT ASCENSION columns within acceptable ranges (['ra'])
  All DECLINATION columns within acceptable ranges (['dec'])
  All MJD columns within acceptable ranges (['radecMjdTai'])
  columns with nulls: 0


In [21]:
# diaSource: range-check position, flux, flux-error and MJD columns.
# NOTE(review): ixxPSF / iyyPSF / ixyPSF are range-checked with the FLUX limits
# here, although their names suggest PSF moments rather than fluxes - confirm
# this grouping is intended.
verify_catalog(
    catalog_name="diaSource",
    ra_cols=["ra", "coord_ra", "trailRa"],
    dec_cols=["dec", "coord_dec", "trailDec"],
    flux_cols=[
        "apFlux", "psfFlux", "trailFlux",
        "dipoleMeanFlux", "dipoleFluxDiff", "scienceFlux",
        "ixxPSF", "iyyPSF", "ixyPSF",
    ],
    flux_err_cols=[
        "apFluxErr", "psfFluxErr",
        "dipoleMeanFluxErr", "dipoleFluxDiffErr", "scienceFluxErr",
    ],
    mjd_cols=["midpointMjdTai"],
)

diaSource
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_07/diaSource ... 
Found 4 partitions.
Approximate coverage is 33.33 % of the sky.
  is valid catalog True
  num partitions: 4
  num rows: 3593618
  md 3593618
  num columns: 90
  All RIGHT ASCENSION columns within acceptable ranges (['ra', 'coord_ra', 'trailRa'])
  All DECLINATION columns within acceptable ranges (['dec', 'coord_dec', 'trailDec'])
  All FLUX columns within acceptable ranges (['apFlux', 'psfFlux', 'trailFlux', 'dipoleMeanFlux', 'dipoleFluxDiff', 'scienceFlux', 'ixxPSF', 'iyyPSF', 'ixyPSF'])
  All FLUX ERROR columns within acceptable ranges (['apFluxErr', 'psfFluxErr', 'dipoleMeanFluxErr', 'dipoleFluxDiffErr', 'scienceFluxErr'])
  All MJD columns within acceptable ranges (['midpointMjdTai'])
  columns with nulls: 31
                  null_count    percent
column_names                           
dipoleMeanFlux       2616251  72.802702
dipoleMeanFluxErr    2616251  72.802702
dipoleCh

In [22]:
# diaForcedSource: range-check position, flux, flux-error and MJD columns.
verify_catalog(
    catalog_name="diaForcedSource",
    ra_cols=["coord_ra"],
    dec_cols=["coord_dec"],
    flux_cols=[
        "psfFlux",
        "psfDiffFlux",
        "localBackground_instFlux",
    ],
    flux_err_cols=[
        "psfFluxErr",
        "psfDiffFluxErr",
        "localBackground_instFluxErr",
    ],
    mjd_cols=["midpointMjdTai"],
)

diaForcedSource
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_07/diaForcedSource ... 
Found 225 partitions.
Approximate coverage is 19.60 % of the sky.
  is valid catalog True
  num partitions: 225
  num rows: 426871848
  md 426871848
  num columns: 40
  All RIGHT ASCENSION columns within acceptable ranges (['coord_ra'])
  All DECLINATION columns within acceptable ranges (['coord_dec'])
  All FLUX columns within acceptable ranges (['psfFlux', 'psfDiffFlux', 'localBackground_instFlux'])
  All FLUX ERROR columns within acceptable ranges (['psfFluxErr', 'psfDiffFluxErr', 'localBackground_instFluxErr'])
  All MJD columns within acceptable ranges (['midpointMjdTai'])
  columns with nulls: 6
                            null_count   percent
column_names                                    
localBackground_instFluxErr    5200680  1.218323
localBackground_instFlux       5200490  1.218279
psfDiffFlux                     441129  0.103340
psfDiffFluxErr           

In [23]:
# object: range-check position columns plus the psf/kron flux and flux-error
# columns of every band. The comprehensions expand, in order, to
# u_psfFlux, u_kronFlux, g_psfFlux, ..., y_kronFlux (and the matching *Err names).
verify_catalog(
    catalog_name="object",
    ra_cols=["coord_ra"],
    dec_cols=["coord_dec"],
    flux_cols=[
        f"{band}_{kind}Flux" for band in "ugrizy" for kind in ("psf", "kron")
    ],
    flux_err_cols=[
        f"{band}_{kind}FluxErr" for band in "ugrizy" for kind in ("psf", "kron")
    ],
)

object
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_07/object ... 
Found 54 partitions.
Approximate coverage is 11.94 % of the sky.
  is valid catalog True
  num partitions: 54
  num rows: 5439433
  md 5439433
  num columns: 74
  All RIGHT ASCENSION columns within acceptable ranges (['coord_ra'])
  All DECLINATION columns within acceptable ranges (['coord_dec'])
**** y_kronFlux has values outside acceptable range for FLUX (min:-1944109.982704437, max:125572292.77500014)
**** u_kronFluxErr has values outside acceptable range for FLUX ERROR (min:53.52109122681252, max:inf)
**** g_kronFluxErr has values outside acceptable range for FLUX ERROR (min:12.33349069970275, max:inf)
**** r_kronFluxErr has values outside acceptable range for FLUX ERROR (min:16.246293495025153, max:inf)
**** i_kronFluxErr has values outside acceptable range for FLUX ERROR (min:26.825956194337333, max:inf)
**** z_kronFluxErr has values outside acceptable range for FLUX ERROR (min:

In [24]:
# Optional: to display every row of the printed null-count table, uncomment
# pd.set_option('display.max_rows', None)

# source: range-check position, flux, flux-error and MJD columns. The generator
# expressions expand, in order, to ap03Flux ... ap70Flux and
# apFlux_12_0_instFlux ... apFlux_50_0_instFlux (and the matching *Err names).
verify_catalog(
    catalog_name="source",
    ra_cols=["coord_ra", "ra"],
    dec_cols=["coord_dec", "dec"],
    flux_cols=[
        "calibFlux",
        *(f"ap{tag}Flux" for tag in ("03", "06", "09", "12", "17", "25", "35", "50", "70")),
        "psfFlux",
        "gaussianFlux",
        *(f"apFlux_{tag}_0_instFlux" for tag in (12, 17, 35, 50)),
        "normCompTophatFlux_instFlux",
        "localBackground_instFlux",
    ],
    flux_err_cols=[
        "calibFluxErr",
        *(f"ap{tag}FluxErr" for tag in ("03", "06", "09", "12", "17", "25", "35", "50", "70")),
        "psfFluxErr",
        "gaussianFluxErr",
        *(f"apFlux_{tag}_0_instFluxErr" for tag in (12, 17, 35, 50)),
        "normCompTophatFlux_instFluxErr",
        "localBackground_instFluxErr",
    ],
    mjd_cols=["midpointMjdTai"],
)

source
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_07/source ... 
Found 117 partitions.
Approximate coverage is 9.19 % of the sky.
  is valid catalog True
  num partitions: 117
  num rows: 43826352
  md 43826352
  num columns: 169
  All RIGHT ASCENSION columns within acceptable ranges (['coord_ra', 'ra'])
  All DECLINATION columns within acceptable ranges (['coord_dec', 'dec'])
  All FLUX columns within acceptable ranges (['calibFlux', 'ap03Flux', 'ap06Flux', 'ap09Flux', 'ap12Flux', 'ap17Flux', 'ap25Flux', 'ap35Flux', 'ap50Flux', 'ap70Flux', 'psfFlux', 'gaussianFlux', 'apFlux_12_0_instFlux', 'apFlux_17_0_instFlux', 'apFlux_35_0_instFlux', 'apFlux_50_0_instFlux', 'normCompTophatFlux_instFlux', 'localBackground_instFlux'])
  All FLUX ERROR columns within acceptable ranges (['calibFluxErr', 'ap03FluxErr', 'ap06FluxErr', 'ap09FluxErr', 'ap12FluxErr', 'ap17FluxErr', 'ap25FluxErr', 'ap35FluxErr', 'ap50FluxErr', 'ap70FluxErr', 'psfFluxErr', 'gaussianFluxEr

In [25]:
# forcedSource: range-check position, flux, flux-error and MJD columns.
verify_catalog(
    catalog_name="forcedSource",
    ra_cols=["coord_ra"],
    dec_cols=["coord_dec"],
    flux_cols=[
        "psfFlux",
        "psfDiffFlux",
        "localBackground_instFlux",
    ],
    flux_err_cols=[
        "psfFluxErr",
        "psfDiffFluxErr",
        "localBackground_instFluxErr",
    ],
    mjd_cols=["midpointMjdTai"],
)

forcedSource
Validating catalog at path /sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_07/forcedSource ... 
Found 196 partitions.
Approximate coverage is 11.22 % of the sky.
  is valid catalog True
  num partitions: 196
  num rows: 579119668
  md 579119668
  num columns: 43
  All RIGHT ASCENSION columns within acceptable ranges (['coord_ra'])
  All DECLINATION columns within acceptable ranges (['coord_dec'])
  All FLUX columns within acceptable ranges (['psfFlux', 'psfDiffFlux', 'localBackground_instFlux'])
  All FLUX ERROR columns within acceptable ranges (['psfFluxErr', 'psfDiffFluxErr', 'localBackground_instFluxErr'])
  All MJD columns within acceptable ranges (['midpointMjdTai'])
  columns with nulls: 6
                            null_count   percent
column_names                                    
localBackground_instFluxErr    5544788  0.957451
localBackground_instFlux       5544586  0.957416
psfDiffFlux                     920847  0.159008
psfDiffFluxErr                 