# Catalog Verification - Basic Statistics

Perform some basic verification on the datasets.

- confirm the number of nulls (NaNs) in the dataset is within expectations
- for fields with predictable limits, confirm min/max values in dataset

In [None]:
import os
import hats
import numpy as np
import pandas as pd

from hats.io.validation import is_valid_catalog
from pathlib import Path

In [None]:
# Both settings are mandatory: a missing environment variable raises KeyError here,
# before any catalog is touched.
VERSION = os.environ["VERSION"]
OUTPUT_DIR = Path(os.environ["OUTPUT_DIR"])

print(f"VERSION: {VERSION}", f"OUTPUT_DIR: {OUTPUT_DIR}", sep="\n")

# Every catalog checked below is read from <OUTPUT_DIR>/hats/<VERSION>/<catalog name>.
hats_dir = OUTPUT_DIR.joinpath("hats", VERSION)

## Convenience methods

We define a few convenience methods that load catalog metadata, check that columns of known value types (right ascension, declination, flux, flux error, MJD) stay within their expected ranges, and print the number of nulls found in each column.

This ensures we're performing the same kinds of checks against each table type.

In [None]:
def min_max_floats_in_range(stats, columns, exp_min_value, exp_max_value, value_type):
    """Print whether the recorded min/max of each column lies within an expected range.

    Uses the `min_value` and `max_value` columns of the stats dataframe, looking each
    entry of `columns` up as a row label.

    Args:
        stats: dataframe indexed by column name, with `min_value` and `max_value`.
        columns: row labels of `stats` to check. Nothing is printed when this is
            None or empty.
        exp_min_value: smallest acceptable value (inclusive).
        exp_max_value: largest acceptable value (inclusive).
        value_type: label used in the printed messages.

    Returns:
        None. One `****` line is printed per problematic column, or a single summary
        line when every column passes.
    """
    if not columns:
        return

    anything_bad = False
    for col in columns:
        raw_min = stats["min_value"][col]
        raw_max = stats["max_value"][col]

        # A missing statistic (None/NaN) can never fail a `<` or `>` comparison, so it
        # would otherwise be reported as "within range". Flag it explicitly instead.
        if pd.isna(raw_min) or pd.isna(raw_max):
            print(
                f"**** {col} has no usable min/max statistics for {value_type} (min:{raw_min}, max:{raw_max})"
            )
            anything_bad = True
            continue

        if float(raw_min) < exp_min_value or float(raw_max) > exp_max_value:
            print(
                f"**** {col} has values outside acceptable range for {value_type} (min:{raw_min}, max:{raw_max})"
            )
            anything_bad = True

    if not anything_bad:
        print(f"  All {value_type} columns within acceptable ranges ({columns})")


def verify_catalog(
    catalog_name,
    ra_cols=None,
    dec_cols=None,
    flux_cols=None,
    flux_err_cols=None,
    mjd_cols=None,
):
    """Print a basic verification report for the catalog at `hats_dir / catalog_name`.

    The report contains, in order: the catalog name, the result of
    `is_valid_catalog(..., strict=True)`, the partition / row / column counts,
    min/max range checks for each supplied column group, and a table of the columns
    that contain nulls (count and percent of total rows, largest percent first).

    Args:
        catalog_name: name of the catalog directory under `hats_dir`.
        ra_cols: columns expected to lie within [0, 360].
        dec_cols: columns expected to lie within [-90, 90].
        flux_cols: columns expected to lie within [-1e8, 1e8].
        flux_err_cols: columns expected to lie within [0, 1e8].
        mjd_cols: columns expected to lie within [60600, 60700].

    A column group left as None (or empty) is not checked.

    Returns:
        None. All results are printed.
    """
    catalog_path = hats_dir / catalog_name

    print(catalog_name)
    cat = hats.read_hats(catalog_path)
    print(
        "  is valid catalog",
        is_valid_catalog(catalog_path, strict=True, verbose=False),
    )
    total_rows = cat.catalog_info.total_rows
    print("  num partitions:", len(cat.get_healpix_pixels()))
    print("  num rows:", total_rows)
    stats = cat.aggregate_column_statistics()
    # The column count is reported before the "Mag" columns are dropped below.
    print("  num columns:", len(stats))
    # Remove columns with "Mag" as these are created for HATS
    stats = stats.loc[~stats.index.str.contains("Mag")]

    min_max_floats_in_range(stats, ra_cols, 0.0, 360.0, "RIGHT ASCENSION")
    min_max_floats_in_range(stats, dec_cols, -90.0, 90.0, "DECLINATION")
    min_max_floats_in_range(stats, flux_cols, -100_000_000.0, 100_000_000.0, "FLUX")
    min_max_floats_in_range(stats, flux_err_cols, 0.0, 100_000_000.0, "FLUX ERROR")
    min_max_floats_in_range(stats, mjd_cols, 60600.0, 60700.0, "MJD")

    null_counts = stats["null_count"].to_numpy(copy=False, dtype=np.int64)
    has_nulls = null_counts > 0

    if not has_nulls.any():
        print("  columns with nulls: 0")
        return

    # Build the report frame in one step: `.loc` + `.assign` yields a fresh frame, so
    # adding the percent column does not write into a slice of `stats`.
    with_nulls = stats.loc[has_nulls, ["null_count"]].assign(
        percent=null_counts[has_nulls] / total_rows * 100
    )
    print(f"  columns with nulls: {len(with_nulls)}")
    with_nulls = with_nulls.sort_values(by="percent", ascending=False)
    print(with_nulls)

In [None]:
# dia_object: range-check the position and MJD columns and report null counts.
verify_catalog(
    "dia_object",
    ra_cols=["ra"],
    dec_cols=["dec"],
    mjd_cols=["radecMjdTai"],
)

In [None]:
# dia_source: the error columns are the "Err" counterparts of five of the flux columns.
verify_catalog(
    "dia_source",
    ra_cols=["ra", "coord_ra", "trailRa"],
    dec_cols=["dec", "coord_dec", "trailDec"],
    flux_cols=[
        "apFlux",
        "psfFlux",
        "trailFlux",
        "dipoleMeanFlux",
        "dipoleFluxDiff",
        "scienceFlux",
        "ixxPSF",
        "iyyPSF",
        "ixyPSF",
    ],
    flux_err_cols=[
        f"{flux}Err"
        for flux in ("apFlux", "psfFlux", "dipoleMeanFlux", "dipoleFluxDiff", "scienceFlux")
    ],
    mjd_cols=["midpointMjdTai"],
)

In [None]:
# dia_object_forced_source: each flux column is paired with its "Err" column.
verify_catalog(
    "dia_object_forced_source",
    ra_cols=["coord_ra"],
    dec_cols=["coord_dec"],
    flux_cols=[f"{flux}" for flux in ("psfFlux", "psfDiffFlux")],
    flux_err_cols=[f"{flux}Err" for flux in ("psfFlux", "psfDiffFlux")],
    mjd_cols=["midpointMjdTai"],
)

In [None]:
# object: one psf and one kron flux (and matching error) column per band prefix,
# generated in the order u, g, r, i, z, y with psf before kron.
verify_catalog(
    "object",
    ra_cols=["coord_ra"],
    dec_cols=["coord_dec"],
    flux_cols=[
        f"{band}_{kind}Flux" for band in "ugrizy" for kind in ("psf", "kron")
    ],
    flux_err_cols=[
        f"{band}_{kind}FluxErr" for band in "ugrizy" for kind in ("psf", "kron")
    ],
)

In [None]:
# Uncomment to print the null-count table without row truncation:
# pd.set_option('display.max_rows', None)

# source2: the error list mirrors the flux list, with "Err" appended to every name.
verify_catalog(
    "source2",
    ra_cols=["coord_ra", "ra"],
    dec_cols=["coord_dec", "dec"],
    flux_cols=[
        "calibFlux",
        *(f"ap{nn}Flux" for nn in ("03", "06", "09", "12", "17", "25", "35", "50", "70")),
        "psfFlux",
        "gaussianFlux",
        *(f"apFlux_{nn}_0_instFlux" for nn in ("12", "17", "35", "50")),
        "normCompTophatFlux_instFlux",
        "localBackground_instFlux",
    ],
    flux_err_cols=[
        "calibFluxErr",
        *(f"ap{nn}FluxErr" for nn in ("03", "06", "09", "12", "17", "25", "35", "50", "70")),
        "psfFluxErr",
        "gaussianFluxErr",
        *(f"apFlux_{nn}_0_instFluxErr" for nn in ("12", "17", "35", "50")),
        "normCompTophatFlux_instFluxErr",
        "localBackground_instFluxErr",
    ],
    mjd_cols=["midpointMjdTai"],
)

In [None]:
# object_forced_source: each flux column is paired with its "Err" column.
verify_catalog(
    "object_forced_source",
    ra_cols=["coord_ra"],
    dec_cols=["coord_dec"],
    flux_cols=[f"{flux}" for flux in ("psfFlux", "psfDiffFlux")],
    flux_err_cols=[f"{flux}Err" for flux in ("psfFlux", "psfDiffFlux")],
    mjd_cols=["midpointMjdTai"],
)