# Catalog Verification - Basic Statistics

Perform some basic verification on the datasets.

- confirm the number of nulls (NaNs) in the dataset is within expectations
- for fields with predictable limits, confirm min/max values in dataset

In [None]:
import os
import hats
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

from pathlib import Path

In [None]:
# Set DRP_VERSION in the notebook process environment; it is read back through os.environ.
# Keep this comment on its own line: text after the value would become part of the value.
%env DRP_VERSION=w_2025_07

In [None]:
# Release tag taken from the environment; raises KeyError if DRP_VERSION is unset.
DRP_VERSION = os.environ["DRP_VERSION"]
print(f"DRP_VERSION: {DRP_VERSION}")

# Shared root, with one "raw" and one "hats" subdirectory per DRP version.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning")
raw_dir = base_output_dir.joinpath("raw", DRP_VERSION)
hats_dir = base_output_dir.joinpath("hats", DRP_VERSION)

In [None]:
# Make sure the "field_sizes" directory exists under the raw data area:
# missing parents are created and an already-existing directory is not an error.
fields_dir = raw_dir.joinpath("field_sizes")
fields_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Names of the catalog directories expected under hats_dir.
fresh_catalogs = [
    "diaObject",
    "diaSource",
    "diaForcedSource",
    "object",
    "source",
    "forcedSource",
]

# For each selected catalog, print its size (partitions, rows, columns) and a
# table of the columns that contain nulls, sorted by the percentage of null rows.
#
# NOTE(review): the [4:5] slice limits this run to "source" only; presumably the
# catalogs are inspected one at a time -- widen the slice to check the others.
for catalog_name in fresh_catalogs[4:5]:
    cat = hats.read_hats(hats_dir / catalog_name)
    total_rows = cat.catalog_info.total_rows

    print(catalog_name)
    print("  num partitions:", len(cat.get_healpix_pixels()))
    print("  num rows:", total_rows)

    # One row of statistics per column (the index holds the column names).
    stats = cat.aggregate_column_statistics()
    print("  num columns:", len(stats))

    # Remove columns with "Mag" as these are created for HATS
    stats = stats.iloc[~stats.index.str.contains("Mag")]

    # Keep only the columns that have at least one null value.
    np_null_count = stats["null_count"].to_numpy(copy=False, dtype=np.int64)
    with_null_index = np.where(np_null_count > 0)
    with_nulls = stats.iloc[with_null_index]

    if len(with_nulls):
        print(f"  columns with nulls: {len(with_nulls)}")
        null_counts = with_nulls["null_count"].to_numpy(copy=False, dtype=np.int64)
        # assign() builds a new frame, so the "percent" column is never written
        # into a slice of `stats` (which triggers SettingWithCopyWarning), and
        # the percentage is computed in one vectorised expression.
        with_nulls = (
            with_nulls[["null_count"]]
            .assign(percent=null_counts / total_rows * 100)
            .sort_values(by="percent", ascending=False)
        )
        print(with_nulls)
    else:
        print("  columns with nulls: 0")