# Catalog inspection

I want to look at the "fresh" datasets (those that have come directly from butler), and I will also look at the post-processed DASH datasets (once I've generated them).

In [52]:
import os
import hats
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

from pathlib import Path

In [None]:
# The release to inspect comes from the environment; a missing variable
# raises KeyError here rather than producing a bogus path later on.
DRP_VERSION = os.environ["DRP_VERSION"]
print(f"DRP_VERSION: {DRP_VERSION}")

# Per-release "raw" and "hats" directories below the shared root.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning")
raw_dir = base_output_dir.joinpath("raw", DRP_VERSION)
hats_dir = base_output_dir.joinpath("hats", DRP_VERSION)

In [54]:
# Directory that receives the per-catalog field-size CSV files; created
# (including any missing parents) if it does not exist yet.
fields_dir = raw_dir.joinpath("field_sizes")
fields_dir.mkdir(exist_ok=True, parents=True)

In [55]:
def collect_statistics(catalog_name):
    """Write a per-column compressed-size report for one catalog.

    Opens ``hats_dir / catalog_name / "dataset" / "_metadata"`` as a parquet
    file, sums ``total_compressed_size`` of every column across all row
    groups, and writes a CSV with the columns ``name``, ``size`` and
    ``percent`` (sorted by descending size) to
    ``raw_dir / "field_sizes" / f"{catalog_name}.csv"``.

    Args:
        catalog_name: Name of the catalog directory under ``hats_dir``; also
            used as the stem of the output CSV file.
    """
    parquet_file = pq.ParquetFile(hats_dir / catalog_name / "dataset" / "_metadata")
    metadata = parquet_file.metadata

    num_cols = metadata.num_columns
    sizes = np.zeros(num_cols)

    for rg in range(metadata.num_row_groups):
        # Look the row group up once, not once per column.
        row_group = metadata.row_group(rg)
        for col in range(num_cols):
            sizes[col] += row_group.column(col).total_compressed_size

    # Percentages are stored as strings with one decimal place, purely for
    # readability of the CSV. The grand total is computed a single time
    # instead of being re-summed for every column.
    total_size = sizes.sum()
    percents = [f"{s/total_size*100:.1f}" for s in sizes]

    statistics = {"name": parquet_file.schema.names, "size": sizes.astype(int), "percent": percents}
    outfile = raw_dir / "field_sizes" / f"{catalog_name}.csv"
    # Make sure the target directory exists so the function does not depend
    # on another notebook cell having been run first.
    outfile.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(statistics).sort_values("size", ascending=False).to_csv(outfile, index=False)

In [None]:
# Catalogs to inspect, named after their directories under hats_dir.
fresh_catalogs = [
    "diaObject", "diaSource", "diaForcedSource",
    "object", "source", "forcedSource",
]

# For each catalog: print its partition and row counts, then write the
# per-column size report.
for catalog_name in fresh_catalogs:
    cat = hats.read_hats(hats_dir / catalog_name)
    print(catalog_name)

    healpix_pixels = cat.get_healpix_pixels()
    print("num partitions", len(healpix_pixels))

    total_rows = cat.catalog_info.total_rows
    print("num rows", total_rows)

    collect_statistics(catalog_name)