# Catalog inspection

This notebook inspects the "fresh" datasets (those that have come directly from butler). The post-processed DASH datasets will be inspected as well, once I've generated them.

In [1]:
import hats
from pathlib import Path

In [7]:
# Root directory of the w_2025_04 outputs; every other path below is derived from it.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/hats/w_2025_04")

# Sub-directories used by the rest of the notebook.
hats_dir = base_output_dir.joinpath("hats")
raw_dir = base_output_dir.joinpath("raw")
fields_dir = raw_dir.joinpath("field_sizes")

# Make sure the field-size output directory exists (a no-op when it already does).
fields_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Names of the "fresh" catalogs; each one is a sub-directory of `hats_dir`.
fresh_catalogs = [
    "diaForcedSource",
    "diaObject",
    "diaSource",
    "forcedSource",
    "object",
    "source",
]

# For every catalog, report how many HEALPix partitions and how many rows it has.
for catalog in fresh_catalogs:
    cat = hats.read_hats(hats_dir / catalog)
    print(catalog)

    num_partitions = len(cat.get_healpix_pixels())
    print("num partitions", num_partitions)

    total_rows = cat.catalog_info.total_rows
    print("num rows", total_rows)

In [8]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

# Per-column compressed size of the "source" catalog, summed over all row groups
# recorded in the dataset-level `_metadata` file.
parquet_file = pq.ParquetFile(hats_dir / "source" / "dataset" / "_metadata")
metadata = parquet_file.metadata

num_cols = metadata.num_columns
num_row_groups = metadata.num_row_groups

# Byte counts are integers, so accumulate them exactly instead of as floats.
sizes = np.zeros(num_cols, dtype=np.int64)

for rg in range(num_row_groups):
    # Fetch the row-group metadata once per row group, not once per column.
    row_group = metadata.row_group(rg)
    for col in range(num_cols):
        sizes[col] += row_group.column(col).total_compressed_size

# Share of the total compressed size taken by each column, as a one-decimal string.
# The total is computed once; if it is zero, report "0.0" rather than "nan".
total_size = sizes.sum()
if total_size > 0:
    percents = [f"{s / total_size * 100:.1f}" for s in sizes]
else:
    percents = ["0.0"] * num_cols

field_sizes = pd.DataFrame(
    {"name": parquet_file.schema.names, "size": sizes.astype(int), "percent": percents}
).sort_values("size", ascending=False)

# Write next to the other field-size reports; `fields_dir` is defined (and created)
# in the configuration cell, so the path is no longer duplicated as a string here.
field_sizes.to_csv(fields_dir / "source.csv", index=False)