# Catalog inspection

I want to look at the "fresh" datasets (that have come directly from butler), and will also look at the post-processed DASH datasets (once I've generated them).

In [1]:
from pathlib import Path

# HATS/LSDB
import lsdb
import hats

from tqdm import tqdm
import pandas as pd

In [2]:
# Root directory for this notebook; the HATS catalogs are read from its "hats" subdirectory.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/dm_48556")
hats_dir = base_output_dir.joinpath("hats")

In [15]:
# The "fresh" catalogs, i.e. the ones that came directly from butler.
fresh_catalogs = [
    "diaForcedSource",
    "diaObject",
    "diaSource",
    "forcedSource",
    "object",
    "source",
]

# For each catalog, print its name, its number of HEALPix partitions and its total row count.
for catalog in fresh_catalogs:
    cat = hats.read_hats(hats_dir / catalog)
    print(catalog)
    healpix_pixels = cat.get_healpix_pixels()
    print("num partitions", len(healpix_pixels))
    catalog_info = cat.catalog_info
    print("num rows", catalog_info.total_rows)

diaForcedSource
num partitions 510
num rows 1065086776
diaObject
num partitions 6
num rows 4064285
diaSource
num partitions 12
num rows 6922037
forcedSource
num partitions 207
num rows 592040780
object
num partitions 54
num rows 5448838
source
num partitions 148
num rows 54067391


In [22]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

# Sum the compressed size of every column of the "source" catalog over all
# row groups recorded in its `_metadata` file, then write a per-column size
# report (largest column first) to CSV.
parquet_file = pq.ParquetFile(hats_dir / "source" / "dataset" / "_metadata")

file_metadata = parquet_file.metadata
num_cols = file_metadata.num_columns
num_row_groups = file_metadata.num_row_groups

# Byte counts are integers, so accumulate them as int64 instead of float.
sizes = np.zeros(num_cols, dtype=np.int64)

for rg in range(num_row_groups):
    # Look up the row-group metadata once per row group, not once per column.
    row_group = file_metadata.row_group(rg)
    for col in range(num_cols):
        sizes[col] += row_group.column(col).total_compressed_size

# Each column's share of the total, pre-formatted to one decimal place.
# The total is computed once rather than once per column.
total_size = sizes.sum()
percents = [f"{s / total_size * 100:.1f}" for s in sizes]

field_sizes = pd.DataFrame(
    {"name": parquet_file.schema.names, "size": sizes, "percent": percents}
).sort_values("size", ascending=False)

# Build the destination from base_output_dir instead of repeating the absolute
# path, and make sure the target directory exists before writing.
field_sizes_csv = base_output_dir / "raw" / "field_sizes" / "source.csv"
field_sizes_csv.parent.mkdir(parents=True, exist_ok=True)
field_sizes.to_csv(field_sizes_csv, index=False)