# Catalog inspection - By Field

Perform more detailed verification of the datasets: use LSDB to inspect the leaf parquet files, selecting sources around a set of spatial fields.

In [None]:
!pip install lsdb --quiet

In [None]:
%env DRP_VERSION=w_2025_07

In [None]:
import os
import hats
import lsdb
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from tqdm import tqdm
import itertools

from pathlib import Path

In [None]:
# Version tag read from the DRP_VERSION environment variable (set by the
# `%env` cell earlier in this notebook). A missing variable raises KeyError.
DRP_VERSION = os.environ["DRP_VERSION"]
print(f"DRP_VERSION: {DRP_VERSION}")

# Shared root directory; the raw and HATS outputs each live in a
# sub-directory named after the version tag.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning")
raw_dir = base_output_dir / "raw" / DRP_VERSION
hats_dir = base_output_dir / "hats" / DRP_VERSION

In [None]:
# "field_sizes" sub-directory under the raw directory for this version
# (presumably for per-field size outputs; confirm against its consumers).
# Missing parent directories are created; an existing directory is not an error.
fields_dir = raw_dir / "field_sizes"
fields_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Catalog names handled by this notebook.
catalogs = ["diaObject", "diaSource", "diaForcedSource", "object", "source", "forcedSource"]

# Field name -> (ra, dec) centre of the field. The pair is passed unchanged
# as the `ra` / `dec` arguments of the cone search further down.
fields = {
    # Extended Chandra Deep Field South
    "ECDFS": (53.13, -28.10),
    # Euclid Deep Field South
    "EDFS": (59.10, -48.73),
    # Low Ecliptic Latitude Field
    "Rubin_SV_38_7": (37.86, 6.98),
    # Low Galactic Latitude Field
    "Rubin_SV_95_-25": (95.00, -25.00),
    # 47 Tuc Globular Cluster
    "47_Tuc": (6.02, -72.08),
    # Fornax Dwarf Spheroidal Galaxy
    "Fornax_dSph": (40.00, -34.45),
}

# Cone-search radius: 2 degrees expressed in arcseconds.
selection_radius_arcsec = 3600 * 2.0

# Photometric bands, in the order used for the per-band statistics.
bands = list("ugrizy")

In [None]:
# Open the "source" catalog stored under the HATS directory for this version.
cat = lsdb.read_hats(hats_dir / "source")
# Bare expression as the last line of the cell: the notebook displays it.
cat

In [None]:
## Which columns feed the statistics? Every numeric one, minus bookkeeping.
print("starting column count", len(cat._ddf.meta.columns))

# Keep numeric columns only, then drop in a single pass:
#   * the columns added by HATS,
#   * identifier columns (names ending in "Id"),
#   * any column whose name contains "Mag".
column_names = [
    name
    for name in cat._ddf.meta.select_dtypes(include=np.number).columns
    if name not in ("_healpix_29", "Norder", "Dir", "Npix")
    and not name.endswith("Id")
    and "Mag" not in name
]
print("effective column count", len(column_names))

In [None]:
# Function to compute per-band statistics for one partition
def get_stats(df, stat_columns, out_columns, band_names=None, count_column="x"):
    """Summarise one partition as a single row of per-band statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Partition data. Must contain a ``"band"`` column, every column in
        ``stat_columns`` and ``count_column``.
    stat_columns : iterable of str
        Columns for which a NaN-ignoring mean is computed per band.
    out_columns : iterable of str
        Every key expected in the output row; each starts at 0 so that
        bands absent from this partition still produce a value.
    band_names : iterable of str, optional
        Bands to summarise. Defaults to the notebook-level ``bands`` list,
        which keeps existing calls working unchanged.
    count_column : str, optional
        Column whose non-NaN entries are counted to give ``len_{band}``.

    Returns
    -------
    pandas.DataFrame
        One row with ``mean_{col}_{band}`` and ``len_{band}`` entries.
    """
    if band_names is None:
        # Fall back to the notebook global only when no bands are supplied.
        band_names = bands

    stats = {col: 0 for col in out_columns}
    for band in band_names:
        mask = df["band"] == band
        if not mask.any():
            # No rows in this band: leave the zero defaults in place.
            continue
        for col in stat_columns:
            stats[f"mean_{col}_{band}"] = np.nanmean(df.loc[mask, col])
        # Count only the rows with a usable (non-NaN) value in count_column.
        band_values = df.loc[mask, count_column]
        stats[f"len_{band}"] = len(band_values) - np.count_nonzero(np.isnan(band_values))
    return pd.DataFrame([stats])  # single-row frame, one per partition

# Per-field results, filled in by the field loop: field name -> weighted means
all_results = {}

# Output schema handed to map_partitions: one float column per
# (statistic column, band) pair, followed by one integer count per band.
meta = {f"mean_{column}_{band}": "f8" for column in column_names for band in bands}
meta.update({f"len_{band}": "i8" for band in bands})

In [None]:

# Loop through each field: cone search, per-partition statistics, then a
# count-weighted combination of the partition means.
for field_name, (ra, dec) in tqdm(fields.items()):
    # Restrict the catalog to the cone around this field's centre
    field_cat = cat.cone_search(ra=ra, dec=dec, radius_arcsec=selection_radius_arcsec)

    # One row of statistics per partition. The output column names are
    # passed as a plain list rather than a dict view.
    result = field_cat.map_partitions(
        get_stats,
        meta=meta,
        stat_columns=column_names,
        out_columns=list(meta),
    ).compute()

    # Per-band partition counts, extracted once instead of once per column
    band_counts = {band: result[f"len_{band}"].to_numpy(dtype=float) for band in bands}

    # Combine partition means into one count-weighted mean per (column, band).
    # The weights are the per-partition len_{band} values, so they are shared
    # by every column of a band.
    weighted_means = {}
    for column, band in itertools.product(column_names, bands):
        counts = band_counts[band]
        if not np.nansum(counts):
            # No sources in this band for this field: omit the entry.
            continue
        mean_col_name = f"mean_{column}_{band}"
        partition_means = result[mean_col_name].to_numpy(dtype=float)
        # A partition whose mean is NaN had no valid values for this column.
        # It must be left out of BOTH the numerator and the denominator,
        # otherwise its count drags the combined mean toward zero.
        usable = ~np.isnan(partition_means)
        total_weight = np.nansum(counts[usable])
        if total_weight:
            weighted_means[mean_col_name] = (
                np.nansum(partition_means[usable] * counts[usable]) / total_weight
            )
        else:
            # No partition had valid data for this column: report NaN, not 0.
            weighted_means[mean_col_name] = np.nan

    # Store the weighted means for this field
    all_results[field_name] = weighted_means

# One row per field, one column per statistic, for easier inspection
weighted_means_df = pd.DataFrame.from_dict(all_results, orient="index")


In [None]:
# Lift the row limit so that every statistic is shown
pd.options.display.max_rows = None
# Transposed view: statistics as rows, fields as columns
weighted_means_df.transpose()