## Import PPDB base catalog

This notebook imports the base PPDB catalog (2025/26 fall and winter data).

In [1]:
import lsdb
import numpy as np

Set up the Dask Client and the input/output dirs:

In [2]:
import tempfile
from dask.distributed import Client
from pathlib import Path

# Root directory of the input PPDB parquet files (searched recursively by `get_paths`)
PPDB_DIR = Path("/sdf/scratch/rubin/ppdb/data/ppdb_lsstcam")

# Temporary directory for intermediate products: schema files, margin caches
# and the Dask workers' local scratch space
tmp_dir = tempfile.TemporaryDirectory()
print(f"Intermediate directory: {tmp_dir.name}")

# Final target directory where the imported catalogs and the collection are written
output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/ppdb")

# Dask distributed client shared by every pipeline in this notebook
# (16 single-threaded workers, 8GB memory limit each)
client = Client(n_workers=16, threads_per_worker=1, local_directory=tmp_dir.name, memory_limit="8GB")

Intermediate directory: /lscratch/stavar/tmp/tmpvpnt0q5h


### Get input files for each dataset type

In [3]:
def get_paths(dataset_type):
    """Return the sorted parquet files found under `PPDB_DIR` for a dataset type.

    The snake_case `dataset_type` (e.g. "dia_object") is converted to the
    CamelCase file stem (e.g. "DiaObject") and `PPDB_DIR` is searched
    recursively for files named `<stem>.parquet`. A one-line summary with the
    number of files and the range covered by their date folders is printed.

    Parameters
    ----------
    dataset_type : str
        Snake-case dataset name, e.g. "dia_forced_source".

    Returns
    -------
    list of pathlib.Path
        The matching files in sorted order; empty if nothing matches.
    """
    dataset_name = "".join(word.capitalize() for word in dataset_type.split("_"))
    files = sorted(PPDB_DIR.rglob(f"{dataset_name}.parquet"))

    if not files:
        # No first/last file to summarize: indexing an empty list would raise IndexError
        print(f"Found 0 files for {dataset_type}")
        return files

    def get_date(path):
        # First three path components below PPDB_DIR, joined as "a/b/c"
        return "/".join(path.relative_to(PPDB_DIR).parts[:3])

    print(f"Found {len(files)} files for {dataset_type} ({get_date(files[0])} - {get_date(files[-1])})")
    return files

# File lists for the three PPDB dataset types; they are reused below for the
# import, the input-path bookkeeping and the validation checks
object_files = get_paths("dia_object")
source_files = get_paths("dia_source")
forced_source_files = get_paths("dia_forced_source")

Found 249 files for dia_object (2025/09/06 - 2026/01/20)
Found 249 files for dia_source (2025/09/06 - 2026/01/20)
Found 27 files for dia_forced_source (2026/01/08 - 2026/01/16)


### Import base catalogs

In [4]:
import pyarrow as pa
import pyarrow.parquet as pq
from hats_import import pipeline_with_client
from hats_import.catalog.arguments import ImportArguments
from hats_import.catalog.file_readers import ParquetPyarrowReader
from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments
from lsst.resources import ResourcePath


def import_dataset(dataset_type, input_file_list, catalog_type):
    """Run the HATS import pipeline for a single PPDB dataset.

    The schema is taken from the first entry of `input_file_list`, and the
    catalog is written under `output_dir` with `dataset_type` as its artifact
    name, using a byte threshold of 1 GiB per pixel.

    Parameters
    ----------
    dataset_type : str
        Dataset name, also used as the name of the output catalog.
    input_file_list : list
        Parquet files to import; must contain at least one file.
    catalog_type : str
        Catalog type passed on to `ImportArguments` ("object" or "source"
        in this notebook).
    """
    one_gib = 1 << 30
    # Empty-table parquet file carrying the schema of the first input file
    schema_file = _download_schema(dataset_type, input_file_list[0])
    settings = {
        "output_path": output_dir,
        "output_artifact_name": dataset_type,
        "input_file_list": input_file_list,
        "file_reader": ParquetPyarrowReader(),
        "ra_column": "ra",
        "dec_column": "dec",
        "catalog_type": catalog_type,
        "byte_pixel_threshold": one_gib,
        "use_schema_file": schema_file,
        "simple_progress_bar": True,
        "resume": False,
    }
    import_args = ImportArguments(**settings)
    pipeline_with_client(import_args, client)

def _download_schema(dataset_type, single_parquet_path):
    """Write an empty parquet file holding the schema of `single_parquet_path`.

    The schema is read from the given file and passed through
    `remove_metadata()`. A zero-row table with the same column names and
    types is then saved as `<tmp_dir>/<dataset_type>_schema.parquet`.

    Returns the path (str) of the schema file that was written.
    """
    target = f"{tmp_dir.name}/{dataset_type}_schema.parquet"
    with ResourcePath(single_parquet_path).open("rb") as handle:
        stripped_schema = pq.read_schema(handle).remove_metadata()
    # One empty array per column, keeping the original column type
    empty_columns = {}
    for field in stripped_schema:
        empty_columns[field.name] = pa.array([], type=field.type)
    pq.write_table(pa.table(empty_columns), target)
    return target

In [5]:
# DiaObject is imported as an "object" catalog; both source tables are
# imported as "source" catalogs
import_dataset("dia_object", object_files, catalog_type="object")
import_dataset("dia_source", source_files, catalog_type="source")
import_dataset("dia_forced_source", forced_source_files, catalog_type="source")

Catalog: Planning  : 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 246.00it/s]
Catalog: Mapping   : 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 249/249 [00:17<00:00, 14.17it/s]
Catalog: Binning   : 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:52<00:00, 26.19s/it]
Catalog: Splitting : 100%|█████████████████████████████████████████████████████

### Post-processing

About 8% of objects have duplicates (same `diaObjectId`):

In [6]:
# Open the imported object catalog from disk
dia_object = lsdb.open_catalog(output_dir / "dia_object")
# Occurrence count of every distinct diaObjectId
_, counts = np.unique(dia_object["diaObjectId"], return_counts=True)
# Number of distinct ids that occur more than once
n_dup_ids = np.sum(counts > 1)
# Duplicated ids as a percentage of the catalog length (all rows, duplicates included)
n_dup_ids/len(dia_object)*100

np.float64(7.705989774842453)

We will keep the ones with the latest `validityStartMjdTai`, and add mag/magerr columns:

In [7]:
from postprocess import postprocess_catalog

# One mean science-flux column prefix per band (u, g, r, i, z, y)
flux_col_prefixes = [f"{band}_scienceFluxMean" for band in list("ugrizy")]
# Only the object catalog is given an extra column name ("validityStartMjdTai");
# NOTE(review): presumably the column used to pick which duplicate to keep - confirm in postprocess.py
postprocess_catalog(client, output_dir, "dia_object", flux_col_prefixes, "validityStartMjdTai")
postprocess_catalog(client, output_dir, "dia_source", ["scienceFlux"])
postprocess_catalog(client, output_dir, "dia_forced_source", ["scienceFlux"])

  return getattr(ufunc, method)(*new_inputs, **kwargs)
  return getattr(ufunc, method)(*new_inputs, **kwargs)
  return getattr(ufunc, method)(*new_inputs, **kwargs)
  return getattr(ufunc, method)(*new_inputs, **kwargs)
dia_object: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.79it/s]
  return getattr(ufunc, method)(*new_inputs, **kwargs)
  return getattr(ufunc, method)(*new_inputs, **kwargs)
  return getattr(ufunc, method)(*new_inputs, **kwargs)
  return getattr(ufunc, method)(*new_inputs, **kwargs)
  return getattr(ufunc, method)(*new_inputs, **kwargs)
  return getattr(ufunc, method)(*new_inputs, **kwargs)
dia_source: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

### Nest sources in objects

In [8]:
def load_sources_with_margin(dataset_type, margin_arcsec=5):
    """Build a margin cache for a source catalog and open the catalog with it.

    The margin cache is generated from `output_dir / dataset_type` and stored
    in the temporary directory as `<dataset_type>_<margin_arcsec>arcs`.

    Parameters
    ----------
    dataset_type : str
        Name of the catalog under `output_dir`.
    margin_arcsec : int or float, optional
        Value passed as the margin threshold (5 by default).

    Returns
    -------
    The catalog opened by `lsdb.open_catalog` with the new margin cache attached.
    """
    catalog_path = output_dir / dataset_type
    margin_name = f"{dataset_type}_{margin_arcsec}arcs"
    margin_args = MarginCacheArguments(
        output_artifact_name=margin_name,
        output_path=tmp_dir.name,
        input_catalog_path=catalog_path,
        margin_threshold=margin_arcsec,
        resume=False,
        progress_bar=False,
    )
    pipeline_with_client(margin_args, client)
    # The margin lives in the temporary directory, next to the other intermediate files
    return lsdb.open_catalog(catalog_path, margin_cache=f"{tmp_dir.name}/{margin_name}")

In [9]:
# Open the object catalog from disk, and both source catalogs together with
# freshly generated margin caches (default margin of 5 arcsec)
dia_object = lsdb.open_catalog(output_dir / "dia_object")
dia_source = load_sources_with_margin("dia_source")
dia_forced_source = load_sources_with_margin("dia_forced_source")

There are ~6% of sources with no `diaObjectId`:

In [10]:
# Count the sources whose diaObjectId is null (the column is computed eagerly here)
n_sources_no_objid = np.sum(dia_source["diaObjectId"].isna().compute())
# Expressed as a percentage of the source catalog length
n_sources_no_objid/len(dia_source)*100

np.float64(5.6193954448733034)

We need to filter them out; otherwise, we cannot nest the sources:

In [11]:
# Keep only the sources that have a non-null diaObjectId
valid_dia_source = dia_source[~dia_source["diaObjectId"].isna()]

That does not seem to be an issue for `diaForcedSource`:

In [12]:
# Number of forced sources whose diaObjectId is null
np.sum(dia_forced_source["diaObjectId"].isna().compute())

np.int64(0)

Nest sources and forced sources and write to disk:

In [None]:
from nest import nest_sources

# Nest the (filtered) sources and the forced sources into the object catalog
dia_object_lc = nest_sources(dia_object, valid_dia_source, dia_forced_source)
# Collection directory; also used further down to store the list of ingested files
dia_collection_dir = output_dir / "dia_object_collection"
# Write the nested catalog to disk under the name "dia_object_lc"
dia_object_lc.to_hats(dia_collection_dir, catalog_name="dia_object_lc")



### Finish collection

Generate margin and index catalogs.

In [14]:
from hats_import.collection.arguments import CollectionArguments

# Collection built around the nested catalog that was written in the previous step
args = (
    CollectionArguments(
        output_artifact_name="dia_object_collection",
        new_catalog_name="dia_object_lc",
        output_path=output_dir,
        simple_progress_bar=True,
    )
    # Points at the "dia_object_lc" catalog inside the collection directory
    .catalog(
        catalog_path=output_dir / "dia_object_collection" / "dia_object_lc",
    )
    # Default margin with a threshold of 5.0 (same value as the source margins used for nesting)
    .add_margin(margin_threshold=5.0, is_default=True)
    # Index catalog on the object identifier
    .add_index(indexing_column="diaObjectId")
)
pipeline_with_client(args, client)

Margin: Planning  : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.61it/s]
Margin: Mapping   : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00,  7.04it/s]
Margin: Binning   : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 85.98it/s]
Margin: Reducing  : 100%|██████████████████████████████████████████████████████

Let's store which files we ingested for later:

In [11]:
input_paths_dir = dia_collection_dir / "input_paths"
input_paths_dir.mkdir(exist_ok=True)


def save_paths(dataset_type, filepaths):
    """Append `filepaths`, one per line, to `<input_paths_dir>/<dataset_type>.txt`."""
    listing = input_paths_dir / f"{dataset_type}.txt"
    # Append mode: running this again adds the same paths to the file a second time
    with listing.open("a") as out:
        for path in filepaths:
            out.write(str(path) + "\n")


save_paths("dia_object", object_files)
save_paths("dia_source", source_files)
save_paths("dia_forced_source", forced_source_files)

### Some validation

In [15]:
# Load the whole collection into memory for the validation checks below
df = lsdb.open_catalog(output_dir / "dia_object_collection").compute()

#### Checking objects

We have the same set of objects in the collection as in the original data:

In [17]:
import pandas as pd
# Read all the input object files and collect their distinct ids
input_objs = pd.read_parquet(object_files, dtype_backend="pyarrow")
input_obj_ids = np.unique(input_objs["diaObjectId"])
# Same set of ids in the collection and in the inputs...
assert set(df["diaObjectId"]) == set(input_obj_ids)
# ...and exactly one collection row per distinct input id (no duplicates left)
assert len(input_obj_ids) == len(df)

#### Checking sources

All objects have sources:

In [18]:
# Percentage of objects whose nested "diaSource" column is not null
len(df[~df["diaSource"].isna()])/len(df)*100

100.0

There are as many sources in the collection as in the base catalog (minus those with no diaObjectId):

In [19]:
# Read all the input source files and drop the sources with a null diaObjectId,
# which were also excluded before nesting
input_sources = pd.read_parquet(source_files, dtype_backend="pyarrow")
expected_sources = input_sources[~input_sources["diaObjectId"].isna()]
# The flattened nested column should hold one row per expected source
assert len(expected_sources) == len(df["diaSource"].explode())

However, for ~7% of the objects the `nDiaSources` value doesn't match the number of `diaSource` rows we got:

In [20]:
from nested_pandas.utils import count_nested
# Count the nested "diaSource" rows per object; the result is compared below
# through the "n_diaSource" column of the joined frame
count_df = count_nested(df, "diaSource", join=True)
# Objects whose catalog-provided "nDiaSources" differs from the nested row count
unmatched = count_df[count_df["nDiaSources"] != count_df["n_diaSource"]]
len(unmatched)/len(count_df)*100

7.255606249555245

#### Checking forced sources

Only ~5% of objects have forced sources:

In [21]:
# Percentage of objects whose nested "diaForcedSource" column is not null
len(df[~df["diaForcedSource"].isna()])/len(df)*100

5.169310311439514

This seems to make sense according to the input data:

In [22]:
# Read all the input forced-source files and keep those with a non-null diaObjectId
forced_sources = pd.read_parquet(forced_source_files, dtype_backend="pyarrow")
expected_f_sources = forced_sources[~forced_sources["diaObjectId"].isna()]
# NOTE(review): only ~5% of the objects have forced sources - confirm that explode()
# on the nested column skips objects with a null value instead of emitting one
# null row for each of them, otherwise this length comparison is not meaningful
assert len(expected_f_sources) == len(df["diaForcedSource"].explode())