## Generate collection

Create DIA Object collection with margin and index on `diaObjectId`.

In [1]:
import lsdb
import os

from dask.distributed import Client
from datetime import datetime
from hats_import import pipeline_with_client
from hats_import import ImportArguments
from hats_import.collection.arguments import CollectionArguments
from pathlib import Path

In [2]:
# Path to the RAW parquet data
# NOTE(review): PPDB_DIR is not referenced by any other cell in this notebook — confirm it is still needed.
PPDB_DIR = Path("/sdf/scratch/rubin/ppdb/data/lsstcam")

# Paths to the target OUTPUT directories
# The OUTPUT_DIR environment variable must be set; a missing variable raises KeyError here.
HATS_DIR = Path(os.environ["OUTPUT_DIR"])
# Scratch area: used as the Dask local directory, holds the reimported catalog and the
# staged collection, and is deleted by the last cell of the notebook.
TMP_DIR = HATS_DIR / "tmp"

In [3]:
# Final collection directory.
COLLECTION_DIR = HATS_DIR / "dia_object_collection"

# The collection will contain the most recent
# version of data stored with the latest date.
# DATE_STR is today's date in local time (YYYY-MM-DD); it is used as the name of the
# reimported catalog, and is fixed at the moment this cell runs.
DATE_STR = datetime.now().strftime("%Y-%m-%d")

In [None]:
# Initialize Dask Client
# 16 single-threaded workers, with TMP_DIR as the Dask local directory.
# The helper functions below read this `client` as a global, so this cell must run before them.
client = Client(n_workers=16, threads_per_worker=1, local_directory=TMP_DIR)

### Helper methods

In [5]:
def reimport(input_path):
    """Reimport a HATS catalog into the working directory.

    The reimport optimizes partitioning, generates alternative sky maps and
    sets the default columns. The resulting catalog is saved to the working
    TMP_DIR and it is named after today's date (DATE_STR).

    Args:
        input_path: Path of the existing HATS catalog to reimport.

    Relies on the module-level `TMP_DIR`, `DATE_STR` and Dask `client`.
    """
    # One column name per line. `.split()` is used instead of `.splitlines()`
    # so that the indentation of the literal and its trailing blank line do not
    # end up inside the comma-separated property value.
    default_columns = ",".join(
        """
        dec
        decErr
        diaObjectId
        ra
        raErr
        u_psfFluxMean
        g_psfFluxMean
        r_psfFluxMean
        i_psfFluxMean
        z_psfFluxMean
        y_psfFluxMean
        u_psfFluxMeanErr
        g_psfFluxMeanErr
        r_psfFluxMeanErr
        i_psfFluxMeanErr
        z_psfFluxMeanErr
        y_psfFluxMeanErr
        nDiaSources
        validityStart
        diaSource.apFlux
        diaSource.apFluxErr
        diaSource.band
        diaSource.dec
        diaSource.decErr
        diaSource.detector
        diaSource.diaSourceId
        diaSource.isDipole
        diaSource.ixx
        diaSource.ixxPSF
        diaSource.iyy
        diaSource.iyyPSF
        diaSource.ixy
        diaSource.ixyPSF
        diaSource.midpointMjdTai
        diaSource.psfFlux
        diaSource.psfFluxErr
        diaSource.scienceFlux
        diaSource.scienceFluxErr
        diaSource.ra
        diaSource.raErr
        diaSource.visit
        diaSource.x
        diaSource.xErr
        diaSource.y
        diaSource.yErr
        diaSource.scienceMag
        diaSource.scienceMagErr
        diaForcedSource
        """.split()
    )
    args = ImportArguments.reimport_from_hats(
        input_path,
        output_dir=TMP_DIR,
        output_artifact_name=DATE_STR,
        highest_healpix_order=11,
        pixel_threshold=15_000,
        skymap_alt_orders=[2, 4, 6],
        row_group_kwargs={"subtile_order_delta": 1},
        addl_hats_properties={"hats_cols_default": default_columns},
        simple_progress_bar=True,
        resume=False,
    )
    pipeline_with_client(args, client)


def finalize_collection():
    """Generate the new collection in TMP_DIR and move it to COLLECTION_DIR.

    Steps:
      1. Stage the reimported catalog (`TMP_DIR / DATE_STR`) inside
         `TMP_DIR / "dia_object_collection"`.
      2. Generate the collection around the staged catalog.
      3. Move every entry of the staged collection into COLLECTION_DIR.

    Filesystem operations are done in Python rather than through `!` shell
    commands so that a failed step raises instead of being silently ignored
    before the success message is printed.

    Raises:
        FileNotFoundError: If the reimported catalog is missing.
        FileExistsError: If a directory with the same name is already staged
            or already present in COLLECTION_DIR (e.g. a second run on the
            same date); nothing is moved into COLLECTION_DIR in that case.
    """
    # Local import: keeps this cell self-contained without touching the imports cell.
    import shutil

    new_collection_dir = TMP_DIR / "dia_object_collection"
    # Create final collection folder (tolerate one left over from an earlier run).
    new_collection_dir.mkdir(parents=True, exist_ok=True)

    # Move main catalog to final folder.
    staged_catalog = new_collection_dir / DATE_STR
    if staged_catalog.exists():
        raise FileExistsError(f"A catalog is already staged at {staged_catalog}")
    shutil.move(str(TMP_DIR / DATE_STR), str(staged_catalog))

    # Generate the collection.
    generate_collection(catalog_path=staged_catalog, output_path=TMP_DIR)

    # Move all contents of new collection to expected target output path.
    COLLECTION_DIR.mkdir(parents=True, exist_ok=True)
    artifacts = sorted(new_collection_dir.iterdir())
    # Check every destination first so a clash cannot leave COLLECTION_DIR half-updated.
    clashes = [item.name for item in artifacts if (COLLECTION_DIR / item.name).is_dir()]
    if clashes:
        raise FileExistsError(
            f"{COLLECTION_DIR} already contains: {', '.join(clashes)}"
        )
    for item in artifacts:
        # Existing plain files (e.g. collection.properties) are replaced, as `mv` did
        # (on POSIX; Windows would raise instead).
        shutil.move(str(item), str(COLLECTION_DIR / item.name))
    print(f"Collection updated at {COLLECTION_DIR}")


def generate_collection(catalog_path, output_path):
    """Build the `dia_object_collection` collection around a main catalog.

    The collection declares the main catalog (read as parquet), one default
    margin with a threshold of 5.0 and an index on `diaObjectId`. The pipeline
    runs on the module-level Dask `client`.

    Args:
        catalog_path: Path of the main catalog of the collection.
        output_path: Directory under which the collection is written.
    """
    # Start from the collection-wide settings...
    collection_args = CollectionArguments(
        output_artifact_name="dia_object_collection",
        output_path=output_path,
        simple_progress_bar=True,
    )
    # ...then declare each member of the collection in turn.
    collection_args = collection_args.catalog(catalog_path=catalog_path, file_reader="parquet")
    collection_args = collection_args.add_margin(margin_threshold=5.0, is_default=True)
    collection_args = collection_args.add_index(indexing_column="diaObjectId")

    pipeline_with_client(collection_args, client)

### Main entrypoint

In [None]:
# A populated collection carries a `collection.properties` file, so that file (rather
# than the bare directory) tells a first import from an incremental one. A directory
# left behind by a failed first import therefore does not break the next run.
if not (COLLECTION_DIR / "collection.properties").exists():
    # Create a collection from scratch (first import)
    os.makedirs(COLLECTION_DIR, exist_ok=True)
    reimport_input_path = TMP_DIR / "new_dia_object_lc"
else:
    # Concatenate the existing collection with the new data.
    # NOTE(review): nothing here shows that `concat` drops duplicate object entries — confirm.
    # Load existing collection.
    dia_object_collection = lsdb.open_catalog(COLLECTION_DIR, columns="all")
    # Load new DIA object data.
    new_dia_object = lsdb.open_catalog(TMP_DIR / "new_dia_object_lc")
    # Concatenate both catalogs.
    concated = dia_object_collection.concat(new_dia_object)
    # Save the concatenated catalog to disk. This write must happen: the reimport
    # below reads `concat_catalog_path`, which otherwise would never be produced.
    concat_catalog_path = TMP_DIR / "dia_object_concat"
    concated.to_hats(concat_catalog_path, as_collection=False)
    reimport_input_path = concat_catalog_path

# Reimport catalog (to ensure partitioning is optimized and skymaps are up-to-date).
reimport(input_path=reimport_input_path)
# Generate new collection and move it to COLLECTION_DIR.
finalize_collection()

Planning  : 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.01s/it]
Mapping   : 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:10<00:00,  3.35it/s]
Binning   :   0%|                                                                                                                                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]
Reducing  : 100%|█████████████████████████████

Collection updated at /sdf/home/s/stavar/linccf/ppdb/dia_object_collection


Let's check that the collection has been updated:

In [9]:
# List the entries now present in the collection directory.
!ls $COLLECTION_DIR

2025-09-23		2025-09-24_5arcs	2025-09-25_diaObjectId
2025-09-23_5arcs	2025-09-24_diaObjectId	collection.properties
2025-09-23_diaObjectId	2025-09-25
2025-09-24		2025-09-25_5arcs


In [10]:
# Print the collection-level properties file.
!cat $COLLECTION_DIR/collection.properties

#HATS Collection
obs_collection=dia_object_collection
hats_primary_table_url=2025-09-25
all_margins=2025-09-25_5arcs
default_margin=2025-09-25_5arcs
all_indexes=diaObjectId 2025-09-25_diaObjectId
hats_builder=hats-import v0.6.4, hats v0.6.5.dev4+ga69bbd9de
hats_creation_date=2025-09-25T18:25UTC
hats_estsize=1096180
hats_release_date=2024-09-18
hats_version=v0.1


### Seeking feedback

#### 1. How to handle incremental data for existing objects?

- Do we augment the catalog by creating new rows for the object, or do we merge them into one?

#### 2. To merge or not to merge

- "Alerts" for each given object might have different coordinates (ra/dec).

    - If we do not merge information for the object, it might live in more than 1 partition.

    - If the goal is to update as few pixel files as possible, and just "append", we might not want to merge.

- If columns contain information that collides, should we drop the existing values and keep the latest?

- If they do not collide, e.g. if an object alert has information for a filter for which we had no information before, do we just keep the new values for that filter?

#### 3. How should we handle the nested light curves?

- Pro for merging: we could update the lightcurves.

- Sources should contain entirely new observations, so we would "append" them to their respective objects.

- But aren't forced sources somewhat "iterative"? Do we just keep the latest forced sources, or do we need their full history?

#### 4. Serving incremented catalogs

The incremented catalogs have new parquet files and metadata. Since we're serving the catalogs live, overwriting the contents of a catalog is not stable.

- Should we store the incremented version alongside the previous one, and replace after each cycle? Similar to the implementation of the cron jobs for TNS/VSX?

- Should we explore Apache Iceberg, as a solution that supports transactions and data versioning? It does not currently support Dask.

In [11]:
# Shut down the Dask client created at the top of the notebook.
client.close()

In [12]:
# Remove the whole scratch directory.
# WARNING: this also deletes TMP_DIR/new_dia_object_lc, which the main entrypoint cell
# reads as its input, so that data must be regenerated before the notebook is re-run.
%rm -rf $TMP_DIR