## Daily increments

- Keep the existing partitioning schema.

- Add a new parquet file to each pixel.

  - Account only for the object rows with the latest validity start.
  
  - Find out where each new object lies.

  - Generate the margin caches for sources and forced sources.
  
  - Nest the sources and forced sources in their respective objects.

  - Copy the new daily parquet files into their respective pixel directories.

- Delete the outdated _metadata file.

- Update hats.properties, partition_info.csv and skymaps.

In [1]:
import hats
import lsdb

Specify the paths to the existing and new data:

In [None]:
import tempfile
from pathlib import Path

# path to the new PPDB data (searched recursively for the daily parquet files)
PPDB_DIR = Path("/sdf/scratch/rubin/ppdb/data/lsstcam")

# path to the pre-existing catalog (relative to the current working directory)
dia_object_lc_path = Path("dia_object_lc")

# temporary directory holding every intermediate product of this notebook
# (imported increments, margin caches, Dask scratch space)
tmp_dir = tempfile.TemporaryDirectory()
print(f"Intermediate directory: {tmp_dir.name}")

Initialize a Dask Client for parallelization:

In [None]:
from dask.distributed import Client

# Local Dask cluster shared by every pipeline step below; worker scratch
# files go to the notebook's temporary directory.
client = Client(
    n_workers=16,
    threads_per_worker=1,
    memory_limit="8GB",
    local_directory=tmp_dir.name,
)

Load existing nested catalog with `Npix` directory partitions:

In [None]:
# The catalog has leaf pixel directories
dia_object_lc = lsdb.open_catalog(dia_object_lc_path)
# Pixels currently present in the catalog; reused below to constrain the
# import of the increment and to validate its output.
existing_pixels = dia_object_lc.get_healpix_pixels()
# Display the pixel list as the cell output
existing_pixels

In [None]:
# Order of the existing catalog's skymap, read from its catalog info.
# It is reused as the highest HEALPix order of the import and passed to the
# partition-writing and skymap-update helpers further down.
# NOTE(review): presumably this equals the highest pixel order used when the
# existing catalog was built - confirm.
mapping_order = dia_object_lc.hc_structure.catalog_info.skymap_order
print(f"mapping_order = {mapping_order}")

Get the new increments for objects, sources and forced sources:

In [None]:
def get_paths(dataset_type, start=10, end=20, *, root=None):
    """Return a slice of the sorted parquet files for a given dataset type.

    The snake_case ``dataset_type`` is converted to the CamelCase file stem
    (e.g. ``"dia_forced_source"`` -> ``"DiaForcedSource.parquet"``) and the
    search directory is scanned recursively for files with that exact name.

    Parameters
    ----------
    dataset_type : str
        Snake_case dataset name, e.g. ``"dia_object"``.
    start, end : int or None
        Bounds of the slice taken from the sorted list of matching files.
    root : path-like, optional
        Directory to search. Defaults to the notebook-level ``PPDB_DIR``.

    Returns
    -------
    list of pathlib.Path
        ``sorted(matches)[start:end]``; empty when the slice selects nothing.
    """
    dataset_name = "".join(word.capitalize() for word in dataset_type.split("_"))
    # Only fall back to the global when no explicit directory is given, so the
    # helper can be pointed at another data release without editing it.
    search_dir = PPDB_DIR if root is None else Path(root)
    files = sorted(search_dir.rglob(f"{dataset_name}.parquet"))
    return files[start:end]


# Parquet files to ingest for each dataset type, using get_paths' default
# slice of the sorted file list.
new_object_files = get_paths("dia_object")
new_source_files = get_paths("dia_source")
new_forced_source_files = get_paths("dia_forced_source")

When importing new data, we want to keep the existing partitioning structure intact.

We'll add new pixels if the alerts include data that lies outside the current pixel coverage.

In [None]:
from hats_import import pipeline_with_client
from hats_import.catalog.arguments import ImportArguments


def import_dataset(dataset_type, input_file_list, catalog_type):
    """Import parquet files into a catalog named ``dataset_type``.

    The catalog is written under the notebook's temporary directory. The
    pixels of the existing catalog are handed to the importer as
    ``existing_pixels`` so that the current partitioning is kept.

    Parameters
    ----------
    dataset_type : str
        Name of the output artifact, e.g. ``"dia_object"``.
    input_file_list : list
        Parquet files to import.
    catalog_type : str
        Catalog type forwarded to ``ImportArguments``.
    """
    # The importer receives the existing pixels as plain (order, pixel) tuples.
    pixel_tuples = [(pixel.order, pixel.pixel) for pixel in existing_pixels]

    import_args = ImportArguments(
        # Input
        input_file_list=input_file_list,
        file_reader="parquet",
        ra_column="ra",
        dec_column="dec",
        catalog_type=catalog_type,
        # Partitioning
        existing_pixels=pixel_tuples,
        highest_healpix_order=mapping_order,
        pixel_threshold=5_000_000,
        # Output and execution
        output_path=tmp_dir.name,
        output_artifact_name=dataset_type,
        simple_progress_bar=True,
        resume=False,
    )
    pipeline_with_client(import_args, client)

In [None]:
# Import the new object files into <tmp_dir>/dia_object
import_dataset("dia_object", new_object_files, catalog_type="object")

In [None]:
# Import the new source files into <tmp_dir>/dia_source
import_dataset("dia_source", new_source_files, catalog_type="source")

In [None]:
# Import the new forced source files into <tmp_dir>/dia_forced_source
# (forced sources are also imported with the "source" catalog type)
import_dataset("dia_forced_source", new_forced_source_files, catalog_type="source")

In [None]:
"""Check that the new pixels are a subset of the existing pixels"""

# Build the reference set once instead of once per catalog.
existing_pixel_set = set(existing_pixels)

for cat_name in ["dia_object", "dia_source", "dia_forced_source"]:
    cat = hats.read_hats(f"{tmp_dir.name}/{cat_name}")
    new_pixels = set(cat.get_healpix_pixels())
    # The increment must not contain pixels outside the existing
    # partitioning. The previous form asserted the opposite direction
    # (existing <= new), which fails whenever the increment does not
    # populate every existing pixel.
    unexpected_pixels = new_pixels - existing_pixel_set
    assert not unexpected_pixels, (
        f"{cat_name} has pixels outside the existing partitioning: {unexpected_pixels}"
    )

### Post-processing

Apply the same post-processing steps that were applied to the existing data.

In [None]:
from postprocess import postprocess_catalog

# One "<band>_scienceFluxMean" prefix per band letter in "ugrizy".
flux_col_prefixes = [f"{band}_scienceFluxMean" for band in "ugrizy"]

# Post-process each imported increment in place under the temporary
# directory: objects first, then sources and forced sources.
for dataset_type, prefixes in [
    ("dia_object", flux_col_prefixes),
    ("dia_source", ["scienceFlux"]),
    ("dia_forced_source", ["scienceFlux"]),
]:
    postprocess_catalog(dataset_type, prefixes, tmp_dir.name, client)

### Create nested increment

Nest sources in objects and sort them by MJD.

In [None]:
from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments


def load_sources_with_margin(dataset_type, margin_arcsec=5):
    """Build a margin cache for an imported source catalog and load both.

    The margin cache is generated from the catalog at
    ``<tmp_dir>/<dataset_type>`` and written next to it as
    ``<dataset_type>_<margin_arcsec>arcs``.

    Parameters
    ----------
    dataset_type : str
        Name of the imported catalog, e.g. ``"dia_source"``.
    margin_arcsec : int or float
        Margin threshold forwarded to ``MarginCacheArguments``.

    Returns
    -------
    The catalog loaded with ``lsdb.read_hats`` and the new margin cache attached.
    """
    base_dir = tmp_dir.name
    catalog_path = f"{base_dir}/{dataset_type}"
    margin_name = f"{dataset_type}_{margin_arcsec}arcs"

    margin_args = MarginCacheArguments(
        input_catalog_path=catalog_path,
        output_path=base_dir,
        output_artifact_name=margin_name,
        margin_threshold=margin_arcsec,
        progress_bar=False,
        resume=False,
    )
    pipeline_with_client(margin_args, client)

    return lsdb.read_hats(catalog_path, margin_cache=f"{base_dir}/{margin_name}")

In [None]:
# Load object catalog
# NOTE(review): the existing catalog is opened with lsdb.open_catalog while the
# increments use lsdb.read_hats - confirm both are available (and equivalent
# for this purpose) in the installed lsdb version.
dia_object = lsdb.read_hats(f"{tmp_dir.name}/dia_object")

# Load the source catalogs with margins
# (each call first generates the margin cache with the default 5 arcsec threshold)
dia_source = load_sources_with_margin("dia_source")
dia_forced_source = load_sources_with_margin("dia_forced_source")

In [None]:
from nest import nest_sources

# Build the nested increment from the new objects, sources and forced sources.
# The nesting logic lives in the project-local nest.nest_sources helper.
new_dia_object_lc = nest_sources(dia_object, dia_source, dia_forced_source)

### Update existing catalog

Write the partitions with the new data to the existing catalog and update the relevant metadata:

In [None]:
from increment import write_partitions, update_skymaps, update_metadata

# Write the nested increment into the existing catalog directory; the helper
# returns the pixels, counts and histograms that the metadata updates need.
# NOTE(review): this modifies the catalog at dia_object_lc_path - presumably
# not safe to re-run on the same increment; confirm against increment.py.
new_pixels, new_counts, new_histograms = write_partitions(new_dia_object_lc, dia_object_lc_path, mapping_order)
# Update the skymaps of the existing catalog from the increment's histograms
update_skymaps(dia_object_lc, new_histograms, mapping_order)
# Update the catalog metadata from the increment's pixels and counts
update_metadata(dia_object_lc, new_pixels, new_counts)

Close the Dask Client:

In [None]:
# Shut down the Dask client created at the top of the notebook.
# NOTE(review): tmp_dir is never cleaned up explicitly in the visible cells;
# call tmp_dir.cleanup() once the intermediate files are no longer needed.
client.close()