## Weekly aggregation

- Aggregate the daily parquets of each pixel into the past.parquet file.

    - Merge duplicate object rows.

    - Keep the latest value of each conflicting object column.

    - Merge the light curve nested columns.

- Repartition each pixel according to a pre-defined threshold argument.

- Regenerate the `_metadata` file.

- Regenerate collection with margin cache and index catalog (from scratch).

In [None]:
import lsdb
import numpy as np
import tempfile

from dask.distributed import Client
from pathlib import Path

# Root directory holding the HATS catalogs read and written by this notebook.
hats_dir = Path("ppdb")

# Scratch directory: used as the Dask workers' local directory here and as the
# destination of the intermediate merged catalog further down.
tmp_dir = tempfile.TemporaryDirectory()
print(f"Intermediate directory: {tmp_dir.name}")

# Dask client used to parallelize every pipeline stage below.
client = Client(
    n_workers=16,
    threads_per_worker=1,
    local_directory=tmp_dir.name,
    memory_limit="8GB",
)

- Objects near pixel borders might have moved to one of their neighboring partitions. 
- To merge object alerts accurately we need to generate margins for the daily catalog.

In [None]:
from hats_import import pipeline_with_client
from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments

# Generate a margin cache for the daily catalog so that objects sitting close
# to a pixel border are also visible from the neighboring partitions.
args = MarginCacheArguments(
    input_catalog_path=hats_dir / "dia_object_lc",
    output_path=hats_dir,
    output_artifact_name="dia_object_lc_10arcs",
    margin_threshold=10,  # 10 arcsec
    resume=False,
    simple_progress_bar=True,
)
pipeline_with_client(args, client)

In [None]:
# Open the daily catalog together with the margin cache generated above.
dia_object_lc = lsdb.open_catalog(
    hats_dir / "dia_object_lc",
    margin_cache=hats_dir / "dia_object_lc_10arcs",
)
dia_object_lc

We should now merge each partition with its margin on "diaObjectId":

In [None]:
import hats as hc
import pandas as pd

from lsdb import Catalog
from lsdb.dask.merge_catalog_functions import (
    align_and_apply,
    align_catalogs,
    construct_catalog_args,
    get_healpix_pixels_from_alignment,
    filter_by_spatial_index_to_pixel,
)


def merge_object_data(dia_object_lc):
    """Merge every partition of the catalog with its own margin partition.

    The catalog is aligned with itself, and `perform_join_on` is applied to
    each (partition, margin partition) pair. The outputs are then wrapped in
    a new catalog that reuses the input catalog's info and schema.

    Args:
        dia_object_lc: Catalog to merge. Its `margin` attribute is used as
            the second input of the per-partition merge, so the catalog is
            expected to have been opened with a margin cache.

    Returns:
        A `Catalog` built from the merged partitions.
    """
    # Aligning the catalog with itself yields the pixel list shared by the
    # main partitions and their margins.
    self_alignment = align_catalogs(dia_object_lc, dia_object_lc)
    _, aligned_pixels = get_healpix_pixels_from_alignment(self_alignment)

    catalogs_with_pixels = [
        (dia_object_lc, aligned_pixels),
        (dia_object_lc.margin, aligned_pixels),
    ]
    merged_partitions = align_and_apply(catalogs_with_pixels, perform_join_on)

    # `perform_join_on` restores the original column order, so the input
    # catalog's meta is reused for the result.
    ddf, ddf_map, self_alignment = construct_catalog_args(
        merged_partitions,
        dia_object_lc._ddf._meta,
        self_alignment,
    )

    merged_structure = hc.catalog.Catalog(
        dia_object_lc.hc_structure.catalog_info,
        self_alignment.pixel_tree,
        schema=dia_object_lc.original_schema,  # the schema is the same
        moc=self_alignment.moc,
    )
    return Catalog(ddf, ddf_map, merged_structure)


def perform_join_on(df, margin, df_pixel, *args):
    """Collapse a partition and its margin to one row per `diaObjectId`.

    For each object the row with the greatest `validityStart` is kept, while
    the `diaSource` and `diaForcedSource` nested columns are rebuilt from the
    sources of *all* rows of that object. Rows whose spatial index falls
    outside `df_pixel` are dropped at the end.

    Args:
        df: Rows of the main partition.
        margin: Rows of the matching margin partition.
        df_pixel: Pixel of the main partition; its `order` and `pixel`
            attributes are used for the final spatial filter.
        *args: Extra positional arguments; accepted and ignored.

    Returns:
        The merged rows restricted to `df_pixel`, with the columns of `df`
        in their original order.
    """
    column_order = list(df.columns)

    # Stack partition and margin rows, then order them so that, within each
    # object, the most recent `validityStart` comes first.
    stacked = pd.concat([df, margin])
    stacked = stacked.sort_values(
        ["diaObjectId", "validityStart"], ascending=[True, False]
    )

    # Copy the object id into both nested columns so the flattened sources
    # can be joined back onto their object later on.
    stacked["diaSource.diaObjectId"] = stacked["diaObjectId"]
    stacked["diaForcedSource.diaObjectId"] = stacked["diaObjectId"]
    flat_sources = stacked["diaSource"].explode().sort_values(["midpointMjdTai"])
    flat_forced = stacked["diaForcedSource"].explode().sort_values(["midpointMjdTai"])

    # `return_index=True` gives the first position of each id, which after
    # the sort above is the row with the latest `validityStart`.
    _, first_positions = np.unique(stacked["diaObjectId"], return_index=True)
    latest = stacked.iloc[first_positions]

    # Swap the per-row nested columns for the ones rebuilt from all rows.
    latest = latest.drop(columns=["diaSource", "diaForcedSource"])
    latest = latest.join_nested(flat_sources, "diaSource", on="diaObjectId")
    latest = latest.join_nested(flat_forced, "diaForcedSource", on="diaObjectId")

    # Keep only the objects inside this pixel; the others sit in the margin.
    latest = filter_by_spatial_index_to_pixel(latest, df_pixel.order, df_pixel.pixel)

    # Restore the original column order.
    return latest[column_order]

In [None]:
# Merge each partition with its margin; the bare name on the last line makes
# the notebook display the resulting catalog.
merged_lc = merge_object_data(dia_object_lc)
merged_lc

In [None]:
# Quick sanity check on the expected IDs:
merged_ids = merged_lc["diaObjectId"].compute()
actual_ids = np.unique(merged_ids)
expected_ids = np.unique(dia_object_lc["diaObjectId"].compute())

# No object may be lost or invented by the merge.
np.testing.assert_array_equal(actual_ids, expected_ids)

# The set comparison above cannot see leftover duplicates, so also check that
# every object ends up in exactly one row of the merged catalog.
assert len(merged_ids) == len(actual_ids), (
    f"{len(merged_ids) - len(actual_ids)} duplicate diaObjectId rows remain after merging"
)

Then we need to write this catalog to disk:

In [None]:
# Persist the merged catalog in the intermediate directory as a stand-alone
# catalog (not a collection), replacing any previous output.
merged_lc.write_catalog(
    f"{tmp_dir.name}/dia_object_lc",
    as_collection=False,
    overwrite=True,
)

Finally, use `hats-import` to reimport the catalog with a different pixel threshold, etc.:

In [None]:
import hats_import.catalog.run_import as catalog_runner
from hats_import import ImportArguments

# Reimport the intermediate catalog into the collection directory, keeping the
# default columns of the original catalog.
catalog_args = ImportArguments.reimport_from_hats(
    path=f"{tmp_dir.name}/dia_object_lc",
    output_dir=hats_dir / "dia_object_collection",
    output_artifact_name="dia_object_lc",
    pixel_threshold=1_000_000,  # use byte_threshold when it's available?
    skymap_alt_orders=[2, 4, 6],
    npix_suffix="/",  # make sure it's stored in Npix dirs
    addl_hats_properties={
        "hats_cols_default": dia_object_lc.hc_structure.catalog_info.default_columns,
    },
)
catalog_runner.run(catalog_args, client)

In [None]:
# Shut the Dask client down first so that nothing is still using the scratch
# directory, then remove it (Dask spill files and the intermediate catalog)
# explicitly rather than waiting for garbage collection or interpreter exit.
client.close()
tmp_dir.cleanup()