## Weekly aggregation

- Deduplicate object rows (keeping the latest diaObject-level data for each object).

- Repartition each pixel according to a pre-defined threshold argument.

- Regenerate collection with margin cache and index catalog (from scratch).

In [None]:
import lsdb
import numpy as np
import tempfile
from dask.distributed import Client
from pathlib import Path

# Base directory that the catalog paths in this notebook are resolved against.
hats_dir = Path(".")

# Scratch directory: used below as the Dask `local_directory` and as the
# destination of the intermediate (aggregated) catalog.
tmp_dir = tempfile.TemporaryDirectory()
print(f"Intermediate directory: {tmp_dir.name}")

Initialize a Dask Client for parallelization:

In [None]:
# 16 workers with one thread each; Dask's scratch files are pointed at the
# intermediate directory created above.
client = Client(
    n_workers=16,
    threads_per_worker=1,
    memory_limit="8GB",
    local_directory=tmp_dir.name,
)

- Objects near pixel borders might have moved to one of their neighboring partitions.
- To merge object alerts accurately, we need to generate a margin cache for the daily catalog.

In [None]:
from hats_import import pipeline_with_client
from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments

# Daily catalog for which the margin cache is generated.
daily_catalog_path = hats_dir / "dia_object_lc"

# NOTE(review): margin_threshold is presumably in arcseconds, as suggested by
# the "10arcs" artifact name — confirm against the hats-import documentation.
margin_args = MarginCacheArguments(
    input_catalog_path=daily_catalog_path,
    output_path=hats_dir,
    output_artifact_name="dia_object_lc_10arcs",
    margin_threshold=10,
    simple_progress_bar=True,
    resume=False,
)

# Run the margin cache pipeline on the Dask client created above.
pipeline_with_client(margin_args, client)

In [None]:
# Open the daily catalog together with the margin cache generated above.
catalog_path = hats_dir / "dia_object_lc"
margin_path = hats_dir / "dia_object_lc_10arcs"
dia_object_lc = lsdb.open_catalog(catalog_path, margin_cache=margin_path)
dia_object_lc

We should now merge each partition with its margin on "diaObjectId":

In [None]:
from aggregate import aggregate_object_data
# Apply the project-local `aggregate_object_data` helper to the catalog that
# was opened with its margin cache.
# NOTE(review): the helper's implementation is not visible here; presumably it
# deduplicates rows per diaObjectId, keeping the latest object-level data —
# confirm against aggregate.py.
agg_lc = aggregate_object_data(dia_object_lc)
agg_lc

In [None]:
# Sanity check: the aggregated catalog must contain exactly the same set of
# diaObjectId values as the input catalog (nothing dropped, nothing added).
expected_ids = np.unique(dia_object_lc["diaObjectId"].compute())
actual_ids = np.unique(agg_lc["diaObjectId"].compute())
np.testing.assert_array_equal(actual_ids, expected_ids)

Then we need to write this catalog to disk:

In [None]:
# Destination of the aggregated catalog inside the intermediate directory.
intermediate_catalog_path = f"{tmp_dir.name}/dia_object_lc"

# Set the catalog's npix_suffix to ".parquet" before writing.
# NOTE(review): presumably this makes each pixel a single parquet file rather
# than a directory — confirm against the HATS catalog_info documentation.
agg_lc.hc_structure.catalog_info.npix_suffix = ".parquet"
agg_lc.write_catalog(
    intermediate_catalog_path,
    as_collection=False,
    overwrite=True,
)

Then use `hats-import` to reimport it with a different pixel threshold, generating the margin cache and index catalog as part of the collection:

In [None]:
import hats_import.collection.run_import as collection_runner
from hats_import import ImportArguments, CollectionArguments

# Carry the default columns of the input catalog over to the reimported one.
default_columns = dia_object_lc.hc_structure.catalog_info.default_columns

# Import arguments that read the intermediate catalog back in and write it
# under the collection directory.
catalog_args = ImportArguments.reimport_from_hats(
    path=f"{tmp_dir.name}/dia_object_lc",
    output_dir=hats_dir / "dia_object_collection",
    output_artifact_name="dia_object_lc",
    pixel_threshold=100_000,  # Value set for demo purposes
    skymap_alt_orders=[2, 4, 6],
    npix_suffix="/",
    addl_hats_properties={"hats_cols_default": default_columns},
)

# Build the collection step by step: catalog first, then margin and index.
collection_args = CollectionArguments(
    output_path=hats_dir,
    output_artifact_name="dia_object_collection",
)
# The catalog step takes the import arguments' attributes as keyword arguments.
collection_args = collection_args.catalog(**vars(catalog_args))
collection_args = collection_args.add_margin(margin_threshold=10)
collection_args = collection_args.add_index(indexing_column="diaObjectId")

collection_runner.run(collection_args, client)

Close the Dask Client:

In [None]:
# Shut down the Dask client first so nothing is still using the scratch
# directory it was given as `local_directory`.
client.close()

# Remove the intermediate directory explicitly instead of leaving it on disk
# until the TemporaryDirectory object is finalized; it holds the intermediate
# catalog, which is no longer needed once the collection has been generated.
tmp_dir.cleanup()