# Nesting

Create catalogs for `diaObject` and `object` with nested sources and forced sources.

In [None]:
# Optional: uncomment to install or upgrade these packages in the kernel's environment.
#%pip install lsdb --upgrade
#%pip install nested-dask --upgrade
#%pip install nested-pandas --upgrade

In [1]:
import os
import lsdb
import tempfile

from pathlib import Path
from dask.distributed import Client
from nested_pandas import NestedDtype

In [2]:
# Root output directory; the raw and HATS sub-directories are derived from it.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/dm_48556_new")
raw_dir = base_output_dir.joinpath("raw")
hats_dir = base_output_dir.joinpath("hats")

In [3]:
# Temporary directory handed to the dask workers as their local directory;
# it is cleaned up explicitly in the last cell of the notebook.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name

# Local cluster: 16 single-threaded worker processes.
client = Client(
    local_directory=tmp_dir,
    n_workers=16,
    threads_per_worker=1,
)

In [4]:
def sort_nested_sources(df, source_cols, mjd_col="midpointMjdTai"):
    """Sort the rows inside each nested column by index and then by time.

    Every column named in ``source_cols`` is flattened with
    ``.nest.to_flat()``, ordered by the flat frame's index and then by
    ``mjd_col``, and nested back into ``df`` under its original name.

    Parameters
    ----------
    df
        Frame exposing the nested-pandas API used here
        (``df[col].nest.to_flat()``, ``drop`` and ``add_nested``).
    source_cols : iterable of str
        Names of the nested columns to sort.
    mjd_col : str, optional
        Name of the field, present in every listed nested column, to sort
        by within each index value. Defaults to ``"midpointMjdTai"``.

    Returns
    -------
    The frame obtained after replacing each listed nested column with its
    sorted version.
    """
    for source_col in source_cols:
        flat_sources = df[source_col].nest.to_flat()
        # Two stable passes (time first, then index) yield the same
        # "index, then time" ordering as a single multi-key sort, but do not
        # depend on the index having a name or on that name being distinct
        # from the column names.
        sorted_sources = flat_sources.sort_values(mjd_col, kind="stable").sort_index(kind="stable")
        df = df.drop(columns=[source_col]).add_nested(sorted_sources, source_col)
    return df

### DiaObject with nested sources

In [5]:
# Open the three DIA catalogs that are combined below. Paths are built with
# the `/` operator because `hats_dir` is a `pathlib.Path`.
diaObject_cat = lsdb.read_hats(hats_dir / "diaObject")
diaSource_cat = lsdb.read_hats(hats_dir / "diaSource")
diaForcedSource_cat = lsdb.read_hats(hats_dir / "diaForcedSource")

In [None]:
# Nest detections first, then forced photometry, both matched on diaObjectId.
diaObject_cat_nested = (
    diaObject_cat
    .join_nested(
        diaSource_cat,
        left_on="diaObjectId",
        right_on="diaObjectId",
        nested_column_name="diaSource",
    )
    .join_nested(
        diaForcedSource_cat,
        left_on="diaObjectId",
        right_on="diaObjectId",
        nested_column_name="diaForcedSource",
    )
)
diaObject_cat_nested

Then, for each object, sort the nested sources by timestamp (`midpointMjdTai`):

In [7]:
# Apply the per-partition sort to both nested columns.
diaObject_cat_nested = diaObject_cat_nested.map_partitions(
    lambda partition: sort_nested_sources(
        partition,
        source_cols=["diaSource", "diaForcedSource"],
    )
)

Save resulting catalog to disk:

In [None]:
# Write the nested, time-sorted catalog next to the input catalogs.
diaObject_cat_nested.to_hats(hats_dir.joinpath("diaObject_lc"))

Reading the saved catalog back with LSDB currently requires an extra step: the nested columns must be cast back to `NestedDtype` in each partition:

In [None]:
# Re-open the saved catalog and, partition by partition, cast each nested
# column to the NestedDtype built from the dtype it was loaded with.
diaObject_lc = lsdb.read_hats(hats_dir / "diaObject_lc").map_partitions(
    lambda part: part.assign(
        diaSource=part["diaSource"].astype(
            NestedDtype.from_pandas_arrow_dtype(part.dtypes["diaSource"])
        ),
        diaForcedSource=part["diaForcedSource"].astype(
            NestedDtype.from_pandas_arrow_dtype(part.dtypes["diaForcedSource"])
        ),
    )
)
diaObject_lc

### Object with nested sources

In [25]:
# Open the object catalog and its forced-source catalog. Paths are built with
# the `/` operator because `hats_dir` is a `pathlib.Path`.
object_cat = lsdb.read_hats(hats_dir / "object")
forcedSource_cat = lsdb.read_hats(hats_dir / "forcedSource")

In [None]:
# Nest the forced photometry under each object, matched on objectId.
object_cat_nested = object_cat.join_nested(
    forcedSource_cat,
    left_on="objectId",
    right_on="objectId",
    nested_column_name="forcedSource",
)
object_cat_nested

Then, for each object, sort the nested forced sources by timestamp (`midpointMjdTai`):

In [27]:
# Apply the per-partition sort to the nested forced sources.
object_cat_nested = object_cat_nested.map_partitions(
    lambda partition: sort_nested_sources(
        partition,
        source_cols=["forcedSource"],
    )
)

Save resulting catalog to disk:

In [28]:
# Write the nested, time-sorted catalog next to the input catalogs. The path is
# built with `/`, the same way the diaObject_lc output path is built.
object_cat_nested.to_hats(hats_dir / "object_lc")

In [None]:
# Re-open the saved catalog and, partition by partition, cast the nested
# column to the NestedDtype built from the dtype it was loaded with.
object_lc = lsdb.read_hats(hats_dir / "object_lc").map_partitions(
    lambda part: part.assign(
        forcedSource=part["forcedSource"].astype(
            NestedDtype.from_pandas_arrow_dtype(part.dtypes["forcedSource"])
        )
    )
)
object_lc

In [None]:
# Close the dask client first, then remove the temporary directory it was given.
client.close()
tmp_path.cleanup()