# Simulated HATS DP1

This notebook creates a HATS catalog of simulated DP1 data.

### Data requirements

- Shuffle all points by drawing new `_healpix_29` values, and assign new RA/Dec coordinates derived from them;
- All other fields can be re-generated uniformly within the partition-level min/max;
- Boolean fields can be set to true/false at random;
- Light curves can keep the same length, but their values should also be perturbed;
- For band, sample from "ugrizy" chars;
- The object and source IDs, dimensions, and visit are not changed.

In [None]:
import cdshealpix
import lsdb
import numpy as np
import pandas as pd
import hats_import.pipeline as runner
import tempfile

from dask.distributed import Client
from hats_import.collection.arguments import CollectionArguments
from hats.pixel_math.spatial_index import spatial_index_to_healpix
from nested_pandas import NestedDtype
from pandas.api.types import (
    is_integer_dtype,
    is_float_dtype,
    is_bool_dtype,
    is_string_dtype,
)
from pathlib import Path

In [None]:
def cast_nested(df, columns):
    """Return a copy of ``df`` with the listed columns cast to nested dtypes.

    For each name in ``columns`` the target dtype is built from the column's
    current dtype with ``NestedDtype.from_pandas_arrow_dtype`` and applied
    through ``astype``. All other columns are left untouched.
    """
    converted = {}
    for name in columns:
        nested_dtype = NestedDtype.from_pandas_arrow_dtype(df.dtypes[name])
        converted[name] = df[name].astype(nested_dtype)
    return df.assign(**converted)

In [None]:
# Scratch directory passed to the Dask client as `local_directory`.
# It is removed explicitly by the `tmp_path.cleanup()` call in the last cell.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name
# Dask client with 16 workers, one thread each
client = Client(n_workers=16, threads_per_worker=1, local_directory=tmp_dir)

In [None]:
# Base output directory: the mock catalogs and the collections are written under it
hats_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/mock_dp1")

### Mock data generation

In [None]:
# Unseeded random generator shared by all the mocking helpers below, so the
# generated values differ from run to run.
# NOTE(review): this is a notebook-level global used by functions that are
# shipped to Dask workers; confirm that workers do not end up replaying the
# same random stream for different partitions.
rng = np.random.default_rng()

# Columns that are never re-sampled by `mock_base_columns` (IDs, visit and
# tract/patch/detector fields keep their original values).
default_fixed_columns = [
    "diaObjectId",
    "nDiaSources",
    "diaSourceId",
    "forcedSourceOnDiaObjectId",
    "objectId",
    "forcedSourceId",
    "visit",
    "tract",
    "patch",
    "detector",
]
# Positional columns: also excluded from uniform re-sampling by `mock_partition`
ra_columns = ["ra", "coord_ra"]
dec_columns = ["dec", "coord_dec"]

In [None]:
def mock_partition(df, *, nested_columns=None):
    """Populates a partition with mock data.

    Positions are re-drawn first, then the base columns and finally the
    nested columns are re-generated:

    - Integers, floats and timestamps are re-generated within min/max uniformly;
    - Boolean columns are assigned True/False values randomly;
    - String columns are sampled from the available choices, also randomly.

    Args:
        df: The partition to mock. Returned unchanged when it has no rows.
        nested_columns: Names of the nested columns of ``df``. Defaults to
            no nested columns.

    Returns:
        The partition populated with mock data.
    """
    if len(df) == 0:
        return df
    # A fresh list per call avoids sharing a mutable default between calls and
    # lets callers pass any iterable (e.g. a tuple) of column names.
    nested_columns = [] if nested_columns is None else list(nested_columns)
    df = mock_positions(df, update_index=True)
    # Positions, nested columns and identifier-like columns are not re-sampled
    # as plain base columns.
    fixed_columns = ra_columns + dec_columns + nested_columns + default_fixed_columns
    df = mock_base_columns(df, fixed_columns)
    return mock_nested_columns(df, nested_columns, fixed_columns)


def mock_base_columns(df, fixed_columns=None):
    """Generates randomly sampled data for a partition's base fields.

    Every column of ``df`` that is not listed in ``fixed_columns`` is
    overwritten in place with values drawn by ``_sample_column``.

    Args:
        df: The dataframe to update (modified in place).
        fixed_columns: Names of the columns to leave untouched. Defaults to
            none, i.e. every column is re-sampled.

    Returns:
        The same ``df`` object, with the non-fixed columns re-sampled.
    """
    # A set gives constant-time membership tests and avoids a mutable default.
    fixed = set() if fixed_columns is None else set(fixed_columns)
    n_samples = len(df)
    columns_to_update = [col for col in df.columns if col not in fixed]
    for column in columns_to_update:
        df[column] = _sample_column(df, column, n_samples)
    return df


def mock_nested_columns(df, nested_cols, fixed_columns):
    """Re-generates the contents of each nested column of a partition.

    Each nested column is flattened, given mock positions and mock base
    values (columns in ``fixed_columns`` are left untouched), ordered by
    "midpointMjdTai" and packed back into ``df`` under its original name.
    """
    for nested_name in nested_cols:
        flat = df[nested_name].nest.to_flat()
        # Keep the flat frame's own index (update_index=False)
        flat = mock_positions(flat, update_index=False)
        flat = mock_base_columns(flat, fixed_columns)
        # Order the re-generated rows by mjd before repacking them
        flat = flat.sort_values("midpointMjdTai")
        df = df.drop(columns=[nested_name]).add_nested(flat, nested_name)
    return df


def mock_positions(df, update_index=True):
    """Updates _healpix_29 and positional ra/dec coordinates.

    New `_healpix_29` values are drawn uniformly between the minimum and
    maximum of the current index (both inclusive), and the "ra"/"coord_ra"
    and "dec"/"coord_dec" columns present in ``df`` are overwritten with the
    coordinates computed from those values.

    Args:
        df: The dataframe to update (modified in place). Its index is read
            as the `_healpix_29` values.
        update_index: When True, the index is replaced by the newly drawn
            `_healpix_29` values. When False, the existing index is kept.

    Returns:
        The same ``df`` object with updated positions. An empty ``df`` is
        returned unchanged.
    """
    # Nothing to sample from: min()/max() of an empty index would raise.
    if len(df) == 0:
        return df

    _healpix_29 = df.index.to_numpy()

    # Randomly sample _healpix_29 values for the current partition. The upper
    # bound is inclusive so that a partition whose index holds a single
    # distinct value (min == max) can still be sampled.
    min_index = _healpix_29.min()
    max_index = _healpix_29.max()
    rand_index = rng.integers(
        min_index, max_index, size=len(df), dtype=np.int64, endpoint=True
    )
    ipix = spatial_index_to_healpix(rand_index)
    ra, dec = cdshealpix.healpix_to_lonlat(ipix, depth=29)

    # When mocking positions for the nested sources the _healpix_29 values
    # should be kept. They will be the object's _healpix_29.
    if update_index:
        df.reset_index(drop=True, inplace=True)
        df.index = pd.Index(rand_index, name="_healpix_29")

    # Overwrite whichever coordinate columns exist, keeping their dtypes
    for ra_col in ["ra", "coord_ra"]:
        if ra_col in df.columns:
            coords_dtype = df[ra_col].dtype
            df[ra_col] = pd.Series(ra.deg, index=df.index, dtype=coords_dtype)
    for dec_col in ["dec", "coord_dec"]:
        if dec_col in df.columns:
            coords_dtype = df[dec_col].dtype
            df[dec_col] = pd.Series(dec.deg, index=df.index, dtype=coords_dtype)
    return df


def _sample_column(df, column, n_samples):
    """Samples values for a column of specified type.

    Infinite values are ignored when computing the column's min/max. The
    sampling strategy depends on the column dtype:

    - integer: uniform integers in [min, max];
    - float: uniform floats between min and max;
    - boolean: True/False at random;
    - string: random choice among the column's non-null unique values, or
      among the "ugrizy" characters for the "band" column;
    - "timestamp[ns][pyarrow]": uniform integers between the ``.value`` of
      the min and max timestamps;
    - anything else, or a column whose min/max is missing: all None.

    Args:
        df: The dataframe holding the column.
        column: Name of the column to sample.
        n_samples: Number of values to generate.

    Returns:
        A Series of ``n_samples`` values with the column's index and dtype.
    """
    column_data = df[column].replace([np.inf, -np.inf], np.nan)

    _min = column_data.min()
    _max = column_data.max()

    if pd.isna(_min) or pd.isna(_max):
        series = [None] * n_samples
    elif is_integer_dtype(column_data):
        # Inclusive upper bound on Python ints: `_max + 1` could overflow when
        # `_max` is a fixed-width NumPy scalar at the maximum of its dtype.
        series = rng.integers(int(_min), int(_max), size=n_samples, endpoint=True)
    elif is_float_dtype(column_data):
        series = rng.uniform(_min, _max, size=n_samples)
    elif is_bool_dtype(column_data):
        series = rng.integers(0, 2, size=n_samples).astype(bool)
    elif is_string_dtype(column_data):
        possible_strings = (
            list("ugrizy") if column == "band" else column_data.dropna().unique()
        )
        series = rng.choice(possible_strings, size=n_samples)
    elif column_data.dtype == "timestamp[ns][pyarrow]":
        # Sample the integer `.value` of the timestamps, bounds inclusive
        series = rng.integers(_min.value, _max.value, size=n_samples, endpoint=True)
    else:
        series = [None] * n_samples

    return pd.Series(series, index=column_data.index, dtype=column_data.dtype)

### dia_object_lc

In [None]:
# Open the original dia_object_lc catalog
dia_object_lc = lsdb.read_hats(
    "/sdf/data/rubin/shared/lsdb_commissioning/hats/v29_0_0_rc5/dia_object_lc"
)
# Cast the two light-curve columns to nested dtypes in every partition
dia_object_lc = dia_object_lc.map_partitions(
    cast_nested, columns=["diaSource", "diaObjectForcedSource"]
)
# Show the catalog as the cell output
dia_object_lc

In [None]:
# Apply the mocking to every partition, treating both light-curve columns as nested
mock_dia_object_lc = dia_object_lc.map_partitions(
    mock_partition, nested_columns=["diaSource", "diaObjectForcedSource"]
)
# Show the mocked catalog as the cell output
mock_dia_object_lc

In [None]:
# Write the mocked catalog to `hats_dir / "dia_object_lc"`
mock_dia_object_lc.to_hats(hats_dir / "dia_object_lc", catalog_name="dia_object_lc")

### object_lc

In [None]:
# Open the original object_lc catalog
object_lc = lsdb.read_hats(
    "/sdf/data/rubin/shared/lsdb_commissioning/hats/v29_0_0_rc5/object_lc"
)
# Cast the light-curve column to a nested dtype in every partition
object_lc = object_lc.map_partitions(cast_nested, columns=["objectForcedSource"])
# Show the catalog as the cell output
object_lc

In [None]:
# Apply the mocking to every partition, treating the light-curve column as nested
mock_object_lc = object_lc.map_partitions(
    mock_partition, nested_columns=["objectForcedSource"]
)
# Show the mocked catalog as the cell output
mock_object_lc

In [None]:
# Write the mocked catalog to `hats_dir / "object_lc"`
mock_object_lc.to_hats(hats_dir / "object_lc", catalog_name="object_lc")

### Create collections

In [None]:
# Describe the "dia_object_collection" collection: the dia_object_lc catalog
# plus a default margin (threshold 5.0) and an index on "diaObjectId".
# NOTE(review): `catalog_path` points inside the collection directory
# (hats_dir / "dia_object_collection" / "dia_object_lc"), whereas the to_hats
# cell of this notebook writes the catalog to hats_dir / "dia_object_lc" -
# confirm the catalog is available at this path before running the pipeline.
args = (
    CollectionArguments(
        output_artifact_name="dia_object_collection",
        new_catalog_name="dia_object_lc",
        output_path=hats_dir,
        simple_progress_bar=True,
    )
    .catalog(
        catalog_path=hats_dir / "dia_object_collection" / "dia_object_lc",
    )
    .add_margin(margin_threshold=5.0, is_default=True)
    .add_index(indexing_column="diaObjectId")
)
# Run the collection pipeline on the Dask client created at the top of the notebook
runner.pipeline_with_client(args, client)

In [None]:
# Describe the "object_collection" collection: the object_lc catalog plus a
# default margin (threshold 5.0) and an index on "objectId".
# NOTE(review): `catalog_path` points inside the collection directory
# (hats_dir / "object_collection" / "object_lc"), whereas the to_hats cell of
# this notebook writes the catalog to hats_dir / "object_lc" - confirm the
# catalog is available at this path before running the pipeline.
args = (
    CollectionArguments(
        output_artifact_name="object_collection",
        new_catalog_name="object_lc",
        output_path=hats_dir,
        simple_progress_bar=True,
    )
    .catalog(
        catalog_path=hats_dir / "object_collection" / "object_lc",
    )
    .add_margin(margin_threshold=5.0, is_default=True)
    .add_index(indexing_column="objectId")
)
# Run the collection pipeline on the Dask client created at the top of the notebook
runner.pipeline_with_client(args, client)

In [59]:
# Shut down the Dask client first, then delete the temporary directory it
# was given as `local_directory`.
client.close()
tmp_path.cleanup()