# Import to HATS

Use `hats-import` to ingest the parquet URLs and create one HATS catalog per dataset type.

In [1]:
import os
import tempfile
import hats_import.pipeline as runner

from pathlib import Path
from dask.distributed import Client
from hats_import.catalog.arguments import ImportArguments
from dimension_reader import DimensionParquetReader

In [2]:
# Release identifiers come from the environment; a missing variable raises
# KeyError right here instead of surfacing later as a malformed path.
DRP_VERSION = os.environ["DRP_VERSION"]
COLLECTION_TAG = os.environ["COLLECTION_TAG"]
print(f"DRP_VERSION: {DRP_VERSION}")
print(f"COLLECTION_TAG: {COLLECTION_TAG}")

# Root directory for this notebook's data. A plain string literal is enough:
# the path has no placeholders, so no f-string is needed.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning")

# Collection string built from the version and tag above.
# NOTE(review): not referenced by the other cells shown in this notebook.
collections = f"LSSTComCam/runs/DRP/DP1/{DRP_VERSION}/{COLLECTION_TAG}"

In [3]:
# Per-release locations under the shared base directory: `raw_dir` is where
# inputs are looked up, `hats_dir` is where the catalogs are written.
raw_dir = base_output_dir.joinpath("raw", DRP_VERSION)
hats_dir = base_output_dir.joinpath("hats", DRP_VERSION)

# Create the output tree up front; an already existing directory is accepted.
hats_dir.mkdir(exist_ok=True, parents=True)

In [4]:
# Scratch directory handed to the Dask client. The TemporaryDirectory object
# itself is kept in `tmp_path` so it can be cleaned up explicitly later.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name

# Dask client with 16 workers, one thread per worker.
client = Client(
    local_directory=tmp_dir,
    n_workers=16,
    threads_per_worker=1,
)

### Helper functions

In [5]:
def get_paths(dataset_type):
    """Return the index CSV files for ``dataset_type`` in a stable order.

    Args:
        dataset_type: Name of the sub-directory of ``raw_dir / "index"`` to
            search, e.g. ``"dia_object"``.

    Returns:
        Sorted list of ``Path`` objects matching ``*.csv`` directly inside
        that directory; empty when nothing matches.
    """
    index_dir = raw_dir / "index" / dataset_type
    # Path.glob yields entries in arbitrary, filesystem-dependent order; sort
    # so the input file list is identical from run to run.
    return sorted(index_dir.glob("*.csv"))

#### dia_object

In [6]:
# Column subset passed to the reader for the dia_object catalog.
dia_obj_default_columns = [
    "diaObjectId",
    "ra",
    "dec",
    "nDiaSources",
    "radecMjdTai",
]

# Describe the dia_object import, then run it on the shared Dask client.
args = ImportArguments(
    # Output location, artifact name and catalog type.
    output_path=hats_dir,
    output_artifact_name="dia_object",
    catalog_type="object",
    # Inputs: the index files and the reader given the column subset above.
    input_file_list=get_paths("dia_object"),
    file_reader=DimensionParquetReader(column_names=dia_obj_default_columns),
    # Sky position columns.
    ra_column="ra",
    dec_column="dec",
    # Partitioning settings.
    pixel_threshold=5_000_000,
    highest_healpix_order=0,
    # Run options: start from scratch, plain progress output.
    resume=False,
    simple_progress_bar=True,
)
runner.pipeline_with_client(args, client)

#### dia_source

In [7]:
# Describe the dia_source import, then run it on the shared Dask client.
args = ImportArguments(
    # Output location, artifact name and catalog type.
    output_path=hats_dir,
    output_artifact_name="dia_source",
    catalog_type="source",
    # Inputs: the index files and a reader with its default settings.
    input_file_list=get_paths("dia_source"),
    file_reader=DimensionParquetReader(),
    # Sky position columns.
    ra_column="ra",
    dec_column="dec",
    # Partitioning settings.
    pixel_threshold=4_000_000,
    highest_healpix_order=0,
    # Run options: start from scratch, plain progress output.
    resume=False,
    simple_progress_bar=True,
)
runner.pipeline_with_client(args, client)

#### dia_object_forced_source

In [8]:
# Describe the dia_object_forced_source import, then run it on the shared
# Dask client.
args = ImportArguments(
    # Output location, artifact name and catalog type.
    output_path=hats_dir,
    output_artifact_name="dia_object_forced_source",
    catalog_type="source",
    # Inputs: the index files and a reader with its default settings.
    input_file_list=get_paths("dia_object_forced_source"),
    file_reader=DimensionParquetReader(),
    # Sky position columns (this table uses the coord_ prefix).
    ra_column="coord_ra",
    dec_column="coord_dec",
    # Partitioning settings.
    highest_healpix_order=9,
    pixel_threshold=25_000_000,
    # Run options: start from scratch, plain progress output.
    resume=False,
    simple_progress_bar=True,
)
runner.pipeline_with_client(args, client)

#### object

In [9]:
# Per-band measurement columns. For each band, in order: flux and flux error
# for the psf and kron flux types, followed by the kron radius.
cols_per_band = [
    column
    for band in "ugrizy"
    for column in (
        *(
            f"{band}_{flux_type}{suffix}"
            for flux_type in ("psf", "kron")
            for suffix in ("Flux", "FluxErr")
        ),
        f"{band}_kronRad",
    )
]

# Default column selection for the object catalog: the fixed columns first,
# then every per-band column built above.
obj_default_columns = [
    "objectId",
    "refFwhm",
    "shape_flag",
    "sky_object",
    "parentObjectId",
    "x",
    "y",
    "xErr",
    "yErr",
    "shape_yy",
    "shape_xx",
    "shape_xy",
    "coord_ra",
    "coord_dec",
    "coord_raErr",
    "coord_decErr",
    "tract",
    "patch",
    "detect_isIsolated",
    *cols_per_band,
]

In [10]:
# Describe the object import, then run it on the shared Dask client.
args = ImportArguments(
    # Output location, artifact name and catalog type.
    output_path=hats_dir,
    output_artifact_name="object",
    catalog_type="object",
    # Inputs: the index files and a reader given the default object columns
    # and a chunk size of 250,000.
    input_file_list=get_paths("object"),
    file_reader=DimensionParquetReader(
        chunksize=250_000,
        column_names=obj_default_columns,
    ),
    # Sky position columns.
    ra_column="coord_ra",
    dec_column="coord_dec",
    # Partitioning settings.
    pixel_threshold=300_000,
    highest_healpix_order=7,
    # Run options: start from scratch, plain progress output.
    resume=False,
    simple_progress_bar=True,
)
runner.pipeline_with_client(args, client)

#### source

In [11]:
# Describe the source import, then run it on the shared Dask client.
args = ImportArguments(
    # Output location, artifact name and catalog type.
    output_path=hats_dir,
    output_artifact_name="source",
    catalog_type="source",
    # Inputs: the index files and a reader with its default settings.
    input_file_list=get_paths("source"),
    file_reader=DimensionParquetReader(),
    # Sky position columns.
    ra_column="ra",
    dec_column="dec",
    # Partitioning settings.
    pixel_threshold=1_000_000,
    highest_healpix_order=9,
    # Run options: start from scratch, plain progress output.
    resume=False,
    simple_progress_bar=True,
)
runner.pipeline_with_client(args, client)

#### object_forced_source

In [12]:
# Describe the object_forced_source import, then run it on the shared Dask
# client.
args = ImportArguments(
    # Output location, artifact name and catalog type.
    output_path=hats_dir,
    output_artifact_name="object_forced_source",
    catalog_type="source",
    # Inputs: the index files and a reader with its default settings.
    input_file_list=get_paths("object_forced_source"),
    file_reader=DimensionParquetReader(),
    # Sky position columns (this table uses the coord_ prefix).
    ra_column="coord_ra",
    dec_column="coord_dec",
    # Partitioning settings.
    pixel_threshold=25_000_000,
    highest_healpix_order=9,
    # Run options: start from scratch, plain progress output.
    resume=False,
    simple_progress_bar=True,
)
runner.pipeline_with_client(args, client)

In [13]:
# Close the Dask client first, then delete the temporary directory that was
# passed to it as `local_directory`.
client.close()
tmp_path.cleanup()