# Import to HATS

Use hats-import to ingest the parquet URLs and create each HATS catalog.

In [None]:
import os
import tempfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from dask.distributed import Client
from dimension_reader import DimensionParquetReader
from hats_import import pipeline_with_client
from hats_import.catalog.arguments import ImportArguments
from pathlib import Path

In [None]:
# Run configuration comes from environment variables; a missing variable
# raises KeyError on lookup.
INSTRUMENT, RUN, VERSION, COLLECTION = (
    os.environ[name] for name in ("INSTRUMENT", "RUN", "VERSION", "COLLECTION")
)
OUTPUT_DIR = Path(os.environ["OUTPUT_DIR"])

# Echo the configuration, one setting per line.
print(
    f"INSTRUMENT: {INSTRUMENT}",
    f"RUN: {RUN}",
    f"VERSION: {VERSION}",
    f"COLLECTION: {COLLECTION}",
    f"OUTPUT_DIR: {OUTPUT_DIR}",
    sep="\n",
)

# Collection string assembled from the settings above.
collections = "/".join([INSTRUMENT, "runs", "DRP", RUN, VERSION, COLLECTION])

In [None]:
# Per-version working directories under OUTPUT_DIR.
raw_dir = OUTPUT_DIR.joinpath("raw", VERSION)
hats_dir = OUTPUT_DIR.joinpath("hats", VERSION)
# Only the HATS output directory is created here (no error if it exists).
hats_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Temporary directory handed to the Dask client as its local directory.
# It is cleaned up explicitly in the notebook's last cell.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name
client = Client(
    n_workers=16,
    threads_per_worker=1,
    memory_limit="8GB",
    local_directory=tmp_dir,
)

### Helper functions

In [None]:
from lsst.resources import ResourcePath

def get_paths(dataset_type):
    """Return the index CSV files for a dataset type, sorted by path.

    Parameters
    ----------
    dataset_type : str
        Name of the sub-directory of ``raw_dir / "index"`` to search.

    Returns
    -------
    list of pathlib.Path
        Every ``*.csv`` file directly inside that directory, in sorted order.
        The list is empty when nothing matches.
    """
    index_dir = raw_dir / "index" / dataset_type
    # Path.glob yields entries in no guaranteed order; sorting makes the file
    # list (and whichever file a caller treats as the first) reproducible.
    return sorted(index_dir.glob("*.csv"))


def download_dataset_schema(
    dataset_type, columns_to_select=None, dimension_columns=None
):
    """Write an empty Parquet file carrying the schema to use for a dataset.

    The schema is read from the first Parquet path listed in
    ``raw_dir / "paths" / f"{dataset_type}.txt"``, stripped of its metadata,
    optionally reduced to ``columns_to_select`` and extended with
    ``dimension_columns``. The result is written to
    ``raw_dir / f"{dataset_type}_schema.parquet"``.

    Parameters
    ----------
    dataset_type : str
        Dataset name; selects both the paths file and the output file name.
    columns_to_select : collection of str, optional
        Columns to keep, in the order they have in the source schema.
        ``None`` keeps every column.
    dimension_columns : iterable of str, optional
        Extra columns to append when they are not already in the schema.
        ``None`` appends nothing.

    Raises
    ------
    ValueError
        If the first line of the paths file is empty or blank.
    """
    paths_file = raw_dir / "paths" / f"{dataset_type}.txt"
    with open(paths_file, "r") as file:
        # Only the first listed file is used as the schema reference.
        single_parquet_path = file.readline().strip()
    if not single_parquet_path:
        # Fail here with a clear message instead of handing an empty path to
        # ResourcePath and failing obscurely further down.
        raise ValueError(f"No parquet path found on the first line of {paths_file}")
    with ResourcePath(single_parquet_path).open("rb") as file:
        schema = pq.read_schema(file).remove_metadata()
    # Zero-row table with one empty, correctly typed column per schema field.
    schema_table = pa.table(
        {field.name: pa.array([], type=field.type) for field in schema}
    )
    schema_table = _select_desired_columns(schema_table, columns_to_select)
    schema_table = _add_dimensions_to_schema(schema_table, dimension_columns)
    pq.write_table(schema_table, raw_dir / f"{dataset_type}_schema.parquet")


def _select_desired_columns(schema_table, columns_to_select=None):
    # Select subset of columns keeping the order from the original schema.
    if columns_to_select is not None:
        ordered_columns = [
            col for col in schema_table.column_names if col in columns_to_select
        ]
        schema_table = schema_table.select(ordered_columns)
    return schema_table


def _add_dimensions_to_schema(schema_table, dimension_columns=None):
    # Add dimension columns to the schema (e.g. tract and/or patch).
    if dimension_columns is not None:
        for dimension_column in dimension_columns:
            if dimension_column not in schema_table.column_names:
                schema_table = schema_table.append_column(
                    dimension_column, pa.array([], type=pa.int64())
                )
    return schema_table

#### dia_object

We found that the columns of the `dia_object` table can appear in a different order from one input file to the next. To guarantee a consistent Arrow schema, we take the schema from a single Parquet file and use it throughout the import pipeline.

In [None]:
dia_obj_files = get_paths("dia_object")
if not dia_obj_files:
    # Fail with a clear message instead of an IndexError on dia_obj_files[0].
    raise FileNotFoundError('get_paths("dia_object") returned no index CSV files')
dia_obj_default_columns = ["diaObjectId", "ra", "dec", "nDiaSources"]
# Every index-CSV column except "path" is treated as a dimension column.
# Only the header is read (nrows=0), and the names are kept as a list in
# header order: iterating a set of strings can give a different order on each
# run, which would make the column order of the generated schema vary.
dia_obj_dimension_columns = [
    column
    for column in pd.read_csv(dia_obj_files[0], nrows=0).columns
    if column != "path"
]

# To import all columns set dia_obj_default_columns to None.
download_dataset_schema(
    "dia_object", dia_obj_default_columns, dia_obj_dimension_columns
)

In [None]:
# Import the dia_object catalog with the shared Dask client.
args = ImportArguments(
    # --- input ---
    input_file_list=dia_obj_files,
    # To import all columns remove `column_names` argument.
    file_reader=DimensionParquetReader(column_names=dia_obj_default_columns),
    # Use the final schema previously constructed.
    use_schema_file=raw_dir / "dia_object_schema.parquet",
    # --- sky coordinates ---
    ra_column="ra",
    dec_column="dec",
    # --- output catalog ---
    output_path=hats_dir,
    output_artifact_name="dia_object",
    catalog_type="object",
    pixel_threshold=5_000_000,
    # --- run behaviour ---
    resume=False,
    simple_progress_bar=True,
)
pipeline_with_client(args, client)

#### dia_source

In [None]:
# Import the dia_source catalog with the shared Dask client.
args = ImportArguments(
    # --- input ---
    input_file_list=get_paths("dia_source"),
    file_reader=DimensionParquetReader(),
    # --- sky coordinates ---
    ra_column="ra",
    dec_column="dec",
    # --- output catalog ---
    output_path=hats_dir,
    output_artifact_name="dia_source",
    catalog_type="source",
    pixel_threshold=4_000_000,
    # --- run behaviour ---
    resume=False,
    simple_progress_bar=True,
)
pipeline_with_client(args, client)

#### dia_object_forced_source

In [None]:
# Import the dia_object_forced_source catalog with the shared Dask client.
args = ImportArguments(
    # --- input ---
    input_file_list=get_paths("dia_object_forced_source"),
    file_reader=DimensionParquetReader(),
    # --- sky coordinates ---
    ra_column="coord_ra",
    dec_column="coord_dec",
    # --- output catalog ---
    output_path=hats_dir,
    output_artifact_name="dia_object_forced_source",
    catalog_type="source",
    pixel_threshold=25_000_000,
    # --- run behaviour ---
    resume=False,
    simple_progress_bar=True,
)
pipeline_with_client(args, client)

#### object

In [None]:
# Per-band columns: flux and flux error for the psf and kron measurements,
# followed by the kron radius, for each of the six bands in order.
cols_per_band = []
for band in "ugrizy":
    flux_columns = [
        f"{band}_{flux_type}{suffix}"
        for flux_type in ("psf", "kron")
        for suffix in ("Flux", "FluxErr")
    ]
    cols_per_band += [*flux_columns, f"{band}_kronRad"]

# Band-independent columns come first, then the per-band columns.
obj_default_columns = [
    # identifiers and flags
    "objectId",
    "refBand",
    "shape_flag",
    "sky_object",
    "parentObjectId",
    # pixel position and its errors
    "x",
    "y",
    "xErr",
    "yErr",
    # shape columns
    "shape_yy",
    "shape_xx",
    "shape_xy",
    # sky coordinates and their errors
    "coord_ra",
    "coord_dec",
    "coord_raErr",
    "coord_decErr",
    # dimension columns
    "tract",
    "patch",
    "detect_isIsolated",
    *cols_per_band,
]

As with `dia_object` above, obtain the schema for the `object` dataset from a single Parquet file:

In [None]:
obj_files = get_paths("object")
if not obj_files:
    # Fail with a clear message instead of an IndexError on obj_files[0].
    raise FileNotFoundError('get_paths("object") returned no index CSV files')
# Every index-CSV column except "path" is treated as a dimension column.
# Only the header is read (nrows=0), and the names are kept as a list in
# header order: iterating a set of strings can give a different order on each
# run, which would make the column order of the generated schema vary.
obj_dimension_columns = [
    column
    for column in pd.read_csv(obj_files[0], nrows=0).columns
    if column != "path"
]

# To import all columns set obj_default_columns to None.
download_dataset_schema("object", obj_default_columns, obj_dimension_columns)

In [None]:
# Import the object catalog with the shared Dask client.
args = ImportArguments(
    # --- input ---
    input_file_list=obj_files,
    # To import all columns remove `column_names` argument.
    file_reader=DimensionParquetReader(
        chunksize=250_000, column_names=obj_default_columns
    ),
    # Use the final schema previously constructed.
    use_schema_file=raw_dir / "object_schema.parquet",
    # --- sky coordinates ---
    ra_column="coord_ra",
    dec_column="coord_dec",
    # --- output catalog ---
    output_path=hats_dir,
    output_artifact_name="object",
    catalog_type="object",
    pixel_threshold=300_000,
    # --- run behaviour ---
    resume=False,
    simple_progress_bar=True,
)
pipeline_with_client(args, client)

#### source

In [None]:
# Import the source2 catalog with the shared Dask client.
args = ImportArguments(
    # --- input ---
    input_file_list=get_paths("source2"),
    file_reader=DimensionParquetReader(),
    # --- sky coordinates ---
    ra_column="ra",
    dec_column="dec",
    # --- output catalog ---
    output_path=hats_dir,
    output_artifact_name="source2",
    catalog_type="source",
    pixel_threshold=1_000_000,
    # --- run behaviour ---
    resume=False,
    simple_progress_bar=True,
)
pipeline_with_client(args, client)

#### object_forced_source

In [None]:
# Import the object_forced_source catalog with the shared Dask client.
args = ImportArguments(
    # --- input ---
    input_file_list=get_paths("object_forced_source"),
    file_reader=DimensionParquetReader(),
    # --- sky coordinates ---
    ra_column="coord_ra",
    dec_column="coord_dec",
    # --- output catalog ---
    output_path=hats_dir,
    output_artifact_name="object_forced_source",
    catalog_type="source",
    pixel_threshold=25_000_000,
    # --- run behaviour ---
    resume=False,
    simple_progress_bar=True,
)
pipeline_with_client(args, client)

In [None]:
# Close the Dask client, then delete the temporary directory. The try/finally
# ensures the directory is removed even if closing the client raises.
try:
    client.close()
finally:
    tmp_path.cleanup()