# Import to HATS

Use hats-import to ingest the parquet URLs and create each HATS catalog.

In [1]:
import os
import tempfile
import hats_import.pipeline as runner

from pathlib import Path
from dask.distributed import Client
from hats_import.catalog.arguments import ImportArguments
from dimension_reader import DimensionParquetReader

In [2]:
# Run configuration comes from the environment so the notebook can be re-run
# for another release without editing any code. A missing variable raises
# KeyError here, before any import work starts.
DRP_VERSION = os.environ["DRP_VERSION"]
COLLECTION_TAG = os.environ["COLLECTION_TAG"]
print(f"DRP_VERSION: {DRP_VERSION}")
print(f"COLLECTION_TAG: {COLLECTION_TAG}")
# Plain string literal: the path has no placeholders, so no f-string is needed.
base_output_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning")
# Collection name assembled from the version and tag above.
collections = f"LSSTComCam/runs/DRP/DP1/{DRP_VERSION}/{COLLECTION_TAG}"

DRP_VERSION: w_2025_08
COLLECTION_TAG: DM-49029


In [3]:
# Inputs are looked up under <base>/raw/<DRP_VERSION>; the HATS catalogs are
# written under <base>/hats/<DRP_VERSION>, which is created if it is missing.
raw_dir = base_output_dir.joinpath("raw", DRP_VERSION)
hats_dir = base_output_dir.joinpath("hats", DRP_VERSION)
hats_dir.mkdir(exist_ok=True, parents=True)

In [4]:
# Temporary directory handed to the Dask client as its `local_directory`.
# Keep a reference to `tmp_path`: the final cell calls `tmp_path.cleanup()`.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name
client = Client(
    local_directory=tmp_dir,
    n_workers=16,
    threads_per_worker=1,
)

### Helper methods

In [5]:
def get_paths(dataset_type):
    """Return the index CSV files for one dataset type, sorted by path.

    Looks for ``*.csv`` files directly inside ``raw_dir / "index" / dataset_type``
    (``raw_dir`` is the notebook-level raw data directory).

    Args:
        dataset_type: Name of the sub-directory of ``raw_dir / "index"`` to list,
            e.g. ``"objectTable"``.

    Returns:
        A list of ``Path`` objects in ascending path order; empty if no CSV
        file matches.
    """
    index_dir = raw_dir / "index" / dataset_type
    # Path.glob yields entries in filesystem-dependent order; sorting makes the
    # input list (and therefore the pipeline's task order) reproducible.
    return sorted(index_dir.glob("*.csv"))

#### DiaObject

In [6]:
# Column subset passed to the reader as `column_names` for the DiaObject import.
diaObj_default_columns = [
    "diaObjectId",
    "ra",
    "dec",
    "nDiaSources",
    "radecMjdTai",
]

# Gather the import settings in one mapping, then expand it into ImportArguments.
dia_object_settings = {
    "output_path": hats_dir,
    "output_artifact_name": "diaObject",
    "input_file_list": get_paths("diaObjectTable_tract"),
    "file_reader": DimensionParquetReader(column_names=diaObj_default_columns),
    "ra_column": "ra",
    "dec_column": "dec",
    "catalog_type": "object",
    "resume": False,
    "pixel_threshold": 2_000_000,
}
args = ImportArguments(**dia_object_settings)
runner.pipeline_with_client(args, client)

Planning  :   0%|          | 0/4 [00:00<?, ?it/s]

Mapping   :   0%|          | 0/25 [00:00<?, ?it/s]

Binning   :   0%|          | 0/2 [00:00<?, ?it/s]

Splitting :   0%|          | 0/25 [00:00<?, ?it/s]

Reducing  :   0%|          | 0/4 [00:00<?, ?it/s]

Finishing :   0%|          | 0/5 [00:00<?, ?it/s]

#### DiaSource

In [7]:
# Import the "diaSourceTable_tract" index files as the "diaSource" catalog
# (catalog type "source") under `hats_dir`.
dia_source_index_files = get_paths("diaSourceTable_tract")
args = ImportArguments(
    catalog_type="source",
    ra_column="ra",
    dec_column="dec",
    input_file_list=dia_source_index_files,
    file_reader=DimensionParquetReader(),
    output_path=hats_dir,
    output_artifact_name="diaSource",
    pixel_threshold=2_000_000,
    resume=False,
)
runner.pipeline_with_client(args, client)

Planning  :   0%|          | 0/4 [00:00<?, ?it/s]

Mapping   :   0%|          | 0/25 [00:00<?, ?it/s]

Binning   :   0%|          | 0/2 [00:00<?, ?it/s]

Splitting :   0%|          | 0/25 [00:00<?, ?it/s]

Reducing  :   0%|          | 0/6 [00:00<?, ?it/s]

Finishing :   0%|          | 0/5 [00:00<?, ?it/s]

#### DiaForcedSource

In [8]:
# Import the "forcedSourceOnDiaObjectTable" index files as the
# "diaForcedSource" catalog (catalog type "source") under `hats_dir`.
args = ImportArguments(
    output_path=hats_dir,
    output_artifact_name="diaForcedSource",
    input_file_list=get_paths("forcedSourceOnDiaObjectTable"),
    file_reader=DimensionParquetReader(),
    ra_column="coord_ra",
    dec_column="coord_dec",
    catalog_type="source",
    # Every other import cell in this notebook passes resume=False. Without it
    # this cell alone falls back to the library's default resume behaviour, so
    # a re-run could behave differently from its siblings.
    resume=False,
    pixel_threshold=5_000_000,
    # Only this import sets an explicit highest_healpix_order.
    highest_healpix_order=12,
)
runner.pipeline_with_client(args, client)

Planning  :   0%|          | 0/4 [00:00<?, ?it/s]

Mapping   :   0%|          | 0/25 [00:00<?, ?it/s]

Binning   :   0%|          | 0/2 [00:00<?, ?it/s]

Splitting :   0%|          | 0/25 [00:00<?, ?it/s]

Reducing  :   0%|          | 0/251 [00:00<?, ?it/s]

Finishing :   0%|          | 0/5 [00:00<?, ?it/s]

#### Object

In [9]:
# Per-band measurement columns. For each of the six bands, in order:
# PSF flux and error, Kron flux and error, then the Kron radius.
cols_per_band = [
    column
    for band in "ugrizy"
    for column in (
        f"{band}_psfFlux",
        f"{band}_psfFluxErr",
        f"{band}_kronFlux",
        f"{band}_kronFluxErr",
        f"{band}_kronRad",
    )
]

# Band-independent columns, listed in the order they should appear.
obj_base_columns = [
    "objectId", "refFwhm", "shape_flag", "sky_object",
    "parentObjectId", "detect_isPrimary",
    "x", "y", "xErr", "yErr",
    "shape_yy", "shape_xx", "shape_xy",
    "coord_ra", "coord_dec", "coord_raErr", "coord_decErr",
    "tract", "patch", "detect_isIsolated",
]

# Full column selection for the Object import: base columns, then per-band ones.
obj_default_columns = [*obj_base_columns, *cols_per_band]

# Bare last expression so the notebook displays the final list.
obj_default_columns

['objectId',
 'refFwhm',
 'shape_flag',
 'sky_object',
 'parentObjectId',
 'detect_isPrimary',
 'x',
 'y',
 'xErr',
 'yErr',
 'shape_yy',
 'shape_xx',
 'shape_xy',
 'coord_ra',
 'coord_dec',
 'coord_raErr',
 'coord_decErr',
 'tract',
 'patch',
 'detect_isIsolated',
 'u_psfFlux',
 'u_psfFluxErr',
 'u_kronFlux',
 'u_kronFluxErr',
 'u_kronRad',
 'g_psfFlux',
 'g_psfFluxErr',
 'g_kronFlux',
 'g_kronFluxErr',
 'g_kronRad',
 'r_psfFlux',
 'r_psfFluxErr',
 'r_kronFlux',
 'r_kronFluxErr',
 'r_kronRad',
 'i_psfFlux',
 'i_psfFluxErr',
 'i_kronFlux',
 'i_kronFluxErr',
 'i_kronRad',
 'z_psfFlux',
 'z_psfFluxErr',
 'z_kronFlux',
 'z_kronFluxErr',
 'z_kronRad',
 'y_psfFlux',
 'y_psfFluxErr',
 'y_kronFlux',
 'y_kronFluxErr',
 'y_kronRad']

In [10]:
# Reader restricted to the Object column selection built in the previous cell.
object_reader = DimensionParquetReader(
    column_names=obj_default_columns,
    chunksize=250_000,
)
# Import the "objectTable" index files as the "object" catalog under `hats_dir`.
args = ImportArguments(
    catalog_type="object",
    ra_column="coord_ra",
    dec_column="coord_dec",
    input_file_list=get_paths("objectTable"),
    file_reader=object_reader,
    output_path=hats_dir,
    output_artifact_name="object",
    pixel_threshold=200_000,
    resume=False,
)
runner.pipeline_with_client(args, client)

Planning  :   0%|          | 0/4 [00:00<?, ?it/s]

Mapping   :   0%|          | 0/29 [00:00<?, ?it/s]

Binning   :   0%|          | 0/2 [00:00<?, ?it/s]

Splitting :   0%|          | 0/29 [00:00<?, ?it/s]

Reducing  :   0%|          | 0/89 [00:00<?, ?it/s]

Finishing :   0%|          | 0/5 [00:00<?, ?it/s]

#### Source

This is the one that's going to get much worse very quickly. The `sourceTable` dataset's dimension is the visit, so each file is very small and there are LOTS of them.

```
Planning  : 100% 4/4 [00:00<00:00, 123.68it/s]
Mapping   : 100% 16471/16471 [04:25<00:00,  1.77it/s]
Binning   : 100% 2/2 [00:38<00:00, 17.09s/it]
Splitting : 100% 16471/16471 [28:41<00:00,  1.64s/it]
Reducing  : 100% 148/148 [04:30<00:00,  2.21s/it]
Finishing : 100% 5/5 [00:24<00:00,  8.99s/it]
```

Solutions:

- Use the `IndexedParquetReader`. We can aggregate each index file by something like tract/patch of the visit, to reduce intermediate file usage.
- Escalate to DM. This is going to be ROUGH for everyone if there is no aggregation.

In [11]:
# Import the "sourceTable" index files as the "source" catalog
# (catalog type "source") under `hats_dir`.
source_settings = {
    "output_path": hats_dir,
    "output_artifact_name": "source",
    "input_file_list": get_paths("sourceTable"),
    "file_reader": DimensionParquetReader(),
    "ra_column": "ra",
    "dec_column": "dec",
    "catalog_type": "source",
    "resume": False,
    "pixel_threshold": 1_000_000,
}
args = ImportArguments(**source_settings)
runner.pipeline_with_client(args, client)

Planning  :   0%|          | 0/4 [00:00<?, ?it/s]

Mapping   :   0%|          | 0/75 [00:00<?, ?it/s]

Binning   :   0%|          | 0/2 [00:00<?, ?it/s]

Splitting :   0%|          | 0/75 [00:00<?, ?it/s]

Reducing  :   0%|          | 0/117 [00:00<?, ?it/s]

Finishing :   0%|          | 0/5 [00:00<?, ?it/s]

#### ForcedSource

In [6]:
# Import the "forcedSourceTable" index files as the "forcedSource" catalog
# (catalog type "source") under `hats_dir`.
forced_source_index_files = get_paths("forcedSourceTable")
args = ImportArguments(
    catalog_type="source",
    ra_column="coord_ra",
    dec_column="coord_dec",
    input_file_list=forced_source_index_files,
    file_reader=DimensionParquetReader(),
    output_path=hats_dir,
    output_artifact_name="forcedSource",
    pixel_threshold=8_000_000,
    resume=False,
)
runner.pipeline_with_client(args, client)

Planning  :   0%|          | 0/4 [00:00<?, ?it/s]

Mapping   :   0%|          | 0/29 [00:00<?, ?it/s]

Binning   :   0%|          | 0/2 [00:00<?, ?it/s]

Splitting :   0%|          | 0/29 [00:00<?, ?it/s]

Reducing  :   0%|          | 0/197 [00:00<?, ?it/s]

Finishing :   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
# Shut down the Dask client, then remove the temporary directory it was given.
# try/finally ensures the directory is removed even if closing the client raises.
try:
    client.close()
finally:
    tmp_path.cleanup()