## HATS catalogs in Butler

In [None]:
import hats
import numpy as np
import lsst.daf.butler as dafButler
import lsdb
import pandas as pd
from hats.io.paths import pixel_catalog_file

In [None]:
# Path of the Butler repository created and used throughout this notebook.
repo = "test_butler"
# Butler dataset type name; also used below as the local HATS catalog directory.
dataset_type = "dia_object"
# Collection the files are ingested into (passed as RUN to `butler ingest-files`).
collection_name = "u/stavar/imports"

### Create butler

In [None]:
# Create the Butler repository at the location given by `repo`.
# Usage: butler create REPO
!butler create {repo}

### Register HEALPix dataset type

In [None]:
# Register `dataset_type` with storage class ArrowTable and the single
# dimension healpix17.
# Usage: butler register-dataset-type REPO DATASET_TYPE STORAGE_CLASS DIMENSIONS
!butler register-dataset-type {repo} {dataset_type} ArrowTable healpix17

### Prepare files for upload to Butler

In [None]:
# Open the local HATS catalog; its directory is named after `dataset_type`.
catalog = hats.read_hats(dataset_type)
# One HEALPix pixel per catalog partition.
pixels = catalog.get_healpix_pixels()
# Resolve the data file path of every partition inside the catalog directory.
pixel_paths = [pixel_catalog_file(dataset_type, pix) for pix in pixels]
pixel_paths

The Butler ParquetFormatter only accepts parquet files with the ".parq" extension:

In [None]:
# Give every pixel file the ".parq" suffix on disk and keep `pixel_paths`
# pointing at the renamed files (the list is updated in place).
for idx in range(len(pixel_paths)):
    old_path = pixel_paths[idx]
    parq_path = old_path.with_suffix(".parq")
    old_path.rename(parq_path)
    print(f"Renamed: {old_path} → {parq_path}")
    pixel_paths[idx] = parq_path

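The Butler accepts HEALPix dimensions only up to order 17, so each pixel is mapped to an order-17 index by multiplying its pixel number by 4^(17 − order):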

In [None]:
def get_healpix17(order, pixel):
    """Scale a HEALPix pixel index at ``order`` to an index at order 17.

    The result is ``pixel * 4 ** (17 - order)``; assuming nested pixel
    numbering, that is the lowest-numbered order-17 pixel inside ``pixel``.

    Parameters
    ----------
    order : int
        HEALPix order of ``pixel``. Must be between 0 and 17 inclusive.
    pixel : int
        Pixel index at ``order``.

    Returns
    -------
    numpy.int64
        The pixel index expressed at order 17.

    Raises
    ------
    ValueError
        If ``order`` is outside ``[0, 17]``. Orders above 17 cannot be
        represented at order 17, and negative orders are not valid.
    """
    order = np.int64(order)
    pixel = np.int64(pixel)
    # Fail early with a readable message instead of numpy's
    # "Integers to negative integer powers are not allowed".
    if order < 0 or order > 17:
        raise ValueError(f"HEALPix order must be between 0 and 17, got {order}")
    return pixel * (4 ** (17 - order))

# Convert every catalog pixel to its order-17 index through get_healpix17
# rather than repeating the formula inline. An order above 17 then raises
# instead of silently producing a fractional value. int() keeps the results
# as plain Python ints for the CSV table and the Butler data IDs.
orders = [pix.order for pix in pixels]
pixs = [pix.pixel for pix in pixels]
all_healpix17 = [int(get_healpix17(o, p)) for o, p in zip(orders, pixs)]
all_healpix17

In [None]:
# Each pixel file needs its own healpix17 value, since that value is what
# identifies the file in the ingest table and in later Butler lookups.
n_unique = len(set(all_healpix17))
if n_unique != len(all_healpix17):
    raise ValueError("Cannot proceed - level 17 is not enough")
# Mapping: pixel file path -> healpix17 index.
healpix17_path_dict = {path: hp17 for path, hp17 in zip(pixel_paths, all_healpix17)}
healpix17_path_dict

### Generate table with files to ingest

In [None]:
# Table consumed by `butler ingest-files`: the first column is the file
# location, the second the healpix17 value identifying that file.
ingest_df = pd.DataFrame(list(healpix17_path_dict.items()), columns=["file URI", "healpix17"])
# index=False: the row index is not part of the ingest table.
ingest_df.to_csv("ingest_files.csv", index=False)
ingest_df

### Upload files

In [None]:
# Ingest the files listed in ingest_files.csv as `dataset_type` datasets,
# using `collection_name` as the RUN argument.
# Usage: butler ingest-files REPO DATASET_TYPE RUN FILE_TABLE
!butler --log-level VERBOSE ingest-files {repo} {dataset_type} {collection_name} ingest_files.csv

### Update local directory

Let's delete the original files from local storage and replace them with symlinks to the files in the Butler datastore:

In [None]:
# Open the repository with `collection_name` as the collection to search.
my_butler = dafButler.Butler(repo, collections=collection_name)
# List the datasets of type `dataset_type` to check what was ingested.
refs = my_butler.query_datasets(dataset_type)
refs

We also need to rename the local symlinks to have the ".parquet" extension, so that the catalog can be read again with HATS/LSDB:

In [None]:
# Replace every ingested local file with a symlink to its Butler-managed
# counterpart, then give the symlink the ".parquet" suffix.
for file, healpix17 in healpix17_path_dict.items():
    butler_uri = my_butler.getURI(dataset_type, dataId={"healpix17": healpix17})
    # Never delete the local file unless the Butler artifact is really there;
    # otherwise the data would be lost and the symlink left dangling.
    if not butler_uri.exists():
        raise FileNotFoundError(
            f"Butler artifact for healpix17={healpix17} not found at {butler_uri}; "
            f"keeping local file {file}"
        )
    # NOTE(review): `.path` is the URI path component; for local files
    # `.ospath` may be the safer accessor — confirm.
    butler_filepath = butler_uri.path
    file.unlink()
    file.symlink_to(butler_filepath)
    print(f"Created symlink for: {butler_filepath}")
    updated_extension_path = file.with_suffix(".parquet")
    file.rename(updated_extension_path)
    print(f"Updated extension: {updated_extension_path}")

### Read with LSDB

In [None]:
# Read the catalog from the same directory used earlier in the notebook
# (`dataset_type`), so a change to the config cell is picked up here too.
dia_object = lsdb.read_hats(dataset_type)
dia_object

In [None]:
# Show the first rows of the catalog loaded above.
dia_object.head()