## Import PPDB base catalogs

Convert ("hatsify") the DIA object, DIA source, and DIA forced source tables from the PPDB (Prompt Products Database) into HATS catalogs.

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import tempfile

from dask.distributed import Client
from hats_import import pipeline_with_client
from hats_import.collection.arguments import ImportArguments
from pathlib import Path

In [None]:
# Root directory that get_paths() searches recursively for the input parquet files.
ppdb_dir = Path("/sdf/scratch/rubin/ppdb/data/lsstcam")
# Directory passed as output_path to every ImportArguments call below.
hats_dir = Path("/sdf/data/rubin/shared/lsdb_commissioning/hats/PPDB_v1")

In [None]:
# Scratch directory for this session. Keep `tmp_path` referenced: the directory
# is removed once the TemporaryDirectory object is cleaned up.
tmp_path = tempfile.TemporaryDirectory()
# Path form of the scratch directory; download_schema() writes its schema files here.
tmp_dir = Path(tmp_path.name)
# Dask client with 16 workers of one thread each; tmp_dir is passed as its local_directory.
client = Client(n_workers=16, threads_per_worker=1, local_directory=tmp_dir)

In [None]:
def get_paths(dataset_type):
    """List every parquet file under ``ppdb_dir`` for one dataset type.

    The snake_case ``dataset_type`` (e.g. ``"dia_object"``) is turned into the
    CamelCase file stem (``"DiaObject"``), and ``ppdb_dir`` is searched
    recursively for ``<stem>.parquet``. The number of matches is printed.

    Returns the matching paths as a sorted list.
    """
    camel_parts = [part.capitalize() for part in dataset_type.split("_")]
    parquet_name = "".join(camel_parts) + ".parquet"
    matches = list(ppdb_dir.rglob(parquet_name))
    matches.sort()
    print(f"Found {len(matches)} {dataset_type} parquet files")
    return matches

def download_schema(dataset_type, parquet_filepath):
    """Write a zero-row parquet file carrying the schema of ``parquet_filepath``.

    The schema is read from the given parquet file and passed through
    ``remove_metadata()``. It is then rebuilt as an empty table with one
    zero-length column per field (same name and type) and saved as
    ``<dataset_type>_schema.parquet`` inside ``tmp_dir``.

    Returns the path of the schema file that was written.
    """
    with open(parquet_filepath, "rb") as handle:
        source_schema = pq.read_schema(handle).remove_metadata()

    # One empty array per field, keyed by field name.
    empty_columns = {}
    for field in source_schema:
        empty_columns[field.name] = pa.array([], type=field.type)
    empty_table = pa.table(empty_columns)

    target = tmp_dir / f"{dataset_type}_schema.parquet"
    pq.write_table(empty_table, target)
    print(f"Wrote {dataset_type} schema to {target}")
    return target

def get_paths_and_schema(dataset_type):
    """Collect the input files and a schema file for one dataset type.

    Looks up all parquet files for ``dataset_type`` with ``get_paths`` and
    writes a schema file from the first of them with ``download_schema``.

    Returns a ``(paths, schema_path)`` tuple: the sorted list of input files
    and the path of the schema file.

    Raises:
        FileNotFoundError: if no parquet file exists for ``dataset_type``.
    """
    paths = get_paths(dataset_type)
    if not paths:
        # Fail with a clear message instead of a bare IndexError on paths[0].
        raise FileNotFoundError(
            f"No {dataset_type} parquet files found under {ppdb_dir}"
        )
    # Use the first parquet schema as the final schema
    schema_path = download_schema(dataset_type, paths[0])
    return paths, schema_path

### dia_object

In [None]:
# Find the DiaObject parquet files and write the schema file used by the import below.
paths, schema_path = get_paths_and_schema("dia_object")

In [None]:
# Import the DIA object files as the "dia_object" catalog under hats_dir,
# then run the import pipeline on the shared Dask client.
args = ImportArguments(
    # Input files, their reader, and the schema file written above.
    input_file_list=paths,
    file_reader="parquet",
    use_schema_file=schema_path,
    # Sky-coordinate columns.
    ra_column="ra",
    dec_column="dec",
    # Output location and catalog type.
    output_path=hats_dir,
    output_artifact_name="dia_object",
    catalog_type="object",
    # NOTE(review): presumably the maximum number of rows per partition;
    # confirm against the hats-import documentation.
    pixel_threshold=5_000_000,
    # Run options: start from scratch rather than resuming a previous run.
    simple_progress_bar=True,
    resume=False,
)
pipeline_with_client(args, client)

### dia_source

In [None]:
# Find the DiaSource parquet files and write the schema file used by the import below.
paths, schema_path = get_paths_and_schema("dia_source")

In [None]:
# Import the DIA source files as the "dia_source" catalog under hats_dir,
# then run the import pipeline on the shared Dask client.
args = ImportArguments(
    # Input files, their reader, and the schema file written above.
    input_file_list=paths,
    file_reader="parquet",
    use_schema_file=schema_path,
    # Sky-coordinate columns.
    ra_column="ra",
    dec_column="dec",
    # Output location and catalog type.
    output_path=hats_dir,
    output_artifact_name="dia_source",
    catalog_type="source",
    # NOTE(review): presumably the maximum number of rows per partition;
    # confirm against the hats-import documentation.
    pixel_threshold=4_000_000,
    # Run options: start from scratch rather than resuming a previous run.
    simple_progress_bar=True,
    resume=False,
)
pipeline_with_client(args, client)

### dia_forced_source

In [None]:
# Find the DiaForcedSource parquet files and write the schema file used by the import below.
paths, schema_path = get_paths_and_schema("dia_forced_source")

In [None]:
# Import the DIA forced source files as the "dia_forced_source" catalog under
# hats_dir, then run the import pipeline on the shared Dask client.
args = ImportArguments(
    # Input files, their reader, and the schema file written above.
    input_file_list=paths,
    file_reader="parquet",
    use_schema_file=schema_path,
    # Sky-coordinate columns.
    ra_column="ra",
    dec_column="dec",
    # Output location and catalog type.
    output_path=hats_dir,
    output_artifact_name="dia_forced_source",
    catalog_type="source",
    # NOTE(review): presumably the maximum number of rows per partition;
    # confirm against the hats-import documentation.
    pixel_threshold=25_000_000,
    # Run options: start from scratch rather than resuming a previous run.
    simple_progress_bar=True,
    resume=False,
)
pipeline_with_client(args, client)

In [None]:
# Shut down the Dask client first, then delete the scratch directory created
# with tempfile.TemporaryDirectory() (Dask local_directory and schema files).
# Without the explicit cleanup it lingers until the kernel exits.
client.close()
tmp_path.cleanup()