# Generate data

Script to generate test data for lsdb_macauff.

In [None]:
import numpy as np
import pandas as pd
from macauff.utils import generate_random_catalogs

import hipscat_import.pipeline as runner
from hipscat_import.catalog.arguments import ImportArguments
import tempfile
from dask.distributed import Client

# Scratch directory for the import pipeline: its path is passed as `tmp_dir`
# to both ImportArguments and as the dask Client's `local_directory` below.
# It is removed by the `tmp_path.cleanup()` call in the final cell.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name

## Generic "catalog a", "catalog b"

We create two catalogs, "catalog a" and "catalog b", with some known counterpart pairings. This is useful to confirm that the crossmatch routine returns the same known counterparts.

This is largely copy-pasted from `macauff/src/macauff/utils.py`.

Modifications are made to:

- construct a single dataframe using the randomly generated data
- construct string identifiers for each object in each catalog
- write out to one CSV per catalog, and one for counterpart IDs.

In [None]:
# Number of sources in each catalog, and the third size argument handed to the
# generator (presumably the number of known counterpart pairs -- confirm
# against macauff).
num_a_source = 50
num_b_source = 100
num_common = 40

# NOTE(review): presumably [ra_min, ra_max, dec_min, dec_max] -- confirm
# against macauff's generate_random_catalogs.
extent = [0, 0.25, 50, 50.3]

# One photometry column per filter is written to each catalog's CSV below.
num_filters_a = 3
num_filters_b = 2

# NOTE(review): presumably the astrometric uncertainty of each catalog -- confirm.
a_uncert = 0.1
b_uncert = 0.3

# Positional inputs, in the order generate_random_catalogs receives them.
generator_inputs = (
    num_a_source,
    num_b_source,
    num_common,
    extent,
    num_filters_a,
    num_filters_b,
    a_uncert,
    b_uncert,
)

# A fixed seed is passed, presumably so the generated files are reproducible.
# The *_astro arrays supply the ra/dec/astro_unc columns, the *_photo arrays
# one column per filter, and the *_pair_indices the row indices of the known
# counterparts (see the CSV steps that follow).
(
    a_astro,
    b_astro,
    a_photo,
    b_photo,
    amagref,
    bmagref,
    a_pair_indices,
    b_pair_indices,
) = generate_random_catalogs(*generator_inputs, seed=5732)

## Make a pretty CSV of catalog A
# Identifiers are "cat_a_3" followed by the row index, zero-padded to three digits.
cat_a_ids = [f"cat_a_3{index :03d}" for index in range(num_a_source)]

# Key order is the CSV column order: identifier, the first three astrometry
# columns, one column per filter, then the magref values.
cat_a_data = {
    "survey_id": cat_a_ids,
    "ra": a_astro[:, 0],
    "dec": a_astro[:, 1],
    "astro_unc": a_astro[:, 2],
    **{f"filter_{index}": a_photo[:, index] for index in range(num_filters_a)},
    "magref": amagref,
}

catalog_a_frame = pd.DataFrame(cat_a_data)
# Written to the working directory without the DataFrame index column.
catalog_a_frame.to_csv("catalog_a.csv", index=False)

## Make a pretty CSV of catalog B
# Identifiers are "cat_b_8" followed by the row index, zero-padded to three digits.
cat_b_ids = [f"cat_b_8{index :03d}" for index in range(num_b_source)]

# Key order is the CSV column order: identifier, the first three astrometry
# columns, one column per filter, then the magref values.
cat_b_data = {
    "survey_id": cat_b_ids,
    "ra": b_astro[:, 0],
    "dec": b_astro[:, 1],
    "astro_unc": b_astro[:, 2],
    **{f"filter_{index}": b_photo[:, index] for index in range(num_filters_b)},
    "magref": bmagref,
}

catalog_b_frame = pd.DataFrame(cat_b_data)
# Written to the working directory without the DataFrame index column.
catalog_b_frame.to_csv("catalog_b.csv", index=False)

## Make a pretty CSV of counterparts
# Translate the paired row indices into the same string identifiers used in
# the two catalog CSVs, so each output row names one catalog A object and the
# catalog B object on the same row of the pair-index arrays.
matched_a_ids = [f"cat_a_3{index :03d}" for index in a_pair_indices]
matched_b_ids = [f"cat_b_8{index :03d}" for index in b_pair_indices]

counters = {"cat_a": matched_a_ids, "cat_b": matched_b_ids}
pd.DataFrame(counters).to_csv("counters.csv", index=False)

## Import pipeline catalogs

Using the two generic catalogs above, we want to import the known matches as an association table.

The below steps will convert the catalog CSV files into hipscat catalogs.

In [None]:
# Settings common to both imports; only the input file and the output
# artifact name differ between catalog a and catalog b.
shared_import_settings = {
    "output_path": "import_pipeline",
    "sort_columns": "survey_id",
    "constant_healpix_order": 4,
    "file_reader": "csv",
    "tmp_dir": tmp_dir,
}

catalog_a_args = ImportArguments(
    input_file_list=["catalog_a.csv"],
    output_artifact_name="catalog_a",
    **shared_import_settings,
)
catalog_b_args = ImportArguments(
    input_file_list=["catalog_b.csv"],
    output_artifact_name="catalog_b",
    **shared_import_settings,
)

# One single-threaded worker, with its local directory in the scratch dir.
# Both imports run on the same client, catalog a first and then catalog b.
with Client(n_workers=1, threads_per_worker=1, local_directory=tmp_dir) as client:
    for import_args in (catalog_a_args, catalog_b_args):
        runner.pipeline_with_client(import_args, client)

In [None]:
# Remove the temporary directory created in the first cell.
tmp_path.cleanup()