### Minimal HEALPix-aware Butler ingestion example

This standalone example registers a dataset type bound to the correct HEALPix dimension (based on NSIDE) and ingests per-pixel CSV catalogs using that dimension in the dataId. Adjust the paths and names as needed.


In [8]:
import os
import time
from pathlib import Path
import numpy as np
from astropy.table import Table

# Fresh repo and data locations.
# Take the timestamp once so the repo path and the collection name always carry
# the same suffix, even if the clock ticks over between the two assignments.
RUN_STAMP = int(time.time())
REPO_PATH = f"data/monster_guide_repo_{RUN_STAMP}"
COLLECTION = f"monster_guide_{RUN_STAMP}"
DATASET_NAME = "monster_guide_catalog"
CATALOG_PATH = "data/Monster_guide"
VIGNETTING_FILE = "data/vignetting_vs_angle.npz"
NSIDE = 32

# Create directories.
# NOTE: CSV files already present in CATALOG_PATH are left in place, so a later
# glob over this directory can pick up more files than the ones written below.
Path("data").mkdir(exist_ok=True)
catalog_dir = Path(CATALOG_PATH)
catalog_dir.mkdir(parents=True, exist_ok=True)

# Generate a simple vignetting file: 51 samples of theta in [0, 2] with a
# near-flat quadratic fall-off, stored as the arrays "theta" and "vignetting".
theta = np.linspace(0.0, 2.0, 51)
vignetting = 1.0 - 0.0005 * theta**2
np.savez(VIGNETTING_FILE, theta=theta, vignetting=vignetting)

# Generate a few minimal HEALPix catalog CSVs with required columns.
# The pixel IDs are arbitrary; each one becomes a file named "<pixel_id>.csv".
pixel_ids = [10467, 10466, 10586]
num_stars = 25
for pid in pixel_ids:
    # Random coordinates within +/-2 deg of (RA=120, Dec=-45), converted from
    # degrees to radians before being stored.
    ra = np.deg2rad(120.0 + np.random.uniform(-2.0, 2.0, size=num_stars))
    dec = np.deg2rad(-45.0 + np.random.uniform(-2.0, 2.0, size=num_stars))
    # Random integer flags in [0, 128).
    guide_flag = np.random.randint(0, 128, size=num_stars)
    # Per-band magnitudes: a fixed base value plus a uniform offset in [0, 2).
    mags = {
        "mag_u": 15.0 + np.random.uniform(0, 2.0, size=num_stars),
        "mag_g": 14.0 + np.random.uniform(0, 2.0, size=num_stars),
        "mag_r": 13.5 + np.random.uniform(0, 2.0, size=num_stars),
        "mag_i": 13.8 + np.random.uniform(0, 2.0, size=num_stars),
        "mag_z": 14.2 + np.random.uniform(0, 2.0, size=num_stars),
        "mag_y": 14.6 + np.random.uniform(0, 2.0, size=num_stars),
    }
    gaia_G = 14.0 + np.random.uniform(0, 2.0, size=num_stars)
    # Every row of a file carries the pixel ID the file is named after.
    healpix_id = np.full(num_stars, pid)

    tbl = Table(
        {
            "coord_ra": ra,
            "coord_dec": dec,
            "gaia_G": gaia_G,
            **mags,
            "guide_flag": guide_flag,
            "healpix_id": healpix_id,
        }
    )
    tbl.write(catalog_dir / f"{pid}.csv", overwrite=True)

print(f"✅ Generated fresh data in: {CATALOG_PATH}")
print(f"✅ Created vignetting file: {VIGNETTING_FILE}")
print(f"🆕 Fresh repo will be created at: {REPO_PATH}")


✅ Generated fresh data in: data/Monster_guide
✅ Created vignetting file: data/vignetting_vs_angle.npz
🆕 Fresh repo will be created at: data/monster_guide_repo_1755196108


In [9]:
from lsst.daf.butler import Butler, DatasetType, CollectionType

# Create fresh repo and connect writeable
Butler.makeRepo(REPO_PATH)
butler = Butler(REPO_PATH, writeable=True)

# Create a RUN collection explicitly (script parity).
# Still best-effort, but report the failure instead of hiding it so that a
# later butler.put(..., run=COLLECTION) error can be traced back to its cause.
try:
    butler.registry.registerCollection(COLLECTION, CollectionType.RUN)
except Exception as exc:
    print(f"⚠️ Could not register RUN collection {COLLECTION!r}: {exc!r}")

# Register dataset type bound to healpix dimension using DimensionGroup (script parity).
# The dimension name encodes level = log2(NSIDE); reject an NSIDE that is not a
# power of two instead of silently truncating to the wrong level.
level = int(np.log2(NSIDE)) if NSIDE >= 1 else -1
if level < 0 or 2**level != NSIDE:
    raise ValueError(f"NSIDE must be a positive power of two, got {NSIDE!r}")
hp_dim = f"healpix{level}"
hp_group = butler.registry.dimensions.conform([hp_dim])

existing = {dt.name for dt in butler.registry.queryDatasetTypes()}
print(f"DATASET_NAME {DATASET_NAME} and existing {existing}")
if DATASET_NAME not in existing:
    dt = DatasetType(DATASET_NAME, dimensions=hp_group, storageClass='ArrowAstropy')
    butler.registry.registerDatasetType(dt)
else:
    dt = butler.registry.getDatasetType(DATASET_NAME)

# Idempotent ingest into the fresh repo: one dataset per "<pixel_id>.csv" file.
for csv_path in Path(CATALOG_PATH).glob('*.csv'):
    try:
        pixel_id = int(csv_path.stem)
    except ValueError:
        # File name is not a bare integer pixel ID; not one of our catalogs.
        continue
    dataId = {hp_dim: pixel_id}
    ref = butler.registry.findDataset(DATASET_NAME, dataId=dataId, collections=[COLLECTION])
    if ref is None:
        print(f"ref not found for dataset {DATASET_NAME} and collections = {[COLLECTION]}")
        tbl = Table.read(csv_path)
        ref = butler.put(tbl, dt, dataId=dataId, run=COLLECTION)
    else:
        # The dataset was found in COLLECTION, which is the RUN it was written
        # to, so there is nothing left to do. registry.associate() must not be
        # called here: it only accepts TAGGED collections and rejects a RUN.
        print(f"✅ ref found {ref} for dataset {DATASET_NAME} and collections = {[COLLECTION]}")

print(f"✅ Ingested {DATASET_NAME} into fresh repo: {REPO_PATH}")
print(f"   Collection: {COLLECTION}")
print(f"   HEALPix dimension: {hp_dim}")


DATASET_NAME monster_guide_catalog and existing set()
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_1755196108']
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_1755196108']
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_1755196108']
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_1755196108']
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_1755196108']
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_1755196108']
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_1755196108']
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_1755196108']
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_1755196108']
ref not found for dataset monster_guide_catalog and collections = ['monster_guide_175519610

In [None]:
# Verification-only cell for the fresh repo; no ingestion here
from lsst.daf.butler import Butler

# Re-open the repo without write access; this cell only reads.
butler = Butler(REPO_PATH)
print(f"📦 Repo: {REPO_PATH}")
print(f"📚 Collection (run): {COLLECTION}")

# List dataset types
existing = {dt.name for dt in butler.registry.queryDatasetTypes()}
print(f"DATASET_NAME is {DATASET_NAME} AND existing is {existing}")

# Derive nside from dataset type's dimensions (not from dataId)
dt = butler.registry.getDatasetType(DATASET_NAME)

# Prefer iterating the required set directly (healpix{level} is required)
dim_names = dt.dimensions.required
hp_dims = [name for name in dim_names if str(name).startswith("healpix")]

if hp_dims:
    # The level is the numeric suffix of the dimension name, and nside = 2**level.
    hp_name = str(hp_dims[0])
    level = int(hp_name.replace("healpix", ""))
    derived_nside = 2 ** level
    print(f"Derived HEALPix: dimension={hp_name} → level={level} → nside={derived_nside}")
else:
    print("Could not locate HEALPix dimension on dataset type.")


# List datasets for this type in the run
refs = list(butler.registry.queryDatasets(DATASET_NAME, collections=[COLLECTION]))
print(f"Found {len(refs)} datasets for '{DATASET_NAME}' in collection '{COLLECTION}'")
if refs:
    sample = refs[:5]
    print("Sample data IDs:")
    for r in sample:
        print(f"  dataId={r.dataId.required}")
        # Format inside a single f-string so the braces balance; the previous
        # plain-string "}}" suffix printed two closing braces.
        formatted = ", ".join(f"{k}={v}" for k, v in r.dataId.required.items())
        print(f"  dataId={{{formatted}}}")



📦 Repo: data/monster_guide_repo_1755196108
📚 Collection (run): monster_guide_1755196108
DATASET_NAME is monster_guide_catalog AND existing is {'monster_guide_catalog'}
Derived HEALPix: dimension=healpix5 → level=5 → nside=32
Derived HEALPix: dimension=healpix5 → level=5 → nside=32
Found 14 datasets for 'monster_guide_catalog' in collection 'monster_guide_1755196108'
Sample data IDs:
  dataId={healpix5: 10467}
  dataId={healpix5=10467}}
  dataId={healpix5: 10466}
  dataId={healpix5=10466}}
  dataId={healpix5: 10700}
  dataId={healpix5=10700}}
  dataId={healpix5: 10701}
  dataId={healpix5=10701}}
  dataId={healpix5: 10585}
  dataId={healpix5=10585}}


# HEALPix-aware Butler ingestion: Dimensions and Data IDs

This note summarizes how to register and ingest HEALPix-partitioned datasets with Butler, and clarifies how Dimensions and Data IDs work.

## Dimensions (what they are and are not)
- Dimensions are not arbitrary metadata. They are defined by the repository’s DimensionUniverse (schema) and must be one of the known names.
- For HEALPix, the dimension name encodes the level: level = log2(nside). Examples:
  - nside = 32 → level 5 → dimension name: `healpix5`
  - nside = 64 → level 6 → dimension name: `healpix6`
- You cannot add free-form keys (e.g., `nside`) to a DatasetType’s dimensions. If you need to carry `nside`, put it in the dataset payload (metadata/columns) or encode it in the dataset type or collection name.

## Registering DatasetTypes (HEALPix and dimensionless)
Prefer building a DimensionGroup from canonical names, then use it when creating the DatasetType.

```python
# Given a writeable Butler (`butler`), plus NSIDE and COLLECTION defined by the caller
import numpy as np
from lsst.daf.butler import DatasetType, CollectionType

# Create a RUN collection explicitly
butler.registry.registerCollection(COLLECTION, CollectionType.RUN)

# Build the HEALPix dimension group from names (recommended)
level = int(np.log2(NSIDE))  # NSIDE must be a power of two
hp_dim = f"healpix{level}"
hp_group = butler.registry.dimensions.conform([hp_dim])

# Register the catalog dataset type (HEALPix-bound)
dt = DatasetType(
    "monster_guide_catalog",
    dimensions=hp_group,
    storageClass="ArrowAstropy",  # or another appropriate storage class
)
butler.registry.registerDatasetType(dt)

# Register a dimensionless dataset type (e.g., vignetting)
vt = DatasetType(
    "vignetting_correction",
    dimensions=butler.registry.dimensions.empty,
    storageClass="StructuredDataDict",
)
butler.registry.registerDatasetType(vt)
```

Notes:
- Passing a set of names (e.g., `{hp_dim}`) is also supported in some versions, but then you must also pass `universe=butler.registry.dimensions`. Using `dimensions.conform([...])` produces a DimensionGroup and avoids that requirement.

## Data IDs (how to address a dataset)
- Butler requires a mapping from required dimension name to value. For HEALPix:
  - `dataId = {hp_dim: pixel_id}` where `hp_dim == 'healpix5'` (for nside=32)
- Alternative calling style (kwargs):
  - `butler.put(obj, dt, run=COLLECTION, **{hp_dim: pixel_id})`
  - If you know `hp_dim` concretely: `butler.put(obj, dt, run=COLLECTION, healpix5=pixel_id)`
- Dimensionless datasets use an empty mapping: `dataId = {}`

## Idempotent ingestion pattern
Avoid uniqueness conflicts by checking whether a dataset already exists for the (datasetType, dataId) in your run collection:

```python
from pathlib import Path

from astropy.table import Table

for csv_path in Path(CATALOG_PATH).glob("*.csv"):
    pixel_id = int(csv_path.stem)  # filename like 10467.csv
    dataId = {hp_dim: pixel_id}

    # Check in target collection (run)
    ref = butler.registry.findDataset("monster_guide_catalog", dataId=dataId, collections=[COLLECTION])
    if ref is None:
        table = Table.read(csv_path)
        butler.put(table, dt, dataId=dataId, run=COLLECTION)
    # Otherwise the dataset already exists in this RUN collection and nothing
    # needs to be done. Do not call registry.associate() on it: associate only
    # accepts TAGGED collections and raises for a RUN collection.
```

## Common pitfalls and fixes
- “Butler is read-only.” → open with `Butler(REPO_PATH, writeable=True)` and create a run (collection) for writes.
- “NoDefaultCollectionError.” → pass the collection explicitly: `collections=[COLLECTION]` to queries like `findDataset`.
- “UNIQUE constraint failed … dataset_tags …” → you are inserting a duplicate (same datasetType and dataId) into the same run. Use the idempotent pattern above, or start with a fresh repo/run.
- “If dimensions is not a DimensionGroup, a universe must be provided.” → either pass `universe=butler.registry.dimensions` or build a DimensionGroup with `dimensions.conform([...])`.

## Where this is used here
- Example notebook: `guider_roi_extra_other_branch/healpix_ingest_example.ipynb` shows data generation, repo creation, HEALPix-aware registration, idempotent ingest, and verification.
- Ingestion script: `guider_roi_extra_other_branch/ingest_guider_data.py` uses the same DimensionGroup approach and also ingests a dimensionless vignetting dataset.

## Quick reference
- Dimension name for HEALPix: `healpix{level}` with `level = int(log2(nside))`
- DatasetType for HEALPix: `DatasetType(name, dimensions=butler.registry.dimensions.conform([f"healpix{level}"]), storageClass=...)`
- DatasetType for dimensionless data: `DatasetType(name, dimensions=butler.registry.dimensions.empty, storageClass=...)`
- Data ID for HEALPix: `{f"healpix{level}": pixel_id}` (mapping), or kwargs equivalent
- Run/collection for writes: `registerCollection(COLLECTION, CollectionType.RUN)` and pass `run=COLLECTION` when calling `put`

