Merge pull request #120 from lsst-dm/tickets/DM-41241
DM-41241: Create a new dev butler central repo in s3://rubin-pp-dev-users and move there
hsinfang committed Feb 20, 2024
2 parents 410b611 + 106e9b2 commit 25b7b13
Showing 6 changed files with 196 additions and 37 deletions.
12 changes: 6 additions & 6 deletions bin.src/make_latiss_export.py
@@ -97,9 +97,10 @@ def _export_for_copy(butler, target_butler):
)
contents.saveDatasets(records)

logging.debug("Selecting refcats datasets")
refcats = {"atlas_refcat2_20220201", "gaia_dr3_20230707"}
logging.debug(f"Selecting refcats datasets {refcats}")
records = _filter_datasets(
butler, target_butler, datasetType=..., collections="refcats*"
butler, target_butler, datasetType=refcats, collections="refcats*"
)
contents.saveDatasets(records)

@@ -126,13 +127,12 @@ def _export_for_copy(butler, target_butler):
for collection in butler.registry.queryCollections(
expression="LATISS/calib",
flattenChains=True,
includeChains=True,
) + [
"LATISS/calib",
"LATISS/calib/DM-36719",
"LATISS/calib/DM-38946",
"LATISS/calib/DM-39505",
"LATISS/templates",
"LATISS/calib/unbounded",
]:
logging.debug(f"Selecting collection {collection}")
try:
target_butler.registry.queryCollections(collection)
except daf_butler.registry.MissingCollectionError:
11 changes: 7 additions & 4 deletions bin.src/make_remote_butler.py
@@ -53,11 +53,13 @@ def _make_parser():
help="The config file to use for the new repository. Defaults to etc/db_butler.yaml.")
parser.add_argument("--export-file", default="export.yaml",
help="The export file containing the repository contents. Defaults to ./export.yaml.")
parser.add_argument("--instrument",
help="The short name of the instrument (HSC, LATISS, etc).")
parser.add_argument("--hsc-rc2", action="store_true", help="Extra fix up for HSC-RC2 dataset.")
return parser


def _add_chains(butler):
def _add_chains(butler, instrument_name):
"""Create collections to serve as a uniform interface.
Parameters
@@ -69,6 +71,8 @@ def _add_chains(butler):
- standard skymap collection
- templates/*
- refcats/*
instrument_name : `str`
The short name of the instrument.
"""
butler.registry.registerCollection("templates", type=CollectionType.CHAINED)
butler.registry.setCollectionChain(
@@ -82,8 +86,7 @@
list(butler.registry.queryCollections("refcats/*", collectionTypes=CollectionType.RUN))
)

instrument = Instrument.fromName(list(butler.registry.queryDataIds("instrument"))[0]["instrument"],
butler.registry)
instrument = Instrument.fromName(instrument_name, butler.registry)
defaults = instrument.makeUmbrellaCollectionName()
butler.registry.registerCollection(defaults, type=CollectionType.CHAINED)
calib_collection = instrument.makeCalibrationCollectionName()
@@ -146,7 +149,7 @@ def main():
butler = Butler(config, writeable=True)
with time_this(msg="Import", level=logging.INFO):
butler.import_(directory=args.src_repo, filename=args.export_file, transfer="auto")
_add_chains(butler)
_add_chains(butler, args.instrument)
if args.hsc_rc2:
_hsc_rc2(butler)

79 changes: 56 additions & 23 deletions doc/playbook.rst
@@ -12,7 +12,7 @@ Table of Contents
* `Buckets`_
* `Central Repo`_
* `Development Service`_
* `tester`_
* `Testers`_
* `Databases`_


@@ -112,56 +112,80 @@ Buckets

`This document <https://confluence.lsstcorp.org/display/LSSTOps/USDF+S3+Bucket+Organization>`_ describes the overall organization of S3 buckets and access at USDF.

The bucket ``rubin:rubin-pp`` holds incoming raw images.
For development purposes, Prompt Processing has its own buckets, including ``rubin-pp-dev``, ``rubin-pp-dev-users``, ``rubin:rubin-pp``, and ``rubin:rubin-pp-users``.

The bucket ``rubin:rubin-pp-users`` holds:
Current Buckets
---------------

Currently the buckets ``rubin-pp-dev`` and ``rubin-pp-dev-users`` are used with the testers (see `Testers`_).
They are owned by the Ceph user ``prompt-processing-dev``.

The bucket ``rubin-pp-dev`` holds incoming raw images.

The bucket ``rubin-pp-dev-users`` holds:

* ``rubin-pp-dev-users/central_repo/`` contains the central repository described in `DMTN-219`_.
This repository currently contains HSC and LATISS data, uploaded with ``make_hsc_rc2_export.py``, ``make_latiss_export.py``, and ``make_template_export.py``.

* ``rubin-pp-dev-users/unobserved/`` contains raw files that the upload scripts can draw from to create incoming raws.

``rubin-pp-dev`` has notifications configured for new file arrival; these publish to the Kafka topic ``prompt-processing-dev``.
The notifications can be viewed at `Kafdrop <https://k8s.slac.stanford.edu/usdf-prompt-processing-dev/kafdrop>`_.

* ``rubin:rubin-pp-users/central_repo/`` contains the central repository described in `DMTN-219`_.
This repository currently contains a copy of HSC RC2 data, uploaded with ``make_hsc_rc2_export.py`` and ``make_template_export``.
Legacy Buckets
--------------

* ``rubin:rubin-pp-users/unobserved/`` contains raw files that the upload script(s) can draw from to create incoming raws.
The buckets ``rubin:rubin-pp`` and ``rubin:rubin-pp-users`` are also for Prompt Processing development and previously used by the testers.
``rubin:rubin-pp-users`` contains an older version of the development central repository.
``rubin:rubin-pp`` has notifications configured to publish to the Kafka topic ``rubin-prompt-processing``.

``rubin:rubin-pp`` has had notifications configured for it; these publish to a Kafka topic.
These buckets are owned by the Ceph user ``rubin-prompt-processing``.
We are in the process of deprecating the ``rubin-prompt-processing`` user as it has more restrictive permissions than ``prompt-processing-dev``.

Bucket Access and Credentials
-----------------------------

The default Rubin users' setup on ``rubin-devl`` includes an AWS credential file at the location given by the environment variable ``AWS_SHARED_CREDENTIALS_FILE``, with a default profile that has no read permission to the prompt processing buckets.
A separate credential for prompt processing developers is at `vault <https://vault.slac.stanford.edu/ui/vault/secrets/secret/show/rubin/usdf-prompt-processing-dev/s3-buckets>`_ and can be set up as another credential profile for Butler or command line tools such as AWS Command Line Interface and MinIO Client.
A separate credential for prompt processing developers, acting as the Ceph user ``prompt-processing-dev`` (version 6 or newer) or ``rubin-prompt-processing`` (version 5 or older), is at `Vault <https://vault.slac.stanford.edu/ui/vault/secrets/secret/show/rubin/usdf-prompt-processing-dev/s3-buckets>`_.
The credential can be set up as another credential profile for the Butler or for command line tools such as the AWS Command Line Interface and the MinIO Client.
One way to set up this profile is with the AWS CLI:

.. code-block:: sh
singularity exec /sdf/sw/s3/aws-cli_latest.sif aws configure --profile rubin-prompt-processing
singularity exec /sdf/sw/s3/aws-cli_latest.sif aws configure --profile prompt-processing-dev
and follow the prompts.
To use the new credentials with the Butler, set the environment variable ``AWS_PROFILE=rubin-prompt-processing``.
To use the new credentials with the Butler, set the environment variable ``AWS_PROFILE=prompt-processing-dev``.
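
Equivalently, the profile can be added by hand; the shared credentials file uses the standard AWS layout.
A minimal sketch, assuming the file pointed to by ``AWS_SHARED_CREDENTIALS_FILE`` is writable and the profile does not already exist (the key values come from the Vault entry above):

.. code-block:: sh
# Append the developer profile to the shared credentials file.
cat >> "$AWS_SHARED_CREDENTIALS_FILE" <<'EOF'
[prompt-processing-dev]
aws_access_key_id = <ACCESS_KEY from Vault>
aws_secret_access_key = <SECRET_KEY from Vault>
EOF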

The AWS CLI can be used to inspect non-tenant buckets:

.. code-block:: sh
alias s3="singularity exec /sdf/sw/s3/aws-cli_latest.sif aws --endpoint-url https://s3dfrgw.slac.stanford.edu s3"
s3 --profile rubin-prompt-processing [ls|cp|rm] s3://rubin-summit/<path>
s3 --profile prompt-processing-dev [ls|cp|rm] s3://rubin-summit/<path>
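
For example, to list the unobserved raw files that the upload scripts draw from (using the ``s3`` alias and the ``prompt-processing-dev`` profile defined above):

.. code-block:: sh
s3 --profile prompt-processing-dev ls s3://rubin-pp-dev-users/unobserved/
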
.. note::

You must pass the ``--endpoint-url`` argument even if you have ``S3_ENDPOINT_URL`` defined.

Some of the prompt processing buckets are Ceph tenant buckets and require a tenant prefix, which violates the bucket name standard and is not supported by AWS CLI.
Buckets whose names start with ``rubin:`` are Ceph tenant buckets, with ``rubin`` as the tenant prefix.
Bucket names with the tenant prefix violate the bucket-name standard and are not supported by the AWS CLI.
The MinIO Client ``mc`` tool may be used.
One version can be accessed at ``/sdf/group/rubin/sw/bin/mc`` at USDF.
To inspect buckets with the MinIO Client ``mc`` tool, first set up an alias (e.g. ``usdf-pp``) and then can use commands:
To inspect buckets with the MinIO Client ``mc`` tool, first set up an alias (e.g. ``prompt-processing-dev``) and then use commands such as:

.. code-block:: sh
mc alias set usdf-pp https://s3dfrgw.slac.stanford.edu ACCESS_KEY SECRET_KEY
mc ls usdf-pp/rubin:rubin-pp
mc alias set prompt-processing-dev https://s3dfrgw.slac.stanford.edu ACCESS_KEY SECRET_KEY
mc ls prompt-processing-dev/rubin:rubin-pp
For Butler not to complain about the bucket names, set the environment variable ``LSST_DISABLE_BUCKET_VALIDATION=1``.
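
For example, a minimal sketch of pointing the Butler at the legacy central repo (the choice of profile is an assumption; use whichever credential has access to the legacy buckets):

.. code-block:: sh
export LSST_DISABLE_BUCKET_VALIDATION=1
export AWS_PROFILE=prompt-processing-dev
butler query-collections "s3://rubin:rubin-pp-users/central_repo/"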

Central Repo
============

The central repo for development use is located at ``s3://rubin:rubin-pp-users/central_repo/``.
The central repo for development use is located at ``s3://rubin-pp-dev-users/central_repo/``.
You need developer credentials to access it, as described under `Buckets`_.
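
A quick way to confirm that the credentials work is to query the repo with the Butler command line (a sketch; any collection glob will do):

.. code-block:: sh
export AWS_PROFILE=prompt-processing-dev
butler query-collections s3://rubin-pp-dev-users/central_repo "LATISS/*"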

Migrating the Repo
@@ -190,6 +214,16 @@ In our case, we want to migrate to the versions that ``/repo/embargo`` is using,
However, when using ``butler migrate`` to update ``dimensions-config``, you should delete all existing pods to ensure that their replacements have the correct version.
This can be done using ``kubectl delete pod`` or from Argo CD (see `Development Service`_).
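
For example, a sketch of forcing pod replacement with ``kubectl`` (the namespace and pod name are placeholders, not the actual deployment values):

.. code-block:: sh
kubectl get pods -n <service-namespace>
kubectl delete pod <pod-name> -n <service-namespace>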

Adding New Dataset Types
------------------------

When pipelines change, it is sometimes necessary to register the new dataset types in the central repo to avoid ``MissingDatasetTypeError`` at prompt service export time.
One raw exposure has been ingested, visit-defined, and kept in the development central repo, so a ``pipetask`` command like the following can be run:

.. code-block:: sh
make_apdb.py -c db_url="sqlite:///apdb.db"
pipetask run -b s3://rubin-pp-dev-users/central_repo -i LATISS/raw/all,LATISS/defaults,LATISS/templates -o u/username/collection -d "detector=0 and instrument='LATISS' and exposure=2023082900500 and visit_system=0" -p $PROMPT_PROCESSING_DIR/pipelines/LATISS/ApPipe.yaml -c diaPipe:apdb.db_url=sqlite:///apdb.db --register-dataset-types
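
Afterwards, the registration can be confirmed by listing the dataset types known to the central repo (a sketch):

.. code-block:: sh
butler query-dataset-types s3://rubin-pp-dev-users/central_repo
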
Development Service
===================
@@ -294,13 +328,13 @@ There should be only one local repo per ``MiddlewareInterface`` object, and at t
If in doubt, check the logs first.


tester
======
Testers
=======

``python/tester/upload.py`` and ``python/tester/upload_hsc_rc2.py`` are scripts that simulate the CCS image writer.
They can be run from ``rubin-devl``, but require the user to install the ``confluent_kafka`` package in their environment.
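
One way to install the dependency (a sketch, assuming a writable user site directory):

.. code-block:: sh
pip install --user confluent-kafka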

You must have a profile set up for the ``rubin:rubin-pp`` bucket (see `Buckets`_, above).
You must have a profile set up for the ``rubin-pp-dev`` bucket (see `Buckets`_, above).

Install the Prompt Processing code, and set it up before use:

@@ -310,9 +344,7 @@ Install the Prompt Processing code, and set it up before use:
setup -r prompt_processing
The tester scripts send ``next_visit`` events for each detector via Kafka on the ``next-visit-topic`` topic.
They then upload a batch of files representing the snaps of the visit to the ``rubin:rubin-pp`` S3 bucket, simulating incoming raw images.

Eventually a set of parallel processes running on multiple nodes will be needed to upload the images sufficiently rapidly.
They then upload a batch of files representing the snaps of the visit to the ``rubin-pp-dev`` S3 bucket, simulating incoming raw images.

``python/tester/upload.py``: Command line arguments are the instrument name (currently HSC or LATISS) and the number of groups of images to send.

@@ -323,7 +355,7 @@ Sample command line:
python upload.py HSC 3
python upload.py LATISS 3
This script draws images stored in the ``rubin:rubin-pp-users`` bucket.
This script draws images stored in the ``rubin-pp-dev-users`` bucket.

* For HSC, 4 groups (10 raw files in total) are curated.
They are the COSMOS data as curated in `ap_verify_ci_cosmos_pdr2 <https://github.com/lsst/ap_verify_ci_cosmos_pdr2>`_.
@@ -343,6 +375,7 @@ Sample command line:
This script draws images from the curated ``HSC/RC2/defaults`` collection at USDF's ``/repo/main`` butler repository.
The source collection includes 432 visits, each with 103 detector images.
The visits are randomly selected and uploaded as one new group for each visit.
Images can be uploaded in parallel processes.
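
A hypothetical parallel invocation (the argument is an assumption; check the script's ``--help`` output for the actual interface):

.. code-block:: sh
# Hypothetical: launch three uploads in parallel, one randomly selected visit each.
for i in 1 2 3; do
    python upload_hsc_rc2.py 1 &
done
wait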


.. note::
125 changes: 124 additions & 1 deletion etc/db_butler.yaml
@@ -1,5 +1,128 @@
# Seed config for generating Prompt Processing central repo.
# This is the test repo to be used with upload.py, not the repo for processing AuxTel data.
datastore:
cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
records:
table: file_datastore_records
root: <butlerRoot>
registry:
db: postgresql://pp@usdf-prompt-processing-dev.slac.stanford.edu/ppcentralbutler
namespace: pp_central_repo
namespace: pp_dev_central_repo
managers:
attributes: lsst.daf.butler.registry.attributes.DefaultButlerAttributeManager
collections: lsst.daf.butler.registry.collections.synthIntKey.SynthIntKeyCollectionManager
datasets: lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID
datastores: lsst.daf.butler.registry.bridge.monolithic.MonolithicDatastoreRegistryBridgeManager
dimensions: lsst.daf.butler.registry.dimensions.static.StaticDimensionRecordStorageManager
opaque: lsst.daf.butler.registry.opaque.ByNameOpaqueTableStorageManager
obscore:
cls: lsst.daf.butler.registry.obscore._manager.ObsCoreLiveTableManager
config:
# Configuration copied from /repo/embargo
namespace: embargo
version: 1
facility_name: Rubin-LSST
# We don't explicitly specify instrument_name - it's auto-generated from the corresponding Butler dimension
obs_collection: LATISS_LIVE # may evolve as experience with this builds up
collection_type: RUN # means we are using all RUN-type collections that match line below
collections: ["LATISS/raw/all", "LATISS/runs/quickLook/202.*", "LATISS/runs/AUXTEL_DRP_IMAGING.*", "LATISS/prompt/.*"]
use_butler_uri: false # do not use URI from Butler, use datalink_url_fmt defined below
dataset_types:
raw:
dataproduct_type: image
dataproduct_subtype: lsst.raw
calib_level: 1
obs_id_fmt: "{records[exposure].obs_id}-{records[detector].full_name}"
o_ucd: phot.count
access_format: "application/x-votable+xml;content=datalink"
# access_url format is still under discussion/review:
datalink_url_fmt: "https://usdf-rsp.slac.stanford.edu/api/datalink/links?ID=butler%3A//embargo/{id}"
calexp:
dataproduct_type: image
dataproduct_subtype: lsst.calexp
calib_level: 2
obs_id_fmt: "{records[visit].name}-{records[detector].full_name}"
o_ucd: phot.count
access_format: "application/x-votable+xml;content=datalink"
# access_url format is still under discussion/review:
datalink_url_fmt: "https://usdf-rsp.slac.stanford.edu/api/datalink/links?ID=butler%3A//embargo/{id}"
quickLookExp:
dataproduct_type: image
dataproduct_subtype: lsst.quickLookExp
calib_level: 2
obs_id_fmt: "{records[exposure].obs_id}-{records[detector].full_name}"
o_ucd: phot.count
access_format: "application/x-votable+xml;content=datalink"
# access_url format is still under discussion/review:
datalink_url_fmt: "https://usdf-rsp.slac.stanford.edu/api/datalink/links?ID=butler%3A//embargo/{id}"
extra_columns:
lsst_visit:
template: "{visit}"
type: "int"
lsst_exposure:
template: "{exposure}"
type: "int"
lsst_detector:
template: "{detector}"
type: "int"
lsst_tract:
template: "{tract}"
type: "int"
lsst_patch:
template: "{patch}"
type: "int"
lsst_band:
template: "{band}"
type: "string"
length: 32
lsst_filter:
template: "{physical_filter}"
type: "string"
length: 32
lsst_dataset_type:
template: "{dataset_type}"
type: "string"
length: 64
lsst_run:
template: "{run}"
type: "string"
length: 255
indices:
# Indices for obscore table, spatial columns are indexed automatically.
# We likely will need to extend this list to support most popular queries,
# would be good to have a list of possible queries generated by TAP.
instrument_name_idx: instrument_name
lsst_visit_idx: lsst_visit
lsst_exposure_idx: lsst_exposure
dataproduct_idx: [dataproduct_type, dataproduct_subtype]
spectral_ranges:
# This list includes every band defined now in registry, actual values for some
# of them are probably very approximate. Keys in this section could be a band
# name or a physical filter name
"u": [330.0e-9, 400.0e-9]
"u~nd": [330.0e-9, 400.0e-9]
"g": [402.0e-9, 552.0e-9]
"g~nd": [402.0e-9, 552.0e-9]
"r": [552.0e-9, 691.0e-9]
"r~nd": [552.0e-9, 691.0e-9]
"i": [691.0e-9, 818.0e-9]
"i~nd": [691.0e-9, 818.0e-9]
"z": [818.0e-9, 922.0e-9]
"z~nd": [818.0e-9, 922.0e-9]
"y": [922.0e-9, 1060.0e-9]
"y~nd": [922.0e-9, 1060.0e-9]
"white": [null, null]
"unknown": [null, null]
"diffuser": [null, null]
"notch": [null, null]
"grid": [null, null]
"grid~nd": [null, null]
"spot": [null, null]
"spot~nd": [null, null]
spatial_plugins:
pgsphere:
# adds pgsphere columns and indices
cls: lsst.daf.butler.registry.obscore.pgsphere.PgSphereObsCorePlugin
config:
region_column: pgs_region # name of a column for a region/polygon
position_column: pgs_center # name of a column for position/center
4 changes: 2 additions & 2 deletions python/tester/upload.py
@@ -130,10 +130,10 @@ def main():
kafka_url = "https://usdf-rsp-dev.slac.stanford.edu/sasquatch-rest-proxy/topics/test.next-visit"
endpoint_url = "https://s3dfrgw.slac.stanford.edu"
s3 = boto3.resource("s3", endpoint_url=endpoint_url)
dest_bucket = s3.Bucket("rubin:rubin-pp")
dest_bucket = s3.Bucket("rubin-pp-dev")
dest_bucket.meta.client.meta.events.unregister("before-parameter-build.s3", validate_bucket_name)

src_bucket = s3.Bucket("rubin:rubin-pp-users")
src_bucket = s3.Bucket("rubin-pp-dev-users")
src_bucket.meta.client.meta.events.unregister("before-parameter-build.s3", validate_bucket_name)

last_group = get_last_group(dest_bucket, instrument, date)
2 changes: 1 addition & 1 deletion python/tester/upload_hsc_rc2.py
@@ -70,7 +70,7 @@ def _set_s3_bucket():
global dest_bucket
endpoint_url = "https://s3dfrgw.slac.stanford.edu"
s3 = boto3.resource("s3", endpoint_url=endpoint_url)
dest_bucket = s3.Bucket("rubin:rubin-pp")
dest_bucket = s3.Bucket("rubin-pp-dev")
dest_bucket.meta.client.meta.events.unregister("before-parameter-build.s3", validate_bucket_name)


