Merge pull request #120 from lsst-dm/tickets/DM-41241
DM-41241: Create a new dev butler central repo in s3://rubin-pp-dev-users and move there
hsinfang committed Feb 20, 2024
2 parents 410b611 + 106e9b2 commit 25b7b13
Showing 6 changed files with 196 additions and 37 deletions.
12 changes: 6 additions & 6 deletions bin.src/make_latiss_export.py
@@ -97,9 +97,10 @@ def _export_for_copy(butler, target_butler):
)
contents.saveDatasets(records)

logging.debug("Selecting refcats datasets")
refcats = {"atlas_refcat2_20220201", "gaia_dr3_20230707"}
logging.debug(f"Selecting refcats datasets {refcats}")
records = _filter_datasets(
butler, target_butler, datasetType=..., collections="refcats*"
butler, target_butler, datasetType=refcats, collections="refcats*"
)
contents.saveDatasets(records)

@@ -126,13 +127,12 @@ def _export_for_copy(butler, target_butler):
for collection in butler.registry.queryCollections(
expression="LATISS/calib",
flattenChains=True,
includeChains=True,
) + [
"LATISS/calib",
"LATISS/calib/DM-36719",
"LATISS/calib/DM-38946",
"LATISS/calib/DM-39505",
"LATISS/templates",
"LATISS/calib/unbounded",
]:
logging.debug(f"Selecting collection {collection}")
try:
target_butler.registry.queryCollections(collection)
except daf_butler.registry.MissingCollectionError:
11 changes: 7 additions & 4 deletions bin.src/make_remote_butler.py
@@ -53,11 +53,13 @@ def _make_parser():
help="The config file to use for the new repository. Defaults to etc/db_butler.yaml.")
parser.add_argument("--export-file", default="export.yaml",
help="The export file containing the repository contents. Defaults to ./export.yaml.")
parser.add_argument("--instrument",
help="The short name of the instrument (HSC, LATISS, etc).")
parser.add_argument("--hsc-rc2", action="store_true", help="Extra fix up for HSC-RC2 dataset.")
return parser


def _add_chains(butler):
def _add_chains(butler, instrument_name):
"""Create collections to serve as a uniform interface.
Parameters
@@ -69,6 +71,8 @@ def _add_chains(butler):
- standard skymap collection
- templates/*
- refcats/*
instrument_name : `str`
The short name of the instrument.
"""
butler.registry.registerCollection("templates", type=CollectionType.CHAINED)
butler.registry.setCollectionChain(
@@ -82,8 +86,7 @@
list(butler.registry.queryCollections("refcats/*", collectionTypes=CollectionType.RUN))
)

instrument = Instrument.fromName(list(butler.registry.queryDataIds("instrument"))[0]["instrument"],
butler.registry)
instrument = Instrument.fromName(instrument_name, butler.registry)
defaults = instrument.makeUmbrellaCollectionName()
butler.registry.registerCollection(defaults, type=CollectionType.CHAINED)
calib_collection = instrument.makeCalibrationCollectionName()
@@ -146,7 +149,7 @@ def main():
butler = Butler(config, writeable=True)
with time_this(msg="Import", level=logging.INFO):
butler.import_(directory=args.src_repo, filename=args.export_file, transfer="auto")
_add_chains(butler)
_add_chains(butler, args.instrument)
if args.hsc_rc2:
_hsc_rc2(butler)

79 changes: 56 additions & 23 deletions doc/playbook.rst
@@ -12,7 +12,7 @@ Table of Contents
* `Buckets`_
* `Central Repo`_
* `Development Service`_
* `tester`_
* `Testers`_
* `Databases`_


@@ -112,56 +112,80 @@ Buckets

`This document <https://confluence.lsstcorp.org/display/LSSTOps/USDF+S3+Bucket+Organization>`_ describes the overall organization of S3 buckets and access at USDF.

The bucket ``rubin:rubin-pp`` holds incoming raw images.
For development purposes, Prompt Processing has its own buckets, including ``rubin-pp-dev``, ``rubin-pp-dev-users``, ``rubin:rubin-pp``, and ``rubin:rubin-pp-users``.

The bucket ``rubin:rubin-pp-users`` holds:
Current Buckets
---------------

Currently the buckets ``rubin-pp-dev`` and ``rubin-pp-dev-users`` are used with the testers (see `Testers`_).
They are owned by the Ceph user ``prompt-processing-dev``.

The bucket ``rubin-pp-dev`` holds incoming raw images.

The bucket ``rubin-pp-dev-users`` holds:

* ``rubin-pp-dev-users/central_repo/`` contains the central repository described in `DMTN-219`_.
This repository currently contains HSC and LATISS data, uploaded with ``make_hsc_rc2_export.py``, ``make_latiss_export.py``, and ``make_template_export.py``.

* ``rubin-pp-dev-users/unobserved/`` contains raw files that the upload scripts can draw from to create incoming raws.

``rubin-pp-dev`` has notifications configured for new file arrival; these publish to the Kafka topic ``prompt-processing-dev``.
The notifications can be viewed at `Kafdrop <https://k8s.slac.stanford.edu/usdf-prompt-processing-dev/kafdrop>`_.

* ``rubin:rubin-pp-users/central_repo/`` contains the central repository described in `DMTN-219`_.
This repository currently contains a copy of HSC RC2 data, uploaded with ``make_hsc_rc2_export.py`` and ``make_template_export``.
Legacy Buckets
--------------

* ``rubin:rubin-pp-users/unobserved/`` contains raw files that the upload script(s) can draw from to create incoming raws.
The buckets ``rubin:rubin-pp`` and ``rubin:rubin-pp-users`` are also for Prompt Processing development and previously used by the testers.
``rubin:rubin-pp-users`` contains an older version of the development central repository.
``rubin:rubin-pp`` has notifications configured to publish to the Kafka topic ``rubin-prompt-processing``.

``rubin:rubin-pp`` has had notifications configured for it; these publish to a Kafka topic.
These buckets are owned by the Ceph user ``rubin-prompt-processing``.
We are in the process of deprecating the ``rubin-prompt-processing`` user as it has more restrictive permissions than ``prompt-processing-dev``.

Bucket Access and Credentials
-----------------------------

The default Rubin users' setup on ``rubin-devl`` includes an AWS credential file at the location given by the environment variable ``AWS_SHARED_CREDENTIALS_FILE``, with a default profile that has no read permission to the prompt processing buckets.
A separate credential for prompt processing developers is at `vault <https://vault.slac.stanford.edu/ui/vault/secrets/secret/show/rubin/usdf-prompt-processing-dev/s3-buckets>`_ and can be set up as another credential profile for Butler or command line tools such as AWS Command Line Interface and MinIO Client.
A separate credential for prompt processing developers, acting as the Ceph user ``prompt-processing-dev`` (version 6 or newer) or ``rubin-prompt-processing`` (version 5 or older), is at `Vault <https://vault.slac.stanford.edu/ui/vault/secrets/secret/show/rubin/usdf-prompt-processing-dev/s3-buckets>`_.
The credential can be set up as another credential profile for the Butler or for command line tools such as the AWS Command Line Interface and the MinIO Client.
One way to set up this profile is with the AWS CLI:

.. code-block:: sh
singularity exec /sdf/sw/s3/aws-cli_latest.sif aws configure --profile rubin-prompt-processing
singularity exec /sdf/sw/s3/aws-cli_latest.sif aws configure --profile prompt-processing-dev
and follow the prompts.
To use the new credentials with the Butler, set the environment variable ``AWS_PROFILE=rubin-prompt-processing``.
To use the new credentials with the Butler, set the environment variable ``AWS_PROFILE=prompt-processing-dev``.
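
Equivalently, the profile can be added by hand; the shared credentials file uses the standard AWS layout.
A minimal sketch, assuming the file pointed to by ``AWS_SHARED_CREDENTIALS_FILE`` is writable and the profile does not already exist (the key values come from the Vault entry above):

.. code-block:: sh
# Append the developer profile to the shared credentials file.
cat >> "$AWS_SHARED_CREDENTIALS_FILE" <<'EOF'
[prompt-processing-dev]
aws_access_key_id = <ACCESS_KEY from Vault>
aws_secret_access_key = <SECRET_KEY from Vault>
EOF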

The AWS CLI can be used to inspect non-tenant buckets:

.. code-block:: sh
alias s3="singularity exec /sdf/sw/s3/aws-cli_latest.sif aws --endpoint-url https://s3dfrgw.slac.stanford.edu s3"
s3 --profile rubin-prompt-processing [ls|cp|rm] s3://rubin-summit/<path>
s3 --profile prompt-processing-dev [ls|cp|rm] s3://rubin-summit/<path>
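
For example, to list the unobserved raw files that the upload scripts draw from (using the ``s3`` alias and the ``prompt-processing-dev`` profile defined above):

.. code-block:: sh
s3 --profile prompt-processing-dev ls s3://rubin-pp-dev-users/unobserved/
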
.. note::

You must pass the ``--endpoint-url`` argument even if you have ``S3_ENDPOINT_URL`` defined.

Some of the prompt processing buckets are Ceph tenant buckets and require a tenant prefix, which violates the bucket name standard and is not supported by AWS CLI.
Buckets whose names start with ``rubin:`` are Ceph tenant buckets, with ``rubin`` as the tenant prefix.
Bucket names with the tenant prefix violate the bucket-name standard and are not supported by the AWS CLI.
The MinIO Client ``mc`` tool may be used.
One version can be accessed at ``/sdf/group/rubin/sw/bin/mc`` at USDF.
To inspect buckets with the MinIO Client ``mc`` tool, first set up an alias (e.g. ``usdf-pp``) and then can use commands:
To inspect buckets with the MinIO Client ``mc`` tool, first set up an alias (e.g. ``prompt-processing-dev``) and then use commands such as:

.. code-block:: sh
mc alias set usdf-pp https://s3dfrgw.slac.stanford.edu ACCESS_KEY SECRET_KEY
mc ls usdf-pp/rubin:rubin-pp
mc alias set prompt-processing-dev https://s3dfrgw.slac.stanford.edu ACCESS_KEY SECRET_KEY
mc ls prompt-processing-dev/rubin:rubin-pp
For Butler not to complain about the bucket names, set the environment variable ``LSST_DISABLE_BUCKET_VALIDATION=1``.
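
For example, a minimal sketch of pointing the Butler at the legacy central repo (the choice of profile is an assumption; use whichever credential has access to the legacy buckets):

.. code-block:: sh
export LSST_DISABLE_BUCKET_VALIDATION=1
export AWS_PROFILE=prompt-processing-dev
butler query-collections "s3://rubin:rubin-pp-users/central_repo/"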

Central Repo
============

The central repo for development use is located at ``s3://rubin:rubin-pp-users/central_repo/``.
The central repo for development use is located at ``s3://rubin-pp-dev-users/central_repo/``.
You need developer credentials to access it, as described under `Buckets`_.
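
A quick way to confirm that the credentials work is to query the repo with the Butler command line (a sketch; any collection glob will do):

.. code-block:: sh
export AWS_PROFILE=prompt-processing-dev
butler query-collections s3://rubin-pp-dev-users/central_repo "LATISS/*"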

Migrating the Repo
@@ -190,6 +214,16 @@ In our case, we want to migrate to the versions that ``/repo/embargo`` is using,
However, when using ``butler migrate`` to update ``dimensions-config``, you should delete all existing pods to ensure that their replacements have the correct version.
This can be done using ``kubectl delete pod`` or from Argo CD (see `Development Service`_).
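
For example, a sketch of forcing pod replacement with ``kubectl`` (the namespace and pod name are placeholders, not the actual deployment values):

.. code-block:: sh
kubectl get pods -n <service-namespace>
kubectl delete pod <pod-name> -n <service-namespace>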

Adding New Dataset Types
------------------------

When pipelines change, it is sometimes necessary to register the new dataset types in the central repo to avoid ``MissingDatasetTypeError`` at prompt service export time.
One raw exposure has been ingested, visit-defined, and kept in the development central repo, so a ``pipetask`` command like the following can be run:

.. code-block:: sh
make_apdb.py -c db_url="sqlite:///apdb.db"
pipetask run -b s3://rubin-pp-dev-users/central_repo -i LATISS/raw/all,LATISS/defaults,LATISS/templates -o u/username/collection -d "detector=0 and instrument='LATISS' and exposure=2023082900500 and visit_system=0" -p $PROMPT_PROCESSING_DIR/pipelines/LATISS/ApPipe.yaml -c diaPipe:apdb.db_url=sqlite:///apdb.db --register-dataset-types
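
Afterwards, the registration can be confirmed by listing the dataset types known to the central repo (a sketch):

.. code-block:: sh
butler query-dataset-types s3://rubin-pp-dev-users/central_repo
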
Development Service
===================
@@ -294,13 +328,13 @@ There should be only one local repo per ``MiddlewareInterface`` object, and at t
If in doubt, check the logs first.


tester
======
Testers
=======

``python/tester/upload.py`` and ``python/tester/upload_hsc_rc2.py`` are scripts that simulate the CCS image writer.
They can be run from ``rubin-devl``, but require the user to install the ``confluent_kafka`` package in their environment.
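
One way to install the dependency (a sketch, assuming a writable user site directory):

.. code-block:: sh
pip install --user confluent-kafka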

You must have a profile set up for the ``rubin:rubin-pp`` bucket (see `Buckets`_, above).
You must have a profile set up for the ``rubin-pp-dev`` bucket (see `Buckets`_, above).

Install the Prompt Processing code, and set it up before use:

@@ -310,9 +344,7 @@ Install the Prompt Processing code, and set it up before use:
setup -r prompt_processing
The tester scripts send ``next_visit`` events for each detector via Kafka on the ``next-visit-topic`` topic.
They then upload a batch of files representing the snaps of the visit to the ``rubin:rubin-pp`` S3 bucket, simulating incoming raw images.

Eventually a set of parallel processes running on multiple nodes will be needed to upload the images sufficiently rapidly.
They then upload a batch of files representing the snaps of the visit to the ``rubin-pp-dev`` S3 bucket, simulating incoming raw images.

``python/tester/upload.py``: Command line arguments are the instrument name (currently HSC or LATISS) and the number of groups of images to send.

@@ -323,7 +355,7 @@ Sample command line:
python upload.py HSC 3
python upload.py LATISS 3
This script draws images stored in the ``rubin:rubin-pp-users`` bucket.
This script draws images stored in the ``rubin-pp-dev-users`` bucket.

* For HSC, 4 groups (10 raw files in total) are curated.
They are the COSMOS data as curated in `ap_verify_ci_cosmos_pdr2 <https://github.com/lsst/ap_verify_ci_cosmos_pdr2>`_.
@@ -343,6 +375,7 @@ Sample command line:
This script draws images from the curated ``HSC/RC2/defaults`` collection at USDF's ``/repo/main`` butler repository.
The source collection includes 432 visits, each with 103 detector images.
The visits are randomly selected and uploaded as one new group for each visit.
Images can be uploaded in parallel processes.
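
A hypothetical parallel invocation (the argument is an assumption; check the script's ``--help`` output for the actual interface):

.. code-block:: sh
# Hypothetical: launch three uploads in parallel, one randomly selected visit each.
for i in 1 2 3; do
    python upload_hsc_rc2.py 1 &
done
wait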


.. note::
125 changes: 124 additions & 1 deletion etc/db_butler.yaml
@@ -1,5 +1,128 @@
# Seed config for generating Prompt Processing central repo.
# This is the test repo to be used with upload.py, not the repo for processing AuxTel data.
datastore:
cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
records:
table: file_datastore_records
root: <butlerRoot>
registry:
db: postgresql://pp@usdf-prompt-processing-dev.slac.stanford.edu/ppcentralbutler
namespace: pp_central_repo
namespace: pp_dev_central_repo
managers:
attributes: lsst.daf.butler.registry.attributes.DefaultButlerAttributeManager
collections: lsst.daf.butler.registry.collections.synthIntKey.SynthIntKeyCollectionManager
datasets: lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID
datastores: lsst.daf.butler.registry.bridge.monolithic.MonolithicDatastoreRegistryBridgeManager
dimensions: lsst.daf.butler.registry.dimensions.static.StaticDimensionRecordStorageManager
opaque: lsst.daf.butler.registry.opaque.ByNameOpaqueTableStorageManager
obscore:
cls: lsst.daf.butler.registry.obscore._manager.ObsCoreLiveTableManager
config:
# Configuration copied from /repo/embargo
namespace: embargo
version: 1
facility_name: Rubin-LSST
# We don't explicitly specify instrument_name - it's auto-generated from the corresponding Butler dimension
obs_collection: LATISS_LIVE # may evolve as experience with this builds up
collection_type: RUN # means we are using all RUN-type collections that match line below
collections: ["LATISS/raw/all", "LATISS/runs/quickLook/202.*", "LATISS/runs/AUXTEL_DRP_IMAGING.*", "LATISS/prompt/.*"]
use_butler_uri: false # do not use URI from Butler, use datalink_url_fmt defined below
dataset_types:
raw:
dataproduct_type: image
dataproduct_subtype: lsst.raw
calib_level: 1
obs_id_fmt: "{records[exposure].obs_id}-{records[detector].full_name}"
o_ucd: phot.count
access_format: "application/x-votable+xml;content=datalink"
# access_url format is still under discussion/review:
datalink_url_fmt: "https://usdf-rsp.slac.stanford.edu/api/datalink/links?ID=butler%3A//embargo/{id}"
calexp:
dataproduct_type: image
dataproduct_subtype: lsst.calexp
calib_level: 2
obs_id_fmt: "{records[visit].name}-{records[detector].full_name}"
o_ucd: phot.count
access_format: "application/x-votable+xml;content=datalink"
# access_url format is still under discussion/review:
datalink_url_fmt: "https://usdf-rsp.slac.stanford.edu/api/datalink/links?ID=butler%3A//embargo/{id}"
quickLookExp:
dataproduct_type: image
dataproduct_subtype: lsst.quickLookExp
calib_level: 2
obs_id_fmt: "{records[exposure].obs_id}-{records[detector].full_name}"
o_ucd: phot.count
access_format: "application/x-votable+xml;content=datalink"
# access_url format is still under discussion/review:
datalink_url_fmt: "https://usdf-rsp.slac.stanford.edu/api/datalink/links?ID=butler%3A//embargo/{id}"
extra_columns:
lsst_visit:
template: "{visit}"
type: "int"
lsst_exposure:
template: "{exposure}"
type: "int"
lsst_detector:
template: "{detector}"
type: "int"
lsst_tract:
template: "{tract}"
type: "int"
lsst_patch:
template: "{patch}"
type: "int"
lsst_band:
template: "{band}"
type: "string"
length: 32
lsst_filter:
template: "{physical_filter}"
type: "string"
length: 32
lsst_dataset_type:
template: "{dataset_type}"
type: "string"
length: 64
lsst_run:
template: "{run}"
type: "string"
length: 255
indices:
# Indices for obscore table, spatial columns are indexed automatically.
# We likely will need to extend this list to support most popular queries,
# would be good to have a list of possible queries generated by TAP.
instrument_name_idx: instrument_name
lsst_visit_idx: lsst_visit
lsst_exposure_idx: lsst_exposure
dataproduct_idx: [dataproduct_type, dataproduct_subtype]
spectral_ranges:
# This list includes every band defined now in registry, actual values for some
# of them are probably very approximate. Keys in this section could be a band
# name or a physical filter name
"u": [330.0e-9, 400.0e-9]
"u~nd": [330.0e-9, 400.0e-9]
"g": [402.0e-9, 552.0e-9]
"g~nd": [402.0e-9, 552.0e-9]
"r": [552.0e-9, 691.0e-9]
"r~nd": [552.0e-9, 691.0e-9]
"i": [691.0e-9, 818.0e-9]
"i~nd": [691.0e-9, 818.0e-9]
"z": [818.0e-9, 922.0e-9]
"z~nd": [818.0e-9, 922.0e-9]
"y": [922.0e-9, 1060.0e-9]
"y~nd": [922.0e-9, 1060.0e-9]
"white": [null, null]
"unknown": [null, null]
"diffuser": [null, null]
"notch": [null, null]
"grid": [null, null]
"grid~nd": [null, null]
"spot": [null, null]
"spot~nd": [null, null]
spatial_plugins:
pgsphere:
# adds pgsphere columns and indices
cls: lsst.daf.butler.registry.obscore.pgsphere.PgSphereObsCorePlugin
config:
region_column: pgs_region # name of a column for a region/polygon
position_column: pgs_center # name of a column for position/center
4 changes: 2 additions & 2 deletions python/tester/upload.py
@@ -130,10 +130,10 @@ def main():
kafka_url = "https://usdf-rsp-dev.slac.stanford.edu/sasquatch-rest-proxy/topics/test.next-visit"
endpoint_url = "https://s3dfrgw.slac.stanford.edu"
s3 = boto3.resource("s3", endpoint_url=endpoint_url)
dest_bucket = s3.Bucket("rubin:rubin-pp")
dest_bucket = s3.Bucket("rubin-pp-dev")
dest_bucket.meta.client.meta.events.unregister("before-parameter-build.s3", validate_bucket_name)

src_bucket = s3.Bucket("rubin:rubin-pp-users")
src_bucket = s3.Bucket("rubin-pp-dev-users")
src_bucket.meta.client.meta.events.unregister("before-parameter-build.s3", validate_bucket_name)

last_group = get_last_group(dest_bucket, instrument, date)
2 changes: 1 addition & 1 deletion python/tester/upload_hsc_rc2.py
@@ -70,7 +70,7 @@ def _set_s3_bucket():
global dest_bucket
endpoint_url = "https://s3dfrgw.slac.stanford.edu"
s3 = boto3.resource("s3", endpoint_url=endpoint_url)
dest_bucket = s3.Bucket("rubin:rubin-pp")
dest_bucket = s3.Bucket("rubin-pp-dev")
dest_bucket.meta.client.meta.events.unregister("before-parameter-build.s3", validate_bucket_name)


