DM-38307: Enable simpler yaml when no output collection. #45

Merged 1 commit on Apr 6, 2023
114 changes: 37 additions & 77 deletions config/bps_idf.yaml
@@ -1,72 +1,35 @@
# PANDA plugin specific settings:
# iddsServer: "https://aipanda015.cern.ch:443/idds"
placeholderParams: ['qgraphNodeId', 'qgraphId']
defaultPreCmdOpts: "--long-log --log-level=VERBOSE --log-file payload-log.json"
includeConfigs:
- ${CTRL_BPS_PANDA_DIR}/config/bps_panda.yaml

# lsst Docker image location in GAR (Google Artifact Registry)
# Must end with a slash. Override with an empty string to use images from Docker Hub instead.
dockerImageLocation: "us-central1-docker.pkg.dev/panda-dev-1a74/"

# Limit the number of jobs in a single PanDA task
maxJobsPerTask: 30000

# IDF PanDA specific settings:
# One cloud can have multiple sites. One site can have multiple queues.
# For the LSST cloud, there is only one site, LSST.
# If there are multiple sites in a cloud, computeSite can be used.
computeCloud: "LSST"
computeSite: "LSST"
project: dev
campaign: quick
computeCloud: LSST
computeSite: LSST
pipelineYaml: "${OBS_LSST_DIR}/pipelines/imsim/DRP.yaml#step1"

payload:
s3EndpointUrl: "https://storage.googleapis.com"
payloadFolder: payload
fileDistributionEndPoint: "s3://butler-us-central1-panda-dev/dc2/{payloadFolder}/{uniqProcName}/"
butlerConfig: s3://butler-us-central1-panda-dev/dc2/butler-external.yaml

# SLAC PanDA specific settings:
# computingCloud: US
# computeSite: DOMA_LSST_SLAC_TEST

executionButler:
queue: "DOMA_LSST_GOOGLE_MERGE"

pipetask:
pipetaskInit:
# This is different from the ctrl_bps default only in the addition of {fileDistributionEndPoint}
runQuantumCommand: >
${CTRL_MPEXEC_DIR}/bin/pipetask {initPreCmdOpts} run
-b {butlerConfig}
-i {inCollection}
-o {output}
--output-run {outputRun}
--qgraph {fileDistributionEndPoint}/{qgraphFile}
--qgraph-id {qgraphId} --qgraph-node-id {qgraphNodeId}
--clobber-outputs
--init-only
--extend-run {extraInitOptions}
forcedPhotCoadd:
queue: "DOMA_LSST_GOOGLE_TEST_HIMEM_NON_PREEMPT"

# This is different from the ctrl_bps default only in the addition of {fileDistributionEndPoint}
runQuantumCommand: >
${CTRL_MPEXEC_DIR}/bin/pipetask {runPreCmdOpts} run
-b {butlerConfig}
-i {inCollection}
-o {output}
--output-run {outputRun}
--qgraph {fileDistributionEndPoint}/{qgraphFile}
--qgraph-id {qgraphId}
--qgraph-node-id {qgraphNodeId}
--clobber-outputs
--skip-init-writes
--extend-run {extraRunQuantumOptions}
# Job environment setup
custom_lsst_setup: ""
setupLSSTEnv: >
source /opt/lsst/software/stack/loadLSST.bash;
setup lsst_distrib;
{custom_lsst_setup}

# A series of setup commands that precede the core SW execution, run under prmon (memory monitor)
runnerCommand: >
logDir=/tmp/panda/${PANDAID};
mkdir ${logDir};
# lsst Docker image location in GAR (Google Artifact Registry)
# Must end with a slash. Override with an empty string to use images from Docker Hub instead.
dockerImageLocation: "us-central1-docker.pkg.dev/panda-dev-1a74/"
payloadFolder: payload
fileDistributionEndPoint: "s3://butler-us-central1-panda-dev/dc2/{payloadFolder}/{uniqProcName}/"
s3EndpointUrl: "https://storage.googleapis.com"
jobLogDir: "/tmp/panda/${PANDAID}"
jobInitDir: "/tmp"
jobContainer: >
logFile=${logDir}/${REALTIME_LOGFILES};
touch ${logFile};
chmod ugo+w ${logFile} ${logDir}; ln -s ${logFile} ./;
chmod ugo+w ${logFile}; ln -s ${logFile} ./;
docker run
--rm
-v ${logDir}:${logDir}
@@ -77,19 +40,16 @@ runnerCommand: >
--env AWS_SECRET_ACCESS_KEY=$(</credentials/AWS_SECRET_ACCESS_KEY)
--env PGPASSWORD=$(</credentials/PGPASSWORD)
--env S3_ENDPOINT_URL=${S3_ENDPOINT_URL} {sw_image}
/bin/bash -c "source /opt/lsst/software/stack/loadLSST.bash;
cd /tmp;
ls -al;
setup lsst_distrib;
pwd;
prmon
-i 5
-f ${logDir}/prmon.txt
-j ${logDir}/prmon.json
--
python3 \${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py _cmd_line_;" >&2;
retStat=$?;
ln -fs ${logDir}/prmon.txt ./memory_monitor_output.txt;
ln -fs ${logDir}/prmon.json ./memory_monitor_summary.json;
exit $retStat
wmsServiceClass: lsst.ctrl.bps.panda.PanDAService
/bin/bash -c "{payloadCommand}" >&2;

jobCleanup: ""

# IDF is configured with many PanDA queues
# non-preemption is not currently a feature
# matched when selecting destination queue automatically.
executionButler:
queue: "DOMA_LSST_GOOGLE_MERGE"

pipetask:
forcedPhotCoadd:
queue: "DOMA_LSST_GOOGLE_TEST_HIMEM_NON_PREEMPT"
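
The refactor above moves site-specific job plumbing into discrete settings (`jobInitDir`, `jobLogDir`, `jobContainer`, `jobCleanup`) that a site config can override individually instead of replacing the whole `runnerCommand`. As an illustration only — the cloud, site, and path values below are hypothetical, not a real site — a minimal site override might look like:

```yaml
# Hypothetical site override built on the refactored settings; the
# directory paths and site names here are illustrative, not real sites.
computeCloud: "EXAMPLE"
computeSite: "EXAMPLE_SITE"

# Where the job starts and where prmon/non-payload logs go.
jobInitDir: "/tmp"
jobLogDir: "/tmp/panda/${PANDAID}"

# Run the payload directly on the host instead of in a container.
jobContainer: >
  /bin/bash -c "{payloadCommand}" >&2;

# Remove scratch repos left behind by the payload.
jobCleanup: "rm -fr EXEC_REPO-*;"
```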
30 changes: 0 additions & 30 deletions config/bps_idf_new.yaml

This file was deleted.

77 changes: 24 additions & 53 deletions config/bps_panda.yaml
@@ -1,6 +1,8 @@
# PANDA plugin specific settings:
wmsServiceClass: lsst.ctrl.bps.panda.PanDAService
placeholderParams: ['qgraphNodeId', 'qgraphId']
defaultPreCmdOpts: "--long-log --log-level=VERBOSE --log-file payload-log.json --log-label REPO={butlerConfig}"
qgraphPreCmdOpts: "--long-log --log-level=VERBOSE" # don't need payload-log.json on submit side.

# Limit the number of jobs in a single PanDA task
maxJobsPerTask: 30000
@@ -14,58 +16,18 @@ priority: 500
# Default number of retries
numberOfRetries: 3

pipetask:
pipetaskInit:
# This is different from the ctrl_bps default only in the addition of {fileDistributionEndPoint}
runQuantumCommand: >
${CTRL_MPEXEC_DIR}/bin/pipetask {initPreCmdOpts} run
-b {butlerConfig}
-i {inCollection}
-o {output}
--output-run {outputRun}
--qgraph {fileDistributionEndPoint}/{qgraphFile}
--qgraph-id {qgraphId} --qgraph-node-id {qgraphNodeId}
--clobber-outputs
--init-only
--extend-run {extraInitOptions}

# This is different from the ctrl_bps default only in the addition of {fileDistributionEndPoint}
runQuantumCommand: >
${CTRL_MPEXEC_DIR}/bin/pipetask {runPreCmdOpts} run
-b {butlerConfig}
-i {inCollection}
-o {output}
--output-run {outputRun}
--qgraph {fileDistributionEndPoint}/{qgraphFile}
--qgraph-id {qgraphId}
--qgraph-node-id {qgraphNodeId}
--clobber-outputs
--skip-init-writes
--extend-run {extraRunQuantumOptions}
# Job values which sites need to define for runnerCommand:
# custom_lsst_setup - user defined special environments
# setupLSSTEnv - how to get lsst_distrib set up
# jobLogDir - where prmon and other non-payload logs go
# jobInitDir - directory the job should be in when it starts
# jobContainer - any commands needed to set up container
# jobCleanup - any cleanup commands to run after job

# A series of setup commands that precede the core SW execution, run under prmon (memory monitor)
runnerCommand: >
logDir=/tmp/panda/${PANDAID};
mkdir ${logDir};
logFile=${logDir}/${REALTIME_LOGFILES};
touch ${logFile};
chmod ugo+w ${logFile} ${logDir}; ln -s ${logFile} ./;
file=${logDir}/memory_monitor_output.txt; touch ${file}; chmod ugo+w ${file}; ln -s ${file} ./;
file=${logDir}/memory_monitor_summary.json; touch ${file}; chmod ugo+w ${file}; ln -s ${file} ./;
docker run
--rm
-v ${logDir}:${logDir}
-v ${logFile}:/tmp/${REALTIME_LOGFILES}
--network host
--privileged
--env AWS_ACCESS_KEY_ID=$(</credentials/AWS_ACCESS_KEY_ID)
--env AWS_SECRET_ACCESS_KEY=$(</credentials/AWS_SECRET_ACCESS_KEY)
--env PGPASSWORD=$(</credentials/PGPASSWORD)
--env S3_ENDPOINT_URL=${S3_ENDPOINT_URL} {sw_image}
/bin/bash -c "source /opt/lsst/software/stack/loadLSST.bash;
cd /tmp;
payloadCommand: >
cd {jobInitDir};
ls -al;
setup lsst_distrib;
{setupLSSTEnv}
pwd;
python3 \${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py _cmd_line_ & pJob=\$!;
prmon -i 5
@@ -75,8 +37,17 @@ runnerCommand: >
wait \$pJob;
ret=\$?;
wait \$mJob;
exit \$ret;" >&2;
{jobCleanup}
exit \$ret;

runnerCommand: >
logDir={jobLogDir};
mkdir -p ${logDir};
chmod ugo+w ${logDir};
file=${logDir}/memory_monitor_output.txt; touch ${file}; chmod ugo+w ${file};
if [ ! -e memory_monitor_output.txt ]; then ln -s ${file} ./; fi;
file=${logDir}/memory_monitor_summary.json; touch ${file}; chmod ugo+w ${file};
if [ ! -e memory_monitor_summary.json ]; then ln -s ${file} ./; fi;
{jobContainer}
retStat=$?;
exit $retStat

wmsServiceClass: lsst.ctrl.bps.panda.PanDAService
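
The `payloadCommand` above backgrounds the decoder, attaches prmon to its PID, and then propagates the payload's exit status rather than the monitor's. A minimal sketch of that pattern — with `sleep` standing in for the real pipetask payload and a polling loop standing in for prmon, neither of which is part of the actual config:

```shell
# Sketch of the background payload + monitor pattern from payloadCommand.
sleep 1 & pJob=$!                      # launch the (stand-in) payload in the background
( while kill -0 "$pJob" 2>/dev/null; do
    sleep 0.2                          # a real monitor would sample memory here
  done ) & mJob=$!
wait "$pJob"; ret=$?                   # capture the payload's exit status
wait "$mJob"                           # let the monitor wind down cleanly
echo "payload exit status: $ret"       # the real command ends with: exit $ret
```

Waiting on the payload first, then the monitor, ensures the job's exit status reflects the payload and that the monitor is not killed before it flushes its output.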
37 changes: 15 additions & 22 deletions config/bps_usdf.yaml
@@ -7,39 +7,32 @@
campaign: quick
computeCloud: US
computeSite: SLAC
requestMemory: 2048
# PanDA does the scheduling based on memory request
s3EndpointUrl: "https://storage.googleapis.com"
payloadFolder: payload
fileDistributionEndPoint: "file:///sdf/group/rubin/sandbox/{operator}/panda_cache_box/{payloadFolder}/{uniqProcName}/"

# location of main butler repo at USDF
payload:
butlerConfig: /sdf/group/rubin/repo/main

# Job environment setup
custom_lsst_setup: ""
setupLSSTEnv: >
unset PYTHONPATH;
source /cvmfs/sw.lsst.eu/linux-x86_64/lsst_distrib/{LSST_VERSION}/loadLSST.bash;
setup lsst_distrib;
{custom_lsst_setup}

# Overrides the IDF runnerCommand; can be dropped once a universal runnerCommand is available in bps_panda.yaml
runnerCommand: >
unset PYTHONPATH;
{setupLSSTEnv}
pwd; ls -al;
python3 ${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py _cmd_line_ & pJob=$!;
prmon -i 5
-f memory_monitor_output.txt
-j memory_monitor_summary.json
-p $pJob & mJob=$!;
wait $pJob;
retStat=$?;
wait $mJob;
rm -fr EXEC_REPO-*;
exit $retStat
# Other job variables
jobInitDir: "`pwd`"
jobLogDir: "{jobInitDir}"
jobContainer: >
/bin/bash -c "{payloadCommand}" >&2;
jobCleanup: "rm -fr EXEC_REPO-*;"

payload:
s3EndpointUrl: "https://storage.googleapis.com"
butlerConfig: /sdf/group/rubin/repo/main
payloadFolder: payload
fileDistributionEndPoint: "file:///sdf/group/rubin/sandbox/{operator}/panda_cache_box/{payloadFolder}/{uniqProcName}/"

# Specify memory request for executionButler, pipetaskInit and forcedPhotCoadd, placeholder for now
requestMemory: 2048 # PanDA does the scheduling based on memory request

yamllint warning (GitHub Actions / call-workflow) on config/bps_usdf.yaml line 35: 35:21 [comments] too few spaces before comment
executionButler:
requestMemory: 7000
queue: "SLAC_Rubin_Merge"
3 changes: 3 additions & 0 deletions doc/changes/DM-38307.misc.rst
@@ -0,0 +1,3 @@
Update some default yaml values to allow parts to be modified more easily, and
provide values to go with the updated bps default yaml (e.g.,
runQuantumCommands are no longer needed).
5 changes: 3 additions & 2 deletions python/lsst/ctrl/bps/panda/utils.py
@@ -429,8 +429,9 @@ def add_decoder_prefix(config, cmd_line, distribution_path, files):
_LOG.debug("files_plc_hldr[%s] = %s", key, files_plc_hldr[key])

cmdline_hex = convert_exec_string_to_hex(cmd_line)
_, decoder_prefix = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False})
decoder_prefix = decoder_prefix.replace(
_, runner_command = config.search("runnerCommand", opt={"replaceEnvVars": False, "expandEnvVars": False})
runner_command = runner_command.replace("\n", " ")
decoder_prefix = runner_command.replace(
"_cmd_line_",
str(cmdline_hex)
+ " ${IN/L} "
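
The change above flattens the multi-line `runnerCommand` before splicing in the encoded payload command. A simplified sketch of that substitution — here `binascii` stands in for the plugin's `convert_exec_string_to_hex` helper, whose exact encoding is not shown in this diff:

```python
import binascii

def build_decoder_command(runner_command: str, cmd_line: str) -> str:
    """Sketch of the substitution done in add_decoder_prefix (simplified)."""
    # Encode the payload command line so it survives shell quoting; the
    # real helper is convert_exec_string_to_hex, assumed hex-like here.
    cmdline_hex = binascii.hexlify(cmd_line.encode()).decode()
    # runnerCommand comes from a YAML block scalar, so it can contain
    # newlines; it must be a single line before it is handed to PanDA.
    runner_command = runner_command.replace("\n", " ")
    # Splice the encoded payload in place of the _cmd_line_ marker.
    return runner_command.replace("_cmd_line_", cmdline_hex)

template = "logDir=/tmp/panda;\n{jobContainer} decoder.py _cmd_line_"
full = build_decoder_command(template, "pipetask run -b repo")
```

The newline flattening is the new step in this PR: previously `runnerCommand` was effectively single-line, but with `{jobContainer}` and `{payloadCommand}` assembled from block scalars, embedded newlines would otherwise leak into the final command string.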