From 25c5ffe095f42925679a3d101104de40845c8f1b Mon Sep 17 00:00:00 2001 From: Hsin-Fang Chiang Date: Fri, 31 Oct 2025 11:12:33 -0700 Subject: [PATCH 1/4] Factor out the sidecar file reading --- python/shared/raw.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/python/shared/raw.py b/python/shared/raw.py index bcb57cba..2bec3dec 100644 --- a/python/shared/raw.py +++ b/python/shared/raw.py @@ -387,6 +387,23 @@ def get_group_id_from_oid(oid: str) -> str: oid.removesuffix(m["extension"]) + ".json" ) + group_id = _get_group_id_from_sidecar(sidecar) + return group_id + + +def _get_group_id_from_sidecar(sidecar): + """Read the group id from a sidecar JSON file. + + Parameters + ---------- + sidecar : `lsst.resources.ResourcePath` + URI to a sidecar JSON file. + + Returns + ------- + group_id : `str` + The group identifier as a string. + """ # Wait a bit but not too long for the file. # It should normally show up before the image. count = 0 From b4bc0c2fe2178d3b43beeb43ce35481be4c8179a Mon Sep 17 00:00:00 2001 From: Hsin-Fang Chiang Date: Fri, 31 Oct 2025 12:31:48 -0700 Subject: [PATCH 2/4] Retry reading JSON sidecar with two approches Previously, we only retry when the sidecar does not exist, for occasional slower file transfer from summit. But it's possible that the sidecar exists but is being re-written and hence cannot be read temporarily. We have seen intermittent ConcurrentModification errors from the embargo Ceph S3. It could have been due to summit re-sending successful transfer or other transient storage issues. Hence, this retries in a longer timescale when S3 ClientError is seen. This also combines existence checking and file reading to one operation so to avoid race. --- python/shared/raw.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/python/shared/raw.py b/python/shared/raw.py index 2bec3dec..ad45056a 100644 --- a/python/shared/raw.py +++ b/python/shared/raw.py @@ -44,12 +44,14 @@ import time import urllib.parse +import botocore.exceptions import requests from lsst.obs.lsst import LsstCam, LsstCamImSim, LsstComCam, LsstComCamSim from lsst.obs.lsst.translators.lsst import LsstBaseTranslator from lsst.resources import ResourcePath +from .connect_utils import retry from .visit import FannedOutVisit _log = logging.getLogger("lsst." + __name__) @@ -391,9 +393,14 @@ def get_group_id_from_oid(oid: str) -> str: return group_id +@retry(4, botocore.exceptions.ClientError, wait=5) def _get_group_id_from_sidecar(sidecar): """Read the group id from a sidecar JSON file. + The sidecar file normally show up before the image. If not present, wait a + bit but not too long for the file. Sometimes, the object store gives other + transient ClientErrors, which are retried in a longer timescale. + Parameters ---------- sidecar : `lsst.resources.ResourcePath` @@ -404,19 +411,16 @@ def _get_group_id_from_sidecar(sidecar): group_id : `str` The group identifier as a string. """ - # Wait a bit but not too long for the file. - # It should normally show up before the image. count = 0 - while not sidecar.exists(): - count += 1 - if count > 20: - raise RuntimeError(f"Unable to retrieve JSON sidecar: {sidecar}") - time.sleep(0.1) - - with sidecar.open("r") as f: - md = json.load(f) - - return md.get("GROUPID", "") + while count <= 20: + try: + md = json.loads(sidecar.read()) + return md.get("GROUPID", "") + # If no such sidecar exists, a FileNotFoundError is raised. + except FileNotFoundError: + count += 1 + time.sleep(0.1) + raise RuntimeError(f"Unable to retrieve JSON sidecar: {sidecar}") def get_raw_path(instrument, detector, group, snap, exposure_id, physical_filter): From b42ac8dffba3a3b38cd316d00f2fa6c7d73c4d3a Mon Sep 17 00:00:00 2001 From: Hsin-Fang Chiang Date: Fri, 31 Oct 2025 12:45:04 -0700 Subject: [PATCH 3/4] Retry image download --- python/activator/middleware_interface.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/activator/middleware_interface.py b/python/activator/middleware_interface.py index 66e9f7c9..0a7b89a3 100644 --- a/python/activator/middleware_interface.py +++ b/python/activator/middleware_interface.py @@ -1198,6 +1198,7 @@ def _prep_pipeline_graph(self, pipeline_file) -> lsst.pipe.base.PipelineGraph: """ return self._prep_pipeline(pipeline_file).to_graph() + @connect.retry(2, botocore.exceptions.ClientError, wait=5) def _download(self, remote): """Download an image located on a remote store. From fd08e3969fd36e80c7c1dfe8aa0203ac2304fb7a Mon Sep 17 00:00:00 2001 From: Hsin-Fang Chiang Date: Tue, 18 Nov 2025 11:52:14 -0800 Subject: [PATCH 4/4] Use a faster library to deserialize json --- python/shared/raw.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/shared/raw.py b/python/shared/raw.py index ad45056a..83ef6f73 100644 --- a/python/shared/raw.py +++ b/python/shared/raw.py @@ -37,7 +37,6 @@ "get_raw_path", ] -import json import logging import os import re @@ -45,6 +44,7 @@ import urllib.parse import botocore.exceptions +import pydantic_core import requests from lsst.obs.lsst import LsstCam, LsstCamImSim, LsstComCam, LsstComCamSim @@ -414,7 +414,7 @@ def _get_group_id_from_sidecar(sidecar): count = 0 while count <= 20: try: - md = json.loads(sidecar.read()) + md = pydantic_core.from_json(sidecar.read()) return md.get("GROUPID", "") # If no such sidecar exists, a FileNotFoundError is raised. except FileNotFoundError: