51 changes: 47 additions & 4 deletions python/activator/activator.py
@@ -58,7 +58,8 @@
from .exception import GracefulShutdownInterrupt, IgnorableVisit, InvalidVisitError, \
NonRetriableError, RetriableError
from .middleware_interface import get_central_butler, \
make_local_repo, make_local_cache, MiddlewareInterface
make_local_repo, make_local_cache, MiddlewareInterface, ButlerWriter, DirectButlerWriter
from .kafka_butler_writer import KafkaButlerWriter
from .repo_tracker import LocalRepoTracker

# Platform that prompt processing will run on
@@ -96,6 +97,23 @@
# The number of seconds to delay retrying connections to the Redis stream.
redis_retry = float(os.environ.get("REDIS_RETRY_DELAY", 30))

# If '1', sends outputs to a service for transfer into the central Butler
# repository instead of writing to the database directly.
use_kafka_butler_writer = os.environ.get("USE_KAFKA_BUTLER_WRITER", "0") == "1"
if use_kafka_butler_writer:
# Hostname of the Kafka cluster used by the Butler writer.
butler_writer_kafka_cluster = os.environ["BUTLER_WRITER_KAFKA_CLUSTER"]
# Username for authentication to BUTLER_WRITER_KAFKA_CLUSTER.
butler_writer_kafka_username = os.environ["BUTLER_WRITER_KAFKA_USERNAME"]
# Password for authentication to BUTLER_WRITER_KAFKA_CLUSTER.
butler_writer_kafka_password = os.environ["BUTLER_WRITER_KAFKA_PASSWORD"]
# Topic used to transfer output datasets to the central repository.
butler_writer_kafka_topic = os.environ["BUTLER_WRITER_KAFKA_TOPIC"]
# URI to the path where output datasets will be written when using the Kafka
# writer to transfer outputs to the central Butler repository.
# This will generally be in the same S3 bucket used by the central Butler.
butler_writer_file_output_path = os.environ["BUTLER_WRITER_FILE_OUTPUT_PATH"]

# Conditionally load keda environment variables
if platform == "keda":
# Time to wait for fanned out messages before spawning new pod.
@@ -163,6 +181,18 @@ def _get_consumer():
})


@functools.cache
def _get_producer():
"""Lazy initialization of Kafka Producer for Butler writer."""
return kafka.Producer({
"bootstrap.servers": butler_writer_kafka_cluster,
"security.protocol": "sasl_plaintext",
"sasl.mechanism": "SCRAM-SHA-512",
"sasl.username": butler_writer_kafka_username,
"sasl.password": butler_writer_kafka_password
})


@functools.cache
def _get_storage_client():
"""Lazy initialization of cloud storage reader."""
@@ -189,6 +219,19 @@ def _get_read_butler():
return _get_write_butler()


@functools.cache
def _get_butler_writer() -> ButlerWriter:
"""Lazy initialization of Butler writer."""
if use_kafka_butler_writer:
return KafkaButlerWriter(
_get_producer(),
output_topic=butler_writer_kafka_topic,
file_output_path=butler_writer_file_output_path
)
else:
return DirectButlerWriter(_get_write_butler())


@functools.cache
def _get_local_repo():
"""Lazy initialization of local repo.
@@ -461,7 +504,7 @@ def create_app():
_get_consumer()
_get_storage_client()
_get_read_butler()
_get_write_butler()
_get_butler_writer()
_get_local_repo()

app = flask.Flask(__name__)
@@ -510,7 +553,7 @@ def keda_start():
_get_consumer()
_get_storage_client()
_get_read_butler()
_get_write_butler()
_get_butler_writer()
_get_local_repo()

redis_session = RedisStreamSession(
@@ -1002,7 +1045,7 @@ def process_visit(expected_visit: FannedOutVisit):
# Create a fresh MiddlewareInterface object to avoid accidental
# "cross-talk" between different visits.
mwi = MiddlewareInterface(_get_read_butler(),
_get_write_butler(),
_get_butler_writer(),
image_bucket,
expected_visit,
pre_pipelines,
93 changes: 93 additions & 0 deletions python/activator/kafka_butler_writer.py
@@ -0,0 +1,93 @@
# This file is part of prompt_processing.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("KafkaButlerWriter",)

from datetime import date
from typing import Literal
from uuid import uuid4

from confluent_kafka import Producer
import pydantic

from lsst.daf.butler import (
Butler,
DatasetRef,
SerializedDimensionRecord,
SerializedFileDataset,
)
from lsst.resources import ResourcePath

from .middleware_interface import ButlerWriter, GroupedDimensionRecords


class KafkaButlerWriter(ButlerWriter):
def __init__(self, producer: Producer, *, output_topic: str, file_output_path: str) -> None:
self._producer = producer
self._output_topic = output_topic
self._file_output_path = ResourcePath(file_output_path, forceDirectory=True)

def transfer_outputs(
self, local_butler: Butler, dimension_records: GroupedDimensionRecords, datasets: list[DatasetRef]
) -> list[DatasetRef]:
# Create a subdirectory in the output root distinct to this processing
# run.
date_string = date.today().strftime("%Y-%m-%d")
subdirectory = f"{date_string}/{uuid4()}/"
output_directory = self._file_output_path.join(subdirectory, forceDirectory=True)
# There is no such thing as a directory in S3, but the Butler complains
# if there is not an object at the prefix of the export path.
output_directory.mkdir()

# Copy files to the output directory, and retrieve metadata required to
# ingest them into the central Butler.
file_datasets = local_butler._datastore.export(datasets, directory=output_directory, transfer="copy")

Member:
The use of ._datastore makes me a little nervous. While we're somewhat insulated by using fixed builds, is it possible to have a stable API for this?

Contributor (author):
Yes -- in the upcoming PR where the outputs get written in-place instead of in a separate tree, this will be changed to use a different public function. (_datastore is effectively "Rubin-internal-public" and is used throughout a lot of packages, but I got the team to agree to a less implicit naming convention for Rubin internal code in the future.)

# Serialize Butler data as a JSON string.
event = PromptProcessingOutputEvent(
type="pp-output",
dimension_records=_serialize_dimension_records(dimension_records),
datasets=[dataset.to_simple() for dataset in file_datasets],
root_directory=subdirectory,
)
message = event.model_dump_json()

self._producer.produce(self._output_topic, message)
self._producer.flush()

return datasets


class PromptProcessingOutputEvent(pydantic.BaseModel):
type: Literal["pp-output"]

Member:
I'm not familiar with pydantic, but I find this construct particularly baroque. You have to declare a field as having only one value, and then initialize it to that value anyway?

More generally, the responsibility for the serialized form (e.g., representing all collections as lists) seems to be split between this class, transfer_outputs, and _serialize_dimension_records. Is there a way to clean things up so that transfer_outputs doesn't have to do part of the serialization itself?

Contributor (author):
Yeah, it's an annoying quirk of Pydantic. You can say type: Literal["pp-output"] = "pp-output" at the point of definition, but that can lead to surprising behavior when the model is later used in a discriminated union. (It's not currently, but if we need another event type in the future we would distinguish them based on this type field.)

You're not really supposed to have methods with behavior on Pydantic models -- it's more of a schema definition than an actual class. I could add a separate helper function to do the serialization, but the main point of transfer_outputs is to do the serialization and it needs every local variable in scope there, so I don't see a problem with doing it in place.
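
For context, a minimal sketch of the discriminated-union dispatch being referred to, assuming Pydantic v2. SomeFutureEvent and its "pp-retraction" tag are hypothetical, and the real event fields are elided for brevity; this is not part of the PR.

```python
from typing import Annotated, Literal, Union

import pydantic


class PromptProcessingOutputEvent(pydantic.BaseModel):
    # Only the discriminator field is shown; the real model has more fields.
    type: Literal["pp-output"]


class SomeFutureEvent(pydantic.BaseModel):
    # Hypothetical second event type, included only to illustrate dispatch.
    type: Literal["pp-retraction"]


# With a discriminated union, Pydantic selects the concrete model by reading
# the "type" field instead of trying each union member in turn.
Event = Annotated[
    Union[PromptProcessingOutputEvent, SomeFutureEvent],
    pydantic.Field(discriminator="type"),
]

parsed = pydantic.TypeAdapter(Event).validate_json('{"type": "pp-output"}')
assert isinstance(parsed, PromptProcessingOutputEvent)
```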

Member:
Agreed that having it all be in the same module means it's not too big a problem; I was more worried about the ease of modifying the code when it has these redundant-but-matching lines.

(I'll point out that your argument assumes you have to use Pydantic for conversion to JSON. Personally, this kind of awkwardness is exactly why I don't like it.)
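
For comparison, a minimal sketch of the plain-json alternative being alluded to, not part of the PR. It assumes the arguments as they appear in transfer_outputs, and that the objects returned by .to_simple() are Pydantic v2 models so that model_dump(mode="json") yields JSON-compatible dicts.

```python
import json


def build_event_message(subdirectory, dimension_records, file_datasets) -> str:
    """Sketch of a plain-dict alternative to the Pydantic event model.

    Assumes the same local variables used in transfer_outputs and that
    .to_simple() returns Pydantic v2 models.
    """
    payload = {
        "type": "pp-output",
        "root_directory": subdirectory,
        "dimension_records": [
            record.to_simple().model_dump(mode="json")
            for records in dimension_records.values()
            for record in records
        ],
        "datasets": [
            dataset.to_simple().model_dump(mode="json")
            for dataset in file_datasets
        ],
    }
    return json.dumps(payload)
```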

root_directory: str
dimension_records: list[SerializedDimensionRecord]
datasets: list[SerializedFileDataset]


def _serialize_dimension_records(grouped_records: GroupedDimensionRecords) -> list[SerializedDimensionRecord]:
output = []
for records in grouped_records.values():
for item in records:
output.append(item.to_simple())
return output