-
Notifications
You must be signed in to change notification settings - Fork 63
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added support for no synchronization callbacks #1478
Changes from 20 commits
9b111e2
3c642a3
e96eb9f
bf65d14
9a7ceb5
45543ba
887dff5
9d1155d
128159d
020f33a
f1b8afa
572ee61
df28866
4c4a2c1
7543e37
1df03d7
4bcdcb2
d63465f
aaa20b7
0d80cd0
b9709d9
8f95267
a5e69ea
cfcdce2
5961f7d
94881c6
69276c8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -130,3 +130,6 @@ stream.bin | |
|
||
# zenml e2e | ||
.zen | ||
|
||
# mocks | ||
MagicMock/ |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ | |
time, | ||
) | ||
from typing import ( | ||
Callable, | ||
List, | ||
Optional, | ||
) | ||
|
@@ -36,6 +37,11 @@ | |
from neptune.internal.container_type import ContainerType | ||
from neptune.internal.disk_queue import DiskQueue | ||
from neptune.internal.id_formats import UniqueId | ||
from neptune.internal.init.parameters import ( | ||
ASYNC_LAG_THRESHOLD, | ||
ASYNC_NO_PROGRESS_THRESHOLD, | ||
DEFAULT_STOP_TIMEOUT, | ||
) | ||
from neptune.internal.operation import Operation | ||
from neptune.internal.operation_processors.operation_processor import OperationProcessor | ||
from neptune.internal.operation_processors.operation_storage import ( | ||
|
@@ -50,7 +56,7 @@ | |
|
||
class AsyncOperationProcessor(OperationProcessor): | ||
STOP_QUEUE_STATUS_UPDATE_FREQ_SECONDS = 30 | ||
STOP_QUEUE_MAX_TIME_NO_CONNECTION_SECONDS = int(os.getenv(NEPTUNE_SYNC_AFTER_STOP_TIMEOUT, "300")) | ||
STOP_QUEUE_MAX_TIME_NO_CONNECTION_SECONDS = int(os.getenv(NEPTUNE_SYNC_AFTER_STOP_TIMEOUT, DEFAULT_STOP_TIMEOUT)) | ||
|
||
def __init__( | ||
self, | ||
|
@@ -60,6 +66,10 @@ def __init__( | |
lock: threading.RLock, | ||
sleep_time: float = 5, | ||
batch_size: int = 1000, | ||
async_lag_callback: Optional[Callable[[], None]] = None, | ||
async_lag_threshold: float = ASYNC_LAG_THRESHOLD, | ||
async_no_progress_callback: Optional[Callable[[], None]] = None, | ||
async_no_progress_threshold: float = ASYNC_NO_PROGRESS_THRESHOLD, | ||
): | ||
self._operation_storage = OperationStorage(self._init_data_path(container_id, container_type)) | ||
|
||
|
@@ -74,9 +84,17 @@ def __init__( | |
self._container_type = container_type | ||
self._backend = backend | ||
self._batch_size = batch_size | ||
self._async_lag_callback = async_lag_callback or (lambda: None) | ||
self._async_lag_threshold = async_lag_threshold | ||
self._async_no_progress_callback = async_no_progress_callback or (lambda: None) | ||
self._async_no_progress_threshold = async_no_progress_threshold | ||
self._last_version = 0 | ||
self._consumed_version = 0 | ||
self._consumer = self.ConsumerThread(self, sleep_time, batch_size) | ||
self._lock = lock | ||
self._last_ack = None | ||
self._lag_exceeded = False | ||
self._should_call_no_progress_callback = False | ||
|
||
# Caller is responsible for taking this lock | ||
self._waiting_cond = threading.Condition(lock=lock) | ||
|
@@ -88,6 +106,9 @@ def _init_data_path(container_id: UniqueId, container_type: ContainerType) -> Pa | |
return get_container_dir(ASYNC_DIRECTORY, container_id, container_type, process_path) | ||
|
||
def enqueue_operation(self, op: Operation, *, wait: bool) -> None: | ||
self._check_lag() | ||
self._check_no_progress() | ||
|
||
self._last_version = self._queue.put(op) | ||
if self._queue.size() > self._batch_size / 2: | ||
self._consumer.wake_up() | ||
|
@@ -107,6 +128,20 @@ def wait(self): | |
if not self._consumer.is_running(): | ||
raise NeptuneSynchronizationAlreadyStoppedException() | ||
|
||
def _check_lag(self): | ||
if not self._lag_exceeded and self._last_ack and monotonic() - self._last_ack > self._async_lag_threshold: | ||
with self._lock: | ||
if not self._lag_exceeded: | ||
self._async_no_progress_callback() | ||
self._lag_exceeded = True | ||
|
||
def _check_no_progress(self): | ||
if self._should_call_no_progress_callback: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we rewrite it slightly so that it's less indented?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, is it necessary to check this condition twice? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a common pattern, as lock acquiring is a heavy operation: https://en.wikipedia.org/wiki/Double-checked_locking There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
with self._lock: | ||
if self._should_call_no_progress_callback: | ||
self._async_no_progress_callback() | ||
self._should_call_no_progress_callback = False | ||
|
||
def flush(self): | ||
self._queue.flush() | ||
|
||
|
@@ -225,6 +260,7 @@ def __init__( | |
self._processor = processor | ||
self._batch_size = batch_size | ||
self._last_flush = 0 | ||
self._no_progress_exceeded = False | ||
|
||
def run(self): | ||
try: | ||
|
@@ -246,6 +282,12 @@ def work(self) -> None: | |
return | ||
self.process_batch([element.obj for element in batch], batch[-1].ver) | ||
|
||
def _check_no_progress(self): | ||
if not self._no_progress_exceeded: | ||
if monotonic() - self._processor._last_ack > self._processor._async_no_progress_threshold: | ||
self._no_progress_exceeded = True | ||
self._processor._should_call_no_progress_callback = True | ||
|
||
@Daemon.ConnectionRetryWrapper( | ||
kill_message=( | ||
"Killing Neptune asynchronous thread. All data is safe on disk and can be later" | ||
|
@@ -257,16 +299,27 @@ def process_batch(self, batch: List[Operation], version: int) -> None: | |
version_to_ack = version - expected_count | ||
while True: | ||
# TODO: Handle Metadata errors | ||
processed_count, errors = self._processor._backend.execute_operations( | ||
container_id=self._processor._container_id, | ||
container_type=self._processor._container_type, | ||
operations=batch, | ||
operation_storage=self._processor._operation_storage, | ||
) | ||
try: | ||
processed_count, errors = self._processor._backend.execute_operations( | ||
container_id=self._processor._container_id, | ||
container_type=self._processor._container_type, | ||
operations=batch, | ||
operation_storage=self._processor._operation_storage, | ||
) | ||
except Exception as e: | ||
self._check_no_progress() | ||
# Let default retry logic handle this | ||
raise e from e | ||
|
||
self._no_progress_exceeded = False | ||
|
||
version_to_ack += processed_count | ||
batch = batch[processed_count:] | ||
|
||
with self._processor._waiting_cond: | ||
self._processor._queue.ack(version_to_ack) | ||
self._processor._last_ack = monotonic() | ||
self._processor._lag_exceeded = False | ||
|
||
for error in errors: | ||
_logger.error( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also here a de-dent?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As you wish 😉