diff --git a/docker/mongodb-kubernetes-tests/kubetester/mongodb_common.py b/docker/mongodb-kubernetes-tests/kubetester/mongodb_common.py
index f032ed6c1..a12010402 100644
--- a/docker/mongodb-kubernetes-tests/kubetester/mongodb_common.py
+++ b/docker/mongodb-kubernetes-tests/kubetester/mongodb_common.py
@@ -11,12 +11,26 @@ class MongoDBCommon:
     @TRACER.start_as_current_span("wait_for")
-    def wait_for(self, fn, timeout=None, should_raise=True):
+    def wait_for(self, fn, timeout=None, should_raise=True, persist_for=1):
+        """
+        Waits for the given function `fn` to return True, retrying until the timeout is reached.
+        If persist_for > 1, the function must return True for that many consecutive checks.
+        Optionally raises an exception if the condition is not met within the timeout.
+
+        Args:
+            fn: A callable that returns a boolean.
+            timeout: Maximum time to wait in seconds (default: 600).
+            should_raise: If True, raises an Exception on timeout (default: True).
+            persist_for: Number of consecutive successful checks required (default: 1).
+        Returns:
+            True if the condition is met within the timeout, otherwise raises Exception if `should_raise` is True.
+        """
         if timeout is None:
             timeout = 600
         initial_timeout = timeout
         wait = 3
+        retries = 0
         while timeout > 0:
             try:
                 self.reload()
@@ -24,7 +38,12 @@ def wait_for(self, fn, timeout=None, should_raise=True):
                 print(f"Caught error: {e} while waiting for {fn.__name__}")
                 pass
             if fn(self):
-                return True
+                retries += 1
+                if retries == persist_for:
+                    return True
+            else:
+                retries = 0
+
             timeout -= wait
             time.sleep(wait)
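For reference, a self-contained sketch of the polling semantics introduced above. The helper name below is made up for illustration and omits the reload/should_raise handling of the real wait_for: the predicate must be observed True on persist_for consecutive polls, spaced wait seconds apart, and any False observation resets the streak.

import time

def wait_until_persists(predicate, timeout=600, wait=3, persist_for=1):
    # Count consecutive successful checks; reset the streak on any failure.
    consecutive = 0
    while timeout > 0:
        if predicate():
            consecutive += 1
            if consecutive == persist_for:
                return True
        else:
            consecutive = 0
        timeout -= wait
        time.sleep(wait)
    return False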
diff --git a/docker/mongodb-kubernetes-tests/kubetester/opsmanager.py b/docker/mongodb-kubernetes-tests/kubetester/opsmanager.py
index 0a463c839..de536f15d 100644
--- a/docker/mongodb-kubernetes-tests/kubetester/opsmanager.py
+++ b/docker/mongodb-kubernetes-tests/kubetester/opsmanager.py
@@ -1017,13 +1017,7 @@ def is_om_multi_cluster(self):
         return self["spec"].get("topology", "") == "MultiCluster"


 class StatusCommon:
-    def assert_reaches_phase(
-        self,
-        phase: Phase,
-        msg_regexp=None,
-        timeout=None,
-        ignore_errors=False,
-    ):
+    def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=None, ignore_errors=False):
         intermediate_events = (
             # This can be an intermediate error, right before we check for this secret we create it.
             # The cluster might just be slow
             "failed to locate the api key secret",
             # etcd might be slow
             "etcdserver: request timed out",
         )
@@ -1057,6 +1051,41 @@ def assert_reaches_phase(
             f"Reaching phase {phase.name} for resource {self.__class__.__name__} took {end_time - start_time}s"
         )

+    def assert_persist_phase(self, phase: Phase, msg_regexp=None, timeout=None, ignore_errors=False, persist_for=3):
+        intermediate_events = (
+            # This can be an intermediate error, right before we check for this secret we create it.
+            # The cluster might just be slow
+            "failed to locate the api key secret",
+            # etcd might be slow
+            "etcdserver: request timed out",
+        )
+
+        start_time = time.time()
+        self.ops_manager.wait_for(
+            lambda s: in_desired_state(
+                current_state=self.get_phase(),
+                desired_state=phase,
+                current_generation=self.ops_manager.get_generation(),
+                observed_generation=self.get_observed_generation(),
+                current_message=self.get_message(),
+                msg_regexp=msg_regexp,
+                ignore_errors=ignore_errors,
+                intermediate_events=intermediate_events,
+            ),
+            timeout,
+            should_raise=True,
+            persist_for=persist_for,
+        )
+        end_time = time.time()
+        span = trace.get_current_span()
+        span.set_attribute("mck.resource", self.__class__.__name__)
+        span.set_attribute("mck.action", "assert_phase")
+        span.set_attribute("mck.desired_phase", phase.name)
+        span.set_attribute("mck.time_needed", end_time - start_time)
+        logger.debug(
+            f"Persist phase {phase.name} ({persist_for} consecutive checks) for resource {self.__class__.__name__} took {end_time - start_time}s"
+        )
+
     def assert_abandons_phase(self, phase: Phase, timeout=None):
         return self.ops_manager.wait_for(lambda s: self.get_phase() != phase, timeout, should_raise=True)
@@ -1113,6 +1142,9 @@ def assert_abandons_phase(self, phase: Phase, timeout=400):
     def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1000, ignore_errors=False):
         super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors)

+    def assert_persist_phase(self, phase: Phase, msg_regexp=None, timeout=1000, ignore_errors=False, persist_for=1):
+        super().assert_persist_phase(phase, msg_regexp, timeout, ignore_errors, persist_for=persist_for)
+
     def get_phase(self) -> Optional[Phase]:
         try:
             return Phase[self.ops_manager.get_status()["applicationDatabase"]["phase"]]
@@ -1159,6 +1191,9 @@ def assert_abandons_phase(self, phase: Phase, timeout=400):
     def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1200, ignore_errors=False):
         super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors)

+    def assert_persist_phase(self, phase: Phase, msg_regexp=None, timeout=1200, ignore_errors=False, persist_for=1):
+        super().assert_persist_phase(phase, msg_regexp, timeout, ignore_errors, persist_for=persist_for)
+
     def get_phase(self) -> Optional[Phase]:
         try:
             return Phase[self.ops_manager.get_status()["opsManager"]["phase"]]
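For orientation, a hedged usage sketch of the new assertion (ops_manager stands for the usual MongoDBOpsManager fixture, as in the tests changed below): with the 3-second poll interval of wait_for, persist_for=3 means the status must be observed Running on three consecutive polls before the assertion passes, rather than on the first observation.

# Sketch only; mirrors the call sites updated later in this diff.
ops_manager.appdb_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)
ops_manager.om_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)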
diff --git a/docker/mongodb-kubernetes-tests/tests/multicluster_appdb/multicluster_appdb_upgrade_downgrade_v1_27_to_mck.py b/docker/mongodb-kubernetes-tests/tests/multicluster_appdb/multicluster_appdb_upgrade_downgrade_v1_27_to_mck.py
index d3deeaa79..6e04a50ea 100644
--- a/docker/mongodb-kubernetes-tests/tests/multicluster_appdb/multicluster_appdb_upgrade_downgrade_v1_27_to_mck.py
+++ b/docker/mongodb-kubernetes-tests/tests/multicluster_appdb/multicluster_appdb_upgrade_downgrade_v1_27_to_mck.py
@@ -256,7 +256,7 @@ def test_scale_appdb(self, ops_manager: MongoDBOpsManager):
         # Reordering the clusters triggers a change in the state
         ops_manager["spec"]["applicationDatabase"]["clusterSpecList"] = scale_on_upgrade.cluster_spec
         ops_manager.update()
-        ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=500)
+        ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=600)
         ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=250)

     def test_migrated_state_correctness(
diff --git a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_snippets.py b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_snippets.py
index 1fa4fb48c..308df6ca9 100644
--- a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_snippets.py
+++ b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_snippets.py
@@ -104,7 +104,7 @@ def test_running(namespace: str):
         try:
             logger.debug(f"Waiting for {sc.name} to reach Running phase")
             # Once the first resource reached Running, it shouldn't take more than ~300s for the others to do so
-            sc.assert_reaches_phase(Phase.Running, timeout=900 if first_iter else 300)
+            sc.assert_reaches_phase(Phase.Running, timeout=1200 if first_iter else 300)
             succeeded_resources.append(sc.name)
             first_iter = False
             logger.info(f"{sc.name} reached Running phase")
diff --git a/docker/mongodb-kubernetes-tests/tests/upgrades/appdb_tls_operator_upgrade_v1_32_to_mck.py b/docker/mongodb-kubernetes-tests/tests/upgrades/appdb_tls_operator_upgrade_v1_32_to_mck.py
index 939c69afe..a3589c30c 100644
--- a/docker/mongodb-kubernetes-tests/tests/upgrades/appdb_tls_operator_upgrade_v1_32_to_mck.py
+++ b/docker/mongodb-kubernetes-tests/tests/upgrades/appdb_tls_operator_upgrade_v1_32_to_mck.py
@@ -161,8 +161,11 @@ def test_upgrade_operator(


 @mark.e2e_appdb_tls_operator_upgrade_v1_32_to_mck
 def test_om_tls_ok(ops_manager_tls: MongoDBOpsManager):
     ops_manager_tls.load()
-    ops_manager_tls.appdb_status().assert_reaches_phase(Phase.Running, timeout=900)
-    ops_manager_tls.om_status().assert_reaches_phase(Phase.Running, timeout=900)
+    # We use assert_persist_phase here to ensure that the status stays in the Running phase for some time,
+    # to avoid false positives caused by a transient Running state before OM reconciliation starts.
+    # TODO: Revert after fixing the root issue (https://jira.mongodb.org/browse/CLOUDP-364841)
+    ops_manager_tls.appdb_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)
+    ops_manager_tls.om_status().assert_persist_phase(Phase.Running, timeout=900, persist_for=3)
     ops_manager_tls.get_om_tester().assert_healthiness()