diff --git a/opensvc/core/comm.py b/opensvc/core/comm.py index 6366a3e41..228e6dc4a 100644 --- a/opensvc/core/comm.py +++ b/opensvc/core/comm.py @@ -12,7 +12,7 @@ import time import select import sys -from errno import ECONNREFUSED, EPIPE, EBUSY, EALREADY, EAGAIN +from errno import ECONNREFUSED, EPIPE, EBUSY, EALREADY, EAGAIN, ETIMEDOUT DEFAULT_DAEMON_TIMEOUT = 5 @@ -54,14 +54,14 @@ def to_bytes(x): ConnectionResetError = DummyException ConnectionRefusedError = DummyException -# add ECONNRESET, ENOTFOUND, ESOCKETTIMEDOUT, ETIMEDOUT, EHOSTUNREACH, ECONNREFUSED, ? +# add ECONNRESET, ENOTFOUND, ETIMEDOUT, EHOSTUNREACH, ECONNREFUSED, ? RETRYABLE = ( EAGAIN, EBUSY, EPIPE, EALREADY, ) -SOCK_TMO_REQUEST = 1.0 +SOCK_TMO_REQUEST = 1.5 SOCK_TMO_STREAM = 6.2 PAUSE = 0.2 PING = ".".encode() @@ -866,10 +866,24 @@ def raw_daemon_request(self, data, server=None, node=None, with_result=True, sil "status": 1, "err": "timeout daemon request (connect error)", } + else: + raise time.sleep(PAUSE) continue except socket.error as exc: - if exc.errno in RETRYABLE and \ + if exc.errno == ETIMEDOUT: + elapsed += SOCK_TMO_REQUEST + PAUSE + if timeout == 0 or (timeout and elapsed >= timeout): + if with_result: + return { + "status": 1, + "err": "timeout daemon request (connect error)", + } + else: + raise + time.sleep(PAUSE) + continue + elif exc.errno in RETRYABLE and \ (timeout == 0 or (timeout and elapsed < timeout)): # Resource temporarily unavailable (busy, overflow) # Retry after a delay, if the daemon is still diff --git a/opensvc/core/node/node.py b/opensvc/core/node/node.py index 68af24b52..c65cfb55c 100644 --- a/opensvc/core/node/node.py +++ b/opensvc/core/node/node.py @@ -4218,7 +4218,7 @@ def _ping(self, node, timeout=5): # relay must be tested from a cluster node data = self.daemon_node_action(action="ping", options={"node": node}, node="ANY", action_mode=False) status, error, info = self.parse_result(data) - return status + return status, error, info else: secret = None for section in self.conf_sections("arbitrator"): @@ -4245,9 +4245,12 @@ def _ping(self, node, timeout=5): secret=secret, timeout=timeout, ) - if data is None or "status" not in data or data["status"] != 0: - return 1 - return 0 + if data is None: + return 1, "", "" + elif "status" not in data or data["status"] != 0: + err = data.get("err", "") + return 1, err, "" + return 0, "", "" def ping(self): ret = 0 @@ -4264,7 +4267,7 @@ def ping(self): def ping_node(self, node): try: - ret = self._ping(node) + ret, error, _ = self._ping(node) except ex.Error as exc: print(exc) ret = 2 @@ -4273,7 +4276,10 @@ def ping_node(self, node): if ret == 0: print("%s is alive" % node) elif ret == 1: - print("%s is not alive" % node) + if error != "": + print("%s is not alive: %s" % (node, error)) + else: + print("%s is not alive" % node) return ret def drain(self): diff --git a/opensvc/daemon/shared.py b/opensvc/daemon/shared.py index 857b774fb..cf4bb3d85 100644 --- a/opensvc/daemon/shared.py +++ b/opensvc/daemon/shared.py @@ -892,9 +892,11 @@ def in_maintenance_grace_period(self, nmon): def arbitrators_votes(self): votes = [] for arbitrator in NODE.arbitrators: - ret = NODE._ping(arbitrator["name"], arbitrator["timeout"]) + ret, error, _ = NODE._ping(arbitrator["name"], arbitrator["timeout"]) if ret == 0: votes.append(arbitrator["name"]) + elif error != "": + self.log.warning("arbitrator %s stale, ping timeout %s: %s", arbitrator["name"], arbitrator["timeout"], error) return votes def live_nodes_count(self):