From 407276ab14d8ad5a19a836c577919ac3a4e10d88 Mon Sep 17 00:00:00 2001
From: Christophe Varoqui <christophe.varoqui@opensvc.com>
Date: Tue, 14 Nov 2023 17:45:31 +0100
Subject: [PATCH] Better support lossy networks for arbitrator pings

The current implementation sets the socket timeout to 1s and, in some
error cases, retries until the timeout passed as raw_daemon_request() kwarg
(5s default) is reached.

On tcp loss, the retransmit occurs after 1s, so after the socket timeout
we configured... and the ETIMEDOUT errno was not considered as
retryable => a error log was logged after 1s instead of retrying for the
configured or default timeout, which is usually more than 1s.

This patch:

1/ implements the ETIMEDOUT errno handler, treating the same as
a socket.timeout, ie retry.

2/ fix a infinite retry loop situation, that was never
actually experienced in lab or never reported to us.

3/ set the socket timeout to 1.5s instead of 1s to allow at least one
tcp retransmit

4/ return the error from _ping() so the caller can decide to print or
log the cause of the ping stale.

Example:

$ om node ping --node 1.2.2.3
1.2.2.3 is not alive: timeout daemon request (connect error)
---
 opensvc/core/comm.py      | 22 ++++++++++++++++++----
 opensvc/core/node/node.py | 18 ++++++++++++------
 opensvc/daemon/shared.py  |  4 +++-
 3 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/opensvc/core/comm.py b/opensvc/core/comm.py
index 6366a3e41..228e6dc4a 100644
--- a/opensvc/core/comm.py
+++ b/opensvc/core/comm.py
@@ -12,7 +12,7 @@
 import time
 import select
 import sys
-from errno import ECONNREFUSED, EPIPE, EBUSY, EALREADY, EAGAIN
+from errno import ECONNREFUSED, EPIPE, EBUSY, EALREADY, EAGAIN, ETIMEDOUT
 
 
 DEFAULT_DAEMON_TIMEOUT = 5
@@ -54,14 +54,14 @@ def to_bytes(x):
     ConnectionResetError = DummyException
     ConnectionRefusedError = DummyException
 
-# add ECONNRESET, ENOTFOUND, ESOCKETTIMEDOUT, ETIMEDOUT, EHOSTUNREACH, ECONNREFUSED, ?
+# add ECONNRESET, ENOTFOUND, ETIMEDOUT, EHOSTUNREACH, ECONNREFUSED, ?
 RETRYABLE = (
     EAGAIN,
     EBUSY,
     EPIPE,
     EALREADY,
 )
-SOCK_TMO_REQUEST = 1.0
+SOCK_TMO_REQUEST = 1.5
 SOCK_TMO_STREAM = 6.2
 PAUSE = 0.2
 PING = ".".encode()
@@ -866,10 +866,24 @@ def raw_daemon_request(self, data, server=None, node=None, with_result=True, sil
                                 "status": 1,
                                 "err": "timeout daemon request (connect error)",
                             }
+                        else:
+                            raise
                     time.sleep(PAUSE)
                     continue
                 except socket.error as exc:
-                    if exc.errno in RETRYABLE and \
+                    if exc.errno == ETIMEDOUT:
+                        elapsed += SOCK_TMO_REQUEST + PAUSE
+                        if timeout == 0 or (timeout and elapsed >= timeout):
+                            if with_result:
+                                return {
+                                    "status": 1,
+                                    "err": "timeout daemon request (connect error)",
+                                }
+                            else:
+                                raise
+                        time.sleep(PAUSE)
+                        continue
+                    elif exc.errno in RETRYABLE and \
                        (timeout == 0 or (timeout and elapsed < timeout)):
                         # Resource temporarily unavailable (busy, overflow)
                         # Retry after a delay, if the daemon is still
diff --git a/opensvc/core/node/node.py b/opensvc/core/node/node.py
index 68af24b52..c65cfb55c 100644
--- a/opensvc/core/node/node.py
+++ b/opensvc/core/node/node.py
@@ -4218,7 +4218,7 @@ def _ping(self, node, timeout=5):
             # relay must be tested from a cluster node
             data = self.daemon_node_action(action="ping", options={"node": node}, node="ANY", action_mode=False)
             status, error, info = self.parse_result(data)
-            return status
+            return status, error, info
         else:
             secret = None
             for section in self.conf_sections("arbitrator"):
@@ -4245,9 +4245,12 @@ def _ping(self, node, timeout=5):
             secret=secret,
             timeout=timeout,
         )
-        if data is None or "status" not in data or data["status"] != 0:
-            return 1
-        return 0
+        if data is None:
+            return 1, "", ""
+        elif "status" not in data or data["status"] != 0:
+            err = data.get("err", "")
+            return 1, err, ""
+        return 0, "", ""
 
     def ping(self):
         ret = 0
@@ -4264,7 +4267,7 @@ def ping(self):
 
     def ping_node(self, node):
         try:
-            ret = self._ping(node)
+            ret, error, _ = self._ping(node)
         except ex.Error as exc:
             print(exc)
             ret = 2
@@ -4273,7 +4276,10 @@ def ping_node(self, node):
         if ret == 0:
             print("%s is alive" % node)
         elif ret == 1:
-            print("%s is not alive" % node)
+            if error != "":
+                print("%s is not alive: %s" % (node, error))
+            else:
+                print("%s is not alive" % node)
         return ret
 
     def drain(self):
diff --git a/opensvc/daemon/shared.py b/opensvc/daemon/shared.py
index 857b774fb..cf4bb3d85 100644
--- a/opensvc/daemon/shared.py
+++ b/opensvc/daemon/shared.py
@@ -892,9 +892,11 @@ def in_maintenance_grace_period(self, nmon):
     def arbitrators_votes(self):
         votes = []
         for arbitrator in NODE.arbitrators:
-            ret = NODE._ping(arbitrator["name"], arbitrator["timeout"])
+            ret, error, _ = NODE._ping(arbitrator["name"], arbitrator["timeout"])
             if ret == 0:
                 votes.append(arbitrator["name"])
+            elif error != "":
+                self.log.warning("arbitrator %s stale, ping timeout %s: %s", arbitrator["name"], arbitrator["timeout"], error)
         return votes
 
     def live_nodes_count(self):