Skip to content

Commit

Permalink
Better support lossy networks for arbitrator pings
Browse files Browse the repository at this point in the history
The current implementation sets the socket timeout to 1s and, in some
error cases, retries until the timeout passed as raw_daemon_request() kwarg
(5s default) is reached.

On tcp loss, the retransmit occurs after 1s, so after the socket timeout
we configured... and the ETIMEDOUT errno was not considered as
retryable => a error log was logged after 1s instead of retrying for the
configured or default timeout, which is usually more than 1s.

This patch:

1/ implements the ETIMEDOUT errno handler, treating the same as
a socket.timeout, ie retry.

2/ fix a infinite retry loop situation, that was never
actually experienced in lab or never reported to us.

3/ set the socket timeout to 1.5s instead of 1s to allow at least one
tcp retransmit

4/ return the error from _ping() so the caller can decide to print or
log the cause of the ping stale.

Example:

$ om node ping --node 1.2.2.3
1.2.2.3 is not alive: timeout daemon request (connect error)
  • Loading branch information
cvaroqui committed Nov 15, 2023
1 parent f83b6b0 commit 407276a
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 11 deletions.
22 changes: 18 additions & 4 deletions opensvc/core/comm.py
Expand Up @@ -12,7 +12,7 @@
import time
import select
import sys
from errno import ECONNREFUSED, EPIPE, EBUSY, EALREADY, EAGAIN
from errno import ECONNREFUSED, EPIPE, EBUSY, EALREADY, EAGAIN, ETIMEDOUT


DEFAULT_DAEMON_TIMEOUT = 5
Expand Down Expand Up @@ -54,14 +54,14 @@ def to_bytes(x):
ConnectionResetError = DummyException
ConnectionRefusedError = DummyException

# add ECONNRESET, ENOTFOUND, ESOCKETTIMEDOUT, ETIMEDOUT, EHOSTUNREACH, ECONNREFUSED, ?
# add ECONNRESET, ENOTFOUND, ETIMEDOUT, EHOSTUNREACH, ECONNREFUSED, ?
RETRYABLE = (
EAGAIN,
EBUSY,
EPIPE,
EALREADY,
)
SOCK_TMO_REQUEST = 1.0
SOCK_TMO_REQUEST = 1.5
SOCK_TMO_STREAM = 6.2
PAUSE = 0.2
PING = ".".encode()
Expand Down Expand Up @@ -866,10 +866,24 @@ def raw_daemon_request(self, data, server=None, node=None, with_result=True, sil
"status": 1,
"err": "timeout daemon request (connect error)",
}
else:
raise
time.sleep(PAUSE)
continue
except socket.error as exc:
if exc.errno in RETRYABLE and \
if exc.errno == ETIMEDOUT:
elapsed += SOCK_TMO_REQUEST + PAUSE
if timeout == 0 or (timeout and elapsed >= timeout):
if with_result:
return {
"status": 1,
"err": "timeout daemon request (connect error)",
}
else:
raise
time.sleep(PAUSE)
continue
elif exc.errno in RETRYABLE and \
(timeout == 0 or (timeout and elapsed < timeout)):
# Resource temporarily unavailable (busy, overflow)
# Retry after a delay, if the daemon is still
Expand Down
18 changes: 12 additions & 6 deletions opensvc/core/node/node.py
Expand Up @@ -4218,7 +4218,7 @@ def _ping(self, node, timeout=5):
# relay must be tested from a cluster node
data = self.daemon_node_action(action="ping", options={"node": node}, node="ANY", action_mode=False)
status, error, info = self.parse_result(data)
return status
return status, error, info
else:
secret = None
for section in self.conf_sections("arbitrator"):
Expand All @@ -4245,9 +4245,12 @@ def _ping(self, node, timeout=5):
secret=secret,
timeout=timeout,
)
if data is None or "status" not in data or data["status"] != 0:
return 1
return 0
if data is None:
return 1, "", ""
elif "status" not in data or data["status"] != 0:
err = data.get("err", "")
return 1, err, ""
return 0, "", ""

def ping(self):
ret = 0
Expand All @@ -4264,7 +4267,7 @@ def ping(self):

def ping_node(self, node):
try:
ret = self._ping(node)
ret, error, _ = self._ping(node)
except ex.Error as exc:
print(exc)
ret = 2
Expand All @@ -4273,7 +4276,10 @@ def ping_node(self, node):
if ret == 0:
print("%s is alive" % node)
elif ret == 1:
print("%s is not alive" % node)
if error != "":
print("%s is not alive: %s" % (node, error))
else:
print("%s is not alive" % node)
return ret

def drain(self):
Expand Down
4 changes: 3 additions & 1 deletion opensvc/daemon/shared.py
Expand Up @@ -892,9 +892,11 @@ def in_maintenance_grace_period(self, nmon):
def arbitrators_votes(self):
votes = []
for arbitrator in NODE.arbitrators:
ret = NODE._ping(arbitrator["name"], arbitrator["timeout"])
ret, error, _ = NODE._ping(arbitrator["name"], arbitrator["timeout"])
if ret == 0:
votes.append(arbitrator["name"])
elif error != "":
self.log.warning("arbitrator %s stale, ping timeout %s: %s", arbitrator["name"], arbitrator["timeout"], error)
return votes

def live_nodes_count(self):
Expand Down

0 comments on commit 407276a

Please sign in to comment.