From f0ba28d1ed000b2316d3c403206eba78dd7b4c50 Mon Sep 17 00:00:00 2001 From: Mike Kazantsev Date: Sat, 11 Jul 2015 19:25:27 +0500 Subject: [PATCH] collectors.cjdns_peer_stats: be more tolerant to timeouts --- graphite_metrics/collectors/cjdns_peer_stats.py | 11 ++++++++--- graphite_metrics/harvestd.yaml | 3 ++- setup.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/graphite_metrics/collectors/cjdns_peer_stats.py b/graphite_metrics/collectors/cjdns_peer_stats.py index b4191ad..78fbb67 100644 --- a/graphite_metrics/collectors/cjdns_peer_stats.py +++ b/graphite_metrics/collectors/cjdns_peer_stats.py @@ -224,11 +224,16 @@ def get_stats_page(self, page, password, bs=2**30): try: self.sock.send(BTE.bencode(req)) - resp = BTE.bdecode(self.sock.recv(bs)) - assert resp.get('txid') == req['txid'], [req, resp] - return resp['peers'], resp.get('more', False) + for n in xrange(self.conf.recv_retries + 1): + resp = BTE.bdecode(self.sock.recv(bs)) + if resp.get('txid') != req['txid']: # likely timed-out responses to old requests + log.warn('Received out-of-order response (n: %s, request: %s): %s', n, req, resp) + continue + return resp['peers'], resp.get('more', False) except Exception as err: raise PeerStatsFailure('Failure communicating with cjdns', err) + raise PeerStatsFailure( 'Too many bogus (wrong or no txid) responses' + ' in a row (count: {}), last req/res: {} / {}'.format(self.conf.recv_retries, req, resp) ) def get_peer_stats(self): peers, page, more = list(), 0, True diff --git a/graphite_metrics/harvestd.yaml b/graphite_metrics/harvestd.yaml index 037a97c..1f200a2 100644 --- a/graphite_metrics/harvestd.yaml +++ b/graphite_metrics/harvestd.yaml @@ -135,7 +135,8 @@ collectors: count: network.services.cjdns.peer_state.total # Prefix for counts of peers by state (e.g. "established", "unresponsive", etc). count_state: network.services.cjdns.peer_state - timeout: 2 # how long to wait for cjdns responses + timeout: 8 # how long to wait for cjdns responses + recv_retries: 10 # how many responses with wrong txid (likely prev timeouts) to tolerate # self_profiling: # TODO # main_loop: true diff --git a/setup.py b/setup.py index 4663f40..0e54127 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ setup( name = 'graphite-metrics', - version = '15.03.0', + version = '15.7.0', author = 'Mike Kazantsev', author_email = 'mk.fraggod@gmail.com', license = 'WTFPL',