-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #268 from msgmaxim/reachable-nodes
Bookkeeping for reachability of nodes; reporting them to Lokid
- Loading branch information
Showing
14 changed files
with
449 additions
and
105 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
|
||
#include "reachability_testing.h" | ||
#include "loki_logger.h" | ||
|
||
using std::chrono::steady_clock; | ||
using namespace std::chrono_literals; | ||
|
||
namespace loki { | ||
|
||
namespace detail { | ||
|
||
/// Creates a record for a node whose failure is being observed right now:
/// both `first_failure` and `last_tested` are set to the same reading of
/// the steady clock.
reach_record_t::reach_record_t() {
    const auto now = steady_clock::now();
    first_failure = now;
    last_tested = now;
}
|
||
} // namespace detail | ||
|
||
/// Grace period: a node must have been failing for longer than this before
/// `record_unreachable` asks for it to be reported to Lokid.
constexpr std::chrono::minutes UNREACH_GRACE_PERIOD = std::chrono::minutes{120};
|
||
bool reachability_records_t::record_unreachable(const sn_pub_key_t& sn) { | ||
|
||
const auto it = offline_nodes_.find(sn); | ||
|
||
if (it == offline_nodes_.end()) { | ||
/// TODO: change this to debug | ||
LOKI_LOG(debug, "Adding a new node to UNREACHABLE: {}", sn); | ||
offline_nodes_.insert({sn, {}}); | ||
} else { | ||
LOKI_LOG(debug, "Node is ALREAY known to be UNREACHABLE: {}", sn); | ||
|
||
it->second.last_tested = steady_clock::now(); | ||
|
||
const auto elapsed = it->second.last_tested - it->second.first_failure; | ||
const auto elapsed_sec = | ||
std::chrono::duration_cast<std::chrono::seconds>(elapsed).count(); | ||
LOKI_LOG(debug, "First time failed {} seconds ago", elapsed_sec); | ||
|
||
/// TODO: Might still want to report as unreachable since this status | ||
/// gets reset to `true` on Lokid restart | ||
if (it->second.reported) { | ||
LOKI_LOG(debug, "Already reported node: {}", sn); | ||
} else if (elapsed > UNREACH_GRACE_PERIOD) { | ||
LOKI_LOG(debug, "Will REPORT this node to Lokid!"); | ||
return true; | ||
} | ||
|
||
} | ||
|
||
return false; | ||
} | ||
|
||
bool reachability_records_t::record_reachable(const sn_pub_key_t& sn) { | ||
expire(sn); | ||
} | ||
|
||
bool reachability_records_t::expire(const sn_pub_key_t& sn) { | ||
|
||
if (offline_nodes_.erase(sn)) { | ||
LOKI_LOG(debug, "Removed entry for {}", sn); | ||
} | ||
} | ||
|
||
void reachability_records_t::set_reported(const sn_pub_key_t& sn) { | ||
|
||
const auto it = offline_nodes_.find(sn); | ||
if (it != offline_nodes_.end()) { | ||
it->second.reported = true; | ||
} | ||
} | ||
|
||
/// Picks the tracked node whose `last_tested` timestamp is the oldest.
///
/// @return the public key of that node, or `boost::none` when no node is
///         currently tracked as unreachable.
boost::optional<sn_pub_key_t> reachability_records_t::next_to_test() {

    // Orders map entries by the time of their most recent test.
    const auto tested_earlier = [](const auto& a, const auto& b) {
        return a.second.last_tested < b.second.last_tested;
    };

    const auto oldest = std::min_element(offline_nodes_.begin(),
                                         offline_nodes_.end(), tested_earlier);

    if (oldest != offline_nodes_.end()) {
        LOKI_LOG(debug, "Selecting to be re-tested: {}", oldest->first);
        return oldest->first;
    }

    // Empty map: nothing to re-test.
    return boost::none;
}
|
||
} // namespace loki |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#pragma once | ||
|
||
#include "loki_common.h" | ||
#include <chrono> | ||
#include <unordered_map> | ||
|
||
namespace loki { | ||
|
||
namespace detail { | ||
|
||
/// Bookkeeping kept for a single node that failed a reachability test.
/// TODO: make this class "private"?
class reach_record_t {

    using time_point_t = std::chrono::steady_clock::time_point;

  public:
    reach_record_t();

    // When the node failed for the first time
    // (and hasn't come back online since)
    time_point_t first_failure;
    // When the node was most recently tested
    time_point_t last_tested;
    // Set once the node has been reported to Lokid
    bool reported = false;
};
} // namespace detail | ||
|
||
class reachability_records_t { | ||
|
||
// TODO: sn_records are heavy (3 strings), so how about we only store the | ||
// pubkey? | ||
|
||
// Nodes that failed the reachability test | ||
// Note: I don't expect this list to be large, so | ||
// `std::vector` is probably faster than `std::set` here | ||
std::unordered_map<sn_pub_key_t, detail::reach_record_t> offline_nodes_; | ||
|
||
public: | ||
// Return nodes that should be tested first: decommissioned nodes | ||
// and those that failed our earlier tests (but not reported yet) | ||
// std::vector<> priority_nodes() const; | ||
|
||
// Records node as unreachable, return `true` if the node should be | ||
// reported to Lokid as being unreachable for a long time | ||
bool record_unreachable(const sn_pub_key_t& sn); | ||
|
||
bool record_reachable(const sn_pub_key_t& sn); | ||
|
||
bool expire(const sn_pub_key_t& sn); | ||
|
||
void set_reported(const sn_pub_key_t& sn); | ||
|
||
// Retrun the least recently tested node | ||
boost::optional<sn_pub_key_t> next_to_test(); | ||
}; | ||
|
||
} // namespace loki |
Oops, something went wrong.