-
Notifications
You must be signed in to change notification settings - Fork 47
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bookkeeping for reachability of nodes; reporting them to Lokid #268
Changes from 4 commits
a61195c
69aa778
4351dec
d6633a2
bab0ad5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
|
||
#include "reachability_testing.h" | ||
#include "loki_logger.h" | ||
|
||
using std::chrono::steady_clock; | ||
using namespace std::chrono_literals; | ||
|
||
namespace loki { | ||
|
||
namespace detail { | ||
|
||
reach_record_t::reach_record_t() { | ||
this->first_failure = steady_clock::now(); | ||
this->last_tested = this->first_failure; | ||
} | ||
|
||
} // namespace detail | ||
|
||
/// How long to wait until reporting unreachable nodes to Lokid | ||
constexpr std::chrono::minutes UNREACH_GRACE_PERIOD = 120min; | ||
|
||
bool reachability_records_t::record_unreachable(const sn_pub_key_t& sn) { | ||
|
||
auto it = offline_nodes_.find(sn); | ||
|
||
if (it == offline_nodes_.end()) { | ||
LOKI_LOG(info, "adding a new node to UNREACHABLE: {}", sn); | ||
offline_nodes_.insert({sn, {}}); | ||
} else { | ||
LOKI_LOG(info, "node is ALREAY known to be UNREACHABLE: {}", sn); | ||
|
||
it->second.last_tested = steady_clock::now(); | ||
|
||
const auto elapsed = it->second.last_tested - it->second.first_failure; | ||
const auto elapsed_sec = | ||
std::chrono::duration_cast<std::chrono::seconds>(elapsed).count(); | ||
LOKI_LOG(info, " - first time failed {} seconds ago", elapsed_sec); | ||
|
||
/// TODO: Might still want to report as unreachable since this status | ||
/// gets reset to `true` on Lokid restart | ||
if (elapsed > UNREACH_GRACE_PERIOD && !it->second.reported) { | ||
LOKI_LOG(warn, " - will REPORT this node to Lokid!"); | ||
return true; | ||
} else { | ||
if (it->second.reported) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Change to } else if (...) { ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, and I changed the order to simplify further. |
||
LOKI_LOG(warn, " - Already reported node: {}", sn); | ||
} | ||
} | ||
} | ||
|
||
return false; | ||
} | ||
|
||
bool reachability_records_t::record_reachable(const sn_pub_key_t& sn) { | ||
expire(sn); | ||
} | ||
|
||
bool reachability_records_t::expire(const sn_pub_key_t& sn) { | ||
|
||
if (offline_nodes_.erase(sn)) { | ||
LOKI_LOG(warn, " - removed entry for {}", sn); | ||
} | ||
} | ||
|
||
void reachability_records_t::set_reported(const sn_pub_key_t& sn) { | ||
|
||
auto it = offline_nodes_.find(sn); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Const iterator? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated |
||
if (it != offline_nodes_.end()) { | ||
it->second.reported = true; | ||
} | ||
} | ||
|
||
boost::optional<sn_pub_key_t> reachability_records_t::next_to_test() { | ||
|
||
const auto it = std::min_element( | ||
offline_nodes_.begin(), offline_nodes_.end(), | ||
[&](const auto& lhs, const auto& rhs) { | ||
return lhs.second.last_tested < rhs.second.last_tested; | ||
}); | ||
|
||
if (it == offline_nodes_.end()) { | ||
return boost::none; | ||
} else { | ||
|
||
LOKI_LOG(warn, "~~~ Selecting to be re-tested: {}", it->first); | ||
|
||
return it->first; | ||
} | ||
} | ||
|
||
} // namespace loki |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#pragma once | ||
|
||
#include "loki_common.h" | ||
#include <chrono> | ||
#include <unordered_map> | ||
|
||
namespace loki { | ||
|
||
namespace detail { | ||
|
||
/// TODO: make this class "private"? | ||
class reach_record_t { | ||
|
||
// The time the node failed for the first time | ||
// (and hasn't come back online) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this comment be above line 20? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, updated. |
||
|
||
using time_point_t = std::chrono::time_point<std::chrono::steady_clock>; | ||
|
||
public: | ||
time_point_t first_failure; | ||
time_point_t last_tested; | ||
// whether it's been reported to Lokid | ||
bool reported = false; | ||
|
||
reach_record_t(); | ||
}; | ||
} // namespace detail | ||
|
||
class reachability_records_t { | ||
|
||
// TODO: sn_records are heavy (3 strings), so how about we only store the | ||
// pubkey? | ||
|
||
// Nodes that failed the reachability test | ||
// Note: I don't expect this list to be large, so | ||
// `std::vector` is probably faster than `std::set` here | ||
std::unordered_map<sn_pub_key_t, detail::reach_record_t> offline_nodes_; | ||
|
||
public: | ||
// Return nodes that should be tested first: decommissioned nodes | ||
// and those that failed our earlier tests (but not reported yet) | ||
// std::vector<> priority_nodes() const; | ||
|
||
// Records node as unreachable, return `true` if the node should be | ||
// reported to Lokid as being unreachable for a long time | ||
bool record_unreachable(const sn_pub_key_t& sn); | ||
|
||
bool record_reachable(const sn_pub_key_t& sn); | ||
|
||
bool expire(const sn_pub_key_t& sn); | ||
|
||
void set_reported(const sn_pub_key_t& sn); | ||
|
||
// Retrun the least recently tested node | ||
boost::optional<sn_pub_key_t> next_to_test(); | ||
}; | ||
|
||
} // namespace loki |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the iterator can be const while still being able to mutate the element in the map.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, you are right. Updated.