Add heartbeat monitor to PD doctor role

commit 3493c65c72eb91a12de457ccc3ee848a5ea5b4f1 (parent: db78ad5)
labisso authored
51 epu/processdispatcher/core.py
@@ -464,12 +464,11 @@ def evacuate_node(self, node, is_system_restart=False,
# mark resource ineligible for scheduling
self._disable_resource(resource)
- # go through and reschedule processes as needed
- for owner, upid, round in resource.assigned:
- self._evacuate_process(owner, upid, resource,
- is_system_restart=is_system_restart,
- dead_process_state=dead_process_state,
- rescheduled_process_state=rescheduled_process_state)
+ self.evacuate_resource(
+ resource,
+ is_system_restart=is_system_restart,
+ dead_process_state=dead_process_state,
+ rescheduled_process_state=rescheduled_process_state)
try:
self.store.remove_resource(resource_id)
@@ -489,6 +488,15 @@ def _disable_resource(self, resource):
except WriteConflictError:
resource = self.store.get_resource(resource.resource_id)
+ def evacuate_resource(self, resource, is_system_restart=False,
+ dead_process_state=None, rescheduled_process_state=None):
+ # go through and reschedule processes as needed
+ for owner, upid, round in resource.assigned:
+ self._evacuate_process(owner, upid, resource,
+ is_system_restart=is_system_restart,
+ dead_process_state=dead_process_state,
+ rescheduled_process_state=rescheduled_process_state)
+
def _evacuate_process(self, owner, upid, resource, is_system_restart=False,
dead_process_state=None, rescheduled_process_state=None):
"""Deal with a process on a terminating/terminated node
@@ -710,6 +718,22 @@ def process_should_restart(self, process, exit_state, is_system_restart=False):
return should_restart
+ def resource_change_state(self, resource, newstate):
+ updated = False
+ while resource and resource.state not in (ExecutionResourceState.DISABLED, newstate):
+ resource.state = newstate
+ try:
+ self.store.update_resource(resource)
+ updated = True
+ except NotFoundError:
+ resource = None
+ except WriteConflictError:
+ try:
+ resource = self.store.get_resource(resource.resource_id)
+ except NotFoundError:
+ resource = None
+ return resource, updated
+
def _first_heartbeat(self, sender, beat):
node_id = beat.get('node_id')
@@ -746,7 +770,7 @@ def _first_heartbeat(self, sender, beat):
log.exception("Node for EEagent %s has invalid domain_id!", sender)
return
- engine_spec = self.ee_registry.get_engine_by_id(engine_id)
+ engine_spec = self.get_engine(engine_id)
slots = engine_spec.slots
# just making engine type a generic property/constraint for now,
@@ -771,6 +795,19 @@ def _first_heartbeat(self, sender, beat):
# no problem if this resource was just created by another worker
log.info("Conflict writing new resource record %s. Ignoring.", sender)
+ def get_resource_engine(self, resource):
+ """Return an execution engine spec for a resource
+ """
+ engine_id = resource.properties.get('engine')
+ if not engine_id:
+ raise ValueError("Resource has no engine property")
+ return self.get_engine(engine_id)
+
+ def get_engine(self, engine_id):
+ """Return an execution engine spec object
+ """
+ return self.ee_registry.get_engine_by_id(engine_id)
+
def node_add_resource(self, node, resource_id):
"""Tentatively adds a resource to a node, retrying if conflict
387 epu/processdispatcher/doctor.py
@@ -1,7 +1,12 @@
import logging
import threading
+import heapq
+from collections import namedtuple
+from datetime import timedelta
-from epu.states import ProcessState, ProcessDispatcherState
+from epu.util import now_datetime, ensure_timedelta
+from epu.states import ProcessState, ProcessDispatcherState, ExecutionResourceState
+from epu import tevent
log = logging.getLogger(__name__)
@@ -16,21 +21,25 @@ class PDDoctor(object):
doctor establishes the current state of the system before allowing any
requests to be processed.
Tracks heartbeats from Execution Engine Agents (EEAgents). When an agent
- hasn't been heard from in too long, the doctor first attempts to provoke
- a response from the agent. If that fails, the agent is marked as disabled
- and ultimately its VM is killed. (TODO)
+ hasn't been heard from in too long, the agent is marked as disabled.
"""
_PROCESS_STATES_TO_REQUEUE = (ProcessState.REQUESTED,
ProcessState.DIED_REQUESTED, ProcessState.WAITING)
- def __init__(self, core, store):
+ CONFIG_MONITOR_HEARTBEATS = "monitor_resource_heartbeats"
+
+ def __init__(self, core, store, config=None):
"""
@type core: ProcessDispatcherCore
@type store: ProcessDispatcherStore
"""
self.core = core
self.store = store
+ self.config = config or {}
+
+ self.monitor = None
+ self.monitor_thread = None
self.condition = threading.Condition()
@@ -62,15 +71,25 @@ def inaugurate(self):
self.watching_system_boot = True
self._watch_system_boot()
- # TODO this is where we can set up the heartbeat monitor
+ with self.condition:
+ if self.is_leader and self.config.get(self.CONFIG_MONITOR_HEARTBEATS, True):
+ self.monitor = ExecutionResourceMonitor(self.core, self.store)
+ self.monitor_thread = tevent.spawn(self.monitor.monitor)
while self.is_leader:
with self.condition:
self.condition.wait()
+ log.debug("Waiting on monitor thread to exit")
+ self.monitor_thread.join()
+ self.monitor = None
+ self.monitor_thread = None
+
def cancel(self):
with self.condition:
self.is_leader = False
+ if self.monitor:
+ self.monitor.cancel()
self.condition.notify_all()
def _watch_system_boot(self, *args):
@@ -153,3 +172,359 @@ def schedule_pending_processes(self):
if updated:
self.store.enqueue_process(process.owner, process.upid,
process.round)
+
+
+class ExecutionResourceMonitor(object):
+ """Implements missing heartbeat detection for execution resources
+
+ Execution resources emit heartbeats to the Process Dispatcher (PD).
+ These heartbeats serve two purposes:
+ 1. To notify the PD of state changes of managed processes
+ 2. To allow the PD to detect when resources have stalled or otherwise
+ disappeared.
+
+ Heartbeats are sent on a regular interval, and are also sent immediately
+ on a process state change. If the PD is aware of this interval, it can
+ detect when something is wrong by comparing the last heartbeat time to
+ the current time. However there may be clock sync or message queue backup
+ delays that interfere with this detection. So we exercise restraint in
+ declaring resources dead. The following algorithm is used:
+
+ - Each resource has a timestamp indicating the time the last heartbeat
+ was *sent*. Each resource also has warning_time and missing_time
+ thresholds
+
+ - When a resource's last heartbeat time is older than the warning_time
+ threshold, the resource is declared to be in the WARNING state. This is
+ based purely on the time delta and does not take into account potential
+ clock differences or messaging delays. The result of this state change
+ is some noisy logging but no countermeasures are used.
+
+ - If the resource stays in the WARNING state for missing_time-warning_time
+ *without any heartbeats received* as observed by the Doctor, the
+ resource is declared MISSING. This will not happen simply because the
+ last heartbeat time is older than missing_time. If a heartbeat is
+ received from a resource in the WARNING state, this event "resets" the
+ doctor's clock, regardless of the timestamp on the heartbeat. This means
+ that resources may stay in the WARNING state indefinitely if their clocks
+ are significantly off from the Doctor's clock.
+
+ - If at any point in this process the resource heartbeats resume or
+ "catch up" with warning_time, the resource is moved back to the OK
+ state. Likewise, if a resource is DISABLED it is ignored by the doctor.
+ """
+
+ _MONITOR_ERROR_DELAY_SECONDS = 60
+
+ _now_func = staticmethod(now_datetime)
+
+ def __init__(self, core, store):
+ self.core = core
+ self.store = store
+
+ self.condition = threading.Condition()
+ self.cancelled = False
+
+ self.resources = {}
+ self.resource_set_changed = True
+ self.changed_resources = set()
+
+ self.resource_checks = _ResourceChecks()
+
+ # local times at which resources were moved to the WARNING state.
+ self.resource_warnings = {}
+
+ def cancel(self):
+ with self.condition:
+ self.cancelled = True
+ self.condition.notify_all()
+
+ def monitor(self):
+ """Monitor execution resources until cancelled
+ """
+
+ self.resource_set_changed = True
+
+ while not self.cancelled:
+ try:
+ delay = self.monitor_cycle()
+
+ except Exception:
+ log.exception("Unexpected error monitoring heartbeats")
+ delay = self._MONITOR_ERROR_DELAY_SECONDS
+
+ # wait until the next check time, unless a resource update or cancel request
+ # happens first
+ if delay is None or delay > 0:
+ with self.condition:
+ if not any((self.cancelled, self.changed_resources,
+ self.resource_set_changed)):
+ self.condition.wait(delay)
+
+ def monitor_cycle(self):
+ """Run a single cycle of the monitor
+
+ returns the number of seconds until the next needed check, or None if no checks are needed
+ """
+ changed = self._update()
+ now = self._now_func()
+
+ for resource_id in changed:
+ resource = self.resources.get(resource_id)
+
+ # skip resource and remove any checks if it is removed or disabled
+ if resource is None or resource.state == ExecutionResourceState.DISABLED:
+ self.resource_checks.discard_resource_check(resource_id)
+ try:
+ del self.resource_warnings[resource_id]
+ except KeyError:
+ pass
+ continue
+
+ # otherwise, queue up the resource to be looked at in this cycle
+ self.resource_checks.set_resource_check(resource_id, now)
+
+ # walk the resource checks, up to the current time. This data structure
+ # ensures that we only check resources that have either been updated, or
+ # are due to timeout
+ for resource_id in self.resource_checks.walk_through_time(now):
+
+ if self.cancelled: # back out early if cancelled
+ return
+
+ try:
+ resource = self.resources[resource_id]
+ self._check_one_resource(resource, now)
+ except Exception:
+ log.exception("Problem checking execution resource %s. Will retry.")
+ if resource_id not in self.resource_checks:
+ next_check_time = now + timedelta(seconds=self._MONITOR_ERROR_DELAY_SECONDS)
+ self.resource_checks.set_resource_check(resource_id, next_check_time)
+
+ # return the number of seconds until the next expected check, or None
+ next_check_time = self.resource_checks.next_check_time
+ if next_check_time is not None:
+ return max((next_check_time - self._now_func()).total_seconds(), 0)
+ return None
+
+ def _check_one_resource(self, resource, now):
+
+ last_heartbeat = resource.last_heartbeat_datetime
+ warning_threshold, missing_threshold = self._get_resource_thresholds(resource)
+ if warning_threshold is None or missing_threshold is None:
+ return
+
+ log.debug("Examining heartbeat for resource %s (in state %s)",
+ resource.resource_id, resource.state)
+
+ resource_id = resource.resource_id
+ next_check_time = None
+
+ # this could end up negative in case of unsynced or shifting clocks
+ heartbeat_age = now - last_heartbeat
+
+ if resource.state == ExecutionResourceState.OK:
+
+ # go to WARNING state if heartbeat is older than threshold
+ if heartbeat_age >= warning_threshold:
+ self._mark_resource_warning(resource, now, last_heartbeat)
+ next_check_time = now + (missing_threshold - warning_threshold)
+ else:
+ next_check_time = last_heartbeat + warning_threshold
+
+ elif resource.state == ExecutionResourceState.WARNING:
+
+ # our real threshold is the gap between missing and warning
+ threshold = missing_threshold - warning_threshold
+
+ warning = self.resource_warnings.get(resource_id)
+
+ if heartbeat_age < warning_threshold:
+ self._mark_resource_ok(resource, now, last_heartbeat)
+
+ elif not warning or warning.last_heartbeat != last_heartbeat:
+ self.resource_warnings[resource_id] = _ResourceWarning(resource_id, last_heartbeat, now)
+ next_check_time = now + threshold
+
+ else:
+ delta = now - warning.warning_time
+ if delta >= threshold:
+ self._mark_resource_missing(resource, now, last_heartbeat)
+ else:
+ next_check_time = warning.warning_time + threshold
+
+ elif resource.state == ExecutionResourceState.MISSING:
+ # go OK if a heartbeat has been received in warning_time threshold
+ # go WARNING if a heartbeat has been received in missing_time threshold
+ # set check time
+ if heartbeat_age < warning_threshold:
+ self._mark_resource_ok(resource, now, last_heartbeat)
+
+ elif heartbeat_age < missing_threshold:
+ self._mark_resource_warning(resource, now, last_heartbeat)
+
+ elif resource.assigned:
+ # if resource has assignments, evacuate them.
+ # likely we would only get into this situation if we previously
+ # marked the resource MISSING but failed before we evacuated
+ # everything
+ self.core.evacuate_resource(resource)
+
+ if next_check_time is not None:
+ self.resource_checks.set_resource_check(resource_id, next_check_time)
+
+ def _mark_resource_warning(self, resource, now, last_heartbeat):
+ resource, updated = self.core.resource_change_state(resource,
+ ExecutionResourceState.WARNING)
+ if updated:
+ resource_id = resource.resource_id
+ heartbeat_age = (now - last_heartbeat).total_seconds()
+ self.resource_warnings[resource_id] = _ResourceWarning(
+ resource_id, last_heartbeat, now)
+ log.warn("Execution resource %s is in WARNING state: Last known "
+ "heartbeat was sent %s seconds ago. This may be a legitimately "
+ "missing resource, OR it may be an issue of clock sync or "
+ "messaging delays.", resource.resource_id, heartbeat_age)
+
+ def _mark_resource_missing(self, resource, now, last_heartbeat):
+ resource, updated = self.core.resource_change_state(resource,
+ ExecutionResourceState.MISSING)
+ if updated:
+ resource_id = resource.resource_id
+
+ try:
+ del self.resource_warnings[resource_id]
+ except KeyError:
+ pass
+
+ heartbeat_age = (now - last_heartbeat).total_seconds()
+ log.warn("Execution resource %s is MISSING: Last known "
+ "heartbeat was sent %s seconds ago. ", resource.resource_id, heartbeat_age)
+
+ self.core.evacuate_resource(resource)
+
+ def _mark_resource_ok(self, resource, now, last_heartbeat):
+ resource, updated = self.core.resource_change_state(resource,
+ ExecutionResourceState.OK)
+ if updated:
+ heartbeat_age = (now - last_heartbeat).total_seconds()
+ log.info("Execution resource %s is OK again: Last known "
+ "heartbeat was sent %s seconds ago. ", resource.resource_id, heartbeat_age)
+
+ def _get_resource_thresholds(self, resource):
+ engine = self.core.get_resource_engine(resource)
+ warning = engine.heartbeat_warning
+ if warning is not None:
+ warning = ensure_timedelta(warning)
+
+ missing = engine.heartbeat_missing
+ if missing is not None:
+ missing = ensure_timedelta(missing)
+
+ return warning, missing
+
+ def _get_resource_missing_threshold(self, resource):
+ engine = self.core.get_resource_engine(resource)
+ return ensure_timedelta(engine.heartbeat_missing)
+
+ def _notify_resource_set_changed(self, *args):
+ with self.condition:
+ self.resource_set_changed = True
+ self.condition.notify_all()
+
+ def _notify_resource_changed(self, resource_id, *args):
+ with self.condition:
+ self.changed_resources.add(resource_id)
+ self.condition.notify_all()
+
+ def _update(self):
+ changed = set()
+ if self.resource_set_changed:
+ changed.update(self._update_resource_set())
+ if self.changed_resources:
+ changed.update(self._update_resources())
+ return changed
+
+ def _update_resource_set(self):
+ self.resource_set_changed = False
+ resource_ids = set(self.store.get_resource_ids(
+ watcher=self._notify_resource_set_changed))
+
+ previous = set(self.resources.keys())
+
+ added = resource_ids - previous
+ removed = previous - resource_ids
+
+ for resource_id in removed:
+ del self.resources[resource_id]
+
+ for resource_id in added:
+ self.resources[resource_id] = self.store.get_resource(
+ resource_id, watcher=self._notify_resource_changed)
+
+ return added | removed
+
+ def _update_resources(self):
+ with self.condition:
+ changed = self.changed_resources.copy()
+ self.changed_resources.clear()
+
+ for resource_id in changed:
+ resource = self.store.get_resource(resource_id,
+ watcher=self._notify_resource_changed)
+ if resource:
+ self.resources[resource_id] = resource
+
+ return changed
+
+
+_ResourceWarning = namedtuple("_ResourceWarning",
+ ["resource_id", "last_heartbeat", "warning_time"])
+
+
+class _ResourceChecks(object):
+
+ def __init__(self):
+ self.checks = {}
+ self.check_heap = []
+
+ def __contains__(self, item):
+ return item in self.checks
+
+ def set_resource_check(self, resource_id, check_time):
+
+ # do nothing if check already exists and is correct
+ if self.checks.get(resource_id) == check_time:
+ return
+ self.checks[resource_id] = check_time
+ heapq.heappush(self.check_heap, (check_time, resource_id))
+
+ def discard_resource_check(self, resource_id):
+ try:
+ del self.checks[resource_id]
+ except KeyError:
+ pass
+
+ def walk_through_time(self, now):
+ while self.check_heap:
+
+ # heap guarantees smallest element is at index 0, so we can peek before we pop
+ check_time, resource_id = self.check_heap[0]
+
+ # skip and drop entries that are no longer represented in our check set
+ if check_time != self.checks.get(resource_id):
+ heapq.heappop(self.check_heap)
+
+ elif check_time <= now:
+ heapq.heappop(self.check_heap)
+ del self.checks[resource_id]
+ yield resource_id
+
+ else:
+ break
+
+ @property
+ def next_check_time(self):
+ if self.check_heap:
+ return self.check_heap[0][0]
+ return None
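The _ResourceChecks helper keeps the authoritative next-check time for each resource in a dict and mirrors it into a heap of (check_time, resource_id) pairs; rescheduling simply pushes a new heap entry, and stale entries are dropped during the walk when they no longer match the dict. A small usage sketch of that behaviour (assuming the class is importable from epu.processdispatcher.doctor as added above; the IDs and times are made up):

from datetime import datetime, timedelta
from epu.processdispatcher.doctor import _ResourceChecks

checks = _ResourceChecks()
now = datetime(2012, 3, 13, 9, 30, 0)

# schedule two resources, then pull eeagent_1's check earlier; the original
# heap entry for eeagent_1 becomes stale and is skipped during the walk
checks.set_resource_check("eeagent_1", now + timedelta(seconds=10))
checks.set_resource_check("eeagent_2", now + timedelta(seconds=20))
checks.set_resource_check("eeagent_1", now + timedelta(seconds=5))

# walking up to :15 yields only eeagent_1; eeagent_2 is not due yet
assert list(checks.walk_through_time(now + timedelta(seconds=15))) == ["eeagent_1"]
assert checks.next_check_time == now + timedelta(seconds=20)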
38 epu/processdispatcher/engines.py
@@ -1,3 +1,5 @@
+from epu.util import ensure_timedelta
+
DOMAIN_PREFIX = "pd_domain_"
@@ -40,7 +42,10 @@ def from_config(cls, config, default=None):
replicas=engine_conf.get('replicas', 1),
spare_slots=engine_conf.get('spare_slots', 0),
iaas_allocation=engine_conf.get('iaas_allocation', None),
- maximum_vms=engine_conf.get('maximum_vms', None))
+ maximum_vms=engine_conf.get('maximum_vms', None),
+ heartbeat_period=engine_conf.get('heartbeat_period', 30),
+ heartbeat_warning=engine_conf.get('heartbeat_warning'),
+ heartbeat_missing=engine_conf.get('heartbeat_missing'))
registry.add(spec)
return registry
@@ -64,10 +69,13 @@ def get_engine_by_id(self, engine):
return self.by_engine[engine]
-class EngineSpec(object):
+_DEFAULT_HEARTBEAT_PERIOD = 30
+
+class EngineSpec(object):
def __init__(self, engine_id, slots, base_need=0, config=None, replicas=1,
- spare_slots=0, iaas_allocation=None, maximum_vms=None):
+ spare_slots=0, iaas_allocation=None, maximum_vms=None,
+ heartbeat_period=30, heartbeat_warning=45, heartbeat_missing=60):
self.engine_id = engine_id
self.config = config
self.base_need = int(base_need)
@@ -94,3 +102,27 @@ def __init__(self, engine_id, slots, base_need=0, config=None, replicas=1,
if maximum_vms < 0:
raise ValueError("maximum vms must be at least 0")
self.maximum_vms = maximum_vms
+
+ self.heartbeat_period = heartbeat_period
+ self.heartbeat_warning = heartbeat_warning
+ self.heartbeat_missing = heartbeat_missing
+
+ if (heartbeat_missing is None or heartbeat_warning is None) and not (
+ heartbeat_missing is None and heartbeat_warning is None):
+ raise ValueError("All heartbeat parameters must be specified, or none")
+
+ if self.heartbeat_period is None:
+ self.heartbeat_period = ensure_timedelta(_DEFAULT_HEARTBEAT_PERIOD)
+ else:
+ self.heartbeat_period = ensure_timedelta(self.heartbeat_period)
+
+ if self.heartbeat_missing is not None:
+ self.heartbeat_warning = ensure_timedelta(self.heartbeat_warning)
+ self.heartbeat_missing = ensure_timedelta(self.heartbeat_missing)
+
+ if self.heartbeat_period <= ensure_timedelta(0):
+ raise ValueError("heartbeat_period must be a positive value")
+ if self.heartbeat_warning <= self.heartbeat_period:
+ raise ValueError("heartbeat_warning must be greater than heartbeat_period")
+ if self.heartbeat_missing <= self.heartbeat_warning:
+ raise ValueError("heartbeat_missing must be greater than heartbeat_warning")
7 epu/processdispatcher/matchmaker.py
@@ -174,6 +174,8 @@ def _get_domain_config(self, engine, initial_n=0):
if engine_conf['provisioner_vars'].get('replicas') is None:
engine_conf['provisioner_vars']['replicas'] = engine.replicas
+ engine_conf['provisioner_vars']['heartbeat'] = engine.heartbeat_period.total_seconds()
+
engine_conf['preserve_n'] = initial_n
return config
@@ -632,6 +634,11 @@ def get_available_resources(self):
# first break down available resources by node
available_by_node = defaultdict(list)
for resource in self.resources.itervalues():
+
+ # only consider OK resources. We definitely don't want to consider
+ # MISSING or DISABLED resources. We could arguably include WARNING
+ # resources, but perhaps at a lower priority than OK.
+
if resource.state == ExecutionResourceState.OK and resource.available_slots:
node_id = resource.node_id
available_by_node[node_id].append(resource)
3  epu/processdispatcher/test/mocks.py
@@ -4,6 +4,7 @@
import logging
import threading
from collections import defaultdict
+from datetime import datetime
from epu.epumanagement.conf import * # noqa
from epu.exceptions import NotFoundError
@@ -261,5 +262,7 @@ def minimum_time_between_starts_config(minimum_time=2):
def make_beat(node_id, processes=None, timestamp=None):
+ if timestamp and isinstance(timestamp, datetime):
+ timestamp = timestamp.isoformat()
return {"node_id": node_id, "processes": processes or [],
"timestamp": timestamp or now_datetime().isoformat()}
141 epu/processdispatcher/test/test_doctor.py
@@ -2,28 +2,33 @@
import logging
import time
import uuid
+from datetime import timedelta, datetime
-import epu.tevent as tevent
+from mock import Mock
-from epu.processdispatcher.doctor import PDDoctor
+import epu.tevent as tevent
+from epu.processdispatcher.doctor import PDDoctor, ExecutionResourceMonitor
from epu.processdispatcher.core import ProcessDispatcherCore
from epu.processdispatcher.modes import RestartMode
from epu.processdispatcher.store import ProcessDispatcherStore, ProcessDispatcherZooKeeperStore
from epu.processdispatcher.test.mocks import MockResourceClient, MockNotifier
from epu.processdispatcher.store import ProcessRecord
from epu.processdispatcher.engines import EngineRegistry, domain_id_from_engine
-from epu.states import ProcessState, InstanceState, ProcessDispatcherState
+from epu.states import ProcessState, InstanceState, ProcessDispatcherState, ExecutionResourceState
from epu.processdispatcher.test.test_store import StoreTestMixin
from epu.processdispatcher.test.mocks import nosystemrestart_process_config, make_beat
from epu.test import ZooKeeperTestMixin
from epu.test.util import wait
+from epu.util import UTC
+
log = logging.getLogger(__name__)
class PDDoctorTests(unittest.TestCase, StoreTestMixin):
- engine_conf = {'engine1': {'slots': 4}}
+ engine_conf = {'engine1': {'slots': 4, 'heartbeat_period': 5,
+ 'heartbeat_warning': 10, 'heartbeat_missing': 20}}
def setUp(self):
self.store = self.setup_store()
@@ -36,11 +41,14 @@ def setUp(self):
self.docthread = None
+ self.monitor = None
+
def tearDown(self):
if self.docthread:
self.doctor.cancel()
self.docthread.join()
self.docthread = None
+
self.teardown_store()
def setup_store(self):
@@ -241,6 +249,125 @@ def test_initialized_not_system_boot(self):
# we have nothing really to check here, yet. but at least we can make sure
# the process is cancellable.
+ def test_monitor_thread(self):
+ self._run_in_thread()
+
+ assert self.store.wait_initialized(timeout=10)
+ self.assertEqual(self.store.get_pd_state(),
+ ProcessDispatcherState.OK)
+
+ self.assertIsNotNone(self.doctor.monitor)
+ monitor_thread = self.doctor.monitor_thread
+ self.assertIsNotNone(monitor_thread)
+ self.assertTrue(monitor_thread.is_alive())
+
+ # now cancel doctor. monitor should stop too
+ self.doctor.cancel()
+ wait(lambda: not monitor_thread.is_alive())
+
+ def _setup_resource_monitor(self):
+ self.monitor = ExecutionResourceMonitor(self.core, self.store)
+ return self.monitor
+
+ def _send_heartbeat(self, resource_id, node_id, timestamp):
+ self.core.ee_heartbeat(resource_id, make_beat(node_id, timestamp=timestamp))
+
+ def assert_monitor_cycle(self, expected_delay, resource_states=None):
+ self.assertEqual(expected_delay, self.monitor.monitor_cycle())
+
+ if resource_states:
+ for resource_id, expected_state in resource_states.iteritems():
+ found_state = self.store.get_resource(resource_id).state
+ if found_state != expected_state:
+ self.fail("Resource %s state = %s. Expected %s" % (resource_id,
+ found_state, expected_state))
+
+ def test_resource_monitor(self):
+ t0 = datetime(2012, 3, 13, 9, 30, 0, tzinfo=UTC)
+ mock_now = Mock()
+ mock_now.return_value = t0
+
+ def increment_now(seconds):
+ t = mock_now.return_value + timedelta(seconds=seconds)
+ mock_now.return_value = t
+ log.debug("THE TIME IS NOW: %s", t)
+ return t
+
+ monitor = self._setup_resource_monitor()
+ monitor._now_func = mock_now
+
+ # before there are any resources, monitor should work but return a None delay
+ self.assertIsNone(monitor.monitor_cycle())
+
+ self.core.node_state("node1", domain_id_from_engine("engine1"),
+ InstanceState.RUNNING)
+
+ # 3 resources. all report in at t0
+ r1, r2, r3 = "eeagent_1", "eeagent_2", "eeagent_3"
+ self._send_heartbeat(r1, "node1", t0)
+ self._send_heartbeat(r2, "node1", t0)
+ self._send_heartbeat(r3, "node1", t0)
+
+ states = {r1: ExecutionResourceState.OK, r2: ExecutionResourceState.OK,
+ r3: ExecutionResourceState.OK}
+
+ self.assert_monitor_cycle(10, states)
+
+ t1 = increment_now(5) # :05
+ # heartbeat comes in for r1 5 seconds later
+ self._send_heartbeat(r1, "node1", t1)
+
+ self.assert_monitor_cycle(5, states)
+
+ increment_now(5) # :10
+
+ # no heartbeats for r2 and r3. they should be marked WARNING
+ states[r2] = ExecutionResourceState.WARNING
+ states[r3] = ExecutionResourceState.WARNING
+ self.assert_monitor_cycle(5, states)
+
+ increment_now(4) # :14
+
+ # r2 gets a heartbeat through, but its timestamp puts it still in the warning threshold
+ self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=1))
+
+ self.assert_monitor_cycle(1, states)
+
+ increment_now(6) # :20
+
+ # r1 should go warning, r3 should go missing
+ states[r1] = ExecutionResourceState.WARNING
+ states[r3] = ExecutionResourceState.MISSING
+ self.assert_monitor_cycle(4, states)
+
+ t2 = increment_now(3) # :23
+ self._send_heartbeat(r1, "node1", t2)
+ states[r1] = ExecutionResourceState.OK
+ self.assert_monitor_cycle(1, states)
+
+ t3 = increment_now(2) # :25
+ self._send_heartbeat(r3, "node1", t3)
+ states[r2] = ExecutionResourceState.MISSING
+ states[r3] = ExecutionResourceState.OK
+ self.assert_monitor_cycle(8, states)
+
+ increment_now(5) # :30
+ # heartbeat r2 enough to go back to WARNING, but still late
+ self.core.ee_heartbeat(r2, make_beat("node1", timestamp=t0 + timedelta(seconds=15)))
+ self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=15))
+ states[r2] = ExecutionResourceState.WARNING
+ self.assert_monitor_cycle(3, states)
+
+ t4 = increment_now(5) # :35
+ # disable r2 and heartbeat r1 and r3 (heartbeats arrive late, but that's ok)
+ self._send_heartbeat(r1, "node1", t4)
+ self._send_heartbeat(r3, "node1", t4)
+ self.core.resource_change_state(self.store.get_resource(r2),
+ ExecutionResourceState.DISABLED)
+
+ states[r2] = ExecutionResourceState.DISABLED
+ self.assert_monitor_cycle(10, states)
+
class PDDoctorZooKeeperTests(PDDoctorTests, ZooKeeperTestMixin):
def setup_store(self):
@@ -256,3 +383,9 @@ def teardown_store(self):
self.store.shutdown()
self.teardown_zookeeper()
+
+ def assert_monitor_cycle(self, *args, **kwargs):
+ # occasionally ZK doesn't fire the watches quickly enough,
+ # so we add a little sleep before each cycle
+ time.sleep(0.05)
+ super(PDDoctorZooKeeperTests, self).assert_monitor_cycle(*args, **kwargs)
23 epu/states.py
@@ -256,5 +256,28 @@ class ProcessDispatcherState(object):
class ExecutionResourceState(object):
OK = "OK"
+ """Resource is active and healthy
+ """
+
+ WARNING = "WARNING"
+ """The resource is under suspicion due to missing or late heartbeats
+
+ Running processes are not rescheduled yet, but the resource is not
+ assigned any new processes while in this state. Note: This could later
+ be refined to allow processes, but only if there are no compatible slots
+ available on healthy resources.
+ """
+
+ MISSING = "MISSING"
+ """The resource has been declared dead by the PD Doctor due to a prolonged
+ lack of heartbeats.
+
+ Running processes on the resource have been rescheduled (if applicable)
+ and the resource is ineligible for running new processes. If the resource
+ resumes sending heartbeats, it will be returned to the OK state and made
+ available for processes.
+ """
DISABLED = "DISABLED"
+ """The resource has been disabled, likely in advance of being terminated
+ """
1  epu/util.py
@@ -126,4 +126,3 @@ def ensure_timedelta(t):
return timedelta(seconds=t)
raise TypeError("cannot convert %s to timedelta" % (t,))
-