Skip to content
This repository has been archived by the owner on Nov 1, 2023. It is now read-only.

Reimage dead nodes #154

Merged
merged 11 commits into from
Oct 20, 2020
19 changes: 19 additions & 0 deletions src/api-service/__app__/onefuzzlib/pools.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
from .extension import fuzz_extensions
from .orm import MappingIntStrAny, ORMMixin, QueryFilter

NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1)

# Future work:
#
# Enabling autoscaling for the scalesets based on the pool work queues.
Expand Down Expand Up @@ -278,6 +280,18 @@ def set_halt(self) -> None:
self.set_shutdown()
self.stop()

@classmethod
def get_dead_nodes(
cls, scaleset_id: UUID, expiration_period: datetime.timedelta
) -> List["Node"]:
time_filter = "heartbeat lt datetime'%s'" % (
(datetime.datetime.utcnow() - expiration_period).isoformat()
)
return cls.search(
query={"scaleset_id": [scaleset_id]},
raw_unchecked_filter=time_filter,
)


class NodeTasks(BASE_NODE_TASK, ORMMixin):
@classmethod
Expand Down Expand Up @@ -743,6 +757,11 @@ def cleanup_nodes(self) -> bool:
# only add nodes that are not already set to reschedule
to_reimage.append(node)

dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
for node in dead_nodes:
chkeita marked this conversation as resolved.
Show resolved Hide resolved
node.set_halt()
to_reimage.append(node)

# Perform operations until they fail due to scaleset getting locked
try:
if to_delete:
Expand Down