Skip to content

Commit

Permalink
[core][agent] fix the race condition where the worker process termina…
Browse files Browse the repository at this point in the history
…ted during the get_all_workers call ray-project#37953

The issue seems to be caused by a race condition in which the worker process exits between the list call and the stats call.
This PR fixes it.

Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
  • Loading branch information
scv119 authored and arvind-chandra committed Aug 31, 2023
1 parent 8f4ea0b commit d403fc3
Showing 1 changed file with 23 additions and 14 deletions.
37 changes: 23 additions & 14 deletions dashboard/modules/reporter/reporter_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,21 +509,30 @@ def _get_workers(self):
# Remove the current process (reporter agent), which is also a child of
# the Raylet.
self._workers.pop(self._generate_worker_key(self._get_agent_proc()))
return [
w.as_dict(
attrs=[
"pid",
"create_time",
"cpu_percent",
"cpu_times",
"cmdline",
"memory_info",
"memory_full_info",
]

result = []
for w in self._workers.values():
try:
if w.status() == psutil.STATUS_ZOMBIE:
continue
except psutil.NoSuchProcess:
# The process may have terminated in the meantime (race condition).
continue

result.append(
w.as_dict(
attrs=[
"pid",
"create_time",
"cpu_percent",
"cpu_times",
"cmdline",
"memory_info",
"memory_full_info",
]
)
)
for w in self._workers.values()
if w.status() != psutil.STATUS_ZOMBIE
]
return result

def _get_raylet_proc(self):
try:
Expand Down

0 comments on commit d403fc3

Please sign in to comment.