From 54fc737a207cffa8d5013e58ed0a2778cd4ec170 Mon Sep 17 00:00:00 2001
From: Nathan Marz
Date: Mon, 1 Oct 2012 23:07:37 +0200
Subject: [PATCH] fix race condition in supervisor where resources for a worker
 were being cleaned up before setting the new assignment, potentially leading
 to the supervisor continuously dying

---
 src/clj/backtype/storm/daemon/supervisor.clj | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/clj/backtype/storm/daemon/supervisor.clj b/src/clj/backtype/storm/daemon/supervisor.clj
index 817708c96..2d84534f1 100644
--- a/src/clj/backtype/storm/daemon/supervisor.clj
+++ b/src/clj/backtype/storm/daemon/supervisor.clj
@@ -296,13 +296,7 @@
                        " from "
                        master-code-dir)
           ))
-      ;; remove any downloaded code that's no longer assigned or active
-      (doseq [storm-id downloaded-storm-ids]
-        (when-not (assigned-storm-ids storm-id)
-          (log-message "Removing code for storm id "
-                       storm-id)
-          (rmr (supervisor-stormdist-root conf storm-id))
-          ))
+
       (log-debug "Writing new assignment "
                  (pr-str new-assignment))
       (doseq [p (set/difference (set (keys existing-assignment))
@@ -312,6 +306,16 @@
       (.put local-state
             LS-LOCAL-ASSIGNMENTS
             new-assignment)
+      ;; remove any downloaded code that's no longer assigned or active
+      ;; important that this happens after setting the local assignment so that
+      ;; synchronize-supervisor doesn't try to launch workers for which the
+      ;; resources don't exist
+      (doseq [storm-id downloaded-storm-ids]
+        (when-not (assigned-storm-ids storm-id)
+          (log-message "Removing code for storm id "
+                       storm-id)
+          (rmr (supervisor-stormdist-root conf storm-id))
+          ))
       (.add processes-event-manager sync-processes)
       )))
 