From 923f4ddfef9576a01adb4cfa2dc925f84e1fb936 Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Tue, 29 Mar 2016 09:15:22 -0700
Subject: [PATCH] Fix the 2.0 branch segfaults on finalize - we need to be in the same thread when closing the RML recv list

---
 orte/mca/rml/base/rml_base_frame.c | 35 +++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/orte/mca/rml/base/rml_base_frame.c b/orte/mca/rml/base/rml_base_frame.c
index 2312459154..33dcbde6fa 100644
--- a/orte/mca/rml/base/rml_base_frame.c
+++ b/orte/mca/rml/base/rml_base_frame.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -24,6 +24,7 @@
 
 #include "orte/mca/rml/rml.h"
 #include "orte/mca/state/state.h"
+#include "orte/runtime/orte_wait.h"
 #include "orte/util/name_fns.h"
 
 #include "orte/mca/rml/base/base.h"
@@ -72,14 +73,36 @@ static int orte_rml_base_register(mca_base_register_flag_t flags)
     return ORTE_SUCCESS;
 }
 
-static int orte_rml_base_close(void)
+static void cleanup(int sd, short args, void *cbdata)
 {
-    opal_list_item_t *item;
+    volatile bool *active = (volatile bool*)cbdata;
 
-    while (NULL != (item = opal_list_remove_first(&orte_rml_base.posted_recvs))) {
-        OBJ_RELEASE(item);
+    OPAL_LIST_DESTRUCT(&orte_rml_base.posted_recvs);
+    if (NULL != active) {
+        *active = false;
     }
-    OBJ_DESTRUCT(&orte_rml_base.posted_recvs);
+}
+
+static int orte_rml_base_close(void)
+{
+    volatile bool active;
+
+    /* because the RML posted recvs list is in a separate
+     * async thread for apps, we can't just destruct it here.
+     * Instead, we push it into that event thread and destruct
+     * it there */
+    if (ORTE_PROC_IS_APP) {
+        opal_event_t ev;
+        active = true;
+        opal_event_set(orte_event_base, &ev, -1,
+                       OPAL_EV_WRITE, cleanup, (void*)&active);
+        opal_event_set_priority(&ev, ORTE_ERROR_PRI);
+        opal_event_active(&ev, OPAL_EV_WRITE, 1);
+        ORTE_WAIT_FOR_COMPLETION(active);
+    } else {
+        /* we can call the destruct directly */
+        cleanup(0, 0, NULL);
+    }
 
     OPAL_TIMING_REPORT(orte_rml_base.timing, &tm_rml);