From bb2185783d0d0996bec50cc72783b40bdc1992c4 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 29 Jun 2017 15:48:18 -0700 Subject: [PATCH] Stop all progress threads prior to releasing the peer objects to avoid a race condition whereby a lost connection could be reported after a peer object was freed and before the threads were stopped. Signed-off-by: Ralph Castain (cherry picked from commit 85f8eb4c6bcc2b4995901832f566baf3677d023f) --- orte/mca/oob/tcp/oob_tcp_component.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 0915e726e61..7f00e063580 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -698,24 +698,14 @@ static int component_startup(void) static void component_shutdown(void) { mca_oob_tcp_peer_t *peer; - uint64_t ui64; - int i = 0; + int i = 0, rc; + uint64_t key; + void *node; opal_output_verbose(2, orte_oob_base_framework.framework_output, "%s TCP SHUTDOWN", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - /* cleanup all peers */ - OPAL_HASH_TABLE_FOREACH(ui64, uint64, peer, &mca_oob_tcp_component.peers) { - opal_output_verbose(2, orte_oob_base_framework.framework_output, - "%s RELEASING PEER OBJ %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == peer) ? "NULL" : ORTE_NAME_PRINT(&peer->name)); - if (NULL != peer) { - OBJ_RELEASE(peer); - } - } - if (0 < orte_oob_base.num_threads) { for (i=0; i < orte_oob_base.num_threads; i++) { opal_progress_thread_finalize(mca_oob_tcp_component.ev_threads[i]); @@ -734,6 +724,18 @@ static void component_shutdown(void) "no hnp or not active"); } + /* release all peers from the hash table */ + rc = opal_hash_table_get_first_key_uint64(&mca_oob_tcp_component.peers, &key, + (void **)&peer, &node); + while (OPAL_SUCCESS == rc) { + if (NULL != peer) { + OBJ_RELEASE(peer); + opal_hash_table_set_value_uint64(&mca_oob_tcp_component.peers, key, NULL); + } + rc = opal_hash_table_get_next_key_uint64(&mca_oob_tcp_component.peers, &key, + (void **) &peer, node, &node); + } + opal_output_verbose(2, orte_oob_base_framework.framework_output, "%s TCP SHUTDOWN done", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));