From 060a7a92a391785808b1f3203bfb6f028046963d Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 21 Jun 2017 06:33:37 -0700 Subject: [PATCH 1/4] Ensure we properly cleanup on termination, including when terminating due to ctrl-c Signed-off-by: Ralph Castain (cherry picked from commit 38636f4f0acd59e65ffc76966f90f6f6c2278288) --- orte/mca/ess/base/ess_base_std_app.c | 2 ++ orte/orted/orted_main.c | 4 +++ orte/runtime/orte_finalize.c | 5 +--- orte/tools/orterun/orterun.c | 4 +++ orte/util/session_dir.c | 42 ++++++++++++---------------- 5 files changed, 29 insertions(+), 28 deletions(-) diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 79e3a1fe486..e66cf798b7f 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -342,6 +342,8 @@ int orte_ess_base_app_finalize(void) (void) mca_base_framework_close(&orte_state_base_framework); orte_session_dir_finalize(ORTE_PROC_MY_NAME); + /* cleanup the process info */ + orte_proc_info_finalize(); return ORTE_SUCCESS; } diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index bab19c67390..f4f321fb37e 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -933,6 +933,10 @@ int orte_daemon(int argc, char *argv[]) orte_finalize(); opal_finalize_util(); + orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); + /* cleanup the process info */ + orte_proc_info_finalize(); + if (orte_debug_flag) { fprintf(stderr, "exiting with status %d\n", orte_exit_status); } diff --git a/orte/runtime/orte_finalize.c b/orte/runtime/orte_finalize.c index 40749137719..a72301efc13 100644 --- a/orte/runtime/orte_finalize.c +++ b/orte/runtime/orte_finalize.c @@ -12,7 +12,7 @@ * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -84,9 +84,6 @@ int orte_finalize(void) orte_schizo.finalize(); (void) mca_base_framework_close(&orte_schizo_base_framework); - /* cleanup the process info */ - orte_proc_info_finalize(); - /* Close the general debug stream */ opal_output_close(orte_debug_output); diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 92220f07118..85aba0a0f33 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -86,6 +86,7 @@ #include "orte/mca/rml/rml.h" #include "orte/mca/state/state.h" #include "orte/util/proc_info.h" +#include "orte/util/session_dir.h" #include "orte/util/show_help.h" #include "orte/util/threads.h" @@ -222,6 +223,9 @@ int orterun(int argc, char *argv[]) /* cleanup and leave */ orte_submit_finalize(); orte_finalize(); + orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); + /* cleanup the process info */ + orte_proc_info_finalize(); if (orte_debug_flag) { fprintf(stderr, "exiting with status %d\n", orte_exit_status); diff --git a/orte/util/session_dir.c b/orte/util/session_dir.c index bdd73f48be6..90f464fefbb 100644 --- a/orte/util/session_dir.c +++ b/orte/util/session_dir.c @@ -370,14 +370,12 @@ int orte_session_dir(bool create, orte_process_name_t *proc) int orte_session_dir_cleanup(orte_jobid_t jobid) { - int rc = ORTE_SUCCESS; - if (!orte_create_session_dirs || orte_process_info.rm_session_dirs ) { /* we haven't created them or RM will clean them up for us*/ return ORTE_SUCCESS; } - if (NULL == orte_process_info.job_session_dir || + if (NULL == orte_process_info.jobfam_session_dir || NULL == orte_process_info.proc_session_dir) { /* this should never happen - it means we are calling * cleanup *before* properly setting up the session @@ -385,30 +383,20 @@ orte_session_dir_cleanup(orte_jobid_t jobid) * accidentally removing directories we shouldn't * touch */ - rc = ORTE_ERR_NOT_INITIALIZED; - goto CLEANUP; + return ORTE_ERR_NOT_INITIALIZED; } /* recursively blow the whole session away for our job family, * saving only output files */ - opal_os_dirpath_destroy(orte_process_info.job_session_dir, + opal_os_dirpath_destroy(orte_process_info.jobfam_session_dir, true, orte_dir_check_file); - /* now attempt to eliminate the top level directory itself - this - * will fail if anything is present, but ensures we cleanup if - * we are the last one out - */ - if( NULL != orte_process_info.top_session_dir ){ - opal_os_dirpath_destroy(orte_process_info.top_session_dir, - false, orte_dir_check_file); - } - - if (opal_os_dirpath_is_empty(orte_process_info.job_session_dir)) { + if (opal_os_dirpath_is_empty(orte_process_info.jobfam_session_dir)) { if (orte_debug_flag) { - opal_output(0, "sess_dir_cleanup: found job session dir empty - deleting"); + opal_output(0, "sess_dir_cleanup: found jobfam session dir empty - deleting"); } - rmdir(orte_process_info.job_session_dir); + rmdir(orte_process_info.jobfam_session_dir); } else { if (orte_debug_flag) { if (OPAL_ERR_NOT_FOUND == @@ -418,12 +406,10 @@ orte_session_dir_cleanup(orte_jobid_t jobid) opal_output(0, "sess_dir_cleanup: job session dir not empty - leaving"); } } - goto CLEANUP; } - if ( NULL != orte_process_info.top_session_dir ){ - - if( opal_os_dirpath_is_empty(orte_process_info.top_session_dir) ) { + if (NULL != orte_process_info.top_session_dir) { + if (opal_os_dirpath_is_empty(orte_process_info.top_session_dir)) { if (orte_debug_flag) { opal_output(0, "sess_dir_cleanup: found top session dir empty - deleting"); } @@ -440,9 +426,17 @@ orte_session_dir_cleanup(orte_jobid_t jobid) } } -CLEANUP: + /* now attempt to eliminate the top level directory itself - this + * will fail if anything is present, but ensures we cleanup if + * we are the last one out + */ + if( NULL != orte_process_info.top_session_dir ){ + opal_os_dirpath_destroy(orte_process_info.top_session_dir, + false, orte_dir_check_file); + } + - return rc; + return ORTE_SUCCESS; } From bf6ac3b51b18dac62550e209bd2618e2733c948c Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 23 Jun 2017 06:22:31 -0700 Subject: [PATCH 2/4] Remove stale field Signed-off-by: Ralph Castain (cherry picked from commit 3af9344764e02d211c6243c6df392eb0b45120e5) --- orte/util/proc_info.c | 9 +-------- orte/util/proc_info.h | 3 +-- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/orte/util/proc_info.c b/orte/util/proc_info.c index 277afa2bc49..8999eea466c 100644 --- a/orte/util/proc_info.c +++ b/orte/util/proc_info.c @@ -12,7 +12,7 @@ * Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. * $COPYRIGHT$ * @@ -69,7 +69,6 @@ ORTE_DECLSPEC orte_proc_info_t orte_process_info = { .aliases = NULL, .pid = 0, .proc_type = ORTE_PROC_TYPE_NONE, - .sync_buf = NULL, .my_port = 0, .num_restarts = 0, .my_node_rank = ORTE_NODE_RANK_INVALID, @@ -265,9 +264,6 @@ int orte_proc_info(void) &orte_ess_node_rank); orte_process_info.my_node_rank = (orte_node_rank_t) orte_ess_node_rank; - /* setup the sync buffer */ - orte_process_info.sync_buf = OBJ_NEW(opal_buffer_t); - return ORTE_SUCCESS; } @@ -330,9 +326,6 @@ int orte_proc_info_finalize(void) orte_process_info.proc_type = ORTE_PROC_TYPE_NONE; - OBJ_RELEASE(orte_process_info.sync_buf); - orte_process_info.sync_buf = NULL; - OBJ_DESTRUCT(&orte_process_info.super); opal_argv_free(orte_process_info.aliases); diff --git a/orte/util/proc_info.h b/orte/util/proc_info.h index 810f31cf84d..75d11c2d92c 100644 --- a/orte/util/proc_info.h +++ b/orte/util/proc_info.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Cisco Systems, Inc. All rights reserved * $COPYRIGHT$ * @@ -99,7 +99,6 @@ struct orte_proc_info_t { char **aliases; /**< aliases for this node */ pid_t pid; /**< Local process ID for this process */ orte_proc_type_t proc_type; /**< Type of process */ - opal_buffer_t *sync_buf; /**< buffer to store sync response */ uint16_t my_port; /**< TCP port for out-of-band comm */ int num_restarts; /**< number of times this proc has restarted */ orte_node_rank_t my_node_rank; /**< node rank */ From a35cd6e323dbde1b0db78e1bf617e934d358360d Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 23 Jun 2017 07:49:14 -0700 Subject: [PATCH 3/4] Also need to avoid calling destruct on the opal_process_info struct after finalize Signed-off-by: Ralph Castain (cherry picked from commit 168e50bc13a0daef3f3f8f0170eb2f54976e100b) --- orte/runtime/orte_finalize.c | 4 ++++ orte/util/proc_info.c | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/orte/runtime/orte_finalize.c b/orte/runtime/orte_finalize.c index a72301efc13..e5f343d21b4 100644 --- a/orte/runtime/orte_finalize.c +++ b/orte/runtime/orte_finalize.c @@ -39,6 +39,7 @@ #include "orte/runtime/orte_locks.h" #include "orte/util/listener.h" #include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" #include "orte/util/show_help.h" int orte_finalize(void) @@ -91,6 +92,9 @@ int orte_finalize(void) opal_argv_free(orte_fork_agent); } + /* destruct our process info */ + OBJ_DESTRUCT(&orte_process_info.super); + /* finalize the opal utilities */ rc = opal_finalize(); diff --git a/orte/util/proc_info.c b/orte/util/proc_info.c index 8999eea466c..4e0db3db890 100644 --- a/orte/util/proc_info.c +++ b/orte/util/proc_info.c @@ -326,8 +326,6 @@ int orte_proc_info_finalize(void) orte_process_info.proc_type = ORTE_PROC_TYPE_NONE; - OBJ_DESTRUCT(&orte_process_info.super); - opal_argv_free(orte_process_info.aliases); init = false; From ce7a183341d5f48bbfbc6ba3aa3cc0de764c2a26 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 23 Jun 2017 11:12:26 -0700 Subject: [PATCH 4/4] Fix uninitialized variables Signed-off-by: Ralph Castain (cherry picked from commit 8263efff65ef7922220aa0eaa4a260e4977bdae2) --- opal/mca/pmix/pmix2x/pmix2x.c | 12 ++++++++---- opal/mca/pmix/pmix2x/pmix2x_client.c | 6 +++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/opal/mca/pmix/pmix2x/pmix2x.c b/opal/mca/pmix/pmix2x/pmix2x.c index d30cd1547a9..2362219488c 100644 --- a/opal/mca/pmix/pmix2x/pmix2x.c +++ b/opal/mca/pmix/pmix2x/pmix2x.c @@ -1256,8 +1256,10 @@ static void pmix2x_query(opal_list_t *queries, OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock); if (0 >= opal_pmix_base.initialized) { OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); - rc = OPAL_ERR_NOT_INITIALIZED; - goto CLEANUP; + if (NULL != cbfunc) { + cbfunc(OPAL_ERR_NOT_INITIALIZED, NULL, cbdata, NULL, NULL); + } + return; } OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); @@ -1323,8 +1325,10 @@ static void pmix2x_log(opal_list_t *info, OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock); if (0 >= opal_pmix_base.initialized) { OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); - rc = OPAL_ERR_NOT_INITIALIZED; - goto CLEANUP; + if (NULL != cbfunc) { + cbfunc(OPAL_ERR_NOT_INITIALIZED, cbdata); + } + return; } OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); diff --git a/opal/mca/pmix/pmix2x/pmix2x_client.c b/opal/mca/pmix/pmix2x/pmix2x_client.c index e4c73854101..d187f033fd7 100644 --- a/opal/mca/pmix/pmix2x/pmix2x_client.c +++ b/opal/mca/pmix/pmix2x/pmix2x_client.c @@ -312,7 +312,7 @@ int pmix2x_fence(opal_list_t *procs, int collect_data) pmix_status_t rc; opal_namelist_t *ptr; char *nsptr; - size_t cnt, n; + size_t cnt = 0, n; pmix_proc_t *parray = NULL; pmix_info_t info, *iptr; @@ -725,7 +725,7 @@ int pmix2x_lookup(opal_list_t *data, opal_list_t *info) pmix_pdata_t *pdata; pmix_info_t *pinfo = NULL; pmix_status_t rc; - size_t cnt, n, sz; + size_t cnt, n, sz = 0; opal_value_t *iptr; opal_pmix2x_jobid_trkr_t *jptr, *job; @@ -993,7 +993,7 @@ int pmix2x_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid) pmix_status_t rc; pmix_info_t *info = NULL; pmix_app_t *papps; - size_t ninfo, napps, n, m; + size_t ninfo = 0, napps, n, m; opal_value_t *ival; opal_pmix_app_t *app; char nspace[PMIX_MAX_NSLEN+1];