Skip to content

Commit

Permalink
Revamp the session directory system
Browse files Browse the repository at this point in the history
We now have multiple tools (e.g., psched, prte, and even
multiple prte instances) running on the same node. Keeping
all those session directory trees under a single root is
problematic and leads to inadvertent deletion of contact
files. So simplify things and put each instance under its
own session directory tree root.

Add the pid and uid to the session directory root name. Prefix
the root name with the argv[0] of the tool so we know what
generated it.

Fix an error in PRRTE that assumed the job-level session was
a global name. It is not - it is different for each job, so
we need to track it by job. Have the prte_job_t destructor
call the session_dir_destroy function to remove it when
the job is complete.

Fix refcounts so the job object destructor gets called upon
job completion.

Signed-off-by: Ralph Castain <rhc@pmix.org>
(cherry picked from commit 14dd818)
  • Loading branch information
rhc54 committed Feb 25, 2024
1 parent aa2df0e commit 9d54eda
Show file tree
Hide file tree
Showing 39 changed files with 377 additions and 809 deletions.
47 changes: 1 addition & 46 deletions src/mca/errmgr/base/errmgr_base_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2020 IBM Corporation. All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -92,48 +92,3 @@ void prte_errmgr_base_log(int error_code, char *filename, int line)
pmix_output(0, "%s PRTE_ERROR_LOG: %s in file %s at line %d",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), errstring, filename, line);
}

/**
 * Last-ditch abort of the calling process after an internal error.
 *
 * Emits the optional printf-style message, then - if this process is the
 * master (HNP) or a daemon - kills its local procs and cleans up session
 * directories, and finally hands control to prte_ess.abort.
 *
 * @param error_code  status code describing why we are aborting; it is
 *                    passed through to prte_ess.abort and also selects the
 *                    flag given to it (see below)
 * @param fmt         printf-style format for a final message, or NULL if
 *                    there is nothing to print
 * @param ...         arguments consumed by fmt
 *
 * This function is not expected to return: the exit happens inside
 * prte_ess.abort.
 */
void prte_errmgr_base_abort(int error_code, char *fmt, ...)
{
    /* If there was a message, output it */
    if (NULL != fmt) {
        va_list arglist;
        char *buffer = NULL;

        va_start(arglist, fmt);
        pmix_vasprintf(&buffer, fmt, arglist);
        va_end(arglist);

        /* formatting can fail (e.g., out of memory) and leave us without
         * a string - never hand a NULL pointer to a "%s" conversion */
        if (NULL != buffer) {
            pmix_output(0, "%s", buffer);
            free(buffer);
        }
    }

    /* if I am a daemon or the HNP... */
    if (PRTE_PROC_IS_MASTER || PRTE_PROC_IS_DAEMON) {
        /* whack my local procs */
        if (NULL != prte_odls.kill_local_procs) {
            prte_odls.kill_local_procs(NULL);
        }
        /* whack any session directories */
        prte_session_dir_cleanup(PRTE_JOBID_WILDCARD);
    }

    /* if a critical connection failed, or a sensor limit was exceeded,
     * exit without dropping a core */
    if (PRTE_ERR_CONNECTION_FAILED == error_code || PRTE_ERR_SENSOR_LIMIT_EXCEEDED == error_code) {
        prte_ess.abort(error_code, false);
    } else {
        prte_ess.abort(error_code, true);
    }

    /*
     * We must exit in prte_ess.abort; all implementations of prte_ess.abort
     * contain __prte_attribute_noreturn__
     */
    /* No way to reach here */
}

/**
 * Base-module placeholder for aborting peer processes.
 *
 * None of the arguments are used; the call always returns
 * PRTE_ERR_NOT_IMPLEMENTED.
 */
int prte_errmgr_base_abort_peers(pmix_proc_t *procs, int32_t num_procs, int error_code)
{
    const int status = PRTE_ERR_NOT_IMPLEMENTED;

    /* silence unused-parameter warnings */
    PRTE_HIDE_UNUSED_PARAMS(procs, num_procs, error_code);

    return status;
}
30 changes: 9 additions & 21 deletions src/mca/errmgr/base/errmgr_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* Copyright (c) 2014-2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -48,27 +48,21 @@

#include "src/mca/errmgr/base/static-components.h"

/*
* Globals
*/
prte_errmgr_base_t prte_errmgr_base = {
.error_cbacks = PMIX_LIST_STATIC_INIT
};

/* Public module provides a wrapper around previous functions */
prte_errmgr_base_module_t prte_errmgr_default_fns = {.init = NULL, /* init */
.finalize = NULL, /* finalize */
.logfn = prte_errmgr_base_log,
.abort = prte_errmgr_base_abort,
.abort_peers = prte_errmgr_base_abort_peers,
.enable_detector = NULL};
prte_errmgr_base_module_t prte_errmgr_default_fns = {
.init = NULL, /* init */
.finalize = NULL, /* finalize */
.logfn = prte_errmgr_base_log
};

/* NOTE: ABSOLUTELY MUST initialize this
* struct to include the log function as it
* gets called even if the errmgr hasn't been
* opened yet due to error
*/
prte_errmgr_base_module_t prte_errmgr = {.logfn = prte_errmgr_base_log};
prte_errmgr_base_module_t prte_errmgr = {
.logfn = prte_errmgr_base_log
};

static int prte_errmgr_base_close(void)
{
Expand All @@ -80,9 +74,6 @@ static int prte_errmgr_base_close(void)
/* always leave a default set of fn pointers */
prte_errmgr = prte_errmgr_default_fns;

/* destruct the callback list */
PMIX_LIST_DESTRUCT(&prte_errmgr_base.error_cbacks);

return pmix_mca_base_framework_components_close(&prte_errmgr_base_framework, NULL);
}

Expand All @@ -95,9 +86,6 @@ static int prte_errmgr_base_open(pmix_mca_base_open_flag_t flags)
/* load the default fns */
prte_errmgr = prte_errmgr_default_fns;

/* initialize the error callback list */
PMIX_CONSTRUCT(&prte_errmgr_base.error_cbacks, pmix_list_t);

/* Open up all available components */
return pmix_mca_base_framework_components_open(&prte_errmgr_base_framework, flags);
}
Expand Down
13 changes: 1 addition & 12 deletions src/mca/errmgr/base/errmgr_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* All rights reserved.
* Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -48,13 +48,6 @@
*/
BEGIN_C_DECLS

/* define a struct to hold framework-global values */
typedef struct {
pmix_list_t error_cbacks;
} prte_errmgr_base_t;

PRTE_EXPORT extern prte_errmgr_base_t prte_errmgr_base;

/* declare the base default module */
PRTE_EXPORT extern prte_errmgr_base_module_t prte_errmgr_default_fns;

Expand All @@ -63,9 +56,5 @@ PRTE_EXPORT extern prte_errmgr_base_module_t prte_errmgr_default_fns;
*/
PRTE_EXPORT void prte_errmgr_base_log(int error_code, char *filename, int line);

PRTE_EXPORT void prte_errmgr_base_abort(int error_code, char *fmt, ...)
__prte_attribute_format__(__printf__, 2, 3);
PRTE_EXPORT int prte_errmgr_base_abort_peers(pmix_proc_t *procs, int32_t num_procs, int error_code);

END_C_DECLS
#endif
6 changes: 2 additions & 4 deletions src/mca/errmgr/dvm/errmgr_dvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -71,9 +71,7 @@ static int finalize(void);
prte_errmgr_base_module_t prte_errmgr_dvm_module = {
.init = init,
.finalize = finalize,
.logfn = prte_errmgr_base_log,
.abort = prte_errmgr_base_abort,
.abort_peers = prte_errmgr_base_abort_peers
.logfn = prte_errmgr_base_log
};

/*
Expand Down
28 changes: 1 addition & 27 deletions src/mca/errmgr/errmgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* reserved.
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -99,27 +99,6 @@ typedef int (*prte_errmgr_base_module_finalize_fn_t)(void);
*/
typedef void (*prte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line);

/**
* Alert - self aborting
* This function is called when a process is aborting due to some internal error.
* It will finalize the process
* itself, and then exit - it takes no other actions. The intent here is to provide
* a last-ditch exit procedure that attempts to clean up a little.
*/
typedef void (*prte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
__prte_attribute_format_funcptr__(__printf__, 2, 3);

/**
* Alert - abort peers
* This function is called when a process wants to abort one or more peer processes.
* For example, MPI_Abort(comm) will use this function to terminate peers in the
* communicator group before aborting itself.
*/
typedef int (*prte_errmgr_base_module_abort_peers_fn_t)(pmix_proc_t *procs, int32_t num_procs,
int error_code);

typedef void (*prte_errmgr_base_module_enable_detector_fn_t)(bool flag);

/*
* Module Structure
*/
Expand All @@ -130,11 +109,6 @@ struct prte_errmgr_base_module_2_3_0_t {
prte_errmgr_base_module_finalize_fn_t finalize;

prte_errmgr_base_module_log_fn_t logfn;
prte_errmgr_base_module_abort_fn_t abort;
prte_errmgr_base_module_abort_peers_fn_t abort_peers;

/* start error detector and propagator */
prte_errmgr_base_module_enable_detector_fn_t enable_detector;
};
typedef struct prte_errmgr_base_module_2_3_0_t prte_errmgr_base_module_2_3_0_t;
typedef prte_errmgr_base_module_2_3_0_t prte_errmgr_base_module_t;
Expand Down
20 changes: 8 additions & 12 deletions src/mca/errmgr/prted/errmgr_prted.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* All rights reserved.
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -64,12 +64,11 @@ static void prted_abort(int error_code, char *fmt, ...);
/******************
* prted module
******************/
prte_errmgr_base_module_t prte_errmgr_prted_module = {.init = init,
.finalize = finalize,
.logfn = prte_errmgr_base_log,
.abort = prted_abort,
.abort_peers = prte_errmgr_base_abort_peers,
.enable_detector = NULL};
prte_errmgr_base_module_t prte_errmgr_prted_module = {
.init = init,
.finalize = finalize,
.logfn = prte_errmgr_base_log
};

/* Local functions */
static bool any_live_children(pmix_nspace_t job);
Expand Down Expand Up @@ -671,8 +670,8 @@ static void proc_errors(int fd, short args, void *cbdata)

/* remove all of this job's children from the global list */
for (i = 0; i < prte_local_children->size; i++) {
if (NULL
== (ptr = (prte_proc_t *) pmix_pointer_array_get_item(prte_local_children, i))) {
ptr = (prte_proc_t *) pmix_pointer_array_get_item(prte_local_children, i);
if (NULL == ptr) {
continue;
}
if (PMIX_CHECK_NSPACE(jdata->nspace, ptr->name.nspace)) {
Expand All @@ -681,9 +680,6 @@ static void proc_errors(int fd, short args, void *cbdata)
}
}

/* ensure the job's local session directory tree is removed */
prte_session_dir_cleanup(jdata->nspace);

/* remove this job from our local job data since it is complete */
PMIX_RELEASE(jdata);

Expand Down
9 changes: 5 additions & 4 deletions src/mca/ess/alps/ess_alps_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -46,9 +46,10 @@ static int alps_set_name(void);
static int rte_init(int argc, char **argv);
static int rte_finalize(void);

prte_ess_base_module_t prte_ess_alps_module = {.init = rte_init,
.finalize = rte_finalize,
.abort = NULL};
prte_ess_base_module_t prte_ess_alps_module = {
.init = rte_init,
.finalize = rte_finalize
};

/* Local variables */
static pmix_rank_t starting_vpid = 0;
Expand Down
3 changes: 1 addition & 2 deletions src/mca/ess/base/ess_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2017-2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -47,7 +47,6 @@
prte_ess_base_module_t prte_ess = {
.init = NULL,
.finalize = NULL,
.abort = NULL,
};
int prte_ess_base_num_procs = -1;
char *prte_ess_base_nspace = NULL;
Expand Down
Loading

0 comments on commit 9d54eda

Please sign in to comment.