Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion opal/include/opal/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,15 @@ enum {
OPAL_ERR_COMM_FAILURE = (OPAL_ERR_BASE - 51),
OPAL_ERR_SERVER_NOT_AVAIL = (OPAL_ERR_BASE - 52),
OPAL_ERR_IN_PROCESS = (OPAL_ERR_BASE - 53),
/* PMIx equivalents for notification support */
OPAL_ERR_DEBUGGER_RELEASE = (OPAL_ERR_BASE - 54),
OPAL_ERR_HANDLERS_COMPLETE = (OPAL_ERR_BASE - 55),
OPAL_ERR_PARTIAL_SUCCESS = (OPAL_ERR_BASE - 56)
OPAL_ERR_PARTIAL_SUCCESS = (OPAL_ERR_BASE - 56),
OPAL_ERR_PROC_ABORTED = (OPAL_ERR_BASE - 57),
OPAL_ERR_PROC_REQUESTED_ABORT = (OPAL_ERR_BASE - 58),
OPAL_ERR_PROC_ABORTING = (OPAL_ERR_BASE - 59),
OPAL_ERR_NODE_DOWN = (OPAL_ERR_BASE - 60),
OPAL_ERR_NODE_OFFLINE = (OPAL_ERR_BASE - 61)
};

/* Sits at offset 100 below OPAL_ERR_BASE; the error codes in the enum above
 * use smaller offsets from the same base.
 * NOTE(review): presumably this marks the end of the code range reserved for
 * OPAL - confirm before assigning codes near this value. */
#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)
Expand Down
2 changes: 2 additions & 0 deletions opal/mca/pmix/pmix2x/pmix/include/pmix/pmix_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ BEGIN_C_DECLS
/* Event-related attribute keys. Each trailing comment gives the value type
 * in parentheses, followed by the meaning of the attribute. */
#define PMIX_EVENT_ENVIRO_LEVEL "pmix.evenv" // (bool) register for environment events only
#define PMIX_EVENT_ORDER_PREPEND "pmix.evprepend" // (bool) prepend this handler to the precedence list
#define PMIX_EVENT_CUSTOM_RANGE "pmix.evrange" // (pmix_proc_t*) array of pmix_proc_t defining range of event notification
#define PMIX_EVENT_AFFECTED_PROC "pmix.evproc" // (pmix_proc_t) single proc that was affected
#define PMIX_EVENT_AFFECTED_PROCS "pmix.evaffected" // (pmix_proc_t*) array of pmix_proc_t defining affected procs
#define PMIX_EVENT_NON_DEFAULT "pmix.evnondef" // (bool) event is not to be delivered to default event handlers
/* fault tolerance-related events */
Expand Down Expand Up @@ -462,6 +463,7 @@ typedef struct pmix_value {
double dval;
struct timeval tv;
pmix_status_t status;
pmix_proc_t proc;
pmix_info_array_t array;
pmix_byte_object_t bo;
void *ptr;
Expand Down
5 changes: 5 additions & 0 deletions opal/mca/pmix/pmix2x/pmix/src/buffer_ops/pack.c
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,11 @@ static pmix_status_t pack_val(pmix_buffer_t *buffer,
return ret;
}
break;
case PMIX_PROC:
if (PMIX_SUCCESS != (ret = pmix_bfrop_pack_buffer(buffer, &p->data.proc, 1, PMIX_PROC))) {
return ret;
}
break;
default:
pmix_output(0, "PACK-PMIX-VALUE: UNSUPPORTED TYPE %d", (int)p->type);
return PMIX_ERROR;
Expand Down
7 changes: 6 additions & 1 deletion opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c
Original file line number Diff line number Diff line change
Expand Up @@ -634,8 +634,13 @@ pmix_status_t pmix_bfrop_unpack_status(pmix_buffer_t *buffer, void *dest,
return ret;
}
break;
case PMIX_PROC:
if (PMIX_SUCCESS != (ret = pmix_bfrop_unpack_buffer(buffer, &val->data.proc, &m, PMIX_PROC))) {
return ret;
}
break;
default:
pmix_output(0, "UNPACK-PMIX-VALUE: UNSUPPORTED TYPE");
pmix_output(0, "UNPACK-PMIX-VALUE: UNSUPPORTED TYPE %d", (int)val->type);
return PMIX_ERROR;
}

Expand Down
54 changes: 54 additions & 0 deletions opal/mca/pmix/pmix2x/pmix2x.c
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,24 @@ pmix_status_t pmix2x_convert_opalrc(int rc)
case OPAL_ERR_DEBUGGER_RELEASE:
return PMIX_ERR_DEBUGGER_RELEASE;

case OPAL_ERR_HANDLERS_COMPLETE:
return PMIX_EVENT_ACTION_COMPLETE;

case OPAL_ERR_PROC_ABORTED:
return PMIX_ERR_PROC_ABORTED;

case OPAL_ERR_PROC_REQUESTED_ABORT:
return PMIX_ERR_PROC_REQUESTED_ABORT;

case OPAL_ERR_PROC_ABORTING:
return PMIX_ERR_PROC_ABORTING;

case OPAL_ERR_NODE_DOWN:
return PMIX_ERR_NODE_DOWN;

case OPAL_ERR_NODE_OFFLINE:
return PMIX_ERR_NODE_OFFLINE;

case OPAL_ERR_NOT_IMPLEMENTED:
case OPAL_ERR_NOT_SUPPORTED:
return PMIX_ERR_NOT_SUPPORTED;
Expand Down Expand Up @@ -452,6 +470,9 @@ pmix_status_t pmix2x_convert_opalrc(int rc)
case OPAL_EXISTS:
return PMIX_EXISTS;

case OPAL_ERR_PARTIAL_SUCCESS:
return PMIX_QUERY_PARTIAL_SUCCESS;

case OPAL_ERROR:
return PMIX_ERROR;
case OPAL_SUCCESS:
Expand All @@ -467,6 +488,24 @@ int pmix2x_convert_rc(pmix_status_t rc)
case PMIX_ERR_DEBUGGER_RELEASE:
return OPAL_ERR_DEBUGGER_RELEASE;

case PMIX_EVENT_ACTION_COMPLETE:
return OPAL_ERR_HANDLERS_COMPLETE;

case PMIX_ERR_PROC_ABORTED:
return OPAL_ERR_PROC_ABORTED;

case PMIX_ERR_PROC_REQUESTED_ABORT:
return OPAL_ERR_PROC_REQUESTED_ABORT;

case PMIX_ERR_PROC_ABORTING:
return OPAL_ERR_PROC_ABORTING;

case PMIX_ERR_NODE_DOWN:
return OPAL_ERR_NODE_DOWN;

case PMIX_ERR_NODE_OFFLINE:
return OPAL_ERR_NODE_OFFLINE;

case PMIX_ERR_NOT_SUPPORTED:
return OPAL_ERR_NOT_SUPPORTED;

Expand Down Expand Up @@ -500,6 +539,9 @@ int pmix2x_convert_rc(pmix_status_t rc)
case PMIX_EXISTS:
return OPAL_EXISTS;

case PMIX_QUERY_PARTIAL_SUCCESS:
return OPAL_ERR_PARTIAL_SUCCESS;

case PMIX_ERROR:
return OPAL_ERROR;
case PMIX_SUCCESS:
Expand Down Expand Up @@ -671,6 +713,11 @@ void pmix2x_value_load(pmix_value_t *v,
}
}
break;
case OPAL_NAME:
v->type = PMIX_PROC;
(void)opal_snprintf_jobid(v->data.proc.nspace, PMIX_MAX_NSLEN, kv->data.name.jobid);
v->data.proc.rank = kv->data.name.vpid;
break;
default:
/* silence warnings */
break;
Expand Down Expand Up @@ -772,6 +819,13 @@ int pmix2x_value_unload(opal_value_t *kv,
kv->data.bo.size = 0;
}
break;
case PMIX_PROC:
kv->type = OPAL_NAME;
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&kv->data.name.jobid, v->data.proc.nspace))) {
return pmix2x_convert_opalrc(rc);
}
kv->data.name.vpid = v->data.proc.rank;
break;
default:
/* silence warnings */
rc = OPAL_ERROR;
Expand Down
1 change: 1 addition & 0 deletions opal/mca/pmix/pmix_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ BEGIN_C_DECLS
/* Event-related attribute keys. Each trailing comment gives the value type
 * in parentheses, followed by the meaning of the attribute. */
#define OPAL_PMIX_EVENT_ENVIRO_LEVEL "pmix.evenv" // (bool) register for environment events only
#define OPAL_PMIX_EVENT_ORDER_PREPEND "pmix.evprepend" // (bool) prepend this handler to the precedence list
#define OPAL_PMIX_EVENT_CUSTOM_RANGE "pmix.evrange" // (pmix_proc_t*) array of pmix_proc_t defining range of event notification
#define OPAL_PMIX_EVENT_AFFECTED_PROC "pmix.evproc" // (pmix_proc_t) single proc that was affected
#define OPAL_PMIX_EVENT_AFFECTED_PROCS "pmix.evaffected" // (pmix_proc_t*) array of pmix_proc_t defining affected procs
/* NOTE(review): this is the only key in this group whose string starts with
 * "opal." rather than "pmix." - confirm the different prefix is intentional. */
#define OPAL_PMIX_EVENT_NON_DEFAULT "opal.evnondef" // (bool) event is not to be delivered to default event handlers
/* fault tolerance-related events */
Expand Down
17 changes: 16 additions & 1 deletion opal/runtime/opal_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -257,11 +257,26 @@ opal_err2str(int errnum, const char **errmsg)
retval = "Release debugger";
break;
case OPAL_ERR_HANDLERS_COMPLETE:
retval = "Event handler processing complete";
retval = "Event handlers complete";
break;
case OPAL_ERR_PARTIAL_SUCCESS:
retval = "Partial success";
break;
case OPAL_ERR_PROC_ABORTED:
retval = "Process abnormally terminated";
break;
case OPAL_ERR_PROC_REQUESTED_ABORT:
retval = "Process requested abort";
break;
case OPAL_ERR_PROC_ABORTING:
retval = "Process is aborting";
break;
case OPAL_ERR_NODE_DOWN:
retval = "Node has gone down";
break;
case OPAL_ERR_NODE_OFFLINE:
retval = "Node has gone offline";
break;
default:
retval = "UNRECOGNIZED";
}
Expand Down
5 changes: 5 additions & 0 deletions orte/include/orte/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ enum {
ORTE_ERR_COMM_FAILURE = OPAL_ERR_COMM_FAILURE,
ORTE_ERR_DEBUGGER_RELEASE = OPAL_ERR_DEBUGGER_RELEASE,
ORTE_ERR_PARTIAL_SUCCESS = OPAL_ERR_PARTIAL_SUCCESS,
ORTE_ERR_PROC_ABORTED = OPAL_ERR_PROC_ABORTED,
ORTE_ERR_PROC_REQUESTED_ABORT = OPAL_ERR_PROC_REQUESTED_ABORT,
ORTE_ERR_PROC_ABORTING = OPAL_ERR_PROC_ABORTING,
ORTE_ERR_NODE_DOWN = OPAL_ERR_NODE_DOWN,
ORTE_ERR_NODE_OFFLINE = OPAL_ERR_NODE_OFFLINE,

/* error codes specific to ORTE - don't forget to update
orte/util/error_strings.c when adding new error codes!!
Expand Down
18 changes: 13 additions & 5 deletions orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
Original file line number Diff line number Diff line change
Expand Up @@ -339,8 +339,8 @@ static void proc_errors(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
/* remove from dependent routes, if it is one */
orte_routed.route_lost(proc);
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == orte_routed.num_routes()) {
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == orte_routed.num_routes()) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
Expand All @@ -357,7 +357,7 @@ static void proc_errors(int fd, short args, void *cbdata)
"%s errmgr_hnp: all routes and children gone - ordering exit",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
} else {
} else {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: %d routes remain alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
Expand Down Expand Up @@ -398,7 +398,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}

/* if we were ordered to terminate, mark this proc as dead and see if
* any of our routes or local children remain alive - if not, then
* any of our routes or local children remain alive - if not, then
* terminate ourselves. */
if (orte_orteds_term_ordered) {
for (i=0; i < orte_local_children->size; i++) {
Expand All @@ -419,6 +419,14 @@ static void proc_errors(int fd, short args, void *cbdata)
}

keep_going:
/* if this is a continuously operating job, then there is nothing more
* to do - we let the job continue to run */
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) {
/* always mark the waitpid as having fired */
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
goto cleanup;
}

/* ensure we record the failed proc properly so we can report
* the error once we terminate
*/
Expand Down Expand Up @@ -490,7 +498,7 @@ static void proc_errors(int fd, short args, void *cbdata)
/* this job has terminated */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
}
}
}
break;

case ORTE_PROC_STATE_TERM_WO_SYNC:
Expand Down
6 changes: 5 additions & 1 deletion orte/mca/schizo/ompi/schizo_ompi.c
Original file line number Diff line number Diff line change
Expand Up @@ -411,13 +411,17 @@ static opal_cmd_line_init_t cmd_line_init[] = {
"Report events to a tool listening at the specified URI" },

{ "orte_enable_recovery", '\0', "enable-recovery", "enable-recovery", 0,
&orte_cmd_options.enable_recovery, OPAL_CMD_LINE_TYPE_BOOL,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Enable recovery from process failure [Default = disabled]" },

{ "orte_max_restarts", '\0', "max-restarts", "max-restarts", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Max number of times to restart a failed process" },

{ NULL, '\0', "continuous", "continuous", 0,
&orte_cmd_options.continuous, OPAL_CMD_LINE_TYPE_BOOL,
"Job is to run until explicitly terminated" },

{ "orte_hetero_nodes", '\0', NULL, "hetero-nodes", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" },
Expand Down
Loading