diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index f8de1baf87b..409aa0b3b94 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -139,7 +139,13 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, opal_argv_append_nosize(&members, nstring); free(nstring); /* have to add the number of procs in the job so the remote side - * can correctly add the procs by computing their names */ + * can correctly add the procs by computing their names, and our nspace + * so they can update their records */ + if (NULL == (nstring = (char*)opal_pmix.get_nspace(OMPI_PROC_MY_NAME->jobid))) { + opal_argv_free(members); + return OMPI_ERR_NOT_SUPPORTED; + } + opal_argv_append_nosize(&members, nstring); (void)asprintf(&nstring, "%d", size); opal_argv_append_nosize(&members, nstring); free(nstring); @@ -171,6 +177,14 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, } opal_argv_append_nosize(&members, nstring); free(nstring); + if (NULL == (nstring = (char*)opal_pmix.get_nspace(proc_list[i]->super.proc_name.jobid))) { + opal_argv_free(members); + if (!dense) { + free(proc_list); + } + return OMPI_ERR_NOT_SUPPORTED; + } + opal_argv_append_nosize(&members, nstring); } if (!dense) { free(proc_list); @@ -246,6 +260,17 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, OPAL_LIST_DESTRUCT(&mlist); goto exit; } + /* step over the nspace */ + ++i; + if (NULL == members[i]) { + /* this shouldn't happen and is an error */ + OMPI_ERROR_LOG(OMPI_ERR_BAD_PARAM); + OPAL_LIST_DESTRUCT(&mlist); + opal_argv_free(members); + free(rport); + rc = OMPI_ERR_BAD_PARAM; + goto exit; + } /* if the rank is wildcard, then we need to add all procs * in that job to the list */ if (OPAL_VPID_WILDCARD == nm->name.vpid) { @@ -295,6 +320,17 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, OPAL_LIST_DESTRUCT(&rlist); goto exit; } + /* next entry is the nspace - register it */ + ++i; + if (NULL == members[i]) { + OMPI_ERROR_LOG(OMPI_ERR_NOT_SUPPORTED); + opal_argv_free(members);
+ OPAL_LIST_DESTRUCT(&ilist); + OPAL_LIST_DESTRUCT(&rlist); + rc = OMPI_ERR_NOT_SUPPORTED; + goto exit; + } + opal_pmix.register_jobid(nm->name.jobid, members[i]); if (OPAL_VPID_WILDCARD == nm->name.vpid) { jobid = nm->name.jobid; OBJ_RELEASE(nm); diff --git a/opal/mca/pmix/cray/pmix_cray.c b/opal/mca/pmix/cray/pmix_cray.c index 9d36d6e6ee4..d4ce7acc09f 100644 --- a/opal/mca/pmix/cray/pmix_cray.c +++ b/opal/mca/pmix/cray/pmix_cray.c @@ -78,6 +78,9 @@ static int cray_unpublish_nb(char **keys, opal_list_t *info, static const char *cray_get_version(void); static int cray_store_local(const opal_process_name_t *proc, opal_value_t *val); +static const char *cray_get_nspace(opal_jobid_t jobid); +static void cray_register_jobid(opal_jobid_t jobid, const char *nspace); + #if 0 static bool cray_get_attr(const char *attr, opal_value_t **kv); #endif @@ -109,7 +112,9 @@ const opal_pmix_base_module_t opal_pmix_cray_module = { .get_version = cray_get_version, .register_errhandler = opal_pmix_base_register_handler, .deregister_errhandler = opal_pmix_base_deregister_handler, - .store_local = cray_store_local + .store_local = cray_store_local, + .get_nspace = cray_get_nspace, + .register_jobid = cray_register_jobid }; // usage accounting @@ -814,6 +819,16 @@ static int cray_store_local(const opal_process_name_t *proc, return OPAL_SUCCESS; } +static const char *cray_get_nspace(opal_jobid_t jobid) +{ + return NULL; +} + +static void cray_register_jobid(opal_jobid_t jobid, const char *nspace) +{ + return; +} + static char* pmix_error(int pmix_err) { char * err_msg; diff --git a/opal/mca/pmix/pmix.h b/opal/mca/pmix/pmix.h index c8e26b315d4..bcdcda6ea01 100644 --- a/opal/mca/pmix/pmix.h +++ b/opal/mca/pmix/pmix.h @@ -701,6 +701,12 @@ typedef void (*opal_pmix_base_module_deregister_fn_t)(void); typedef int (*opal_pmix_base_module_store_fn_t)(const opal_process_name_t *proc, opal_value_t *val); +/* retrieve the nspace corresponding to a given jobid */ +typedef const char* 
(*opal_pmix_base_module_get_nspace_fn_t)(opal_jobid_t jobid); + +/* register a jobid-to-nspace pair */ +typedef void (*opal_pmix_base_module_register_jobid_fn_t)(opal_jobid_t jobid, const char *nspace); + /* * the standard public API data structure */ @@ -745,6 +751,8 @@ typedef struct { opal_pmix_base_module_register_fn_t register_errhandler; opal_pmix_base_module_deregister_fn_t deregister_errhandler; opal_pmix_base_module_store_fn_t store_local; + opal_pmix_base_module_get_nspace_fn_t get_nspace; + opal_pmix_base_module_register_jobid_fn_t register_jobid; } opal_pmix_base_module_t; typedef struct { diff --git a/opal/mca/pmix/pmix1xx/pmix1.h b/opal/mca/pmix/pmix1xx/pmix1.h index f944dbd144d..835096b20b5 100644 --- a/opal/mca/pmix/pmix1xx/pmix1.h +++ b/opal/mca/pmix/pmix1xx/pmix1.h @@ -30,11 +30,24 @@ BEGIN_C_DECLS -OPAL_DECLSPEC extern opal_pmix_base_component_t mca_pmix_pmix1_component; +typedef struct { + opal_pmix_base_component_t super; + opal_list_t jobids; + bool native_launch; +} mca_pmix_pmix1_component_t; + +OPAL_DECLSPEC extern mca_pmix_pmix1_component_t mca_pmix_pmix1xx_component; OPAL_DECLSPEC extern const opal_pmix_base_module_t opal_pmix_pmix1xx_module; /**** INTERNAL OBJECTS ****/ +typedef struct { + opal_list_item_t super; + opal_jobid_t jobid; + char nspace[PMIX_MAX_NSLEN + 1]; +} opal_pmix1_jobid_trkr_t; +OBJ_CLASS_DECLARATION(opal_pmix1_jobid_trkr_t); + typedef struct { opal_object_t super; pmix_proc_t p; diff --git a/opal/mca/pmix/pmix1xx/pmix1_client.c b/opal/mca/pmix/pmix1xx/pmix1_client.c index cb9359d323d..a54f4e581b6 100644 --- a/opal/mca/pmix/pmix1xx/pmix1_client.c +++ b/opal/mca/pmix/pmix1xx/pmix1_client.c @@ -31,19 +31,8 @@ #include "opal/mca/pmix/pmix1xx/pmix/include/pmix.h" #include "opal/mca/pmix/pmix1xx/pmix/src/buffer_ops/buffer_ops.h" -typedef struct { - opal_list_item_t super; - opal_jobid_t jobid; - char nspace[PMIX_MAX_NSLEN + 1]; -} opal_pmix1_jobid_trkr_t; -static OBJ_CLASS_INSTANCE(opal_pmix1_jobid_trkr_t, - 
opal_list_item_t, - NULL, NULL); - static pmix_proc_t my_proc; static char *dbgvalue=NULL; -static opal_list_t jobids; -static bool native_launch = false; static void myerr(pmix_status_t status, pmix_proc_t procs[], size_t nprocs, @@ -87,12 +76,11 @@ int pmix1_client_init(void) opal_process_name_t pname; pmix_status_t rc; int dbg; + opal_pmix1_jobid_trkr_t *job; opal_output_verbose(1, opal_pmix_base_framework.framework_output, "PMIx_client init"); - OBJ_CONSTRUCT(&jobids, opal_list_t); - if (0 < (dbg = opal_output_get_verbosity(opal_pmix_base_framework.framework_output))) { asprintf(&dbgvalue, "PMIX_DEBUG=%d", dbg); putenv(dbgvalue); @@ -106,13 +94,20 @@ int pmix1_client_init(void) if (NULL != getenv(OPAL_MCA_PREFIX"orte_launch")) { /* if we were launched by the OMPI RTE, then * the jobid is in a special format - so get it */ - native_launch = true; + mca_pmix_pmix1xx_component.native_launch = true; opal_convert_string_to_jobid(&pname.jobid, my_proc.nspace); } else { /* we were launched by someone else, so make the * jobid just be the hash of the nspace */ OPAL_HASH_STR(my_proc.nspace, pname.jobid); } + /* insert this into our list of jobids - it will be the + * first, and so we'll check it first */ + job = OBJ_NEW(opal_pmix1_jobid_trkr_t); + (void)strncpy(job->nspace, my_proc.nspace, PMIX_MAX_NSLEN); + job->jobid = pname.jobid; + opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super); + pname.vpid = my_proc.rank; opal_proc_set_name(&pname); @@ -134,8 +129,6 @@ int pmix1_client_finalize(void) rc = PMIx_Finalize(); - OPAL_LIST_DESTRUCT(&jobids); - return pmix1_convert_rc(rc); } @@ -157,7 +150,7 @@ int pmix1_abort(int flag, const char *msg, pmix_proc_t *parray=NULL; size_t n, cnt=0; opal_namelist_t *ptr; - opal_pmix1_jobid_trkr_t *job; + opal_pmix1_jobid_trkr_t *job, *jptr; opal_output_verbose(1, opal_pmix_base_framework.framework_output, "PMIx_client abort"); @@ -168,20 +161,19 @@ int pmix1_abort(int flag, const char *msg, PMIX_PROC_CREATE(parray, cnt); 
n=0; OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == ptr->name.jobid) { + job = jptr; + break; } } + if (NULL == job) { + return OPAL_ERR_NOT_FOUND; + } + (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); parray[n].rank = ptr->name.vpid; ++n; } @@ -201,23 +193,22 @@ int pmix1_store_local(const opal_process_name_t *proc, opal_value_t *val) pmix_value_t kv; pmix_status_t rc; pmix_proc_t p; - opal_pmix1_jobid_trkr_t *job; + opal_pmix1_jobid_trkr_t *job, *jptr; if (NULL != proc) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == proc->jobid) { - (void)strncpy(p.nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == proc->jobid) { - (void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == proc->jobid) { + job = jptr; + break; } } + if (NULL == job) { + return OPAL_ERR_NOT_FOUND; + } + (void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN); p.rank = proc->vpid; } else { /* use our name */ @@ -259,7 +250,7 @@ int 
pmix1_fence(opal_list_t *procs, int collect_data) size_t n, cnt=0; opal_namelist_t *ptr; pmix_info_t info, *iptr; - opal_pmix1_jobid_trkr_t *job; + opal_pmix1_jobid_trkr_t *job, *jptr; opal_output_verbose(1, opal_pmix_base_framework.framework_output, "PMIx_client fence"); @@ -270,20 +261,19 @@ int pmix1_fence(opal_list_t *procs, int collect_data) PMIX_PROC_CREATE(parray, cnt); n=0; OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == ptr->name.jobid) { + job = jptr; + break; } } + if (NULL == job) { + return OPAL_ERR_NOT_FOUND; + } + (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); parray[n].rank = ptr->name.vpid; ++n; } @@ -322,7 +312,7 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data, opal_namelist_t *ptr; pmix1_opcaddy_t *op; pmix_info_t info, *iptr; - opal_pmix1_jobid_trkr_t *job; + opal_pmix1_jobid_trkr_t *job, *jptr; opal_output_verbose(1, opal_pmix_base_framework.framework_output, "PMIx_client fence_nb"); @@ -333,20 +323,19 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data, PMIX_PROC_CREATE(parray, cnt); n=0; OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and 
find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == ptr->name.jobid) { + job = jptr; + break; } } + if (NULL == job) { + return OPAL_ERR_NOT_FOUND; + } + (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); parray[n].rank = ptr->name.vpid; ++n; } @@ -406,7 +395,7 @@ int pmix1_get(const opal_process_name_t *proc, const char *key, size_t ninfo, n; pmix_info_t *pinfo; opal_value_t *ival; - opal_pmix1_jobid_trkr_t *job; + opal_pmix1_jobid_trkr_t *job, *jptr; opal_output_verbose(1, opal_pmix_base_framework.framework_output, "%s PMIx_client get on proc %s key %s", @@ -416,20 +405,19 @@ int pmix1_get(const opal_process_name_t *proc, const char *key, /* prep default response */ *val = NULL; if (NULL != proc) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == proc->jobid) { - (void)strncpy(p.nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == proc->jobid) { - (void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == proc->jobid) { + job = jptr; + break; } } + if (NULL == job) { + return OPAL_ERR_NOT_FOUND; + } + (void)strncpy(p.nspace, job->nspace, PMIX_MAX_NSLEN); p.rank = proc->vpid; pptr = &p; } else { @@ -509,7 +497,7 @@ int pmix1_getnb(const opal_process_name_t *proc, const char *key, 
pmix_status_t rc; size_t n; opal_value_t *ival; - opal_pmix1_jobid_trkr_t *job; + opal_pmix1_jobid_trkr_t *job, *jptr; opal_output_verbose(1, opal_pmix_base_framework.framework_output, "%s PMIx_client get_nb on proc %s key %s", @@ -522,20 +510,19 @@ int pmix1_getnb(const opal_process_name_t *proc, const char *key, op->cbdata = cbdata; if (NULL != proc) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == proc->jobid) { - (void)strncpy(op->p.nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == proc->jobid) { - (void)strncpy(op->p.nspace, job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == proc->jobid) { + job = jptr; + break; } } + if (NULL == job) { + return OPAL_ERR_NOT_FOUND; + } + (void)strncpy(op->p.nspace, job->nspace, PMIX_MAX_NSLEN); op->p.rank = proc->vpid; } else { (void)strncpy(op->p.nspace, my_proc.nspace, PMIX_MAX_NSLEN); @@ -640,6 +627,7 @@ int pmix1_lookup(opal_list_t *data, opal_list_t *info) pmix_status_t ret; opal_pmix_pdata_t *d; opal_value_t *iptr; + opal_pmix1_jobid_trkr_t *job, *jptr; opal_output_verbose(1, opal_pmix_base_framework.framework_output, "PMIx_client lookup"); @@ -676,7 +664,7 @@ int pmix1_lookup(opal_list_t *data, opal_list_t *info) /* transfer the data back */ n=0; OPAL_LIST_FOREACH(d, data, opal_pmix_pdata_t) { - if (native_launch) { + if (mca_pmix_pmix1xx_component.native_launch) { /* if we were launched by the OMPI RTE, then * the jobid is in a special format - so get it */ opal_convert_string_to_jobid(&d->proc.jobid, pdata[n].proc.nspace); @@ -685,6 +673,20 @@ int pmix1_lookup(opal_list_t *data, opal_list_t *info) * jobid just be the 
hash of the nspace */ OPAL_HASH_STR(pdata[n].proc.nspace, d->proc.jobid); } + /* if we don't already have it, add this to our jobid tracker */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == d->proc.jobid) { + job = jptr; + break; + } + } + if (NULL == job) { + job = OBJ_NEW(opal_pmix1_jobid_trkr_t); + (void)strncpy(job->nspace, pdata[n].proc.nspace, PMIX_MAX_NSLEN); + job->jobid = d->proc.jobid; + opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super); + } if (PMIX_RANK_WILDCARD == pdata[n].proc.rank) { d->proc.vpid = OPAL_VPID_WILDCARD; } else { @@ -712,6 +714,7 @@ static void lk_cbfunc(pmix_status_t status, opal_list_t results, *r; int rc; size_t n; + opal_pmix1_jobid_trkr_t *job, *jptr; if (NULL == op->lkcbfunc) { OBJ_RELEASE(op); @@ -724,7 +727,7 @@ static void lk_cbfunc(pmix_status_t status, for (n=0; n < ndata; n++) { d = OBJ_NEW(opal_pmix_pdata_t); opal_list_append(&results, &d->super); - if (native_launch) { + if (mca_pmix_pmix1xx_component.native_launch) { /* if we were launched by the OMPI RTE, then * the jobid is in a special format - so get it */ opal_convert_string_to_jobid(&d->proc.jobid, data[n].proc.nspace); @@ -733,6 +736,20 @@ static void lk_cbfunc(pmix_status_t status, * jobid just be the hash of the nspace */ OPAL_HASH_STR(data[n].proc.nspace, d->proc.jobid); } + /* if we don't already have it, add this to our jobid tracker */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == d->proc.jobid) { + job = jptr; + break; + } + } + if (NULL == job) { + job = OBJ_NEW(opal_pmix1_jobid_trkr_t); + (void)strncpy(job->nspace, data[n].proc.nspace, PMIX_MAX_NSLEN); + job->jobid = d->proc.jobid; + opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super); + } if (PMIX_RANK_WILDCARD == data[n].proc.rank) { d->proc.vpid = OPAL_VPID_WILDCARD; } else { @@ -898,7 +915,7 @@ int 
pmix1_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid) ret = PMIx_Spawn(pinfo, ninfo, papps, napps, nspace); if (PMIX_SUCCESS == ret) { - if (native_launch) { + if (mca_pmix_pmix1xx_component.native_launch) { /* if we were launched by the OMPI RTE, then * the jobid is in a special format - so get it */ opal_convert_string_to_jobid(jobid, nspace); @@ -906,12 +923,12 @@ int pmix1_spawn(opal_list_t *job_info, opal_list_t *apps, opal_jobid_t *jobid) /* we were launched by someone else, so make the * jobid just be the hash of the nspace */ OPAL_HASH_STR(nspace, *jobid); - /* add this to our jobid tracker */ - job = OBJ_NEW(opal_pmix1_jobid_trkr_t); - (void)strncpy(job->nspace, nspace, PMIX_MAX_NSLEN); - job->jobid = *jobid; - opal_list_append(&jobids, &job->super); } + /* add this to our jobid tracker */ + job = OBJ_NEW(opal_pmix1_jobid_trkr_t); + (void)strncpy(job->nspace, nspace, PMIX_MAX_NSLEN); + job->jobid = *jobid; + opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super); } PMIX_APP_FREE(papps, napps); @@ -928,7 +945,7 @@ static void spcbfunc(pmix_status_t status, rc = pmix1_convert_rc(status); if (PMIX_SUCCESS == status) { - if (native_launch) { + if (mca_pmix_pmix1xx_component.native_launch) { /* if we were launched by the OMPI RTE, then * the jobid is in a special format - so get it */ opal_convert_string_to_jobid(&jobid, nspace); @@ -936,12 +953,12 @@ static void spcbfunc(pmix_status_t status, /* we were launched by someone else, so make the * jobid just be the hash of the nspace */ OPAL_HASH_STR(nspace, jobid); - /* add this to our jobid tracker */ - job = OBJ_NEW(opal_pmix1_jobid_trkr_t); - (void)strncpy(job->nspace, nspace, PMIX_MAX_NSLEN); - job->jobid = jobid; - opal_list_append(&jobids, &job->super); } + /* add this to our jobid tracker */ + job = OBJ_NEW(opal_pmix1_jobid_trkr_t); + (void)strncpy(job->nspace, nspace, PMIX_MAX_NSLEN); + job->jobid = jobid; + opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super); 
} op->spcbfunc(rc, jobid, op->cbdata); @@ -1004,7 +1021,7 @@ int pmix1_connect(opal_list_t *procs) pmix_proc_t *parray=NULL; size_t n, cnt=0; opal_namelist_t *ptr; - opal_pmix1_jobid_trkr_t *job; + opal_pmix1_jobid_trkr_t *job, *jptr; /* protect against bozo error */ if (NULL == procs || 0 == (cnt = opal_list_get_size(procs))) { @@ -1016,20 +1033,20 @@ int pmix1_connect(opal_list_t *procs) PMIX_PROC_CREATE(parray, cnt); n=0; OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == ptr->name.jobid) { + job = jptr; + break; } } + if (NULL == job) { + OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND); + return OPAL_ERR_NOT_FOUND; + } + (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); if (OPAL_VPID_WILDCARD == ptr->name.vpid) { parray[n].rank = PMIX_RANK_WILDCARD; } else { @@ -1070,18 +1087,12 @@ int pmix1_connectnb(opal_list_t *procs, PMIX_PROC_CREATE(op->procs, op->nprocs); n=0; OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) { - (void)strncpy(op->procs[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == ptr->name.jobid) { - (void)strncpy(op->procs[n].nspace, 
job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + OPAL_LIST_FOREACH(job, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (job->jobid == ptr->name.jobid) { + (void)strncpy(op->procs[n].nspace, job->nspace, PMIX_MAX_NSLEN); + break; } } if (OPAL_VPID_WILDCARD == ptr->name.vpid) { @@ -1115,18 +1126,12 @@ int pmix1_disconnect(opal_list_t *procs) PMIX_PROC_CREATE(parray, cnt); n=0; OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == ptr->name.jobid) { - (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + OPAL_LIST_FOREACH(job, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (job->jobid == ptr->name.jobid) { + (void)strncpy(parray[n].nspace, job->nspace, PMIX_MAX_NSLEN); + break; } } if (OPAL_VPID_WILDCARD == ptr->name.vpid) { @@ -1169,18 +1174,12 @@ int pmix1_disconnectnb(opal_list_t *procs, PMIX_PROC_CREATE(op->procs, op->nprocs); n=0; OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == ptr->name.jobid) { - (void)strncpy(op->procs[n].nspace, my_proc.nspace, PMIX_MAX_NSLEN); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == ptr->name.jobid) { - (void)strncpy(op->procs[n].nspace, job->nspace, PMIX_MAX_NSLEN); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + 
OPAL_LIST_FOREACH(job, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (job->jobid == ptr->name.jobid) { + (void)strncpy(op->procs[n].nspace, job->nspace, PMIX_MAX_NSLEN); + break; } } if (OPAL_VPID_WILDCARD == ptr->name.vpid) { @@ -1206,24 +1205,32 @@ int pmix1_resolve_peers(const char *nodename, opal_jobid_t jobid, opal_namelist_t *nm; int rc; pmix_status_t ret; + opal_pmix1_jobid_trkr_t *job, *jptr; if (OPAL_JOBID_WILDCARD == jobid) { nspace = NULL; } else { - nspace = opal_convert_jobid_to_string(jobid); + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == jobid) { + job = jptr; + break; + } + } + if (NULL == job) { + return OPAL_ERR_NOT_FOUND; + } + nspace = job->nspace; } ret = PMIx_Resolve_peers(nodename, nspace, &array, &nprocs); - if (NULL != nspace) { - free(nspace); - } rc = pmix1_convert_rc(ret); if (NULL != array && 0 < nprocs) { for (n=0; n < nprocs; n++) { nm = OBJ_NEW(opal_namelist_t); opal_list_append(procs, &nm->super); - if (native_launch) { + if (mca_pmix_pmix1xx_component.native_launch) { /* if we were launched by the OMPI RTE, then * the jobid is in a special format - so get it */ opal_convert_string_to_jobid(&nm->name.jobid, array[n].nspace); @@ -1232,6 +1239,21 @@ int pmix1_resolve_peers(const char *nodename, opal_jobid_t jobid, * jobid just be the hash of the nspace */ OPAL_HASH_STR(array[n].nspace, nm->name.jobid); } + /* if we don't already have it, add this to our jobid tracker */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == nm->name.jobid) { + job = jptr; + break; + } + } + if (NULL == job) { + job = OBJ_NEW(opal_pmix1_jobid_trkr_t); + /* track the pair just resolved for this proc - the caller's jobid may be the wildcard, in which case nspace is NULL */ + (void)strncpy(job->nspace, array[n].nspace, PMIX_MAX_NSLEN); + job->jobid = nm->name.jobid; + opal_list_append(&mca_pmix_pmix1xx_component.jobids, &job->super); + } nm->name.vpid = array[n].rank; } } @@ -1244,29 +1266,25 @@ int 
pmix1_resolve_nodes(opal_jobid_t jobid, char **nodelist) { pmix_status_t ret; char *nspace=NULL; - opal_pmix1_jobid_trkr_t *job; + opal_pmix1_jobid_trkr_t *job, *jptr; if (OPAL_JOBID_WILDCARD != jobid) { - /* if the jobid is my own, then we can just use - * my namespace */ - if (OPAL_PROC_MY_NAME.jobid == jobid) { - nspace = strdup(my_proc.nspace); - } else { - /* look thru our list of jobids and find the - * corresponding nspace */ - OPAL_LIST_FOREACH(job, &jobids, opal_pmix1_jobid_trkr_t) { - if (job->jobid == jobid) { - nspace = strdup(job->nspace); - break; - } + /* look thru our list of jobids and find the + * corresponding nspace */ + job = NULL; + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == jobid) { + job = jptr; + break; } } + if (NULL == job) { + return OPAL_ERR_NOT_FOUND; + } + nspace = job->nspace; } ret = PMIx_Resolve_nodes(nspace, nodelist); - if (NULL != nspace) { - free(nspace); - } return pmix1_convert_rc(ret);; } diff --git a/opal/mca/pmix/pmix1xx/pmix_pmix1.c b/opal/mca/pmix/pmix1xx/pmix_pmix1.c index f7c0290c726..4bcee893b04 100644 --- a/opal/mca/pmix/pmix1xx/pmix_pmix1.c +++ b/opal/mca/pmix/pmix1xx/pmix_pmix1.c @@ -44,6 +44,8 @@ /* These are functions used by both client and server to * access common functions in the embedded PMIx library */ +static const char *pmix1_get_nspace(opal_jobid_t jobid); +static void pmix1_register_jobid(opal_jobid_t jobid, const char *nspace); const opal_pmix_base_module_t opal_pmix_pmix1xx_module = { /* client APIs */ @@ -85,9 +87,39 @@ const opal_pmix_base_module_t opal_pmix_pmix1xx_module = { PMIx_Get_version, opal_pmix_base_register_handler, opal_pmix_base_deregister_handler, - pmix1_store_local + pmix1_store_local, + pmix1_get_nspace, + pmix1_register_jobid }; +static const char *pmix1_get_nspace(opal_jobid_t jobid) +{ + opal_pmix1_jobid_trkr_t *jptr; + + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if 
(jptr->jobid == jobid) { + return jptr->nspace; + } + } + return NULL; +} + +static void pmix1_register_jobid(opal_jobid_t jobid, const char *nspace) +{ + opal_pmix1_jobid_trkr_t *jptr; + + /* if we don't already have it, add this to our jobid tracker */ + OPAL_LIST_FOREACH(jptr, &mca_pmix_pmix1xx_component.jobids, opal_pmix1_jobid_trkr_t) { + if (jptr->jobid == jobid) { + return; + } + } + jptr = OBJ_NEW(opal_pmix1_jobid_trkr_t); + (void)strncpy(jptr->nspace, nspace, PMIX_MAX_NSLEN); + jptr->jobid = jobid; + opal_list_append(&mca_pmix_pmix1xx_component.jobids, &jptr->super); +} + pmix_status_t pmix1_convert_opalrc(int rc) { switch (rc) { @@ -436,6 +468,10 @@ int pmix1_value_unload(opal_value_t *kv, /**** INSTANTIATE INTERNAL CLASSES ****/ +OBJ_CLASS_INSTANCE(opal_pmix1_jobid_trkr_t, + opal_list_item_t, + NULL, NULL); + static void opcon(pmix1_opcaddy_t *p) { memset(&p->p, 0, sizeof(pmix_proc_t)); diff --git a/opal/mca/pmix/pmix1xx/pmix_pmix1_component.c b/opal/mca/pmix/pmix1xx/pmix_pmix1_component.c index aca0806ac48..840d02f4486 100644 --- a/opal/mca/pmix/pmix1xx/pmix_pmix1_component.c +++ b/opal/mca/pmix/pmix1xx/pmix_pmix1_component.c @@ -18,6 +18,7 @@ #include "opal_config.h" #include "opal/constants.h" +#include "opal/class/opal_list.h" #include "opal/util/proc.h" #include "opal/mca/pmix/pmix.h" #include "pmix1.h" @@ -41,43 +42,47 @@ static int pmix1xx_component_query(mca_base_module_t **module, int *priority); * and pointers to our public functions in it */ -opal_pmix_base_component_t mca_pmix_pmix1xx_component = { +mca_pmix_pmix1_component_t mca_pmix_pmix1xx_component = { + { /* First, the mca_component_t struct containing meta information about the component itself */ - .base_version = { + .base_version = { /* Indicate that we are a pmix v1.1.0 component (which also implies a specific MCA version) */ - OPAL_PMIX_BASE_VERSION_2_0_0, + OPAL_PMIX_BASE_VERSION_2_0_0, /* Component name and version */ - .mca_component_name = "pmix1xx", - 
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, - OPAL_RELEASE_VERSION), + .mca_component_name = "pmix1xx", + MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, + OPAL_RELEASE_VERSION), /* Component open and close functions */ - .mca_open_component = pmix1xx_open, - .mca_close_component = pmix1xx_close, - .mca_query_component = pmix1xx_component_query, - }, - /* Next the MCA v1.0.0 component meta data */ - .base_data = { + .mca_open_component = pmix1xx_open, + .mca_close_component = pmix1xx_close, + .mca_query_component = pmix1xx_component_query, + }, + /* Next the MCA v1.0.0 component meta data */ + .base_data = { /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } + MCA_BASE_METADATA_PARAM_CHECKPOINT + } + }, + .native_launch = false }; static int pmix1xx_open(void) { - + OBJ_CONSTRUCT(&mca_pmix_pmix1xx_component.jobids, opal_list_t); return OPAL_SUCCESS; } static int pmix1xx_close(void) { + OPAL_LIST_DESTRUCT(&mca_pmix_pmix1xx_component.jobids); return OPAL_SUCCESS; } diff --git a/opal/mca/pmix/s1/pmix_s1.c b/opal/mca/pmix/s1/pmix_s1.c index 2264062ea7d..b8d2536a077 100644 --- a/opal/mca/pmix/s1/pmix_s1.c +++ b/opal/mca/pmix/s1/pmix_s1.c @@ -49,6 +49,8 @@ static int s1_job_connect(opal_list_t *procs); static int s1_job_disconnect(opal_list_t *procs); static int s1_store_local(const opal_process_name_t *proc, opal_value_t *val); +static const char *s1_get_nspace(opal_jobid_t jobid); +static void s1_register_jobid(opal_jobid_t jobid, const char *nspace); const opal_pmix_base_module_t opal_pmix_s1_module = { s1_init, @@ -89,7 +91,9 @@ const opal_pmix_base_module_t opal_pmix_s1_module = { NULL, opal_pmix_base_register_handler, opal_pmix_base_deregister_handler, - s1_store_local + s1_store_local, + s1_get_nspace, + s1_register_jobid }; // usage accounting @@ -644,6 +648,14 @@ static int s1_store_local(const opal_process_name_t *proc, return OPAL_SUCCESS; } +static const char 
*s1_get_nspace(opal_jobid_t jobid) +{ + return NULL; +} +static void s1_register_jobid(opal_jobid_t jobid, const char *nspace) +{ + return; +} static char* pmix_error(int pmix_err) { diff --git a/opal/mca/pmix/s2/pmix_s2.c b/opal/mca/pmix/s2/pmix_s2.c index aa1cc219c16..889cba8ccd4 100644 --- a/opal/mca/pmix/s2/pmix_s2.c +++ b/opal/mca/pmix/s2/pmix_s2.c @@ -56,6 +56,8 @@ static int s2_job_connect(opal_list_t *procs); static int s2_job_disconnect(opal_list_t *procs); static int s2_store_local(const opal_process_name_t *proc, opal_value_t *val); +static const char *s2_get_nspace(opal_jobid_t jobid); +static void s2_register_jobid(opal_jobid_t jobid, const char *nspace); const opal_pmix_base_module_t opal_pmix_s2_module = { s2_init, @@ -96,7 +98,9 @@ const opal_pmix_base_module_t opal_pmix_s2_module = { NULL, opal_pmix_base_register_handler, opal_pmix_base_deregister_handler, - s2_store_local + s2_store_local, + s2_get_nspace, + s2_register_jobid }; // usage accounting @@ -663,6 +667,14 @@ static int s2_store_local(const opal_process_name_t *proc, return OPAL_SUCCESS; } +static const char *s2_get_nspace(opal_jobid_t jobid) +{ + return NULL; +} +static void s2_register_jobid(opal_jobid_t jobid, const char *nspace) +{ + return; +} static char* pmix_error(int pmix_err) {