
Commit b85f97f

ompi/comm: Improve MPI_Comm_create algorithm
Force only procs that are participating in the new Comm to decide what
CID is appropriate. This will have two advantages:

* Speedup Comm creation for small communicators: non-participating procs
  will not interfere.
* Reduce CID fragmentation: non-overlapping groups will be allowed to use
  the same CID.

Signed-off-by: Artem Polyakov <artpol84@gmail.com>
(cherry picked from commit 50a6d10)
1 parent 11ffb66 commit b85f97f
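
The change concerns which ranks take part in choosing the context ID (CID) when a new communicator is created. As a point of reference, here is a minimal, hypothetical MPI_Comm_create usage sketch (not part of this commit) in which only ranks 0 and 1 end up in the new communicator while the remaining ranks receive MPI_COMM_NULL; those remaining ranks are the "non-participating procs" the commit message refers to.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Group containing only ranks 0 and 1 (just rank 0 on a 1-rank run). */
    MPI_Group world_group, small_group;
    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
    int members[2] = {0, 1};
    int nmembers = (size < 2) ? 1 : 2;
    MPI_Group_incl(world_group, nmembers, members, &small_group);

    /* MPI_Comm_create is collective over the parent communicator, but only
     * the ranks named in small_group end up in the new communicator; every
     * other rank gets MPI_COMM_NULL back. */
    MPI_Comm small_comm;
    MPI_Comm_create(MPI_COMM_WORLD, small_group, &small_comm);

    if (MPI_COMM_NULL != small_comm) {
        printf("rank %d participates in the new communicator\n", rank);
        MPI_Comm_free(&small_comm);
    }

    MPI_Group_free(&small_group);
    MPI_Group_free(&world_group);
    MPI_Finalize();
    return 0;
}

With this commit, only the participating ranks drive the CID negotiation, which is what yields the speedup and the CID reuse described in the commit message.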

File tree

ompi/communicator/comm_cid.c

1 file changed: +56 -23 lines changed

ompi/communicator/comm_cid.c

Lines changed: 56 additions & 23 deletions
@@ -303,6 +303,7 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
     ompi_request_t *subreq;
     bool flag;
     int ret;
+    int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0);
@@ -318,54 +319,63 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
     /**
      * This is the real algorithm described in the doc
      */
-    flag = false;
-    context->nextlocal_cid = mca_pml.pml_max_contextid;
-    for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
-        flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
-                                                     context->comm);
-        if (true == flag) {
-            context->nextlocal_cid = i;
-            break;
+    if( participate ){
+        flag = false;
+        context->nextlocal_cid = mca_pml.pml_max_contextid;
+        for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
+            flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
+                                                         context->comm);
+            if (true == flag) {
+                context->nextlocal_cid = i;
+                break;
+            }
         }
+    } else {
+        context->nextlocal_cid = 0;
     }
 
     ret = context->allreduce_fn (&context->nextlocal_cid, &context->nextcid, 1, MPI_MAX,
                                  context, &subreq);
+    /* there was a failure during non-blocking collective
+     * all we can do is abort
+     */
     if (OMPI_SUCCESS != ret) {
-        ompi_comm_cid_lowest_id = INT64_MAX;
-        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
-        return ret;
+        goto err_exit;
     }
 
-    if ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) {
-        /* at least one peer ran out of CIDs */
-        if (flag) {
-            opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
-        }
-
-        ompi_comm_cid_lowest_id = INT64_MAX;
-        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
-        return OMPI_ERR_OUT_OF_RESOURCE;
+    if ( ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) ) {
+        /* Our local CID space is out, others already aware (allreduce above) */
+        ret = OMPI_ERR_OUT_OF_RESOURCE;
+        goto err_exit;
     }
     OPAL_THREAD_UNLOCK(&ompi_cid_lock);
 
     /* next we want to verify that the resulting commid is ok */
     return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1);
+ err_exit:
+    if (participate && flag) {
+        opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+    }
+    ompi_comm_cid_lowest_id = INT64_MAX;
+    OPAL_THREAD_UNLOCK(&ompi_cid_lock);
+    return ret;
+
 }
 
 static int ompi_comm_checkcid (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
     ompi_request_t *subreq;
     int ret;
+    int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
     }
 
     context->flag = (context->nextcid == context->nextlocal_cid);
 
-    if (!context->flag) {
+    if ( participate && !context->flag) {
         opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
 
         context->flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators,
@@ -377,22 +387,45 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
     ret = context->allreduce_fn (&context->flag, &context->rflag, 1, MPI_MIN, context, &subreq);
     if (OMPI_SUCCESS == ret) {
         ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, &subreq, 1);
+    } else {
+        if (participate && context->flag ) {
+            opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+        }
+        ompi_comm_cid_lowest_id = INT64_MAX;
     }
 
     OPAL_THREAD_UNLOCK(&ompi_cid_lock);
-
     return ret;
 }
 
 static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
+    int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
     }
 
     if (1 == context->rflag) {
+        if( !participate ) {
+            /* we need to provide something sane here
+             * but we cannot use `nextcid` as we may have it
+             * in-use, go ahead with next locally-available CID
+             */
+            context->nextlocal_cid = mca_pml.pml_max_contextid;
+            for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
+                bool flag;
+                flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
+                                                             context->comm);
+                if (true == flag) {
+                    context->nextlocal_cid = i;
+                    break;
+                }
+            }
+            context->nextcid = context->nextlocal_cid;
+        }
+
         /* set the according values to the newcomm */
         context->newcomm->c_contextid = context->nextcid;
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, context->newcomm);
@@ -405,7 +438,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
         return OMPI_SUCCESS;
     }
 
-    if (1 == context->flag) {
+    if (participate && (1 == context->flag)) {
         /* we could use this cid, but other don't agree */
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL);
         context->start = context->nextcid + 1; /* that's where we can start the next round */
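
To illustrate the agreement step in the hunks above: each participating rank proposes its lowest locally free CID, non-participating ranks now contribute 0 (neutral under MPI_MAX), and an allreduce over the parent communicator yields the value everyone will try to use. The standalone sketch below mimics only that first round with plain MPI calls; MAX_CID and local_cid_is_free() are hypothetical stand-ins for mca_pml.pml_max_contextid and the opal_pointer_array bookkeeping, the even/odd split is just an example of who participates, and the verification/retry loop (ompi_comm_checkcid / ompi_comm_nextcid_check_flag) is omitted.

#include <mpi.h>
#include <stdio.h>

#define MAX_CID 1024   /* hypothetical stand-in for mca_pml.pml_max_contextid */

/* Hypothetical local bookkeeping: pretend CIDs below 4 are already in use. */
static int local_cid_is_free(int cid)
{
    return cid >= 4;
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Example split: even ranks participate in the new communicator,
     * odd ranks do not (as if they had received MPI_COMM_NULL). */
    int participate = (rank % 2 == 0);

    /* Participants propose their lowest locally free CID; non-participants
     * contribute 0, which can never raise the MPI_MAX result. */
    int proposal = 0;
    if (participate) {
        proposal = MAX_CID;
        for (int i = 0; i < MAX_CID; ++i) {
            if (local_cid_is_free(i)) {
                proposal = i;
                break;
            }
        }
    }

    int agreed;
    MPI_Allreduce(&proposal, &agreed, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

    if (agreed == MAX_CID) {
        fprintf(stderr, "rank %d: at least one participant ran out of CIDs\n", rank);
    } else if (participate) {
        printf("rank %d: candidate CID is %d\n", rank, agreed);
    }

    MPI_Finalize();
    return 0;
}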
