Skip to content

Commit 1dee7b2

Browse files
authored
Merge pull request #3230 from rhc54/cmr3x/abort
v3.x: If we lose connection to the server after initiating a send/recv in P…
2 parents a6ac93b + a358c7b commit 1dee7b2

File tree

2 files changed

+24
-12
lines changed

2 files changed

+24
-12
lines changed

opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,7 @@ PMIX_EXPORT pmix_status_t PMIx_Abort(int flag, const char msg[],
492 492
pmix_buffer_t *bfr;
493 493
pmix_cmd_t cmd = PMIX_ABORT_CMD;
494 494
pmix_status_t rc;
495-
pmix_ptl_sr_t cb;
495+
volatile bool active;
496 496

497 497
pmix_output_verbose(2, pmix_globals.debug_output,
498 498
"pmix:client abort called");
@@ -541,23 +541,15 @@ PMIX_EXPORT pmix_status_t PMIx_Abort(int flag, const char msg[],
541 541
}
542 542
}
543 543

544-
/* create a callback object as we need to pass it to the
545-
* recv routine so we know which callback to use when
546-
* the return message is recvd */
547-
PMIX_CONSTRUCT(&cb, pmix_ptl_sr_t);
548-
cb.active = true;
549-
cb.cbfunc = wait_cbfunc;
550-
551 544
/* send to the server */
545+
active = true;
552 546
if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, bfr,
553-
wait_cbfunc, &cb))){
554-
PMIX_DESTRUCT(&cb);
547+
wait_cbfunc, (void*)&active))){
555 548
return rc;
556 549
}
557 550

558 551
/* wait for the release */
559-
PMIX_WAIT_FOR_COMPLETION(cb.active);
560-
PMIX_DESTRUCT(&cb);
552+
PMIX_WAIT_FOR_COMPLETION(active);
561 553
return PMIX_SUCCESS;
562 554
}
563 555

opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err)
62 62
pmix_regevents_info_t *reginfoptr, *regnext;
63 63
pmix_peer_events_info_t *pr, *pnext;
64 64
pmix_rank_info_t *info, *pinfo;
65+
pmix_ptl_posted_recv_t *rcv;
66+
pmix_buffer_t buf;
67+
pmix_ptl_hdr_t hdr;
65 68

66 69
/* stop all events */
67 70
if (peer->recv_ev_active) {
@@ -143,6 +146,23 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err)
143 146
pmix_globals.connected = false;
144 147
/* set the public error status */
145 148
err = PMIX_ERR_LOST_CONNECTION_TO_SERVER;
149+
/* it is possible that we have sendrecv's in progress where
150+
* we are waiting for a response to arrive. Since we have
151+
* lost connection to the server, that will never happen.
152+
* Thus, to preclude any chance of hanging, cycle thru
153+
* the list of posted recvs and complete any that are
154+
* the return call from a sendrecv - i.e., any that are
155+
* waiting on dynamic tags */
156+
PMIX_CONSTRUCT(&buf, pmix_buffer_t);
157+
hdr.nbytes = 0; // initialize the hdr to something safe
158+
PMIX_LIST_FOREACH(rcv, &pmix_ptl_globals.posted_recvs, pmix_ptl_posted_recv_t) {
159+
if (UINT_MAX != rcv->tag && NULL != rcv->cbfunc) {
160+
/* construct and load the buffer */
161+
hdr.tag = rcv->tag;
162+
rcv->cbfunc(pmix_globals.mypeer, &hdr, &buf, rcv->cbdata);
163+
}
164+
}
165+
PMIX_DESTRUCT(&buf);
146 166
}
147 167
PMIX_REPORT_EVENT(err, _notify_complete);
148 168
}

0 commit comments

Comments
 (0)