From c0d19cf9510ab96ccac3d1ef20f3abba6ae03642 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 7 Oct 2014 11:45:22 -0600 Subject: [PATCH 1/2] osc/rdma: fix issue identified by Berk Hess osc/rdma uses counters to determine if all messages have been received before exiting synchronization calls. The problem is that the active target counter is always increasing (never zeroed). If over 2^31-1 messages are sent this causes the counter to overflow (in itself this isn't an error). This causes test/wait to return before the communication is complete. There is an additional error in the use of the fragment flush function. If PSCW synchronization is in use this function CAN NOT be called unless a post message has arrived. Relevant mailing list thread: http://www.open-mpi.org/community/lists/devel/2014/10/16016.php This commit fixes both issues. Tested against MTT and issue reproducer. (cherry picked from commit open-mpi/ompi@eed7b45db59b356f6257f3b019e77643f14f84ce) --- ompi/mca/osc/rdma/osc_rdma.h | 4 +-- ompi/mca/osc/rdma/osc_rdma_active_target.c | 4 +-- ompi/mca/osc/rdma/osc_rdma_comm.c | 42 +++++++++++++++++++++- ompi/mca/osc/rdma/osc_rdma_data_move.c | 14 +++++--- ompi/mca/osc/rdma/osc_rdma_frag.c | 9 +++-- 5 files changed, 59 insertions(+), 14 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 5bb8bbe953..1ebef9d90c 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -423,8 +423,8 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int { if (MPI_PROC_NULL == source) { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "mark_incoming_completion marking active incoming complete. count = %d", - (int) module->active_incoming_frag_count + 1)); + "mark_incoming_completion marking active incoming complete. count = %d. signal = %d", + (int) module->active_incoming_frag_count + 1, module->active_incoming_frag_signal_count)); OPAL_THREAD_ADD32(&module->active_incoming_frag_count, 1); if (module->active_incoming_frag_count >= module->active_incoming_frag_signal_count) { opal_condition_broadcast(&module->cond); diff --git a/ompi/mca/osc/rdma/osc_rdma_active_target.c b/ompi/mca/osc/rdma/osc_rdma_active_target.c index 135aa87da2..e2b2ab4a91 100644 --- a/ompi/mca/osc/rdma/osc_rdma_active_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_active_target.c @@ -461,7 +461,7 @@ ompi_osc_rdma_wait(ompi_win_t *win) OPAL_THREAD_LOCK(&module->lock); while (0 != module->num_complete_msgs || - module->active_incoming_frag_count < module->active_incoming_frag_signal_count) { + module->active_incoming_frag_count != module->active_incoming_frag_signal_count) { OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "num_complete_msgs = %d, active_incoming_frag_count = %d, active_incoming_frag_signal_count = %d", module->num_complete_msgs, module->active_incoming_frag_count, module->active_incoming_frag_signal_count)); @@ -501,7 +501,7 @@ ompi_osc_rdma_test(ompi_win_t *win, OPAL_THREAD_LOCK(&(module->lock)); if (0 != module->num_complete_msgs || - module->active_incoming_frag_count < module->active_incoming_frag_signal_count) { + module->active_incoming_frag_count != module->active_incoming_frag_signal_count) { *flag = 0; ret = OMPI_SUCCESS; goto cleanup; diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index b3de4df0c5..4f0399f792 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -349,6 +349,16 @@ static inline int ompi_osc_rdma_put_w_req (void *origin_addr, int origin_count, tag = get_tag(module); } + /* flush will be called at the end of this function. make sure the post message has + * arrived. */ + if ((is_long_msg || request) && module->sc_group) { + while (0 != module->num_post_msgs) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "waiting for post messages. num_post_msgs = %d", module->num_post_msgs)); + opal_condition_wait(&module->cond, &module->lock); + } + } + OPAL_THREAD_UNLOCK(&module->lock); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, @@ -525,6 +535,16 @@ ompi_osc_rdma_accumulate_w_req (void *origin_addr, int origin_count, tag = get_tag (module); } + /* flush will be called at the end of this function. make sure the post message has + * arrived. */ + if ((is_long_msg || request) && module->sc_group) { + while (0 != module->num_post_msgs) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "waiting for post messages. num_post_msgs = %d", module->num_post_msgs)); + opal_condition_wait(&module->cond, &module->lock); + } + } + OPAL_THREAD_UNLOCK(&module->lock); header = (ompi_osc_rdma_header_acc_t*) ptr; @@ -607,7 +627,7 @@ ompi_osc_rdma_accumulate_w_req (void *origin_addr, int origin_count, ret = ompi_osc_rdma_frag_finish(module, frag); - if (request) { + if (is_long_msg || request) { /* need to flush now in case the caller decides to wait on the request */ ompi_osc_rdma_frag_flush_target (module, target); } @@ -858,6 +878,16 @@ static inline int ompi_osc_rdma_rget_internal (void *origin_addr, int origin_cou /* for bookkeeping the get is "outgoing" */ ompi_osc_signal_outgoing (module, target, 1); + /* flush will be called at the end of this function. make sure the post message has + * arrived. */ + if (!release_req && module->sc_group) { + while (0 != module->num_post_msgs) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "waiting for post messages. num_post_msgs = %d", module->num_post_msgs)); + opal_condition_wait(&module->cond, &module->lock); + } + } + OPAL_THREAD_UNLOCK(&module->lock); header = (ompi_osc_rdma_header_get_t*) ptr; @@ -1088,6 +1118,16 @@ int ompi_osc_rdma_rget_accumulate_internal (void *origin_addr, int origin_count, /* increment the number of outgoing fragments */ ompi_osc_signal_outgoing (module, target_rank, rdma_request->outstanding_requests); + /* flush will be called at the end of this function. make sure the post message has + * arrived. */ + if (!release_req && module->sc_group) { + while (0 != module->num_post_msgs) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "waiting for post messages. num_post_msgs = %d", module->num_post_msgs)); + opal_condition_wait(&module->cond, &module->lock); + } + } + OPAL_THREAD_UNLOCK(&module->lock); header = (ompi_osc_rdma_header_acc_t *) ptr; diff --git a/ompi/mca/osc/rdma/osc_rdma_data_move.c b/ompi/mca/osc/rdma/osc_rdma_data_move.c index f4e8d21399..833a2c6feb 100644 --- a/ompi/mca/osc/rdma/osc_rdma_data_move.c +++ b/ompi/mca/osc/rdma/osc_rdma_data_move.c @@ -1331,8 +1331,10 @@ static inline int process_complete (ompi_osc_rdma_module_t *module, int source, ompi_osc_rdma_header_complete_t *complete_header) { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "osc rdma: process_complete got complete message from %d. expected fragment count %d", - source, complete_header->frag_count)); + "osc rdma: process_complete got complete message from %d. expected fragment count %d. " + "current signal count %d. current incomming count: %d", + source, complete_header->frag_count, module->active_incoming_frag_signal_count, + module->active_incoming_frag_count)); OPAL_THREAD_LOCK(&module->lock); @@ -1677,8 +1679,12 @@ static int ompi_osc_rdma_callback (ompi_request_t *request) /* restart the receive request */ OPAL_THREAD_LOCK(&module->lock); - mark_incoming_completion (module, (base_header->flags & OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET) ? - source : MPI_PROC_NULL); + /* post messages come unbuffered and should NOT increment the incoming completion + * counters */ + if (OMPI_OSC_RDMA_HDR_TYPE_POST != base_header->type) { + mark_incoming_completion (module, (base_header->flags & OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET) ? + source : MPI_PROC_NULL); + } osc_rdma_gc_clean (); diff --git a/ompi/mca/osc/rdma/osc_rdma_frag.c b/ompi/mca/osc/rdma/osc_rdma_frag.c index 09ef067d09..6ee389d9bd 100644 --- a/ompi/mca/osc/rdma/osc_rdma_frag.c +++ b/ompi/mca/osc/rdma/osc_rdma_frag.c @@ -54,7 +54,6 @@ static int frag_send_cb (ompi_request_t *request) return OMPI_SUCCESS; } - static int frag_send(ompi_osc_rdma_module_t *module, ompi_osc_rdma_frag_t *frag) @@ -67,6 +66,10 @@ frag_send(ompi_osc_rdma_module_t *module, "osc rdma: frag_send called to %d, frag = %p, count = %d", frag->target, (void *) frag, count)); + /* we need to signal now that a frag is outgoing to ensure the count sent + * with the unlock message is correct */ + ompi_osc_signal_outgoing (module, frag->target, 1); + return ompi_osc_rdma_isend_w_cb (frag->buffer, count, MPI_BYTE, frag->target, OSC_RDMA_FRAG_TAG, module->comm, frag_send_cb, frag); } @@ -81,10 +84,6 @@ ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module, assert(0 == frag->pending); assert(module->peers[frag->target].active_frag != frag); - /* we need to signal now that a frag is outgoing to ensure the count sent - * with the unlock message is correct */ - ompi_osc_signal_outgoing (module, frag->target, 1); - /* if eager sends are not active, can't send yet, so buffer and get out... */ if (module->passive_target_access_epoch) { From 0c02000097cf194236a6ead7e78c78c613610f93 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 22 Oct 2014 15:34:29 -0600 Subject: [PATCH 2/2] osc/rdma: use unsigned types for all counters Some of the counters used by the "rdma" one-sided component are intended to overflow. Since overflow behavior is undefined for signed integers in C it is safer to use unsigned integers here. (cherry picked from commit open-mpi/ompi@23dd3af946dbfec55f3261c2a1a8e3ea30ed695c) --- ompi/mca/osc/rdma/osc_rdma.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 1ebef9d90c..a4ab11b9f7 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -130,29 +130,29 @@ struct ompi_osc_rdma_module_t { /** Nmber of communication fragments started for this epoch, by peer. Not in peer data to make fence more manageable. */ - int32_t *epoch_outgoing_frag_count; + uint32_t *epoch_outgoing_frag_count; /** List of full communication buffers queued to be sent. Should be maintained in order (at least in per-target order). */ opal_list_t queued_frags; /** cyclic counter for a unique tage for long messages. */ - int tag_counter; + unsigned int tag_counter; /* Number of outgoing fragments that have completed since the begining of time */ - int32_t outgoing_frag_count; + uint32_t outgoing_frag_count; /* Next outgoing fragment count at which we want a signal on cond */ - int32_t outgoing_frag_signal_count; + uint32_t outgoing_frag_signal_count; /* Number of incoming fragments that have completed since the begining of time */ - int32_t active_incoming_frag_count; + uint32_t active_incoming_frag_count; /* Next incoming buffer count at which we want a signal on cond */ - int32_t active_incoming_frag_signal_count; + uint32_t active_incoming_frag_signal_count; - int32_t *passive_incoming_frag_count; - int32_t *passive_incoming_frag_signal_count; + uint32_t *passive_incoming_frag_count; + uint32_t *passive_incoming_frag_signal_count; /* Number of flush ack requests send since beginning of time */ uint64_t flush_ack_requested_count; @@ -425,7 +425,7 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "mark_incoming_completion marking active incoming complete. count = %d. signal = %d", (int) module->active_incoming_frag_count + 1, module->active_incoming_frag_signal_count)); - OPAL_THREAD_ADD32(&module->active_incoming_frag_count, 1); + OPAL_THREAD_ADD32((int32_t *) &module->active_incoming_frag_count, 1); if (module->active_incoming_frag_count >= module->active_incoming_frag_signal_count) { opal_condition_broadcast(&module->cond); } @@ -433,7 +433,7 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "mark_incoming_completion marking passive incoming complete. source = %d, count = %d", source, (int) module->passive_incoming_frag_count[source] + 1)); - OPAL_THREAD_ADD32(module->passive_incoming_frag_count + source, 1); + OPAL_THREAD_ADD32((int32_t *) module->passive_incoming_frag_count + source, 1); if (module->passive_incoming_frag_count[source] >= module->passive_incoming_frag_signal_count[source]) { opal_condition_broadcast(&module->cond); } @@ -455,7 +455,7 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int */ static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module) { - OPAL_THREAD_ADD32(&module->outgoing_frag_count, 1); + OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_count, 1); if (module->outgoing_frag_count >= module->outgoing_frag_signal_count) { opal_condition_broadcast(&module->cond); } @@ -475,12 +475,12 @@ static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module) */ static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int target, int count) { - OPAL_THREAD_ADD32(&module->outgoing_frag_signal_count, count); + OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_signal_count, count); if (MPI_PROC_NULL != target) { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_signal_outgoing_passive: target = %d, count = %d, total = %d", target, count, module->epoch_outgoing_frag_count[target] + count)); - OPAL_THREAD_ADD32(module->epoch_outgoing_frag_count + target, count); + OPAL_THREAD_ADD32((int32_t *) module->epoch_outgoing_frag_count + target, count); } }