From 5beaa97c1b85708d853a0710bd75ba1aaca44f71 Mon Sep 17 00:00:00 2001 From: Piotr Lesnicki Date: Tue, 23 May 2017 20:32:55 +0200 Subject: [PATCH 1/4] mtl/portals4: add timeout to get retransmit Signed-off-by: Todd Kordenbrock --- ompi/mca/mtl/portals4/mtl_portals4.h | 1 + ompi/mca/mtl/portals4/mtl_portals4_component.c | 10 ++++++++++ ompi/mca/mtl/portals4/mtl_portals4_recv.c | 11 ++++++++++- ompi/mca/mtl/portals4/mtl_portals4_request.h | 2 ++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/ompi/mca/mtl/portals4/mtl_portals4.h b/ompi/mca/mtl/portals4/mtl_portals4.h index bfbb53f6b42..52b21b9354d 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4.h +++ b/ompi/mca/mtl/portals4/mtl_portals4.h @@ -73,6 +73,7 @@ struct mca_mtl_portals4_module_t { /* free list of rendezvous get fragments */ opal_free_list_t fl_rndv_get_frag; + int get_retransmit_timeout; /** Network interface handle for matched interface */ ptl_handle_ni_t ni_h; diff --git a/ompi/mca/mtl/portals4/mtl_portals4_component.c b/ompi/mca/mtl/portals4/mtl_portals4_component.c index 9b36b091acd..915e3e2fc74 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_component.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_component.c @@ -202,6 +202,16 @@ ompi_mtl_portals4_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_mtl_portals4.max_msg_size_mtl); + ompi_mtl_portals4.get_retransmit_timeout=10000; + (void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version, + "get_retransmit_timeout", + "PtlGET retransmission timeout in usec", + MCA_BASE_VAR_TYPE_INT, + NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mtl_portals4.get_retransmit_timeout); + OBJ_RELEASE(new_enum); if (0 > ret) { return OMPI_ERR_NOT_SUPPORTED; diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv.c b/ompi/mca/mtl/portals4/mtl_portals4_recv.c index 607a5c96271..c5270005017 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv.c @@ -27,6 +27,7 @@ 
#include "ompi/mca/mtl/base/base.h" #include "ompi/mca/mtl/base/mtl_base_datatype.h" #include "ompi/message/message.h" +#include "opal/mca/timer/base/base.h" #include "mtl_portals4.h" #include "mtl_portals4_endpoint.h" @@ -81,6 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target, frag->frag_remote_offset = remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl; frag->event_callback = ompi_mtl_portals4_rndv_get_frag_progress; + frag->frag_start_time_usec = opal_timer_base_get_usec(); OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d, size %ld) send", i + 1, frag_count, frag->frag_length)); @@ -322,17 +324,24 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev, ompi_mtl_portals4_recv_request_t* ptl_request = (ompi_mtl_portals4_recv_request_t*) rndv_get_frag->request; - assert(ev->type==PTL_EVENT_REPLY); + assert(PTL_EVENT_REPLY == ev->type); OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "Recv %lu (0x%lx) got reply event", ptl_request->opcount, ptl_request->hdr_data)); + if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d", __FILE__, __LINE__, ev->ni_fail_type); + opal_timer_t time = opal_timer_base_get_usec() - rndv_get_frag->frag_start_time_usec; + if (time > (unsigned int) ompi_mtl_portals4.get_retransmit_timeout) { + mtl_ptl_error(1, "timeout retrying GET"); + return OMPI_ERROR; + } + OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "Rendezvous Get Failed: Reissuing frag #%u", rndv_get_frag->frag_num)); diff --git a/ompi/mca/mtl/portals4/mtl_portals4_request.h b/ompi/mca/mtl/portals4/mtl_portals4_request.h index b7ae187d6ef..a54090f6837 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_request.h +++ b/ompi/mca/mtl/portals4/mtl_portals4_request.h @@ -22,6 +22,7 @@ #include "opal/datatype/opal_convertor.h" #include "ompi/mca/mtl/mtl.h" +#include
"opal/mca/timer/base/base.h" struct ompi_mtl_portals4_message_t; struct ompi_mtl_portals4_pending_request_t; @@ -93,6 +94,7 @@ struct ompi_mtl_portals4_rndv_get_frag_t { ptl_process_t frag_target; ptl_hdr_data_t frag_match_bits; ptl_size_t frag_remote_offset; + opal_timer_t frag_start_time_usec; int (*event_callback)(ptl_event_t *ev, struct ompi_mtl_portals4_rndv_get_frag_t*); From cbaae7eeaecadeda4c1eebd977ef3197ec78680a Mon Sep 17 00:00:00 2001 From: Piotr Lesnicki Date: Tue, 23 May 2017 20:46:13 +0200 Subject: [PATCH 2/4] mtl/portals4: get retransmission REPLY code Signed-off-by: Todd Kordenbrock --- ompi/mca/mtl/portals4/mtl_portals4_recv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv.c b/ompi/mca/mtl/portals4/mtl_portals4_recv.c index c5270005017..11feb79a7c1 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv.c @@ -336,6 +336,12 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev, "%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d", __FILE__, __LINE__, ev->ni_fail_type); + if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_DROPPED)) { + mtl_ptl_error(1, "PTL_EVENT_REPLY with ni_fail_type: %s" + " => cannot retry", + name_of_err[ev->ni_fail_type]); + } + opal_timer_t time = opal_timer_base_get_usec() - rndv_get_frag->frag_start_time_usec; if (time > (unsigned int) ompi_mtl_portals4.get_retransmit_timeout) { mtl_ptl_error(1, "timeout retrying GET"); From f00ff9de01b99722a1135de810cd12674edeb8b4 Mon Sep 17 00:00:00 2001 From: Todd Kordenbrock Date: Mon, 12 Jun 2017 14:03:31 -0500 Subject: [PATCH 3/4] mtl/portals4: if frag retry fails, then fail the entire receive If a frag cannot be retried because the ni_fail_type is other than PTL_NI_DROPPED, then set the return code and jump to callback_error. This sets MPI_ERROR and completes the receive. 
Signed-off-by: Todd Kordenbrock --- ompi/mca/mtl/portals4/mtl_portals4_recv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv.c b/ompi/mca/mtl/portals4/mtl_portals4_recv.c index 11feb79a7c1..c2dcd27a6ca 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv.c @@ -340,6 +340,8 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev, mtl_ptl_error(1, "PTL_EVENT_REPLY with ni_fail_type: %s" " => cannot retry", name_of_err[ev->ni_fail_type]); + ret = PTL_FAIL; + goto callback_error; } opal_timer_t time = opal_timer_base_get_usec() - rndv_get_frag->frag_start_time_usec; From 4ca032e93074854d108c9e0bfb8db0c9a594b048 Mon Sep 17 00:00:00 2001 From: Todd Kordenbrock Date: Mon, 12 Jun 2017 18:17:41 -0500 Subject: [PATCH 4/4] mtl/portals4: move opal_timer_base_get_usec() out of the fast path Rearrange the receive frag timeout logic to avoid calling opal_timer_base_get_usec() in read_msg(). Instead set it at the first retry. 
Signed-off-by: Todd Kordenbrock --- ompi/mca/mtl/portals4/mtl_portals4_recv.c | 25 +++++++++++++------- ompi/mca/mtl/portals4/mtl_portals4_request.h | 3 ++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv.c b/ompi/mca/mtl/portals4/mtl_portals4_recv.c index c2dcd27a6ca..230b3785532 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv.c @@ -82,7 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target, frag->frag_remote_offset = remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl; frag->event_callback = ompi_mtl_portals4_rndv_get_frag_progress; - frag->frag_start_time_usec = opal_timer_base_get_usec(); + frag->frag_abs_timeout_usec = 0; OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d, size %ld) send", i + 1, frag_count, frag->frag_length)); @@ -337,17 +337,26 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev, __FILE__, __LINE__, ev->ni_fail_type); if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_DROPPED)) { - mtl_ptl_error(1, "PTL_EVENT_REPLY with ni_fail_type: %s" - " => cannot retry", - name_of_err[ev->ni_fail_type]); + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry", + (uint32_t)ev->ni_fail_type); ret = PTL_FAIL; goto callback_error; } - opal_timer_t time = opal_timer_base_get_usec() - rndv_get_frag->frag_start_time_usec; - if (time > (unsigned int) ompi_mtl_portals4.get_retransmit_timeout) { - mtl_ptl_error(1, "timeout retrying GET"); - return OMPI_ERROR; + if (0 == rndv_get_frag->frag_abs_timeout_usec) { + /* this is the first retry of the frag. start the timer. */ + /* instead of recording the start time, record the end time + * and avoid addition on each retry. 
*/ + rndv_get_frag->frag_abs_timeout_usec = opal_timer_base_get_usec() + ompi_mtl_portals4.get_retransmit_timeout; + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "setting frag timeout at %lu", + rndv_get_frag->frag_abs_timeout_usec); + } else if (opal_timer_base_get_usec() >= rndv_get_frag->frag_abs_timeout_usec) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "timeout retrying GET"); + ret = PTL_FAIL; + goto callback_error; } OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, diff --git a/ompi/mca/mtl/portals4/mtl_portals4_request.h b/ompi/mca/mtl/portals4/mtl_portals4_request.h index a54090f6837..c7e3c31e47a 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_request.h +++ b/ompi/mca/mtl/portals4/mtl_portals4_request.h @@ -94,7 +94,8 @@ struct ompi_mtl_portals4_rndv_get_frag_t { ptl_process_t frag_target; ptl_hdr_data_t frag_match_bits; ptl_size_t frag_remote_offset; - opal_timer_t frag_start_time_usec; + /* the absolute time at which this frag times out */ + opal_timer_t frag_abs_timeout_usec; int (*event_callback)(ptl_event_t *ev, struct ompi_mtl_portals4_rndv_get_frag_t*);