From 8186b6ba57bddc3898b0b741f7e66d56fd4522bb Mon Sep 17 00:00:00 2001 From: yohann Date: Fri, 12 Feb 2016 10:05:52 -0800 Subject: [PATCH 1/8] mtl/ofi: Fix dynamic add_procs. (cherry-picked from open-mpi/ompi@b3d8ead76e6d4989812266451dfc06a25afd9807) --- ompi/mca/mtl/ofi/mtl_ofi.h | 15 +++++---------- ompi/mca/mtl/ofi/mtl_ofi_endpoint.h | 15 +++++++++++++-- ompi/mca/mtl/ofi/mtl_ofi_request.h | 4 ++++ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 2bb95ab4a2..3d058e74f9 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -51,10 +51,6 @@ BEGIN_C_DECLS extern mca_mtl_ofi_module_t ompi_mtl_ofi; extern mca_base_framework_t ompi_mtl_base_framework; -extern int ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl, - size_t nprocs, - struct ompi_proc_t **procs); - extern int ompi_mtl_ofi_del_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs, struct ompi_proc_t **procs); @@ -236,7 +232,7 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, ompi_mtl_ofi_request_t *ack_req = NULL; /* For synchronous send */ ompi_proc = ompi_comm_peer_lookup(comm, dest); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after); if (OMPI_SUCCESS != ompi_ret) return ompi_ret; @@ -461,7 +457,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc, if (ompi_mtl_ofi.any_addr == ofi_req->remote_addr) { src = MTL_OFI_GET_SOURCE(wc->tag); ompi_proc = ompi_comm_peer_lookup(ofi_req->comm, src); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc); ofi_req->remote_addr = endpoint->peer_fiaddr; } MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep, @@ -533,7 +529,7 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl, if (MPI_ANY_SOURCE != src) { ompi_proc = ompi_comm_peer_lookup(comm, src); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); remote_addr = endpoint->peer_fiaddr; } else { remote_addr = ompi_mtl_ofi.any_addr; @@ -745,7 +741,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl, */ if (MPI_ANY_SOURCE != src) { ompi_proc = ompi_comm_peer_lookup( comm, src ); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); remote_proc = endpoint->peer_fiaddr; } @@ -830,7 +826,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl, */ if (MPI_ANY_SOURCE != src) { ompi_proc = ompi_comm_peer_lookup( comm, src ); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); remote_proc = endpoint->peer_fiaddr; } @@ -962,7 +958,6 @@ ompi_mtl_ofi_del_comm(struct mca_mtl_base_module_t *mtl, return OMPI_SUCCESS; } - END_C_DECLS #endif /* MTL_OFI_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h b/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h index 2799d495b5..e886dcea23 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h @@ -11,10 +11,12 @@ #ifndef OMPI_MTL_OFI_ENDPOINT_H #define OMPI_MTL_OFI_ENDPOINT_H -#include "mtl_ofi.h" - BEGIN_C_DECLS +extern int ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl, + size_t nprocs, + struct ompi_proc_t **procs); + OBJ_CLASS_DECLARATION(mca_mtl_ofi_endpoint_t); /** @@ -37,5 +39,14 @@ struct mca_mtl_ofi_endpoint_t { typedef struct mca_mtl_ofi_endpoint_t mca_mtl_ofi_endpoint_t; OBJ_CLASS_DECLARATION(mca_mtl_ofi_endpoint); +static inline mca_mtl_ofi_endpoint_t *ompi_mtl_ofi_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc) +{ + if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) { + ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc); + } + + return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; +} + END_C_DECLS #endif diff --git a/ompi/mca/mtl/ofi/mtl_ofi_request.h b/ompi/mca/mtl/ofi/mtl_ofi_request.h index ee544073cc..dc35a31f07 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_request.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_request.h @@ -55,6 +55,10 @@ struct ompi_mtl_ofi_request_t { /* lookup source of an ANY_SOURCE Recv */ struct ompi_communicator_t *comm; + /** Reference to the MTL used to lookup */ + /* source of an ANY_SOURCE Recv */ + struct mca_mtl_base_module_t* mtl; + /** Pack buffer */ void *buffer; From eb4040b639c95acb3abcac937d882d6365a1fd6b Mon Sep 17 00:00:00 2001 From: yohann Date: Fri, 12 Feb 2016 10:06:52 -0800 Subject: [PATCH 2/8] mtl/ofi: FI_AV_MAP support only. (cherry-picked from open-mpi/ompi@67ce4a080aacaa882050ca467c1c1218ad075ae4) --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 6b4fa3f4b2..f8be2ff2e7 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -241,6 +241,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, hints->domain_attr->threading = FI_THREAD_UNSPEC; hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; + hints->domain_attr->av_type = FI_AV_MAP; /** * FI_VERSION provides binary backward and forward compatibility support From 0cb81f67292a8651588da80be19ce3716ee547c5 Mon Sep 17 00:00:00 2001 From: yohann Date: Tue, 16 Feb 2016 09:56:09 -0800 Subject: [PATCH 3/8] mtl/ofi: update copyright dates. (cherry-picked from open-mpi/ompi@22eddfee1041d497f9a19268bd4e7e71ea48e254) --- ompi/mca/mtl/ofi/mtl_ofi.h | 2 +- ompi/mca/mtl/ofi/mtl_ofi_component.c | 2 +- ompi/mca/mtl/ofi/mtl_ofi_endpoint.h | 2 +- ompi/mca/mtl/ofi/mtl_ofi_request.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 3d058e74f9..02e1c49c61 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved * * $COPYRIGHT$ * diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index f8be2ff2e7..1469fe767b 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved * * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights diff --git a/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h b/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h index e886dcea23..6e02a84b4f 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved * * $COPYRIGHT$ * diff --git a/ompi/mca/mtl/ofi/mtl_ofi_request.h b/ompi/mca/mtl/ofi/mtl_ofi_request.h index dc35a31f07..5e2faad645 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_request.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_request.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved * * $COPYRIGHT$ * From fb2d59eb40718b34b25d26d1cba0b8d4f37dad7f Mon Sep 17 00:00:00 2001 From: yohann Date: Fri, 12 Feb 2016 16:48:03 -0800 Subject: [PATCH 4/8] mtl/ofi: cleanup (cherry-picked from open-mpi/ompi@7fe395c82a0900d180a4a7e3b35eeb237ab99de6) --- ompi/mca/mtl/ofi/mtl_ofi_endpoint.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h b/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h index 6e02a84b4f..788d091916 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h @@ -37,7 +37,6 @@ struct mca_mtl_ofi_endpoint_t { }; typedef struct mca_mtl_ofi_endpoint_t mca_mtl_ofi_endpoint_t; -OBJ_CLASS_DECLARATION(mca_mtl_ofi_endpoint); static inline mca_mtl_ofi_endpoint_t *ompi_mtl_ofi_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc) { From 40f00141105ca08f2eb1ea1e486e50093c3065d6 Mon Sep 17 00:00:00 2001 From: yohann Date: Fri, 19 Feb 2016 16:21:22 -0800 Subject: [PATCH 5/8] mtl/ofi: Prevent possible memory leak. (cherry-picked from open-mpi/ompi@3ad59435ced4965952a18c01567abe4fa013a6f7) --- ompi/mca/mtl/ofi/mtl_ofi.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 02e1c49c61..f856bfebf9 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -861,11 +861,13 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl, * The search request completed but no matching message was found. */ *matched = 0; + free(ofi_req); return OMPI_SUCCESS; } else if (OPAL_UNLIKELY(0 > ret)) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_trecvmsg failed: %s(%zd)", __FILE__, __LINE__, fi_strerror(-ret), ret); + free(ofi_req); return ompi_mtl_ofi_get_error(ret); } @@ -891,6 +893,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl, } else { (*message) = MPI_MESSAGE_NULL; + free(ofi_req); } return OMPI_SUCCESS; From 4292cac2bfae3a161f6104db5762080a6b4e54ed Mon Sep 17 00:00:00 2001 From: yohann Date: Fri, 19 Feb 2016 16:54:34 -0800 Subject: [PATCH 6/8] mtl/ofi: Fix mismatching types. (cherry-picked from open-mpi/ompi@404987e9b3739bc4c290a88f153becb67bd93197) --- ompi/mca/mtl/ofi/mtl_ofi_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_types.h b/ompi/mca/mtl/ofi/mtl_ofi_types.h index e56a439896..1b1bdb1e1c 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_types.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_types.h @@ -84,7 +84,7 @@ typedef struct mca_mtl_ofi_component_t { { \ match_bits = contextid; \ match_bits = (match_bits << 16); \ - match_bits |= source; \ + match_bits |= (uint64_t)source; \ match_bits = (match_bits << 32); \ match_bits |= (MTL_OFI_TAG_MASK & tag) | type; \ } @@ -106,7 +106,7 @@ typedef struct mca_mtl_ofi_component_t { match_bits = (match_bits << 32); \ mask_bits |= MTL_OFI_SOURCE_MASK; \ } else { \ - match_bits |= source; \ + match_bits |= (uint64_t)source; \ match_bits = (match_bits << 32); \ } \ \ From dd6f4790d869520d803ab3f193ee04cdf5fec26f Mon Sep 17 00:00:00 2001 From: yohann Date: Fri, 19 Feb 2016 16:58:41 -0800 Subject: [PATCH 7/8] mtl/ofi: Fix error handling. (cherry-picked from open-mpi/ompi@bd47062764766bb7da2ce0f7abe451286cce084e) --- ompi/mca/mtl/ofi/mtl_ofi.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index f856bfebf9..d5cd1f8ace 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -263,6 +263,7 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_trecv failed: %s(%zd)", __FILE__, __LINE__, fi_strerror(-ret), ret); + free(ack_req); return ompi_mtl_ofi_get_error(ret); } } else { @@ -281,6 +282,10 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_tinject failed: %s(%zd)", __FILE__, __LINE__, fi_strerror(-ret), ret); + if (ack_req) { + fi_cancel((fid_t)ompi_mtl_ofi.ep, &ack_req->ctx); + free(ack_req); + } return ompi_mtl_ofi_get_error(ret); } From d2016f9badf33a303de09b22fd2ebb63d45c7012 Mon Sep 17 00:00:00 2001 From: yohann Date: Fri, 19 Feb 2016 16:59:47 -0800 Subject: [PATCH 8/8] mtl/ofi: Check allocated pointer. (cherry-picked from open-mpi/ompi@59b6d041f8c99858e6246ac460b956fc2996e904) --- ompi/mca/mtl/ofi/mtl_ofi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.c b/ompi/mca/mtl/ofi/mtl_ofi.c index 1f4abb72ba..ed6aae6bc4 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.c +++ b/ompi/mca/mtl/ofi/mtl_ofi.c @@ -110,6 +110,15 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl, */ for (i = 0; i < nprocs; ++i) { endpoint = OBJ_NEW(mca_mtl_ofi_endpoint_t); + if (NULL == endpoint) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: mtl/ofi: could not allocate endpoint" + " structure\n", + __FILE__, __LINE__); + ret = OMPI_ERROR; + goto bail; + } + endpoint->mtl_ofi_module = &ompi_mtl_ofi; endpoint->peer_fiaddr = fi_addrs[i];