diff --git a/ompi/mca/mtl/ofi/mtl_ofi.c b/ompi/mca/mtl/ofi/mtl_ofi.c index 1f4abb72ba..ed6aae6bc4 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.c +++ b/ompi/mca/mtl/ofi/mtl_ofi.c @@ -110,6 +110,15 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl, */ for (i = 0; i < nprocs; ++i) { endpoint = OBJ_NEW(mca_mtl_ofi_endpoint_t); + if (NULL == endpoint) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: mtl/ofi: could not allocate endpoint" + " structure\n", + __FILE__, __LINE__); + ret = OMPI_ERROR; + goto bail; + } + endpoint->mtl_ofi_module = &ompi_mtl_ofi; endpoint->peer_fiaddr = fi_addrs[i]; diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 2bb95ab4a2..d5cd1f8ace 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved * * $COPYRIGHT$ * @@ -51,10 +51,6 @@ BEGIN_C_DECLS extern mca_mtl_ofi_module_t ompi_mtl_ofi; extern mca_base_framework_t ompi_mtl_base_framework; -extern int ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl, - size_t nprocs, - struct ompi_proc_t **procs); - extern int ompi_mtl_ofi_del_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs, struct ompi_proc_t **procs); @@ -236,7 +232,7 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, ompi_mtl_ofi_request_t *ack_req = NULL; /* For synchronous send */ ompi_proc = ompi_comm_peer_lookup(comm, dest); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after); if (OMPI_SUCCESS != ompi_ret) return ompi_ret; @@ -267,6 +263,7 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_trecv failed: %s(%zd)", __FILE__, __LINE__, fi_strerror(-ret), ret); + free(ack_req); return ompi_mtl_ofi_get_error(ret); } } else { @@ -285,6 +282,10 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_tinject failed: %s(%zd)", __FILE__, __LINE__, fi_strerror(-ret), ret); + if (ack_req) { + fi_cancel((fid_t)ompi_mtl_ofi.ep, &ack_req->ctx); + free(ack_req); + } return ompi_mtl_ofi_get_error(ret); } @@ -461,7 +462,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc, if (ompi_mtl_ofi.any_addr == ofi_req->remote_addr) { src = MTL_OFI_GET_SOURCE(wc->tag); ompi_proc = ompi_comm_peer_lookup(ofi_req->comm, src); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc); ofi_req->remote_addr = endpoint->peer_fiaddr; } MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep, @@ -533,7 +534,7 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl, if (MPI_ANY_SOURCE != src) { ompi_proc = ompi_comm_peer_lookup(comm, src); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); remote_addr = endpoint->peer_fiaddr; } else { remote_addr = ompi_mtl_ofi.any_addr; @@ -745,7 +746,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl, */ if (MPI_ANY_SOURCE != src) { ompi_proc = ompi_comm_peer_lookup( comm, src ); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); remote_proc = endpoint->peer_fiaddr; } @@ -830,7 +831,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl, */ if (MPI_ANY_SOURCE != src) { ompi_proc = ompi_comm_peer_lookup( comm, src ); - endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); remote_proc = endpoint->peer_fiaddr; } @@ -865,11 +866,13 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl, * The search request completed but no matching message was found. */ *matched = 0; + free(ofi_req); return OMPI_SUCCESS; } else if (OPAL_UNLIKELY(0 > ret)) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: fi_trecvmsg failed: %s(%zd)", __FILE__, __LINE__, fi_strerror(-ret), ret); + free(ofi_req); return ompi_mtl_ofi_get_error(ret); } @@ -895,6 +898,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl, } else { (*message) = MPI_MESSAGE_NULL; + free(ofi_req); } return OMPI_SUCCESS; @@ -962,7 +966,6 @@ ompi_mtl_ofi_del_comm(struct mca_mtl_base_module_t *mtl, return OMPI_SUCCESS; } - END_C_DECLS #endif /* MTL_OFI_H_HAS_BEEN_INCLUDED */ diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 6b4fa3f4b2..1469fe767b 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved * * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights @@ -241,6 +241,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, hints->domain_attr->threading = FI_THREAD_UNSPEC; hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; hints->domain_attr->resource_mgmt = FI_RM_ENABLED; + hints->domain_attr->av_type = FI_AV_MAP; /** * FI_VERSION provides binary backward and forward compatibility support diff --git a/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h b/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h index 2799d495b5..788d091916 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_endpoint.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved * * $COPYRIGHT$ * @@ -11,10 +11,12 @@ #ifndef OMPI_MTL_OFI_ENDPOINT_H #define OMPI_MTL_OFI_ENDPOINT_H -#include "mtl_ofi.h" - BEGIN_C_DECLS +extern int ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl, + size_t nprocs, + struct ompi_proc_t **procs); + OBJ_CLASS_DECLARATION(mca_mtl_ofi_endpoint_t); /** @@ -35,7 +37,15 @@ struct mca_mtl_ofi_endpoint_t { }; typedef struct mca_mtl_ofi_endpoint_t mca_mtl_ofi_endpoint_t; -OBJ_CLASS_DECLARATION(mca_mtl_ofi_endpoint); + +static inline mca_mtl_ofi_endpoint_t *ompi_mtl_ofi_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc) +{ + if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) { + ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc); + } + + return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]; +} END_C_DECLS #endif diff --git a/ompi/mca/mtl/ofi/mtl_ofi_request.h b/ompi/mca/mtl/ofi/mtl_ofi_request.h index ee544073cc..5e2faad645 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_request.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_request.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved * * $COPYRIGHT$ * @@ -55,6 +55,10 @@ struct ompi_mtl_ofi_request_t { /* lookup source of an ANY_SOURCE Recv */ struct ompi_communicator_t *comm; + /** Reference to the MTL used to lookup */ + /* source of an ANY_SOURCE Recv */ + struct mca_mtl_base_module_t* mtl; + /** Pack buffer */ void *buffer; diff --git a/ompi/mca/mtl/ofi/mtl_ofi_types.h b/ompi/mca/mtl/ofi/mtl_ofi_types.h index e56a439896..1b1bdb1e1c 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_types.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_types.h @@ -84,7 +84,7 @@ typedef struct mca_mtl_ofi_component_t { { \ match_bits = contextid; \ match_bits = (match_bits << 16); \ - match_bits |= source; \ + match_bits |= (uint64_t)source; \ match_bits = (match_bits << 32); \ match_bits |= (MTL_OFI_TAG_MASK & tag) | type; \ } @@ -106,7 +106,7 @@ typedef struct mca_mtl_ofi_component_t { match_bits = (match_bits << 32); \ mask_bits |= MTL_OFI_SOURCE_MASK; \ } else { \ - match_bits |= source; \ + match_bits |= (uint64_t)source; \ match_bits = (match_bits << 32); \ } \ \