From 16bf139f85be6cfda12a4e0f7702d4d41942cd98 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 14 Oct 2025 15:33:21 -0700 Subject: [PATCH 1/3] btl/ofi: Set domain threading model based on MPI thread support Signed-off-by: Jessie Yang (cherry picked from commit f65f900bbbd044fa17e9153adb3bdf2906d2b28d) --- opal/mca/btl/ofi/btl_ofi_component.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 110004f8094..5a0baee44fd 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -339,6 +339,12 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, domain_attr.control_progress = progress_mode; domain_attr.data_progress = progress_mode; + if (enable_mpi_threads) { + domain_attr.threading = FI_THREAD_SAFE; + } else { + domain_attr.threading = FI_THREAD_DOMAIN; + } + /* select endpoint type */ ep_attr.type = FI_EP_RDM; From bed44d186f959b0f6b4e56cd2051a946ca8d316d Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 14 Oct 2025 15:35:18 -0700 Subject: [PATCH 2/3] btl/ofi: Add FI_COMPLETION flag to tx and rx attributes Add FI_COMPLETION flag to ensure completion entries are generated for all data transfer operations. 
Signed-off-by: Jessie Yang (cherry picked from commit 15fe24645cd21c1c99f5ce7796e86344f442971d) --- opal/mca/btl/ofi/btl_ofi_component.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 5a0baee44fd..6c0af9be3b0 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -365,7 +365,8 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, tx_attr.iov_limit = 1; rx_attr.iov_limit = 1; - tx_attr.op_flags = FI_DELIVERY_COMPLETE; + tx_attr.op_flags = FI_DELIVERY_COMPLETE | FI_COMPLETION; + rx_attr.op_flags = FI_COMPLETION; mca_btl_ofi_component.module_count = 0; From 4f118d6b3e2c28a56ab077b62ced1717536ad1f5 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Tue, 7 Oct 2025 23:52:01 +0000 Subject: [PATCH 3/3] ofi: Share domain between MTL and BTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Share the domain between the MTL and BTL layers to reduce the total number of domains created. This helps avoid hitting system resource limits on platforms with high core counts. Instead of having the common code allocate a single domain with the superset of all required capabilities, we attempt to reuse an existing fabric and domain if the providers can support MTL’s and BTL’s different capability sets. This approach allows providers that support domain sharing to reuse resources efficiently while still preserving flexibility. If the providers cannot reuse the fabric and domain due to incompatible requirements, separate domains will be created as before. 
Signed-off-by: Jessie Yang (cherry picked from commit 69d273793dfb5e26fe93e2e3de58d511cb35b3f1) --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 35 ++++-- opal/mca/btl/ofi/btl_ofi_component.c | 17 ++- opal/mca/btl/ofi/btl_ofi_module.c | 4 +- opal/mca/common/ofi/common_ofi.c | 161 ++++++++++++++++++++++++++- opal/mca/common/ofi/common_ofi.h | 57 +++++++++- 5 files changed, 253 insertions(+), 21 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 0ca9b31aad7..c26b6118195 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -694,6 +694,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, } hints->domain_attr->resource_mgmt = FI_RM_ENABLED; + hints->domain_attr->domain = opal_common_ofi.domain; + hints->fabric_attr->fabric = opal_common_ofi.fabric; /** * The EFA provider in Libfabric versions prior to 1.10 contains a bug @@ -715,10 +717,16 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, hints_dup->fabric_attr->prov_name = strdup("efa"); ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers); + if (FI_ENODATA == -ret && (hints_dup->fabric_attr->fabric || hints_dup->domain_attr->domain)) { + /* Retry without fabric and domain */ + hints_dup->fabric_attr->fabric = NULL; + hints_dup->domain_attr->domain = NULL; + ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers); + } if (FI_ENOSYS == -ret) { /* libfabric is not new enough, fallback to use older version of API */ ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints_dup, &providers); - } + } opal_output_verbose(1, opal_common_ofi.output, "%s:%d: EFA specific fi_getinfo(): %s\n", @@ -756,6 +764,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, 0ULL, /* Optional flag */ hints, /* In: Hints to filter providers */ &providers); /* Out: List of matching providers */ + if (FI_ENODATA == -ret && (hints->fabric_attr->fabric || 
hints->domain_attr->domain)) { + hints->fabric_attr->fabric = NULL; + hints->domain_attr->domain = NULL; + ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints, &providers); + } if (FI_ENOSYS == -ret) { ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints, &providers); } @@ -972,9 +985,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, * instantiate the virtual or physical network. This opens a "fabric * provider". See man fi_fabric for details. */ - ret = fi_fabric(prov->fabric_attr, /* In: Fabric attributes */ - &ompi_mtl_ofi.fabric, /* Out: Fabric handle */ - NULL); /* Optional context for fabric events */ + ret = opal_common_ofi_fi_fabric(prov->fabric_attr, /* In: Fabric attributes */ + &ompi_mtl_ofi.fabric); /* Out: Fabric handle */ if (0 != ret) { opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_fabric", @@ -988,10 +1000,9 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, * hardware port/collection of ports. Returns a domain object that can be * used to create endpoints. See man fi_domain for details. 
*/ - ret = fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */ - prov, /* In: Provider */ - &ompi_mtl_ofi.domain, /* Out: Domain object */ - NULL); /* Optional context for domain events */ + ret = opal_common_ofi_fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */ + prov, /* In: Provider */ + &ompi_mtl_ofi.domain); /* Out: Domain object */ if (0 != ret) { opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_domain", @@ -1158,10 +1169,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, (void) fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq); } if (ompi_mtl_ofi.domain) { - (void) fi_close((fid_t)ompi_mtl_ofi.domain); + (void) opal_common_ofi_domain_release(ompi_mtl_ofi.domain); } if (ompi_mtl_ofi.fabric) { - (void) fi_close((fid_t)ompi_mtl_ofi.fabric); + (void) opal_common_ofi_fabric_release(ompi_mtl_ofi.fabric); } if (ompi_mtl_ofi.comm_to_context) { free(ompi_mtl_ofi.comm_to_context); @@ -1209,11 +1220,11 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) } } - if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) { + if ((ret = opal_common_ofi_domain_release(ompi_mtl_ofi.domain))) { goto finalize_err; } - if ((ret = fi_close((fid_t)ompi_mtl_ofi.fabric))) { + if ((ret = opal_common_ofi_fabric_release(ompi_mtl_ofi.fabric))) { goto finalize_err; } diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 6c0af9be3b0..e0a16848da1 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -379,9 +379,18 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, no_hmem: #endif + hints.fabric_attr->fabric = opal_common_ofi.fabric; + hints.domain_attr->domain = opal_common_ofi.domain; + /* Do the query. The earliest version that supports FI_HMEM hints is 1.9. 
* The earliest version the explictly allow provider to call CUDA API is 1.18 */ rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list); + if (FI_ENODATA == -rc && (hints.fabric_attr->fabric || hints.domain_attr->domain)) { + /* Retry without fabric and domain */ + hints.fabric_attr->fabric = NULL; + hints.domain_attr->domain = NULL; + rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list); + } if (FI_ENOSYS == -rc) { rc = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, &hints, &info_list); } @@ -560,14 +569,14 @@ static int mca_btl_ofi_init_device(struct fi_info *info) ("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name)); /* fabric */ - rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL); + rc = opal_common_ofi_fi_fabric(ofi_info->fabric_attr, &fabric); if (0 != rc) { BTL_VERBOSE(("%s failed fi_fabric with err=%s", linux_device_name, fi_strerror(-rc))); goto fail; } /* domain */ - rc = fi_domain(fabric, ofi_info, &domain, NULL); + rc = opal_common_ofi_fi_domain(fabric, ofi_info, &domain); if (0 != rc) { BTL_VERBOSE(("%s failed fi_domain with err=%s", linux_device_name, fi_strerror(-rc))); goto fail; @@ -750,11 +759,11 @@ static int mca_btl_ofi_init_device(struct fi_info *info) } if (NULL != domain) { - fi_close(&domain->fid); + opal_common_ofi_domain_release(domain); } if (NULL != fabric) { - fi_close(&fabric->fid); + opal_common_ofi_fabric_release(fabric); } free(module); diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index 330ebbae66c..17c4d281a3d 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -380,11 +380,11 @@ int mca_btl_ofi_finalize(mca_btl_base_module_t *btl) } if (NULL != ofi_btl->domain) { - fi_close(&ofi_btl->domain->fid); + opal_common_ofi_domain_release(ofi_btl->domain); } if (NULL != ofi_btl->fabric) { - fi_close(&ofi_btl->fabric->fid); + opal_common_ofi_fabric_release(ofi_btl->fabric); } if (NULL != 
ofi_btl->fabric_info) { diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 82a331e2527..7624727eb98 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -6,7 +6,7 @@ * reserved. * Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. - * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights + * Copyright (c) 2021-2025 Amazon.com, Inc. or its affiliates. All rights * reserved. * Copyright (c) 2023 UT-Battelle, LLC. All rights reserved. * $COPYRIGHT$ @@ -40,7 +40,11 @@ opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL, .prov_exclude = NULL, - .output = -1}; + .output = -1, + .fabric = NULL, + .domain = NULL, + .fabric_ref_count = 0, + .domain_ref_count = 0}; static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic,net"; static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT; static int opal_common_ofi_verbose_level = 0; @@ -1037,3 +1041,156 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add } return ret; } + +/** + * Get or create fabric object + * + * Reuses existing fabric from fabric_attr->fabric if available, + * otherwise creates new fabric using fi_fabric(). 
+ *
+ * Reference counting only covers the fabric recorded in
+ * opal_common_ofi.fabric. A newly created fabric is recorded there only when
+ * no fabric is currently recorded, so creating an additional fabric never
+ * overwrites a fabric that may still be shared, nor resets its count.
+ * Handles that are not recorded are returned without being counted.
+ *
+ * @param fabric_attr (IN) Fabric attributes
+ * @param fabric (OUT) Fabric object (new or existing)
+ *
+ * @return OPAL_SUCCESS, or the error code returned by fi_fabric()
+ */
+int opal_common_ofi_fi_fabric(struct fi_fabric_attr *fabric_attr,
+                              struct fid_fabric **fabric)
+{
+    int ret;
+
+    OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
+
+    if (NULL != fabric_attr->fabric) {
+        *fabric = fabric_attr->fabric;
+        /* Count the reference only against the fabric that is actually
+         * tracked. Bumping the count for any other handle would leave the
+         * tracked fabric with a reference nobody drops, so it would never
+         * be closed. */
+        if (*fabric == opal_common_ofi.fabric) {
+            opal_common_ofi.fabric_ref_count++;
+        }
+        opal_output_verbose(1, opal_common_ofi.output, "Reusing existing fabric: %s",
+                            fabric_attr->name);
+    } else {
+        ret = fi_fabric(fabric_attr, fabric, NULL);
+        if (0 != ret) {
+            OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
+            return ret;
+        }
+        /* Keep the bookkeeping of an already tracked (possibly shared)
+         * fabric intact; only start tracking when nothing is tracked. */
+        if (NULL == opal_common_ofi.fabric) {
+            opal_common_ofi.fabric = *fabric;
+            opal_common_ofi.fabric_ref_count = 1;
+        }
+    }
+
+    OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
+    return OPAL_SUCCESS;
+}
+
+/**
+ * Get or create domain object
+ *
+ * Reuses existing domain from info->domain_attr->domain if available,
+ * otherwise creates new domain using fi_domain().
+ *
+ * Reference counting only covers the domain recorded in
+ * opal_common_ofi.domain. A newly created domain is recorded there only when
+ * no domain is currently recorded, so creating an additional domain never
+ * overwrites a domain that may still be shared, nor resets its count.
+ * Handles that are not recorded are returned without being counted.
+ *
+ * @param fabric (IN) Fabric object (only used when a new domain is created)
+ * @param info (IN) Provider info
+ * @param domain (OUT) Domain object (new or existing)
+ *
+ * @return OPAL_SUCCESS, or the error code returned by fi_domain()
+ */
+int opal_common_ofi_fi_domain(struct fid_fabric *fabric, struct fi_info *info,
+                              struct fid_domain **domain)
+{
+    int ret;
+
+    OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
+
+    if (NULL != info->domain_attr->domain) {
+        *domain = info->domain_attr->domain;
+        /* Count the reference only against the domain that is actually
+         * tracked; see opal_common_ofi_fi_fabric() for the rationale. */
+        if (*domain == opal_common_ofi.domain) {
+            opal_common_ofi.domain_ref_count++;
+        }
+        opal_output_verbose(1, opal_common_ofi.output, "Reusing existing domain: %s",
+                            info->domain_attr->name);
+    } else {
+        ret = fi_domain(fabric, info, domain, NULL);
+        if (0 != ret) {
+            OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
+            return ret;
+        }
+        /* Keep the bookkeeping of an already tracked (possibly shared)
+         * domain intact; only start tracking when nothing is tracked. */
+        if (NULL == opal_common_ofi.domain) {
+            opal_common_ofi.domain = *domain;
+            opal_common_ofi.domain_ref_count = 1;
+        }
+    }
+
+    OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
+    return OPAL_SUCCESS;
+}
+
+/**
+ * Release fabric reference
+ *
+ * Decrements fabric reference count and closes fabric if count reaches zero.
+ *
+ * A fabric that is not the one recorded in opal_common_ofi.fabric (or whose
+ * recorded count is not positive) is closed immediately.
+ *
+ * @param fabric (IN) Fabric object to release
+ *
+ * @return OPAL_SUCCESS, or the value returned by fi_close() on failure
+ */
+int opal_common_ofi_fabric_release(struct fid_fabric *fabric)
+{
+    int rc = OPAL_SUCCESS;
+    int counted;
+
+    OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
+
+    /* Is this the tracked fabric with references still outstanding? */
+    counted = (opal_common_ofi.fabric == fabric) && (0 < opal_common_ofi.fabric_ref_count);
+    if (counted) {
+        opal_common_ofi.fabric_ref_count -= 1;
+    }
+
+    /* Close untracked handles right away; close the tracked one only once
+     * its last reference has been dropped. */
+    if (!counted || 0 == opal_common_ofi.fabric_ref_count) {
+        rc = fi_close(&fabric->fid);
+        if (0 != rc) {
+            opal_output_verbose(1, opal_common_ofi.output,
+                                "%s:%d: fi_close failed for fabric: %s (%d)",
+                                __FILE__, __LINE__, fi_strerror(-rc), rc);
+        }
+        if (counted) {
+            /* Forget the tracked handle even if fi_close() reported an error. */
+            opal_common_ofi.fabric = NULL;
+        }
+    }
+
+    OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
+    return rc;
+}
+
+/**
+ * Release domain reference
+ *
+ * Decrements domain reference count and closes domain if count reaches zero.
+ *
+ * A domain that is not the one recorded in opal_common_ofi.domain (or whose
+ * recorded count is not positive) is closed immediately.
+ *
+ * @param domain (IN) Domain object to release
+ *
+ * @return OPAL_SUCCESS, or the value returned by fi_close() on failure
+ */
+int opal_common_ofi_domain_release(struct fid_domain *domain)
+{
+    int rc = OPAL_SUCCESS;
+    int counted;
+
+    OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
+
+    /* Is this the tracked domain with references still outstanding? */
+    counted = (opal_common_ofi.domain == domain) && (0 < opal_common_ofi.domain_ref_count);
+    if (counted) {
+        opal_common_ofi.domain_ref_count -= 1;
+    }
+
+    /* Close untracked handles right away; close the tracked one only once
+     * its last reference has been dropped. */
+    if (!counted || 0 == opal_common_ofi.domain_ref_count) {
+        rc = fi_close(&domain->fid);
+        if (0 != rc) {
+            opal_output_verbose(1, opal_common_ofi.output,
+                                "%s:%d: fi_close failed for domain: %s (%d)",
+                                __FILE__, __LINE__, fi_strerror(-rc), rc);
+        }
+        if (counted) {
+            /* Forget the tracked handle even if fi_close() reported an error. */
+            opal_common_ofi.domain = NULL;
+        }
+    }
+
+    OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
+    return rc;
+}
diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 3deeb0c63ec..4357840604f 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -5,7 +5,7 @@ * reserved. * Copyright (c) 2020-2024 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights + * Copyright (c) 2021-2025 Amazon.com, Inc. or its affiliates. All rights * reserved. * * $COPYRIGHT$ @@ -30,6 +30,10 @@ typedef struct opal_common_ofi_module { char **prov_include; char **prov_exclude; int output; + struct fid_fabric *fabric; + struct fid_domain *domain; + int fabric_ref_count; + int domain_ref_count; } opal_common_ofi_module_t; /** @@ -223,6 +227,57 @@ OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *pr */ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *addrlen); +/** + * Get or create fabric object + * + * Reuses existing fabric from fabric_attr->fabric if available, + * otherwise creates new fabric using fi_fabric().
+ * + * @param fabric_attr (IN) Fabric attributes + * @param fabric (OUT) Fabric object (new or existing) + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_fi_fabric(struct fi_fabric_attr *fabric_attr, + struct fid_fabric **fabric); + +/** + * Get or create domain object + * + * Reuses existing domain from info->domain_attr->domain if available, + * otherwise creates new domain using fi_domain(). + * + * @param fabric (IN) Fabric object + * @param info (IN) Provider info + * @param domain (OUT) Domain object (new or existing) + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_fi_domain(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain); + +/** + * Release fabric reference + * + * Decrements fabric reference count and closes fabric if count reaches zero. + * + * @param fabric (IN) Fabric object to release + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_fabric_release(struct fid_fabric *fabric); + +/** + * Release domain reference + * + * Decrements domain reference count and closes domain if count reaches zero. + * + * @param domain (IN) Domain object to release + * + * @return OPAL_SUCCESS or error code + */ +OPAL_DECLSPEC int opal_common_ofi_domain_release(struct fid_domain *domain); + END_C_DECLS #endif /* OPAL_MCA_COMMON_OFI_H */