UCT/GDA/MLX5: Check UAR is supported when querying resources

yosefe · yosefe · commit f60a22c2ecc0 · 2025-09-14T09:45:59.000+03:00
diff --git a/src/ucp/wireup/select.c b/src/ucp/wireup/select.c
@@ -2446,7 +2446,7 @@ ucp_wireup_add_device_lanes(const ucp_wireup_select_params_t *select_params,
     ucp_wireup_select_flags_t iface_rma_flags, peer_rma_flags;
     ucp_wireup_select_bw_info_t bw_info = {};
     ucp_tl_bitmap_t mem_type_tl_bitmap;
-    ucp_tl_bitmap_t tl_bitmap;
+    int found_lane;
 
     if (!context->config.ext.proto_enable ||
         (ep_init_flags &
@@ -2478,15 +2478,15 @@ ucp_wireup_add_device_lanes(const ucp_wireup_select_params_t *select_params,
      */
     bw_info.max_lanes = ucp_wireup_bw_max_lanes(select_params);
 
-    UCS_STATIC_BITMAP_RESET_ALL(&tl_bitmap);
     ucp_wireup_memaccess_bitmap(context, UCS_MEMORY_TYPE_CUDA,
                                 &mem_type_tl_bitmap);
-    (void)ucp_wireup_add_bw_lanes(select_params, &bw_info,
-                                  UCP_TL_BITMAP_AND_NOT(mem_type_tl_bitmap,
-                                                        tl_bitmap),
-                                  UCP_NULL_LANE, select_ctx, 0);
-
-    UCS_STATIC_BITMAP_OR_INPLACE(&tl_bitmap, mem_type_tl_bitmap);
+    found_lane = ucp_wireup_add_bw_lanes(select_params, &bw_info,
+                                         mem_type_tl_bitmap, UCP_NULL_LANE,
+                                         select_ctx, 0);
+    if (!found_lane) {
+        ucs_error("could not find device lanes");
+        return UCS_ERR_UNREACHABLE;
+    }
 
     return UCS_OK;
 }
diff --git a/src/uct/ib/mlx5/gdaki/gdaki.c b/src/uct/ib/mlx5/gdaki/gdaki.c
@@ -104,8 +104,8 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
     if (self->umem == NULL) {
         uct_ib_check_memlock_limit_msg(md->super.dev.ibv_context,
                                        UCS_LOG_LEVEL_ERROR,
-                                       "mlx5dv_devx_umem_reg(size=%zu)",
-                                       dev_ep_size);
+                                       "mlx5dv_devx_umem_reg(ptr=%p size=%zu)",
+                                       self->ep_gpu, dev_ep_size);
         status = UCS_ERR_NO_MEMORY;
         goto err_mem;
     }
@@ -567,17 +567,60 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_gdaki_iface_t, uct_iface_t, uct_md_h,
 static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_gdaki_iface_t, uct_iface_t);
 
 static ucs_status_t
-uct_gdaki_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
+uct_gdaki_md_check_uar(uct_ib_mlx5_md_t *md, CUdevice cuda_dev)
+{
+    struct mlx5dv_devx_uar *uar;
+    ucs_status_t status;
+    CUcontext cuda_ctx;
+    unsigned flags;
+
+    status = uct_ib_mlx5_devx_alloc_uar(md, 0, &uar);
+    if (status != UCS_OK) {
+        goto out;
+    }
+
+    status = UCT_CUDADRV_FUNC_LOG_ERR(
+            cuDevicePrimaryCtxRetain(&cuda_ctx, cuda_dev));
+    if (status != UCS_OK) {
+        goto out_free_uar;
+    }
+
+    status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(cuda_ctx));
+    if (status != UCS_OK) {
+        goto out_ctx_release;
+    }
+
+    flags  = CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP |
+             CU_MEMHOSTREGISTER_IOMEMORY;
+    status = UCT_CUDADRV_FUNC_LOG_DEBUG(
+            cuMemHostRegister(uar->reg_addr, UCT_IB_MLX5_BF_REG_SIZE, flags));
+    if (status == UCS_OK) {
+        UCT_CUDADRV_FUNC_LOG_DEBUG(cuMemHostUnregister(uar->reg_addr));
+    }
+
+    UCT_CUDADRV_FUNC_LOG_WARN(cuCtxPopCurrent(NULL));
+out_ctx_release:
+    UCT_CUDADRV_FUNC_LOG_WARN(cuDevicePrimaryCtxRelease(cuda_dev));
+out_free_uar:
+    mlx5dv_devx_free_uar(uar);
+out:
+    return status;
+}
+
+static ucs_status_t
+uct_gdaki_query_tl_devices(uct_md_h tl_md,
+                           uct_tl_device_resource_t **tl_devices_p,
                            unsigned *num_tl_devices_p)
 {
-    uct_ib_md_t *ib_md      = ucs_derived_of(md, uct_ib_md_t);
-    unsigned num_tl_devices = 0;
+    static int uar_supported = -1;
+    uct_ib_mlx5_md_t *md     = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
+    unsigned num_tl_devices  = 0;
     uct_tl_device_resource_t *tl_devices;
     ucs_status_t status;
     CUdevice device;
     ucs_sys_device_t dev;
     ucs_sys_dev_distance_t dist;
-    int num_gpus;
+    int i, num_gpus;
 
     status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGetCount(&num_gpus));
     if (status != UCS_OK) {
@@ -589,14 +632,35 @@ uct_gdaki_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
         return UCS_ERR_NO_MEMORY;
     }
 
-    for (int i = 0; i < num_gpus; i++) {
+    for (i = 0; i < num_gpus; i++) {
         status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&device, i));
         if (status != UCS_OK) {
             goto err;
         }
 
+        /*
+         * Cache the result of the UAR support check in a static flag to avoid
+         * the overhead of checking UAR support for each GPU and MD. Assume the
+         * support is the same for all GPUs and MDs in the system.
+         */
+        if (uar_supported == -1) {
+            status = uct_gdaki_md_check_uar(md, device);
+            if (status == UCS_OK) {
+                uar_supported = 1;
+            } else {
+                ucs_diag("GDAKI not supported, please add "
+                         "NVreg_RegistryDwords=\"PeerMappingOverride=1;\" "
+                         "option for nvidia kernel driver");
+                uar_supported = 0;
+            }
+        }
+        if (uar_supported == 0) {
+            status = UCS_ERR_NO_DEVICE;
+            goto err;
+        }
+
         uct_cuda_base_get_sys_dev(device, &dev);
-        status = ucs_topo_get_distance(dev, ib_md->dev.sys_dev, &dist);
+        status = ucs_topo_get_distance(dev, md->super.dev.sys_dev, &dist);
         if (status != UCS_OK) {
             goto err;
         }
@@ -608,8 +672,8 @@ uct_gdaki_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
 
         snprintf(tl_devices[num_tl_devices].name,
                  sizeof(tl_devices[num_tl_devices].name), "%s%d-%s:%d",
-                 UCT_DEVICE_CUDA_NAME, device, uct_ib_device_name(&ib_md->dev),
-                 ib_md->dev.first_port);
+                 UCT_DEVICE_CUDA_NAME, device,
+                 uct_ib_device_name(&md->super.dev), md->super.dev.first_port);
         tl_devices[num_tl_devices].type       = UCT_DEVICE_TYPE_NET;
         tl_devices[num_tl_devices].sys_device = dev;
         num_tl_devices++;
diff --git a/src/uct/ib/mlx5/ib_mlx5.c b/src/uct/ib/mlx5/ib_mlx5.c
@@ -569,9 +569,8 @@ int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar,
 }
 
 #if HAVE_DEVX
-static ucs_status_t uct_ib_mlx5_devx_alloc_uar(uct_ib_mlx5_md_t *md,
-                                               uint32_t flags,
-                                               struct mlx5dv_devx_uar **uar_p)
+ucs_status_t uct_ib_mlx5_devx_alloc_uar(uct_ib_mlx5_md_t *md, uint32_t flags,
+                                        struct mlx5dv_devx_uar **uar_p)
 {
     const char *uar_type_str      = (flags == UCT_IB_MLX5_UAR_ALLOC_TYPE_WC) ?
                                     "WC" : "NC_DEDICATED";
diff --git a/src/uct/ib/mlx5/ib_mlx5.h b/src/uct/ib/mlx5/ib_mlx5.h
@@ -441,7 +441,7 @@ typedef struct uct_ib_mlx5_md {
     uint8_t                  log_max_dci_stream_channels;
     uint32_t                 smkey_index;
     struct {
-        /* Max dp ordering level per transport, 
+        /* Max dp ordering level per transport,
            as listed in uct_ib_mlx5_dp_ordering_t */
         uint8_t              rc;
         uint8_t              dc;
@@ -931,10 +931,12 @@ void uct_ib_mlx5_verbs_srq_cleanup(uct_ib_mlx5_srq_t *srq, struct ibv_srq *verbs
 /**
  * DEVX UAR API
  */
-int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar,
-                             uct_ib_mlx5_md_t *md,
+int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar, uct_ib_mlx5_md_t *md,
                              uct_ib_mlx5_mmio_mode_t mmio_mode);
 
+ucs_status_t uct_ib_mlx5_devx_alloc_uar(uct_ib_mlx5_md_t *md, uint32_t flags,
+                                        struct mlx5dv_devx_uar **uar_p);
+
 ucs_status_t uct_ib_mlx5_devx_check_uar(uct_ib_mlx5_md_t *md);
 
 ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar,
diff --git a/test/gtest/ucp/test_ucp_device.cc b/test/gtest/ucp/test_ucp_device.cc
@@ -189,4 +189,4 @@ UCS_TEST_P(test_ucp_device, create_fail)
               ucp_device_mem_list_create(sender().ep(), &params1, &handle));
 }
 
-UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device, gdaki, "rc,rc_gda")
+UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device, rc_gda, "rc,rc_gda")

Original file line number	Diff line number	Diff line change
`@@ -569,9 +569,8 @@ int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar,`
`569`	`569`	`}`
`570`	`570`
`571`	`571`	`#if HAVE_DEVX`
`572`		`-static ucs_status_t uct_ib_mlx5_devx_alloc_uar(uct_ib_mlx5_md_t *md,`
`573`		`- uint32_t flags,`
`574`		`- struct mlx5dv_devx_uar **uar_p)`
	`572`	`+ucs_status_t uct_ib_mlx5_devx_alloc_uar(uct_ib_mlx5_md_t *md, uint32_t flags,`
	`573`	`+ struct mlx5dv_devx_uar **uar_p)`
`575`	`574`	`{`
`576`	`575`	`const char *uar_type_str = (flags == UCT_IB_MLX5_UAR_ALLOC_TYPE_WC) ?`
`577`	`576`	`"WC" : "NC_DEDICATED";`
Original file line number	Diff line number	Diff line change
`@@ -189,4 +189,4 @@ UCS_TEST_P(test_ucp_device, create_fail)`
`189`	`189`	`ucp_device_mem_list_create(sender().ep(), &params1, &handle));`
`190`	`190`	`}`
`191`	`191`
`192`		`-UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device, gdaki, "rc,rc_gda")`
	`192`	`+UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device, rc_gda, "rc,rc_gda")`