Skip to content

Commit 48e9e47

Browse files
authored
UCT/GDA: Check that nvidia peermem driver is loaded - port to 1.20. (#10995)
UCT/GDA: Check that nvidia peermem driver is loaded.
1 parent a5297d6 commit 48e9e47

File tree

1 file changed

+23
-3
lines changed

1 file changed

+23
-3
lines changed

src/uct/ib/mlx5/gdaki/gdaki.c

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -602,9 +602,10 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
602602
uct_tl_device_resource_t **tl_devices_p,
603603
unsigned *num_tl_devices_p)
604604
{
605-
static int uar_supported = -1;
606-
uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
607-
unsigned num_tl_devices = 0;
605+
static int uar_supported = -1;
606+
static int peermem_loaded = -1;
607+
uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
608+
unsigned num_tl_devices = 0;
608609
uct_tl_device_resource_t *tl_devices;
609610
ucs_status_t status;
610611
CUdevice device;
@@ -649,6 +650,25 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
649650
goto err;
650651
}
651652

653+
/*
654+
* Cache the result of the peermem driver check in a function-local static flag to avoid
655+
* printing the diag message again for each GPU and MD.
656+
*/
657+
if (peermem_loaded == -1) {
658+
peermem_loaded = !!(md->super.reg_mem_types &
659+
UCS_BIT(UCS_MEMORY_TYPE_CUDA));
660+
if (peermem_loaded == 0) {
661+
ucs_diag("GDAKI not supported, please load "
662+
"Nvidia peermem driver by running "
663+
"\"modprobe nvidia_peermem\"");
664+
}
665+
}
666+
667+
if (peermem_loaded == 0) {
668+
status = UCS_ERR_NO_DEVICE;
669+
goto err;
670+
}
671+
652672
uct_cuda_base_get_sys_dev(device, &dev);
653673
status = ucs_topo_get_distance(dev, md->super.dev.sys_dev, &dist);
654674
if (status != UCS_OK) {

0 commit comments

Comments
 (0)