@@ -21,6 +21,7 @@ struct ucx_perf_cuda_context {
     ucx_perf_counter_t max_iters;
     ucx_perf_cuda_time_t report_interval_ns;
     ucx_perf_counter_t completed_iters;
+    ucs_status_t status;
 };
 
 UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
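Note on the new `status` field: the context lives in cudaHostAllocMapped memory (see init_ctx() below), so a value the kernel writes into it is directly visible to the host. A minimal sketch of the presumed flow, assuming the kernel reports its result through this field; all `_sketch` names are stand-ins, not UCX definitions:

#include <cstdint>

typedef int ucs_status_t_sketch;                 /* stand-in for ucs_status_t */
#define UCS_OK_SKETCH                  0
#define UCS_ERR_NOT_IMPLEMENTED_SKETCH -8        /* placeholder error value   */

struct ucx_perf_cuda_context_sketch {
    uint64_t            completed_iters;
    uint64_t            max_iters;
    ucs_status_t_sketch status;                  /* kernel result, host-visible */
};

__global__ void perf_kernel_sketch(ucx_perf_cuda_context_sketch *ctx)
{
    /* ... measurement loop would run here ... */
    ctx->completed_iters = ctx->max_iters;
    /* Overwrite the UCS_ERR_NOT_IMPLEMENTED_SKETCH default the host set, so
       the host can distinguish "kernel never ran" from "kernel finished". */
    ctx->status = UCS_OK_SKETCH;
}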
@@ -48,31 +49,63 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
     }
 }
 
+UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length)
+{
+    return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t));
+}
+
+UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value)
+{
+    if (threadIdx.x == 0) {
+        while (*sn < value);
+    }
+    __syncthreads();
+}
+
+/* Simple bitset */
+#define UCX_BIT_MASK(bit)       (1 << ((bit) & (CHAR_BIT - 1)))
+#define UCX_BIT_SET(set, bit)   (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
+#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
+#define UCX_BIT_GET(set, bit)   (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
+#define UCX_BITSET_SIZE(bits)   ((bits + CHAR_BIT - 1) / CHAR_BIT)
+
+UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
+    size_t count = 0;
+    for (size_t i = 0; i < bits; i++) {
+        if (UCX_BIT_GET(set, i)) {
+            count++;
+        }
+    }
+    return count;
+}
+
+UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) {
+    for (size_t i = from; i < bits; i++) {
+        if (UCX_BIT_GET(set, i)) {
+            return i;
+        }
+    }
+    return bits;
+}
+
 class ucx_perf_cuda_test_runner {
 public:
     ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
     {
-        ucs_status_t status = init_ctx();
-        if (status != UCS_OK) {
-            ucs_fatal("failed to allocate device memory context: %s",
-                      ucs_status_string(status));
-        }
+        init_ctx();
 
         m_cpu_ctx->max_outstanding = perf.params.max_outstanding;
         m_cpu_ctx->max_iters = perf.max_iter;
         m_cpu_ctx->completed_iters = 0;
-        if (perf.report_interval == ULONG_MAX) {
-            m_cpu_ctx->report_interval_ns = ULONG_MAX;
-        } else {
-            m_cpu_ctx->report_interval_ns = ucs_time_to_nsec(
-                                                    perf.report_interval) /
-                                            100;
-        }
+        m_cpu_ctx->report_interval_ns = (perf.report_interval == ULONG_MAX) ?
+                                        ULONG_MAX :
+                                        ucs_time_to_nsec(perf.report_interval) / 100;
+        m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
     }
 
     ~ucx_perf_cuda_test_runner()
     {
-        destroy_ctx();
+        CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx);
     }
 
     ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; }
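The device-side helpers added above pair a sequence-number wait (thread 0 spins on the last 8 bytes of the buffer, then __syncthreads() releases the whole block) with a byte-granularity bitset for tracking in-flight operations. A minimal usage sketch, with the macros duplicated for self-containment and a hypothetical kernel name; this is an illustration, not code from the patch:

#include <climits>    /* CHAR_BIT */
#include <cstdint>
#include <cstddef>

#define UCX_BIT_MASK(bit)       (1 << ((bit) & (CHAR_BIT - 1)))
#define UCX_BIT_SET(set, bit)   (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
#define UCX_BIT_GET(set, bit)   (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
#define UCX_BITSET_SIZE(bits)   ((bits + CHAR_BIT - 1) / CHAR_BIT)

__device__ size_t bitset_ffs_sketch(const uint8_t *set, size_t bits, size_t from)
{
    for (size_t i = from; i < bits; i++) {
        if (UCX_BIT_GET(set, i)) {
            return i;
        }
    }
    return bits;   /* `bits` doubles as the "nothing set" sentinel */
}

/* Hypothetical kernel: `pending` holds UCX_BITSET_SIZE(max_outstanding) bytes. */
__global__ void track_outstanding_sketch(uint8_t *pending, size_t max_outstanding)
{
    UCX_BIT_SET(pending, 0);                                      /* op 0 in flight */
    UCX_BIT_SET(pending, 2);                                      /* op 2 in flight */

    size_t slot = bitset_ffs_sketch(pending, max_outstanding, 1); /* -> 2 */

    UCX_BIT_RESET(pending, slot);                                 /* op 2 completed */
}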
@@ -91,6 +124,7 @@ public:
             }
             last_completed = completed;
             completed = m_cpu_ctx->completed_iters;
+            // TODO: use cuStreamWaitValue64 if available
             usleep(100);
         }
     }
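On the TODO above: cuStreamWaitValue64() from the CUDA driver API enqueues the wait on the stream itself, so the GPU front end blocks until the 64-bit value at the given address satisfies the condition, replacing host-side usleep() polling. A hedged sketch, assuming a driver and device that support stream memory operations; the helper name is hypothetical, and for mapped host memory the device address must come from cuMemHostGetDevicePointer():

#include <cuda.h>
#include <cstdint>

/* Hypothetical helper: block `stream` until *host_counter >= target. */
static CUresult wait_completed_sketch(CUstream stream, void *host_counter,
                                      uint64_t target)
{
    CUdeviceptr addr;
    CUresult res = cuMemHostGetDevicePointer(&addr, host_counter, 0);
    if (res != CUDA_SUCCESS) {
        return res;
    }

    /* The wait executes on the GPU front end; no host spinning involved. */
    return cuStreamWaitValue64(stream, addr, target, CU_STREAM_WAIT_VALUE_GEQ);
}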
@@ -99,25 +133,12 @@ protected:
     ucx_perf_context_t &m_perf;
 
 private:
-    ucs_status_t init_ctx()
+    void init_ctx()
     {
-        CUDA_CALL(UCS_ERR_NO_MEMORY, cudaHostAlloc, &m_cpu_ctx,
+        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostAlloc, &m_cpu_ctx,
                   sizeof(ucx_perf_cuda_context), cudaHostAllocMapped);
-
-        cudaError_t err = cudaHostGetDevicePointer(&m_gpu_ctx, m_cpu_ctx, 0);
-        if (err != cudaSuccess) {
-            ucs_error("cudaHostGetDevicePointer() failed: %s",
-                      cudaGetErrorString(err));
-            cudaFreeHost(m_cpu_ctx);
-            return UCS_ERR_IO_ERROR;
-        }
-
-        return UCS_OK;
-    }
-
-    void destroy_ctx()
-    {
-        CUDA_CALL_HANDLER(ucs_warn, , cudaFreeHost, m_cpu_ctx);
+        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer,
+                  &m_gpu_ctx, m_cpu_ctx, 0);
     }
 
     ucx_perf_cuda_context *m_cpu_ctx;
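For reference, the simplified init_ctx() relies on the standard CUDA zero-copy pattern: one page-locked allocation that host and device address through two pointers. A standalone sketch of that pattern, independent of the UCX wrappers and macros:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void bump_sketch(unsigned long long *v)
{
    *v += 1;   /* writes through the mapping, directly into host memory */
}

int main()
{
    unsigned long long *host_ptr;
    unsigned long long *dev_ptr;

    /* Page-locked host memory, mapped into the device address space */
    if (cudaHostAlloc((void**)&host_ptr, sizeof(*host_ptr),
                      cudaHostAllocMapped) != cudaSuccess) {
        return 1;
    }

    /* Device-side alias of the same physical memory */
    if (cudaHostGetDevicePointer((void**)&dev_ptr, host_ptr, 0) != cudaSuccess) {
        cudaFreeHost(host_ptr);
        return 1;
    }

    *host_ptr = 42;                      /* host writes ...              */
    bump_sketch<<<1, 1>>>(dev_ptr);      /* ... device reads and updates */
    cudaDeviceSynchronize();

    printf("host sees %llu\n", *host_ptr);   /* 43, with no cudaMemcpy */
    cudaFreeHost(host_ptr);
    return 0;
}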