@@ -21,6 +21,7 @@ struct ucx_perf_cuda_context {
     ucx_perf_counter_t max_iters;
     ucx_perf_cuda_time_t report_interval_ns;
     ucx_perf_counter_t completed_iters;
+    ucs_status_t status;
 };
 
 UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
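Note on the new `status` field: the context lives in cudaHostAllocMapped memory (see init_ctx() below), so a value the kernel writes into it is directly visible to the host. A minimal sketch of the presumed flow, assuming the kernel reports its result through this field; all `_sketch` names are stand-ins, not UCX definitions:

#include <cstdint>

typedef int ucs_status_t_sketch;                 /* stand-in for ucs_status_t */
#define UCS_OK_SKETCH                  0
#define UCS_ERR_NOT_IMPLEMENTED_SKETCH -8        /* placeholder error value   */

struct ucx_perf_cuda_context_sketch {
    uint64_t            completed_iters;
    uint64_t            max_iters;
    ucs_status_t_sketch status;                  /* kernel result, host-visible */
};

__global__ void perf_kernel_sketch(ucx_perf_cuda_context_sketch *ctx)
{
    /* ... measurement loop would run here ... */
    ctx->completed_iters = ctx->max_iters;
    /* Overwrite the UCS_ERR_NOT_IMPLEMENTED_SKETCH default the host set, so
       the host can distinguish "kernel never ran" from "kernel finished". */
    ctx->status = UCS_OK_SKETCH;
}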
@@ -48,31 +49,63 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
     }
 }
 
+UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length)
+{
+    return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t));
+}
+
+UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value)
+{
+    if (threadIdx.x == 0) {
+        while (*sn < value);
+    }
+    __syncthreads();
+}
+
+/* Simple bitset */
+#define UCX_BIT_MASK(bit)       (1 << ((bit) & (CHAR_BIT - 1)))
+#define UCX_BIT_SET(set, bit)   (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
+#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
+#define UCX_BIT_GET(set, bit)   (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
+#define UCX_BITSET_SIZE(bits)   ((bits + CHAR_BIT - 1) / CHAR_BIT)
+
+UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
+    size_t count = 0;
+    for (size_t i = 0; i < bits; i++) {
+        if (UCX_BIT_GET(set, i)) {
+            count++;
+        }
+    }
+    return count;
+}
+
+UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) {
+    for (size_t i = from; i < bits; i++) {
+        if (UCX_BIT_GET(set, i)) {
+            return i;
+        }
+    }
+    return bits;
+}
+
 class ucx_perf_cuda_test_runner {
 public:
     ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
     {
-        ucs_status_t status = init_ctx();
-        if (status != UCS_OK) {
-            ucs_fatal("failed to allocate device memory context: %s",
-                      ucs_status_string(status));
-        }
+        init_ctx();
 
         m_cpu_ctx->max_outstanding = perf.params.max_outstanding;
         m_cpu_ctx->max_iters = perf.max_iter;
         m_cpu_ctx->completed_iters = 0;
-        if (perf.report_interval == ULONG_MAX) {
-            m_cpu_ctx->report_interval_ns = ULONG_MAX;
-        } else {
-            m_cpu_ctx->report_interval_ns = ucs_time_to_nsec(
-                                                    perf.report_interval) /
-                                            100;
-        }
+        m_cpu_ctx->report_interval_ns = (perf.report_interval == ULONG_MAX) ?
+                                        ULONG_MAX :
+                                        ucs_time_to_nsec(perf.report_interval) / 100;
+        m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
     }
 
     ~ucx_perf_cuda_test_runner()
     {
-        destroy_ctx();
+        CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx);
     }
 
     ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; }
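The device-side helpers added above pair a sequence-number wait (thread 0 spins on the last 8 bytes of the buffer, then __syncthreads() releases the whole block) with a byte-granularity bitset for tracking in-flight operations. A minimal usage sketch, with the macros duplicated for self-containment and a hypothetical kernel name; this is an illustration, not code from the patch:

#include <climits>    /* CHAR_BIT */
#include <cstdint>
#include <cstddef>

#define UCX_BIT_MASK(bit)       (1 << ((bit) & (CHAR_BIT - 1)))
#define UCX_BIT_SET(set, bit)   (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
#define UCX_BIT_GET(set, bit)   (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
#define UCX_BITSET_SIZE(bits)   ((bits + CHAR_BIT - 1) / CHAR_BIT)

__device__ size_t bitset_ffs_sketch(const uint8_t *set, size_t bits, size_t from)
{
    for (size_t i = from; i < bits; i++) {
        if (UCX_BIT_GET(set, i)) {
            return i;
        }
    }
    return bits;   /* `bits` doubles as the "nothing set" sentinel */
}

/* Hypothetical kernel: `pending` holds UCX_BITSET_SIZE(max_outstanding) bytes. */
__global__ void track_outstanding_sketch(uint8_t *pending, size_t max_outstanding)
{
    UCX_BIT_SET(pending, 0);                                      /* op 0 in flight */
    UCX_BIT_SET(pending, 2);                                      /* op 2 in flight */

    size_t slot = bitset_ffs_sketch(pending, max_outstanding, 1); /* -> 2 */

    UCX_BIT_RESET(pending, slot);                                 /* op 2 completed */
}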
@@ -91,6 +124,7 @@ public:
             }
             last_completed = completed;
             completed = m_cpu_ctx->completed_iters;
+            // TODO: use cuStreamWaitValue64 if available
             usleep(100);
         }
     }
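On the TODO above: cuStreamWaitValue64() from the CUDA driver API enqueues the wait on the stream itself, so the GPU front end blocks until the 64-bit value at the given address satisfies the condition, replacing host-side usleep() polling. A hedged sketch, assuming a driver and device that support stream memory operations; the helper name is hypothetical, and for mapped host memory the device address must come from cuMemHostGetDevicePointer():

#include <cuda.h>
#include <cstdint>

/* Hypothetical helper: block `stream` until *host_counter >= target. */
static CUresult wait_completed_sketch(CUstream stream, void *host_counter,
                                      uint64_t target)
{
    CUdeviceptr addr;
    CUresult res = cuMemHostGetDevicePointer(&addr, host_counter, 0);
    if (res != CUDA_SUCCESS) {
        return res;
    }

    /* The wait executes on the GPU front end; no host spinning involved. */
    return cuStreamWaitValue64(stream, addr, target, CU_STREAM_WAIT_VALUE_GEQ);
}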
@@ -99,25 +133,12 @@ protected:
     ucx_perf_context_t &m_perf;
 
 private:
-    ucs_status_t init_ctx()
+    void init_ctx()
     {
-        CUDA_CALL(UCS_ERR_NO_MEMORY, cudaHostAlloc, &m_cpu_ctx,
+        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostAlloc, &m_cpu_ctx,
                   sizeof(ucx_perf_cuda_context), cudaHostAllocMapped);
-
-        cudaError_t err = cudaHostGetDevicePointer(&m_gpu_ctx, m_cpu_ctx, 0);
-        if (err != cudaSuccess) {
-            ucs_error("cudaHostGetDevicePointer() failed: %s",
-                      cudaGetErrorString(err));
-            cudaFreeHost(m_cpu_ctx);
-            return UCS_ERR_IO_ERROR;
-        }
-
-        return UCS_OK;
-    }
-
-    void destroy_ctx()
-    {
-        CUDA_CALL_HANDLER(ucs_warn, , cudaFreeHost, m_cpu_ctx);
+        CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer,
+                  &m_gpu_ctx, m_cpu_ctx, 0);
     }
 
     ucx_perf_cuda_context *m_cpu_ctx;
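For reference, the simplified init_ctx() relies on the standard CUDA zero-copy pattern: one page-locked allocation that host and device address through two pointers. A standalone sketch of that pattern, independent of the UCX wrappers and macros:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void bump_sketch(unsigned long long *v)
{
    *v += 1;   /* writes through the mapping, directly into host memory */
}

int main()
{
    unsigned long long *host_ptr;
    unsigned long long *dev_ptr;

    /* Page-locked host memory, mapped into the device address space */
    if (cudaHostAlloc((void**)&host_ptr, sizeof(*host_ptr),
                      cudaHostAllocMapped) != cudaSuccess) {
        return 1;
    }

    /* Device-side alias of the same physical memory */
    if (cudaHostGetDevicePointer((void**)&dev_ptr, host_ptr, 0) != cudaSuccess) {
        cudaFreeHost(host_ptr);
        return 1;
    }

    *host_ptr = 42;                      /* host writes ...              */
    bump_sketch<<<1, 1>>>(dev_ptr);      /* ... device reads and updates */
    cudaDeviceSynchronize();

    printf("host sees %llu\n", *host_ptr);   /* 43, with no cudaMemcpy */
    cudaFreeHost(host_ptr);
    return 0;
}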