Skip to content

Commit 176cec5

Browse files
committed
UCP/PERF: UCP cuda device real tests
1 parent 79a3c75 commit 176cec5

File tree

6 files changed

+290
-62
lines changed

6 files changed

+290
-62
lines changed

contrib/test_jenkins.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ run_ucx_perftest_cuda_device() {
658658
ucp_test_args="-b $ucx_inst_ptest/test_types_ucp_device_cuda"
659659

660660
# TODO: Run on all GPUs
661-
ucp_client_args="-a cuda $(hostname)"
661+
ucp_client_args="-a cuda:0 $(hostname)"
662662

663663
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
664664
}

src/tools/perf/cuda/cuda_alloc.c

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ static ucs_status_t ucx_perf_cuda_init(ucx_perf_context_t *perf)
2525

2626
group_index = rte_call(perf, group_index);
2727

28-
CUDA_CALL(UCS_ERR_NO_DEVICE, cudaGetDeviceCount, &num_gpus);
28+
CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaGetDeviceCount, &num_gpus);
2929
if (num_gpus == 0) {
3030
ucs_error("no cuda devices available");
3131
return UCS_ERR_NO_DEVICE;
@@ -41,7 +41,7 @@ static ucs_status_t ucx_perf_cuda_init(ucx_perf_context_t *perf)
4141
return UCS_ERR_NO_DEVICE;
4242
}
4343

44-
CUDA_CALL(UCS_ERR_NO_DEVICE, cudaSetDevice, gpu_index);
44+
CUDA_CALL_RET(UCS_ERR_NO_DEVICE, cudaSetDevice, gpu_index);
4545

4646
/* actually set device context as calling cudaSetDevice may result in
4747
* context being initialized lazily */
@@ -55,10 +55,10 @@ static inline ucs_status_t ucx_perf_cuda_alloc(size_t length,
5555
void **address_p)
5656
{
5757
if (mem_type == UCS_MEMORY_TYPE_CUDA) {
58-
CUDA_CALL(UCS_ERR_NO_MEMORY, cudaMalloc, address_p, length);
58+
CUDA_CALL_RET(UCS_ERR_NO_MEMORY, cudaMalloc, address_p, length);
5959
} else if (mem_type == UCS_MEMORY_TYPE_CUDA_MANAGED) {
60-
CUDA_CALL(UCS_ERR_NO_MEMORY, cudaMallocManaged, address_p, length,
61-
cudaMemAttachGlobal);
60+
CUDA_CALL_RET(UCS_ERR_NO_MEMORY, cudaMallocManaged, address_p, length,
61+
cudaMemAttachGlobal);
6262
} else {
6363
ucs_error("invalid memory type %s (%d)",
6464
ucs_memory_type_names[mem_type], mem_type);
@@ -136,21 +136,21 @@ static void uct_perf_cuda_free(const ucx_perf_context_t *perf,
136136
ucs_error("failed to deregister memory");
137137
}
138138

139-
CUDA_CALL_HANDLER(ucs_warn, , cudaFree, alloc_mem->address);
139+
CUDA_CALL_WARN(cudaFree, alloc_mem->address);
140140
}
141141

142142
static void ucx_perf_cuda_memcpy(void *dst, ucs_memory_type_t dst_mem_type,
143143
const void *src, ucs_memory_type_t src_mem_type,
144144
size_t count)
145145
{
146-
CUDA_CALL(, cudaMemcpy, dst, src, count, cudaMemcpyDefault);
147-
CUDA_CALL(, cudaDeviceSynchronize);
146+
CUDA_CALL_ERR(cudaMemcpy, dst, src, count, cudaMemcpyDefault);
147+
CUDA_CALL_ERR(cudaDeviceSynchronize);
148148
}
149149

150150
static void* ucx_perf_cuda_memset(void *dst, int value, size_t count)
151151
{
152-
CUDA_CALL(dst, cudaMemset, dst, value, count);
153-
CUDA_CALL(dst, cudaDeviceSynchronize);
152+
CUDA_CALL_RET(dst, cudaMemset, dst, value, count);
153+
CUDA_CALL_ERR(cudaDeviceSynchronize);
154154
return dst;
155155
}
156156

src/tools/perf/cuda/cuda_common.h

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,24 @@
1212
BEGIN_C_DECLS
1313

1414
/* TODO: move it to some common place */
15-
#define CUDA_CALL_HANDLER(_handler, _ret, _func, ...) \
15+
#define CUDA_CALL(_handler, _log_level, _func, ...) \
1616
do { \
1717
cudaError_t _cerr = _func(__VA_ARGS__); \
1818
if (_cerr != cudaSuccess) { \
19-
_handler("%s() failed: %d (%s)", UCS_PP_MAKE_STRING(_func), \
20-
(int)_cerr, cudaGetErrorString(_cerr)); \
21-
return _ret; \
19+
ucs_log(_log_level, "%s() failed: %d (%s)", UCS_PP_MAKE_STRING(_func), \
20+
(int)_cerr, cudaGetErrorString(_cerr)); \
21+
_handler; \
2222
} \
2323
} while (0)
2424

25-
#define CUDA_CALL(_ret, _func, ...) \
26-
CUDA_CALL_HANDLER(ucs_error, _ret, _func, __VA_ARGS__)
25+
#define CUDA_CALL_RET(_ret, _func, ...) \
26+
CUDA_CALL(return _ret, UCS_LOG_LEVEL_ERROR, _func, __VA_ARGS__)
27+
28+
#define CUDA_CALL_ERR(_func, ...) \
29+
CUDA_CALL(, UCS_LOG_LEVEL_ERROR, _func, __VA_ARGS__)
30+
31+
#define CUDA_CALL_WARN(_func, ...) \
32+
CUDA_CALL(, UCS_LOG_LEVEL_WARN, _func, __VA_ARGS__)
2733

2834
END_C_DECLS
2935

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 51 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ struct ucx_perf_cuda_context {
2121
ucx_perf_counter_t max_iters;
2222
ucx_perf_cuda_time_t report_interval_ns;
2323
ucx_perf_counter_t completed_iters;
24+
ucs_status_t status;
2425
};
2526

2627
UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
@@ -48,31 +49,63 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
4849
}
4950
}
5051

52+
UCS_F_DEVICE uint64_t *ucx_perf_cuda_get_sn(const void *address, size_t length)
53+
{
54+
return (uint64_t*)UCS_PTR_BYTE_OFFSET(address, length - sizeof(uint64_t));
55+
}
56+
57+
UCS_F_DEVICE void ucx_perf_cuda_wait_sn(volatile uint64_t *sn, uint64_t value)
58+
{
59+
if (threadIdx.x == 0) {
60+
while (*sn < value);
61+
}
62+
__syncthreads();
63+
}
64+
65+
/* Simple bitset */
66+
#define UCX_BIT_MASK(bit) (1 << ((bit) & (CHAR_BIT - 1)))
67+
#define UCX_BIT_SET(set, bit) (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
68+
#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
69+
#define UCX_BIT_GET(set, bit) (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
70+
#define UCX_BITSET_SIZE(bits) ((bits + CHAR_BIT - 1) / CHAR_BIT)
71+
72+
UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
73+
size_t count = 0;
74+
for (size_t i = 0; i < bits; i++) {
75+
if (UCX_BIT_GET(set, i)) {
76+
count++;
77+
}
78+
}
79+
return count;
80+
}
81+
82+
UCS_F_DEVICE size_t ucx_bitset_ffs(const uint8_t *set, size_t bits, size_t from) {
83+
for (size_t i = from; i < bits; i++) {
84+
if (UCX_BIT_GET(set, i)) {
85+
return i;
86+
}
87+
}
88+
return bits;
89+
}
90+
5191
class ucx_perf_cuda_test_runner {
5292
public:
5393
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
5494
{
55-
ucs_status_t status = init_ctx();
56-
if (status != UCS_OK) {
57-
ucs_fatal("failed to allocate device memory context: %s",
58-
ucs_status_string(status));
59-
}
95+
init_ctx();
6096

6197
m_cpu_ctx->max_outstanding = perf.params.max_outstanding;
6298
m_cpu_ctx->max_iters = perf.max_iter;
6399
m_cpu_ctx->completed_iters = 0;
64-
if (perf.report_interval == ULONG_MAX) {
65-
m_cpu_ctx->report_interval_ns = ULONG_MAX;
66-
} else {
67-
m_cpu_ctx->report_interval_ns = ucs_time_to_nsec(
68-
perf.report_interval) /
69-
100;
70-
}
100+
m_cpu_ctx->report_interval_ns = (perf.report_interval == ULONG_MAX) ?
101+
ULONG_MAX :
102+
ucs_time_to_nsec(perf.report_interval) / 100;
103+
m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
71104
}
72105

73106
~ucx_perf_cuda_test_runner()
74107
{
75-
destroy_ctx();
108+
CUDA_CALL_WARN(cudaFreeHost, m_cpu_ctx);
76109
}
77110

78111
ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; }
@@ -91,6 +124,7 @@ public:
91124
}
92125
last_completed = completed;
93126
completed = m_cpu_ctx->completed_iters;
127+
// TODO: use cuStreamWaitValue64 if available
94128
usleep(100);
95129
}
96130
}
@@ -99,25 +133,12 @@ protected:
99133
ucx_perf_context_t &m_perf;
100134

101135
private:
102-
ucs_status_t init_ctx()
136+
void init_ctx()
103137
{
104-
CUDA_CALL(UCS_ERR_NO_MEMORY, cudaHostAlloc, &m_cpu_ctx,
138+
CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostAlloc, &m_cpu_ctx,
105139
sizeof(ucx_perf_cuda_context), cudaHostAllocMapped);
106-
107-
cudaError_t err = cudaHostGetDevicePointer(&m_gpu_ctx, m_cpu_ctx, 0);
108-
if (err != cudaSuccess) {
109-
ucs_error("cudaHostGetDevicePointer() failed: %s",
110-
cudaGetErrorString(err));
111-
cudaFreeHost(m_cpu_ctx);
112-
return UCS_ERR_IO_ERROR;
113-
}
114-
115-
return UCS_OK;
116-
}
117-
118-
void destroy_ctx()
119-
{
120-
CUDA_CALL_HANDLER(ucs_warn, , cudaFreeHost, m_cpu_ctx);
140+
CUDA_CALL(, UCS_LOG_LEVEL_FATAL, cudaHostGetDevicePointer,
141+
&m_gpu_ctx, m_cpu_ctx, 0);
121142
}
122143

123144
ucx_perf_cuda_context *m_cpu_ctx;

0 commit comments

Comments
 (0)