Skip to content

Commit d3ff211

Browse files
committed
TOOLS/PERF: Perftest cuda kernel fixes
1 parent f60a22c commit d3ff211

File tree

4 files changed

+48
-99
lines changed

4 files changed

+48
-99
lines changed

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 22 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,9 @@ ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
4747
}
4848
}
4949

50-
template <typename Base>
51-
class ucx_perf_cuda_test_runner: public Base {
50+
class ucx_perf_cuda_test_runner {
5251
public:
53-
using psn_t = uint64_t;
54-
using Base::m_perf;
55-
56-
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : Base(perf)
52+
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
5753
{
5854
ucs_status_t status = init_ctx();
5955
if (status != UCS_OK) {
@@ -62,12 +58,15 @@ public:
6258
}
6359

6460
m_cpu_ctx->max_outstanding = perf.params.max_outstanding;
65-
m_cpu_ctx->max_iters = perf.params.max_iter;
66-
m_cpu_ctx->report_interval_ns = perf.params.report_interval *
67-
UCS_NSEC_PER_SEC;
61+
m_cpu_ctx->max_iters = perf.max_iter;
6862
m_cpu_ctx->completed_iters = 0;
69-
70-
m_poll_interval = perf.params.report_interval / 10000;
63+
if (perf.report_interval == ULONG_MAX) {
64+
m_cpu_ctx->report_interval_ns = ULONG_MAX;
65+
} else {
66+
m_cpu_ctx->report_interval_ns = ucs_time_to_nsec(
67+
perf.report_interval) /
68+
100;
69+
}
7170
}
7271

7372
~ucx_perf_cuda_test_runner()
@@ -77,54 +76,26 @@ public:
7776

7877
ucx_perf_cuda_context &gpu_ctx() const { return *m_gpu_ctx; }
7978

80-
UCS_F_ALWAYS_INLINE psn_t get_sn(const psn_t *gpu_ptr, const psn_t *cpu_ptr)
81-
{
82-
if (cpu_ptr != nullptr) {
83-
return *cpu_ptr;
84-
}
85-
86-
unsigned my_index = rte_call(&m_perf, group_index);
87-
ucs_memory_type_t mem_type = my_index ? m_perf.params.send_mem_type :
88-
m_perf.params.recv_mem_type;
89-
auto allocator = my_index ? m_perf.send_allocator :
90-
m_perf.recv_allocator;
91-
return Base::get_sn(gpu_ptr, mem_type, allocator);
92-
}
93-
94-
psn_t wait_sn_geq(const psn_t *gpu_ptr, const psn_t *cpu_ptr, psn_t value)
79+
void wait_for_kernel(size_t msg_length)
9580
{
96-
psn_t sn = get_sn(gpu_ptr, cpu_ptr);
97-
if (sn >= value) {
98-
return sn;
99-
}
100-
101-
// TODO: use cuStreamWaitValue64 if available
102-
usleep(m_poll_interval);
103-
return get_sn(gpu_ptr, cpu_ptr);
104-
}
105-
106-
void wait_for_kernel(size_t length)
107-
{
108-
psn_t last_completed = 0;
109-
while (last_completed < m_perf.params.max_iter) {
110-
psn_t completed = wait_sn_geq(&m_gpu_ctx->completed_iters,
111-
&m_cpu_ctx->completed_iters,
112-
last_completed);
113-
psn_t delta = completed - last_completed;
81+
ucx_perf_counter_t last_completed = 0;
82+
ucx_perf_counter_t completed = m_cpu_ctx->completed_iters;
83+
while (1) {
84+
ucx_perf_counter_t delta = completed - last_completed;
11485
if (delta > 0) {
11586
// TODO: calculate latency percentile on kernel
116-
ucx_perf_update_multi(&m_perf, delta, delta * length);
87+
ucx_perf_update(&m_perf, delta, msg_length);
88+
} else if (completed >= m_perf.max_iter) {
89+
break;
11790
}
11891
last_completed = completed;
92+
completed = m_cpu_ctx->completed_iters;
93+
usleep(100);
11994
}
12095
}
12196

122-
void wait_for_sn(size_t length)
123-
{
124-
const psn_t *ptr = Base::sn_ptr(m_perf.recv_buffer, length);
125-
while (wait_sn_geq(ptr, nullptr, m_perf.params.max_iter)
126-
< m_perf.params.max_iter);
127-
}
97+
protected:
98+
ucx_perf_context_t &m_perf;
12899

129100
private:
130101
ucs_status_t init_ctx()
@@ -150,7 +121,6 @@ private:
150121

151122
ucx_perf_cuda_context *m_cpu_ctx;
152123
ucx_perf_cuda_context *m_gpu_ctx;
153-
double m_poll_interval;
154124
};
155125

156126

src/tools/perf/cuda/ucp_cuda_kernel.cu

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ ucp_perf_cuda_put_multi_bw_kernel(ucx_perf_cuda_context &ctx)
2323

2424
for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
2525
// TODO: replace with actual put multi call
26-
__nanosleep(1000000); // 1ms
26+
__nanosleep(100000); // 100us
2727

2828
ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time);
2929
__syncthreads();
@@ -40,23 +40,19 @@ ucp_perf_cuda_put_multi_latency_kernel(ucx_perf_cuda_context &ctx, bool is_sende
4040
for (ucx_perf_counter_t idx = 0; idx < max_iters; idx++) {
4141
// TODO: replace with actual put multi call
4242
// TODO: wait for completion
43-
__nanosleep(1000000); // 1ms
43+
__nanosleep(100000); // 100us
4444

4545
ucx_perf_cuda_update_report(ctx, idx + 1, max_iters, last_report_time);
4646
__syncthreads();
4747
}
4848
}
4949

50-
class ucp_perf_cuda_test_runner:
51-
public ucx_perf_cuda_test_runner<ucp_perf_test_runner_base<uint64_t>> {
50+
class ucp_perf_cuda_test_runner : public ucx_perf_cuda_test_runner {
5251
public:
53-
using psn_t = uint64_t;
54-
5552
ucp_perf_cuda_test_runner(ucx_perf_context_t &perf) :
56-
ucx_perf_cuda_test_runner<ucp_perf_test_runner_base<uint64_t>>(perf)
53+
ucx_perf_cuda_test_runner(perf)
5754
{
5855
size_t length = ucx_perf_get_message_size(&m_perf.params);
59-
ucs_assert(length >= sizeof(psn_t));
6056

6157
m_perf.send_allocator->memset(m_perf.send_buffer, 0, length);
6258
m_perf.recv_allocator->memset(m_perf.recv_buffer, 0, length);
@@ -74,8 +70,9 @@ public:
7470
ucp_perf_cuda_put_multi_latency_kernel
7571
<UCP_DEVICE_LEVEL_BLOCK><<<1, thread_count>>>(gpu_ctx(), my_index);
7672
CUDA_CALL(UCS_ERR_NO_DEVICE, cudaGetLastError);
77-
7873
wait_for_kernel(length);
74+
75+
CUDA_CALL(UCS_ERR_IO_ERROR, cudaDeviceSynchronize);
7976
ucx_perf_get_time(&m_perf);
8077
ucp_perf_barrier(&m_perf);
8178
return UCS_OK;
@@ -94,36 +91,15 @@ public:
9491
ucp_perf_cuda_put_multi_bw_kernel<UCP_DEVICE_LEVEL_BLOCK>
9592
<<<1, thread_count>>>(gpu_ctx());
9693
CUDA_CALL(UCS_ERR_NO_DEVICE, cudaGetLastError);
97-
9894
wait_for_kernel(length);
99-
100-
// TODO: remove once real GDAKI is used
101-
send_signal(length);
102-
} else if (my_index == 0) {
103-
wait_for_sn(length);
10495
}
96+
// TODO run receiver kernel
10597

98+
CUDA_CALL(UCS_ERR_IO_ERROR, cudaDeviceSynchronize);
10699
ucx_perf_get_time(&m_perf);
107100
ucp_perf_barrier(&m_perf);
108101
return UCS_OK;
109102
}
110-
111-
private:
112-
// TODO: remove once real GDAKI is used
113-
void send_signal(size_t length)
114-
{
115-
ucs_memory_type_t mem_type = m_perf.params.send_mem_type;
116-
write_sn(m_perf.send_buffer, mem_type, length, m_perf.params.max_iter,
117-
m_perf.ucp.self_send_rkey);
118-
119-
ucs_status_ptr_t request;
120-
ucp_request_param_t param = {0};
121-
request = ucp_put_nbx(m_perf.ucp.ep, m_perf.send_buffer, length,
122-
m_perf.ucp.remote_addr, m_perf.ucp.rkey, &param);
123-
request_wait(request, mem_type, "write_sn");
124-
request = ucp_ep_flush_nbx(m_perf.ucp.self_ep, &param);
125-
request_wait(request, mem_type, "flush write_sn");
126-
}
127103
};
128104

129105
ucx_perf_device_dispatcher_t ucx_perf_cuda_dispatcher;

src/tools/perf/lib/libperf_int.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -228,24 +228,27 @@ static inline void ucx_perf_omp_barrier(ucx_perf_context_t *perf)
228228

229229
static UCS_F_ALWAYS_INLINE void ucx_perf_update(ucx_perf_context_t *perf,
230230
ucx_perf_counter_t iters,
231-
size_t bytes)
231+
size_t bytes_per_iter)
232232
{
233233
perf->current.time = ucs_get_time();
234234
perf->current.iters += iters;
235-
perf->current.bytes += bytes;
236-
perf->current.msgs += 1;
235+
perf->current.bytes += bytes_per_iter * iters;
236+
perf->current.msgs += iters;
237237

238-
perf->timing_queue[perf->timing_queue_head] =
239-
perf->current.time - perf->prev_time;
240-
++perf->timing_queue_head;
241-
if (perf->timing_queue_head == TIMING_QUEUE_SIZE) {
242-
perf->timing_queue_head = 0;
238+
if (iters == 1) {
239+
perf->timing_queue[perf->timing_queue_head] = perf->current.time -
240+
perf->prev_time;
241+
++perf->timing_queue_head;
242+
if (perf->timing_queue_head == TIMING_QUEUE_SIZE) {
243+
perf->timing_queue_head = 0;
244+
}
243245
}
244246

245247
perf->prev_time = perf->current.time;
246248

247249
if (ucs_unlikely((perf->current.time - perf->prev.time) >=
248-
perf->report_interval)) {
250+
perf->report_interval) &&
251+
(perf->current.iters < perf->max_iter)) {
249252
ucx_perf_report(perf);
250253
}
251254
}

src/tools/perf/perftest_params.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -637,6 +637,12 @@ ucs_status_t adjust_test_params(perftest_params_t *params,
637637
params->super.max_outstanding = test->window_size;
638638
}
639639

640+
if (params->super.send_device.mem_type != UCS_MEMORY_TYPE_LAST) {
641+
/* TODO: Add getter function for thread count */
642+
params->super.device_thread_count = params->super.thread_count;
643+
params->super.thread_count = 1;
644+
}
645+
640646
return UCS_OK;
641647
}
642648

@@ -847,12 +853,6 @@ ucs_status_t parse_opts(struct perftest_context *ctx, int mpi_initialized,
847853
}
848854
}
849855

850-
if (ctx->params.super.send_device.mem_type != UCS_MEMORY_TYPE_LAST) {
851-
/* TODO: Add getter function for thread count */
852-
ctx->params.super.device_thread_count = ctx->params.super.thread_count;
853-
ctx->params.super.thread_count = 1;
854-
}
855-
856856
return init_daemon_params(&ctx->params.super);
857857

858858
err:

0 commit comments

Comments
 (0)