Skip to content

Commit df3d8b9

Browse files
committed
PERF: Perftest GDAKI kernel option
1 parent 9d0394a commit df3d8b9

File tree

8 files changed

+158
-5
lines changed

8 files changed

+158
-5
lines changed

src/tools/perf/api/libperf.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,9 @@ typedef enum {
160160
(_params)->uct.dev_name
161161

162162

163+
#define UCX_PERF_MEM_DEV_DEFAULT -1
164+
165+
163166
/**
164167
* Performance counter type.
165168
*/
@@ -185,6 +188,12 @@ typedef struct ucx_perf_result {
185188
} ucx_perf_result_t;
186189

187190

191+
typedef struct {
192+
ucs_memory_type_t mem_type;
193+
int device_id;
194+
} ucx_perf_accel_dev_t;
195+
196+
188197
typedef void (*ucx_perf_rte_progress_cb_t)(void *arg);
189198

190199
typedef ucs_status_t (*ucx_perf_rte_setup_func_t)(void *arg);
@@ -253,6 +262,8 @@ typedef struct ucx_perf_params {
253262
ucx_perf_wait_mode_t wait_mode; /* How to wait */
254263
ucs_memory_type_t send_mem_type; /* Send memory type */
255264
ucs_memory_type_t recv_mem_type; /* Recv memory type */
265+
ucx_perf_accel_dev_t send_device; /* Send memory device for gdaki */
266+
ucx_perf_accel_dev_t recv_device; /* Recv memory device for gdaki */
256267
unsigned flags; /* See ucx_perf_test_flags. */
257268

258269
size_t *msg_size_list; /* Test message sizes list. The size

src/tools/perf/cuda/cuda_alloc.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,15 @@ static ucs_status_t ucx_perf_cuda_init(ucx_perf_context_t *perf)
3939
return UCS_ERR_NO_DEVICE;
4040
}
4141

42-
gpu_index = group_index % num_gpus;
42+
gpu_index = (group_index == 0) ? perf->params.recv_device.device_id :
43+
perf->params.send_device.device_id;
44+
if (gpu_index == UCX_PERF_MEM_DEV_DEFAULT) {
45+
gpu_index = group_index % num_gpus;
46+
} else if (gpu_index >= num_gpus) {
47+
ucs_error("Illegal cuda device %d number of devices %d", gpu_index,
48+
num_gpus);
49+
return UCS_ERR_NO_DEVICE;
50+
}
4351

4452
CUDA_CALL(UCS_ERR_NO_DEVICE, cudaSetDevice, gpu_index);
4553

@@ -150,6 +158,7 @@ static void ucx_perf_cuda_memcpy(void *dst, ucs_memory_type_t dst_mem_type,
150158
static void* ucx_perf_cuda_memset(void *dst, int value, size_t count)
151159
{
152160
CUDA_CALL(dst, cudaMemset, dst, value, count);
161+
CUDA_CALL(dst, cudaDeviceSynchronize);
153162
return dst;
154163
}
155164

@@ -174,8 +183,8 @@ UCS_STATIC_INIT {
174183
ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA] = &cuda_allocator;
175184
ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA_MANAGED] = &cuda_managed_allocator;
176185
}
186+
177187
UCS_STATIC_CLEANUP {
178188
ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA] = NULL;
179189
ucx_perf_mem_type_allocators[UCS_MEMORY_TYPE_CUDA_MANAGED] = NULL;
180-
181190
}

src/tools/perf/perftest.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,8 @@ static int safe_recv(int sock, void *data, size_t size,
170170
ucs_status_t init_test_params(perftest_params_t *params)
171171
{
172172
static const struct sockaddr_storage empty_addr = {};
173+
static const ucx_perf_accel_dev_t default_dev =
174+
{UCS_MEMORY_TYPE_LAST, UCX_PERF_MEM_DEV_DEFAULT};
173175

174176
memset(params, 0, sizeof(*params));
175177
params->super.api = UCX_PERF_API_LAST;
@@ -193,6 +195,8 @@ ucs_status_t init_test_params(perftest_params_t *params)
193195
params->super.uct.am_hdr_size = 8;
194196
params->super.send_mem_type = UCS_MEMORY_TYPE_HOST;
195197
params->super.recv_mem_type = UCS_MEMORY_TYPE_HOST;
198+
params->super.send_device = default_dev;
199+
params->super.recv_device = default_dev;
196200
params->super.msg_size_cnt = 1;
197201
params->super.iov_stride = 0;
198202
params->super.ucp.send_datatype = UCP_PERF_DATATYPE_CONTIG;

src/tools/perf/perftest.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
#endif
2020

2121
#define TL_RESOURCE_NAME_NONE "<none>"
22-
#define TEST_PARAMS_ARGS "t:n:s:W:O:w:D:i:H:oSCIqM:r:E:T:d:x:A:BUem:R:lyz"
22+
#define TEST_PARAMS_ARGS "t:n:s:W:O:w:D:i:H:oSCIqM:r:E:T:d:x:A:BUem:a:R:lyz"
2323
#define TEST_ID_UNDEFINED -1
2424

2525
#define DEFAULT_DAEMON_PORT 1338

src/tools/perf/perftest_params.c

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ static void usage(const struct perftest_context *ctx, const char *program)
6767
api_names[test->api], test->desc);
6868
}
6969
printf("\n");
70+
printf(" -a <send-device-type[:dev-id]>[,<recv-device-type[:dev-id]>]\n");
71+
printf(" Accelerator device type and device id to use for running the test.\n");
72+
printf(" device id is optional, it corresponds to the index of\n");
73+
printf(" the device in the list of available devices\n");
7074
printf(" -s <size> list of scatter-gather sizes for single message (%zu)\n",
7175
ctx->params.super.msg_size_list[0]);
7276
printf(" for example: \"-s 16,48,8192,8192,14\"\n");
@@ -165,6 +169,26 @@ static void usage(const struct perftest_context *ctx, const char *program)
165169
printf("\n");
166170
}
167171

172+
static ucs_status_t parse_device_id(const char *opt_arg, int *device_id)
173+
{
174+
char *endptr;
175+
int parsed_device_id;
176+
177+
if (opt_arg == NULL) {
178+
ucs_error("device id string is NULL");
179+
return UCS_ERR_INVALID_PARAM;
180+
}
181+
182+
parsed_device_id = strtol(opt_arg, &endptr, 10);
183+
if ((endptr == opt_arg) || (*endptr != '\0') || (parsed_device_id < 0)) {
184+
ucs_error("Failed to parse device id: %s", opt_arg);
185+
return UCS_ERR_INVALID_PARAM;
186+
}
187+
188+
*device_id = parsed_device_id;
189+
return UCS_OK;
190+
}
191+
168192
static ucs_status_t parse_mem_type(const char *opt_arg,
169193
ucs_memory_type_t *mem_type)
170194
{
@@ -186,6 +210,42 @@ static ucs_status_t parse_mem_type(const char *opt_arg,
186210
return UCS_ERR_INVALID_PARAM;
187211
}
188212

213+
static ucs_status_t
214+
parse_accel_device(char *opt_arg, ucx_perf_accel_dev_t *dev)
215+
{
216+
const char *delim = ":";
217+
char *saveptr = NULL;
218+
char *token;
219+
ucs_status_t status;
220+
ucs_memory_type_t mem_type;
221+
int device_id;
222+
223+
if (opt_arg == NULL) {
224+
ucs_error("mem type param is NULL");
225+
return UCS_ERR_INVALID_PARAM;
226+
}
227+
228+
token = strtok_r(opt_arg, delim, &saveptr);
229+
status = parse_mem_type(token, &mem_type);
230+
if (status != UCS_OK) {
231+
return status;
232+
}
233+
234+
token = strtok_r(NULL, delim, &saveptr);
235+
if (NULL == token) {
236+
device_id = UCX_PERF_MEM_DEV_DEFAULT;
237+
} else {
238+
status = parse_device_id(token, &device_id);
239+
if (status != UCS_OK) {
240+
return status;
241+
}
242+
}
243+
244+
dev->mem_type = mem_type;
245+
dev->device_id = device_id;
246+
return UCS_OK;
247+
}
248+
189249
static ucs_status_t parse_mem_type_params(const char *opt_arg,
190250
ucs_memory_type_t *send_mem_type,
191251
ucs_memory_type_t *recv_mem_type)
@@ -208,6 +268,45 @@ static ucs_status_t parse_mem_type_params(const char *opt_arg,
208268
}
209269
}
210270

271+
static ucs_status_t parse_accel_device_params(const char *opt_arg,
272+
ucx_perf_accel_dev_t *send_device,
273+
ucx_perf_accel_dev_t *recv_device)
274+
{
275+
const char *delim = ",";
276+
char *saveptr = NULL;
277+
char *token, *arg;
278+
ucs_status_t status;
279+
280+
arg = ucs_alloca(strlen(opt_arg) + 1);
281+
strcpy(arg, opt_arg);
282+
token = strtok_r(arg, delim, &saveptr);
283+
status = parse_accel_device(token, send_device);
284+
if (status != UCS_OK) {
285+
return status;
286+
}
287+
288+
token = strtok_r(NULL, delim, &saveptr);
289+
if (NULL == token) {
290+
*recv_device = *send_device;
291+
return UCS_OK;
292+
}
293+
294+
status = parse_accel_device(token, recv_device);
295+
if (status != UCS_OK) {
296+
return status;
297+
}
298+
299+
if (send_device->mem_type == recv_device->mem_type) {
300+
if (send_device->device_id == UCX_PERF_MEM_DEV_DEFAULT) {
301+
send_device->device_id = recv_device->device_id;
302+
} else if (recv_device->device_id == UCX_PERF_MEM_DEV_DEFAULT) {
303+
recv_device->device_id = send_device->device_id;
304+
}
305+
}
306+
307+
return UCS_OK;
308+
}
309+
211310
static ucs_status_t parse_message_sizes_params(const char *opt_arg,
212311
ucx_perf_params_t *params)
213312
{
@@ -504,6 +603,13 @@ ucs_status_t parse_test_params(perftest_params_t *params, char opt,
504603
return UCS_ERR_INVALID_PARAM;
505604
}
506605
return UCS_OK;
606+
case 'a':
607+
if (UCS_OK != parse_accel_device_params(opt_arg,
608+
&params->super.send_device,
609+
&params->super.recv_device)) {
610+
return UCS_ERR_INVALID_PARAM;
611+
}
612+
return UCS_OK;
507613
case 'y':
508614
params->super.flags |= UCX_PERF_TEST_FLAG_AM_RECV_COPY;
509615
return UCS_OK;

src/tools/perf/perftest_run.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,21 @@ void print_progress(void *UCS_V_UNUSED rte_group,
104104
fflush(stdout);
105105
}
106106

107+
static void
108+
get_accel_device_str(const ucx_perf_accel_dev_t *dev, char *str, size_t size)
109+
{
110+
ucs_snprintf_safe(str, size, "%s:%d", ucs_memory_type_names[dev->mem_type],
111+
dev->device_id);
112+
}
113+
107114
static void print_header(struct perftest_context *ctx)
108115
{
109116
const char *overhead_lat_str;
110117
const char *test_data_str;
111118
const char *test_api_str;
112119
test_type_t *test;
113120
unsigned i;
121+
char mem_dev_str[16];
114122

115123
test = (ctx->params.test_id == TEST_ID_UNDEFINED) ? NULL :
116124
&tests[ctx->params.test_id];
@@ -148,6 +156,14 @@ static void print_header(struct perftest_context *ctx)
148156
printf("| Data layout: %-60s |\n", test_data_str);
149157
printf("| Send memory: %-60s |\n", ucs_memory_type_names[ctx->params.super.send_mem_type]);
150158
printf("| Recv memory: %-60s |\n", ucs_memory_type_names[ctx->params.super.recv_mem_type]);
159+
if (ctx->params.super.send_device.mem_type != UCS_MEMORY_TYPE_LAST) {
160+
get_accel_device_str(&ctx->params.super.send_device, mem_dev_str, sizeof(mem_dev_str));
161+
printf("| Send device: %-60s |\n", mem_dev_str);
162+
}
163+
if (ctx->params.super.recv_device.mem_type != UCS_MEMORY_TYPE_LAST) {
164+
get_accel_device_str(&ctx->params.super.recv_device, mem_dev_str, sizeof(mem_dev_str));
165+
printf("| Recv device: %-60s |\n", mem_dev_str);
166+
}
151167
printf("| Message size: %-60zu |\n", ucx_perf_get_message_size(&ctx->params.super));
152168
printf("| Window size: %-60u |\n", ctx->params.super.max_outstanding);
153169

test/gtest/common/test_perf.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,8 @@ void test_perf::test_params_init(const test_spec &test,
214214
params.max_outstanding = test.max_outstanding;
215215
params.send_mem_type = test.send_mem_type;
216216
params.recv_mem_type = test.recv_mem_type;
217+
params.send_device = {UCS_MEMORY_TYPE_LAST, UCX_PERF_MEM_DEV_DEFAULT};
218+
params.recv_device = {UCS_MEMORY_TYPE_LAST, UCX_PERF_MEM_DEV_DEFAULT};
217219
params.percentile_rank = 50.0;
218220

219221
memset(params.uct.md_name, 0, sizeof(params.uct.md_name));

test/gtest/ucp/cuda/test_kernels.cu

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,18 @@ int memcmp(const void *s1, const void *s2, size_t size)
3636
}
3737

3838
if (cudaHostGetDevicePointer(&d_result, h_result, 0) != cudaSuccess) {
39-
result = 1;
39+
result = -1;
4040
goto out;
4141
}
4242

4343
*h_result = 0;
4444
memcmp_kernel<<<16, 64>>>(s1, s2, d_result, size);
45-
cudaDeviceSynchronize();
45+
46+
if (cudaDeviceSynchronize() != cudaSuccess) {
47+
result = -1;
48+
goto out;
49+
}
50+
4651
result = *h_result;
4752

4853
out:

0 commit comments

Comments
 (0)