Skip to content

Commit 9455002

Browse files
UCT/CUDA_IPC/TEST: change indices handling in put partial (#10924)
1 parent d2ee3c4 commit 9455002

File tree

3 files changed: 23 additions (+23) and 38 deletions (-38).

3 files changed

+23
-38
lines changed

src/uct/cuda/cuda_ipc/cuda_ipc.cuh

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -359,21 +359,17 @@ uct_cuda_ipc_ep_put_multi_partial(uct_device_ep_h device_ep,
359359
unsigned int lane_id, num_lanes;
360360

361361
uct_cuda_ipc_get_lane<level>(lane_id, num_lanes);
362-
for (int i = 0, j = 0; i < mem_list_count; i++) {
363-
if (i == counter_index) {
364-
continue;
365-
}
362+
for (int i = 0; i < mem_list_count; i++) {
366363
auto cuda_ipc_mem_element = reinterpret_cast<const uct_cuda_ipc_device_mem_element_t *>(
367364
UCS_PTR_BYTE_OFFSET(mem_list, sizeof(uct_cuda_ipc_device_mem_element_t) * mem_list_indices[i]));
368-
auto mapped_rem_addr = uct_cuda_ipc_map_remote(cuda_ipc_mem_element, remote_addresses[j]);
369-
uct_cuda_ipc_copy_level<level>(mapped_rem_addr, addresses[j], lengths[j]);
370-
j++;
365+
auto mapped_rem_addr = uct_cuda_ipc_map_remote(cuda_ipc_mem_element, remote_addresses[i]);
366+
uct_cuda_ipc_copy_level<level>(mapped_rem_addr, addresses[i], lengths[i]);
371367
}
372368

373369

374370
if ((counter_remote_address != 0) && (lane_id == 0)) {
375371
auto cuda_ipc_mem_element = reinterpret_cast<const uct_cuda_ipc_device_mem_element_t *>(
376-
UCS_PTR_BYTE_OFFSET(mem_list, sizeof(uct_cuda_ipc_device_mem_element_t) * mem_list_indices[counter_index]));
372+
UCS_PTR_BYTE_OFFSET(mem_list, sizeof(uct_cuda_ipc_device_mem_element_t) * counter_index));
377373
auto mapped_counter_rem_addr = reinterpret_cast<uint64_t *>(uct_cuda_ipc_map_remote(cuda_ipc_mem_element,
378374
counter_remote_address));
379375
uct_cuda_ipc_atomic_inc(mapped_counter_rem_addr, counter_inc_value);

src/uct/cuda/cuda_ipc/cuda_ipc_ep.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ ucs_status_t uct_cuda_ipc_ep_get_device_ep(uct_ep_h tl_ep,
266266
*device_ep_p = ep->device_ep;
267267
return UCS_OK;
268268
err_free_mem:
269-
cuMemFree((CUdeviceptr)&ep->device_ep);
269+
cuMemFree((CUdeviceptr)ep->device_ep);
270270
ep->device_ep = NULL;
271271
err:
272272
return status;

test/gtest/uct/cuda/test_cuda_ipc_device.cc

Lines changed: 18 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -361,36 +361,25 @@ UCS_TEST_P(test_cuda_ipc_rma_device, put_multi_partial_device)
361361
ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&remote_addresses_dev, iovcnt * sizeof(uint64_t)));
362362
ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&lengths_dev, iovcnt * sizeof(size_t)));
363363
ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&addresses_dev, iovcnt * sizeof(void *)));
364-
ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&mem_list_indices_dev, (iovcnt + 1) * sizeof(unsigned)));
364+
ASSERT_EQ(CUDA_SUCCESS, cuMemAlloc((CUdeviceptr *)&mem_list_indices_dev, iovcnt * sizeof(unsigned)));
365365

366-
367-
for (int i = 0, j = 0; i < iovcnt + 1; i++) {
368-
if (i == counter_index) {
369-
continue;
370-
}
371-
mem_list_indices[i] = j++;
366+
/* Fill indices and pack PUT entries */
367+
for (int i = 0; i < iovcnt; i++) {
368+
unsigned idx = (i < counter_index) ? i : (i + 1);
369+
mem_list_indices[i] = idx;
370+
uct_device_mem_element_t *mem_elem_iov =
371+
(uct_device_mem_element_t*)UCS_PTR_BYTE_OFFSET(mem_elem,
372+
mem_elem_size * idx);
373+
ASSERT_UCS_OK(uct_iface_mem_element_pack(m_sender->iface(), sendbuf.memh(),
374+
recvbuf.rkey(), mem_elem_iov));
372375
}
373-
mem_list_indices[counter_index] = iovcnt;
374-
375-
for (int i = 0; i < iovcnt + 1; i++) {
376-
uct_rkey_t rkey;
377-
uct_mem_h memh;
378-
uct_device_mem_element_t *mem_elem_iov;
379-
380-
if (i == counter_index) {
381-
rkey = signal.rkey();
382-
memh = nullptr;
383-
} else {
384-
rkey = recvbuf.rkey();
385-
memh = sendbuf.memh();
386-
}
387-
388-
mem_elem_iov = (uct_device_mem_element_t*)UCS_PTR_BYTE_OFFSET(mem_elem,
389-
mem_elem_size * mem_list_indices[i]);
390376

391-
ASSERT_UCS_OK(uct_iface_mem_element_pack(m_sender->iface(), memh, rkey,
392-
mem_elem_iov));
393-
}
377+
/* Pack counter entry directly at mem_list[counter_index] */
378+
uct_device_mem_element_t *mem_elem_counter =
379+
(uct_device_mem_element_t*)UCS_PTR_BYTE_OFFSET(mem_elem,
380+
mem_elem_size * counter_index);
381+
ASSERT_UCS_OK(uct_iface_mem_element_pack(m_sender->iface(), nullptr,
382+
signal.rkey(), mem_elem_counter));
394383

395384
for (int i = 0; i < iovcnt; i++) {
396385
size_t iov_offset = (base_length + offset) * i;
@@ -406,13 +395,13 @@ UCS_TEST_P(test_cuda_ipc_rma_device, put_multi_partial_device)
406395
ASSERT_EQ(CUDA_SUCCESS, cuMemcpyHtoD((CUdeviceptr)addresses_dev, addresses,
407396
iovcnt * sizeof(void*)));
408397
ASSERT_EQ(CUDA_SUCCESS, cuMemcpyHtoD((CUdeviceptr)mem_list_indices_dev, mem_list_indices,
409-
(iovcnt + 1) * sizeof(unsigned)));
398+
iovcnt * sizeof(unsigned)));
410399
for (int i = 0; i < iovcnt; i++) {
411400
mem_buffer::pattern_fill(addresses[i], base_length, SEED1, UCS_MEMORY_TYPE_CUDA);
412401
}
413402

414403
cuda_uct::launch_uct_put_multi_partial(device_ep, mem_elem, mem_list_indices_dev,
415-
iovcnt + 1, addresses_dev,
404+
iovcnt, addresses_dev,
416405
remote_addresses_dev, lengths_dev,
417406
counter_index, signal_val, (uint64_t)signal.ptr(),
418407
device_level, num_threads, num_blocks);

0 commit comments

Comments
 (0)