@@ -361,36 +361,25 @@ UCS_TEST_P(test_cuda_ipc_rma_device, put_multi_partial_device)
361361 ASSERT_EQ (CUDA_SUCCESS, cuMemAlloc ((CUdeviceptr *)&remote_addresses_dev, iovcnt * sizeof (uint64_t )));
362362 ASSERT_EQ (CUDA_SUCCESS, cuMemAlloc ((CUdeviceptr *)&lengths_dev, iovcnt * sizeof (size_t )));
363363 ASSERT_EQ (CUDA_SUCCESS, cuMemAlloc ((CUdeviceptr *)&addresses_dev, iovcnt * sizeof (void *)));
364- ASSERT_EQ (CUDA_SUCCESS, cuMemAlloc ((CUdeviceptr *)&mem_list_indices_dev, ( iovcnt + 1 ) * sizeof (unsigned )));
364+ ASSERT_EQ (CUDA_SUCCESS, cuMemAlloc ((CUdeviceptr *)&mem_list_indices_dev, iovcnt * sizeof (unsigned )));
365365
366-
367- for (int i = 0 , j = 0 ; i < iovcnt + 1 ; i++) {
368- if (i == counter_index) {
369- continue ;
370- }
371- mem_list_indices[i] = j++;
366+ /* Fill indices and pack PUT entries */
367+ for (int i = 0 ; i < iovcnt; i++) {
368+ unsigned idx = (i < counter_index) ? i : (i + 1 );
369+ mem_list_indices[i] = idx;
370+ uct_device_mem_element_t *mem_elem_iov =
371+ (uct_device_mem_element_t *)UCS_PTR_BYTE_OFFSET (mem_elem,
372+ mem_elem_size * idx);
373+ ASSERT_UCS_OK (uct_iface_mem_element_pack (m_sender->iface (), sendbuf.memh (),
374+ recvbuf.rkey (), mem_elem_iov));
372375 }
373- mem_list_indices[counter_index] = iovcnt;
374-
375- for (int i = 0 ; i < iovcnt + 1 ; i++) {
376- uct_rkey_t rkey;
377- uct_mem_h memh;
378- uct_device_mem_element_t *mem_elem_iov;
379-
380- if (i == counter_index) {
381- rkey = signal.rkey ();
382- memh = nullptr ;
383- } else {
384- rkey = recvbuf.rkey ();
385- memh = sendbuf.memh ();
386- }
387-
388- mem_elem_iov = (uct_device_mem_element_t *)UCS_PTR_BYTE_OFFSET (mem_elem,
389- mem_elem_size * mem_list_indices[i]);
390376
391- ASSERT_UCS_OK (uct_iface_mem_element_pack (m_sender->iface (), memh, rkey,
392- mem_elem_iov));
393- }
377+ /* Pack counter entry directly at mem_list[counter_index] */
378+ uct_device_mem_element_t *mem_elem_counter =
379+ (uct_device_mem_element_t *)UCS_PTR_BYTE_OFFSET (mem_elem,
380+ mem_elem_size * counter_index);
381+ ASSERT_UCS_OK (uct_iface_mem_element_pack (m_sender->iface (), nullptr ,
382+ signal.rkey (), mem_elem_counter));
394383
395384 for (int i = 0 ; i < iovcnt; i++) {
396385 size_t iov_offset = (base_length + offset) * i;
@@ -406,13 +395,13 @@ UCS_TEST_P(test_cuda_ipc_rma_device, put_multi_partial_device)
406395 ASSERT_EQ (CUDA_SUCCESS, cuMemcpyHtoD ((CUdeviceptr)addresses_dev, addresses,
407396 iovcnt * sizeof (void *)));
408397 ASSERT_EQ (CUDA_SUCCESS, cuMemcpyHtoD ((CUdeviceptr)mem_list_indices_dev, mem_list_indices,
409- ( iovcnt + 1 ) * sizeof (unsigned )));
398+ iovcnt * sizeof (unsigned )));
410399 for (int i = 0 ; i < iovcnt; i++) {
411400 mem_buffer::pattern_fill (addresses[i], base_length, SEED1, UCS_MEMORY_TYPE_CUDA);
412401 }
413402
414403 cuda_uct::launch_uct_put_multi_partial (device_ep, mem_elem, mem_list_indices_dev,
415- iovcnt + 1 , addresses_dev,
404+ iovcnt, addresses_dev,
416405 remote_addresses_dev, lengths_dev,
417406 counter_index, signal_val, (uint64_t )signal.ptr (),
418407 device_level, num_threads, num_blocks);
0 commit comments