Skip to content

Commit 035793c

Browse files
UCT/CUDA_IPC: implement device api (#10839)
1 parent c84ccde commit 035793c

File tree

14 files changed

+773
-24
lines changed

14 files changed

+773
-24
lines changed

src/uct/api/device/uct_device_impl.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include <uct/api/uct_def.h>
1313
#include <ucs/sys/compiler_def.h>
14+
#include <uct/cuda/cuda_ipc/cuda_ipc.cuh>
1415

1516
#include <uct/ib/mlx5/gdaki/gdaki.cuh>
1617

@@ -49,8 +50,9 @@ UCS_F_DEVICE ucs_status_t uct_device_ep_put_single(
4950
address, remote_address,
5051
length, flags, comp);
5152
} else if (device_ep->uct_tl_id == UCT_DEVICE_TL_CUDA_IPC) {
52-
// return uct_cuda_ipc_ep_put_single(device_ep, mem_elem, address,
53-
// remote_address, length, flags, comp);
53+
return uct_cuda_ipc_ep_put_single<level>(device_ep, mem_elem, address,
54+
remote_address, length, flags,
55+
comp);
5456
}
5557
return UCS_ERR_UNSUPPORTED;
5658
}

src/uct/cuda/cuda_ipc/cuda_ipc.cuh

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
/**
2+
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
3+
*
4+
* See file LICENSE for terms.
5+
*/
6+
7+
#ifndef UCT_CUDA_IPC_CUH
8+
#define UCT_CUDA_IPC_CUH
9+
10+
#include "ucs/type/status.h"
11+
#include "uct/api/uct_def.h"
12+
#ifdef HAVE_CONFIG_H
13+
#include "config.h"
14+
#endif
15+
16+
#include <uct/api/device/uct_device_types.h>
17+
#include <uct/cuda/cuda_ipc/cuda_ipc_device.h>
18+
19+
#define UCT_CUDA_IPC_IS_ALIGNED_POW2(_n, _p) (!((_n) & ((_p) - 1)))
20+
#define UCT_CUDA_IPC_WARP_SIZE 32
21+
#define UCT_CUDA_IPC_COPY_LOOP_UNROLL 8
22+
23+
UCS_F_DEVICE int4 uct_cuda_ipc_ld_global_cg(const int4* p) {
24+
int4 v;
25+
asm volatile ("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];"
26+
: "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w)
27+
: "l"(p));
28+
return v;
29+
}
30+
31+
UCS_F_DEVICE void uct_cuda_ipc_st_global_cg(int4* p, const int4& v) {
32+
asm volatile ("st.global.cg.v4.s32 [%0], {%1,%2,%3,%4};"
33+
:
34+
: "l"(p), "r"(v.x), "r"(v.y), "r"(v.z), "r"(v.w));
35+
}
36+
37+
UCS_F_DEVICE int2 uct_cuda_ipc_ld_global_cg(const int2* p) {
38+
int2 v;
39+
asm volatile ("ld.global.cg.v2.s32 {%0,%1}, [%2];"
40+
: "=r"(v.x), "=r"(v.y)
41+
: "l"(p));
42+
return v;
43+
}
44+
45+
UCS_F_DEVICE void uct_cuda_ipc_st_global_cg(int2* p, const int2& v) {
46+
asm volatile ("st.global.cg.v2.s32 [%0], {%1,%2};"
47+
:
48+
: "l"(p), "r"(v.x), "r"(v.y));
49+
}
50+
51+
UCS_F_DEVICE void uct_cuda_ipc_copy_single_nv(void *dst, const void *src, size_t size)
52+
{
53+
/* TODO: add vectorized version*/
54+
auto s1 = reinterpret_cast<const char*>(src);
55+
auto d1 = reinterpret_cast<char *>(dst);
56+
57+
for (size_t i = threadIdx.x; i < size; i += blockDim.x) {
58+
d1[i] = s1[i];
59+
}
60+
}
61+
62+
template<int UNROLL>
63+
UCS_F_DEVICE void uct_cuda_ipc_copy_single(void *dst, const void *src, size_t size)
64+
{
65+
using vec4 = int4;
66+
using vec2 = int2;
67+
auto s1 = reinterpret_cast<const char*>(src);
68+
auto d1 = reinterpret_cast<char *>(dst);
69+
const vec4 *s4;
70+
vec4 *d4;
71+
int warp, num_warps, idx;
72+
size_t num_lines;
73+
74+
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec4)) &&
75+
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec4))) {
76+
vec4 tmp[UNROLL];
77+
warp = threadIdx.x / UCT_CUDA_IPC_WARP_SIZE;
78+
num_warps = blockDim.x / UCT_CUDA_IPC_WARP_SIZE;
79+
idx = threadIdx.x % UCT_CUDA_IPC_WARP_SIZE;
80+
s4 = reinterpret_cast<const vec4*>(s1);
81+
d4 = reinterpret_cast<vec4*>(d1);
82+
num_lines = (size / (UCT_CUDA_IPC_WARP_SIZE * UNROLL * sizeof(vec4))) *
83+
(UCT_CUDA_IPC_WARP_SIZE * UNROLL);
84+
85+
for (size_t line = warp * UCT_CUDA_IPC_WARP_SIZE * UNROLL + idx; line < num_lines;
86+
line += num_warps * UCT_CUDA_IPC_WARP_SIZE * UNROLL) {
87+
#pragma unroll
88+
for (int i = 0; i < UNROLL; i++) {
89+
tmp[i] = uct_cuda_ipc_ld_global_cg(s4 + (line + UCT_CUDA_IPC_WARP_SIZE * i));
90+
}
91+
92+
#pragma unroll
93+
for (int i = 0; i < UNROLL; i++) {
94+
uct_cuda_ipc_st_global_cg(d4 + (line + UCT_CUDA_IPC_WARP_SIZE * i), tmp[i]);
95+
}
96+
}
97+
size = size - num_lines * sizeof(vec4);
98+
if (size == 0) {
99+
return;
100+
}
101+
102+
s4 = s4 + num_lines;
103+
d4 = d4 + num_lines;
104+
num_lines = size / sizeof(vec4);
105+
for (size_t line = threadIdx.x; line < num_lines; line += blockDim.x) {
106+
vec4 v = uct_cuda_ipc_ld_global_cg(s4 + line);
107+
uct_cuda_ipc_st_global_cg(d4 + line, v);
108+
}
109+
110+
size = size - num_lines * sizeof(vec4);
111+
if (size == 0) {
112+
return;
113+
}
114+
115+
s1 = reinterpret_cast<const char*>(s4 + num_lines);
116+
d1 = reinterpret_cast<char*>(d4 + num_lines);
117+
}
118+
119+
/* If not 16B-aligned, try 8B-aligned fast path using vec2 */
120+
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec2)) &&
121+
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec2))) {
122+
const vec2 *s2;
123+
vec2 *d2;
124+
vec2 tmp2[UNROLL];
125+
126+
warp = threadIdx.x / UCT_CUDA_IPC_WARP_SIZE;
127+
num_warps = blockDim.x / UCT_CUDA_IPC_WARP_SIZE;
128+
idx = threadIdx.x % UCT_CUDA_IPC_WARP_SIZE;
129+
s2 = reinterpret_cast<const vec2*>(s1);
130+
d2 = reinterpret_cast<vec2*>(d1);
131+
num_lines = (size / (UCT_CUDA_IPC_WARP_SIZE * UNROLL * sizeof(vec2))) *
132+
(UCT_CUDA_IPC_WARP_SIZE * UNROLL);
133+
134+
for (size_t line = warp * UCT_CUDA_IPC_WARP_SIZE * UNROLL + idx; line < num_lines;
135+
line += num_warps * UCT_CUDA_IPC_WARP_SIZE * UNROLL) {
136+
#pragma unroll
137+
for (int i = 0; i < UNROLL; i++) {
138+
tmp2[i] = uct_cuda_ipc_ld_global_cg(s2 + (line + UCT_CUDA_IPC_WARP_SIZE * i));
139+
}
140+
141+
#pragma unroll
142+
for (int i = 0; i < UNROLL; i++) {
143+
uct_cuda_ipc_st_global_cg(d2 + (line + UCT_CUDA_IPC_WARP_SIZE * i), tmp2[i]);
144+
}
145+
}
146+
147+
size = size - num_lines * sizeof(vec2);
148+
if (size == 0) {
149+
return;
150+
}
151+
152+
s2 = s2 + num_lines;
153+
d2 = d2 + num_lines;
154+
num_lines = size / sizeof(vec2);
155+
for (size_t line = threadIdx.x; line < num_lines; line += blockDim.x) {
156+
vec2 v2 = uct_cuda_ipc_ld_global_cg(s2 + line);
157+
uct_cuda_ipc_st_global_cg(d2 + line, v2);
158+
}
159+
160+
size = size - num_lines * sizeof(vec2);
161+
if (size == 0) {
162+
return;
163+
}
164+
165+
s1 = reinterpret_cast<const char*>(s2 + num_lines);
166+
d1 = reinterpret_cast<char*>(d2 + num_lines);
167+
}
168+
169+
for (size_t line = threadIdx.x; line < size; line += blockDim.x) {
170+
d1[line] = s1[line];
171+
}
172+
}
173+
174+
template<uct_device_level_t level = UCT_DEVICE_LEVEL_BLOCK>
175+
UCS_F_DEVICE ucs_status_t
176+
uct_cuda_ipc_ep_put_single(uct_device_ep_h device_ep,
177+
const uct_device_mem_element_t *mem_elem,
178+
const void *address, uint64_t remote_address,
179+
size_t length, uint64_t flags,
180+
uct_device_completion_t *comp)
181+
{
182+
auto cuda_ipc_mem_element =
183+
reinterpret_cast<const uct_cuda_ipc_device_mem_element_t *>(mem_elem);
184+
void *mapped_rem_addr;
185+
186+
mapped_rem_addr = (void *)((uintptr_t)(remote_address) + cuda_ipc_mem_element->mapped_offset);
187+
188+
switch (level) {
189+
case UCT_DEVICE_LEVEL_THREAD:
190+
/* TODO: add vectorized version*/
191+
memcpy(mapped_rem_addr, address, length);
192+
break;
193+
case UCT_DEVICE_LEVEL_WARP:
194+
/* TODO: check if we can use uct_cuda_ipc_copy_single, need to see perf impact */
195+
uct_cuda_ipc_copy_single_nv(mapped_rem_addr, address, length);
196+
break;
197+
case UCT_DEVICE_LEVEL_BLOCK:
198+
uct_cuda_ipc_copy_single<UCT_CUDA_IPC_COPY_LOOP_UNROLL>(mapped_rem_addr, address, length);
199+
break;
200+
case UCT_DEVICE_LEVEL_GRID:
201+
return UCS_ERR_UNSUPPORTED;
202+
default:
203+
return UCS_ERR_INVALID_PARAM;
204+
}
205+
206+
__syncthreads();
207+
if (threadIdx.x == 0) {
208+
comp->count = 0;
209+
}
210+
return UCS_OK;
211+
}
212+
213+
#endif /* UCT_CUDA_IPC_CUH */

src/uct/cuda/cuda_ipc/cuda_ipc_cache.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,13 @@ ucs_status_t uct_cuda_ipc_unmap_memhandle(pid_t pid, uintptr_t d_bptr,
494494
ucs_pgt_region_t *pgt_region;
495495
uct_cuda_ipc_cache_region_t *region;
496496

497+
/* checking if the mapped address is the same as the d_bptr
498+
* this is true for the case of single process memory mapping
499+
* see uct_cuda_ipc_map_memhandle for more details */
500+
if (d_bptr == (uintptr_t)mapped_addr) {
501+
return UCS_OK;
502+
}
503+
497504
status = uct_cuda_ipc_get_remote_cache(pid, cu_dev, &cache);
498505
if (status != UCS_OK) {
499506
return status;
@@ -535,8 +542,23 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle,
535542
ucs_status_t status;
536543
ucs_pgt_region_t *pgt_region;
537544
uct_cuda_ipc_cache_region_t *region;
545+
CUuuid uuid;
538546
int ret;
539547

548+
status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGetUuid(&uuid, cu_dev));
549+
if (status != UCS_OK) {
550+
return status;
551+
}
552+
553+
if ((getpid() == key->pid) &&
554+
(memcmp(uuid.bytes, key->uuid.bytes, sizeof(uuid.bytes)) == 0)) { /* TODO: added for test purpose to enable cuda_ipc tests in gtest
555+
* mapped addrr is set to be same as d_bptr avoiding any calls to
556+
* uct_cuda_ipc_open_memhandle which would fail with invalid argument
557+
* error*/
558+
*mapped_addr = (CUdeviceptr*)key->d_bptr;
559+
return UCS_OK;
560+
}
561+
540562
status = uct_cuda_ipc_get_remote_cache(key->pid, cu_dev, &cache);
541563
if (status != UCS_OK) {
542564
return status;
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/**
2+
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
3+
* See file LICENSE for terms.
4+
*/
5+
6+
#ifndef UCT_CUDA_IPC_EP_DEVICE_H
7+
#define UCT_CUDA_IPC_EP_DEVICE_H
8+
9+
#include <stddef.h>
10+
11+
typedef struct {
12+
ptrdiff_t mapped_offset;
13+
} uct_cuda_ipc_device_mem_element_t;
14+
15+
#endif

src/uct/cuda/cuda_ipc/cuda_ipc_ep.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
* See file LICENSE for terms.
44
*/
55

6+
#include "base/cuda_iface.h"
7+
#include "uct/api/uct_def.h"
8+
#include "uct/api/device/uct_device_types.h"
69
#ifdef HAVE_CONFIG_H
710
# include "config.h"
811
#endif
@@ -32,11 +35,15 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_ep_t, const uct_ep_params_t *params)
3235
UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super);
3336

3437
self->remote_pid = *(const pid_t*)params->iface_addr;
38+
self->device_ep = NULL;
3539
return UCS_OK;
3640
}
3741

3842
static UCS_CLASS_CLEANUP_FUNC(uct_cuda_ipc_ep_t)
3943
{
44+
if (self->device_ep != NULL) {
45+
(void)UCT_CUDADRV_FUNC_LOG_WARN(cuMemFree((CUdeviceptr)self->device_ep));
46+
}
4047
}
4148

4249
UCS_CLASS_DEFINE(uct_cuda_ipc_ep_t, uct_base_ep_t)
@@ -229,3 +236,37 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_ep_put_zcopy,
229236
uct_iov_total_length(iov, iovcnt));
230237
return status;
231238
}
239+
240+
ucs_status_t uct_cuda_ipc_ep_get_device_ep(uct_ep_h tl_ep,
241+
uct_device_ep_h *device_ep_p)
242+
{
243+
uct_cuda_ipc_ep_t *ep = ucs_derived_of(tl_ep, uct_cuda_ipc_ep_t);
244+
uct_device_ep_t device_ep;
245+
ucs_status_t status;
246+
247+
if (ep->device_ep != NULL) {
248+
goto out;
249+
}
250+
251+
device_ep.uct_tl_id = UCT_DEVICE_TL_CUDA_IPC;
252+
status = UCT_CUDADRV_FUNC_LOG_ERR(
253+
cuMemAlloc((CUdeviceptr *)&ep->device_ep, sizeof(uct_device_ep_t)));
254+
if (status != UCS_OK) {
255+
goto err;
256+
}
257+
258+
status = UCT_CUDADRV_FUNC_LOG_ERR(
259+
cuMemcpyHtoD((CUdeviceptr)ep->device_ep, &device_ep, sizeof(uct_device_ep_t)));
260+
if (status != UCS_OK) {
261+
goto err_free_mem;
262+
}
263+
264+
out:
265+
*device_ep_p = ep->device_ep;
266+
return UCS_OK;
267+
err_free_mem:
268+
cuMemFree((CUdeviceptr)&ep->device_ep);
269+
ep->device_ep = NULL;
270+
err:
271+
return status;
272+
}

src/uct/cuda/cuda_ipc/cuda_ipc_ep.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
typedef struct uct_cuda_ipc_ep {
1515
uct_base_ep_t super;
1616
pid_t remote_pid;
17+
uct_device_ep_h device_ep;
1718
} uct_cuda_ipc_ep_t;
1819

1920

@@ -33,4 +34,7 @@ ucs_status_t uct_cuda_ipc_ep_put_zcopy(uct_ep_h tl_ep,
3334
int uct_cuda_ipc_ep_is_connected(const uct_ep_h tl_ep,
3435
const uct_ep_is_connected_params_t *params);
3536

37+
ucs_status_t uct_cuda_ipc_ep_get_device_ep(uct_ep_h tl_ep,
38+
uct_device_ep_h *device_ep_p);
39+
3640
#endif

0 commit comments

Comments
 (0)