Skip to content

Commit db2c287

Browse files
authored
UCP/API: Add GPU device and host API (#10815)
1 parent 8649b04 commit db2c287

File tree

6 files changed

+476
-1
lines changed

6 files changed

+476
-1
lines changed

contrib/check_inst_headers.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,13 @@ do
2727
continue
2828
fi
2929

30+
# device files should be ignored for now
31+
if test "$hfile" != "${hfile#ucp/api/device/}"
32+
then
33+
echo "SKIPPED $hfile (device compiler)"
34+
continue
35+
fi
36+
3037
# try to compile a test program (from stdin) which includes hfile
3138
for compile in "${CC} -Werror=strict-prototypes -x c" "${CXX} -x c++"
3239
do

src/ucp/Makefile.am

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@ nobase_dist_libucp_la_HEADERS = \
1818
api/ucp_compat.h \
1919
api/ucp_def.h \
2020
api/ucp_version.h \
21-
api/ucp.h
21+
api/ucp.h \
22+
api/device/ucp_device_impl.h \
23+
api/device/ucp_device_types.h \
24+
api/device/ucp_host.h
2225

2326
noinst_HEADERS = \
2427
am/eager.inl \
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
/**
2+
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
3+
*
4+
* See file LICENSE for terms.
5+
*/
6+
7+
#ifndef UCP_DEVICE_IMPL_H
8+
#define UCP_DEVICE_IMPL_H
9+
10+
#include "ucp_device_types.h"
11+
12+
#include <ucs/sys/compiler_def.h>
13+
#include <ucs/type/status.h>
14+
#include <stdint.h>
15+
16+
17+
/**
18+
* @ingroup UCP_DEVICE
19+
* @brief Posts one memory put operation.
20+
*
21+
* This device routine posts one put operation using a descriptor list handle.
22+
* The @a mem_list_index is used to point at the @a mem_list entry to be used
23+
* for the memory transfer. The addresses and length must be valid for the used
24+
* @a mem_list entry.
25+
*
26+
* The routine returns a request that can be progressed and checked for
27+
* completion with @ref ucp_device_progress_req.
28+
*
29+
* This routine can be called repeatedly with the same handle and different
30+
* addresses and length. The flags parameter can be used to modify the behavior
31+
* of the routine.
32+
*
33+
* @param [in] mem_list Memory descriptor list handle to use.
34+
* @param [in] mem_list_index Index in descriptor list pointing to the memory
35+
* registration keys to use for the transfer.
36+
* @param [in] address Local virtual address to send data from.
37+
* @param [in] remote_address Remote virtual address to send data to.
38+
* @param [in] length Length in bytes of the data to send.
39+
* @param [in] flags Flags usable to modify the function behavior.
40+
* @param [out] req Request populated by the call.
41+
*
42+
* @return Error code as defined by @ref ucs_status_t
43+
*/
44+
UCS_F_DEVICE ucs_status_t
45+
ucp_device_put_single(ucp_device_mem_list_handle_h mem_list,
46+
unsigned mem_list_index,
47+
const void *address, uint64_t remote_address,
48+
size_t length, uint64_t flags, ucp_device_request_t *req)
49+
{
50+
return UCS_ERR_NOT_IMPLEMENTED;
51+
}
52+
53+
54+
/**
55+
* @ingroup UCP_DEVICE
56+
* @brief Posts one memory increment operation.
57+
*
58+
* This device routine posts one increment operation using a memory descriptor
59+
* list handle. The @a mem_list_index is used to point at the @a mem_list
60+
* entry to be used for the increment operation. The remote address must be
61+
* valid for the used @a mem_list entry.
62+
*
63+
* The routine returns a request that can be progressed and checked for
64+
* completion with @ref ucp_device_progress_req.
65+
*
66+
* This routine can be called repeatedly with the same handle and different
67+
* address. The flags parameter can be used to modify the behavior of the
68+
* routine.
69+
*
70+
* @param [in] mem_list Memory descriptor list handle to use.
71+
* @param [in] mem_list_index Index in descriptor list pointing to the memory
72+
* remote key to use for the increment operation.
73+
* @param [in] inc_value Value used to increment the remote address.
74+
* @param [in] remote_address Remote virtual address to perform the increment
75+
* to.
76+
* @param [in] flags Flags usable to modify the function behavior.
77+
* @param [out] req Request populated by the call.
78+
*
79+
* @return Error code as defined by @ref ucs_status_t
80+
*/
81+
UCS_F_DEVICE ucs_status_t
82+
ucp_device_counter_inc(ucp_device_mem_list_handle_h mem_list,
83+
unsigned mem_list_index, uint64_t inc_value,
84+
uint64_t remote_address, uint64_t flags,
85+
ucp_device_request_t *req)
86+
{
87+
return UCS_ERR_NOT_IMPLEMENTED;
88+
}
89+
90+
91+
/**
92+
* @ingroup UCP_DEVICE
93+
* @brief Posts multiple put operations followed by one increment operation.
94+
*
95+
* This device routine posts a batch of put operations using the descriptor list
96+
* entries in the input handle, followed by an increment operation. This
97+
* operation can be polled on the receiver to detect completion of all the
98+
* operations of the batch, started during the same routine call.
99+
*
100+
* The content of each entry in the arrays @a addresses, @a remote_addresses
101+
* and @a lengths must be valid for each corresponding entry in the descriptor
102+
* list from the input handle. The last entry in the descriptor list contains
103+
* the remote memory registration descriptors to be used for the increment
104+
* operation.
105+
*
106+
* The size of the arrays @a addresses, @a remote_addresses, and @a lengths
107+
* are all equal to the size of the descriptor list array from the handle,
108+
* minus one.
109+
*
110+
* The routine returns a request that can be progressed and checked for
111+
* completion with @ref ucp_device_progress_req.
112+
*
113+
* This routine can be called repeatedly with the same handle and different
114+
* @a addresses, @a lengths and counter related parameters. The @a flags
115+
* parameter can be used to modify the behavior of the routine.
116+
*
117+
* @param [in] mem_list Memory descriptor list handle to use.
118+
* @param [in] addresses Array of local addresses to send from.
119+
* @param [in] remote_addresses Array of remote addresses to send to.
120+
* @param [in] lengths Array of lengths in bytes for each send.
121+
* @param [in] counter_inc_value Value of the remote increment.
122+
* @param [in] counter_remote_address Remote address to increment to.
123+
* @param [in] flags Flags to modify the function behavior.
124+
* @param [out] req Request populated by the call.
125+
*
126+
* @return Error code as defined by @ref ucs_status_t
127+
*/
128+
UCS_F_DEVICE ucs_status_t
129+
ucp_device_put_multi(ucp_device_mem_list_handle_h mem_list,
130+
void *const *addresses, const uint64_t *remote_addresses,
131+
const size_t *lengths, uint64_t counter_inc_value,
132+
uint64_t counter_remote_address, uint64_t flags,
133+
ucp_device_request_t *req)
134+
{
135+
return UCS_ERR_NOT_IMPLEMENTED;
136+
}
137+
138+
139+
/**
140+
* @ingroup UCP_DEVICE
141+
* @brief Posts few put operations followed by one atomic increment operation.
142+
*
143+
* This device routine posts a batch of put operations using only some of the
144+
* descriptor list entries in the input handle, followed by an increment
145+
* operation. This operation can be polled on the receiver to detect completion
146+
* of all operations of the batch, started during the same routine call.
147+
*
148+
* The set of indices from the descriptor list entries to use are to be passed
149+
* in the array @a mem_list_indices. The last entry of the descriptor list is to
150+
* be used for the final increment operation.
151+
*
152+
* The content of each entry in the arrays addresses, remote_addresses and
153+
* lengths must be valid for each corresponding descriptor list entry whose
154+
* index is referenced in @a mem_list_indices.
155+
*
156+
* The size of the arrays mem_list_indices, addresses, remote_addresses, and
157+
* lengths are all equal. They are lower than the size of the descriptor list
158+
* array from the handle.
159+
*
160+
* The routine returns a request that can be progressed and checked for
161+
* completion with @ref ucp_device_progress_req.
162+
*
163+
* This routine can be called repeatedly with the same handle and different
164+
* mem_list_indices, addresses, lengths and increment related parameters. The
165+
* flags parameter can be used to modify the behavior of the routine.
166+
*
167+
* @param [in] mem_list Memory descriptor list handle to use.
168+
* @param [in] mem_list_indices Array of indices, to use in descriptor
169+
* list of entries from handle.
170+
* @param [in] mem_list_count Number of indices in the array @a
171+
* mem_list_indices.
172+
* @param [in] addresses Array of local addresses to send from.
173+
* @param [in] remote_addresses Array of remote addresses to send to.
174+
* @param [in] lengths Array of lengths in bytes for each send.
175+
* @param [in] counter_inc_value Value of the remote increment.
176+
* @param [in] counter_remote_address Remote address to increment to.
177+
* @param [in] flags Flags to modify the function behavior.
178+
* @param [out] req Request populated by the call.
179+
*
180+
* @return Error code as defined by @ref ucs_status_t
181+
*/
182+
UCS_F_DEVICE ucs_status_t
183+
ucp_device_put_multi_partial(ucp_device_mem_list_handle_h mem_list,
184+
const unsigned *mem_list_indices,
185+
unsigned mem_list_count,
186+
void *const *addresses,
187+
const uint64_t *remote_addresses,
188+
const size_t *lengths,
189+
uint64_t counter_inc_value,
190+
uint64_t counter_remote_address,
191+
uint64_t flags,
192+
ucp_device_request_t *req)
193+
{
194+
return UCS_ERR_NOT_IMPLEMENTED;
195+
}
196+
197+
198+
/**
199+
* @ingroup UCP_DEVICE
200+
* @brief Progress a device request containing a batch of operations.
201+
*
202+
* This device progress function checks and progresses a request representing a
203+
* batch of one or many operations in progress.
204+
*
205+
* @param [in] req Request containing operations in progress.
206+
*
207+
* @return UCS_OK - The request has completed, no more operations are
208+
* in progress.
209+
* @return UCS_INPROGRESS - One or more operations in the request batch
210+
* have not completed.
211+
* @return Error code as defined by @ref ucs_status_t
212+
*/
213+
UCS_F_DEVICE ucs_status_t
214+
ucp_device_progress_req(ucp_device_request_t *req)
215+
{
216+
return UCS_ERR_NOT_IMPLEMENTED;
217+
}
218+
219+
#endif /* UCP_DEVICE_IMPL_H */
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/**
2+
* Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2025. ALL RIGHTS RESERVED.
3+
*
4+
* See file LICENSE for terms.
5+
*/
6+
7+
#ifndef UCP_DEVICE_TYPES_H
8+
#define UCP_DEVICE_TYPES_H
9+
10+
#include <uct/api/uct.h>
11+
12+
13+
/**
 * @ingroup UCP_DEVICE
 * @brief Memory descriptor list element.
 *
 * No descriptor fields are defined yet. A structure without members is not
 * valid ISO C; as a compiler extension GCC accepts it with a size of zero,
 * whereas C++ gives an empty structure a size of one. A single placeholder
 * byte keeps the type well-formed and gives it the same size in C and C++.
 */
typedef struct ucp_mem_list_elem {
    /** Placeholder so that the structure is never empty; carries no data. */
    unsigned char reserved;
} ucp_device_mem_list_elem_t;
15+
16+
17+
/**
18+
* @ingroup UCP_DEVICE
19+
* @brief Descriptor list handle stored on GPU memory.
20+
*
21+
* This handle is obtained and managed with functions called on host. It can be
22+
* used repeatedly from GPU code to perform memory transfers.
23+
*
24+
* The handle and most of its content is stored on GPU memory, with the intent
25+
* to be as memory-local as possible.
26+
*/
27+
typedef struct {
28+
/**
29+
* Allow runtime ABI compatibility checks, between host and device code.
30+
*/
31+
int version;
32+
33+
/**
34+
* Protocol index computed by host handle management functions when
35+
* creating handle.
36+
*/
37+
int proto_idx;
38+
39+
/**
40+
* Array of pointers to UCT exported endpoints, used for multi-lane
41+
* transfers.
42+
*/
43+
uct_ep_h *uct_ep;
44+
45+
/**
46+
* Number of UCT exported endpoints found in @a uct_ep array.
47+
*/
48+
unsigned num_uct_eps;
49+
50+
/**
51+
* Number of entries in the memory descriptors array @a elems.
52+
*/
53+
unsigned mem_list_length;
54+
55+
/**
56+
* Array of memory descriptors containing memory pairs to be used by device
57+
* functions for memory transfers.
58+
*/
59+
ucp_device_mem_list_elem_t elems[];
60+
} ucp_device_mem_list_handle_t;
61+
62+
typedef ucp_device_mem_list_handle_t *ucp_device_mem_list_handle_h;
63+
64+
65+
/**
66+
* @ingroup UCP_DEVICE
67+
* @brief GPU request descriptor of a given batch
68+
*
69+
* This request tracks a batch of memory operations in progress. It can be used
70+
* with @ref ucp_device_progress_req to detect request completion.
71+
*/
72+
typedef struct ucp_device_request {
    /**
     * Placeholder so that the structure is never empty; carries no data.
     * A structure without members is not valid ISO C: GCC accepts it as an
     * extension with a size of zero, while C++ gives it a size of one. One
     * byte gives the type the same size in C and C++.
     */
    unsigned char reserved;
} ucp_device_request_t;
74+
75+
#endif /* UCP_DEVICE_TYPES_H */

0 commit comments

Comments
 (0)