Skip to content

Commit 8649b04

Browse files
UCT/GDAKI: Add endpoint (#10825)
1 parent 1107c14 commit 8649b04

File tree

13 files changed

+343
-18
lines changed

13 files changed

+343
-18
lines changed

src/uct/ib/base/ib_iface.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1684,8 +1684,7 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_iface_ops_t *tl_ops,
16841684
dev->stats)
16851685
UCS_STATS_ARG(params->mode.device.dev_name));
16861686

1687-
status = uct_ib_device_find_port(dev, params->mode.device.dev_name,
1688-
&port_num);
1687+
status = uct_ib_device_find_port(dev, init_attr->dev_name, &port_num);
16891688
if (status != UCS_OK) {
16901689
goto err;
16911690
}

src/uct/ib/base/ib_iface.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ typedef struct uct_ib_iface_init_attr {
272272
unsigned max_rd_atomic;
273273
uint8_t cqe_zip_sizes[UCT_IB_DIR_LAST];
274274
uint16_t tx_moderation; /* TX CQ moderation */
275+
const char *dev_name; /* Device Name */
275276
} uct_ib_iface_init_attr_t;
276277

277278

src/uct/ib/efa/srd/srd_iface.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,7 @@ static UCS_CLASS_INIT_FUNC(uct_srd_iface_t, uct_md_h md, uct_worker_h worker,
472472
init_attr.rx_hdr_len = sizeof(uct_srd_hdr_t);
473473
init_attr.seg_size = ucs_min(mtu, config->super.seg_size);
474474
init_attr.qp_type = IBV_QPT_DRIVER;
475+
init_attr.dev_name = params->mode.device.dev_name;
475476

476477
UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &uct_srd_iface_tl_ops,
477478
&uct_srd_iface_ops, md, worker, params,

src/uct/ib/mlx5/dc/dc_mlx5.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1608,6 +1608,7 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h wor
16081608
init_attr.fc_req_size = sizeof(uct_dc_fc_request_t);
16091609
init_attr.max_rd_atomic = md->max_rd_atomic_dc;
16101610
init_attr.tx_moderation = 0; /* disable tx moderation for dcs */
1611+
init_attr.dev_name = params->mode.device.dev_name;
16111612

16121613
if (md->flags & UCT_IB_MLX5_MD_FLAG_DC_TM) {
16131614
init_attr.flags |= UCT_IB_TM_SUPPORTED;

src/uct/ib/mlx5/gdaki/gdaki.c

Lines changed: 299 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,305 @@ ucs_config_field_t uct_rc_gdaki_iface_config_table[] = {
3434
{NULL}
3535
};
3636

37+
38+
/*
 * Endpoint constructor.
 *
 * Initializes the base endpoint on the owning interface, then creates a TX
 * completion queue and a queue pair through the mlx5 DEVX helpers. The same
 * CQ object is handed to the QP as both its send and its receive CQ, and the
 * QP is created with srq_num 0 and no SRQ pointer.
 *
 * @param params  Endpoint creation parameters; params->iface must be a
 *                uct_rc_gdaki_iface_t.
 *
 * @return UCS_OK on success, otherwise the status of the failing step. If the
 *         QP cannot be created, the CQ created here is destroyed first.
 */
static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
{
    uct_rc_gdaki_iface_t *gdaki_iface = ucs_derived_of(params->iface,
                                                       uct_rc_gdaki_iface_t);
    uct_ib_iface_t *ib_iface          = &gdaki_iface->super.super.super;
    uct_ib_iface_init_attr_t cq_attr  = {};
    uct_ib_mlx5_qp_attr_t qp_attr     = {};
    ucs_status_t status;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &ib_iface->super);

    /* CQ length is the configured TX QP length scaled by UCT_IB_MLX5_MAX_BB */
    cq_attr.flags                 = UCT_IB_CQ_IGNORE_OVERRUN;
    cq_attr.cq_len[UCT_IB_DIR_TX] = UCT_IB_MLX5_MAX_BB *
                                    gdaki_iface->super.super.config.tx_qp_len;
    status = uct_ib_mlx5_devx_create_cq(ib_iface, UCT_IB_DIR_TX, &cq_attr,
                                        &self->cq, 0, 0);
    if (status != UCS_OK) {
        return status;
    }

    uct_rc_iface_fill_attr(&gdaki_iface->super.super, &qp_attr.super,
                           gdaki_iface->super.super.config.tx_qp_len, NULL);
    qp_attr.super.srq_num = 0;
    qp_attr.mmio_mode     = UCT_IB_MLX5_MMIO_MODE_DB;
    status = uct_ib_mlx5_devx_create_qp(ib_iface, &self->cq, &self->cq,
                                        &self->qp.super, &self->qp, &qp_attr);
    if (status == UCS_OK) {
        return UCS_OK;
    }

    /* QP creation failed: release the CQ created above */
    uct_ib_mlx5_devx_destroy_cq(ucs_derived_of(ib_iface->super.md,
                                               uct_ib_mlx5_md_t),
                                &self->cq);
    return status;
}
77+
78+
static UCS_CLASS_CLEANUP_FUNC(uct_rc_gdaki_ep_t)
79+
{
80+
uct_rc_gdaki_iface_t *iface = ucs_derived_of(self->super.super.iface,
81+
uct_rc_gdaki_iface_t);
82+
uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md,
83+
uct_ib_mlx5_md_t);
84+
85+
uct_ib_mlx5_devx_destroy_qp(md, &self->qp.super);
86+
uct_ib_mlx5_devx_destroy_cq(md, &self->cq);
87+
}
88+
89+
/*
 * Define the uct_rc_gdaki_ep_t class with uct_base_ep_t as its super class,
 * and generate its new/delete functions. Those functions are the ones
 * installed as .ep_create / .ep_destroy in uct_rc_gdaki_iface_tl_ops.
 */
UCS_CLASS_DEFINE(uct_rc_gdaki_ep_t, uct_base_ep_t);
90+
UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_gdaki_ep_t, uct_ep_t, const uct_ep_params_t *);
91+
UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_gdaki_ep_t, uct_ep_t);
92+
93+
static ucs_status_t
94+
uct_rc_gdaki_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr)
95+
{
96+
uct_rc_gdaki_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_gdaki_ep_t);
97+
uct_rc_mlx5_base_ep_address_t *rc_addr = (void*)addr;
98+
99+
uct_ib_pack_uint24(rc_addr->qp_num, ep->qp.super.qp_num);
100+
return UCS_OK;
101+
}
102+
103+
static ucs_status_t uct_rc_gdaki_iface_get_address(uct_iface_h tl_iface,
104+
uct_iface_addr_t *addr)
105+
{
106+
*(uint8_t*)addr = UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC;
107+
return UCS_OK;
108+
}
109+
110+
static ucs_status_t
111+
uct_rc_gdaki_ep_connect_to_ep_v2(uct_ep_h ep,
112+
const uct_device_addr_t *device_addr,
113+
const uct_ep_addr_t *ep_addr,
114+
const uct_ep_connect_to_ep_params_t *params)
115+
{
116+
uct_rc_gdaki_ep_t *gdaki_ep = ucs_derived_of(ep, uct_rc_gdaki_ep_t);
117+
uct_rc_gdaki_iface_t *iface = ucs_derived_of(ep->iface,
118+
uct_rc_gdaki_iface_t);
119+
const uct_ib_address_t *ib_addr = (void*)device_addr;
120+
const uct_rc_mlx5_base_ep_address_t *rc_addr = (void*)ep_addr;
121+
uint8_t path_index = 0;
122+
struct ibv_ah_attr ah_attr;
123+
enum ibv_mtu path_mtu;
124+
uint32_t dest_qp_num;
125+
ucs_status_t status;
126+
127+
status = uct_ib_iface_fill_ah_attr_from_addr(&iface->super.super.super,
128+
ib_addr, path_index, &ah_attr,
129+
&path_mtu);
130+
if (status != UCS_OK) {
131+
return status;
132+
}
133+
134+
ucs_assert(path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU);
135+
dest_qp_num = uct_ib_unpack_uint24(rc_addr->qp_num);
136+
137+
return uct_rc_mlx5_iface_common_devx_connect_qp(
138+
&iface->super, &gdaki_ep->qp.super, dest_qp_num, &ah_attr, path_mtu,
139+
path_index, iface->super.super.config.max_rd_atomic);
140+
}
141+
142+
int uct_rc_gdaki_ep_is_connected(uct_ep_h tl_ep,
143+
const uct_ep_is_connected_params_t *params)
144+
{
145+
uct_rc_gdaki_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_gdaki_ep_t);
146+
uct_rc_gdaki_iface_t *iface = ucs_derived_of(ep->super.super.iface,
147+
uct_rc_gdaki_iface_t);
148+
uint32_t addr_qp = 0;
149+
uct_rc_mlx5_base_ep_address_t *rc_addr;
150+
ucs_status_t status;
151+
struct ibv_ah_attr ah_attr;
152+
uint32_t qp_num;
153+
union ibv_gid *rgid;
154+
const uct_ib_address_t *ib_addr;
155+
156+
status = uct_ib_mlx5_query_qp_peer_info(&iface->super.super.super,
157+
&ep->qp.super, &ah_attr, &qp_num);
158+
if (status != UCS_OK) {
159+
return 0;
160+
}
161+
162+
/* TODO unite code with uct_rc_mlx5_base_ep_is_connected */
163+
if (params->field_mask & UCT_EP_IS_CONNECTED_FIELD_EP_ADDR) {
164+
rc_addr = (uct_rc_mlx5_base_ep_address_t*)params->ep_addr;
165+
addr_qp = uct_ib_unpack_uint24(rc_addr->qp_num);
166+
}
167+
168+
if ((addr_qp != 0) && (qp_num != addr_qp)) {
169+
return 0;
170+
}
171+
172+
rgid = (ah_attr.is_global) ? &ah_attr.grh.dgid : NULL;
173+
ib_addr = (const uct_ib_address_t*)params->device_addr;
174+
return uct_ib_iface_is_same_device(ib_addr, ah_attr.dlid, rgid);
175+
}
176+
177+
static ucs_status_t
178+
uct_rc_gdaki_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr)
179+
{
180+
uct_rc_gdaki_iface_t *iface = ucs_derived_of(tl_iface,
181+
uct_rc_gdaki_iface_t);
182+
ucs_status_t status;
183+
184+
status = uct_ib_iface_query(&iface->super.super.super, 0, iface_attr);
185+
if (status != UCS_OK) {
186+
return status;
187+
}
188+
189+
/* TODO:
190+
* - add UCT_IFACE_FLAG_PUT_BATCH
191+
* - PENDING and PUT_ZCOPY will be needed to establish rma_bw lanes
192+
* - As this lane does not really support PUT_ZCOPY and PENDING, this could be
193+
* causing issue when trying to send standard PUT. Eventually we must probably
194+
* introduce another type of lane (rma_batch#x).
195+
*/
196+
iface_attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_EP;
197+
iface_attr->ep_addr_len = sizeof(uct_rc_mlx5_base_ep_address_t);
198+
iface_attr->iface_addr_len = sizeof(uint8_t);
199+
iface_attr->overhead = UCT_RC_MLX5_IFACE_OVERHEAD;
200+
201+
iface_attr->cap.put.min_zcopy = 0;
202+
iface_attr->cap.put.max_zcopy =
203+
uct_ib_iface_port_attr(&iface->super.super.super)->max_msg_sz;
204+
return UCS_OK;
205+
}
206+
207+
ucs_status_t
208+
uct_rc_gdaki_create_cq(uct_ib_iface_t *ib_iface, uct_ib_dir_t dir,
209+
const uct_ib_iface_init_attr_t *init_attr,
210+
int preferred_cpu, size_t inl)
211+
{
212+
uct_rc_gdaki_iface_t *iface = ucs_derived_of(ib_iface,
213+
uct_rc_gdaki_iface_t);
214+
215+
iface->super.cq[dir].type = UCT_IB_MLX5_OBJ_TYPE_NULL;
216+
return UCS_OK;
217+
}
218+
219+
/*
 * Forward declarations of the interface new/delete functions. The delete
 * function is referenced by uct_rc_gdaki_iface_tl_ops (.iface_close) before
 * the corresponding UCS_CLASS_DEFINE_* macros appear further down the file.
 */
static UCS_CLASS_DECLARE_NEW_FUNC(uct_rc_gdaki_iface_t, uct_iface_t, uct_md_h,
220+
uct_worker_h, const uct_iface_params_t*,
221+
const uct_iface_config_t*);
222+
223+
static UCS_CLASS_DECLARE_DELETE_FUNC(uct_rc_gdaki_iface_t, uct_iface_t);
224+
225+
static uct_rc_iface_ops_t uct_rc_gdaki_internal_ops = {
226+
.super = {
227+
.super = {
228+
.iface_estimate_perf = uct_ib_iface_estimate_perf,
229+
.iface_vfs_refresh = (uct_iface_vfs_refresh_func_t)ucs_empty_function,
230+
.ep_query = (uct_ep_query_func_t)ucs_empty_function_return_unsupported,
231+
.ep_invalidate = (uct_ep_invalidate_func_t)ucs_empty_function_return_unsupported,
232+
.ep_connect_to_ep_v2 = uct_rc_gdaki_ep_connect_to_ep_v2,
233+
.iface_is_reachable_v2 = (uct_iface_is_reachable_v2_func_t)ucs_empty_function_return_one_int,
234+
.ep_is_connected = uct_rc_gdaki_ep_is_connected,
235+
},
236+
.create_cq = uct_rc_gdaki_create_cq,
237+
.destroy_cq = (uct_ib_iface_destroy_cq_func_t)ucs_empty_function_return_success,
238+
},
239+
.init_rx = (uct_rc_iface_init_rx_func_t)ucs_empty_function_return_success,
240+
.cleanup_rx = (uct_rc_iface_cleanup_rx_func_t)
241+
ucs_empty_function_return_success,
242+
};
243+
244+
static uct_iface_ops_t uct_rc_gdaki_iface_tl_ops = {
245+
.ep_flush = uct_base_ep_flush,
246+
.ep_fence = uct_base_ep_fence,
247+
.ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_rc_gdaki_ep_t),
248+
.ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_gdaki_ep_t),
249+
.ep_get_address = uct_rc_gdaki_ep_get_address,
250+
.ep_connect_to_ep = uct_base_ep_connect_to_ep,
251+
.ep_pending_purge = (uct_ep_pending_purge_func_t)ucs_empty_function,
252+
.iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_gdaki_iface_t),
253+
.iface_query = uct_rc_gdaki_iface_query,
254+
.iface_get_address = uct_rc_gdaki_iface_get_address,
255+
.iface_get_device_address = uct_ib_iface_get_device_address,
256+
.iface_is_reachable = uct_base_iface_is_reachable,
257+
.iface_flush = (uct_iface_flush_func_t)
258+
ucs_empty_function_return_success,
259+
.iface_fence = (uct_iface_fence_func_t)
260+
ucs_empty_function_return_unsupported,
261+
.iface_progress_enable = (uct_iface_progress_enable_func_t)
262+
ucs_empty_function_return_unsupported,
263+
.iface_progress_disable = (uct_iface_progress_disable_func_t)
264+
ucs_empty_function_return_unsupported,
265+
.iface_progress = (uct_iface_progress_func_t)
266+
ucs_empty_function_return_unsupported,
267+
};
268+
269+
37270
/*
 * Interface constructor.
 *
 * The transport device name has the form "<gpu>-<ib>", where <gpu> starts
 * with UCT_DEV_CUDA_NAME followed by the CUDA device ordinal and <ib> is the
 * IB device name handed to the IB layer through init_attr.dev_name
 * (uct_gdaki_query_tl_devices generates names as "cuda%d-%s:%d").
 *
 * Steps:
 *  1. Initialize data-path ordering settings for the "gdaki" transport.
 *  2. Split and validate the device name. This is done BEFORE the super-class
 *     constructor runs, so a malformed name is rejected with
 *     UCS_ERR_INVALID_PARAM instead of passing a NULL IB device name down or
 *     reporting success after logging an error.
 *  3. Initialize the RC/mlx5 common super class as an RC QP-type interface.
 *  4. Resolve the CUDA device PCI bus id, create the DOCA GPU handle for it
 *     and obtain the CUDA device handle.
 *
 * @param tl_md      Memory domain (a uct_ib_mlx5_md_t).
 * @param worker     Worker the interface is created on.
 * @param params     Interface parameters; params->mode.device.dev_name holds
 *                   the "<gpu>-<ib>" device name.
 * @param tl_config  Interface configuration (a uct_rc_gdaki_iface_config_t).
 *
 * @return UCS_OK on success; UCS_ERR_INVALID_PARAM for a malformed device
 *         name; UCS_ERR_IO_ERROR if doca_gpu_create fails; otherwise the
 *         status of the failing step.
 */
static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_iface_t, uct_md_h tl_md,
                           uct_worker_h worker,
                           const uct_iface_params_t *params,
                           const uct_iface_config_t *tl_config)
{
    uct_rc_gdaki_iface_config_t *config =
            ucs_derived_of(tl_config, uct_rc_gdaki_iface_config_t);
    uct_ib_mlx5_md_t *md                = ucs_derived_of(tl_md, uct_ib_mlx5_md_t);
    uct_ib_iface_init_attr_t init_attr  = {};
    UCS_STRING_BUFFER_ONSTACK(strb, 64);
    char *gpu_name, *ib_name;
    char pci_addr[UCS_SYS_BDF_NAME_MAX];
    ucs_status_t status;
    doca_error_t derr;
    int cuda_id;

    status = uct_rc_mlx5_dp_ordering_ooo_init(md, &self->super,
                                              md->dp_ordering_cap.rc,
                                              &config->mlx5, "gdaki");
    if (status != UCS_OK) {
        return status;
    }

    /* Tokenize a private copy of the name; both tokens point into strb */
    ucs_string_buffer_appendf(&strb, "%s", params->mode.device.dev_name);
    gpu_name = ucs_string_buffer_next_token(&strb, NULL, "-");
    ib_name  = (gpu_name == NULL) ?
               NULL : ucs_string_buffer_next_token(&strb, gpu_name, "-");

    /* Either token may be missing (empty name, or no '-' separator), and the
     * GPU part must carry the CUDA prefix. The full original name is logged
     * because gpu_name itself may be NULL here. */
    if ((gpu_name == NULL) || (ib_name == NULL) ||
        (strncmp(gpu_name, UCT_DEV_CUDA_NAME, UCT_DEV_CUDA_NAME_LEN) != 0)) {
        ucs_error("wrong device name: %s\n", params->mode.device.dev_name);
        return UCS_ERR_INVALID_PARAM;
    }

    init_attr.seg_size = config->super.super.seg_size;
    init_attr.qp_type  = IBV_QPT_RC;
    init_attr.dev_name = ib_name;

    UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t,
                              &uct_rc_gdaki_iface_tl_ops,
                              &uct_rc_gdaki_internal_ops, tl_md, worker, params,
                              &config->super, &config->mlx5, &init_attr);

    /* The CUDA ordinal is whatever follows the "cuda" prefix */
    cuda_id = atoi(gpu_name + UCT_DEV_CUDA_NAME_LEN);
    status  = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGetPCIBusId(
            pci_addr, UCS_SYS_BDF_NAME_MAX, cuda_id));
    if (status != UCS_OK) {
        return status;
    }

    derr = doca_gpu_create(pci_addr, &self->gpu_dev);
    if (derr != DOCA_SUCCESS) {
        status = UCS_ERR_IO_ERROR;
        ucs_error("doca_gpu_create failed: %s %s", doca_error_get_descr(derr),
                  pci_addr);
        return status;
    }

    status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&self->cuda_dev, cuda_id));
    if (status != UCS_OK) {
        goto err_doca;
    }

    return UCS_OK;

err_doca:
    doca_gpu_destroy(self->gpu_dev);
    return status;
}
44337

45338
static UCS_CLASS_CLEANUP_FUNC(uct_rc_gdaki_iface_t)
@@ -53,6 +346,8 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_gdaki_iface_t, uct_iface_t, uct_md_h,
53346
uct_worker_h, const uct_iface_params_t*,
54347
const uct_iface_config_t*);
55348

349+
static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_gdaki_iface_t, uct_iface_t);
350+
56351
static ucs_status_t
57352
uct_gdaki_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
58353
unsigned *num_tl_devices_p)
@@ -94,8 +389,9 @@ uct_gdaki_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
94389
}
95390

96391
snprintf(tl_devices[num_tl_devices].name,
97-
sizeof(tl_devices[num_tl_devices].name), "cuda%d-%s", device,
98-
uct_ib_device_name(&ib_md->dev));
392+
sizeof(tl_devices[num_tl_devices].name), "cuda%d-%s:%d",
393+
device, uct_ib_device_name(&ib_md->dev),
394+
ib_md->dev.first_port);
99395
tl_devices[num_tl_devices].type = UCT_DEVICE_TYPE_NET;
100396
tl_devices[num_tl_devices].sys_device = dev;
101397
num_tl_devices++;

src/uct/ib/mlx5/gdaki/gdaki.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,28 @@
66
#ifndef UCT_GDAKI_IFACE_H
77
#define UCT_GDAKI_IFACE_H
88

9+
#include <uct/ib/mlx5/rc/rc_mlx5_common.h>
910
#include <uct/base/uct_iface.h>
1011

1112
#include <cuda.h>
1213
#include <doca_gpunetio.h>
1314

15+
16+
#define UCT_DEV_CUDA_NAME "cuda"
17+
#define UCT_DEV_CUDA_NAME_LEN 4
18+
19+
1420
/*
 * GDAKI interface object. After this change it derives from the RC/mlx5
 * common interface ("super" must stay the first member for ucs_derived_of
 * casts) and keeps the GPU handles obtained in the interface constructor.
 */
typedef struct uct_rc_gdaki_iface {
15-
struct doca_gpu *gpu_dev;
16-
CUdevice cuda_dev;
21+
uct_rc_mlx5_iface_common_t super; /* RC/mlx5 common interface (base) */
22+
struct doca_gpu *gpu_dev; /* DOCA GPU handle from doca_gpu_create() */
23+
CUdevice cuda_dev; /* CUDA device handle from cuDeviceGet() */
1724
} uct_rc_gdaki_iface_t;
1825

26+
27+
/*
 * GDAKI endpoint object: a base endpoint plus the completion queue and the
 * queue pair (with its TX work queue) created in the endpoint constructor.
 */
typedef struct uct_rc_gdaki_ep {
28+
uct_base_ep_t super; /* Base endpoint */
29+
uct_ib_mlx5_cq_t cq; /* CQ used as both send and receive CQ of qp */
30+
uct_ib_mlx5_txwq_t qp; /* QP and TX work queue; qp.super is the QP itself */
31+
} uct_rc_gdaki_ep_t;
32+
1933
#endif /* UCT_GDAKI_IFACE_H */

0 commit comments

Comments
 (0)