@@ -34,12 +34,305 @@ ucs_config_field_t uct_rc_gdaki_iface_config_table[] = {
3434 {NULL }
3535};
3636
37+
38+ static UCS_CLASS_INIT_FUNC (uct_rc_gdaki_ep_t , const uct_ep_params_t * params )
39+ {
40+ uct_rc_gdaki_iface_t * iface = ucs_derived_of (params -> iface ,
41+ uct_rc_gdaki_iface_t );
42+ uct_ib_mlx5_md_t * md = ucs_derived_of (iface -> super .super .super .super .md ,
43+ uct_ib_mlx5_md_t );
44+ uct_ib_iface_init_attr_t init_attr = {};
45+ uct_ib_mlx5_qp_attr_t qp_attr = {};
46+ ucs_status_t status ;
47+
48+ UCS_CLASS_CALL_SUPER_INIT (uct_base_ep_t , & iface -> super .super .super .super );
49+
50+ init_attr .cq_len [UCT_IB_DIR_TX ] = iface -> super .super .config .tx_qp_len *
51+ UCT_IB_MLX5_MAX_BB ;
52+ init_attr .flags = UCT_IB_CQ_IGNORE_OVERRUN ;
53+ status = uct_ib_mlx5_devx_create_cq (& iface -> super .super .super ,
54+ UCT_IB_DIR_TX , & init_attr , & self -> cq , 0 ,
55+ 0 );
56+ if (status != UCS_OK ) {
57+ return status ;
58+ }
59+
60+ uct_rc_iface_fill_attr (& iface -> super .super , & qp_attr .super ,
61+ iface -> super .super .config .tx_qp_len , NULL );
62+ qp_attr .mmio_mode = UCT_IB_MLX5_MMIO_MODE_DB ;
63+ qp_attr .super .srq_num = 0 ;
64+ status = uct_ib_mlx5_devx_create_qp (& iface -> super .super .super , & self -> cq ,
65+ & self -> cq , & self -> qp .super , & self -> qp ,
66+ & qp_attr );
67+ if (status != UCS_OK ) {
68+ goto err_qp ;
69+ }
70+
71+ return UCS_OK ;
72+
73+ err_qp :
74+ uct_ib_mlx5_devx_destroy_cq (md , & self -> cq );
75+ return status ;
76+ }
77+
78+ static UCS_CLASS_CLEANUP_FUNC (uct_rc_gdaki_ep_t )
79+ {
80+ uct_rc_gdaki_iface_t * iface = ucs_derived_of (self -> super .super .iface ,
81+ uct_rc_gdaki_iface_t );
82+ uct_ib_mlx5_md_t * md = ucs_derived_of (iface -> super .super .super .super .md ,
83+ uct_ib_mlx5_md_t );
84+
85+ uct_ib_mlx5_devx_destroy_qp (md , & self -> qp .super );
86+ uct_ib_mlx5_devx_destroy_cq (md , & self -> cq );
87+ }
88+
89+ UCS_CLASS_DEFINE (uct_rc_gdaki_ep_t , uct_base_ep_t );
90+ UCS_CLASS_DEFINE_NEW_FUNC (uct_rc_gdaki_ep_t , uct_ep_t , const uct_ep_params_t * );
91+ UCS_CLASS_DEFINE_DELETE_FUNC (uct_rc_gdaki_ep_t , uct_ep_t );
92+
93+ static ucs_status_t
94+ uct_rc_gdaki_ep_get_address (uct_ep_h tl_ep , uct_ep_addr_t * addr )
95+ {
96+ uct_rc_gdaki_ep_t * ep = ucs_derived_of (tl_ep , uct_rc_gdaki_ep_t );
97+ uct_rc_mlx5_base_ep_address_t * rc_addr = (void * )addr ;
98+
99+ uct_ib_pack_uint24 (rc_addr -> qp_num , ep -> qp .super .qp_num );
100+ return UCS_OK ;
101+ }
102+
103+ static ucs_status_t uct_rc_gdaki_iface_get_address (uct_iface_h tl_iface ,
104+ uct_iface_addr_t * addr )
105+ {
106+ * (uint8_t * )addr = UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC ;
107+ return UCS_OK ;
108+ }
109+
110+ static ucs_status_t
111+ uct_rc_gdaki_ep_connect_to_ep_v2 (uct_ep_h ep ,
112+ const uct_device_addr_t * device_addr ,
113+ const uct_ep_addr_t * ep_addr ,
114+ const uct_ep_connect_to_ep_params_t * params )
115+ {
116+ uct_rc_gdaki_ep_t * gdaki_ep = ucs_derived_of (ep , uct_rc_gdaki_ep_t );
117+ uct_rc_gdaki_iface_t * iface = ucs_derived_of (ep -> iface ,
118+ uct_rc_gdaki_iface_t );
119+ const uct_ib_address_t * ib_addr = (void * )device_addr ;
120+ const uct_rc_mlx5_base_ep_address_t * rc_addr = (void * )ep_addr ;
121+ uint8_t path_index = 0 ;
122+ struct ibv_ah_attr ah_attr ;
123+ enum ibv_mtu path_mtu ;
124+ uint32_t dest_qp_num ;
125+ ucs_status_t status ;
126+
127+ status = uct_ib_iface_fill_ah_attr_from_addr (& iface -> super .super .super ,
128+ ib_addr , path_index , & ah_attr ,
129+ & path_mtu );
130+ if (status != UCS_OK ) {
131+ return status ;
132+ }
133+
134+ ucs_assert (path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU );
135+ dest_qp_num = uct_ib_unpack_uint24 (rc_addr -> qp_num );
136+
137+ return uct_rc_mlx5_iface_common_devx_connect_qp (
138+ & iface -> super , & gdaki_ep -> qp .super , dest_qp_num , & ah_attr , path_mtu ,
139+ path_index , iface -> super .super .config .max_rd_atomic );
140+ }
141+
142+ int uct_rc_gdaki_ep_is_connected (uct_ep_h tl_ep ,
143+ const uct_ep_is_connected_params_t * params )
144+ {
145+ uct_rc_gdaki_ep_t * ep = ucs_derived_of (tl_ep , uct_rc_gdaki_ep_t );
146+ uct_rc_gdaki_iface_t * iface = ucs_derived_of (ep -> super .super .iface ,
147+ uct_rc_gdaki_iface_t );
148+ uint32_t addr_qp = 0 ;
149+ uct_rc_mlx5_base_ep_address_t * rc_addr ;
150+ ucs_status_t status ;
151+ struct ibv_ah_attr ah_attr ;
152+ uint32_t qp_num ;
153+ union ibv_gid * rgid ;
154+ const uct_ib_address_t * ib_addr ;
155+
156+ status = uct_ib_mlx5_query_qp_peer_info (& iface -> super .super .super ,
157+ & ep -> qp .super , & ah_attr , & qp_num );
158+ if (status != UCS_OK ) {
159+ return 0 ;
160+ }
161+
162+ /* TODO unite code with uct_rc_mlx5_base_ep_is_connected */
163+ if (params -> field_mask & UCT_EP_IS_CONNECTED_FIELD_EP_ADDR ) {
164+ rc_addr = (uct_rc_mlx5_base_ep_address_t * )params -> ep_addr ;
165+ addr_qp = uct_ib_unpack_uint24 (rc_addr -> qp_num );
166+ }
167+
168+ if ((addr_qp != 0 ) && (qp_num != addr_qp )) {
169+ return 0 ;
170+ }
171+
172+ rgid = (ah_attr .is_global ) ? & ah_attr .grh .dgid : NULL ;
173+ ib_addr = (const uct_ib_address_t * )params -> device_addr ;
174+ return uct_ib_iface_is_same_device (ib_addr , ah_attr .dlid , rgid );
175+ }
176+
177+ static ucs_status_t
178+ uct_rc_gdaki_iface_query (uct_iface_h tl_iface , uct_iface_attr_t * iface_attr )
179+ {
180+ uct_rc_gdaki_iface_t * iface = ucs_derived_of (tl_iface ,
181+ uct_rc_gdaki_iface_t );
182+ ucs_status_t status ;
183+
184+ status = uct_ib_iface_query (& iface -> super .super .super , 0 , iface_attr );
185+ if (status != UCS_OK ) {
186+ return status ;
187+ }
188+
189+ /* TODO:
190+ * - add UCT_IFACE_FLAG_PUT_BATCH
191+ * - PENDING and PUT_ZCOPY will be needed to establish rma_bw lanes
192+ * - As this lane does not really support PUT_ZCOPY and PENDING, this could be
193+ * causing issue when trying to send standard PUT. Eventually we must probably
194+ * introduce another type of lane (rma_batch#x).
195+ */
196+ iface_attr -> cap .flags = UCT_IFACE_FLAG_CONNECT_TO_EP ;
197+ iface_attr -> ep_addr_len = sizeof (uct_rc_mlx5_base_ep_address_t );
198+ iface_attr -> iface_addr_len = sizeof (uint8_t );
199+ iface_attr -> overhead = UCT_RC_MLX5_IFACE_OVERHEAD ;
200+
201+ iface_attr -> cap .put .min_zcopy = 0 ;
202+ iface_attr -> cap .put .max_zcopy =
203+ uct_ib_iface_port_attr (& iface -> super .super .super )-> max_msg_sz ;
204+ return UCS_OK ;
205+ }
206+
207+ ucs_status_t
208+ uct_rc_gdaki_create_cq (uct_ib_iface_t * ib_iface , uct_ib_dir_t dir ,
209+ const uct_ib_iface_init_attr_t * init_attr ,
210+ int preferred_cpu , size_t inl )
211+ {
212+ uct_rc_gdaki_iface_t * iface = ucs_derived_of (ib_iface ,
213+ uct_rc_gdaki_iface_t );
214+
215+ iface -> super .cq [dir ].type = UCT_IB_MLX5_OBJ_TYPE_NULL ;
216+ return UCS_OK ;
217+ }
218+
219+ static UCS_CLASS_DECLARE_NEW_FUNC (uct_rc_gdaki_iface_t , uct_iface_t , uct_md_h ,
220+ uct_worker_h , const uct_iface_params_t * ,
221+ const uct_iface_config_t * ) ;
222+
223+ static UCS_CLASS_DECLARE_DELETE_FUNC (uct_rc_gdaki_iface_t , uct_iface_t ) ;
224+
225+ static uct_rc_iface_ops_t uct_rc_gdaki_internal_ops = {
226+ .super = {
227+ .super = {
228+ .iface_estimate_perf = uct_ib_iface_estimate_perf ,
229+ .iface_vfs_refresh = (uct_iface_vfs_refresh_func_t )ucs_empty_function ,
230+ .ep_query = (uct_ep_query_func_t )ucs_empty_function_return_unsupported ,
231+ .ep_invalidate = (uct_ep_invalidate_func_t )ucs_empty_function_return_unsupported ,
232+ .ep_connect_to_ep_v2 = uct_rc_gdaki_ep_connect_to_ep_v2 ,
233+ .iface_is_reachable_v2 = (uct_iface_is_reachable_v2_func_t )ucs_empty_function_return_one_int ,
234+ .ep_is_connected = uct_rc_gdaki_ep_is_connected ,
235+ },
236+ .create_cq = uct_rc_gdaki_create_cq ,
237+ .destroy_cq = (uct_ib_iface_destroy_cq_func_t )ucs_empty_function_return_success ,
238+ },
239+ .init_rx = (uct_rc_iface_init_rx_func_t )ucs_empty_function_return_success ,
240+ .cleanup_rx = (uct_rc_iface_cleanup_rx_func_t )
241+ ucs_empty_function_return_success ,
242+ };
243+
244+ static uct_iface_ops_t uct_rc_gdaki_iface_tl_ops = {
245+ .ep_flush = uct_base_ep_flush ,
246+ .ep_fence = uct_base_ep_fence ,
247+ .ep_create = UCS_CLASS_NEW_FUNC_NAME (uct_rc_gdaki_ep_t ),
248+ .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME (uct_rc_gdaki_ep_t ),
249+ .ep_get_address = uct_rc_gdaki_ep_get_address ,
250+ .ep_connect_to_ep = uct_base_ep_connect_to_ep ,
251+ .ep_pending_purge = (uct_ep_pending_purge_func_t )ucs_empty_function ,
252+ .iface_close = UCS_CLASS_DELETE_FUNC_NAME (uct_rc_gdaki_iface_t ),
253+ .iface_query = uct_rc_gdaki_iface_query ,
254+ .iface_get_address = uct_rc_gdaki_iface_get_address ,
255+ .iface_get_device_address = uct_ib_iface_get_device_address ,
256+ .iface_is_reachable = uct_base_iface_is_reachable ,
257+ .iface_flush = (uct_iface_flush_func_t )
258+ ucs_empty_function_return_success ,
259+ .iface_fence = (uct_iface_fence_func_t )
260+ ucs_empty_function_return_unsupported ,
261+ .iface_progress_enable = (uct_iface_progress_enable_func_t )
262+ ucs_empty_function_return_unsupported ,
263+ .iface_progress_disable = (uct_iface_progress_disable_func_t )
264+ ucs_empty_function_return_unsupported ,
265+ .iface_progress = (uct_iface_progress_func_t )
266+ ucs_empty_function_return_unsupported ,
267+ };
268+
269+
37270static UCS_CLASS_INIT_FUNC (uct_rc_gdaki_iface_t , uct_md_h tl_md ,
38271 uct_worker_h worker ,
39272 const uct_iface_params_t * params ,
40273 const uct_iface_config_t * tl_config )
41274{
42- return UCS_ERR_NOT_IMPLEMENTED ;
275+ uct_rc_gdaki_iface_config_t * config =
276+ ucs_derived_of (tl_config , uct_rc_gdaki_iface_config_t );
277+ uct_ib_mlx5_md_t * md = ucs_derived_of (tl_md , uct_ib_mlx5_md_t );
278+ uct_ib_iface_init_attr_t init_attr = {};
279+ UCS_STRING_BUFFER_ONSTACK (strb , 64 );
280+ char * gpu_name , * ib_name ;
281+ char pci_addr [UCS_SYS_BDF_NAME_MAX ];
282+ ucs_status_t status ;
283+ doca_error_t derr ;
284+ int cuda_id ;
285+
286+ status = uct_rc_mlx5_dp_ordering_ooo_init (md , & self -> super ,
287+ md -> dp_ordering_cap .rc ,
288+ & config -> mlx5 , "gdaki" );
289+ if (status != UCS_OK ) {
290+ return status ;
291+ }
292+
293+ ucs_string_buffer_appendf (& strb , "%s" , params -> mode .device .dev_name );
294+ gpu_name = ucs_string_buffer_next_token (& strb , NULL , "-" );
295+ ib_name = ucs_string_buffer_next_token (& strb , gpu_name , "-" );
296+
297+ init_attr .seg_size = config -> super .super .seg_size ;
298+ init_attr .qp_type = IBV_QPT_RC ;
299+ init_attr .dev_name = ib_name ;
300+
301+ UCS_CLASS_CALL_SUPER_INIT (uct_rc_mlx5_iface_common_t ,
302+ & uct_rc_gdaki_iface_tl_ops ,
303+ & uct_rc_gdaki_internal_ops , tl_md , worker , params ,
304+ & config -> super , & config -> mlx5 , & init_attr );
305+
306+ if (memcmp (gpu_name , UCT_DEV_CUDA_NAME , UCT_DEV_CUDA_NAME_LEN )) {
307+ ucs_error ("wrong device name: %s\n" , gpu_name );
308+ return status ;
309+ }
310+
311+ cuda_id = atoi (gpu_name + UCT_DEV_CUDA_NAME_LEN );
312+ status = UCT_CUDADRV_FUNC_LOG_ERR (cuDeviceGetPCIBusId (
313+ pci_addr , UCS_SYS_BDF_NAME_MAX , cuda_id ));
314+ if (status != UCS_OK ) {
315+ return status ;
316+ }
317+
318+ derr = doca_gpu_create (pci_addr , & self -> gpu_dev );
319+ if (derr != DOCA_SUCCESS ) {
320+ status = UCS_ERR_IO_ERROR ;
321+ ucs_error ("doca_gpu_create failed: %s %s" , doca_error_get_descr (derr ),
322+ pci_addr );
323+ return status ;
324+ }
325+
326+ status = UCT_CUDADRV_FUNC_LOG_ERR (cuDeviceGet (& self -> cuda_dev , cuda_id ));
327+ if (status != UCS_OK ) {
328+ goto err_doca ;
329+ }
330+
331+ return UCS_OK ;
332+
333+ err_doca :
334+ doca_gpu_destroy (self -> gpu_dev );
335+ return status ;
43336}
44337
45338static UCS_CLASS_CLEANUP_FUNC (uct_rc_gdaki_iface_t )
@@ -53,6 +346,8 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_gdaki_iface_t, uct_iface_t, uct_md_h,
53346 uct_worker_h , const uct_iface_params_t * ,
54347 const uct_iface_config_t * ) ;
55348
/* Delete wrapper for the interface; installed as iface_close in the
 * transport operations table. */
static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_gdaki_iface_t, uct_iface_t);
350+
56351static ucs_status_t
57352uct_gdaki_query_tl_devices (uct_md_h md , uct_tl_device_resource_t * * tl_devices_p ,
58353 unsigned * num_tl_devices_p )
@@ -94,8 +389,9 @@ uct_gdaki_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
94389 }
95390
96391 snprintf (tl_devices [num_tl_devices ].name ,
97- sizeof (tl_devices [num_tl_devices ].name ), "cuda%d-%s" , device ,
98- uct_ib_device_name (& ib_md -> dev ));
392+ sizeof (tl_devices [num_tl_devices ].name ), "cuda%d-%s:%d" ,
393+ device , uct_ib_device_name (& ib_md -> dev ),
394+ ib_md -> dev .first_port );
99395 tl_devices [num_tl_devices ].type = UCT_DEVICE_TYPE_NET ;
100396 tl_devices [num_tl_devices ].sys_device = dev ;
101397 num_tl_devices ++ ;
0 commit comments