Skip to content

Commit f2f43b4

Browse files
committed
UCT/EFA/SRD: Add flush and fence, enable for UCT
1 parent 2f8711f commit f2f43b4

File tree

17 files changed

+862
-225
lines changed

17 files changed

+862
-225
lines changed

buildlib/pr/efa.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ jobs:
99
demands: ${{ parameters.demands }}
1010
displayName: ${{ parameters.name }} on ${{ parameters.container }}
1111
container: ${{ parameters.container }}
12-
timeoutInMinutes: 90
12+
timeoutInMinutes: 120
1313
workspace:
1414
clean: outputs
1515
steps:

contrib/test_efa.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,12 @@ run_gtests() {
103103
./install/bin/ucx_perftest -l -t tag_bw
104104
./install/bin/ucx_info -d
105105
106+
IBMOCK_FILTER='srd/uct_p2p_am_misc.no_rx_buffs*:srd/test_uct_peer_failure.purge_failed_peer*'
106107
# Try the faster approach before valgrind
107-
make -C contrib/test/gtest test GTEST_FILTER=*ud*:*test_srd*
108-
make -C contrib/test/gtest test_valgrind GTEST_FILTER=*ud*:*test_srd*:-*test_uct_perf.envelope*
108+
make -C contrib/test/gtest test \
109+
GTEST_FILTER=*ud*:srd/*-$IBMOCK_FILTER
110+
make -C contrib/test/gtest test_valgrind \
111+
GTEST_FILTER=*ud*:srd/*:-*test_uct_perf.envelope*:$IBMOCK_FILTER
109112
}
110113
111114
test_ucx_rpm() {

src/ucp/core/ucp_context.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -602,13 +602,14 @@ static ucs_config_field_t ucp_config_table[] = {
602602
" - sm/shm : all shared memory transports (mm, cma, knem).\n"
603603
" - mm : shared memory transports - only memory mappers.\n"
604604
" - ugni : ugni_smsg and ugni_rdma (uses ugni_udt for bootstrap).\n"
605-
" - ib : all infiniband transports (rc/rc_mlx5, ud/ud_mlx5, dc_mlx5).\n"
605+
" - ib : all infiniband transports (rc/rc_mlx5, ud/ud_mlx5, dc_mlx5, srd).\n"
606606
" - rc_v : rc verbs (uses ud for bootstrap).\n"
607607
" - rc_x : rc with accelerated verbs (uses ud_mlx5 for bootstrap).\n"
608608
" - rc : rc_v and rc_x (preferably if available).\n"
609609
" - ud_v : ud verbs.\n"
610610
" - ud_x : ud with accelerated verbs.\n"
611611
" - ud : ud_v and ud_x (preferably if available).\n"
612+
" - srd : EFA srd reliable transport.\n"
612613
" - dc/dc_x : dc with accelerated verbs.\n"
613614
" - tcp : sockets over TCP/IP.\n"
614615
" - cuda : CUDA (NVIDIA GPU) memory support.\n"
@@ -690,7 +691,8 @@ static ucp_tl_alias_t ucp_tl_aliases[] = {
690691
{ "sm", { "posix", "sysv", "xpmem", "knem", "cma", NULL } },
691692
{ "shm", { "posix", "sysv", "xpmem", "knem", "cma", NULL } },
692693
{ "ib", { "rc_verbs", "ud_verbs", "rc_mlx5", "ud_mlx5", "dc_mlx5",
693-
"gga_mlx5", UCP_TL_AUX("ud_mlx5"), UCP_TL_AUX("ud_verbs"), NULL } },
694+
"gga_mlx5", UCP_TL_AUX("ud_mlx5"), UCP_TL_AUX("ud_verbs"),
695+
"srd", NULL } },
694696
{ "ud_v", { "ud_verbs", NULL } },
695697
{ "ud_x", { "ud_mlx5", NULL } },
696698
{ "ud", { "ud_mlx5", "ud_verbs", NULL } },

src/uct/ib/base/ib_iface.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2104,3 +2104,24 @@ ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
21042104
}
21052105
return UCS_OK;
21062106
}
2107+
2108+
/**
 * Check whether the address handle that corresponds to a remote IB address
 * is the same handle as @a peer_ah.
 *
 * The AH attributes are built from @a ib_addr and @a path_index, the matching
 * handle is obtained through uct_ib_device_get_ah_cached(), and the result is
 * compared to @a peer_ah by pointer.
 *
 * @param ib_iface    IB interface used to build the AH attributes and to find
 *                    the device.
 * @param ib_addr     Remote IB address to resolve.
 * @param path_index  Path index passed to the AH attribute builder.
 * @param peer_ah     Address handle to compare against.
 *
 * @return Non-zero if the resolved handle equals @a peer_ah; 0 if it differs
 *         or if the handle could not be obtained.
 */
int uct_ib_iface_is_connected(uct_ib_iface_t *ib_iface,
2109+
const uct_ib_address_t *ib_addr,
2110+
unsigned path_index, struct ibv_ah *peer_ah)
2111+
{
2112+
/* path_mtu is only an output slot for the attribute builder; it is not
 * read afterwards in this function */
enum ibv_mtu path_mtu;
2113+
ucs_status_t status;
2114+
struct ibv_ah_attr ah_attr;
2115+
struct ibv_ah *ah;
2116+
2117+
uct_ib_iface_fill_ah_attr_from_addr(ib_iface, ib_addr, path_index,
2118+
&ah_attr, &path_mtu);
2119+
2120+
status = uct_ib_device_get_ah_cached(uct_ib_iface_device(ib_iface),
2121+
&ah_attr, &ah);
2122+
if (status != UCS_OK) {
2123+
/* A failed lookup is reported as "not connected", the error status
 * itself is not propagated */
return 0;
2124+
}
2125+
2126+
/* Pointer identity: NOTE(review) this presumably relies on the cache
 * returning one handle per distinct attribute set - confirm */
return ah == peer_ah;
2127+
}

src/uct/ib/base/ib_iface.h

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,11 @@ unsigned uct_ib_iface_address_pack_flags(uct_ib_iface_t *iface);
494494
size_t uct_ib_iface_address_size(uct_ib_iface_t *iface);
495495

496496

497+
/**
 * Check whether the address handle resolved for a remote IB address matches
 * a given address handle.
 *
 * @param ib_iface    IB interface.
 * @param ib_addr     Remote IB address to resolve.
 * @param path_index  Path index used when building the AH attributes.
 * @param peer_ah     Address handle to compare against.
 *
 * @return Non-zero if the handle resolved from @a ib_addr is @a peer_ah,
 *         0 otherwise (including when the handle cannot be obtained).
 */
int uct_ib_iface_is_connected(uct_ib_iface_t *ib_iface,
498+
const uct_ib_address_t *ib_addr,
499+
unsigned path_index, struct ibv_ah *peer_ah);
500+
501+
497502
/**
498503
* Pack IB address.
499504
*
@@ -660,16 +665,23 @@ uint16_t uct_ib_iface_resolve_remote_flid(uct_ib_iface_t *iface,
660665
uct_ib_iface_is_roce(_iface) ? "RoCE" : "IB"
661666

662667

663-
#define UCT_IB_IFACE_VERBS_COMPLETION_ERR(_type, _iface, _i, _wc) \
664-
ucs_fatal("%s completion[%d] with error on %s/%p: %s, vendor_err 0x%x wr_id 0x%lx", \
665-
_type, _i, uct_ib_device_name(uct_ib_iface_device(_iface)), _iface, \
666-
uct_ib_wc_status_str(_wc[_i].status), _wc[_i].vendor_err, \
667-
_wc[_i].wr_id);
668+
/*
 * Expands to a printf-style format string followed by its arguments,
 * describing the failed completion _wc[_i] on _iface. The expansion is a bare
 * comma-separated list, so it is only meaningful as the trailing arguments of
 * a printf-like call. _iface, _i and _wc are evaluated more than once.
 *
 * NOTE(review): wr_id is printed with %lx; if wr_id is a uint64_t this assumes
 * a 64-bit long - confirm for the supported targets.
 */
#define UCT_IB_IFACE_VERBS_COMPLETION_MSG(_type, _iface, _i, _wc) \
669+
"%s completion[%d] with error on %s/%p: %s," \
670+
" vendor_err 0x%x wr_id 0x%lx", \
671+
_type, _i, uct_ib_device_name(uct_ib_iface_device(_iface)), \
672+
_iface, uct_ib_wc_status_str(_wc[_i].status), _wc[_i].vendor_err, \
673+
_wc[_i].wr_id
674+
675+
/* Report the failed completion _wc[_i] through ucs_log() at _log_lvl */
#define UCT_IB_IFACE_VERBS_COMPLETION_LOG(_log_lvl, _type, _iface, _i, _wc) \
676+
ucs_log(_log_lvl, UCT_IB_IFACE_VERBS_COMPLETION_MSG(_type, _iface, _i, _wc))
677+
678+
/* Report the failed completion _wc[_i] through ucs_fatal() */
#define UCT_IB_IFACE_VERBS_COMPLETION_FATAL(_type, _iface, _i, _wc) \
679+
ucs_fatal(UCT_IB_IFACE_VERBS_COMPLETION_MSG(_type, _iface, _i, _wc))
668680

669681
#define UCT_IB_IFACE_VERBS_FOREACH_RXWQE(_iface, _i, _hdr, _wc, _wc_count) \
670682
for (_i = 0; _i < _wc_count && ({ \
671683
if (ucs_unlikely(_wc[_i].status != IBV_WC_SUCCESS)) { \
672-
UCT_IB_IFACE_VERBS_COMPLETION_ERR("receive", _iface, _i, _wc); \
684+
UCT_IB_IFACE_VERBS_COMPLETION_FATAL("receive", _iface, _i, _wc); \
673685
} \
674686
_hdr = (typeof(_hdr))uct_ib_iface_recv_desc_hdr(_iface, \
675687
(uct_ib_iface_recv_desc_t *)(uintptr_t)_wc[_i].wr_id); \
@@ -765,4 +777,26 @@ uct_ib_fill_cq_attr(struct ibv_cq_init_attr_ex *cq_attr,
765777
}
766778
#endif /* HAVE_DECL_IBV_CREATE_CQ_EX */
767779

780+
/**
 * Translate a verbs work-completion status into a UCS status code.
 *
 * Mapping:
 *  - IBV_WC_SUCCESS                                  -> UCS_OK
 *  - IBV_WC_REM_ACCESS_ERR, IBV_WC_REM_OP_ERR,
 *    IBV_WC_REM_INV_RD_REQ_ERR                       -> UCS_ERR_CONNECTION_RESET
 *  - IBV_WC_RETRY_EXC_ERR, IBV_WC_RNR_RETRY_EXC_ERR,
 *    IBV_WC_REM_ABORT_ERR                            -> UCS_ERR_ENDPOINT_TIMEOUT
 *  - IBV_WC_WR_FLUSH_ERR                             -> UCS_ERR_CANCELED
 *  - any other value                                 -> UCS_ERR_IO_ERROR
 *
 * @param status  Completion status taken from a work completion.
 *
 * @return The UCS status code listed above for @a status.
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
781+
uct_ib_wc_to_ucs_status(enum ibv_wc_status status)
782+
{
783+
switch (status)
784+
{
785+
case IBV_WC_SUCCESS:
786+
return UCS_OK;
787+
/* Errors reported on behalf of the remote side */
case IBV_WC_REM_ACCESS_ERR:
788+
case IBV_WC_REM_OP_ERR:
789+
case IBV_WC_REM_INV_RD_REQ_ERR:
790+
return UCS_ERR_CONNECTION_RESET;
791+
/* Retry-exceeded and remote-abort statuses share the timeout code */
case IBV_WC_RETRY_EXC_ERR:
792+
case IBV_WC_RNR_RETRY_EXC_ERR:
793+
case IBV_WC_REM_ABORT_ERR:
794+
return UCS_ERR_ENDPOINT_TIMEOUT;
795+
case IBV_WC_WR_FLUSH_ERR:
796+
return UCS_ERR_CANCELED;
797+
/* Every status not listed above is reported as a generic I/O error */
default:
798+
return UCS_ERR_IO_ERROR;
799+
}
800+
}
801+
768802
#endif

0 commit comments

Comments
 (0)