From 2d30a368412542644615ca138477c06b2a1895c6 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 15 Sep 2017 18:41:15 +0000 Subject: [PATCH 01/10] util: Add link-local check to net interface Add a check for link-local IPv6 addresses to the net interface to support better computation of network pairings in the weighted reachable component. Signed-off-by: Brian Barrett --- opal/util/net.c | 24 ++++++++++++++++++++++++ opal/util/net.h | 8 ++++++++ 2 files changed, 32 insertions(+) diff --git a/opal/util/net.c b/opal/util/net.c index f8c71e1e4a8..06178e0c89b 100644 --- a/opal/util/net.c +++ b/opal/util/net.c @@ -358,6 +358,30 @@ opal_net_addr_isipv4public(const struct sockaddr *addr) return false; } +bool +opal_net_addr_isipv6linklocal(const struct sockaddr *addr) +{ + struct sockaddr_in6 if_addr; + + switch (addr->sa_family) { +#if OPAL_ENABLE_IPV6 + case AF_INET6: + if_addr.sin6_family = AF_INET6; + if (1 != inet_pton(AF_INET6, "fe80::0000", &if_addr.sin6_addr)) { + return false; + } + return opal_net_samenetwork(addr, (struct sockaddr*)&if_addr, 64); +#endif + case AF_INET: + return false; + default: + opal_output (0, + "unhandled sa_family %d passed to opal_net_addr_isipv6linklocal\n", + addr->sa_family); + } + + return false; +} char* opal_net_get_hostname(const struct sockaddr *addr) diff --git a/opal/util/net.h b/opal/util/net.h index 27dad966625..aff84125730 100644 --- a/opal/util/net.h +++ b/opal/util/net.h @@ -112,6 +112,14 @@ OPAL_DECLSPEC bool opal_net_samenetwork(const struct sockaddr *addr1, */ OPAL_DECLSPEC bool opal_net_addr_isipv4public(const struct sockaddr *addr); +/** + * Is the given address a link-local IPv6 address? Returns false for IPv4 + * address. 
+ * + * @param addr address as struct sockaddr + * @return true, if \c addr is IPv6 link-local, false otherwise + */ +OPAL_DECLSPEC bool opal_net_addr_isipv6linklocal(const struct sockaddr *addr); /** * Get string version of address From 4345208123ba85629ca7fcdb90b4b0a9a943bfe0 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 13 Sep 2017 18:36:42 +0000 Subject: [PATCH 02/10] reachable: Rename code copied from usnic Ralph and Jeff created the reachable framework and added the netlink component based on code copied from the usnic btl. However, they never renamed all the symbols from the libnl compatibility code. This patch finishes the rename. Signed-off-by: Brian Barrett --- opal/mca/reachable/netlink/libnl1_utils.h | 6 +- opal/mca/reachable/netlink/libnl3_utils.h | 6 +- opal/mca/reachable/netlink/libnl_utils.h | 10 ++- .../netlink/reachable_netlink_utils_common.c | 79 +++++++++++-------- 4 files changed, 59 insertions(+), 42 deletions(-) diff --git a/opal/mca/reachable/netlink/libnl1_utils.h b/opal/mca/reachable/netlink/libnl1_utils.h index 6665c587115..8eedf0ac8b6 100644 --- a/opal/mca/reachable/netlink/libnl1_utils.h +++ b/opal/mca/reachable/netlink/libnl1_utils.h @@ -1,5 +1,7 @@ /* * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* * Portions of this software copied from libfabric * (https://github.com/ofiwg/libfabric) @@ -86,12 +88,12 @@ typedef struct nl_handle NL_HANDLE; } \ } while (0) -struct usnic_rt_cb_arg { +struct opal_reachable_netlink_rt_cb_arg { uint32_t nh_addr; int oif; int found; int msg_cnt; - struct usnic_nl_sk *unlsk; + struct opal_reachable_netlink_sk *unlsk; }; #endif /* LIBNL1_UTILS_H */ diff --git a/opal/mca/reachable/netlink/libnl3_utils.h b/opal/mca/reachable/netlink/libnl3_utils.h index ea99c88fc1c..957d4c2a8e4 100644 --- a/opal/mca/reachable/netlink/libnl3_utils.h +++ b/opal/mca/reachable/netlink/libnl3_utils.h @@ -1,5 +1,7 @@ /* * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * * Portions of this software copied from libfabric * (https://github.com/ofiwg/libfabric) @@ -69,12 +71,12 @@ typedef struct nl_sock NL_HANDLE; } \ } while (0) -struct usnic_rt_cb_arg { +struct opal_reachable_netlink_rt_cb_arg { uint32_t nh_addr; int oif; int found; int replied; - struct usnic_nl_sk *unlsk; + struct opal_reachable_netlink_sk *unlsk; }; #endif /* LIBNL3_UTILS_H */ diff --git a/opal/mca/reachable/netlink/libnl_utils.h b/opal/mca/reachable/netlink/libnl_utils.h index 3e3abbcabff..cff70735628 100644 --- a/opal/mca/reachable/netlink/libnl_utils.h +++ b/opal/mca/reachable/netlink/libnl_utils.h @@ -2,6 +2,8 @@ * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* * Portions of this software copied from libfabric * (https://github.com/ofiwg/libfabric) @@ -52,13 +54,13 @@ #include "libnl1_utils.h" #endif -struct usnic_nl_sk { +struct opal_reachable_netlink_sk { NL_HANDLE *nlh; uint32_t seq; }; -int opal_reachable_netlink_nl_rt_lookup(uint32_t src_addr, - uint32_t dst_addr, int oif, - uint32_t *nh_addr); +int opal_reachable_netlink_rt_lookup(uint32_t src_addr, + uint32_t dst_addr, int oif, + uint32_t *nh_addr); #endif /* LIBNL_UTILS_H */ diff --git a/opal/mca/reachable/netlink/reachable_netlink_utils_common.c b/opal/mca/reachable/netlink/reachable_netlink_utils_common.c index 81abe44e204..c71eb3e2e21 100644 --- a/opal/mca/reachable/netlink/reachable_netlink_utils_common.c +++ b/opal/mca/reachable/netlink/reachable_netlink_utils_common.c @@ -1,6 +1,7 @@ /* - * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. - * + * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* Portions of this software copied from libfabric * (https://github.com/ofiwg/libfabric) * @@ -63,8 +64,8 @@ static struct nla_policy route_policy[RTA_MAX+1] = { [RTA_MULTIPATH] = { .type = NLA_NESTED }, }; -static int usnic_is_nlreply_expected(struct usnic_nl_sk *unlsk, - struct nlmsghdr *nlm_hdr) +static int opal_reachable_netlink_is_nlreply_expected(struct opal_reachable_netlink_sk *unlsk, + struct nlmsghdr *nlm_hdr) { #if OPAL_ENABLE_DEBUG if (nlm_hdr->nlmsg_pid != nl_socket_get_local_port(unlsk->nlh) @@ -80,7 +81,7 @@ static int usnic_is_nlreply_expected(struct usnic_nl_sk *unlsk, return 1; } -static int usnic_is_nlreply_err(struct nlmsghdr *nlm_hdr) +static int opal_reachable_netlink_is_nlreply_err(struct nlmsghdr *nlm_hdr) { if (nlm_hdr->nlmsg_type == NLMSG_ERROR) { struct nlmsgerr *e = (struct nlmsgerr *)nlmsg_data(nlm_hdr); @@ -96,9 +97,9 @@ static int usnic_is_nlreply_err(struct nlmsghdr *nlm_hdr) return 0; } -static int usnic_nl_send_query(struct usnic_nl_sk *unlsk, - struct nl_msg *msg, - int protocol, int flag) +static int opal_reachable_netlink_send_query(struct opal_reachable_netlink_sk *unlsk, + struct nl_msg *msg, + int protocol, int flag) { struct nlmsghdr *nlhdr; @@ -111,7 +112,7 @@ static int usnic_nl_send_query(struct usnic_nl_sk *unlsk, return nl_send(unlsk->nlh, msg); } -static int usnic_nl_set_rcvsk_timer(NL_HANDLE *nlh) +static int opal_reachable_netlink_set_rcvsk_timer(NL_HANDLE *nlh) { int err = 0; struct timeval timeout; @@ -129,15 +130,15 @@ static int usnic_nl_set_rcvsk_timer(NL_HANDLE *nlh) return err; } -static int usnic_nl_sk_alloc(struct usnic_nl_sk **p_sk, int protocol) +static int opal_reachable_netlink_sk_alloc(struct opal_reachable_netlink_sk **p_sk, int protocol) { - struct usnic_nl_sk *unlsk; + struct opal_reachable_netlink_sk *unlsk; NL_HANDLE *nlh; int err; unlsk = calloc(1, sizeof(*unlsk)); if (!unlsk) { - opal_output(0, "Failed to allocate usnic_nl_sk struct\n"); + opal_output(0, "Failed to allocate 
opal_reachable_netlink_sk struct\n"); return ENOMEM; } @@ -157,7 +158,7 @@ static int usnic_nl_sk_alloc(struct usnic_nl_sk **p_sk, int protocol) } NL_DISABLE_SEQ_CHECK(nlh); - err = usnic_nl_set_rcvsk_timer(nlh); + err = opal_reachable_netlink_set_rcvsk_timer(nlh); if (err < 0) goto err_close_nlh; @@ -175,17 +176,17 @@ static int usnic_nl_sk_alloc(struct usnic_nl_sk **p_sk, int protocol) return err; } -static void usnic_nl_sk_free(struct usnic_nl_sk *unlsk) +static void opal_reachable_netlink_sk_free(struct opal_reachable_netlink_sk *unlsk) { nl_close(unlsk->nlh); NL_HANDLE_FREE(unlsk->nlh); free(unlsk); } -static int usnic_rt_raw_parse_cb(struct nl_msg *msg, void *arg) +static int opal_reachable_netlink_rt_raw_parse_cb(struct nl_msg *msg, void *arg) { - struct usnic_rt_cb_arg *lookup_arg = (struct usnic_rt_cb_arg *)arg; - struct usnic_nl_sk *unlsk = lookup_arg->unlsk; + struct opal_reachable_netlink_rt_cb_arg *lookup_arg = (struct opal_reachable_netlink_rt_cb_arg *)arg; + struct opal_reachable_netlink_sk *unlsk = lookup_arg->unlsk; struct nlmsghdr *nlm_hdr = nlmsg_hdr(msg); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX + 1]; @@ -194,14 +195,14 @@ static int usnic_rt_raw_parse_cb(struct nl_msg *msg, void *arg) INC_CB_MSGCNT(lookup_arg); - if (!usnic_is_nlreply_expected(unlsk, nlm_hdr)) { + if (!opal_reachable_netlink_is_nlreply_expected(unlsk, nlm_hdr)) { #if OPAL_ENABLE_DEBUG nl_msg_dump(msg, stderr); #endif return NL_SKIP; } - if (usnic_is_nlreply_err(nlm_hdr)) { + if (opal_reachable_netlink_is_nlreply_err(nlm_hdr)) { #if OPAL_ENABLE_DEBUG nl_msg_dump(msg, stderr); #endif @@ -250,31 +251,35 @@ static int usnic_rt_raw_parse_cb(struct nl_msg *msg, void *arg) if (found && tb[RTA_GATEWAY]) lookup_arg->nh_addr = nla_get_u32(tb[RTA_GATEWAY]); - lookup_arg->found = found; return NL_STOP; } -int opal_reachable_netlink_nl_rt_lookup(uint32_t src_addr, - uint32_t dst_addr, int oif, - uint32_t *nh_addr) +int opal_reachable_netlink_rt_lookup(uint32_t src_addr, + uint32_t 
dst_addr, + int outgoing_interface, + uint32_t *nh_addr) { - struct usnic_nl_sk *unlsk; - struct nl_msg *nlm; - struct rtmsg rmsg; - struct usnic_rt_cb_arg arg; - int err; + struct opal_reachable_netlink_sk *unlsk; /* netlink socket */ + struct nl_msg *nlm; /* netlink message */ + struct rtmsg rmsg; /* route message */ + struct opal_reachable_netlink_rt_cb_arg arg; /* callback argument */ + int err; + + /* allocate netlink socket */ unlsk = NULL; - err = usnic_nl_sk_alloc(&unlsk, NETLINK_ROUTE); + err = opal_reachable_netlink_sk_alloc(&unlsk, NETLINK_ROUTE); if (err) return err; + /* allocate route message */ memset(&rmsg, 0, sizeof(rmsg)); rmsg.rtm_family = AF_INET; rmsg.rtm_dst_len = sizeof(dst_addr) * CHAR_BIT; rmsg.rtm_src_len = sizeof(src_addr) * CHAR_BIT; + /* allocate netlink message of type RTM_GETROUTE */ nlm = nlmsg_alloc_simple(RTM_GETROUTE, 0); if (!nlm) { opal_output(0, "Failed to alloc nl message, %s\n", @@ -282,11 +287,14 @@ int opal_reachable_netlink_nl_rt_lookup(uint32_t src_addr, err = ENOMEM; goto out; } + + /* append route message and addresses to netlink message. 
*/ nlmsg_append(nlm, &rmsg, sizeof(rmsg), NLMSG_ALIGNTO); nla_put_u32(nlm, RTA_DST, dst_addr); nla_put_u32(nlm, RTA_SRC, src_addr); - err = usnic_nl_send_query(unlsk, nlm, NETLINK_ROUTE, NLM_F_REQUEST); + /* query kernel */ + err = opal_reachable_netlink_send_query(unlsk, nlm, NETLINK_ROUTE, NLM_F_REQUEST); nlmsg_free(nlm); if (err < 0) { opal_output(0, "Failed to send RTM_GETROUTE query message, error %s\n", @@ -295,11 +303,12 @@ int opal_reachable_netlink_nl_rt_lookup(uint32_t src_addr, goto out; } + /* Setup callback function */ memset(&arg, 0, sizeof(arg)); - arg.oif = oif; - arg.unlsk = unlsk; + arg.oif = outgoing_interface; + arg.unlsk = unlsk; err = nl_socket_modify_cb(unlsk->nlh, NL_CB_MSG_IN, NL_CB_CUSTOM, - usnic_rt_raw_parse_cb, &arg); + opal_reachable_netlink_rt_raw_parse_cb, &arg); if (err != 0) { opal_output(0, "Failed to setup callback function, error %s\n", NL_GETERROR(err)); @@ -307,8 +316,10 @@ int opal_reachable_netlink_nl_rt_lookup(uint32_t src_addr, goto out; } + /* receive results */ NL_RECVMSGS(unlsk->nlh, arg, EHOSTUNREACH, err, out); + /* check whether a route was found */ if (arg.found) { *nh_addr = arg.nh_addr; err = 0; @@ -317,6 +328,6 @@ int opal_reachable_netlink_nl_rt_lookup(uint32_t src_addr, } out: - usnic_nl_sk_free(unlsk); + opal_reachable_netlink_sk_free(unlsk); return err; } From 5a923fe713aa7ce2f6f667bfc50391b44130a293 Mon Sep 17 00:00:00 2001 From: Gabe Saba Date: Wed, 13 Sep 2017 18:43:15 +0000 Subject: [PATCH 03/10] reachable: Initialize / Finalize reachable framework Initialize the reachable framework during opal_init() and tear it back down during opal_finalize(). The framework was never used, so the lack of initialization didn't matter, but this is a required step in using the framework. 
Signed-off-by: Brian Barrett --- opal/runtime/opal_finalize.c | 5 +++++ opal/runtime/opal_init.c | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/opal/runtime/opal_finalize.c b/opal/runtime/opal_finalize.c index a029d6b2932..05c06e0f9c3 100644 --- a/opal/runtime/opal_finalize.c +++ b/opal/runtime/opal_finalize.c @@ -15,6 +15,8 @@ * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,6 +46,7 @@ #include "opal/mca/memchecker/base/base.h" #include "opal/mca/memcpy/base/base.h" #include "opal/mca/backtrace/base/base.h" +#include "opal/mca/reachable/base/base.h" #include "opal/mca/timer/base/base.h" #include "opal/mca/hwloc/base/base.h" #include "opal/mca/event/base/base.h" @@ -135,6 +138,8 @@ opal_finalize(void) (void) mca_base_framework_close(&opal_compress_base_framework); #endif + (void) mca_base_framework_close(&opal_reachable_base_framework); + (void) mca_base_framework_close(&opal_event_base_framework); /* close high resolution timers */ diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c index 03ffa7118d5..67a7ef3ad60 100644 --- a/opal/runtime/opal_init.c +++ b/opal/runtime/opal_init.c @@ -18,6 +18,8 @@ * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -49,6 +51,7 @@ #include "opal/mca/patcher/base/base.h" #include "opal/mca/memcpy/base/base.h" #include "opal/mca/hwloc/base/base.h" +#include "opal/mca/reachable/base/base.h" #include "opal/mca/timer/base/base.h" #include "opal/mca/memchecker/base/base.h" #include "opal/mca/if/base/base.h" @@ -596,6 +599,16 @@ opal_init(int* pargc, char*** pargv) goto return_error; } + /* Load reachable framework */ + if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_reachable_base_framework, 0))){ + error = "opal_reachable_base_framework"; + goto return_error; + } + if (OPAL_SUCCESS != (ret = opal_reachable_base_select())) { + error = "opal_reachable_base_select"; + goto return_error; + } + #if OPAL_ENABLE_FT_CR == 1 /* * Initialize the compression framework From d128fa0d5db40f4b1abc4a66f509c077988d7a2f Mon Sep 17 00:00:00 2001 From: Gabe Saba Date: Wed, 13 Sep 2017 20:21:12 +0000 Subject: [PATCH 04/10] reachable: Enable weighted component / fix interface Based on work from usNIC, the best way to use the reachability information the reachable components return is to build a connectivity graph between the two peers and run a bipartite graph solver. Rather than returning the "best" pairing, the reachability framework now returns the entire mapping, allowing a (soon to be added) graph solver to build the "optimal" connectivity pairing. Practically, this means changing the return type of the reachable() function and rewriting the weighted_reachable() function to return the full mapping. The netlink_reachable() function still always returns NULL. At the same time, fix bit-rot in the weighted component and enable builds of the component by removing the opal_ignore. Also, add IPv6 support to the weighted component to support both use cases in the TCP BTL. 
Signed-off-by: Brian Barrett --- opal/mca/reachable/base/Makefile.am | 3 +- opal/mca/reachable/base/base.h | 4 + .../mca/reachable/base/reachable_base_alloc.c | 66 +++ .../netlink/reachable_netlink_module.c | 4 +- opal/mca/reachable/reachable.h | 50 ++- opal/mca/reachable/weighted/.opal_ignore | 0 .../reachable/weighted/reachable_weighted.c | 384 +++++++++--------- .../reachable/weighted/reachable_weighted.h | 3 + .../weighted/reachable_weighted_component.c | 4 +- 9 files changed, 305 insertions(+), 213 deletions(-) create mode 100644 opal/mca/reachable/base/reachable_base_alloc.c delete mode 100644 opal/mca/reachable/weighted/.opal_ignore diff --git a/opal/mca/reachable/base/Makefile.am b/opal/mca/reachable/base/Makefile.am index 9214aae6814..fb72725e926 100644 --- a/opal/mca/reachable/base/Makefile.am +++ b/opal/mca/reachable/base/Makefile.am @@ -14,4 +14,5 @@ headers += \ libmca_reachable_la_SOURCES += \ base/reachable_base_frame.c \ - base/reachable_base_select.c + base/reachable_base_select.c \ + base/reachable_base_alloc.c diff --git a/opal/mca/reachable/base/base.h b/opal/mca/reachable/base/base.h index ed737e7841d..6ab36d5b62f 100644 --- a/opal/mca/reachable/base/base.h +++ b/opal/mca/reachable/base/base.h @@ -29,6 +29,10 @@ OPAL_DECLSPEC extern mca_base_framework_t opal_reachable_base_framework; */ OPAL_DECLSPEC int opal_reachable_base_select(void); +OPAL_DECLSPEC opal_reachable_t * opal_reachable_allocate(unsigned int num_local, + unsigned int num_remote); + + END_C_DECLS #endif diff --git a/opal/mca/reachable/base/reachable_base_alloc.c b/opal/mca/reachable/base/reachable_base_alloc.c new file mode 100644 index 00000000000..faec53ab20b --- /dev/null +++ b/opal/mca/reachable/base/reachable_base_alloc.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include "opal/class/opal_object.h" + +#include "opal/mca/reachable/reachable.h" +#include "opal/mca/reachable/base/base.h" + + +static void opal_reachable_construct(opal_reachable_t *reachable) +{ + reachable->weights = NULL; +} + + +static void opal_reachable_destruct(opal_reachable_t * reachable) +{ + if (NULL != reachable->memory) { + free(reachable->memory); + } +} + + +opal_reachable_t * opal_reachable_allocate(unsigned int num_local, + unsigned int num_remote) +{ + char *memory; + unsigned int i; + opal_reachable_t *reachable = OBJ_NEW(opal_reachable_t); + + reachable->num_local = num_local; + reachable->num_remote = num_remote; + + /* allocate all the pieces of the two dimensional array in one + malloc, rather than a bunch of little allocations */ + memory = malloc(sizeof(int*) * num_local + + num_local * (sizeof(int) * num_remote)); + if (memory == NULL) return NULL; + + reachable->memory = (void*)memory; + reachable->weights = (int**)reachable->memory; + memory += (sizeof(int*) * num_local); + + for (i = 0; i < num_local; i++) { + reachable->weights[i] = (int*)memory; + memory += (sizeof(int) * num_remote); + } + + return reachable; +} + +OBJ_CLASS_INSTANCE( + opal_reachable_t, + opal_object_t, + opal_reachable_construct, + opal_reachable_destruct +); diff --git a/opal/mca/reachable/netlink/reachable_netlink_module.c b/opal/mca/reachable/netlink/reachable_netlink_module.c index 60c8e075aee..9f01ce90a57 100644 --- a/opal/mca/reachable/netlink/reachable_netlink_module.c +++ b/opal/mca/reachable/netlink/reachable_netlink_module.c @@ -34,8 +34,8 @@ static int netlink_fini(void) return OPAL_SUCCESS; } -static opal_if_t* netlink_reachable(opal_list_t *local_if, - opal_list_t *remote_if) +static opal_reachable_t* netlink_reachable(opal_list_t *local_if, + opal_list_t *remote_if) { /* JMS Fill me in */ return NULL; diff --git 
a/opal/mca/reachable/reachable.h b/opal/mca/reachable/reachable.h index 77630b7d0d5..8f3a4659cb9 100644 --- a/opal/mca/reachable/reachable.h +++ b/opal/mca/reachable/reachable.h @@ -3,6 +3,8 @@ * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -15,12 +17,37 @@ #include "opal_config.h" #include "opal/types.h" +#include "opal/class/opal_object.h" #include "opal/mca/mca.h" #include "opal/mca/if/if.h" + BEGIN_C_DECLS +/** + * Reachability matrix between endpoints of a given pair of hosts + * + * The output of the reachable() call is an opal_reachable_t, which + * gives a matrix of the connectivity between local and remote + * ethernet endpoints. Any given value in weights is the connectivity + * between the local endpoint index (first index) and the remote + * endpoint index (second index), and is a value between 0 and INT_MAX + * representing a relative connectivity. + */ +struct opal_reachable_t { + opal_object_t super; + /** number of local interfaces passed to reachable() */ + int num_local; + /** number of remote interfaces passed to reachable() */ + int num_remote; + /** matrix of connectivity weights */ + int **weights; + /** \internal */ + void *memory; +}; +typedef struct opal_reachable_t opal_reachable_t; +OBJ_CLASS_DECLARATION(opal_reachable_t); /* Init */ typedef int (*opal_reachable_base_module_init_fn_t)(void); @@ -28,20 +55,19 @@ typedef int (*opal_reachable_base_module_init_fn_t)(void); /* Finalize */ typedef int (*opal_reachable_base_module_fini_fn_t)(void); -/* Given a list of local interfaces and a list of remote - * interfaces, return the interface that is the "best" - * for connecting to the remote process. 
+/* Build reachability matrix between local and remote ethernet + * interfaces * - * local_if: list of local opal_if_t interfaces - * remote_if: list of opal_if_t interfaces for the remote - * process + * Given a list of local interfaces and remote interfaces from a + * single peer, build a reachability matrix between the two peers. + * This function does not select the best pairing of local and remote + * interfaces, but only a (comparable) reachability between any pair + * of local/remote interfaces. * - * return value: pointer to opal_if_t on local_if that is - * the "best" option for connecting. NULL - * indicates that the remote process cannot - * be reached on any interface + * @returns a reachable object containing the reachability matrix on + * success, NULL on failure. */ -typedef opal_if_t* +typedef opal_reachable_t* (*opal_reachable_base_module_reachable_fn_t)(opal_list_t *local_if, opal_list_t *remote_if); @@ -65,7 +91,7 @@ typedef struct { /* * Macro for use in components that are of type reachable */ -#define OPAL_REACHABLE_BASE_VERSION_2_0_0 \ +#define OPAL_REACHABLE_BASE_VERSION_2_0_0 \ OPAL_MCA_BASE_VERSION_2_1_0("reachable", 2, 0, 0) /* Global structure for accessing reachability functions */ diff --git a/opal/mca/reachable/weighted/.opal_ignore b/opal/mca/reachable/weighted/.opal_ignore deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/opal/mca/reachable/weighted/reachable_weighted.c b/opal/mca/reachable/weighted/reachable_weighted.c index 6996a61981d..ecd68ac03ec 100644 --- a/opal/mca/reachable/weighted/reachable_weighted.c +++ b/opal/mca/reachable/weighted/reachable_weighted.c @@ -5,6 +5,8 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Mellanox Technologies, Inc. * All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -20,30 +22,41 @@ #ifdef HAVE_UNISTD_H #include #endif +#ifdef HAVE_MATH_H +#include +#endif #include "opal/mca/if/if.h" #include "opal/mca/reachable/base/base.h" #include "reachable_weighted.h" +#include "opal/util/net.h" static int weighted_init(void); static int weighted_fini(void); -static opal_if_t* weighted_reachable(opal_list_t *local_if, - opal_list_t *remote_if); +static opal_reachable_t* weighted_reachable(opal_list_t *local_if, + opal_list_t *remote_if); + +static int get_weights(opal_if_t *local_if, opal_if_t *remote_if); +static int calculate_weight(int bandwidth_local, int bandwidth_remote, + int connection_quality); /* - * describes the quality of a possible connection between a local and - * a remote network interface + * Describes the quality of a possible connection between a local and + * a remote network interface. Highest connection quality is assigned + * to connections between interfaces on same network. This is because + * same network implies a single hop to destination. Public addresses + * are preferred over private addresses. This is all guessing, + * because we don't know actual network topology. 
*/ enum connection_quality { - CQ_NO_CONNECTION, - CQ_PRIVATE_DIFFERENT_NETWORK, - CQ_PRIVATE_SAME_NETWORK, - CQ_PUBLIC_DIFFERENT_NETWORK, - CQ_PUBLIC_SAME_NETWORK + CQ_NO_CONNECTION = 0, + CQ_PRIVATE_DIFFERENT_NETWORK = 50, + CQ_PRIVATE_SAME_NETWORK = 80, + CQ_PUBLIC_DIFFERENT_NETWORK = 90, + CQ_PUBLIC_SAME_NETWORK = 100 }; - const opal_reachable_base_module_t opal_reachable_weighted_module = { weighted_init, weighted_fini, @@ -53,6 +66,7 @@ const opal_reachable_base_module_t opal_reachable_weighted_module = { // local variables static int init_cntr = 0; + static int weighted_init(void) { ++init_cntr; @@ -67,207 +81,183 @@ static int weighted_fini(void) return OPAL_SUCCESS; } -static opal_if_t* weighted_reachable(opal_list_t *local_if, - opal_list_t *remote_if) + +static opal_reachable_t* weighted_reachable(opal_list_t *local_if, + opal_list_t *remote_if) { - size_t perm_size, num_local_interfaces, num_peer_interfaces; - enum connection_quality **weights; - - /* - * assign weights to each possible pair of interfaces - */ - num_local_interfaces = opal_list_get_size(local_if); - num_peer_interfaces = opal_list_get_size(remote_if); - - perm_size = num_local_interfaces; - if (num_peer_interfaces > perm_size) { - perm_size = num_peer_interfaces; + opal_reachable_t *reachable_results = NULL; + int i, j; + opal_if_t *local_iter, *remote_iter; + + reachable_results = opal_reachable_allocate(opal_list_get_size(local_if), + opal_list_get_size(remote_if)); + if (NULL == reachable_results) { + return NULL; } - weights = (enum connection_quality**)malloc(perm_size * sizeof(enum connection_quality*)); + i = 0; + OPAL_LIST_FOREACH(local_iter, local_if, opal_if_t) { + j = 0; + OPAL_LIST_FOREACH(remote_iter, remote_if, opal_if_t) { + reachable_results->weights[i][j] = get_weights(local_iter, remote_iter); + j++; + } + i++; + } - best_addr = (mca_btl_tcp_addr_t ***) malloc(perm_size - * sizeof(mca_btl_tcp_addr_t **)); - for(i = 0; i < perm_size; ++i) { - weights[i] = (enum 
connection_quality*) malloc(perm_size * sizeof(enum connection_quality)); - memset(weights[i], 0, perm_size * sizeof(enum connection_quality)); + return reachable_results; +} - best_addr[i] = (mca_btl_tcp_addr_t **) malloc(perm_size * sizeof(mca_btl_tcp_addr_t *)); - memset(best_addr[i], 0, perm_size * sizeof(mca_btl_tcp_addr_t *)); - } - for(i=0; iipv4_address && - NULL != peer_interfaces[j]->ipv4_address) { - - /* check for loopback */ - if ((opal_net_islocalhost((struct sockaddr *)local_interfaces[i]->ipv4_address) - && !opal_net_islocalhost((struct sockaddr *)peer_interfaces[j]->ipv4_address)) - || (opal_net_islocalhost((struct sockaddr *)peer_interfaces[j]->ipv4_address) - && !opal_net_islocalhost((struct sockaddr *)local_interfaces[i]->ipv4_address)) - || (opal_net_islocalhost((struct sockaddr *)local_interfaces[i]->ipv4_address) - && !opal_ifislocal(proc_hostname))) { - - /* No connection is possible on these interfaces */ - - /* check for RFC1918 */ - } else if(opal_net_addr_isipv4public((struct sockaddr*) local_interfaces[i]->ipv4_address) - && opal_net_addr_isipv4public((struct sockaddr*) - peer_interfaces[j]->ipv4_address)) { - if(opal_net_samenetwork((struct sockaddr*) local_interfaces[i]->ipv4_address, - (struct sockaddr*) peer_interfaces[j]->ipv4_address, - local_interfaces[i]->ipv4_netmask)) { - weights[i][j] = CQ_PUBLIC_SAME_NETWORK; - } else { - weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK; - } - best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr; - continue; - } else { - if(opal_net_samenetwork((struct sockaddr*) local_interfaces[i]->ipv4_address, - (struct sockaddr*) peer_interfaces[j]->ipv4_address, - local_interfaces[i]->ipv4_netmask)) { - weights[i][j] = CQ_PRIVATE_SAME_NETWORK; - } else { - weights[i][j] = CQ_PRIVATE_DIFFERENT_NETWORK; - } - best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr; - } +static int get_weights(opal_if_t *local_if, opal_if_t *remote_if) +{ + char str_local[128], str_remote[128], *conn_type; + struct 
sockaddr *local_sockaddr, *remote_sockaddr; + int weight; + + local_sockaddr = (struct sockaddr *)&local_if->if_addr; + remote_sockaddr = (struct sockaddr *)&remote_if->if_addr; + + /* opal_net_get_hostname returns a static buffer. Great for + single address printfs, need to copy in this case */ + strncpy(str_local, opal_net_get_hostname(local_sockaddr), sizeof(str_local)); + strncpy(str_remote, opal_net_get_hostname(remote_sockaddr), sizeof(str_remote)); + + /* initially, assume no connection is possible */ + weight = calculate_weight(0, 0, CQ_NO_CONNECTION); + + if (AF_INET == local_sockaddr->sa_family && + AF_INET == remote_sockaddr->sa_family) { + + if (opal_net_addr_isipv4public(local_sockaddr) && + opal_net_addr_isipv4public(remote_sockaddr)) { + if (opal_net_samenetwork(local_sockaddr, + remote_sockaddr, + local_if->if_mask)) { + conn_type = "IPv4 PUBLIC SAME NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_PUBLIC_SAME_NETWORK); + } else { + conn_type = "IPv4 PUBLIC DIFFERENT NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_PUBLIC_DIFFERENT_NETWORK); } - - /* check state of ipv6 address pair - ipv6 is always public, - * since link-local addresses are skipped in opal_ifinit() - */ - if(NULL != local_interfaces[i]->ipv6_address && - NULL != peer_interfaces[j]->ipv6_address) { - - /* check for loopback */ - if ((opal_net_islocalhost((struct sockaddr *)local_interfaces[i]->ipv6_address) - && !opal_net_islocalhost((struct sockaddr *)peer_interfaces[j]->ipv6_address)) - || (opal_net_islocalhost((struct sockaddr *)peer_interfaces[j]->ipv6_address) - && !opal_net_islocalhost((struct sockaddr *)local_interfaces[i]->ipv6_address)) - || (opal_net_islocalhost((struct sockaddr *)local_interfaces[i]->ipv6_address) - && !opal_ifislocal(proc_hostname))) { - - /* No connection is possible on these interfaces */ - - } else if(opal_net_samenetwork((struct sockaddr*) 
local_interfaces[i]->ipv6_address, - (struct sockaddr*) peer_interfaces[j]->ipv6_address, - local_interfaces[i]->ipv6_netmask)) { - weights[i][j] = CQ_PUBLIC_SAME_NETWORK; - } else { - weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK; - } - best_addr[i][j] = peer_interfaces[j]->ipv6_endpoint_addr; + } else if (!opal_net_addr_isipv4public(local_sockaddr) && + !opal_net_addr_isipv4public(remote_sockaddr)) { + if (opal_net_samenetwork(local_sockaddr, + remote_sockaddr, + local_if->if_mask)) { + conn_type = "IPv4 PRIVATE SAME NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_PRIVATE_SAME_NETWORK); + } else { + conn_type = "IPv4 PRIVATE DIFFERENT NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_PRIVATE_DIFFERENT_NETWORK); } + } else { + /* one private, one public address. likely not a match. */ + conn_type = "IPv4 NO CONNECTION"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_NO_CONNECTION); + } - } /* for each peer interface */ - } /* for each local interface */ - - /* - * determine the size of the set to permute (max number of - * interfaces - */ - - best_assignment = (unsigned int *) malloc (perm_size * sizeof(int)); - - a = (int *) malloc(perm_size * sizeof(int)); - if (NULL == a) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Can only find the best set of connections when the number of - * interfaces is not too big. When it gets larger, we fall back - * to a simpler and faster (and not as optimal) algorithm. - * See ticket https://svn.open-mpi.org/trac/ompi/ticket/2031 - * for more details about this issue. 
*/ - if (perm_size <= MAX_PERMUTATION_INTERFACES) { - memset(a, 0, perm_size * sizeof(int)); - max_assignment_cardinality = -1; - max_assignment_weight = -1; - visit(0, -1, perm_size, a); - - rc = OPAL_ERR_UNREACH; - for(i = 0; i < perm_size; ++i) { - if(best_assignment[i] > num_peer_interfaces - || weights[i][best_assignment[i]] == CQ_NO_CONNECTION - || peer_interfaces[best_assignment[i]]->inuse - || NULL == peer_interfaces[best_assignment[i]]) { - continue; +#if OPAL_ENABLE_IPV6 + } else if (AF_INET6 == local_sockaddr->sa_family && + AF_INET6 == remote_sockaddr->sa_family) { + if (opal_net_addr_isipv6linklocal(local_sockaddr) && + opal_net_addr_isipv6linklocal(remote_sockaddr)) { + /* we can't actually tell if link local addresses are on + * the same network or not with the weighted component. + * Assume they are on the same network, so that they'll be + * most likely to be paired together, breaking the fewest + * number of connections. + * + * There used to be a comment in this code (and one in the + * BTL TCP code as well) that the opal_if code doesn't + * pass link-local addresses through. However, this is + * demonstrably not true on Linux, where link-local + * interfaces are created. Since it's easy to handle + * either case, do so. 
+ */ + conn_type = "IPv6 LINK-LOCAL SAME NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_PRIVATE_SAME_NETWORK); + } else if (!opal_net_addr_isipv6linklocal(local_sockaddr) && + !opal_net_addr_isipv6linklocal(remote_sockaddr)) { + if (opal_net_samenetwork(local_sockaddr, + remote_sockaddr, + local_if->if_mask)) { + conn_type = "IPv6 PUBLIC SAME NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_PUBLIC_SAME_NETWORK); + } else { + conn_type = "IPv6 PUBLIC DIFFERENT NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_PUBLIC_DIFFERENT_NETWORK); } - peer_interfaces[best_assignment[i]]->inuse++; - btl_endpoint->endpoint_addr = best_addr[i][best_assignment[i]]; - btl_endpoint->endpoint_addr->addr_inuse++; - rc = OPAL_SUCCESS; - break; + } else { + /* one link-local, one public address. likely not a match. */ + conn_type = "IPv6 NO CONNECTION"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_NO_CONNECTION); } +#endif /* #if OPAL_ENABLE_IPV6 */ + } else { - enum mca_btl_tcp_connection_quality max; - int i_max = 0, j_max = 0; - /* Find the best connection that is not in use. Save away - * the indices of the best location. */ - max = CQ_NO_CONNECTION; - for(i=0; iinuse) { - if (weights[i][j] > max) { - max = weights[i][j]; - i_max = i; - j_max = j; - } - } - } - } - /* Now see if there is a some type of connection available. 
*/ - rc = OPAL_ERR_UNREACH; - if (CQ_NO_CONNECTION != max) { - peer_interfaces[j_max]->inuse++; - btl_endpoint->endpoint_addr = best_addr[i_max][j_max]; - btl_endpoint->endpoint_addr->addr_inuse++; - rc = OPAL_SUCCESS; - } + /* we don't have an address family match, so assume no + connection */ + conn_type = "Address type mismatch"; + weight = calculate_weight(0, 0, CQ_NO_CONNECTION); } - for(i = 0; i < perm_size; ++i) { - free(weights[i]); - free(best_addr[i]); - } + opal_output_verbose(20, opal_reachable_base_framework.framework_output, + "reachable:weighted: path from %s to %s: %s", + str_local, str_remote, conn_type); - for(i = 0; i < num_peer_interfaces; ++i) { - if(NULL != peer_interfaces[i]->ipv4_address) { - free(peer_interfaces[i]->ipv4_address); - } - if(NULL != peer_interfaces[i]->ipv6_address) { - free(peer_interfaces[i]->ipv6_address); - } - free(peer_interfaces[i]); - } - free(peer_interfaces); - peer_interfaces = NULL; - max_peer_interfaces = 0; + return weight; +} - for(i = 0; i < num_local_interfaces; ++i) { - if(NULL != local_interfaces[i]->ipv4_address) { - free(local_interfaces[i]->ipv4_address); - } - if(NULL != local_interfaces[i]->ipv6_address) { - free(local_interfaces[i]->ipv6_address); - } - free(local_interfaces[i]); - } - free(local_interfaces); - local_interfaces = NULL; - max_local_interfaces = 0; - - free(weights); - free(best_addr); - free(best_assignment); - free(a); - return false; + +/* + * Weights determined by bandwidth between + * interfaces (limited by lower bandwidth + * interface). A penalty is added to minimize + * the discrepancy in bandwidth. This helps + * prevent pairing of fast and slow interfaces + * + * Formula: connection_quality * (min(a,b) + 1/(1 + |a-b|)) + * + * Examples: a b f(a,b) + * 0 0 1 + * 0 1 0.5 + * 1 1 2 + * 1 2 1.5 + * 1 3 1.33 + * 1 10 1.1 + * 10 10 11 + * 10 14 10.2 + * 11 14 11.25 + * 11 15 11.2 + * + * NOTE: connection_quality of 1 is assumed for examples. 
+ * In reality, since we're using integers, we need + * connection_quality to be large enough + * to capture decimals + */ +static int calculate_weight(int bandwidth_local, int bandwidth_remote, + int connection_quality) +{ + int weight = connection_quality * (MIN(bandwidth_local, bandwidth_remote) + + 1.0 / (1.0 + (double)abs(bandwidth_local - bandwidth_remote))); + return weight; } diff --git a/opal/mca/reachable/weighted/reachable_weighted.h b/opal/mca/reachable/weighted/reachable_weighted.h index 04113b10468..6a0220c2ca0 100644 --- a/opal/mca/reachable/weighted/reachable_weighted.h +++ b/opal/mca/reachable/weighted/reachable_weighted.h @@ -1,5 +1,7 @@ /* * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -19,6 +21,7 @@ #include #endif +#include "opal/mca/reachable/reachable.h" #include "opal/mca/mca.h" #include "opal/mca/event/event.h" #include "opal/util/proc.h" diff --git a/opal/mca/reachable/weighted/reachable_weighted_component.c b/opal/mca/reachable/weighted/reachable_weighted_component.c index fbbd27308e8..6e8098b7698 100644 --- a/opal/mca/reachable/weighted/reachable_weighted_component.c +++ b/opal/mca/reachable/weighted/reachable_weighted_component.c @@ -5,6 +5,8 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -71,7 +73,7 @@ opal_reachable_weighted_component_t mca_reachable_weighted_component = { .mca_register_component_params = component_register, }, /* Next the MCA v1.0.0 component meta data */ - .base_version = { + .base_data = { /* The component is checkpoint ready */ MCA_BASE_METADATA_PARAM_CHECKPOINT }, From ef0f0ae25c75128318e70fd143a2ee35b993e1d5 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 15 Sep 2017 04:32:51 +0000 Subject: [PATCH 05/10] reachable: remove libnl-1 support from netlink The netlink reachable component has never been released in a usable form, but had code copied from usNIC to support both libnl-1 and libnl-3. If nothing else, this code was a little buggy in handling the case where libnl-3 but not libnl-route-3 were installed. Jeff and I decided to drop libnl-1 support from the netlink reachable component, given that it's getting pretty old and the weighted component provides the same information that the TCP BTL and OOB are using today, so libnl-1 customers won't see a step backwards from where they are today. 
Signed-off-by: Brian Barrett --- opal/mca/reachable/netlink/Makefile.am | 1 - opal/mca/reachable/netlink/configure.m4 | 101 ++++------------------ opal/mca/reachable/netlink/libnl1_utils.h | 99 --------------------- opal/mca/reachable/netlink/libnl_utils.h | 6 -- 4 files changed, 18 insertions(+), 189 deletions(-) delete mode 100644 opal/mca/reachable/netlink/libnl1_utils.h diff --git a/opal/mca/reachable/netlink/Makefile.am b/opal/mca/reachable/netlink/Makefile.am index 02d7cb28003..77b96f729fe 100644 --- a/opal/mca/reachable/netlink/Makefile.am +++ b/opal/mca/reachable/netlink/Makefile.am @@ -14,7 +14,6 @@ sources = \ reachable_netlink.h \ reachable_netlink_component.c \ reachable_netlink_module.c \ - libnl1_utils.h \ libnl3_utils.h \ libnl_utils.h \ reachable_netlink_utils_common.c diff --git a/opal/mca/reachable/netlink/configure.m4 b/opal/mca/reachable/netlink/configure.m4 index 163095659e3..15a996a4e97 100644 --- a/opal/mca/reachable/netlink/configure.m4 +++ b/opal/mca/reachable/netlink/configure.m4 @@ -3,6 +3,8 @@ # Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2015-2016 Research Organization for Information Science # and Technology (RIST). All rights reserved. +# Copyright (c) 2017 Amazon.com, Inc. or its affiliates. +# All Rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -10,87 +12,6 @@ # $HEADER$ # -dnl -dnl Portions of this software copied from libfabric -dnl (https://github.com/ofiwg/libfabric) -dnl - -dnl BSD license -dnl -dnl Redistribution and use in source and binary forms, with or without -dnl modification, are permitted provided that the following conditions -dnl are met: -dnl -dnl * Redistributions of source code must retain the above copyright -dnl notice, this list of conditions and the following disclaimer. 
-dnl -dnl * Redistributions in binary form must reproduce the above -dnl copyright notice, this list of conditions and the following -dnl disclaimer in the documentation and/or other materials provided -dnl with the distribution. -dnl -dnl THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -dnl "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -dnl LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -dnl FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -dnl COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -dnl INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -dnl BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -dnl LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -dnl CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -dnl LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -dnl ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -dnl POSSIBILITY OF SUCH DAMAGE. - -dnl Check for libnl; prefer version 3 instead of version 1. Abort (i.e., -dnl AC_MSG_ERROR) if neither libnl v1 or v3 can be found. -dnl -dnl Outputs: -dnl -dnl - Set $1 to the CPPFLAGS necessary to compile with libnl -dnl - Set $2 to the LIBS necessary to link with libnl -dnl - If $3 is 1, AC_MSG_ERROR (i.e., abort) if neither libnl or -dnl libnl3 can be found -dnl - Set OPAL_HAVE_LIBNL3 to 1 if libnl v3 will be used; 0 if libnl v1 will be used -dnl - AC_SUBST $OPAL_HAVE_LIBNL3 -dnl - AC_DEFINE OPAL_HAVE_LIBNL3 -dnl -dnl -------------------------------------------------------- -AC_DEFUN([OPAL_REACHABLE_NETLINK_CHECK_LIBNL_Vx],[ - - # Default to a numeric value (this value gets AC_DEFINEd) - OPAL_HAVE_LIBNL3=0 - - ################################################### - # NOTE: We *must* check for libnl3 before libnl. 
- ################################################### - - AS_IF([test $opal_libnl_version -ne 1], - [OPAL_CHECK_LIBNL_V3([$opal_libnl_location], [opal_reachable_netlink])]) - AS_IF([test $opal_libnl_version -ne 3 && - test -z "$opal_reachable_netlink_LIBS"], - [OPAL_CHECK_LIBNL_V1([$opal_libnl_location], [opal_reachable_netlink])]) - - AS_IF([test "$opal_want_libnl" = "yes" && - test "$opal_reachable_netlink_LIBS" = ""], - [AC_MSG_WARN([--with-libnl specified, but not found]) - AC_MSG_ERROR([Cannot continue])]) - - # Final result - AC_SUBST([OPAL_HAVE_LIBNL3]) - AC_DEFINE_UNQUOTED([OPAL_HAVE_LIBNL3], [$OPAL_HAVE_LIBNL3], - [Whether we have libl v1 or libnl v3]) - - AC_SUBST([opal_reachable_netlink_CPPFLAGS]) - AC_SUBST([opal_reachable_netlink_LDFLAGS]) - AC_SUBST([opal_reachable_netlink_LIBS]) - - AS_IF([test "$opal_reachable_netlink_LIBS" = ""], - [opal_reachable_netlink_happy=0]) -]) - -dnl ============================================================== - # MCA_opal_reachable_netlink_CONFIG([action-if-can-compile], # [action-if-cant-compile]) # ------------------------------------------------ @@ -106,12 +27,26 @@ AC_DEFUN([MCA_opal_reachable_netlink_CONFIG],[ #include ]) - AS_IF([test $opal_reachable_netlink_happy -eq 1], - [OPAL_REACHABLE_NETLINK_CHECK_LIBNL_Vx]) + # this is terrible, but libnl-1 and libnl-3 are incompatible in + # weird ways, and once there are libraries in LIBS for one, the + # other is hard to get right. So if someone has already decided + # we have libnl version 1, get out. Otherwise, see if we have + # libnl-3, which is the only version supported by the netlink + # component. 
+ AS_IF([test $opal_libnl_version -eq 1], + [opal_reachable_netlink_happy=0], + [OPAL_CHECK_LIBNL_V3([$opal_libnl_location], + [opal_reachable_netlink]) + AS_IF([test "$OPAL_HAVE_LIBNL3" != "1"], + [opal_reachable_netlink_happy=0])]) AS_IF([test $opal_reachable_netlink_happy -eq 1], [$1], [$2]) + AC_SUBST([opal_reachable_netlink_CPPFLAGS]) + AC_SUBST([opal_reachable_netlink_LDFLAGS]) + AC_SUBST([opal_reachable_netlink_LIBS]) + OPAL_VAR_SCOPE_POP() ]) diff --git a/opal/mca/reachable/netlink/libnl1_utils.h b/opal/mca/reachable/netlink/libnl1_utils.h deleted file mode 100644 index 8eedf0ac8b6..00000000000 --- a/opal/mca/reachable/netlink/libnl1_utils.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. - * All Rights reserved. - * - * Portions of this software copied from libfabric - * (https://github.com/ofiwg/libfabric) - * - * LICENSE_BEGIN - * - * BSD license: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * LICENSE_END - * - * - */ - -#ifndef LIBNL1_UTILS_H -#define LIBNL1_UTILS_H - -#include -#include -#include -#include -#include - -typedef struct nl_handle NL_HANDLE; - -#define NLMSG_SIZE(size) nlmsg_msg_size(size) -#define NL_GETERROR(err) nl_geterror() -#define NL_HANDLE_ALLOC nl_handle_alloc -#define NL_HANDLE_FREE nl_handle_destroy -#define NL_DISABLE_SEQ_CHECK nl_disable_sequence_check -#define INC_CB_MSGCNT(arg) \ - do { \ - arg->msg_cnt++; \ - } while (0) - -/* - * the return value of nl_recvmsgs_default does not tell - * whether it returns because of successful read or socket - * timeout. This is a limitation in libnl1. So we compare - * message count before and after the call to decide if there - * is no new message arriving. In this case, this function - * needs to terminate to prevent the caller from - * blocking forever. 
- * NL_CB_MSG_IN traps every received message, so - * there should be no premature exit - */ -#define NL_RECVMSGS(nlh, cb_arg, rc, err, out) \ - do { \ - int msg_cnt = cb_arg.msg_cnt; \ - err = nl_recvmsgs_default(nlh); \ - if (err < 0) { \ - opal_output(0, "Failed to receive netlink reply message, error %s\n", \ - NL_GETERROR(err)); \ - goto out; \ - } \ - if (msg_cnt == cb_arg.msg_cnt) {\ - err = rc; \ - goto out; \ - } \ - } while (0) - -struct opal_reachable_netlink_rt_cb_arg { - uint32_t nh_addr; - int oif; - int found; - int msg_cnt; - struct opal_reachable_netlink_sk *unlsk; -}; - -#endif /* LIBNL1_UTILS_H */ diff --git a/opal/mca/reachable/netlink/libnl_utils.h b/opal/mca/reachable/netlink/libnl_utils.h index cff70735628..d5177269d07 100644 --- a/opal/mca/reachable/netlink/libnl_utils.h +++ b/opal/mca/reachable/netlink/libnl_utils.h @@ -46,13 +46,7 @@ #ifndef LIBNL_UTILS_H #define LIBNL_UTILS_H -#if !defined (OPAL_HAVE_LIBNL3) -#error You must define OPAL_HAVE_LIBNL3 to 0 or 1 before including libnl_utils.h -#elif OPAL_HAVE_LIBNL3 #include "libnl3_utils.h" -#else -#include "libnl1_utils.h" -#endif struct opal_reachable_netlink_sk { NL_HANDLE *nlh; From 8fd57b69239eab9d1a8b90aec0eceea24e175fd7 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 15 Sep 2017 04:42:45 +0000 Subject: [PATCH 06/10] reachable: Simplify gateway check in netlink The netlink component's libnl wrapper code returned the next hop in the route table to allow the calling code to differentiate between same and different networks, which is a fine comparison for IPv4, but is pretty expensive for IPv6 (coming soon to a netlink component near you). Rather than provide extra information (the address of the next hop), just provide whether there is a gateway or not, which is all the netlink component actually needs. 
Signed-off-by: Brian Barrett --- opal/mca/reachable/netlink/libnl3_utils.h | 2 +- opal/mca/reachable/netlink/libnl_utils.h | 8 +++++++- .../netlink/reachable_netlink_utils_common.c | 11 ++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/opal/mca/reachable/netlink/libnl3_utils.h b/opal/mca/reachable/netlink/libnl3_utils.h index 957d4c2a8e4..3668685824f 100644 --- a/opal/mca/reachable/netlink/libnl3_utils.h +++ b/opal/mca/reachable/netlink/libnl3_utils.h @@ -72,9 +72,9 @@ typedef struct nl_sock NL_HANDLE; } while (0) struct opal_reachable_netlink_rt_cb_arg { - uint32_t nh_addr; int oif; int found; + int has_gateway; int replied; struct opal_reachable_netlink_sk *unlsk; }; diff --git a/opal/mca/reachable/netlink/libnl_utils.h b/opal/mca/reachable/netlink/libnl_utils.h index d5177269d07..a0a1c30dce0 100644 --- a/opal/mca/reachable/netlink/libnl_utils.h +++ b/opal/mca/reachable/netlink/libnl_utils.h @@ -53,8 +53,14 @@ struct opal_reachable_netlink_sk { uint32_t seq; }; +/* returns 0 if host is reachable, EHOSTUNREACH if the host + * is not reachable, non-zero on other errors. + * + * If the route to the destination is through a gateway, *has_gateway + * is set to 1. Otherwise, it is set to 0. 
+ */ int opal_reachable_netlink_rt_lookup(uint32_t src_addr, uint32_t dst_addr, int oif, - uint32_t *nh_addr); + int *has_gateway); #endif /* LIBNL_UTILS_H */ diff --git a/opal/mca/reachable/netlink/reachable_netlink_utils_common.c b/opal/mca/reachable/netlink/reachable_netlink_utils_common.c index c71eb3e2e21..7e50167a2c4 100644 --- a/opal/mca/reachable/netlink/reachable_netlink_utils_common.c +++ b/opal/mca/reachable/netlink/reachable_netlink_utils_common.c @@ -249,8 +249,9 @@ static int opal_reachable_netlink_rt_raw_parse_cb(struct nl_msg *msg, void *arg) lookup_arg->oif); } - if (found && tb[RTA_GATEWAY]) - lookup_arg->nh_addr = nla_get_u32(tb[RTA_GATEWAY]); + if (found && tb[RTA_GATEWAY]) { + lookup_arg->has_gateway = 1; + } lookup_arg->found = found; return NL_STOP; } @@ -258,9 +259,8 @@ static int opal_reachable_netlink_rt_raw_parse_cb(struct nl_msg *msg, void *arg) int opal_reachable_netlink_rt_lookup(uint32_t src_addr, uint32_t dst_addr, int outgoing_interface, - uint32_t *nh_addr) + int *has_gateway) { - struct opal_reachable_netlink_sk *unlsk; /* netlink socket */ struct nl_msg *nlm; /* netlink message */ struct rtmsg rmsg; /* route message */ @@ -321,9 +321,10 @@ int opal_reachable_netlink_rt_lookup(uint32_t src_addr, /* check whether a route was found */ if (arg.found) { - *nh_addr = arg.nh_addr; + *has_gateway = arg.has_gateway; err = 0; } else { + *has_gateway = 0; err = EHOSTUNREACH; } From 71aaa2b528e6f519310ce203cefa62a81e3180ef Mon Sep 17 00:00:00 2001 From: Gabe Saba Date: Wed, 13 Sep 2017 20:45:22 +0000 Subject: [PATCH 07/10] reachable: Add IPv6 support to libnl code Add IPv6 support to the netlink component's utility wrappers around libnl-3. 
Signed-off-by: Brian Barrett --- opal/mca/reachable/netlink/libnl_utils.h | 12 +++ .../netlink/reachable_netlink_utils_common.c | 90 ++++++++++++++++++- 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/opal/mca/reachable/netlink/libnl_utils.h b/opal/mca/reachable/netlink/libnl_utils.h index a0a1c30dce0..6a7c7cc5538 100644 --- a/opal/mca/reachable/netlink/libnl_utils.h +++ b/opal/mca/reachable/netlink/libnl_utils.h @@ -63,4 +63,16 @@ int opal_reachable_netlink_rt_lookup(uint32_t src_addr, uint32_t dst_addr, int oif, int *has_gateway); +#if OPAL_ENABLE_IPV6 +/* returns 0 if host is reachable, EHOSTUNREACH if the host + * is not reachable, non-zero on other errors. + * + * If the route to the destination is through a gateway, *has_gateway + * is set to 1. Otherwise, it is set to 0. + */ +int opal_reachable_netlink_rt_lookup6(struct in6_addr *src_addr, + struct in6_addr *dst_addr, int oif, + int *has_gateway); +#endif + #endif /* LIBNL_UTILS_H */ diff --git a/opal/mca/reachable/netlink/reachable_netlink_utils_common.c b/opal/mca/reachable/netlink/reachable_netlink_utils_common.c index 7e50167a2c4..c154c49978a 100644 --- a/opal/mca/reachable/netlink/reachable_netlink_utils_common.c +++ b/opal/mca/reachable/netlink/reachable_netlink_utils_common.c @@ -45,6 +45,9 @@ #include #include #include +#ifdef HAVE_NETINET_IN_H +#include +#endif #include "libnl_utils.h" @@ -221,7 +224,11 @@ static int opal_reachable_netlink_rt_raw_parse_cb(struct nl_msg *msg, void *arg) } rtm = nlmsg_data(nlm_hdr); - if (rtm->rtm_family != AF_INET) { + if (rtm->rtm_family != AF_INET +#if OPAL_ENABLE_IPV6 + && rtm->rtm_family != AF_INET6 +#endif + ) { #if OPAL_ENABLE_DEBUG opal_output(0, "RTM message contains invalid AF family: %u\n", rtm->rtm_family); @@ -332,3 +339,84 @@ int opal_reachable_netlink_rt_lookup(uint32_t src_addr, opal_reachable_netlink_sk_free(unlsk); return err; } + + +#if OPAL_ENABLE_IPV6 +int opal_reachable_netlink_rt_lookup6(struct in6_addr *src_addr, + struct 
in6_addr *dst_addr, + int outgoing_interface, + int *has_gateway) +{ + + struct opal_reachable_netlink_sk *unlsk; /* netlink socket */ + struct nl_msg *nlm; /* netlink message */ + struct rtmsg rmsg; /* route message */ + struct opal_reachable_netlink_rt_cb_arg arg; /* callback argument */ + int err; + + /* allocate netlink socket */ + unlsk = NULL; + err = opal_reachable_netlink_sk_alloc(&unlsk, NETLINK_ROUTE); + if (err) + return err; + + /* allocate route message */ + memset(&rmsg, 0, sizeof(rmsg)); + rmsg.rtm_family = AF_INET6; + rmsg.rtm_dst_len = sizeof(*dst_addr) * CHAR_BIT; + rmsg.rtm_src_len = sizeof(*src_addr) * CHAR_BIT; + + /* allocate netlink message of type RTM_GETROUTE */ + nlm = nlmsg_alloc_simple(RTM_GETROUTE, 0); + if (!nlm) { + opal_output(0, "Failed to alloc nl message, %s\n", + NL_GETERROR(err)); + err = ENOMEM; + goto out; + } + + /* append route message and addresses to netlink message. */ + nlmsg_append(nlm, &rmsg, sizeof(rmsg), NLMSG_ALIGNTO); + nla_put(nlm, RTA_DST, sizeof(dst_addr->s6_addr), &(dst_addr->s6_addr)); + nla_put(nlm, RTA_SRC, sizeof(src_addr->s6_addr), &(src_addr->s6_addr)); + + /* query kernel */ + err = opal_reachable_netlink_send_query(unlsk, nlm, NETLINK_ROUTE, NLM_F_REQUEST); + nlmsg_free(nlm); + if (err < 0) { + opal_output(0, "Failed to send RTM_GETROUTE query message, error %s\n", + NL_GETERROR(err)); + err = EINVAL; + goto out; + } + + /* Setup callback function */ + memset(&arg, 0, sizeof(arg)); + arg.oif = outgoing_interface; + arg.unlsk = unlsk; + err = nl_socket_modify_cb(unlsk->nlh, NL_CB_MSG_IN, NL_CB_CUSTOM, + opal_reachable_netlink_rt_raw_parse_cb, &arg); + if (err != 0) { + opal_output(0, "Failed to setup callback function, error %s\n", + NL_GETERROR(err)); + err = EINVAL; + goto out; + } + + /* receive results */ + NL_RECVMSGS(unlsk->nlh, arg, EHOSTUNREACH, err, out); + + /* check whether a route was found */ + if (arg.found) { + *has_gateway = arg.has_gateway; + err = 0; + } else { + *has_gateway = 0; + err 
= EHOSTUNREACH; + } + + out: + opal_reachable_netlink_sk_free(unlsk); + return err; +} +#endif /* #if OPAL_ENABLE_IPV6 */ From fc00cf3c63813d5d869d07476a34f70e09f8ac8f Mon Sep 17 00:00:00 2001 From: Gabe Saba Date: Wed, 13 Sep 2017 21:12:31 +0000 Subject: [PATCH 08/10] reachable: Implement netlink component Wire up the libnl utilities Jeff and Ralph added previously to the netlink reachable component so that it actually does work. The algorithm is a bit simplistic, but should work for our use cases. If there's a route, assume the two interfaces can talk. If there's no gateway, assume the two interfaces are in the same subnet, and give preference to that connection. If there's a gateway, assume there's a route, but the interfaces are not in the same subnet. Signed-off-by: Brian Barrett --- .../netlink/reachable_netlink_module.c | 174 +++++++++++++++++- .../netlink/reachable_netlink_utils_common.c | 17 +- 2 files changed, 182 insertions(+), 9 deletions(-) diff --git a/opal/mca/reachable/netlink/reachable_netlink_module.c b/opal/mca/reachable/netlink/reachable_netlink_module.c index 9f01ce90a57..3bb82049a8c 100644 --- a/opal/mca/reachable/netlink/reachable_netlink_module.c +++ b/opal/mca/reachable/netlink/reachable_netlink_module.c @@ -2,6 +2,8 @@ /* * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2015 Cisco Systems. All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -13,12 +15,27 @@ #include "opal/constants.h" #include "opal/types.h" +#ifdef HAVE_MATH_H +#include +#endif + +#include "opal/util/net.h" #include "opal/mca/reachable/base/base.h" #include "reachable_netlink.h" +#include "libnl_utils.h" + +enum connection_quality { + CQ_NO_CONNECTION = 0, + CQ_DIFFERENT_NETWORK = 50, + CQ_SAME_NETWORK = 100 +}; /* Local variables */ static int init_counter = 0; +static int get_weights(opal_if_t *local_if, opal_if_t *remote_if); +static int calculate_weight(int bandwidth_local, int bandwidth_remote, + int connection_quality); static int netlink_init(void) { @@ -34,15 +51,168 @@ static int netlink_fini(void) return OPAL_SUCCESS; } +/* + * Determines whether a connection is possible between + * pairs of local and remote interfaces. To determine + * reachability, the kernel's routing table is queried. + * Higher weightings are given to connections on the same + * network. + */ static opal_reachable_t* netlink_reachable(opal_list_t *local_if, opal_list_t *remote_if) { - /* JMS Fill me in */ - return NULL; + opal_reachable_t *reachable_results = NULL; + int i, j; + opal_if_t *local_iter, *remote_iter; + + reachable_results = opal_reachable_allocate(local_if->opal_list_length, + remote_if->opal_list_length); + if (NULL == reachable_results) { + return NULL; + } + + i = 0; + OPAL_LIST_FOREACH(local_iter, local_if, opal_if_t) { + j = 0; + OPAL_LIST_FOREACH(remote_iter, remote_if, opal_if_t) { + reachable_results->weights[i][j] = get_weights(local_iter, remote_iter); + j++; + } + i++; + } + + return reachable_results; +} + + +static int get_weights(opal_if_t *local_if, opal_if_t *remote_if) +{ + char str_local[128], str_remote[128], *conn_type; + int outgoing_interface, ret, weight, has_gateway; + + /* opal_net_get_hostname returns a static buffer. 
Great for + single address printfs, need to copy in this case */ + strncpy(str_local, + opal_net_get_hostname((struct sockaddr *)&local_if->if_addr), + sizeof(str_local)); + strncpy(str_remote, + opal_net_get_hostname((struct sockaddr *)&remote_if->if_addr), + sizeof(str_remote)); + + /* initially, assume no connection is possible */ + weight = calculate_weight(0, 0, CQ_NO_CONNECTION); + + if (AF_INET == local_if->af_family && AF_INET == remote_if->af_family) { + uint32_t local_ip, remote_ip; + + local_ip = (uint32_t)((struct sockaddr_in *)&(local_if->if_addr))->sin_addr.s_addr; + remote_ip = (uint32_t)((struct sockaddr_in *)&(remote_if->if_addr))->sin_addr.s_addr; + outgoing_interface = local_if->if_kernel_index; + + ret = opal_reachable_netlink_rt_lookup(local_ip, + remote_ip, + outgoing_interface, + &has_gateway); + if (0 == ret) { + if (0 == has_gateway) { + conn_type = "IPv4 SAME NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_SAME_NETWORK); + } else { + conn_type = "IPv4 DIFFERENT NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_DIFFERENT_NETWORK); + } + } else { + conn_type = "IPv4 NO CONNECTION"; + weight = calculate_weight(0, 0, CQ_NO_CONNECTION); + } + +#if OPAL_ENABLE_IPV6 + } else if (AF_INET6 == local_if->af_family && AF_INET6 == remote_if->af_family) { + struct in6_addr *local_ip, *remote_ip; + + local_ip = &((struct sockaddr_in6 *)&(local_if->if_addr))->sin6_addr; + remote_ip = &((struct sockaddr_in6 *)&(remote_if->if_addr))->sin6_addr; + outgoing_interface = local_if->if_kernel_index; + + ret = opal_reachable_netlink_rt_lookup6(local_ip, + remote_ip, + outgoing_interface, + &has_gateway); + + if (0 == ret) { + if (0 == has_gateway) { + conn_type = "IPv6 SAME NETWORK"; + weight = calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_SAME_NETWORK); + } else { + conn_type = "IPv6 DIFFERENT NETWORK"; + weight = 
calculate_weight(local_if->if_bandwidth, + remote_if->if_bandwidth, + CQ_DIFFERENT_NETWORK); + } + } else { + conn_type = "IPv6 NO CONNECTION"; + weight = calculate_weight(0, 0, CQ_NO_CONNECTION); + } +#endif /* #if OPAL_ENABLE_IPV6 */ + + } else { + /* we don't have an address family match, so assume no + connection */ + conn_type = "Address type mismatch"; + weight = calculate_weight(0, 0, CQ_NO_CONNECTION); + } + + opal_output_verbose(20, opal_reachable_base_framework.framework_output, + "reachable:netlink: path from %s to %s: %s", + str_local, str_remote, conn_type); + + return weight; } + const opal_reachable_base_module_t opal_reachable_netlink_module = { netlink_init, netlink_fini, netlink_reachable }; + + +/* + * Weights determined by bandwidth between + * interfaces (limited by lower bandwidth + * interface). A penalty is added to minimize + * the discrepancy in bandwidth. This helps + * prevent pairing of fast and slow interfaces + * + * Formula: connection_quality * (min(a,b) + 1/(1 + |a-b|)) + * + * Examples: a b f(a,b) + * 0 0 1 + * 0 1 0.5 + * 1 1 2 + * 1 2 1.5 + * 1 3 1.33 + * 1 10 1.1 + * 10 10 11 + * 10 14 10.2 + * 11 14 11.25 + * 11 15 11.2 + * + * NOTE: connection_quality of 1 is assumed for examples. 
+ * In reality, since we're using integers, we need + * connection_quality to be large enough + * to capture decimals + */ +static int calculate_weight(int bandwidth_local, int bandwidth_remote, + int connection_quality) +{ + int weight = connection_quality * (MIN(bandwidth_local, bandwidth_remote) + + 1.0/(1.0 + (double)abs(bandwidth_local - bandwidth_remote))); + return weight; +} diff --git a/opal/mca/reachable/netlink/reachable_netlink_utils_common.c b/opal/mca/reachable/netlink/reachable_netlink_utils_common.c index c154c49978a..9422c22d180 100644 --- a/opal/mca/reachable/netlink/reachable_netlink_utils_common.c +++ b/opal/mca/reachable/netlink/reachable_netlink_utils_common.c @@ -89,11 +89,11 @@ static int opal_reachable_netlink_is_nlreply_err(struct nlmsghdr *nlm_hdr) if (nlm_hdr->nlmsg_type == NLMSG_ERROR) { struct nlmsgerr *e = (struct nlmsgerr *)nlmsg_data(nlm_hdr); if (nlm_hdr->nlmsg_len >= (__u32)NLMSG_SIZE(sizeof(*e))) - opal_output(0, - "Received a netlink error message"); + opal_output_verbose(20, 0, + "Received a netlink error message"); else - opal_output(0, - "Received a truncated netlink error message\n"); + opal_output_verbose(20, 0, + "Received a truncated netlink error message\n"); return 1; } @@ -251,9 +251,12 @@ static int opal_reachable_netlink_rt_raw_parse_cb(struct nl_msg *msg, void *arg) if (nla_get_u32(tb[RTA_OIF]) == (uint32_t)lookup_arg->oif) found = 1; else - opal_output(0, "Retrieved route has a different outgoing interface %d (expected %d)\n", - nla_get_u32(tb[RTA_OIF]), - lookup_arg->oif); + /* usually, this means that there is a route to the remote + host, but that it's not through the given interface. For + our purposes, that means it's not reachable. 
*/ + opal_output_verbose(20, 0, "Retrieved route has a different outgoing interface %d (expected %d)\n", + nla_get_u32(tb[RTA_OIF]), + lookup_arg->oif); } if (found && tb[RTA_GATEWAY]) { From aed461e57c6fff0f443908ec5f0610cc4e621408 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 13 Sep 2017 18:38:47 +0000 Subject: [PATCH 09/10] reachable: Change ownership to Amazon Amazon is going to use the reachable framework to fix some connection bugs in the TCP BTL, so claim support ownership of the weighted and netlink components. Signed-off-by: Brian Barrett --- opal/mca/reachable/base/owner.txt | 4 ++-- opal/mca/reachable/netlink/owner.txt | 4 ++-- opal/mca/reachable/weighted/owner.txt | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/opal/mca/reachable/base/owner.txt b/opal/mca/reachable/base/owner.txt index 5361011bda4..786349a14c0 100644 --- a/opal/mca/reachable/base/owner.txt +++ b/opal/mca/reachable/base/owner.txt @@ -3,5 +3,5 @@ # owner: institution that is responsible for this package # status: e.g. active, maintenance, unmaintained # -owner: INTEL -status: unmaintained +owner: AMAZON +status: active diff --git a/opal/mca/reachable/netlink/owner.txt b/opal/mca/reachable/netlink/owner.txt index 5361011bda4..786349a14c0 100644 --- a/opal/mca/reachable/netlink/owner.txt +++ b/opal/mca/reachable/netlink/owner.txt @@ -3,5 +3,5 @@ # owner: institution that is responsible for this package # status: e.g. active, maintenance, unmaintained # -owner: INTEL -status: unmaintained +owner: AMAZON +status: active diff --git a/opal/mca/reachable/weighted/owner.txt b/opal/mca/reachable/weighted/owner.txt index 5361011bda4..786349a14c0 100644 --- a/opal/mca/reachable/weighted/owner.txt +++ b/opal/mca/reachable/weighted/owner.txt @@ -3,5 +3,5 @@ # owner: institution that is responsible for this package # status: e.g. 
active, maintenance, unmaintained # -owner: INTEL -status: unmaintained +owner: AMAZON +status: active From 48f8236dbbff134d3e1c6b875ef6ff162929d4d7 Mon Sep 17 00:00:00 2001 From: Gabe Saba Date: Sat, 29 Jul 2017 00:00:40 +0000 Subject: [PATCH 10/10] reachable: add tests Add test suite for netlink and weighted reachable components. We don't have a great way of running components through unit tests today, so make them stand-alone tests that are run with mpirun and such. Signed-off-by: Brian Barrett --- .gitignore | 3 + opal/test/reachable/Makefile | 19 + opal/test/reachable/reachable_netlink.c | 196 +++++ opal/test/reachable/reachable_shared.h | 70 ++ opal/test/reachable/reachable_weighted.c | 1015 ++++++++++++++++++++++ opal/test/reachable/tests | 12 + 6 files changed, 1315 insertions(+) create mode 100644 opal/test/reachable/Makefile create mode 100644 opal/test/reachable/reachable_netlink.c create mode 100644 opal/test/reachable/reachable_shared.h create mode 100644 opal/test/reachable/reachable_weighted.c create mode 100755 opal/test/reachable/tests diff --git a/.gitignore b/.gitignore index ad498753498..2d91c9003b9 100644 --- a/.gitignore +++ b/.gitignore @@ -682,3 +682,6 @@ test/util/opal_path_nfs test/util/opal_path_nfs.out test/util/opal_bit_ops test/util/bipartite_graph + +opal/test/reachable/reachable_netlink +opal/test/reachable/reachable_weighted diff --git a/opal/test/reachable/Makefile b/opal/test/reachable/Makefile new file mode 100644 index 00000000000..028cb93e68d --- /dev/null +++ b/opal/test/reachable/Makefile @@ -0,0 +1,19 @@ +# Copyright (c) 2017 Amazon.com, Inc. or its affiliates. All Rights +# reserved. 
+# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +PROGS = reachable_weighted reachable_netlink + +all: $(PROGS) + +CC = ortecc +CFLAGS = -g + +clean: + rm -f $(PROGS) *~ diff --git a/opal/test/reachable/reachable_netlink.c b/opal/test/reachable/reachable_netlink.c new file mode 100644 index 00000000000..3fdedbf84ef --- /dev/null +++ b/opal/test/reachable/reachable_netlink.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. All Rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +#include "opal_config.h" + +#include "reachable_shared.h" + +#include "opal/runtime/opal.h" +#include "opal/mca/reachable/reachable.h" +#include "opal/util/if.h" +#include "opal/class/opal_list.h" +#include "opal/util/if.h" + +/* + * Creates list of remote interfaces for testing reachability. + * Only minimum information is filled out. + */ +opal_list_t* build_if_list(void) +{ + /* Allocate memory for and create interface list */ + opal_list_t *if_list = OBJ_NEW(opal_list_t); + opal_if_t *intf; + + /* + * Add localhost to list + */ + intf = create_if(AF_INET, "127.0.0.1", 8, 0); + opal_list_append(if_list, &(intf->super)); + + /* + * Add localhost with non-standard address + */ + intf = create_if(AF_INET, "127.31.41.59", 8, 0); + opal_list_append(if_list, &(intf->super)); + + /* + * Add another localhost with non-standard address + */ + intf = create_if(AF_INET, "127.26.53.58", 8, 0); + opal_list_append(if_list, &(intf->super)); + + /* + * Google's public DNS + */ + intf = create_if(AF_INET, "8.8.8.8", 16, 0); + opal_list_append(if_list, &(intf->super)); + + /* + * Google's public DNS (2) + */ + intf = create_if(AF_INET, "8.8.4.4", 16, 0); + opal_list_append(if_list, &(intf->super)); + + /* + * IPv6: Google's public DNS (IPv6) + */ + intf = create_if(AF_INET6, "2001:4860:4860::8888", 64, 0); + opal_list_append(if_list, &(intf->super)); + + /* + * IPv6: Google's public DNS 2 
(IPv6)
+     */
+    intf = create_if(AF_INET6, "2001:4860:4860::8844", 128, 0);
+    opal_list_append(if_list, &(intf->super));
+
+    /*
+     * IPv6: Google's public DNS 1 (IPv6) EXPLICIT ADDRESS
+     */
+    intf = create_if(AF_INET6, "2001:4860:4860:0:0:0:0:8888", 64, 0);
+    opal_list_append(if_list, &(intf->super));
+
+    /*
+     * IPv6: Google's public DNS 2 (IPv6) EXPLICIT ADDRESS
+     */
+    intf = create_if(AF_INET6, "2001:4860:4860:0:0:0:0:8844", 64, 0);
+    opal_list_append(if_list, &(intf->super));
+
+    /*
+     * IPv6: something that should be on the same link local...
+     */
+    intf = create_if(AF_INET6, "fe80::0001", 64, 0);
+    opal_list_append(if_list, &(intf->super));
+
+    return if_list;
+}
+
+/* Print the local and test interfaces and the weight reachable() gives each pair */
+int main(int argc, char **argv)
+{
+    opal_list_t *local_list, *remote_list;
+    opal_reachable_t *results;
+    int i, j; /* int, to match the %d conversions and the int counts below */
+    int local_ifs;
+    int remote_ifs;
+    opal_if_t *local_if;
+
+    opal_init(&argc, &argv);
+
+    /* List of interfaces generated by opal */
+    local_list = &opal_if_list;
+    /* Create test interfaces */
+    remote_list = build_if_list();
+
+    local_ifs = opal_list_get_size(local_list);
+    remote_ifs = opal_list_get_size(remote_list);
+
+    /* Tests reachability by looking up entries in routing table.
+     * Tests routes to localhost and google's nameservers.
+     */
+    results = opal_reachable.reachable(local_list, remote_list);
+
+    printf("Local interfaces:\n");
+    i = 0;
+    OPAL_LIST_FOREACH(local_if, local_list, opal_if_t) {
+        char addr[128];
+        const char *family; /* only ever points at string literals */
+
+        switch (local_if->af_family) {
+        case AF_INET:
+            family = "IPv4";
+            inet_ntop(AF_INET, &(((struct sockaddr_in*) &local_if->if_addr))->sin_addr,
+                      addr, sizeof(addr));
+            break;
+        case AF_INET6:
+            family = "IPv6";
+            inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &local_if->if_addr))->sin6_addr,
+                      addr, sizeof(addr));
+            break;
+        default:
+            family = "Unknown";
+            strcpy(addr, "Unknown");
+            break;
+        }
+
+        printf(" %3d: %s\t%s\t%s/%d\n", i, local_if->if_name,
+               family, addr, local_if->if_mask);
+        i++;
+    }
+
+    printf("\nRemote interfaces:\n");
+    i = 0;
+    OPAL_LIST_FOREACH(local_if, remote_list, opal_if_t) { /* local_if reused as the cursor */
+        char addr[128];
+        const char *family; /* only ever points at string literals */
+
+        switch (local_if->af_family) {
+        case AF_INET:
+            family = "IPv4";
+            inet_ntop(AF_INET, &(((struct sockaddr_in*) &local_if->if_addr))->sin_addr,
+                      addr, sizeof(addr));
+            break;
+        case AF_INET6:
+            family = "IPv6";
+            inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &local_if->if_addr))->sin6_addr,
+                      addr, sizeof(addr));
+            break;
+        default:
+            family = "Unknown";
+            strcpy(addr, "Unknown");
+            break;
+        }
+
+        printf(" %3d: %s\t%s\t%s/%d\n", i, local_if->if_name,
+               family, addr, local_if->if_mask);
+        i++;
+    }
+
+    printf("\nConnectivity Table:\n ");
+    for (j = 0 ; j < remote_ifs ; j++) {
+        printf("%3d ", j);
+    }
+    printf("\n");
+
+    for (i = 0; i < local_ifs ; i++) {
+        printf(" %3d: ", i);
+        for (j = 0 ; j < remote_ifs ; j++) {
+            printf("%3d ", results->weights[i][j]);
+        }
+        printf("\n");
+    }
+    printf("\n");
+
+    OBJ_RELEASE(results); /* the weight table was never released before */
+    OBJ_RELEASE(remote_list);
+
+    opal_output(0, "Passed all tests!\n");
+    return 0;
+}
diff --git a/opal/test/reachable/reachable_shared.h b/opal/test/reachable/reachable_shared.h
new file mode 100644
index 00000000000..4b9941a03da
--- /dev/null
+++ b/opal/test/reachable/reachable_shared.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017-XXXX Amazon.com, Inc. or its affiliates.
+ *                         All Rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef TEST_REACHABLE_SHARED
+#define TEST_REACHABLE_SHARED 1
+
+#include <assert.h>
+
+#include "opal/runtime/opal.h"
+#include "opal/mca/reachable/reachable.h"
+#include "opal/util/if.h"
+
+BEGIN_C_DECLS
+
+/* Create and populate opal_if_t with information required by opal_reachable.
+ * address is parsed with inet_pton() for AF_INET and AF_INET6 only; mask and
+ * bandwidth are stored as given. Callers OBJ_RELEASE the returned object. */
+opal_if_t* create_if(int af_family, char *address, int mask, int bandwidth)
+{
+    opal_if_t *interface = OBJ_NEW(opal_if_t);
+    int rc = 1; /* stays 1 for families that have no address to parse */
+
+    strncpy(interface->if_name, "interface0", IF_NAMESIZE);
+    interface->af_family = af_family;
+    ((struct sockaddr *)&(interface->if_addr))->sa_family = af_family;
+
+    /* Parse outside of assert() so the address is still filled in when
+     * the tests are built with NDEBUG. */
+    if (AF_INET == af_family){
+        rc = inet_pton(af_family, address, &((struct sockaddr_in *)&(interface->if_addr))->sin_addr);
+    } else if (AF_INET6 == af_family){
+        rc = inet_pton(af_family, address, &((struct sockaddr_in6 *)&(interface->if_addr))->sin6_addr);
+    }
+    assert(1 == rc);
+    (void) rc; /* rc is otherwise unused once NDEBUG removes the assert */
+
+    interface->if_mask = mask;
+    interface->if_bandwidth = bandwidth;
+
+    return interface;
+}
+
+
+/* Run a test between a pair of interfaces
+ * and clean up the memory afterwards.
+ * Return the weight between the pair of + * interfaces + */ +int run_single_test(opal_if_t *local_if, opal_if_t *remote_if) +{ + + opal_list_t *local_list = OBJ_NEW(opal_list_t); + opal_list_t *remote_list = OBJ_NEW(opal_list_t); + + opal_list_append(local_list, &(local_if->super)); + opal_list_append(remote_list, &(remote_if->super)); + + opal_reachable_t *results; + results = opal_reachable.reachable(local_list, remote_list); + OBJ_RELEASE(local_list); + OBJ_RELEASE(remote_list); + int result = results->weights[0][0]; + + /* release results */ + OBJ_RELEASE(results); + return result; +} + +END_C_DECLS + +#endif diff --git a/opal/test/reachable/reachable_weighted.c b/opal/test/reachable/reachable_weighted.c new file mode 100644 index 00000000000..5a6db2fc091 --- /dev/null +++ b/opal/test/reachable/reachable_weighted.c @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. All Rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +#include "opal_config.h" + +#include "reachable_shared.h" + +/* sigh; needs to match with code in reachable_weighted, but those + headers aren't installed. */ +enum connection_quality { + CQ_NO_CONNECTION = 0, + CQ_PRIVATE_DIFFERENT_NETWORK = 50, + CQ_PRIVATE_SAME_NETWORK = 80, + CQ_PUBLIC_DIFFERENT_NETWORK = 90, + CQ_PUBLIC_SAME_NETWORK = 100 +}; + + +/* SUITE 1: + * Tests IPv4 connections by + * modifying ip addresses and + * subnet masks. Also tests + * IPv4->IPv6 and the other way + * around, to assure no connection + * is returned in that case. + */ +int ipv4_test() +{ + opal_if_t *int1; + opal_if_t *int2; + int expected_result; + int result; + int test_no = 0; + int failed_no = 0; + + /* TEST1 + * Localhost to localhost. Since localhost range is not a + * private network (RFC1918), expected result is public + * same network. 
+ */ + test_no++; + expected_result = CQ_PUBLIC_SAME_NETWORK; + int1 = create_if(AF_INET, "127.0.0.1", 24, 0); + int2 = create_if(AF_INET, "127.0.0.2", 0, 0); + result = run_single_test(int1, int2); + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST2 + * Testing public same network with subnet mask + * 255.255.255.0 + */ + test_no++; + expected_result = CQ_PUBLIC_SAME_NETWORK; + int1 = create_if(AF_INET, "31.14.15.92", 24, 0); + int2 = create_if(AF_INET, "31.14.15.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST3 + * Testing public same network with subnet mask + * 255.255.0.0 + */ + test_no++; + expected_result = CQ_PUBLIC_SAME_NETWORK; + int1 = create_if(AF_INET, "65.35.89.79", 16, 0); + int2 = create_if(AF_INET, "65.35.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST4 + * Testing public same network with subnet mask + * 255.0.0.0 + */ + test_no++; + expected_result = CQ_PUBLIC_SAME_NETWORK; + int1 = create_if(AF_INET, "3.23.84.62", 8, 0); + int2 = create_if(AF_INET, "3.27.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST5 + * Testing public same network with subnet mask + * 0.0.0.0 + */ + test_no++; + expected_result = CQ_PUBLIC_SAME_NETWORK; + int1 = create_if(AF_INET, "64.33.83.27", 0, 0); + int2 = create_if(AF_INET, "27.27.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + 
OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST6 + * Testing public different network with subnet mask + * 255.255.255.0 + */ + test_no++; + expected_result = CQ_PUBLIC_DIFFERENT_NETWORK; + int1 = create_if(AF_INET, "95.2.88.41", 24, 0); + int2 = create_if(AF_INET, "95.2.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST7 + * Testing public different network with subnet mask + * 255.255.0.0 + */ + test_no++; + expected_result = CQ_PUBLIC_DIFFERENT_NETWORK; + int1 = create_if(AF_INET, "97.16.93.99", 16, 0); + int2 = create_if(AF_INET, "97.27.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST8 + * Testing public different network with subnet mask + * 255.0.0.0 + */ + test_no++; + expected_result = CQ_PUBLIC_DIFFERENT_NETWORK; + int1 = create_if(AF_INET, "37.51.5.82", 8, 0); + int2 = create_if(AF_INET, "27.27.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST9 + * Testing private same network with subnet mask + * 255.255.255.0 + */ + test_no++; + expected_result = CQ_PRIVATE_SAME_NETWORK; + int1 = create_if(AF_INET, "192.168.0.1", 24, 0); + int2 = create_if(AF_INET, "192.168.0.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST10 + * Testing private same network with subnet mask + * 255.255.0.0 + */ + test_no++; + expected_result = CQ_PRIVATE_SAME_NETWORK; + int1 = create_if(AF_INET, "192.168.0.1", 16, 0); + int2 = create_if(AF_INET, 
"192.168.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed this test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST11 + * Testing private same network with subnet mask + * 255.0.0.0 + */ + test_no++; + expected_result = CQ_PRIVATE_SAME_NETWORK; + int1 = create_if(AF_INET, "172.16.0.1", 8, 0); + int2 = create_if(AF_INET, "172.27.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed this test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST12 + * Testing private same network with subnet mask + * 0.0.0.0 + */ + test_no++; + expected_result = CQ_PRIVATE_SAME_NETWORK; + int1 = create_if(AF_INET, "192.168.0.1", 0, 0); + int2 = create_if(AF_INET, "10.27.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST13 + * Testing private different network with subnet mask + * 255.255.255.0 + */ + test_no++; + expected_result = CQ_PRIVATE_DIFFERENT_NETWORK; + int1 = create_if(AF_INET, "192.168.0.1", 24, 0); + int2 = create_if(AF_INET, "192.168.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST14 + * Testing private different network with subnet mask + * 255.255.0.0 + */ + test_no++; + expected_result = CQ_PRIVATE_DIFFERENT_NETWORK; + int1 = create_if(AF_INET, "192.168.0.1", 16, 0); + int2 = create_if(AF_INET, "10.1.0.1", 16, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST15 + * Testing private different network 
with subnet mask + * 255.0.0.0 + */ + test_no++; + expected_result = CQ_PRIVATE_DIFFERENT_NETWORK; + int1 = create_if(AF_INET, "192.168.0.1", 8, 0); + int2 = create_if(AF_INET, "10.27.27.27", 0, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST16 + * Testing public to private with subnet mask + * 255.255.255.0 + */ + test_no++; + expected_result = CQ_NO_CONNECTION; + int1 = create_if(AF_INET, "27.27.27.27", 24 , 0); + int2 = create_if(AF_INET, "192.168.0.1", 16, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST17 + * Testing private to public with subnet mask + * 255.255.255.0 + */ + test_no++; + expected_result = CQ_NO_CONNECTION; + int1 = create_if(AF_INET, "192.168.0.1", 24, 0); + int2 = create_if(AF_INET, "27.27.27.27", 8, 0); + result = run_single_test(int1, int2); + + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST18 + * IPv4->IPv6 + */ + expected_result = CQ_NO_CONNECTION; + int1 = create_if(AF_INET, "8.8.8.8", 24, 0); + int2 = create_if(AF_INET6, "2001:4860:4860:0:0:0:0:8888", 0, 0); + result = run_single_test(int1, int2); + test_no++; + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + /* TEST19 + * IPv6->IPv4 + */ + expected_result = CQ_NO_CONNECTION; + int1 = create_if(AF_INET6, "2001:4860:4860:0:0:0:0:8888", 64, 0); + int2 = create_if(AF_INET, "8.8.8.8", 0, 0); + result = run_single_test(int1, int2); + test_no++; + if (result != expected_result) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(int1); + OBJ_RELEASE(int2); + + 
opal_output(0, "Finished Reachable IPv4 Tests. %d/%d successful", test_no-failed_no, test_no);
+
+    if (0 == failed_no) {
+        return 0;
+    } else {
+        return 1;
+    }
+}
+
+
+/* SUITE 2:
+ * Compares connections with different
+ * bandwidths to ensure the
+ * relative ranking is as expected
+ */
+int ranking_test()
+{
+    opal_if_t *int1;
+    opal_if_t *int2;
+    int result1;
+    int result2;
+    int test_no = 0;
+    int failed_no = 0;
+
+    /* TEST1
+     * Compares pairs with bandwidths 0->0 and 1->0.
+     * The former connection should be better, as
+     * there is a smaller difference in bandwidth
+     * (This is an edge case, but this behavior makes
+     * sense. We want 0->0 to still work, in case
+     * bandwidth was never set. Thus, the behavior
+     * for a->b where a = 0 and a != b should
+     * act the same as any other case, where
+     * a greater difference in bandwidth leads to
+     * a greater penalty)
+     */
+    test_no++;
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 0);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 0);
+    result1 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 1);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 0);
+    result2 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    if (!(result1 > result2)) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+
+    /* TEST2
+     * Compares interface pairs with bandwidth 0->0 and 1->2.
+     * The latter should be better as it has greater bandwidth
+     */
+    test_no++;
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 0);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 0);
+    result1 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 1);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 2);
+    result2 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    if (!(result1 < result2)) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+
+    /* TEST3
+     * Compares interface pairs with bandwidth 1->2 and 1->1.
+     * The latter should be better as there is a smaller
+     * difference in bandwidth
+     */
+    test_no++;
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 1); /* 1->2 as documented; was 0->0 */
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 2);
+    result1 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 1);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 1);
+    result2 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    if (!(result1 < result2)) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+
+    /* TEST4
+     * Compares interface pairs with bandwidth 1->3 and 1->2.
+     * The latter should be better as there is a smaller
+     * difference in bandwidth
+     */
+    test_no++;
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 1);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 3);
+    result1 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 1);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 2);
+    result2 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    if (!(result1 < result2)) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+
+    /* TEST5
+     * Compares interface pairs with bandwidth 1->10 and 1->3.
+     * The latter should be better as there is less discrepancy
+     * in bandwidth
+     */
+    test_no++;
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 1);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 10);
+    result1 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 1);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 3);
+    result2 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    if (!(result1 < result2)) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+
+    /* TEST6
+     * Compares interface pairs with bandwidth 5->5 and 10->10.
+     * The latter should be better as it has higher bandwidth
+     */
+    test_no++;
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 5);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 5);
+    result1 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 10);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 10);
+    result2 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    if (!(result1 < result2)) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+
+    /* TEST7
+     * Compares interface pairs with bandwidth 10->11 and 10->10.
+     * The latter should be better as there is no discrepancy in
+     * bandwidth.
+     */
+    test_no++;
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 10);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 11);
+    result1 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 10);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 10);
+    result2 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    if (!(result1 < result2)) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+
+    /* TEST8
+     * Compares interface pairs with bandwidth 10->11 and 11->10.
+     * These connections should be equivalent, as they have the same
+     * bandwidth and same discrepancy.
+     */
+    test_no++;
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 10);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 11);
+    result1 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 11);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 10);
+    result2 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    if (!(result1 == result2)) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+
+    /* TEST9
+     * Compares interface pairs with bandwidth 10->14 and 11->15.
+     * The latter should be better as it has higher bandwidth.
+     */
+    test_no++;
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 10);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 14);
+    result1 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    int1 = create_if(AF_INET, "31.14.15.92", 24, 11);
+    int2 = create_if(AF_INET, "31.14.15.27", 0, 15);
+    result2 = run_single_test(int1, int2);
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    if (!(result1 < result2)) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+
+    opal_output(0, "Finished Reachable Weighted Ranking Tests. 
%d/%d successful", test_no-failed_no, test_no); + if (0 == failed_no) { + return 0; + } else { + return 1; + } +} + + +/* SUITE 3: + * Tests interfaces lists of various sizes + * to ensure no crashes occur and results + * are outputted in proper order + */ +int loop_test() +{ + int test_no = 0; + int failed_no = 0; + + opal_list_t *if_list1, *if_list2; + opal_if_t *intf; + opal_reachable_t *results; + + int i; + + /* TEST1: + * Ensure opal_reachable doesn't crash + * when called with empty lists + */ + test_no++; + + if_list1 = OBJ_NEW(opal_list_t); + if_list2 = OBJ_NEW(opal_list_t); + + results = opal_reachable.reachable(if_list1, if_list2); + + OBJ_RELEASE(if_list1); + OBJ_RELEASE(if_list2); + + if (!(0 == results->num_local && 0 == results->num_remote)) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(results); + + /* TEST2: + * Ensure opal_reachable doesn't crash + * when called with empty local list + */ + test_no++; + + if_list1 = OBJ_NEW(opal_list_t); + if_list2 = OBJ_NEW(opal_list_t); + + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list2, &(intf->super)); + results = opal_reachable.reachable(if_list1, if_list2); + + OBJ_RELEASE(if_list1); + OBJ_RELEASE(if_list2); + + if (!(0 == results->num_local && 1 == results->num_remote)) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(results); + + /* TEST3: + * Ensure opal_reachable doesn't crash + * when called with empty remote list + */ + test_no++; + + if_list1 = OBJ_NEW(opal_list_t); + if_list2 = OBJ_NEW(opal_list_t); + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list1, &(intf->super)); + results = opal_reachable.reachable(if_list1, if_list2); + + OBJ_RELEASE(if_list1); + OBJ_RELEASE(if_list2); + + if (!(1 == results->num_local && 0 == results->num_remote)) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(results); + + /* TEST4: + * Ensure opal_reachable doesn't 
crash + * when the remote list has more elements + * than the local list + */ + test_no++; + + if_list1 = OBJ_NEW(opal_list_t); + if_list2 = OBJ_NEW(opal_list_t); + for (i = 0; i < 3; i++) { + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list1, &(intf->super)); + } + for (i = 0; i < 14; i++) { + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list2, &(intf->super)); + } + results = opal_reachable.reachable(if_list1, if_list2); + + OBJ_RELEASE(if_list1); + OBJ_RELEASE(if_list2); + + if (!(3 == results->num_local && 14 == results->num_remote)) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(results); + + /* TEST5: + * Ensure opal_reachable doesn't crash + * when the local list has more elements + * than the remote list + */ + test_no++; + + if_list1 = OBJ_NEW(opal_list_t); + if_list2 = OBJ_NEW(opal_list_t); + for (i = 0; i < 14; i++) { + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list1, &(intf->super)); + } + for (i = 0; i < 3; i++) { + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list2, &(intf->super)); + } + results = opal_reachable.reachable(if_list1, if_list2); + + OBJ_RELEASE(if_list1); + OBJ_RELEASE(if_list2); + + if (!(14 == results->num_local && 3 == results->num_remote)) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(results); + + /* TEST6: + * Ensure opal_reachable doesn't crash + * when the local list has the same number + * of elements as the remote list + */ + test_no++; + + if_list1 = OBJ_NEW(opal_list_t); + if_list2 = OBJ_NEW(opal_list_t); + for (i = 0; i < 27; i++) { + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list1, &(intf->super)); + } + for (i = 0; i < 27; i++) { + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list2, &(intf->super)); + } + results = opal_reachable.reachable(if_list1, if_list2); + + OBJ_RELEASE(if_list1); + 
OBJ_RELEASE(if_list2); + + if (!(27 == results->num_local && 27 == results->num_remote)) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(results); + + /* TEST7: + * Tests proper ordering of results + * when same number of local interfaces and + * remote interfaces + */ + test_no++; + if_list1 = OBJ_NEW(opal_list_t); + if_list2 = OBJ_NEW(opal_list_t); + + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list1, &(intf->super)); + intf = create_if(AF_INET, "31.14.20.92", 24, 0); + opal_list_append(if_list1, &(intf->super)); + + intf = create_if(AF_INET, "31.14.19.93", 24, 0); + opal_list_append(if_list2, &(intf->super)); + intf = create_if(AF_INET, "31.14.20.93", 24, 0); + opal_list_append(if_list2, &(intf->super)); + + results = opal_reachable.reachable(if_list1, if_list2); + + OBJ_RELEASE(if_list1); + OBJ_RELEASE(if_list2); + + if (!(CQ_PUBLIC_SAME_NETWORK == results->weights[0][0] && + CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[0][1] && + CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[1][0] && + CQ_PUBLIC_SAME_NETWORK == results->weights[1][1])) { + ++failed_no; + opal_output(0, "Failed test #%d", test_no); + } + OBJ_RELEASE(results); + + /* TEST8: + * Tests proper ordering of results + * when greater number of remote interfaces + * than local interfaces + */ + test_no++; + if_list1 = OBJ_NEW(opal_list_t); + if_list2 = OBJ_NEW(opal_list_t); + + intf = create_if(AF_INET, "31.14.19.92", 24, 0); + opal_list_append(if_list1, &(intf->super)); + intf = create_if(AF_INET, "31.14.20.92", 24, 0); + opal_list_append(if_list1, &(intf->super)); + + intf = create_if(AF_INET, "31.14.19.93", 24, 0); + opal_list_append(if_list2, &(intf->super)); + intf = create_if(AF_INET, "31.14.20.93", 24, 0); + opal_list_append(if_list2, &(intf->super)); + intf = create_if(AF_INET, "31.14.21.93", 24, 0); + opal_list_append(if_list2, &(intf->super)); + + results = opal_reachable.reachable(if_list1, if_list2); + + OBJ_RELEASE(if_list1); 
+    OBJ_RELEASE(if_list2);
+
+    /* The first weights index follows if_list1 (local), the second
+     * follows if_list2 (remote): 2 x 3 entries here.  The negation wraps
+     * the whole conjunction so any single unexpected weight fails the test.
+     */
+    if (!(CQ_PUBLIC_SAME_NETWORK == results->weights[0][0] &&
+          CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[0][1] &&
+          CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[0][2] &&
+          CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[1][0] &&
+          CQ_PUBLIC_SAME_NETWORK == results->weights[1][1] &&
+          CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[1][2])) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+    OBJ_RELEASE(results);
+
+    /* TEST9:
+     * Tests proper ordering of results
+     * when greater number of local interfaces
+     * than remote interfaces
+     */
+    test_no++;
+    if_list1 = OBJ_NEW(opal_list_t);
+    if_list2 = OBJ_NEW(opal_list_t);
+
+    intf = create_if(AF_INET, "31.14.19.92", 24, 0);
+    opal_list_append(if_list1, &(intf->super));
+    intf = create_if(AF_INET, "31.14.20.92", 24, 0);
+    opal_list_append(if_list1, &(intf->super));
+    intf = create_if(AF_INET, "31.14.21.93", 24, 0);
+    opal_list_append(if_list1, &(intf->super));
+
+    intf = create_if(AF_INET, "31.14.19.93", 24, 0);
+    opal_list_append(if_list2, &(intf->super));
+    intf = create_if(AF_INET, "31.14.20.93", 24, 0);
+    opal_list_append(if_list2, &(intf->super));
+
+    results = opal_reachable.reachable(if_list1, if_list2);
+
+    OBJ_RELEASE(if_list1);
+    OBJ_RELEASE(if_list2);
+
+    /* 3 local x 2 remote: all six weights are checked inside one negated
+     * conjunction, including the third local interface's row.
+     */
+    if (!(CQ_PUBLIC_SAME_NETWORK == results->weights[0][0] &&
+          CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[0][1] &&
+          CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[1][0] &&
+          CQ_PUBLIC_SAME_NETWORK == results->weights[1][1] &&
+          CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[2][0] &&
+          CQ_PUBLIC_DIFFERENT_NETWORK == results->weights[2][1])) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+    OBJ_RELEASE(results);
+
+    opal_output(0, "Finished Reachable Weighted Loop Tests. 
%d/%d successful", test_no-failed_no, test_no);
+
+    if (0 == failed_no) {
+        return 0;
+    } else {
+        return 1;
+    }
+
+}
+
+
+/* SUITE 4:
+ * Test IPv6
+ *
+ * Runs four address-pair checks through run_single_test() and logs a
+ * summary line.  When OPAL_ENABLE_IPV6 is not set, no check is run and
+ * the suite reports success.
+ *
+ * @return 0 if every check produced the expected result (or the suite
+ *         was skipped), 1 otherwise
+ */
+int test_ipv6(void)
+{
+#if OPAL_ENABLE_IPV6
+    /* declared inside the guard so a non-IPv6 build has no unused locals */
+    int failed_no = 0;
+    int test_no = 0;
+    opal_if_t *int1;
+    opal_if_t *int2;
+    int expected_result;
+    int result;
+
+    /* TEST1
+     * Testing ipv6 same network with subnet mask
+     * /64
+     *
+     * NOTE(review): int2 is created with 8 as the third create_if()
+     * argument (presumably the prefix length), not 64 -- presumably only
+     * int1's mask is consulted; confirm against run_single_test().
+     */
+    expected_result = CQ_PUBLIC_SAME_NETWORK;
+    int1 = create_if(AF_INET6, "2001:4860:4860:0:0:0:0:8888", 64, 0);
+    int2 = create_if(AF_INET6, "2001:4860:4860:0:0:0:0:8889", 8, 0);
+    result = run_single_test(int1, int2);
+    test_no++;
+    if (result != expected_result) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    /* TEST2
+     * Testing ipv6 different network with subnet mask
+     * /64
+     *
+     * NOTE(review): int2 is created with 0 as its third argument; same
+     * caveat as TEST1 -- confirm which interface's mask is used.
+     */
+    expected_result = CQ_PUBLIC_DIFFERENT_NETWORK;
+    int1 = create_if(AF_INET6, "2001:4860:4860:0:0:0:0:8888", 64, 0);
+    int2 = create_if(AF_INET6, "2001:4860:4860:1:0:0:0:8888", 0, 0);
+    result = run_single_test(int1, int2);
+    test_no++;
+    if (result != expected_result) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    /* TEST3
+     * Testing two fe80:: (link-local) addresses with subnet mask /64;
+     * expects CQ_PRIVATE_SAME_NETWORK
+     */
+    expected_result = CQ_PRIVATE_SAME_NETWORK;
+    int1 = create_if(AF_INET6, "fe80::8888", 64, 0);
+    int2 = create_if(AF_INET6, "fe80::8889", 64, 0);
+    result = run_single_test(int1, int2);
+    test_no++;
+    if (result != expected_result) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    /* TEST4
+     * Testing a 2001:: address against an fe80:: (link-local) address;
+     * expects CQ_NO_CONNECTION
+     */
+    expected_result = CQ_NO_CONNECTION;
+    int1 = create_if(AF_INET6, "2001::8888", 64, 0);
+    int2 = create_if(AF_INET6, "fe80::8889", 64, 0);
+    result = run_single_test(int1, int2);
+    test_no++;
+    if (result != expected_result) {
+        ++failed_no;
+        opal_output(0, "Failed test #%d", test_no);
+    }
+    OBJ_RELEASE(int1);
+    OBJ_RELEASE(int2);
+
+    opal_output(0, "Finished Reachable Weighted IPv6 Tests. %d/%d successful", test_no-failed_no, test_no);
+
+    if (0 == failed_no) {
+        return 0;
+    } else {
+        return 1;
+    }
+#else
+    opal_output(0, "No IPv6 support; skipped tests");
+    return 0;
+#endif
+}
+
+/* Test driver: calls opal_init(), runs the four suites (ipv4_test,
+ * ranking_test, loop_test, test_ipv6) and logs an overall summary.
+ *
+ * @return the number of suites that reported failure (0 when all pass)
+ *
+ * NOTE(review): the return value of opal_init() is not checked, and no
+ * matching finalize call is made before returning.
+ */
+int main(int argc, char **argv)
+{
+    int failed = 0;
+    int total = 0;
+
+    opal_init(&argc, &argv);
+    opal_output(0, "\n\nBeginning Reachable Weighted tests\n\n");
+
+    total++;
+    if (ipv4_test()) {
+        failed++;
+    }
+
+    total++;
+    if (ranking_test()) {
+        failed++;
+    }
+
+    total++;
+    if (loop_test()) {
+        failed++;
+    }
+
+    total++;
+    if (test_ipv6()) {
+        failed++;
+    }
+
+    if (0 != failed) {
+        opal_output(0, "\n\nFailed %d/%d Reachable Weighted Test Suites :(\n\n", failed, total);
+    } else {
+        opal_output(0, "\n\nPassed %d/%d Reachable Weighted Test Suites :)\n\n", total, total);
+    }
+
+    return failed;
+}
diff --git a/opal/test/reachable/tests b/opal/test/reachable/tests
new file mode 100755
index 00000000000..344bdf24a16
--- /dev/null
+++ b/opal/test/reachable/tests
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Copyright (c) 2017 Amazon.com, Inc. or its affiliates. All Rights
+# reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+mpirun -np 1 --mca reachable netlink reachable_netlink
+mpirun -np 1 --mca reachable weighted reachable_weighted