diff --git a/NEWS b/NEWS index aed7d811b0e..1da5b87779c 100644 --- a/NEWS +++ b/NEWS @@ -35,6 +35,7 @@ Post-v2.15.0 * Enable the AVX512 DPCLS implementation to use VPOPCNT instruction if the CPU supports it. This enhances performance by using the native vpopcount instructions, instead of the emulated version of vpopcount. + * Add command line option to switch between MFEX function pointers. - ovs-ctl: * New option '--no-record-hostname' to disable hostname configuration in ovsdb on startup. diff --git a/lib/automake.mk b/lib/automake.mk index 3c9523c1a30..53b8abc0fcc 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -118,6 +118,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/dpif-netdev-private-dpcls.h \ lib/dpif-netdev-private-dpif.c \ lib/dpif-netdev-private-dpif.h \ + lib/dpif-netdev-private-extract.c \ + lib/dpif-netdev-private-extract.h \ lib/dpif-netdev-private-flow.h \ lib/dpif-netdev-private-thread.h \ lib/dpif-netdev-private.h \ diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c index 6f9aa8284a1..7772b7abf96 100644 --- a/lib/dpif-netdev-avx512.c +++ b/lib/dpif-netdev-avx512.c @@ -149,6 +149,15 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd, * // do all processing (HWOL->MFEX->EMC->SMC) * } */ + + /* Do a batch minfilow extract into keys. */ + uint32_t mf_mask = 0; + miniflow_extract_func mfex_func; + atomic_read_relaxed(&pmd->miniflow_extract_opt, &mfex_func); + if (mfex_func) { + mf_mask = mfex_func(packets, keys, batch_size, in_port, pmd); + } + uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1; uint32_t iter = lookup_pkts_bitmask; while (iter) { @@ -167,6 +176,13 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd, pkt_metadata_init(&packet->md, in_port); struct dp_netdev_flow *f = NULL; + struct netdev_flow_key *key = &keys[i]; + + /* Check the minfiflow mask to see if the packet was correctly + * classifed by vector mfex else do a scalar miniflow extract + * for that packet. + */ + bool mfex_hit = !!(mf_mask & (1 << i)); /* Check for a partial hardware offload match. */ if (hwol_enabled) { @@ -177,7 +193,13 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd, } if (f) { rules[i] = &f->cr; - pkt_meta[i].tcp_flags = parse_tcp_flags(packet); + /* If AVX512 MFEX already classified the packet, use it. */ + if (mfex_hit) { + pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf); + } else { + pkt_meta[i].tcp_flags = parse_tcp_flags(packet); + } + pkt_meta[i].bytes = dp_packet_size(packet); phwol_hits++; hwol_emc_smc_hitmask |= (1 << i); @@ -185,9 +207,10 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd, } } - /* Do miniflow extract into keys. */ - struct netdev_flow_key *key = &keys[i]; - miniflow_extract(packet, &key->mf); + if (!mfex_hit) { + /* Do a scalar miniflow extract into keys. */ + miniflow_extract(packet, &key->mf); + } /* Cache TCP and byte values for all packets. */ pkt_meta[i].bytes = dp_packet_size(packet); diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c new file mode 100644 index 00000000000..584b110ca89 --- /dev/null +++ b/lib/dpif-netdev-private-extract.c @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021 Intel. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "dp-packet.h" +#include "dpif-netdev-private-dpcls.h" +#include "dpif-netdev-private-extract.h" +#include "dpif-netdev-private-thread.h" +#include "flow.h" +#include "openvswitch/vlog.h" +#include "ovs-thread.h" +#include "util.h" + +VLOG_DEFINE_THIS_MODULE(dpif_netdev_extract); + +/* Variable to hold the default MFEX implementation. */ +static ATOMIC(miniflow_extract_func) default_mfex_func; + +/* Implementations of available extract options and + * the implementations are always in order of preference. + */ +static struct dpif_miniflow_extract_impl mfex_impls[] = { + + [MFEX_IMPL_SCALAR] = { + .probe = NULL, + .extract_func = NULL, + .name = "scalar", }, +}; + +BUILD_ASSERT_DECL(MFEX_IMPL_MAX == ARRAY_SIZE(mfex_impls)); + +void +dpif_miniflow_extract_init(void) +{ + atomic_uintptr_t *mfex_func = (void *)&default_mfex_func; + int mfex_idx = MFEX_IMPL_SCALAR; + + /* Call probe on each impl, and cache the result. */ + for (int i = 0; i < MFEX_IMPL_MAX; i++) { + bool avail = true; + if (mfex_impls[i].probe) { + /* Return zero is success, non-zero means error. */ + avail = (mfex_impls[i].probe() == 0); + } + VLOG_INFO("Miniflow Extract implementation %s (available: %s)\n", + mfex_impls[i].name, avail ? "available" : "not available"); + mfex_impls[i].available = avail; + } + + /* For the first call, this will be choosen based on the + * compile time flag. + */ + VLOG_INFO("Default MFEX Extract implementation is %s.\n", + mfex_impls[mfex_idx].name); + atomic_store_relaxed(mfex_func, (uintptr_t) mfex_impls + [mfex_idx].extract_func); +} + +miniflow_extract_func +dp_mfex_impl_get_default(void) +{ + miniflow_extract_func return_func; + atomic_uintptr_t *mfex_func = (void *)&default_mfex_func; + + atomic_read_relaxed(mfex_func, (uintptr_t *) &return_func); + + return return_func; +} + +int +dp_mfex_impl_set_default_by_name(const char *name) +{ + miniflow_extract_func new_default; + atomic_uintptr_t *mfex_func = (void *)&default_mfex_func; + + int err = dp_mfex_impl_get_by_name(name, &new_default); + + if (!err) { + atomic_store_relaxed(mfex_func, (uintptr_t) new_default); + } + + return err; + +} + +void +dp_mfex_impl_get(struct ds *reply, struct dp_netdev_pmd_thread **pmd_list, + size_t pmd_list_size) +{ + /* Add all MFEX functions to reply string. */ + ds_put_cstr(reply, "Available MFEX implementations:\n"); + + for (int i = 0; i < MFEX_IMPL_MAX; i++) { + ds_put_format(reply, " %s (available: %s pmds: ", + mfex_impls[i].name, mfex_impls[i].available ? + "True" : "False"); + + for (size_t j = 0; j < pmd_list_size; j++) { + struct dp_netdev_pmd_thread *pmd = pmd_list[j]; + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + + if (pmd->miniflow_extract_opt == mfex_impls[i].extract_func) { + ds_put_format(reply, "%u,", pmd->core_id); + } + } + + ds_chomp(reply, ','); + + if (ds_last(reply) == ' ') { + ds_put_cstr(reply, "none"); + } + + ds_put_cstr(reply, ")\n"); + } + +} + +/* This function checks all available MFEX implementations, and selects and + * returns the function pointer to the one requested by "name". If nothing + * is found it returns error. + */ +int +dp_mfex_impl_get_by_name(const char *name, miniflow_extract_func *out_func) +{ + if (!name || !out_func) { + return -EINVAL; + } + + for (int i = 0; i < MFEX_IMPL_MAX; i++) { + if (strcmp(mfex_impls[i].name, name) == 0) { + /* Check available is set before exec. */ + if (!mfex_impls[i].available) { + *out_func = NULL; + return -ENODEV; + } + + *out_func = mfex_impls[i].extract_func; + return 0; + } + } + + return -ENOENT; +} diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h new file mode 100644 index 00000000000..614677dd04e --- /dev/null +++ b/lib/dpif-netdev-private-extract.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2021 Intel. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MFEX_AVX512_EXTRACT +#define MFEX_AVX512_EXTRACT 1 + +#include + +/* Forward declarations. */ +struct dp_packet; +struct miniflow; +struct dp_netdev_pmd_thread; +struct dp_packet_batch; +struct netdev_flow_key; + +/* Function pointer prototype to be implemented in the optimized miniflow + * extract code. + * returns the hitmask of the processed packets on success. + * returns zero on failure. + */ +typedef uint32_t (*miniflow_extract_func)(struct dp_packet_batch *batch, + struct netdev_flow_key *keys, + uint32_t keys_size, + odp_port_t in_port, + struct dp_netdev_pmd_thread + *pmd_handle); + + +/* The function pointer miniflow_extract_func depends on batch size. */ +BUILD_ASSERT_DECL(NETDEV_MAX_BURST == 32); + +/* Probe function is used to detect if this CPU has the ISA required + * to run the optimized miniflow implementation. + * returns one on successful probe. + * returns negative errno on failure. + */ +typedef int (*miniflow_extract_probe)(void); + +/* Structure representing the attributes of an optimized implementation. */ +struct dpif_miniflow_extract_impl { + /* When it is true, this impl has passed the probe() checks. */ + bool available; + + /* Probe function is used to detect if this CPU has the ISA required + * to run the optimized miniflow implementation. It is optional and + * if it is not used, then it must be null. + */ + miniflow_extract_probe probe; + + /* Optional function to call to extract miniflows for a burst of packets. + * If it is not used must be set to NULL; + */ + miniflow_extract_func extract_func; + + /* Name of the optimized implementation. */ + char *name; +}; + + +/* Enum to hold implementation indexes. The list is traversed + * linearly as from the ISA perspective, the VBMI version + * should always come before the generic AVX512-F version. + */ +enum dpif_miniflow_extract_impl_idx { + MFEX_IMPL_SCALAR, + MFEX_IMPL_MAX +}; + +extern struct ovs_mutex dp_netdev_mutex; + +/* This function returns all available implementations to the caller. The + * quantity of implementations is returned by the int return value. + */ +void +dp_mfex_impl_get(struct ds *reply, struct dp_netdev_pmd_thread **pmd_list, + size_t pmd_list_size) OVS_REQUIRES(dp_netdev_mutex); + +/* This function checks all available MFEX implementations, and selects the + * returns the function pointer to the one requested by "name". + */ +int +dp_mfex_impl_get_by_name(const char *name, miniflow_extract_func *out_func); + +/* Returns the default MFEX which is first ./configure selected, but can be + * overridden at runtime. */ +miniflow_extract_func dp_mfex_impl_get_default(void); + +/* Overrides the default MFEX with the user set MFEX. */ +int dp_mfex_impl_set_default_by_name(const char *name); + + +/* Initializes the available miniflow extract implementations by probing for + * the CPU ISA requirements. As the runtime available CPU ISA does not change + * and the required ISA of the implementation also does not change, it is safe + * to cache the probe() results, and not call probe() at runtime. + */ +void +dpif_miniflow_extract_init(void); + +#endif /* MFEX_AVX512_EXTRACT */ diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h index ba79c4a0ab1..a4c092b6929 100644 --- a/lib/dpif-netdev-private-thread.h +++ b/lib/dpif-netdev-private-thread.h @@ -27,6 +27,11 @@ #include #include "cmap.h" + +#include "dpif-netdev-private-dfc.h" +#include "dpif-netdev-private-dpif.h" +#include "dpif-netdev-perf.h" +#include "dpif-netdev-private-extract.h" #include "openvswitch/thread.h" #ifdef __cplusplus @@ -110,6 +115,9 @@ struct dp_netdev_pmd_thread { /* Pointer for per-DPIF implementation scratch space. */ void *netdev_input_func_userdata; + /* Function pointer to call for miniflow_extract() functionality. */ + ATOMIC(miniflow_extract_func) miniflow_extract_opt; + struct seq *reload_seq; uint64_t last_reload_seq; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 610949f3672..75c3a3214ba 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -45,6 +45,7 @@ #include "dpif.h" #include "dpif-netdev-lookup.h" #include "dpif-netdev-perf.h" +#include "dpif-netdev-private-extract.h" #include "dpif-provider.h" #include "dummy.h" #include "fat-rwlock.h" @@ -117,7 +118,7 @@ COVERAGE_DEFINE(datapath_drop_rx_invalid_packet); COVERAGE_DEFINE(datapath_drop_hw_miss_recover); /* Protects against changes to 'dp_netdevs'. */ -static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER; +struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER; /* Contains all 'struct dp_netdev's. */ static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex) @@ -1050,6 +1051,96 @@ dpif_netdev_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED, ds_destroy(&reply); } +static void +dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, + void *aux OVS_UNUSED) +{ + struct ds reply = DS_EMPTY_INITIALIZER; + struct shash_node *node; + + ovs_mutex_lock(&dp_netdev_mutex); + SHASH_FOR_EACH (node, &dp_netdevs) { + struct dp_netdev_pmd_thread **pmd_list; + struct dp_netdev *dp = node->data; + size_t n; + + /* Get PMD threads list, required to get the DPIF impl used by each PMD + * thread. */ + sorted_poll_thread_list(dp, &pmd_list, &n); + dp_mfex_impl_get(&reply, pmd_list, n); + } + ovs_mutex_unlock(&dp_netdev_mutex); + unixctl_command_reply(conn, ds_cstr(&reply)); + ds_destroy(&reply); +} + +static void +dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[], void *aux OVS_UNUSED) +{ + /* This function requires just one parameter, the miniflow name. + */ + const char *mfex_name = argv[1]; + struct shash_node *node; + + int err = dp_mfex_impl_set_default_by_name(mfex_name); + + if (err) { + struct ds reply = DS_EMPTY_INITIALIZER; + char *error_desc = NULL; + if (err == -EINVAL) { + error_desc = "Unknown miniflow extract implementation:"; + } else if (err == -ENOENT) { + error_desc = "Miniflow extract implementation doesn't exist:"; + } else if (err == -ENODEV) { + error_desc = "Miniflow extract implementation not available:"; + } else { + error_desc = "Miniflow extract implementation Error:"; + } + ds_put_format(&reply, "%s %s.\n", error_desc, mfex_name); + const char *reply_str = ds_cstr(&reply); + unixctl_command_reply_error(conn, reply_str); + VLOG_INFO("%s", reply_str); + ds_destroy(&reply); + return; + } + + ovs_mutex_lock(&dp_netdev_mutex); + + SHASH_FOR_EACH (node, &dp_netdevs) { + struct dp_netdev_pmd_thread **pmd_list; + struct dp_netdev *dp = node->data; + size_t n; + + sorted_poll_thread_list(dp, &pmd_list, &n); + miniflow_extract_func default_func = dp_mfex_impl_get_default(); + + for (size_t i = 0; i < n; i++) { + struct dp_netdev_pmd_thread *pmd = pmd_list[i]; + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + + /* Initialize MFEX function pointer to the newly configured + * default. */ + atomic_uintptr_t *pmd_func = (void *) &pmd->miniflow_extract_opt; + atomic_store_relaxed(pmd_func, (uintptr_t) default_func); + }; + } + + ovs_mutex_unlock(&dp_netdev_mutex); + + /* Reply with success to command. */ + struct ds reply = DS_EMPTY_INITIALIZER; + ds_put_format(&reply, "Miniflow Extract implementation set to %s", + mfex_name); + const char *reply_str = ds_cstr(&reply); + unixctl_command_reply(conn, reply_str); + VLOG_INFO("%s", reply_str); + ds_destroy(&reply); +} + static void dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc, const char *argv[], void *aux OVS_UNUSED) @@ -1279,6 +1370,13 @@ dpif_netdev_init(void) unixctl_command_register("dpif-netdev/dpif-impl-get", "", 0, 0, dpif_netdev_impl_get, NULL); + unixctl_command_register("dpif-netdev/miniflow-parser-set", + "miniflow_implementation_name", + 1, 1, dpif_miniflow_extract_impl_set, + NULL); + unixctl_command_register("dpif-netdev/miniflow-parser-get", "", + 0, 0, dpif_miniflow_extract_impl_get, + NULL); return 0; } @@ -1480,6 +1578,8 @@ create_dp_netdev(const char *name, const struct dpif_class *class, dp->conntrack = conntrack_init(); + dpif_miniflow_extract_init(); + atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN); atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL); @@ -6222,6 +6322,11 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp, atomic_uintptr_t *pmd_func = (void *) &pmd->netdev_input_func; atomic_init(pmd_func, (uintptr_t) default_func); + /* Init default miniflow_extract function */ + miniflow_extract_func mfex_func = dp_mfex_impl_get_default(); + atomic_uintptr_t *pmd_func_mfex = (void *)&pmd->miniflow_extract_opt; + atomic_store_relaxed(pmd_func_mfex, (uintptr_t) mfex_func); + /* init the 'flow_cache' since there is no * actual thread created for NON_PMD_CORE_ID. */ if (core_id == NON_PMD_CORE_ID) {