diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index cd7254b7670..a78152b54de 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -105,31 +105,32 @@ Q: Are all features available with all datapaths? The following table lists the datapath supported features from an Open vSwitch user's perspective. - ===================== ============== ============== ========= ======= - Feature Linux upstream Linux OVS tree Userspace Hyper-V - ===================== ============== ============== ========= ======= - NAT 4.6 YES Yes NO - Connection tracking 4.3 YES PARTIAL PARTIAL - Tunnel - LISP NO YES NO NO - Tunnel - STT NO YES NO YES - Tunnel - GRE 3.11 YES YES YES - Tunnel - VXLAN 3.12 YES YES YES - Tunnel - Geneve 3.18 YES YES YES - Tunnel - GRE-IPv6 4.18 YES YES NO - Tunnel - VXLAN-IPv6 4.3 YES YES NO - Tunnel - Geneve-IPv6 4.4 YES YES NO - Tunnel - ERSPAN 4.18 YES YES NO - Tunnel - ERSPAN-IPv6 4.18 YES YES NO - QoS - Policing YES YES YES NO - QoS - Shaping YES YES NO NO - sFlow YES YES YES NO - IPFIX 3.10 YES YES NO - Set action YES YES YES PARTIAL - NIC Bonding YES YES YES YES - Multiple VTEPs YES YES YES YES - Meters 4.15 YES YES NO - Conntrack zone limit 4.18 YES NO NO - ===================== ============== ============== ========= ======= + ========================== ============== ============== ========= ======= + Feature Linux upstream Linux OVS tree Userspace Hyper-V + ========================== ============== ============== ========= ======= + Connection tracking 4.3 YES YES YES + Conntrack Fragment Reass. 4.3 YES YES YES + NAT 4.6 YES YES NO + Conntrack zone limit 4.18 YES NO NO + Tunnel - LISP NO YES NO NO + Tunnel - STT NO YES NO YES + Tunnel - GRE 3.11 YES YES YES + Tunnel - VXLAN 3.12 YES YES YES + Tunnel - Geneve 3.18 YES YES YES + Tunnel - GRE-IPv6 NO NO YES NO + Tunnel - VXLAN-IPv6 4.3 YES YES NO + Tunnel - Geneve-IPv6 4.4 YES YES NO + Tunnel - ERSPAN 4.18 YES YES NO + Tunnel - ERSPAN-IPv6 4.18 YES YES NO + QoS - Policing YES YES YES NO + QoS - Shaping YES YES NO NO + sFlow YES YES YES NO + IPFIX 3.10 YES YES NO + Set action YES YES YES PARTIAL + NIC Bonding YES YES YES YES + Multiple VTEPs YES YES YES YES + Meters 4.15 YES YES NO + ========================== ============== ============== ========= ======= Do note, however: diff --git a/NEWS b/NEWS index ccc0bfb0e47..2dabb970415 100644 --- a/NEWS +++ b/NEWS @@ -8,7 +8,15 @@ Post-v2.11.0 - Userspace datapath: * ICMPv6 ND enhancements: support for match and set ND options type and reserved fields. - + * Add v4/v6 fragmentation support for conntrack. + * New ovs-appctl "dpctl/ipf-set-enabled" and "dpctl/ipf-set-disabled" + commands for userspace datapath conntrack fragmentation support. + * New "ovs-appctl dpctl/ipf-set-min-frag" command for userspace + datapath conntrack fragmentation support. + * New "ovs-appctl dpctl/ipf-set-max-nfrags" command for userspace datapath + conntrack fragmentation support. + * New "ovs-appctl dpctl/ipf-get-status" command for userspace datapath + conntrack fragmentation support. 
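For example, once this patch is applied, fragmentation handling in the userspace datapath can be tuned at runtime (argument syntax per the dpctl.man additions later in this patch; the values below are arbitrary but fall within the documented clamps):
   ovs-appctl dpctl/ipf-set-min-frag v4 1000
   ovs-appctl dpctl/ipf-set-max-nfrags 700
   ovs-appctl dpctl/ipf-get-status
   ovs-appctl dpctl/ipf-set-disabled v6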
v2.11.0 - xx xxx xxxx --------------------- diff --git a/include/sparse/netinet/ip6.h b/include/sparse/netinet/ip6.h index d2a54de169d..bfa637a4604 100644 --- a/include/sparse/netinet/ip6.h +++ b/include/sparse/netinet/ip6.h @@ -64,5 +64,6 @@ struct ip6_frag { }; #define IP6F_OFF_MASK ((OVS_FORCE ovs_be16) 0xfff8) +#define IP6F_MORE_FRAG ((OVS_FORCE ovs_be16) 0x0001) #endif /* netinet/ip6.h sparse */ diff --git a/lib/automake.mk b/lib/automake.mk index ba1041095c5..bae032bd835 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -1,4 +1,4 @@ -# Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc. +# Copyright (C) 2009-2018 Nicira, Inc. # # Copying and distribution of this file, with or without modification, # are permitted in any medium without royalty provided the copyright @@ -108,6 +108,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/hmapx.h \ lib/id-pool.c \ lib/id-pool.h \ + lib/ipf.c \ + lib/ipf.h \ lib/jhash.c \ lib/jhash.h \ lib/json.c \ diff --git a/lib/conntrack.c b/lib/conntrack.c index a044a69874f..78c673c56f1 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, 2016, 2017 Nicira, Inc. + * Copyright (c) 2015-2019 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include "ct-dpif.h" #include "dp-packet.h" #include "flow.h" +#include "ipf.h" #include "netdev.h" #include "odp-netlink.h" #include "openvswitch/hmap.h" @@ -340,6 +341,7 @@ conntrack_init(struct conntrack *ct) atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT); latch_init(&ct->clean_thread_exit); ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct); + ct->ipf = ipf_init(); } /* Destroys the connection tracker 'ct' and frees all the allocated memory. */ @@ -382,6 +384,7 @@ conntrack_destroy(struct conntrack *ct) hindex_destroy(&ct->alg_expectation_refs); ct_rwlock_unlock(&ct->resources_lock); ct_rwlock_destroy(&ct->resources_lock); + ipf_destroy(ct->ipf); } static unsigned hash_to_bucket(uint32_t hash) @@ -1299,7 +1302,8 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All * the packets must have the same 'dl_type' (IPv4 or IPv6) and should have - * the l3 and and l4 offset properly set. + * the l3 and l4 offset properly set. Performs fragment reassembly with + * the help of ipf_preprocess_conntrack(). * * If 'commit' is true, the packets are allowed to create new entries in the * connection tables.
'setmark', if not NULL, should point to a two @@ -1314,11 +1318,15 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, const struct nat_action_info_t *nat_action_info, long long now) { + ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone, + ct->hash_basis); + struct dp_packet *packet; struct conn_lookup_ctx ctx; DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) { - if (!conn_key_extract(ct, packet, dl_type, &ctx, zone)) { + if (packet->md.ct_state == CS_INVALID + || !conn_key_extract(ct, packet, dl_type, &ctx, zone)) { packet->md.ct_state = CS_INVALID; write_ct_md(packet, zone, NULL, NULL, NULL); continue; @@ -1327,6 +1335,8 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, setlabel, nat_action_info, tp_src, tp_dst, helper); } + ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type); + return 0; } @@ -2484,6 +2494,12 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, } } +struct ipf * +conntrack_ipf_ctx(struct conntrack *ct) +{ + return ct->ipf; +} + int conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump, const uint16_t *pzone, int *ptot_bkts) diff --git a/lib/conntrack.h b/lib/conntrack.h index e3a5dcc8023..038d22d713a 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, 2016, 2017 Nicira, Inc. + * Copyright (c) 2015, 2016, 2017, 2019 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,6 +122,7 @@ int conntrack_flush_tuple(struct conntrack *, const struct ct_dpif_tuple *, int conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns); int conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns); int conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns); +struct ipf *conntrack_ipf_ctx(struct conntrack *ct); /* 'struct ct_lock' is a wrapper for an adaptive mutex. It's useful to try * different types of locks (e.g. spinlocks) */ @@ -293,6 +294,9 @@ struct conntrack { */ struct ct_rwlock resources_lock; + /* Fragmentation handling context. */ + struct ipf *ipf; + }; #endif /* conntrack.h */ diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index 67eccd0fa04..b2c9b4309a8 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Nicira, Inc. + * Copyright (c) 2015, 2018 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -194,6 +194,62 @@ ct_dpif_del_limits(struct dpif *dpif, const struct ovs_list *zone_limits) : EOPNOTSUPP); } +int +ct_dpif_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable) +{ + return (dpif->dpif_class->ipf_set_enabled + ? dpif->dpif_class->ipf_set_enabled(dpif, v6, enable) + : EOPNOTSUPP); +} + +int +ct_dpif_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag) +{ + return (dpif->dpif_class->ipf_set_min_frag + ? dpif->dpif_class->ipf_set_min_frag(dpif, v6, min_frag) + : EOPNOTSUPP); +} + +int +ct_dpif_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags) +{ + return (dpif->dpif_class->ipf_set_max_nfrags + ? dpif->dpif_class->ipf_set_max_nfrags(dpif, max_frags) + : EOPNOTSUPP); +} + +int ct_dpif_ipf_get_status(struct dpif *dpif, + struct dpif_ipf_status *dpif_ipf_status) +{ + return (dpif->dpif_class->ipf_get_status + ? 
dpif->dpif_class->ipf_get_status(dpif, dpif_ipf_status) + : EOPNOTSUPP); +} + +int +ct_dpif_ipf_dump_start(struct dpif *dpif, struct ipf_dump_ctx **dump_ctx) +{ + return (dpif->dpif_class->ipf_dump_start + ? dpif->dpif_class->ipf_dump_start(dpif, dump_ctx) + : EOPNOTSUPP); +} + +int +ct_dpif_ipf_dump_next(struct dpif *dpif, void *dump_ctx, char **dump) +{ + return (dpif->dpif_class->ipf_dump_next + ? dpif->dpif_class->ipf_dump_next(dpif, dump_ctx, dump) + : EOPNOTSUPP); +} + +int +ct_dpif_ipf_dump_done(struct dpif *dpif, void *dump_ctx) +{ + return (dpif->dpif_class->ipf_dump_done + ? dpif->dpif_class->ipf_dump_done(dpif, dump_ctx) + : EOPNOTSUPP); +} + void ct_dpif_entry_uninit(struct ct_dpif_entry *entry) { diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h index decc14ffc2a..0151cfea479 100644 --- a/lib/ct-dpif.h +++ b/lib/ct-dpif.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Nicira, Inc. + * Copyright (c) 2015, 2018 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -186,6 +186,8 @@ enum { }; struct dpif; +struct dpif_ipf_status; +struct ipf_dump_ctx; struct ct_dpif_dump_state { struct dpif *dpif; @@ -212,6 +214,14 @@ int ct_dpif_set_limits(struct dpif *dpif, const uint32_t *default_limit, int ct_dpif_get_limits(struct dpif *dpif, uint32_t *default_limit, const struct ovs_list *, struct ovs_list *); int ct_dpif_del_limits(struct dpif *dpif, const struct ovs_list *); +int ct_dpif_ipf_set_enabled(struct dpif *, bool v6, bool enable); +int ct_dpif_ipf_set_min_frag(struct dpif *, bool v6, uint32_t min_frag); +int ct_dpif_ipf_set_max_nfrags(struct dpif *, uint32_t max_frags); +int ct_dpif_ipf_get_status(struct dpif *dpif, + struct dpif_ipf_status *dpif_ipf_status); +int ct_dpif_ipf_dump_start(struct dpif *dpif, struct ipf_dump_ctx **); +int ct_dpif_ipf_dump_next(struct dpif *dpif, void *, char **); +int ct_dpif_ipf_dump_done(struct dpif *dpif, void *); void ct_dpif_entry_uninit(struct ct_dpif_entry *); void ct_dpif_format_entry(const struct ct_dpif_entry *, struct ds *, bool verbose, bool print_stats); diff --git a/lib/dpctl.c b/lib/dpctl.c index 59071cdba83..f5a09b70f65 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2017 Nicira, Inc. + * Copyright (c) 2008-2018 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include "dirs.h" #include "dpctl.h" #include "dpif.h" +#include "dpif-provider.h" #include "openvswitch/dynamic-string.h" #include "flow.h" #include "openvswitch/match.h" @@ -1917,6 +1918,210 @@ dpctl_ct_get_limits(int argc, const char *argv[], return error; } +static int +ipf_set_enabled__(int argc, const char *argv[], struct dpctl_params *dpctl_p, + bool enabled) +{ + struct dpif *dpif; + int error = opt_dpif_open(argc, argv, dpctl_p, 4, &dpif); + if (!error) { + char v4_or_v6[3] = {0}; + if (ovs_scan(argv[argc - 1], "%2s", v4_or_v6) && + (!strncmp(v4_or_v6, "v4", 2) || !strncmp(v4_or_v6, "v6", 2))) { + error = ct_dpif_ipf_set_enabled( + dpif, !strncmp(v4_or_v6, "v6", 2), enabled); + if (!error) { + dpctl_print(dpctl_p, + "%s fragmentation reassembly successful", + enabled ? "enabling" : "disabling"); + } else { + dpctl_error(dpctl_p, error, + "%s fragmentation reassembly failed", + enabled ? 
"enabling" : "disabling"); + } + } else { + error = EINVAL; + dpctl_error(dpctl_p, error, + "parameter missing: 'v4' for IPv4 or 'v6' for IPv6"); + } + dpif_close(dpif); + } + return error; +} + +static int +dpctl_ipf_set_enabled(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + return ipf_set_enabled__(argc, argv, dpctl_p, true); +} + +static int +dpctl_ipf_set_disabled(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + return ipf_set_enabled__(argc, argv, dpctl_p, false); +} + +static int +dpctl_ipf_set_min_frag(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + struct dpif *dpif; + int error = opt_dpif_open(argc, argv, dpctl_p, 4, &dpif); + if (!error) { + char v4_or_v6[3] = {0}; + if (ovs_scan(argv[argc - 2], "%2s", v4_or_v6) && + (!strncmp(v4_or_v6, "v4", 2) || !strncmp(v4_or_v6, "v6", 2))) { + uint32_t min_fragment; + if (ovs_scan(argv[argc - 1], "%"SCNu32, &min_fragment)) { + error = ct_dpif_ipf_set_min_frag( + dpif, !strncmp(v4_or_v6, "v6", 2), min_fragment); + if (!error) { + dpctl_print(dpctl_p, + "setting minimum fragment size successful"); + } else { + dpctl_error(dpctl_p, error, + "requested minimum fragment size too small;" + " see documentation"); + } + } else { + error = EINVAL; + dpctl_error(dpctl_p, error, + "parameter missing for minimum fragment size"); + } + } else { + error = EINVAL; + dpctl_error(dpctl_p, error, + "parameter missing: v4 for IPv4 or v6 for IPv6"); + } + dpif_close(dpif); + } + + return error; +} + +static int +dpctl_ipf_set_max_nfrags(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + struct dpif *dpif; + int error = opt_dpif_open(argc, argv, dpctl_p, 3, &dpif); + if (!error) { + uint32_t nfrags_max; + if (ovs_scan(argv[argc - 1], "%"SCNu32, &nfrags_max)) { + error = ct_dpif_ipf_set_max_nfrags(dpif, nfrags_max); + if (!error) { + dpctl_print(dpctl_p, + "setting maximum fragments successful"); + } else { + dpctl_error(dpctl_p, error, + "setting maximum fragments failed"); + } + } else { + error = EINVAL; + dpctl_error(dpctl_p, error, + "parameter missing for maximum fragments"); + } + dpif_close(dpif); + } + + return error; +} + +static void +dpctl_dump_ipf(struct dpif *dpif, struct dpctl_params *dpctl_p) +{ + struct ipf_dump_ctx *dump_ctx; + char *dump; + + int error = ct_dpif_ipf_dump_start(dpif, &dump_ctx); + if (error) { + dpctl_error(dpctl_p, error, "starting ipf list dump"); + /* Nothing to clean up, just return. 
*/ + return; + } + + dpctl_print(dpctl_p, "\n Fragment Lists:\n\n"); + while (!(error = ct_dpif_ipf_dump_next(dpif, dump_ctx, &dump))) { + dpctl_print(dpctl_p, "%s\n", dump); + free(dump); + } + + if (error && error != EOF) { + dpctl_error(dpctl_p, error, "dumping ipf lists failed"); + } + + ct_dpif_ipf_dump_done(dpif, dump_ctx); +} + +static int +dpctl_ct_ipf_get_status(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + struct dpif *dpif; + int error = opt_dpif_open(argc, argv, dpctl_p, 2, &dpif); + + if (!error) { + struct dpif_ipf_status dpif_ipf_status; + error = ct_dpif_ipf_get_status(dpif, &dpif_ipf_status); + + if (!error) { + dpctl_print(dpctl_p, " Fragmentation Module Status\n"); + dpctl_print(dpctl_p, " ---------------------------\n"); + dpctl_print(dpctl_p, " v4 enabled: %u\n", + dpif_ipf_status.v4.enabled); + dpctl_print(dpctl_p, " v6 enabled: %u\n", + dpif_ipf_status.v6.enabled); + dpctl_print(dpctl_p, " max num frags (v4/v6): %u\n", + dpif_ipf_status.nfrag_max); + dpctl_print(dpctl_p, " num frag: %u\n", + dpif_ipf_status.nfrag); + dpctl_print(dpctl_p, " min v4 frag size: %u\n", + dpif_ipf_status.v4.min_frag_size); + dpctl_print(dpctl_p, " v4 frags accepted: %"PRIu64"\n", + dpif_ipf_status.v4.nfrag_accepted); + dpctl_print(dpctl_p, " v4 frags completed: %"PRIu64"\n", + dpif_ipf_status.v4.nfrag_completed_sent); + dpctl_print(dpctl_p, " v4 frags expired: %"PRIu64"\n", + dpif_ipf_status.v4.nfrag_expired_sent); + dpctl_print(dpctl_p, " v4 frags too small: %"PRIu64"\n", + dpif_ipf_status.v4.nfrag_too_small); + dpctl_print(dpctl_p, " v4 frags overlapped: %"PRIu64"\n", + dpif_ipf_status.v4.nfrag_overlap); + dpctl_print(dpctl_p, " v4 frags purged: %"PRIu64"\n", + dpif_ipf_status.v4.nfrag_purged); + + dpctl_print(dpctl_p, " min v6 frag size: %u\n", + dpif_ipf_status.v6.min_frag_size); + dpctl_print(dpctl_p, " v6 frags accepted: %"PRIu64"\n", + dpif_ipf_status.v6.nfrag_accepted); + dpctl_print(dpctl_p, " v6 frags completed: %"PRIu64"\n", + dpif_ipf_status.v6.nfrag_completed_sent); + dpctl_print(dpctl_p, " v6 frags expired: %"PRIu64"\n", + dpif_ipf_status.v6.nfrag_expired_sent); + dpctl_print(dpctl_p, " v6 frags too small: %"PRIu64"\n", + dpif_ipf_status.v6.nfrag_too_small); + dpctl_print(dpctl_p, " v6 frags overlapped: %"PRIu64"\n", + dpif_ipf_status.v6.nfrag_overlap); + dpctl_print(dpctl_p, " v6 frags purged: %"PRIu64"\n", + dpif_ipf_status.v6.nfrag_purged); + } else { + dpctl_error(dpctl_p, error, + "ipf status could not be retrieved"); + return error; + } + + if (dpctl_p->verbosity) { + dpctl_dump_ipf(dpif, dpctl_p); + } + + dpif_close(dpif); + } + + return error; +} + /* Undocumented commands for unit testing. */ static int @@ -2222,6 +2427,14 @@ static const struct dpctl_command all_commands[] = { DP_RO }, { "ct-get-limits", "[dp] [zone=N1[,N2]...]", 0, 2, dpctl_ct_get_limits, DP_RO }, + { "ipf-set-enabled", "[dp] v4|v6", 1, 2, dpctl_ipf_set_enabled, DP_RW }, + { "ipf-set-disabled", "[dp] v4|v6", 1, 2, dpctl_ipf_set_disabled, DP_RW }, + { "ipf-set-min-frag", "[dp] v4|v6 minfragment", 2, 3, + dpctl_ipf_set_min_frag, DP_RW }, + { "ipf-set-max-nfrags", "[dp] maxfrags", 1, 2, + dpctl_ipf_set_max_nfrags, DP_RW }, + { "ipf-get-status", "[dp]", 0, 1, dpctl_ct_ipf_get_status, + DP_RO }, { "help", "", 0, INT_MAX, dpctl_help, DP_RO }, { "list-commands", "", 0, INT_MAX, dpctl_list_commands, DP_RO }, diff --git a/lib/dpctl.man b/lib/dpctl.man index fe0aec9ef34..f22029fcd40 100644 --- a/lib/dpctl.man +++ b/lib/dpctl.man @@ -220,6 +220,42 @@ nftables and the regular host stack). 
Therefore, the following commands do not apply specifically to one datapath. . .TP +\*(DX\fBipf\-set\-enabled\fR [\fIdp\fR] \fBv4\fR|\fBv6\fR +.TQ +\*(DX\fBipf\-set\-disabled\fR [\fIdp\fR] \fBv4\fR|\fBv6\fR +Enables or disables IP fragmentation handling for the userspace +connection tracker. Either \fBv4\fR or \fBv6\fR must be specified. +Both IPv4 and IPv6 fragment reassembly are enabled by default. Only +supported for the userspace datapath. +. +.TP +\*(DX\fBipf\-set\-min\-frag\fR [\fIdp\fR] \fBv4\fR|\fBv6\fR \fIminfrag\fR +Sets the minimum fragment size for non-final fragments to +\fIminfrag\fR. Either \fBv4\fR or \fBv6\fR must be specified. For +enhanced DoS security, higher minimum fragment sizes can usually be used. +The default IPv4 value is 1200 and the clamped minimum is 400. The default +IPv6 value is 1280, with a clamped minimum of 400, for testing +flexibility. The maximum fragment size is not clamped; however, setting +this value too high might result in valid fragments being dropped. Only +supported for the userspace datapath. +. +.TP +\*(DX\fBipf\-set\-max\-nfrags\fR [\fIdp\fR] \fImaxfrags\fR +Sets the maximum number of fragments tracked by the userspace datapath +connection tracker to \fImaxfrags\fR. The default value is 1000 and the +clamped maximum is 5000. Note that packet buffers can be held by the +fragmentation module while fragments are incomplete, but will time out +after 15 seconds. Memory pool sizing should be set accordingly when +fragmentation is enabled. Only supported for the userspace datapath. +. +.TP +.DO "[\fB\-m\fR | \fB\-\-more\fR]" "\*(DX\fBipf\-get\-status\fR [\fIdp\fR]" +Gets the configuration settings and fragment counters associated with the +fragmentation handling of the userspace datapath connection tracker. +With \fB\-m\fR or \fB\-\-more\fR, also dumps the IP fragment lists. +Only supported for the userspace datapath. +. +.TP .DO "[\fB\-m\fR | \fB\-\-more\fR] [\fB\-s\fR | \fB\-\-statistics\fR]" "\*(DX\fBdump\-conntrack\fR" "[\fIdp\fR] [\fBzone=\fIzone\fR]" Prints to the console all the connection entries in the tracker used by \fIdp\fR. If \fBzone=\fIzone\fR is specified, only shows the connections diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index f41f1d70df3..77ac1d2c19a 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc. + * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
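For orientation, each ipf command takes the same path from the CLI down to the new fragmentation module; sketched here for one command, using only function names introduced by this patch:
   dpctl_ipf_set_max_nfrags()                /* lib/dpctl.c: parses 'maxfrags'. */
     -> ct_dpif_ipf_set_max_nfrags()         /* lib/ct-dpif.c: EOPNOTSUPP if the hook is unset. */
       -> dpif_class->ipf_set_max_nfrags()   /* lib/dpif-provider.h hook; NULL in dpif-netlink. */
         -> dpif_netdev_ipf_set_max_nfrags() /* lib/dpif-netdev.c, below. */
           -> ipf_set_max_nfrags()           /* lib/ipf.c: applies the (clamped) value. */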
@@ -47,6 +47,7 @@ #include "flow.h" #include "hmapx.h" #include "id-pool.h" +#include "ipf.h" #include "latch.h" #include "netdev.h" #include "netdev-provider.h" @@ -7356,6 +7357,61 @@ dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns) return conntrack_get_nconns(&dp->conntrack, nconns); } +static int +dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + return ipf_set_enabled(conntrack_ipf_ctx(&dp->conntrack), v6, enable); +} + +static int +dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + return ipf_set_min_frag(conntrack_ipf_ctx(&dp->conntrack), v6, min_frag); +} + +static int +dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + return ipf_set_max_nfrags(conntrack_ipf_ctx(&dp->conntrack), max_frags); +} + +/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to + * diverge. */ +static int +dpif_netdev_ipf_get_status(struct dpif *dpif, + struct dpif_ipf_status *dpif_ipf_status) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + ipf_get_status(conntrack_ipf_ctx(&dp->conntrack), + (struct ipf_status *) dpif_ipf_status); + return 0; +} + +static int +dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED, + struct ipf_dump_ctx **ipf_dump_ctx) +{ + return ipf_dump_start(ipf_dump_ctx); +} + +static int +dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + return ipf_dump_next(conntrack_ipf_ctx(&dp->conntrack), ipf_dump_ctx, + dump); +} + +static int +dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx) +{ + return ipf_dump_done(ipf_dump_ctx); + +} + const struct dpif_class dpif_netdev_class = { "netdev", dpif_netdev_init, @@ -7407,6 +7463,13 @@ const struct dpif_class dpif_netdev_class = { NULL, /* ct_set_limits */ NULL, /* ct_get_limits */ NULL, /* ct_del_limits */ + dpif_netdev_ipf_set_enabled, + dpif_netdev_ipf_set_min_frag, + dpif_netdev_ipf_set_max_nfrags, + dpif_netdev_ipf_get_status, + dpif_netdev_ipf_dump_start, + dpif_netdev_ipf_dump_next, + dpif_netdev_ipf_dump_done, dpif_netdev_meter_get_features, dpif_netdev_meter_set, dpif_netdev_meter_get, diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index e23a35da4f4..73641a5b5b5 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2017 Nicira, Inc. + * Copyright (c) 2008-2018 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -3429,6 +3429,13 @@ const struct dpif_class dpif_netlink_class = { dpif_netlink_ct_set_limits, dpif_netlink_ct_get_limits, dpif_netlink_ct_del_limits, + NULL, /* ipf_set_enabled */ + NULL, /* ipf_set_min_frag */ + NULL, /* ipf_set_max_nfrags */ + NULL, /* ipf_get_status */ + NULL, /* ipf_dump_start */ + NULL, /* ipf_dump_next */ + NULL, /* ipf_dump_done */ dpif_netlink_meter_get_features, dpif_netlink_meter_set, dpif_netlink_meter_get, diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index 78e153c8624..b2a4dff9645 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc. + * Copyright (c) 2009-2014, 2018 Nicira, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,6 +42,9 @@ struct dpif { long long int current_ms; }; +struct dpif_ipf_status; +struct ipf_dump_ctx; + void dpif_init(struct dpif *, const struct dpif_class *, const char *name, uint8_t netflow_engine_type, uint8_t netflow_engine_id); void dpif_uninit(struct dpif *dpif, bool close); @@ -78,6 +81,27 @@ struct ct_dpif_dump_state; struct ct_dpif_entry; struct ct_dpif_tuple; +/* 'dpif_ipf_proto_status' and 'dpif_ipf_status' are presently in + * sync with 'ipf_proto_status' and 'ipf_status', but more + * generally represent a superset of present and future support. */ +struct dpif_ipf_proto_status { + uint64_t nfrag_accepted; + uint64_t nfrag_completed_sent; + uint64_t nfrag_expired_sent; + uint64_t nfrag_too_small; + uint64_t nfrag_overlap; + uint64_t nfrag_purged; + unsigned int min_frag_size; + bool enabled; +}; + +struct dpif_ipf_status { + struct dpif_ipf_proto_status v4; + struct dpif_ipf_proto_status v6; + unsigned int nfrag; + unsigned int nfrag_max; +}; + /* Datapath interface class structure, to be defined by each implementation of * a datapath interface. * @@ -468,6 +492,33 @@ struct dpif_class { * list of 'struct ct_dpif_zone_limit' entries. */ int (*ct_del_limits)(struct dpif *, const struct ovs_list *zone_limits); + /* IP Fragmentation. */ + + /* Disables or enables conntrack fragment reassembly. The default + * setting is enabled. */ + int (*ipf_set_enabled)(struct dpif *, bool v6, bool enabled); + + /* Sets the minimum fragment size allowed. */ + int (*ipf_set_min_frag)(struct dpif *, bool v6, uint32_t min_frag); + + /* Sets the maximum number of fragments tracked. */ + int (*ipf_set_max_nfrags)(struct dpif *, uint32_t max_nfrags); + + /* Gets fragmentation configuration status and counters. */ + int (*ipf_get_status)(struct dpif *, + struct dpif_ipf_status *dpif_ipf_status); + + /* The following three APIs find and print ipf lists by creating a string + * representation of the state of an ipf list, to which 'dump' is pointed. + * 'ipf_dump_start()' allocates memory for 'ipf_dump_ctx'. + * 'ipf_dump_next()' finds the next ipf list and copies its + * characteristics to a string, which is freed by the caller. + * 'ipf_dump_done()' frees the 'ipf_dump_ctx' that was allocated in + * 'ipf_dump_start'. */ + int (*ipf_dump_start)(struct dpif *, struct ipf_dump_ctx **ipf_dump_ctx); + int (*ipf_dump_next)(struct dpif *, void *ipf_dump_ctx, char **dump); + int (*ipf_dump_done)(struct dpif *, void *ipf_dump_ctx); + /* Meters */ /* Queries 'dpif' for supported meter features. diff --git a/lib/ipf.c b/lib/ipf.c new file mode 100644 index 00000000000..df5196f2772 --- /dev/null +++ b/lib/ipf.c @@ -0,0 +1,1525 @@ +/* + * Copyright (c) 2019 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include <config.h> +#include <ctype.h> +#include <errno.h> +#include <sys/types.h> +#include <netinet/in.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <string.h> + +#include "coverage.h" +#include "csum.h" +#include "ipf.h" +#include "latch.h" +#include "openvswitch/hmap.h" +#include "openvswitch/poll-loop.h" +#include "openvswitch/vlog.h" +#include "ovs-atomic.h" +#include "packets.h" +#include "util.h" + +VLOG_DEFINE_THIS_MODULE(ipf); +COVERAGE_DEFINE(ipf_stuck_frag_list_purged); + +enum { + IPV4_PACKET_MAX_HDR_SIZE = 60, + IPV4_PACKET_MAX_SIZE = 65535, + IPV6_PACKET_MAX_DATA = 65535, +}; + +enum ipf_list_state { + IPF_LIST_STATE_UNUSED, + IPF_LIST_STATE_REASS_FAIL, + IPF_LIST_STATE_OTHER_SEEN, + IPF_LIST_STATE_FIRST_SEEN, + IPF_LIST_STATE_LAST_SEEN, + IPF_LIST_STATE_FIRST_LAST_SEEN, + IPF_LIST_STATE_COMPLETED, + IPF_LIST_STATE_NUM, +}; + +static char *ipf_state_name[IPF_LIST_STATE_NUM] = + {"unused", "reassemble fail", "other frag", "first frag", "last frag", + "first/last frag", "complete"}; + +enum ipf_list_type { + IPF_FRAG_COMPLETED_LIST, + IPF_FRAG_EXPIRY_LIST, +}; + +enum { + IPF_INVALID_IDX = -1, + IPF_V4_FRAG_SIZE_LBOUND = 400, + IPF_V4_FRAG_SIZE_MIN_DEF = 1200, + IPF_V6_FRAG_SIZE_LBOUND = 400, /* Useful for testing. */ + IPF_V6_FRAG_SIZE_MIN_DEF = 1280, + IPF_MAX_FRAGS_DEFAULT = 1000, + IPF_NFRAG_UBOUND = 5000, +}; + +enum ipf_counter_type { + IPF_NFRAGS_ACCEPTED, + IPF_NFRAGS_COMPL_SENT, + IPF_NFRAGS_EXPD_SENT, + IPF_NFRAGS_TOO_SMALL, + IPF_NFRAGS_OVERLAP, + IPF_NFRAGS_PURGED, + IPF_NFRAGS_NUM_CNTS, +}; + +union ipf_addr { + ovs_be32 ipv4; + struct in6_addr ipv6; +}; + +/* Represents a single fragment; part of a list of fragments. */ +struct ipf_frag { + struct dp_packet *pkt; + uint16_t start_data_byte; + uint16_t end_data_byte; + bool dnsteal; /* 'do not steal': if true, ipf should not free packet. */ +}; + +/* The key for a collection of fragments potentially making up an unfragmented + * packet. */ +struct ipf_list_key { + /* ipf_list_key_hash() requires 'src_addr' and 'dst_addr' to be the first + * two members. */ + union ipf_addr src_addr; + union ipf_addr dst_addr; + uint32_t recirc_id; + ovs_be32 ip_id; /* V6 is 32 bits. */ + ovs_be16 dl_type; + uint16_t zone; + uint8_t nw_proto; +}; + +/* A collection of fragments potentially making up an unfragmented packet. */ +struct ipf_list { + struct hmap_node node; /* In struct ipf's 'frag_lists'. */ + struct ovs_list list_node; /* In struct ipf's 'frag_exp_list' or + * 'frag_complete_list'. */ + struct ipf_frag *frag_list; /* List of fragments for this list. */ + struct ipf_list_key key; /* The key for the fragment list. */ + struct dp_packet *reass_execute_ctx; /* Reassembled packet. */ + long long expiration; /* In milliseconds. */ + int last_sent_idx; /* Last sent fragment idx. */ + int last_inuse_idx; /* Last inuse fragment idx. */ + int size; /* Fragment list size. */ + uint8_t state; /* Frag list state; see ipf_list_state. */ +}; + +/* Represents a reassembled packet which typically is passed through + * conntrack. */ +struct reassembled_pkt { + struct ovs_list rp_list_node; /* In struct ipf's + * 'reassembled_pkt_list'. */ + struct dp_packet *pkt; + struct ipf_list *list; +}; + +struct ipf { + /* The clean thread is used to clean up fragments in the 'ipf' + * module if packet batches are no longer sent through it by its user. */ + pthread_t ipf_clean_thread; + struct latch ipf_clean_thread_exit; + + int max_v4_frag_list_size; + + struct ovs_mutex ipf_lock; /* Protects all of the following. */ + /* These contain 'struct ipf_list's.
*/ + struct hmap frag_lists OVS_GUARDED; + struct ovs_list frag_exp_list OVS_GUARDED; + struct ovs_list frag_complete_list OVS_GUARDED; + /* Contains 'struct reassembled_pkt's. */ + struct ovs_list reassembled_pkt_list OVS_GUARDED; + + /* Used to allow disabling fragmentation reassembly. */ + atomic_bool ifp_v4_enabled; + atomic_bool ifp_v6_enabled; + + /* Will be clamped above 400 bytes; the value chosen should handle + * alg control packets of interest that use string encoding of mutable + * IP fields; meaning, the control packets should not be fragmented. */ + atomic_uint min_v4_frag_size; + atomic_uint min_v6_frag_size; + + /* Configurable maximum allowable fragments in process. */ + atomic_uint nfrag_max; + + /* Number of fragments in process. */ + atomic_count nfrag; + + atomic_uint64_t n4frag_cnt[IPF_NFRAGS_NUM_CNTS]; + atomic_uint64_t n6frag_cnt[IPF_NFRAGS_NUM_CNTS]; +}; + +static void +ipf_print_reass_packet(const char *es, const void *pkt) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + if (!VLOG_DROP_WARN(&rl)) { + struct ds ds = DS_EMPTY_INITIALIZER; + ds_put_hex_dump(&ds, pkt, 128, 0, false); + VLOG_WARN("%s\n%s", es, ds_cstr(&ds)); + ds_destroy(&ds); + } +} + +static void +ipf_count(struct ipf *ipf, bool v6, enum ipf_counter_type cntr) +{ + atomic_count_inc64(v6 ? &ipf->n6frag_cnt[cntr] : &ipf->n4frag_cnt[cntr]); +} + +static bool +ipf_get_v4_enabled(struct ipf *ipf) +{ + bool ifp_v4_enabled_; + atomic_read_relaxed(&ipf->ifp_v4_enabled, &ifp_v4_enabled_); + return ifp_v4_enabled_; +} + +static bool +ipf_get_v6_enabled(struct ipf *ipf) +{ + bool ifp_v6_enabled_; + atomic_read_relaxed(&ipf->ifp_v6_enabled, &ifp_v6_enabled_); + return ifp_v6_enabled_; +} + +static bool +ipf_get_enabled(struct ipf *ipf) +{ + return ipf_get_v4_enabled(ipf) || ipf_get_v6_enabled(ipf); +} + +static uint32_t +ipf_addr_hash_add(uint32_t hash, const union ipf_addr *addr) +{ + BUILD_ASSERT_DECL(sizeof *addr % 4 == 0); + return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr); +} + +/* Adds a list of fragments to the list tracking expiry of yet to be + * completed reassembled packets, hence subject to expiry. */ +static void +ipf_expiry_list_add(struct ovs_list *frag_exp_list, struct ipf_list *ipf_list, + long long now) + /* OVS_REQUIRES(ipf->ipf_lock) */ +{ + enum { + IPF_FRAG_LIST_TIMEOUT = 15000, + }; + + ipf_list->expiration = now + IPF_FRAG_LIST_TIMEOUT; + ovs_list_push_back(frag_exp_list, &ipf_list->list_node); +} + +/* Adds a list of fragments to the list of completed packets, which will be + * subsequently transmitted. */ +static void +ipf_completed_list_add(struct ovs_list *frag_complete_list, + struct ipf_list *ipf_list) + /* OVS_REQUIRES(ipf_lock) */ +{ + ovs_list_push_back(frag_complete_list, &ipf_list->list_node); +} + +/* Adds a reassembled packet to the list of reassembled packets, awaiting some + * processing, such as being sent through conntrack. */ +static void +ipf_reassembled_list_add(struct ovs_list *reassembled_pkt_list, + struct reassembled_pkt *rp) + /* OVS_REQUIRES(ipf_lock) */ +{ + ovs_list_push_back(reassembled_pkt_list, &rp->rp_list_node); +} + +/* Removes a frag list from tracking data structures and frees list heap + * memory.
*/ +static void +ipf_list_clean(struct hmap *frag_lists, + struct ipf_list *ipf_list) + /* OVS_REQUIRES(ipf_lock) */ +{ + ovs_list_remove(&ipf_list->list_node); + hmap_remove(frag_lists, &ipf_list->node); + free(ipf_list->frag_list); + free(ipf_list); +} + +/* Removes a frag list sitting on the expiry list from tracking + * data structures and frees list heap memory. */ +static void +ipf_expiry_list_clean(struct hmap *frag_lists, + struct ipf_list *ipf_list) + /* OVS_REQUIRES(ipf_lock) */ +{ + ipf_list_clean(frag_lists, ipf_list); +} + +/* Removes a frag list sitting on the completed list from tracking + * data structures and frees list heap memory. */ +static void +ipf_completed_list_clean(struct hmap *frag_lists, + struct ipf_list *ipf_list) + /* OVS_REQUIRES(ipf_lock) */ +{ + ipf_list_clean(frag_lists, ipf_list); +} + +static void +ipf_expiry_list_remove(struct ipf_list *ipf_list) + /* OVS_REQUIRES(ipf_lock) */ +{ + ovs_list_remove(&ipf_list->list_node); +} + +static void +ipf_reassembled_list_remove(struct reassembled_pkt *rp) + /* OVS_REQUIRES(ipf_lock) */ +{ + ovs_list_remove(&rp->rp_list_node); +} + +/* Symmetric: yields the same hash with 'src_addr' and 'dst_addr' swapped. */ +static uint32_t +ipf_list_key_hash(const struct ipf_list_key *key, uint32_t basis) +{ + uint32_t hsrc, hdst, hash; + hsrc = hdst = basis; + hsrc = ipf_addr_hash_add(hsrc, &key->src_addr); + hdst = ipf_addr_hash_add(hdst, &key->dst_addr); + hash = hsrc ^ hdst; + + /* Hash the rest of the key. */ + return hash_words((uint32_t *) (&key->dst_addr + 1), + (uint32_t *) (key + 1) - + (uint32_t *) (&key->dst_addr + 1), + hash); +} + +static bool +ipf_is_first_v4_frag(const struct dp_packet *pkt) +{ + const struct ip_header *l3 = dp_packet_l3(pkt); + if (!(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) && + l3->ip_frag_off & htons(IP_MORE_FRAGMENTS)) { + return true; + } + return false; +} + +static bool +ipf_is_last_v4_frag(const struct dp_packet *pkt) +{ + const struct ip_header *l3 = dp_packet_l3(pkt); + if (l3->ip_frag_off & htons(IP_FRAG_OFF_MASK) && + !(l3->ip_frag_off & htons(IP_MORE_FRAGMENTS))) { + return true; + } + return false; +} + +static bool +ipf_is_v6_frag(ovs_be16 ip6f_offlg) +{ + if (ip6f_offlg & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) { + return true; + } + return false; +} + +static bool +ipf_is_first_v6_frag(ovs_be16 ip6f_offlg) +{ + if (!(ip6f_offlg & IP6F_OFF_MASK) && + ip6f_offlg & IP6F_MORE_FRAG) { + return true; + } + return false; +} + +static bool +ipf_is_last_v6_frag(ovs_be16 ip6f_offlg) +{ + if ((ip6f_offlg & IP6F_OFF_MASK) && + !(ip6f_offlg & IP6F_MORE_FRAG)) { + return true; + } + return false; +} + +/* Checks for a completed packet collection of fragments. */ +static bool +ipf_list_complete(const struct ipf_list *ipf_list) + /* OVS_REQUIRES(ipf_lock) */ +{ + for (int i = 1; i <= ipf_list->last_inuse_idx; i++) { + if (ipf_list->frag_list[i - 1].end_data_byte + 1 + != ipf_list->frag_list[i].start_data_byte) { + return false; + } + } + return true; +} + +/* Runs O(n) for a sorted or almost sorted list. */ +static void +ipf_sort(struct ipf_frag *frag_list, size_t last_idx) + /* OVS_REQUIRES(ipf_lock) */ +{ + for (int li = 1; li <= last_idx; li++) { + struct ipf_frag ipf_frag = frag_list[li]; + int ci = li - 1; + while (ci >= 0 && + frag_list[ci].start_data_byte > ipf_frag.start_data_byte) { + frag_list[ci + 1] = frag_list[ci]; + ci--; + } + frag_list[ci + 1] = ipf_frag; + } +} + +/* Called on a sorted complete list of v4 fragments to reassemble them into + * a single packet that can be processed, such as passing through conntrack.
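For example, fragments covering data bytes [0, 1199], [1200, 2399] and + * [2400, 2799] pass the ipf_list_complete() contiguity check above (each + * 'end_data_byte' + 1 matches the next 'start_data_byte'); reassembly then + * clones the first fragment and appends the L4 payload of each later + * fragment to it.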
+ */ +static struct dp_packet * +ipf_reassemble_v4_frags(struct ipf_list *ipf_list) + /* OVS_REQUIRES(ipf_lock) */ +{ + struct ipf_frag *frag_list = ipf_list->frag_list; + struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt); + struct ip_header *l3 = dp_packet_l3(pkt); + int len = ntohs(l3->ip_tot_len); + + int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte - + frag_list[1].start_data_byte + 1; + + if (len + rest_len > IPV4_PACKET_MAX_SIZE) { + ipf_print_reass_packet( + "Unsupported big reassembled v4 packet; v4 hdr:", l3); + dp_packet_delete(pkt); + return NULL; + } + + dp_packet_prealloc_tailroom(pkt, len + rest_len); + + for (int i = 1; i <= ipf_list->last_inuse_idx; i++) { + size_t add_len = frag_list[i].end_data_byte - + frag_list[i].start_data_byte + 1; + len += add_len; + const char *l4 = dp_packet_l4(frag_list[i].pkt); + dp_packet_put(pkt, l4, add_len); + } + l3 = dp_packet_l3(pkt); + ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS); + l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off, + new_ip_frag_off); + l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len)); + l3->ip_tot_len = htons(len); + l3->ip_frag_off = new_ip_frag_off; + dp_packet_set_l2_pad_size(pkt, 0); + + return pkt; +} + +/* Called on a sorted complete list of v6 fragments to reassemble them into + * a single packet that can be processed, such as passing through conntrack. + */ +static struct dp_packet * +ipf_reassemble_v6_frags(struct ipf_list *ipf_list) + /* OVS_REQUIRES(ipf_lock) */ +{ + struct ipf_frag *frag_list = ipf_list->frag_list; + struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt); + struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt); + int pl = ntohs(l3->ip6_plen) - sizeof(struct ovs_16aligned_ip6_frag); + + int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte - + frag_list[1].start_data_byte + 1; + + if (pl + rest_len > IPV6_PACKET_MAX_DATA) { + ipf_print_reass_packet( + "Unsupported big reassembled v6 packet; v6 hdr:", l3); + dp_packet_delete(pkt); + return NULL; + } + + dp_packet_prealloc_tailroom(pkt, pl + rest_len); + + for (int i = 1; i <= ipf_list->last_inuse_idx; i++) { + size_t add_len = frag_list[i].end_data_byte - + frag_list[i].start_data_byte + 1; + pl += add_len; + const char *l4 = dp_packet_l4(frag_list[i].pkt); + dp_packet_put(pkt, l4, add_len); + } + + l3 = dp_packet_l3(pkt); + + uint8_t nw_proto = l3->ip6_nxt; + uint8_t nw_frag = 0; + const void *data = l3 + 1; + size_t datasize = pl; + + const struct ovs_16aligned_ip6_frag *frag_hdr = NULL; + if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr) + || !nw_frag || !frag_hdr) { + + ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3); + dp_packet_delete(pkt); + return NULL; + } + + struct ovs_16aligned_ip6_frag *fh = + CONST_CAST(struct ovs_16aligned_ip6_frag *, frag_hdr); + fh->ip6f_offlg = 0; + l3->ip6_plen = htons(pl); + l3->ip6_ctlun.ip6_un1.ip6_un1_nxt = nw_proto; + dp_packet_set_l2_pad_size(pkt, 0); + return pkt; +} + +/* Called when a frag list state transitions to another state.
This is + * triggered by a new fragment for the list being received. */ +static void +ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list, + bool ff, bool lf, bool v6) + OVS_REQUIRES(ipf->ipf_lock) +{ + enum ipf_list_state curr_state = ipf_list->state; + enum ipf_list_state next_state; + switch (curr_state) { + case IPF_LIST_STATE_UNUSED: + case IPF_LIST_STATE_OTHER_SEEN: + if (ff) { + next_state = IPF_LIST_STATE_FIRST_SEEN; + } else if (lf) { + next_state = IPF_LIST_STATE_LAST_SEEN; + } else { + next_state = IPF_LIST_STATE_OTHER_SEEN; + } + break; + case IPF_LIST_STATE_FIRST_SEEN: + if (ff) { + next_state = IPF_LIST_STATE_FIRST_SEEN; + } else if (lf) { + next_state = IPF_LIST_STATE_FIRST_LAST_SEEN; + } else { + next_state = IPF_LIST_STATE_FIRST_SEEN; + } + break; + case IPF_LIST_STATE_LAST_SEEN: + if (ff) { + next_state = IPF_LIST_STATE_FIRST_LAST_SEEN; + } else if (lf) { + next_state = IPF_LIST_STATE_LAST_SEEN; + } else { + next_state = IPF_LIST_STATE_LAST_SEEN; + } + break; + case IPF_LIST_STATE_FIRST_LAST_SEEN: + next_state = IPF_LIST_STATE_FIRST_LAST_SEEN; + break; + case IPF_LIST_STATE_COMPLETED: + case IPF_LIST_STATE_REASS_FAIL: + case IPF_LIST_STATE_NUM: + default: + OVS_NOT_REACHED(); + } + + if (next_state == IPF_LIST_STATE_FIRST_LAST_SEEN) { + ipf_sort(ipf_list->frag_list, ipf_list->last_inuse_idx); + if (ipf_list_complete(ipf_list)) { + struct dp_packet *reass_pkt = v6 + ? ipf_reassemble_v6_frags(ipf_list) + : ipf_reassemble_v4_frags(ipf_list); + if (reass_pkt) { + struct reassembled_pkt *rp = xzalloc(sizeof *rp); + rp->pkt = reass_pkt; + rp->list = ipf_list; + ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp); + ipf_expiry_list_remove(ipf_list); + next_state = IPF_LIST_STATE_COMPLETED; + } else { + next_state = IPF_LIST_STATE_REASS_FAIL; + } + } + } + ipf_list->state = next_state; +} + +/* Some sanity checks are redundant, but prudent, in case code paths for + * fragments change in future. The processing cost for fragments is not + * important.
*/ +static bool +ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt) +{ + if (OVS_UNLIKELY(dp_packet_ip_checksum_bad(pkt))) { + goto invalid_pkt; + } + + const struct eth_header *l2 = dp_packet_eth(pkt); + const struct ip_header *l3 = dp_packet_l3(pkt); + + if (OVS_UNLIKELY(!l2 || !l3)) { + goto invalid_pkt; + } + + size_t l3_size = dp_packet_l3_size(pkt); + if (OVS_UNLIKELY(l3_size < IP_HEADER_LEN)) { + goto invalid_pkt; + } + + if (!IP_IS_FRAGMENT(l3->ip_frag_off)) { + return false; + } + + uint16_t ip_tot_len = ntohs(l3->ip_tot_len); + if (OVS_UNLIKELY(ip_tot_len != l3_size)) { + goto invalid_pkt; + } + + size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4; + if (OVS_UNLIKELY(ip_hdr_len < IP_HEADER_LEN)) { + goto invalid_pkt; + } + if (OVS_UNLIKELY(l3_size < ip_hdr_len)) { + goto invalid_pkt; + } + + if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt) + && csum(l3, ip_hdr_len) != 0)) { + goto invalid_pkt; + } + + uint32_t min_v4_frag_size_; + atomic_read_relaxed(&ipf->min_v4_frag_size, &min_v4_frag_size_); + bool lf = ipf_is_last_v4_frag(pkt); + if (OVS_UNLIKELY(!lf && dp_packet_size(pkt) < min_v4_frag_size_)) { + ipf_count(ipf, false, IPF_NFRAGS_TOO_SMALL); + goto invalid_pkt; + } + return true; + +invalid_pkt: + pkt->md.ct_state = CS_INVALID; + return false; +} + +static bool +ipf_v4_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone, + struct ipf_list_key *key, uint16_t *start_data_byte, + uint16_t *end_data_byte, bool *ff, bool *lf) +{ + const struct ip_header *l3 = dp_packet_l3(pkt); + uint16_t ip_tot_len = ntohs(l3->ip_tot_len); + size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4; + + *start_data_byte = ntohs(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) * 8; + *end_data_byte = *start_data_byte + ip_tot_len - ip_hdr_len - 1; + *ff = ipf_is_first_v4_frag(pkt); + *lf = ipf_is_last_v4_frag(pkt); + memset(key, 0, sizeof *key); + key->ip_id = be16_to_be32(l3->ip_id); + key->dl_type = dl_type; + key->src_addr.ipv4 = get_16aligned_be32(&l3->ip_src); + key->dst_addr.ipv4 = get_16aligned_be32(&l3->ip_dst); + key->nw_proto = l3->ip_proto; + key->zone = zone; + key->recirc_id = pkt->md.recirc_id; + return true; +} + +/* Some sanity checks are redundant, but prudent, in case code paths for + * fragments change in future. The processing cost for fragments is not + * important. 
*/ +static bool +ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt) +{ + const struct eth_header *l2 = dp_packet_eth(pkt); + const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt); + const char *l4 = dp_packet_l4(pkt); + + if (OVS_UNLIKELY(!l2 || !l3 || !l4)) { + goto invalid_pkt; + } + + size_t l3_size = dp_packet_l3_size(pkt); + size_t l3_hdr_size = sizeof *l3; + + if (OVS_UNLIKELY(l3_size < l3_hdr_size)) { + goto invalid_pkt; + } + + uint8_t nw_frag = 0; + uint8_t nw_proto = l3->ip6_nxt; + const void *data = l3 + 1; + size_t datasize = l3_size - l3_hdr_size; + const struct ovs_16aligned_ip6_frag *frag_hdr = NULL; + if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, + &frag_hdr) || !nw_frag || !frag_hdr) { + return false; + } + + int pl = ntohs(l3->ip6_plen); + if (OVS_UNLIKELY(pl + l3_hdr_size != l3_size)) { + goto invalid_pkt; + } + + ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg; + if (OVS_UNLIKELY(!ipf_is_v6_frag(ip6f_offlg))) { + return false; + } + + uint32_t min_v6_frag_size_; + atomic_read_relaxed(&ipf->min_v6_frag_size, &min_v6_frag_size_); + bool lf = ipf_is_last_v6_frag(ip6f_offlg); + + if (OVS_UNLIKELY(!lf && dp_packet_size(pkt) < min_v6_frag_size_)) { + ipf_count(ipf, true, IPF_NFRAGS_TOO_SMALL); + goto invalid_pkt; + } + + return true; + +invalid_pkt: + pkt->md.ct_state = CS_INVALID; + return false; + +} + +static void +ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone, + struct ipf_list_key *key, uint16_t *start_data_byte, + uint16_t *end_data_byte, bool *ff, bool *lf) +{ + const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt); + const char *l4 = dp_packet_l4(pkt); + const char *tail = dp_packet_tail(pkt); + uint8_t pad = dp_packet_l2_pad_size(pkt); + size_t l3_size = tail - (char *)l3 - pad; + size_t l4_size = tail - (char *)l4 - pad; + size_t l3_hdr_size = sizeof *l3; + uint8_t nw_frag = 0; + uint8_t nw_proto = l3->ip6_nxt; + const void *data = l3 + 1; + size_t datasize = l3_size - l3_hdr_size; + const struct ovs_16aligned_ip6_frag *frag_hdr = NULL; + + parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr); + ovs_assert(nw_frag && frag_hdr); + ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg; + *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) + + sizeof (struct ovs_16aligned_ip6_frag); + *end_data_byte = *start_data_byte + l4_size - 1; + *ff = ipf_is_first_v6_frag(ip6f_offlg); + *lf = ipf_is_last_v6_frag(ip6f_offlg); + memset(key, 0, sizeof *key); + key->ip_id = get_16aligned_be32(&frag_hdr->ip6f_ident); + key->dl_type = dl_type; + memcpy(&key->src_addr.ipv6, &l3->ip6_src, sizeof key->src_addr.ipv6); + /* We are not supporting parsing of the routing header to use as the + * dst address part of the key. */ + memcpy(&key->dst_addr.ipv6, &l3->ip6_dst, sizeof key->dst_addr.ipv6); + key->nw_proto = 0; /* Not used for key for V6. 
*/ + key->zone = zone; + key->recirc_id = pkt->md.recirc_id; +} + +static bool +ipf_list_key_eq(const struct ipf_list_key *key1, + const struct ipf_list_key *key2) + /* OVS_REQUIRES(ipf_lock) */ +{ + if (!memcmp(&key1->src_addr, &key2->src_addr, sizeof key1->src_addr) && + !memcmp(&key1->dst_addr, &key2->dst_addr, sizeof key1->dst_addr) && + key1->dl_type == key2->dl_type && + key1->ip_id == key2->ip_id && + key1->zone == key2->zone && + key1->nw_proto == key2->nw_proto && + key1->recirc_id == key2->recirc_id) { + return true; + } + return false; +} + +static struct ipf_list * +ipf_list_key_lookup(struct ipf *ipf, const struct ipf_list_key *key, + uint32_t hash) + /* OVS_REQUIRES(ipf->ipf_lock) */ +{ + struct ipf_list *ipf_list; + HMAP_FOR_EACH_WITH_HASH (ipf_list, node, hash, &ipf->frag_lists) { + if (ipf_list_key_eq(&ipf_list->key, key)) { + return ipf_list; + } + } + return NULL; +} + +static bool +ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx, + size_t start_data_byte, size_t end_data_byte) + /* OVS_REQUIRES(ipf_lock) */ +{ + for (int i = 0; i <= last_inuse_idx; i++) { + if ((start_data_byte >= frag_list[i].start_data_byte && + start_data_byte <= frag_list[i].end_data_byte) || + (end_data_byte >= frag_list[i].start_data_byte && + end_data_byte <= frag_list[i].end_data_byte)) { + return true; + } + } + return false; +} + +/* Adds a fragment to a list of fragments, if the fragment is not a + * duplicate. If the fragment is a duplicate, that fragment is marked + * invalid to avoid the work that conntrack would do to mark the fragment + * as invalid, which it will in all cases. */ +static bool +ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list, + struct dp_packet *pkt, uint16_t start_data_byte, + uint16_t end_data_byte, bool ff, bool lf, bool v6, + bool dnsteal) + OVS_REQUIRES(ipf->ipf_lock) +{ + bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list, + ipf_list->last_inuse_idx, start_data_byte, end_data_byte); + int last_inuse_idx = ipf_list->last_inuse_idx; + + if (!duped_frag) { + if (last_inuse_idx < ipf_list->size - 1) { + /* In the case of dpdk, it would be unfortunate if we had + * to create a clone fragment outside the dpdk mp due to the + * mempool size being too limited. We will otherwise need to + * recommend not setting the mempool number of buffers too low + * and also clamp the number of fragments. 
*/ + struct ipf_frag *frag = &ipf_list->frag_list[last_inuse_idx + 1]; + frag->pkt = pkt; + frag->start_data_byte = start_data_byte; + frag->end_data_byte = end_data_byte; + frag->dnsteal = dnsteal; + ipf_list->last_inuse_idx++; + atomic_count_inc(&ipf->nfrag); + ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED); + ipf_list_state_transition(ipf, ipf_list, ff, lf, v6); + } else { + OVS_NOT_REACHED(); + } + } else { + ipf_count(ipf, v6, IPF_NFRAGS_OVERLAP); + pkt->md.ct_state = CS_INVALID; + return false; + } + return true; +} + +static void +ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key, + int max_frag_list_size) +{ + ipf_list->key = *key; + ipf_list->last_inuse_idx = IPF_INVALID_IDX; + ipf_list->last_sent_idx = IPF_INVALID_IDX; + ipf_list->reass_execute_ctx = NULL; + ipf_list->state = IPF_LIST_STATE_UNUSED; + ipf_list->size = max_frag_list_size; + ipf_list->frag_list + = xzalloc(ipf_list->size * sizeof *ipf_list->frag_list); +} + +/* Generates a fragment list key from a well-formed fragment and either starts + * a new fragment list or increases the size of the existing fragment list, + * while checking if the maximum number of supported fragments is exceeded or + * the list size is impossibly big. Calls 'ipf_process_frag()' to add a + * fragment to a list of fragments. */ +static bool +ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type, + uint16_t zone, long long now, uint32_t hash_basis, + bool dnsteal) + OVS_REQUIRES(ipf->ipf_lock) +{ + struct ipf_list_key key; + /* Initialize 4 variables for some versions of GCC. */ + uint16_t start_data_byte = 0; + uint16_t end_data_byte = 0; + bool ff = false; + bool lf = false; + bool v6 = dl_type == htons(ETH_TYPE_IPV6); + + if (v6 && ipf_get_v6_enabled(ipf)) { + ipf_v6_key_extract(pkt, dl_type, zone, &key, &start_data_byte, + &end_data_byte, &ff, &lf); + } else if (!v6 && ipf_get_v4_enabled(ipf)) { + ipf_v4_key_extract(pkt, dl_type, zone, &key, &start_data_byte, + &end_data_byte, &ff, &lf); + } else { + return false; + } + + unsigned int nfrag_max; + atomic_read_relaxed(&ipf->nfrag_max, &nfrag_max); + if (atomic_count_get(&ipf->nfrag) >= nfrag_max) { + return false; + } + + uint32_t hash = ipf_list_key_hash(&key, hash_basis); + struct ipf_list *ipf_list = ipf_list_key_lookup(ipf, &key, hash); + enum { + IPF_FRAG_LIST_MIN_INCREMENT = 4, + IPF_IPV6_MAX_FRAG_LIST_SIZE = 65535, + }; + + int max_frag_list_size; + if (v6) { + /* Because the calculation with extension headers is variable, + * we don't calculate a hard maximum fragment list size upfront. The + * fragment list size is practically limited by the code, however. */ + max_frag_list_size = IPF_IPV6_MAX_FRAG_LIST_SIZE; + } else { + max_frag_list_size = ipf->max_v4_frag_list_size; + } + + if (!ipf_list) { + ipf_list = xmalloc(sizeof *ipf_list); + ipf_list_init(ipf_list, &key, + MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT)); + hmap_insert(&ipf->frag_lists, &ipf_list->node, hash); + ipf_expiry_list_add(&ipf->frag_exp_list, ipf_list, now); + } else if (ipf_list->state == IPF_LIST_STATE_REASS_FAIL) { + /* Bail out as early as possible. */ + return false; + } else if (ipf_list->last_inuse_idx + 1 >= ipf_list->size) { + int increment = MIN(IPF_FRAG_LIST_MIN_INCREMENT, + max_frag_list_size - ipf_list->size); + /* Enforce limit.
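The frag list grows by at most + * 'IPF_FRAG_LIST_MIN_INCREMENT' entries at a time and never beyond + * 'max_frag_list_size'; once it can no longer grow, the fragment is + * rejected.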
*/ + if (increment > 0) { + ipf_list->frag_list = + xrealloc(ipf_list->frag_list, (ipf_list->size + increment) * + sizeof *ipf_list->frag_list); + ipf_list->size += increment; + } else { + return false; + } + } + + return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte, + end_data_byte, ff, lf, v6, dnsteal); +} + +/* Filters out fragments from a batch of fragments and adjusts the batch. */ +static void +ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb, + ovs_be16 dl_type, uint16_t zone, long long now, + uint32_t hash_basis) +{ + const size_t pb_cnt = dp_packet_batch_size(pb); + int pb_idx; /* Index in a packet batch. */ + struct dp_packet *pkt; + + DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) { + if (OVS_UNLIKELY((dl_type == htons(ETH_TYPE_IP) && + ipf_is_valid_v4_frag(ipf, pkt)) + || + (dl_type == htons(ETH_TYPE_IPV6) && + ipf_is_valid_v6_frag(ipf, pkt)))) { + + ovs_mutex_lock(&ipf->ipf_lock); + if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis, + pb->do_not_steal)) { + dp_packet_batch_refill(pb, pkt, pb_idx); + } + ovs_mutex_unlock(&ipf->ipf_lock); + } else { + dp_packet_batch_refill(pb, pkt, pb_idx); + } + } +} + +/* In case of DPDK, a memory source check is done, as DPDK memory pool + * management has trouble dealing with multiple source types. The + * check_source parameter is used to indicate when this check is needed. */ +static bool +ipf_dp_packet_batch_add(struct dp_packet_batch *pb, struct dp_packet *pkt, + bool check_source OVS_UNUSED) +{ +#ifdef DPDK_NETDEV + if ((dp_packet_batch_is_full(pb)) || + /* DPDK cannot handle multiple sources in a batch. */ + (check_source && !dp_packet_batch_is_empty(pb) + && pb->packets[0]->source != pkt->source)) { +#else + if (dp_packet_batch_is_full(pb)) { +#endif + return false; + } + + dp_packet_batch_add(pb, pkt); + return true; +} + +/* This would be used in rare cases where a list cannot be sent. One rare + * reason known right now is a mempool source check, which exists due to DPDK + * support, where packets are no longer being received on any port with a + * source matching the fragment. Another reason is a race where all + * conntrack rules are unconfigured when some fragments are yet to be + * flushed. + * + * Returns true if the list was purged. */ +static bool +ipf_purge_list_check(struct ipf *ipf, struct ipf_list *ipf_list, + long long now) + OVS_REQUIRES(ipf->ipf_lock) +{ + enum { + IPF_FRAG_LIST_PURGE_TIME_ADJ = 10000 + }; + + if (now < ipf_list->expiration + IPF_FRAG_LIST_PURGE_TIME_ADJ) { + return false; + } + + while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) { + struct dp_packet *pkt + = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt; + dp_packet_delete(pkt); + atomic_count_dec(&ipf->nfrag); + COVERAGE_INC(ipf_stuck_frag_list_purged); + ipf_count(ipf, ipf_list->key.dl_type == htons(ETH_TYPE_IPV6), + IPF_NFRAGS_PURGED); + ipf_list->last_sent_idx++; + } + + return true; +} + +/* Does the packet batch management and common accounting work associated + * with 'ipf_send_completed_frags()' and 'ipf_send_expired_frags()'.
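Returns + * true once every fragment in the list has been sent, or the list has been + * purged, so that the caller can clean the list up; returns false when the + * outgoing batch fills up first, in which case the remaining fragments are + * sent with a later batch.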
+/* Does the packet batch management and common accounting work associated
+ * with 'ipf_send_completed_frags()' and 'ipf_send_expired_frags()'. */
+static bool
+ipf_send_frags_in_list(struct ipf *ipf, struct ipf_list *ipf_list,
+                       struct dp_packet_batch *pb,
+                       enum ipf_list_type list_type, bool v6, long long now)
+    OVS_REQUIRES(ipf->ipf_lock)
+{
+    if (ipf_purge_list_check(ipf, ipf_list, now)) {
+        return true;
+    }
+
+    while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
+        struct dp_packet *pkt
+            = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
+        if (ipf_dp_packet_batch_add(pb, pkt, true)) {
+            ipf_list->last_sent_idx++;
+            atomic_count_dec(&ipf->nfrag);
+
+            if (list_type == IPF_FRAG_COMPLETED_LIST) {
+                ipf_count(ipf, v6, IPF_NFRAGS_COMPL_SENT);
+            } else {
+                ipf_count(ipf, v6, IPF_NFRAGS_EXPD_SENT);
+                pkt->md.ct_state = CS_INVALID;
+            }
+
+            if (ipf_list->last_sent_idx == ipf_list->last_inuse_idx) {
+                return true;
+            }
+        } else {
+            return false;
+        }
+    }
+    OVS_NOT_REACHED();
+}
+
+/* Adds fragments associated with a completed fragment list to a packet batch
+ * to be processed by the calling application, typically conntrack.  Also
+ * cleans up the list context when it is empty. */
+static void
+ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
+                         long long now, bool v6)
+{
+    if (ovs_list_is_empty(&ipf->frag_complete_list)) {
+        return;
+    }
+
+    ovs_mutex_lock(&ipf->ipf_lock);
+    struct ipf_list *ipf_list, *next;
+
+    LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_complete_list) {
+        if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST,
+                                   v6, now)) {
+            ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
+        } else {
+            break;
+        }
+    }
+
+    ovs_mutex_unlock(&ipf->ipf_lock);
+}
+
+/* Conservatively adds fragments associated with an expired fragment list to
+ * a packet batch to be processed by the calling application, typically
+ * conntrack.  Also cleans up the list context when it is empty. */
+static void
+ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb,
+                       long long now, bool v6)
+{
+    enum {
+        /* Very conservative, due to DoS probability. */
+        IPF_FRAG_LIST_MAX_EXPIRED = 1,
+    };
+
+    if (ovs_list_is_empty(&ipf->frag_exp_list)) {
+        return;
+    }
+
+    ovs_mutex_lock(&ipf->ipf_lock);
+    struct ipf_list *ipf_list, *next;
+    size_t lists_removed = 0;
+
+    LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_exp_list) {
+        if (now <= ipf_list->expiration ||
+            lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
+            break;
+        }
+
+        if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_EXPIRY_LIST,
+                                   v6, now)) {
+            ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
+            lists_removed++;
+        } else {
+            break;
+        }
+    }
+
+    ovs_mutex_unlock(&ipf->ipf_lock);
+}
+
+/* Adds a reassembled packet to a packet batch to be processed by the
+ * caller. */
+static void
+ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb)
+{
+    if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
+        return;
+    }
+
+    ovs_mutex_lock(&ipf->ipf_lock);
+    struct reassembled_pkt *rp, *next;
+
+    LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
+        if (!rp->list->reass_execute_ctx &&
+            ipf_dp_packet_batch_add(pb, rp->pkt, false)) {
+            rp->list->reass_execute_ctx = rp->pkt;
+        }
+    }
+
+    ovs_mutex_unlock(&ipf->ipf_lock);
+}
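ipf_post_execute_reass_pkts() below copies conntrack's verdict from the reassembled packet back onto each fragment and, for IPv4, patches the first fragment's addresses while updating the header checksum incrementally with recalc_csum32() rather than recomputing it. For readers unfamiliar with the technique, a standalone illustration of the RFC 1624 (eqn. 3) update it relies on; this sketch uses host-order 16-bit words for simplicity, whereas recalc_csum32() applies the same update to a big-endian 32-bit value as two such words:

    #include <stdint.h>
    #include <stdio.h>

    /* Ones'-complement checksum over 16-bit words (host order). */
    static uint16_t
    csum16(const uint16_t *data, int n)
    {
        uint32_t sum = 0;
        for (int i = 0; i < n; i++) {
            sum += data[i];
        }
        while (sum >> 16) {
            sum = (sum & 0xffff) + (sum >> 16);
        }
        return ~sum;
    }

    /* RFC 1624, eqn. 3: HC' = ~(~HC + ~m + m') for one changed word. */
    static uint16_t
    csum16_update(uint16_t csum, uint16_t old_w, uint16_t new_w)
    {
        uint32_t sum = (uint16_t) ~csum;
        sum += (uint16_t) ~old_w;
        sum += new_w;
        while (sum >> 16) {
            sum = (sum & 0xffff) + (sum >> 16);
        }
        return ~sum;
    }

    int
    main(void)
    {
        uint16_t hdr[4] = { 0x4500, 0x0054, 0x0a01, 0x0102 };
        uint16_t csum = csum16(hdr, 4);

        /* Rewrite one word, as the address copy below does, then update
         * incrementally; both printed checksums match (0xaea7). */
        uint16_t incr = csum16_update(csum, hdr[3], 0x0203);
        hdr[3] = 0x0203;
        printf("incremental=%#x recomputed=%#x\n", incr, csum16(hdr, 4));
        return 0;
    }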
+/* Checks for reassembled packets post-processing by conntrack and edits the
+ * fragments if needed based on what conntrack decided. */
+static void
+ipf_post_execute_reass_pkts(struct ipf *ipf,
+                            struct dp_packet_batch *pb, bool v6)
+{
+    if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
+        return;
+    }
+
+    ovs_mutex_lock(&ipf->ipf_lock);
+    struct reassembled_pkt *rp, *next;
+
+    LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
+        const size_t pb_cnt = dp_packet_batch_size(pb);
+        int pb_idx;
+        struct dp_packet *pkt;
+        /* Inner batch loop is constant time since batch size is <=
+         * NETDEV_MAX_BURST. */
+        DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
+            if (pkt == rp->list->reass_execute_ctx) {
+                for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
+                    rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
+                    rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
+                    rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
+                    rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
+                    rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
+                        pkt->md.ct_orig_tuple_ipv6;
+                    if (pkt->md.ct_orig_tuple_ipv6) {
+                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
+                            pkt->md.ct_orig_tuple.ipv6;
+                    } else {
+                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 =
+                            pkt->md.ct_orig_tuple.ipv4;
+                    }
+                }
+
+                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
+                const char *tail_frag = dp_packet_tail(frag_0->pkt);
+                uint8_t pad_frag = dp_packet_l2_pad_size(frag_0->pkt);
+                void *l4_frag = dp_packet_l4(frag_0->pkt);
+                void *l4_reass = dp_packet_l4(pkt);
+                memcpy(l4_frag, l4_reass,
+                       tail_frag - (char *) l4_frag - pad_frag);
+
+                if (v6) {
+                    struct ovs_16aligned_ip6_hdr *l3_frag
+                        = dp_packet_l3(frag_0->pkt);
+                    struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
+                    l3_frag->ip6_src = l3_reass->ip6_src;
+                    l3_frag->ip6_dst = l3_reass->ip6_dst;
+                } else {
+                    struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
+                    struct ip_header *l3_reass = dp_packet_l3(pkt);
+                    ovs_be32 reass_ip = get_16aligned_be32(&l3_reass->ip_src);
+                    ovs_be32 frag_ip = get_16aligned_be32(&l3_frag->ip_src);
+                    l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
+                                                     frag_ip, reass_ip);
+                    l3_frag->ip_src = l3_reass->ip_src;
+
+                    reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
+                    frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
+                    l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
+                                                     frag_ip, reass_ip);
+                    l3_frag->ip_dst = l3_reass->ip_dst;
+                }
+
+                ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
+                ipf_reassembled_list_remove(rp);
+                dp_packet_delete(rp->pkt);
+                free(rp);
+            } else {
+                dp_packet_batch_refill(pb, pkt, pb_idx);
+            }
+        }
+    }
+
+    ovs_mutex_unlock(&ipf->ipf_lock);
+}
+
+/* Extracts any fragments from the batch and reassembles them when a
+ * complete packet is received.  Completed packets are added back to the
+ * batch, when there is room, to be sent through conntrack. */
+void
+ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
+                         long long now, ovs_be16 dl_type, uint16_t zone,
+                         uint32_t hash_basis)
+{
+    if (ipf_get_enabled(ipf)) {
+        ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, hash_basis);
+    }
+
+    if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
+        ipf_execute_reass_pkts(ipf, pb);
+    }
+}
+
+/* Updates fragments based on the processing of the reassembled packet sent
+ * through conntrack and adds these fragments to any batches seen.  Expired
+ * fragments are marked as invalid and also added to the batches seen
+ * with low priority.  Reassembled packets are freed.
*/ +void +ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb, + long long now, ovs_be16 dl_type) +{ + if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) { + bool v6 = dl_type == htons(ETH_TYPE_IPV6); + ipf_post_execute_reass_pkts(ipf, pb, v6); + ipf_send_completed_frags(ipf, pb, now, v6); + ipf_send_expired_frags(ipf, pb, now, v6); + } +} + +static void * +ipf_clean_thread_main(void *f) +{ + struct ipf *ipf = f; + + enum { + IPF_FRAG_LIST_CLEAN_TIMEOUT = 60000, + }; + + while (!latch_is_set(&ipf->ipf_clean_thread_exit)) { + + long long now = time_msec(); + + if (!ovs_list_is_empty(&ipf->frag_exp_list) || + !ovs_list_is_empty(&ipf->frag_complete_list)) { + + ovs_mutex_lock(&ipf->ipf_lock); + + struct ipf_list *ipf_list, *next; + LIST_FOR_EACH_SAFE (ipf_list, next, list_node, + &ipf->frag_exp_list) { + if (ipf_purge_list_check(ipf, ipf_list, now)) { + ipf_expiry_list_clean(&ipf->frag_lists, ipf_list); + } + } + + LIST_FOR_EACH_SAFE (ipf_list, next, list_node, + &ipf->frag_complete_list) { + if (ipf_purge_list_check(ipf, ipf_list, now)) { + ipf_completed_list_clean(&ipf->frag_lists, ipf_list); + } + } + + ovs_mutex_unlock(&ipf->ipf_lock); + } + + poll_timer_wait_until(now + IPF_FRAG_LIST_CLEAN_TIMEOUT); + latch_wait(&ipf->ipf_clean_thread_exit); + poll_block(); + } + + return NULL; +} + +struct ipf * +ipf_init(void) +{ + struct ipf *ipf = xzalloc(sizeof *ipf); + + ovs_mutex_init_adaptive(&ipf->ipf_lock); + ovs_mutex_lock(&ipf->ipf_lock); + hmap_init(&ipf->frag_lists); + ovs_list_init(&ipf->frag_exp_list); + ovs_list_init(&ipf->frag_complete_list); + ovs_list_init(&ipf->reassembled_pkt_list); + atomic_init(&ipf->min_v4_frag_size, IPF_V4_FRAG_SIZE_MIN_DEF); + atomic_init(&ipf->min_v6_frag_size, IPF_V6_FRAG_SIZE_MIN_DEF); + ipf->max_v4_frag_list_size = DIV_ROUND_UP( + IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE, + ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE); + ovs_mutex_unlock(&ipf->ipf_lock); + atomic_count_init(&ipf->nfrag, 0); + for (size_t i = 0; i < IPF_NFRAGS_NUM_CNTS; i++) { + atomic_init(&ipf->n4frag_cnt[i], 0); + atomic_init(&ipf->n6frag_cnt[i], 0); + } + atomic_init(&ipf->nfrag_max, IPF_MAX_FRAGS_DEFAULT); + atomic_init(&ipf->ifp_v4_enabled, true); + atomic_init(&ipf->ifp_v6_enabled, true); + latch_init(&ipf->ipf_clean_thread_exit); + ipf->ipf_clean_thread = ovs_thread_create("ipf_clean", + ipf_clean_thread_main, ipf); + + return ipf; +} + +void +ipf_destroy(struct ipf *ipf) +{ + ovs_mutex_lock(&ipf->ipf_lock); + latch_set(&ipf->ipf_clean_thread_exit); + pthread_join(ipf->ipf_clean_thread, NULL); + latch_destroy(&ipf->ipf_clean_thread_exit); + + struct ipf_list *ipf_list; + HMAP_FOR_EACH_POP (ipf_list, node, &ipf->frag_lists) { + while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) { + struct dp_packet *pkt + = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt; + if (!ipf_list->frag_list[ipf_list->last_sent_idx + 1].dnsteal) { + dp_packet_delete(pkt); + } + atomic_count_dec(&ipf->nfrag); + ipf_list->last_sent_idx++; + } + free(ipf_list->frag_list); + free(ipf_list); + } + + if (atomic_count_get(&ipf->nfrag)) { + VLOG_WARN("ipf destroy with non-zero fragment count. 
"); + } + + struct reassembled_pkt *rp; + LIST_FOR_EACH_POP (rp, rp_list_node, &ipf->reassembled_pkt_list) { + dp_packet_delete(rp->pkt); + free(rp); + } + + hmap_destroy(&ipf->frag_lists); + ovs_list_poison(&ipf->frag_exp_list); + ovs_list_poison(&ipf->frag_complete_list); + ovs_list_poison(&ipf->reassembled_pkt_list); + ovs_mutex_unlock(&ipf->ipf_lock); + ovs_mutex_destroy(&ipf->ipf_lock); + free(ipf); +} + +int +ipf_set_enabled(struct ipf *ipf, bool v6, bool enable) +{ + atomic_store_relaxed(v6 ? &ipf->ifp_v6_enabled : &ipf->ifp_v4_enabled, + enable); + return 0; +} + +int +ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value) +{ + /* If the user specifies an unreasonably large number, fragmentation + * will not work well but it will not blow up. */ + if (value < (v6 ? IPF_V6_FRAG_SIZE_LBOUND : IPF_V4_FRAG_SIZE_LBOUND)) { + return 1; + } + + ovs_mutex_lock(&ipf->ipf_lock); + if (v6) { + atomic_store_relaxed(&ipf->min_v6_frag_size, value); + } else { + atomic_store_relaxed(&ipf->min_v4_frag_size, value); + ipf->max_v4_frag_list_size = DIV_ROUND_UP( + IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE, + ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE); + } + ovs_mutex_unlock(&ipf->ipf_lock); + return 0; +} + +int +ipf_set_max_nfrags(struct ipf *ipf, uint32_t value) +{ + if (value > IPF_NFRAG_UBOUND) { + return 1; + } + atomic_store_relaxed(&ipf->nfrag_max, value); + return 0; +} + +int +ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status) +{ + ipf_status->nfrag = atomic_count_get(&ipf->nfrag); + atomic_read_relaxed(&ipf->nfrag_max, &ipf_status->nfrag_max); + + atomic_read_relaxed(&ipf->ifp_v4_enabled, &ipf_status->v4.enabled); + atomic_read_relaxed(&ipf->min_v4_frag_size, + &ipf_status->v4.min_frag_size); + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_ACCEPTED], + &ipf_status->v4.nfrag_accepted); + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_COMPL_SENT], + &ipf_status->v4.nfrag_completed_sent); + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_EXPD_SENT], + &ipf_status->v4.nfrag_expired_sent); + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_TOO_SMALL], + &ipf_status->v4.nfrag_too_small); + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_OVERLAP], + &ipf_status->v4.nfrag_overlap); + atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_PURGED], + &ipf_status->v4.nfrag_purged); + + atomic_read_relaxed(&ipf->ifp_v6_enabled, &ipf_status->v6.enabled); + atomic_read_relaxed(&ipf->min_v6_frag_size, + &ipf_status->v6.min_frag_size); + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_ACCEPTED], + &ipf_status->v6.nfrag_accepted); + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_COMPL_SENT], + &ipf_status->v6.nfrag_completed_sent); + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_EXPD_SENT], + &ipf_status->v6.nfrag_expired_sent); + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_TOO_SMALL], + &ipf_status->v6.nfrag_too_small); + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_OVERLAP], + &ipf_status->v6.nfrag_overlap); + atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_PURGED], + &ipf_status->v6.nfrag_purged); + return 0; +} + +struct ipf_dump_ctx { + struct hmap_position bucket_pos; +}; + +/* Allocates an 'ipf_dump_ctx' to keep track of an hmap position. The + * caller must call ipf_dump_done() when dumping is finished. */ +int +ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx) +{ + *ipf_dump_ctx = xzalloc(sizeof **ipf_dump_ctx); + return 0; +} + +/* Creates a string representation of the state of an 'ipf_list' and puts + * it in 'ds'. 
 */
+static void
+ipf_dump_create(const struct ipf_list *ipf_list, struct ds *ds)
+{
+    ds_put_cstr(ds, "(");
+    if (ipf_list->key.dl_type == htons(ETH_TYPE_IP)) {
+        ds_put_format(ds, "src="IP_FMT",dst="IP_FMT",",
+                      IP_ARGS(ipf_list->key.src_addr.ipv4),
+                      IP_ARGS(ipf_list->key.dst_addr.ipv4));
+    } else {
+        ds_put_cstr(ds, "src=");
+        ipv6_format_addr(&ipf_list->key.src_addr.ipv6, ds);
+        ds_put_cstr(ds, ",dst=");
+        ipv6_format_addr(&ipf_list->key.dst_addr.ipv6, ds);
+        ds_put_cstr(ds, ",");
+    }
+
+    ds_put_format(ds, "recirc_id=%u,ip_id=%u,dl_type=0x%x,zone=%u,nw_proto=%u",
+                  ipf_list->key.recirc_id, ntohl(ipf_list->key.ip_id),
+                  ntohs(ipf_list->key.dl_type), ipf_list->key.zone,
+                  ipf_list->key.nw_proto);
+
+    ds_put_format(ds, ",num_fragments=%u,state=%s",
+                  ipf_list->last_inuse_idx + 1,
+                  ipf_state_name[ipf_list->state]);
+
+    ds_put_cstr(ds, ")");
+}
+
+/* Finds the next ipf list starting from 'ipf_dump_ctx->bucket_pos' and uses
+ * ipf_dump_create() to create a string representation of the state of that
+ * ipf list, which is returned via 'dump'.  Returns EOF when there are no
+ * more ipf lists. */
+int
+ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx, char **dump)
+{
+    ovs_mutex_lock(&ipf->ipf_lock);
+
+    struct hmap_node *node = hmap_at_position(&ipf->frag_lists,
+                                              &ipf_dump_ctx->bucket_pos);
+    if (!node) {
+        ovs_mutex_unlock(&ipf->ipf_lock);
+        return EOF;
+    } else {
+        struct ipf_list *ipf_list_;
+        INIT_CONTAINER(ipf_list_, node, node);
+        struct ipf_list ipf_list = *ipf_list_;
+        ovs_mutex_unlock(&ipf->ipf_lock);
+        struct ds ds = DS_EMPTY_INITIALIZER;
+        ipf_dump_create(&ipf_list, &ds);
+        *dump = ds_steal_cstr(&ds);
+        return 0;
+    }
+}
+
+/* Frees 'ipf_dump_ctx' allocated by ipf_dump_start(). */
+int
+ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx)
+{
+    free(ipf_dump_ctx);
+    return 0;
+}
diff --git a/lib/ipf.h b/lib/ipf.h
new file mode 100644
index 00000000000..6ac91b27083
--- /dev/null
+++ b/lib/ipf.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef IPF_H +#define IPF_H 1 + +#include "dp-packet.h" +#include "openvswitch/types.h" + +struct ipf; + +struct ipf_proto_status { + uint64_t nfrag_accepted; + uint64_t nfrag_completed_sent; + uint64_t nfrag_expired_sent; + uint64_t nfrag_too_small; + uint64_t nfrag_overlap; + uint64_t nfrag_purged; + unsigned int min_frag_size; + bool enabled; +}; + +struct ipf_status { + struct ipf_proto_status v4; + struct ipf_proto_status v6; + unsigned int nfrag; + unsigned int nfrag_max; +}; + +struct ipf *ipf_init(void); +void ipf_destroy(struct ipf *ipf); +void ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb, + long long now, ovs_be16 dl_type, uint16_t zone, + uint32_t hash_basis); + +void ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb, + long long now, ovs_be16 dl_type); + +int ipf_set_enabled(struct ipf *ipf, bool v6, bool enable); +int ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value); +int ipf_set_max_nfrags(struct ipf *ipf, uint32_t value); +int ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status); + +struct ipf_dump_ctx; +int ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx); +int ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx, + char **dump); +int ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx); + +#endif /* ipf.h */ diff --git a/tests/system-kmod-macros.at b/tests/system-kmod-macros.at index 3296d64e653..1057e34c571 100644 --- a/tests/system-kmod-macros.at +++ b/tests/system-kmod-macros.at @@ -77,12 +77,6 @@ m4_define([CHECK_CONNTRACK], # m4_define([CHECK_CONNTRACK_ALG]) -# CHECK_CONNTRACK_FRAG() -# -# Perform requirements checks for running conntrack fragmentations tests. -# The kernel always supports fragmentation, so no check is needed. -m4_define([CHECK_CONNTRACK_FRAG]) - # CHECK_CONNTRACK_LOCAL_STACK() # # Perform requirements checks for running conntrack tests with local stack. @@ -140,6 +134,46 @@ m4_define([CHECK_CT_DPIF_GET_NCONNS], AT_SKIP_IF([:]) ]) +# DPCTL_SET_MIN_FRAG_SIZE() +# +# The kernel does not support this command. +m4_define([DPCTL_SET_MIN_FRAG_SIZE], +[ + +]) + +# DPCTL_MODIFY_FRAGMENTATION() +# +# The kernel does not support this command. +m4_define([DPCTL_MODIFY_FRAGMENTATION], +[ + +]) + +# DPCTL_CHECK_FRAGMENTATION_PASS() +# +# The kernel does not support this command. +m4_define([DPCTL_CHECK_FRAGMENTATION_PASS], +[ + +]) + +# DPCTL_CHECK_V6_FRAGMENTATION_PASS() +# +# The kernel does not support this command. +m4_define([DPCTL_CHECK_V6_FRAGMENTATION_PASS], +[ + +]) + +# DPCTL_CHECK_FRAGMENTATION_FAIL() +# +# The kernel does not support this command. +m4_define([DPCTL_CHECK_FRAGMENTATION_FAIL], +[ + +]) + # OVS_CHECK_KERNEL([minversion], [minsublevel], [maxversion], [maxsublevel]) # # Check if kernel version falls between minversion.minsublevel and diff --git a/tests/system-traffic.at b/tests/system-traffic.at index de40734e4a6..6da5ac826a1 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2356,7 +2356,6 @@ AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2375,6 +2374,9 @@ priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) +dnl Modify userspace conntrack fragmentation handling. +DPCTL_MODIFY_FRAGMENTATION() + dnl Ipv4 fragmentation connectivity check. 
NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms @@ -2385,12 +2387,14 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Check userspace conntrack fragmentation counters. +DPCTL_CHECK_FRAGMENTATION_PASS() + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation expiry]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2411,17 +2415,22 @@ priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) +dnl Modify userspace conntrack fragmentation handling. +DPCTL_MODIFY_FRAGMENTATION() + dnl Ipv4 fragmentation connectivity check. NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 1 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl 7 packets transmitted, 0 received, 100% packet loss, time 0ms ]) +dnl Check userspace conntrack fragmentation counters. +DPCTL_CHECK_FRAGMENTATION_FAIL() + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation + vlan]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2442,6 +2451,9 @@ priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) +dnl Modify userspace conntrack fragmentation handling. +DPCTL_MODIFY_FRAGMENTATION() + dnl Ipv4 fragmentation connectivity check. NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms @@ -2452,12 +2464,14 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Check userspace conntrack fragmentation counters. +DPCTL_CHECK_FRAGMENTATION_PASS() + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation + cvlan]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START([set Open_vSwitch . other_config:vlan-limit=0]) OVS_CHECK_8021AD() @@ -2511,6 +2525,8 @@ AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation incomplete reassembled packet]) CHECK_CONNTRACK() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() + ADD_NAMESPACES(at_ns0, at_ns1) @@ -2532,8 +2548,8 @@ AT_CLEANUP dnl Uses same first fragment as above 'incomplete reassembled packet' test. 
AT_SETUP([conntrack - IPv4 fragmentation with fragments specified]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2556,8 +2572,8 @@ AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation out of order]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2580,9 +2596,9 @@ AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation overlapping fragments by 1 octet]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() CHECK_CONNTRACK_FRAG_OVERLAP() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2604,9 +2620,9 @@ AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation overlapping fragments by 1 octet out of order]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() CHECK_CONNTRACK_FRAG_OVERLAP() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2628,7 +2644,6 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2668,7 +2683,6 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation expiry]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2709,7 +2723,6 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation + vlan]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2752,7 +2765,6 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation + cvlan]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START([set Open_vSwitch . other_config:vlan-limit=0]) OVS_CHECK_8021AD() @@ -2807,6 +2819,7 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation incomplete reassembled packet]) CHECK_CONNTRACK() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2827,8 +2840,8 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation with fragments specified]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2851,8 +2864,8 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation out of order]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2875,9 +2888,9 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation, multiple extension headers]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2901,9 +2914,9 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation, multiple extension headers + out of order]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2927,9 +2940,9 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation, multiple extension headers 2]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2953,9 +2966,9 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 fragmentation, multiple extension headers 2 + out of order]) CHECK_CONNTRACK() -CHECK_CONNTRACK_FRAG() CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN() OVS_TRAFFIC_VSWITCHD_START() +DPCTL_SET_MIN_FRAG_SIZE() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2980,7 +2993,6 @@ AT_CLEANUP 
 AT_SETUP([conntrack - Fragmentation over vxlan])
 OVS_CHECK_VXLAN()
 CHECK_CONNTRACK()
-CHECK_CONNTRACK_FRAG()
 CHECK_CONNTRACK_LOCAL_STACK()
 
 OVS_TRAFFIC_VSWITCHD_START()
@@ -3033,7 +3045,6 @@ AT_CLEANUP
 AT_SETUP([conntrack - IPv6 Fragmentation over vxlan])
 OVS_CHECK_VXLAN()
 CHECK_CONNTRACK()
-CHECK_CONNTRACK_FRAG()
 CHECK_CONNTRACK_LOCAL_STACK()
 
 OVS_TRAFFIC_VSWITCHD_START()
diff --git a/tests/system-userspace-macros.at b/tests/system-userspace-macros.at
index 27bde8bee8a..4ea55ea4af9 100644
--- a/tests/system-userspace-macros.at
+++ b/tests/system-userspace-macros.at
@@ -73,15 +73,6 @@ m4_define([CHECK_CONNTRACK],
 #
 m4_define([CHECK_CONNTRACK_ALG])
 
-# CHECK_CONNTRACK_FRAG()
-#
-# Perform requirements checks for running conntrack fragmentations tests.
-# The userspace doesn't support fragmentation yet, so skip the tests.
-m4_define([CHECK_CONNTRACK_FRAG],
-[
-    AT_SKIP_IF([:])
-])
-
 # CHECK_CONNTRACK_LOCAL_STACK()
 #
 # Perform requirements checks for running conntrack tests with local stack.
@@ -95,19 +86,13 @@ m4_define([CHECK_CONNTRACK_LOCAL_STACK],
 
 # CHECK_CONNTRACK_FRAG_OVERLAP()
 #
-# The userspace datapath does not support fragments yet.
-m4_define([CHECK_CONNTRACK_FRAG_OVERLAP],
-[
-    AT_SKIP_IF([:])
-])
+# The userspace datapath supports the fragment overlap check.
+m4_define([CHECK_CONNTRACK_FRAG_OVERLAP])
 
 # CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN()
 #
-# The userspace datapath does not support fragments yet.
-m4_define([CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN],
-[
-    AT_SKIP_IF([:])
-])
+# The userspace datapath supports fragments with multiple extension headers.
+m4_define([CHECK_CONNTRACK_FRAG_IPV6_MULT_EXTEN])
 
 # CHECK_CONNTRACK_NAT()
 #
@@ -137,6 +122,167 @@ m4_define([CHECK_CT_DPIF_SET_GET_MAXCONNS])
 # userspace datapath does support this feature.
 m4_define([CHECK_CT_DPIF_GET_NCONNS])
 
+# DPCTL_SET_MIN_FRAG_SIZE()
+#
+# The userspace datapath supports this command.
+m4_define([DPCTL_SET_MIN_FRAG_SIZE],
+[
+AT_CHECK([ovs-appctl dpctl/ipf-set-min-frag v4 400], [], [dnl
+setting minimum fragment size successful
+])
+AT_CHECK([ovs-appctl dpctl/ipf-set-min-frag v6 400], [], [dnl
+setting minimum fragment size successful
+])
+])
+
+# DPCTL_MODIFY_FRAGMENTATION()
+#
+# The userspace datapath supports this command.
+m4_define([DPCTL_MODIFY_FRAGMENTATION],
+[
+AT_CHECK([ovs-appctl dpctl/ipf-set-min-frag v4 1000], [], [dnl
+setting minimum fragment size successful
+])
+AT_CHECK([ovs-appctl dpctl/ipf-set-max-nfrags 500], [], [dnl
+setting maximum fragments successful
+])
+AT_CHECK([ovs-appctl dpctl/ipf-get-status], [], [dnl
+        Fragmentation Module Status
+        ---------------------------
+        v4 enabled: 1
+        v6 enabled: 1
+        max num frags (v4/v6): 500
+        num frag: 0
+        min v4 frag size: 1000
+        v4 frags accepted: 0
+        v4 frags completed: 0
+        v4 frags expired: 0
+        v4 frags too small: 0
+        v4 frags overlapped: 0
+        v4 frags purged: 0
+        min v6 frag size: 1280
+        v6 frags accepted: 0
+        v6 frags completed: 0
+        v6 frags expired: 0
+        v6 frags too small: 0
+        v6 frags overlapped: 0
+        v6 frags purged: 0
+])
+])
+
+# DPCTL_CHECK_FRAGMENTATION_PASS()
+#
+# Used to check fragmentation counters for some fragmentation tests using
+# the userspace datapath.
+m4_define([DPCTL_CHECK_FRAGMENTATION_PASS],
+[
+AT_CHECK([ovs-appctl dpctl/ipf-get-status --more], [], [dnl
+        Fragmentation Module Status
+        ---------------------------
+        v4 enabled: 1
+        v6 enabled: 1
+        max num frags (v4/v6): 500
+        num frag: 0
+        min v4 frag size: 1000
+        v4 frags accepted: 30
+        v4 frags completed: 30
+        v4 frags expired: 0
+        v4 frags too small: 0
+        v4 frags overlapped: 0
+        v4 frags purged: 0
+        min v6 frag size: 1280
+        v6 frags accepted: 0
+        v6 frags completed: 0
+        v6 frags expired: 0
+        v6 frags too small: 0
+        v6 frags overlapped: 0
+        v6 frags purged: 0
+
+        Fragment Lists:
+
+])
+])
+
+# DPCTL_CHECK_V6_FRAGMENTATION_PASS()
+#
+# Used to check fragmentation counters for some fragmentation tests using
+# the userspace datapath.
+m4_define([DPCTL_CHECK_V6_FRAGMENTATION_PASS],
+[
+AT_CHECK([ovs-appctl dpctl/ipf-get-status --more], [], [dnl
+        Fragmentation Module Status
+        ---------------------------
+        v4 enabled: 1
+        v6 enabled: 1
+        max num frags (v4/v6): 1000
+        num frag: 0
+        min v4 frag size: 1200
+        v4 frags accepted: 0
+        v4 frags completed: 0
+        v4 frags expired: 0
+        v4 frags too small: 0
+        v4 frags overlapped: 0
+        v4 frags purged: 0
+        min v6 frag size: 1280
+        v6 frags accepted: 30
+        v6 frags completed: 30
+        v6 frags expired: 0
+        v6 frags too small: 0
+        v6 frags overlapped: 0
+        v6 frags purged: 0
+
+        Fragment Lists:
+
+])
+])
+
+# FORMAT_FRAG_LIST([])
+#
+# Strip content that can differ from test to test from the piped input; the
+# recirc_id and ip_id fields in an ipf_list vary between runs and hence are
+# cleared.
m4_define([FORMAT_FRAG_LIST],
+    [[sed -e 's/ip_id=[0-9]*/ip_id=/g' -e 's/recirc_id=[0-9]*/recirc_id=/g']])
+
+# DPCTL_CHECK_FRAGMENTATION_FAIL()
+#
+# Used to check fragmentation counters for some fragmentation tests using
+# the userspace datapath, when failure to transmit fragments is expected.
+m4_define([DPCTL_CHECK_FRAGMENTATION_FAIL],
+[
+AT_CHECK([ovs-appctl dpctl/ipf-get-status -m | FORMAT_FRAG_LIST()], [], [dnl
+        Fragmentation Module Status
+        ---------------------------
+        v4 enabled: 1
+        v6 enabled: 1
+        max num frags (v4/v6): 500
+        num frag: 7
+        min v4 frag size: 1000
+        v4 frags accepted: 7
+        v4 frags completed: 0
+        v4 frags expired: 0
+        v4 frags too small: 0
+        v4 frags overlapped: 0
+        v4 frags purged: 0
+        min v6 frag size: 1280
+        v6 frags accepted: 0
+        v6 frags completed: 0
+        v6 frags expired: 0
+        v6 frags too small: 0
+        v6 frags overlapped: 0
+        v6 frags purged: 0
+
+        Fragment Lists:
+
+(src=10.1.1.1,dst=10.1.1.2,recirc_id=,ip_id=,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
+(src=10.1.1.1,dst=10.1.1.2,recirc_id=,ip_id=,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
+(src=10.1.1.1,dst=10.1.1.2,recirc_id=,ip_id=,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
+(src=10.1.1.1,dst=10.1.1.2,recirc_id=,ip_id=,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
+(src=10.1.1.1,dst=10.1.1.2,recirc_id=,ip_id=,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
+(src=10.1.1.1,dst=10.1.1.2,recirc_id=,ip_id=,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
+(src=10.1.1.1,dst=10.1.1.2,recirc_id=,ip_id=,dl_type=0x800,zone=9,nw_proto=1,num_fragments=1,state=first frag)
+])
+])
+
 # OVS_CHECK_KERNEL([minversion], [maxversion], [minsublevel], [maxsublevel])
 #
 # The userspace skips all tests that check kernel version.
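Finally, to orient readers of lib/ipf.h above: the module is driven from conntrack in a bracketed pre/post pattern. A condensed, hypothetical sketch of that call pattern follows (the wrapper function and its arguments are illustrative, and the snippet compiles only inside the OVS tree; the real integration is the conntrack.c hooks earlier in this patch):

    #include "dp-packet.h"
    #include "ipf.h"

    /* Hypothetical driver paraphrasing the conntrack.c integration in this
     * patch: fragments are extracted and reassembled before conntrack sees
     * the batch, and completed (or expired) fragments rejoin it afterwards. */
    static void
    conntrack_pass(struct ipf *ipf, struct dp_packet_batch *batch,
                   ovs_be16 dl_type, uint16_t zone, uint32_t hash_basis,
                   long long now)
    {
        /* Pull fragments out of 'batch'; add any reassembled packets to it. */
        ipf_preprocess_conntrack(ipf, batch, now, dl_type, zone, hash_basis);

        /* ... conntrack processes 'batch' here ... */

        /* Copy conntrack's verdict back onto the fragments and return the
         * completed (and, marked invalid, expired) fragments to 'batch'. */
        ipf_postprocess_conntrack(ipf, batch, now, dl_type);
    }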