dpif-netdev: XPS (Transmit Packet Steering) implementation.
If the number of CPUs in pmd-cpu-mask is not divisible by the number of
queues, and in a few more complex situations, TX queue-ids may be
distributed unfairly between PMD threads.

For example, if we have 2 ports with 4 queues each and 6 CPUs in
pmd-cpu-mask, the following distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above, port dpdk0 is polled by the threads on cores
	12, 13, 16 and 17.

By design of dpif-netdev, only one TX queue-id is assigned to each PMD
thread. These queue-ids are sequential, similar to core-ids, and a thread
sends packets to the queue with exactly this queue-id, regardless of the
port.

In the previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for port dpdk0, after truncation in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result, only 2 of the 4 queues are used.
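
The sketch below (illustration only; the function name is not from the
patch) restates this static mapping in code:
<------------------------------------------------------------------------>
/* Old behaviour: each PMD thread has a static, sequential TX queue-id and
 * netdev-dpdk truncates it to the number of real TX queues on the port. */
static int
old_static_txq(int pmd_static_tx_qid, int n_txq)
{
    return pmd_static_tx_qid % n_txq;   /* cores 12..17 -> 0,1,2,3,0,1 */
}
<------------------------------------------------------------------------>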

To fix this issue, a form of XPS is implemented in the following way
(see the sketch after this list):

	* TX queue-ids are allocated dynamically.
	* When a PMD thread tries to send packets to a new port for the
	  first time, it allocates the least used TX queue for that port.
	* PMD threads periodically revalidate the allocated TX queue-ids.
	  If a queue hasn't been used for the last XPS_TIMEOUT_MS
	  milliseconds, it is freed during revalidation.
	* XPS is not used if there are enough TX queues.
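
A minimal sketch of the allocation scheme, using hypothetical names and a
simplified structure (the real implementation is in lib/dpif-netdev.c):
<------------------------------------------------------------------------>
/* Hypothetical per-port state; 'txq_used[i]' counts the PMD threads
 * currently sending to TX queue 'i'. */
struct xps_port {
    int n_txq;
    int *txq_used;
};

/* Give a PMD thread the least used TX queue of 'port' if it has none yet,
 * and record the time of use so that periodic revalidation can free the
 * queue after XPS_TIMEOUT_MS of inactivity. */
static int
xps_get_tx_qid(struct xps_port *port, long long now,
               int *cached_qid, long long *last_used)
{
    if (*cached_qid < 0) {
        int least_used = 0;
        int i;

        for (i = 1; i < port->n_txq; i++) {
            if (port->txq_used[i] < port->txq_used[least_used]) {
                least_used = i;
            }
        }
        port->txq_used[least_used]++;
        *cached_qid = least_used;
    }
    *last_used = now;
    return *cached_qid;
}
<------------------------------------------------------------------------>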

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
igsilya authored and ddiproietto committed Jul 27, 2016
1 parent 7adc92a commit 324c837
Showing 8 changed files with 200 additions and 74 deletions.
207 changes: 167 additions & 40 deletions lib/dpif-netdev.c

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion lib/netdev-bsd.c
@@ -680,7 +680,8 @@ netdev_bsd_rxq_drain(struct netdev_rxq *rxq_)
*/
static int
netdev_bsd_send(struct netdev *netdev_, int qid OVS_UNUSED,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq OVS_UNUSED)
{
struct netdev_bsd *dev = netdev_bsd_cast(netdev_);
const char *name = netdev_get_name(netdev_);
32 changes: 12 additions & 20 deletions lib/netdev-dpdk.c
@@ -298,7 +298,7 @@ struct dpdk_tx_queue {
rte_spinlock_t tx_lock; /* Protects the members and the NIC queue
* from concurrent access. It is used only
* if the queue is shared among different
* pmd threads (see 'txq_needs_locking'). */
* pmd threads (see 'concurrent_txq'). */
int map; /* Mapping of configured vhost-user queues
* to enabled by guest. */
};
@@ -349,13 +349,6 @@ struct netdev_dpdk {
struct rte_eth_link link;
int link_reset_cnt;

/* Caller of netdev_send() might want to use more txqs than the device has.
* For physical NICs, if the 'requested_n_txq' less or equal to 'up.n_txq',
* 'txq_needs_locking' is false, otherwise it is true and we will take a
* spinlock on transmission. For vhost devices, 'requested_n_txq' is
* always true. */
bool txq_needs_locking;

/* virtio-net structure for vhost device */
OVSRCU_TYPE(struct virtio_net *) virtio_dev;

@@ -778,10 +771,8 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
goto unlock;
}
netdev_dpdk_alloc_txq(dev, netdev->n_txq);
dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
} else {
netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
dev->txq_needs_locking = true;
/* Enable DPDK_DEV_VHOST device and set promiscuous mode flag. */
dev->flags = NETDEV_UP | NETDEV_PROMISC;
}
@@ -1468,7 +1459,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
static int
netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch,
bool may_steal)
bool may_steal, bool concurrent_txq OVS_UNUSED)
{

if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
@@ -1484,9 +1475,10 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid,

static inline void
netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq)
{
if (OVS_UNLIKELY(dev->txq_needs_locking)) {
if (OVS_UNLIKELY(concurrent_txq)) {
qid = qid % dev->up.n_txq;
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
}
@@ -1551,18 +1543,19 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
}
}

if (OVS_UNLIKELY(dev->txq_needs_locking)) {
if (OVS_UNLIKELY(concurrent_txq)) {
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
}
}

static int
netdev_dpdk_eth_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

netdev_dpdk_send__(dev, qid, batch, may_steal);
netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
return 0;
}

@@ -2533,7 +2526,8 @@ dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id)

static int
netdev_dpdk_ring_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
unsigned i;
@@ -2546,7 +2540,7 @@ netdev_dpdk_ring_send(struct netdev *netdev, int qid,
dp_packet_rss_invalidate(batch->packets[i]);
}

netdev_dpdk_send__(dev, qid, batch, may_steal);
netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
return 0;
}

@@ -2823,8 +2817,6 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
err = dpdk_eth_dev_init(dev);
netdev_dpdk_alloc_txq(dev, netdev->n_txq);

dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;

out:

ovs_mutex_unlock(&dev->mutex);
3 changes: 2 additions & 1 deletion lib/netdev-dummy.c
@@ -1034,7 +1034,8 @@ netdev_dummy_rxq_drain(struct netdev_rxq *rxq_)

static int
netdev_dummy_send(struct netdev *netdev, int qid OVS_UNUSED,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq OVS_UNUSED)
{
struct netdev_dummy *dev = netdev_dummy_cast(netdev);
int error = 0;
3 changes: 2 additions & 1 deletion lib/netdev-linux.c
@@ -1161,7 +1161,8 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
* expected to do additional queuing of packets. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq OVS_UNUSED)
{
int i;
int error = 0;
11 changes: 6 additions & 5 deletions lib/netdev-provider.h
@@ -303,10 +303,6 @@ struct netdev_class {
* otherwise a positive errno value.
*
* 'n_txq' specifies the exact number of transmission queues to create.
* The caller will call netdev_send() concurrently from 'n_txq' different
* threads (with different qid). The netdev provider is responsible for
* making sure that these concurrent calls do not create a race condition
* by using multiple hw queues or locking.
*
* The caller will call netdev_reconfigure() (if necessary) before using
* netdev_send() on any of the newly configured queues, giving the provider
@@ -328,6 +324,11 @@ struct netdev_class {
* packets. If 'may_steal' is true, the caller transfers ownership of all
* the packets to the network device, regardless of success.
*
* If 'concurrent_txq' is true, the caller may perform concurrent calls
* to netdev_send() with the same 'qid'. The netdev provider is responsible
* for making sure that these concurrent calls do not create a race
* condition by using locking or other synchronization if required.
*
* The network device is expected to maintain one or more packet
* transmission queues, so that the caller does not ordinarily have to
* do additional queuing of packets. 'qid' specifies the queue to use
@@ -341,7 +342,7 @@ struct netdev_class {
* datapath". It will also prevent the OVS implementation of bonding from
* working properly over 'netdev'.) */
int (*send)(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
bool may_steal);
bool may_steal, bool concurrent_txq);

/* Registers with the poll loop to wake up from the next call to
* poll_block() when the packet transmission queue for 'netdev' has
13 changes: 8 additions & 5 deletions lib/netdev.c
@@ -655,9 +655,6 @@ netdev_rxq_drain(struct netdev_rxq *rx)
* otherwise a positive errno value.
*
* 'n_txq' specifies the exact number of transmission queues to create.
* If this function returns successfully, the caller can make 'n_txq'
* concurrent calls to netdev_send() (each one with a different 'qid' in the
* range [0..'n_txq'-1]).
*
* The change might not effective immediately. The caller must check if a
* reconfiguration is required with netdev_is_reconf_required() and eventually
@@ -694,6 +691,11 @@ netdev_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
* If 'may_steal' is true, the caller transfers ownership of all the packets
* to the network device, regardless of success.
*
* If 'concurrent_txq' is true, the caller may perform concurrent calls
* to netdev_send() with the same 'qid'. The netdev provider is responsible
* for making sure that these concurrent calls do not create a race condition
* by using locking or other synchronization if required.
*
* The network device is expected to maintain one or more packet
* transmission queues, so that the caller does not ordinarily have to
* do additional queuing of packets. 'qid' specifies the queue to use
@@ -704,14 +706,15 @@ netdev_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
* cases this function will always return EOPNOTSUPP. */
int
netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
bool may_steal)
bool may_steal, bool concurrent_txq)
{
if (!netdev->netdev_class->send) {
dp_packet_delete_batch(batch, may_steal);
return EOPNOTSUPP;
}

int error = netdev->netdev_class->send(netdev, qid, batch, may_steal);
int error = netdev->netdev_class->send(netdev, qid, batch, may_steal,
concurrent_txq);
if (!error) {
COVERAGE_INC(netdev_sent);
if (!may_steal) {
2 changes: 1 addition & 1 deletion lib/netdev.h
@@ -149,7 +149,7 @@ int netdev_rxq_drain(struct netdev_rxq *);

/* Packet transmission. */
int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
bool may_steal);
bool may_steal, bool concurrent_txq);
void netdev_send_wait(struct netdev *, int qid);

/* native tunnel APIs */
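
A hedged caller-side sketch of the updated interface (illustrative only;
the helper below and its parameters are not part of this commit):

#include <stdbool.h>
#include "netdev.h"      /* netdev_send() with the new signature. */
#include "dp-packet.h"   /* struct dp_packet_batch. */

/* A thread whose 'qid' may also be used by other threads passes
 * concurrent_txq = true so that the provider serializes access (e.g. with
 * a spinlock); a thread that owns 'qid' exclusively passes false. */
static int
flush_batch(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
            bool txq_is_shared)
{
    return netdev_send(netdev, qid, batch, true /* may_steal */,
                       txq_is_shared /* concurrent_txq */);
}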
