dpif-netdev: XPS (Transmit Packet Steering) implementation.
If the number of CPUs in pmd-cpu-mask is not divisible by the number of
queues, and in a few more complex situations, TX queue-ids may be
distributed unfairly between PMD threads.

For example, if we have 2 ports with 4 queues each and 6 CPUs in
pmd-cpu-mask, the following distribution is possible:
<------------------------------------------------------------------------>
pmd thread numa_id 0 core_id 13:
        port: vhost-user1       queue-id: 1
        port: dpdk0     queue-id: 3
pmd thread numa_id 0 core_id 14:
        port: vhost-user1       queue-id: 2
pmd thread numa_id 0 core_id 16:
        port: dpdk0     queue-id: 0
pmd thread numa_id 0 core_id 17:
        port: dpdk0     queue-id: 1
pmd thread numa_id 0 core_id 12:
        port: vhost-user1       queue-id: 0
        port: dpdk0     queue-id: 2
pmd thread numa_id 0 core_id 15:
        port: vhost-user1       queue-id: 3
<------------------------------------------------------------------------>

As we can see above, port dpdk0 is polled by the threads on cores
	12, 13, 16 and 17.

By design of dpif-netdev, only one TX queue-id is assigned to each PMD
thread. These queue-ids are sequential, similar to core-ids, and a thread
sends packets to the queue with exactly this queue-id, regardless of the
port.

In the previous example:

	pmd thread on core 12 will send packets to tx queue 0
	pmd thread on core 13 will send packets to tx queue 1
	...
	pmd thread on core 17 will send packets to tx queue 5

So, for port dpdk0, after truncation in netdev-dpdk:

	core 12 --> TX queue-id 0 % 4 == 0
	core 13 --> TX queue-id 1 % 4 == 1
	core 16 --> TX queue-id 4 % 4 == 0
	core 17 --> TX queue-id 5 % 4 == 1

As a result, only 2 of the 4 queues are used.
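
The sketch below (illustration only; the function name is not from the
patch) restates this static mapping in code:
<------------------------------------------------------------------------>
/* Old behaviour: each PMD thread has a static, sequential TX queue-id and
 * netdev-dpdk truncates it to the number of real TX queues on the port. */
static int
old_static_txq(int pmd_static_tx_qid, int n_txq)
{
    return pmd_static_tx_qid % n_txq;   /* cores 12..17 -> 0,1,2,3,0,1 */
}
<------------------------------------------------------------------------>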

To fix this issue, a form of XPS is implemented in the following way
(see the sketch after this list):

	* TX queue-ids are allocated dynamically.
	* When a PMD thread tries to send packets to a new port for the
	  first time, it allocates the least used TX queue for that port.
	* PMD threads periodically revalidate the allocated TX queue-ids.
	  If a queue hasn't been used for the last XPS_TIMEOUT_MS
	  milliseconds, it is freed during revalidation.
	* XPS is not used if there are enough TX queues.
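
A minimal sketch of the allocation scheme, using hypothetical names and a
simplified structure (the real implementation is in lib/dpif-netdev.c):
<------------------------------------------------------------------------>
/* Hypothetical per-port state; 'txq_used[i]' counts the PMD threads
 * currently sending to TX queue 'i'. */
struct xps_port {
    int n_txq;
    int *txq_used;
};

/* Give a PMD thread the least used TX queue of 'port' if it has none yet,
 * and record the time of use so that periodic revalidation can free the
 * queue after XPS_TIMEOUT_MS of inactivity. */
static int
xps_get_tx_qid(struct xps_port *port, long long now,
               int *cached_qid, long long *last_used)
{
    if (*cached_qid < 0) {
        int least_used = 0;
        int i;

        for (i = 1; i < port->n_txq; i++) {
            if (port->txq_used[i] < port->txq_used[least_used]) {
                least_used = i;
            }
        }
        port->txq_used[least_used]++;
        *cached_qid = least_used;
    }
    *last_used = now;
    return *cached_qid;
}
<------------------------------------------------------------------------>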

Reported-by: Zhihong Wang <zhihong.wang@intel.com>
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com>
igsilya authored and ddiproietto committed Jul 27, 2016
1 parent 7adc92a commit 324c837
Showing 8 changed files with 200 additions and 74 deletions.
207 changes: 167 additions & 40 deletions lib/dpif-netdev.c

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion lib/netdev-bsd.c
@@ -680,7 +680,8 @@ netdev_bsd_rxq_drain(struct netdev_rxq *rxq_)
*/
static int
netdev_bsd_send(struct netdev *netdev_, int qid OVS_UNUSED,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq OVS_UNUSED)
{
struct netdev_bsd *dev = netdev_bsd_cast(netdev_);
const char *name = netdev_get_name(netdev_);
32 changes: 12 additions & 20 deletions lib/netdev-dpdk.c
@@ -298,7 +298,7 @@ struct dpdk_tx_queue {
rte_spinlock_t tx_lock; /* Protects the members and the NIC queue
* from concurrent access. It is used only
* if the queue is shared among different
* pmd threads (see 'txq_needs_locking'). */
* pmd threads (see 'concurrent_txq'). */
int map; /* Mapping of configured vhost-user queues
* to enabled by guest. */
};
@@ -349,13 +349,6 @@ struct netdev_dpdk {
struct rte_eth_link link;
int link_reset_cnt;

/* Caller of netdev_send() might want to use more txqs than the device has.
* For physical NICs, if the 'requested_n_txq' less or equal to 'up.n_txq',
* 'txq_needs_locking' is false, otherwise it is true and we will take a
* spinlock on transmission. For vhost devices, 'requested_n_txq' is
* always true. */
bool txq_needs_locking;

/* virtio-net structure for vhost device */
OVSRCU_TYPE(struct virtio_net *) virtio_dev;

@@ -778,10 +771,8 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
goto unlock;
}
netdev_dpdk_alloc_txq(dev, netdev->n_txq);
dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;
} else {
netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
dev->txq_needs_locking = true;
/* Enable DPDK_DEV_VHOST device and set promiscuous mode flag. */
dev->flags = NETDEV_UP | NETDEV_PROMISC;
}
@@ -1468,7 +1459,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
static int
netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch,
bool may_steal)
bool may_steal, bool concurrent_txq OVS_UNUSED)
{

if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
@@ -1484,9 +1475,10 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid,

static inline void
netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq)
{
if (OVS_UNLIKELY(dev->txq_needs_locking)) {
if (OVS_UNLIKELY(concurrent_txq)) {
qid = qid % dev->up.n_txq;
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
}
@@ -1551,18 +1543,19 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
}
}

if (OVS_UNLIKELY(dev->txq_needs_locking)) {
if (OVS_UNLIKELY(concurrent_txq)) {
rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
}
}

static int
netdev_dpdk_eth_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

netdev_dpdk_send__(dev, qid, batch, may_steal);
netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
return 0;
}

@@ -2533,7 +2526,8 @@ dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id)

static int
netdev_dpdk_ring_send(struct netdev *netdev, int qid,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
unsigned i;
@@ -2546,7 +2540,7 @@ netdev_dpdk_ring_send(struct netdev *netdev, int qid,
dp_packet_rss_invalidate(batch->packets[i]);
}

netdev_dpdk_send__(dev, qid, batch, may_steal);
netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
return 0;
}

@@ -2823,8 +2817,6 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
err = dpdk_eth_dev_init(dev);
netdev_dpdk_alloc_txq(dev, netdev->n_txq);

dev->txq_needs_locking = netdev->n_txq < dev->requested_n_txq;

out:

ovs_mutex_unlock(&dev->mutex);
3 changes: 2 additions & 1 deletion lib/netdev-dummy.c
@@ -1034,7 +1034,8 @@ netdev_dummy_rxq_drain(struct netdev_rxq *rxq_)

static int
netdev_dummy_send(struct netdev *netdev, int qid OVS_UNUSED,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq OVS_UNUSED)
{
struct netdev_dummy *dev = netdev_dummy_cast(netdev);
int error = 0;
3 changes: 2 additions & 1 deletion lib/netdev-linux.c
@@ -1161,7 +1161,8 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
* expected to do additional queuing of packets. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
struct dp_packet_batch *batch, bool may_steal)
struct dp_packet_batch *batch, bool may_steal,
bool concurrent_txq OVS_UNUSED)
{
int i;
int error = 0;
11 changes: 6 additions & 5 deletions lib/netdev-provider.h
@@ -303,10 +303,6 @@ struct netdev_class {
* otherwise a positive errno value.
*
* 'n_txq' specifies the exact number of transmission queues to create.
* The caller will call netdev_send() concurrently from 'n_txq' different
* threads (with different qid). The netdev provider is responsible for
* making sure that these concurrent calls do not create a race condition
* by using multiple hw queues or locking.
*
* The caller will call netdev_reconfigure() (if necessary) before using
* netdev_send() on any of the newly configured queues, giving the provider
@@ -328,6 +324,11 @@ struct netdev_class {
* packets. If 'may_steal' is true, the caller transfers ownership of all
* the packets to the network device, regardless of success.
*
* If 'concurrent_txq' is true, the caller may perform concurrent calls
* to netdev_send() with the same 'qid'. The netdev provider is responsible
* for making sure that these concurrent calls do not create a race
* condition by using locking or other synchronization if required.
*
* The network device is expected to maintain one or more packet
* transmission queues, so that the caller does not ordinarily have to
* do additional queuing of packets. 'qid' specifies the queue to use
@@ -341,7 +342,7 @@ struct netdev_class {
* datapath". It will also prevent the OVS implementation of bonding from
* working properly over 'netdev'.) */
int (*send)(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
bool may_steal);
bool may_steal, bool concurrent_txq);

/* Registers with the poll loop to wake up from the next call to
* poll_block() when the packet transmission queue for 'netdev' has
13 changes: 8 additions & 5 deletions lib/netdev.c
@@ -655,9 +655,6 @@ netdev_rxq_drain(struct netdev_rxq *rx)
* otherwise a positive errno value.
*
* 'n_txq' specifies the exact number of transmission queues to create.
* If this function returns successfully, the caller can make 'n_txq'
* concurrent calls to netdev_send() (each one with a different 'qid' in the
* range [0..'n_txq'-1]).
*
* The change might not effective immediately. The caller must check if a
* reconfiguration is required with netdev_is_reconf_required() and eventually
@@ -694,6 +691,11 @@ netdev_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
* If 'may_steal' is true, the caller transfers ownership of all the packets
* to the network device, regardless of success.
*
* If 'concurrent_txq' is true, the caller may perform concurrent calls
* to netdev_send() with the same 'qid'. The netdev provider is responsible
* for making sure that these concurrent calls do not create a race condition
* by using locking or other synchronization if required.
*
* The network device is expected to maintain one or more packet
* transmission queues, so that the caller does not ordinarily have to
* do additional queuing of packets. 'qid' specifies the queue to use
@@ -704,14 +706,15 @@ netdev_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
* cases this function will always return EOPNOTSUPP. */
int
netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
bool may_steal)
bool may_steal, bool concurrent_txq)
{
if (!netdev->netdev_class->send) {
dp_packet_delete_batch(batch, may_steal);
return EOPNOTSUPP;
}

int error = netdev->netdev_class->send(netdev, qid, batch, may_steal);
int error = netdev->netdev_class->send(netdev, qid, batch, may_steal,
concurrent_txq);
if (!error) {
COVERAGE_INC(netdev_sent);
if (!may_steal) {
2 changes: 1 addition & 1 deletion lib/netdev.h
@@ -149,7 +149,7 @@ int netdev_rxq_drain(struct netdev_rxq *);

/* Packet transmission. */
int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
bool may_steal);
bool may_steal, bool concurrent_txq);
void netdev_send_wait(struct netdev *, int qid);

/* native tunnel APIs */
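
A hedged caller-side sketch of the updated interface (illustrative only;
the helper below and its parameters are not part of this commit):

#include <stdbool.h>
#include "netdev.h"      /* netdev_send() with the new signature. */
#include "dp-packet.h"   /* struct dp_packet_batch. */

/* A thread whose 'qid' may also be used by other threads passes
 * concurrent_txq = true so that the provider serializes access (e.g. with
 * a spinlock); a thread that owns 'qid' exclusively passes false. */
static int
flush_batch(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
            bool txq_is_shared)
{
    return netdev_send(netdev, qid, batch, true /* may_steal */,
                       txq_is_shared /* concurrent_txq */);
}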
