Skip to content

Commit

Permalink
net/mlx5: handle Tx completion with error
Browse files Browse the repository at this point in the history
When WQEs are posted to the HW to send packets, the PMD may get a
completion report with error from the HW, aka error CQE which is
associated to a bad WQE.

The error reason may be bad address, wrong lkey, bad sizes, etc.
that can wrongly be configured by the PMD or by the user.

Checking for all the possible mistakes in order to prevent error CQEs
doesn't make sense due to the performance impact and huge complexity.

The error CQEs move the SQ to the error state, which causes all
subsequently posted WQEs to be completed with a CQE flush error forever.

Currently, the PMD doesn't handle Tx error CQEs and may even crash
when one of them appears.

Extend the Tx data-path to detect these error CQEs, to report them by
the statistics error counters, to recover the SQ by moving the state
to ready again and adjusting the management variables appropriately.

Sometimes the root cause of an error CQE is very hard to debug and may
even be related to corner cases which are not easily reproducible, hence
a dump file with debug information is created for the first error CQEs,
up to a number that can be configured by the PMD probe parameters.

Cc: stable@dpdk.org

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Shahaf Shuler <shahafs@mellanox.com>
  • Loading branch information
Matan Azrad authored and Ferruh Yigit committed Jun 13, 2019
1 parent 88c0733 commit 957e45f
Show file tree
Hide file tree
Showing 6 changed files with 231 additions and 51 deletions.
11 changes: 11 additions & 0 deletions drivers/net/mlx5/mlx5_prm.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,17 @@
/* Maximum number of DS in WQE. */
#define MLX5_DSEG_MAX 63

/* The completion mode offset in the WQE control segment line 2. */
#define MLX5_COMP_MODE_OFFSET 2

/*
 * Completion report mode, written into the WQE control segment at
 * MLX5_COMP_MODE_OFFSET; selects for which WQEs the HW generates a CQE.
 */
enum mlx5_completion_mode {
	MLX5_COMP_ONLY_ERR = 0x0, /* CQE on error only. */
	MLX5_COMP_ONLY_FIRST_ERR = 0x1, /* CQE on the first error only. */
	MLX5_COMP_ALWAYS = 0x2, /* CQE requested unconditionally. */
	/* NOTE(review): presumably CQE plus EQE event - confirm vs the PRM. */
	MLX5_COMP_CQE_AND_EQE = 0x3,
};

/* Subset of struct mlx5_wqe_eth_seg. */
struct mlx5_wqe_eth_seg_small {
uint32_t rsvd0;
Expand Down
166 changes: 156 additions & 10 deletions drivers/net/mlx5/mlx5_rxtx.c
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,141 @@ mlx5_dump_debug_information(const char *fname, const char *hex_title,
fclose(fd);
}

/**
 * Move QP from error state to running state.
 *
 * Walks the QP through the full RESET -> INIT -> RTR -> RTS Verbs state
 * machine and rewinds the SQ management counters so that posting restarts
 * from the beginning of the ring.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param qp
 *   The qp pointer for recovery.
 *
 * @return
 *   0 on success, else errno value.
 */
static int
tx_recover_qp(struct mlx5_txq_data *txq, struct ibv_qp *qp)
{
	int ret;
	struct ibv_qp_attr mod = {
		.qp_state = IBV_QPS_RESET,
		.port_num = 1,
	};

	ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
	if (ret) {
		/* No trailing '\n': consistent with the other DRV_LOG calls. */
		DRV_LOG(ERR, "Cannot change the Tx QP state to RESET %d",
			ret);
		return ret;
	}
	mod.qp_state = IBV_QPS_INIT;
	ret = mlx5_glue->modify_qp(qp, &mod,
				   (IBV_QP_STATE | IBV_QP_PORT));
	if (ret) {
		DRV_LOG(ERR, "Cannot change Tx QP state to INIT %d", ret);
		return ret;
	}
	mod.qp_state = IBV_QPS_RTR;
	ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
	if (ret) {
		DRV_LOG(ERR, "Cannot change Tx QP state to RTR %d", ret);
		return ret;
	}
	mod.qp_state = IBV_QPS_RTS;
	ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
	if (ret) {
		DRV_LOG(ERR, "Cannot change Tx QP state to RTS %d", ret);
		return ret;
	}
	/* After RESET the SQ starts over - rewind the SW indices to match. */
	txq->wqe_ci = 0;
	txq->wqe_pi = 0;
	txq->elts_comp = 0;
	return 0;
}

/*
 * Check whether the error CQE already carries the "seen" mark; if not,
 * write the mark. Returns 1 when the CQE was already marked, 0 otherwise.
 */
static int
check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
{
	static const uint8_t magic[] = "seen";
	unsigned int idx = 0;

	/* Scan up to the first byte that does not match the marker. */
	while (idx < sizeof(magic) && err_cqe->rsvd1[idx] == magic[idx])
		++idx;
	if (idx == sizeof(magic))
		return 1;
	/* Not marked yet - complete the marker from the mismatch onward. */
	for (; idx < sizeof(magic); ++idx)
		err_cqe->rsvd1[idx] = magic[idx];
	return 0;
}

/**
 * Handle error CQE.
 *
 * Dumps the CQ and SQ to a debug file for the first occurrences (bounded
 * by the max_dump_files_num device argument), counts the failed WQEs in
 * the oerrors statistic, and for non-flush errors tries to recover the SQ
 * by moving the QP back to the ready state.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param err_cqe
 *   Pointer to the error CQE.
 *
 * @return
 *   The last Tx buffer element to free: elts_head when the SQ was
 *   recovered (release all remaining buffers), elts_tail otherwise
 *   (release nothing).
 */
uint16_t
mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
			 volatile struct mlx5_err_cqe *err_cqe)
{
	if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
		const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
		struct mlx5_txq_ctrl *txq_ctrl =
				container_of(txq, struct mlx5_txq_ctrl, txq);
		uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter);
		/* Nonzero when this very CQE was already handled before. */
		int seen = check_err_cqe_seen(err_cqe);

		/* Dump debug info once per CQE, within the file budget. */
		if (!seen && txq_ctrl->dump_file_n <
		    txq_ctrl->priv->config.max_dump_files_num) {
			MKSTR(err_str, "Unexpected CQE error syndrome "
			      "0x%02x CQN = %u SQN = %u wqe_counter = %u "
			      "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
			      txq_ctrl->cqn, txq->qp_num_8s >> 8,
			      rte_be_to_cpu_16(err_cqe->wqe_counter),
			      txq->wqe_ci, txq->cq_ci);
			MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
			      PORT_ID(txq_ctrl->priv), txq->idx,
			      txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc());
			mlx5_dump_debug_information(name, NULL, err_str, 0);
			mlx5_dump_debug_information(name, "MLX5 Error CQ:",
						    (const void *)((uintptr_t)
						    &(*txq->cqes)[0]),
						    sizeof(*err_cqe) *
						    (1 << txq->cqe_n));
			mlx5_dump_debug_information(name, "MLX5 Error SQ:",
						    (const void *)((uintptr_t)
						    tx_mlx5_wqe(txq, 0)),
						    MLX5_WQE_SIZE *
						    (1 << txq->wqe_n));
			txq_ctrl->dump_file_n++;
		}
		if (!seen)
			/*
			 * Count errors in WQEs units.
			 * Later it can be improved to count error packets,
			 * for example, by SQ parsing to find how much packets
			 * should be counted for each WQE.
			 */
			txq->stats.oerrors += ((txq->wqe_ci & wqe_m) -
					       new_wqe_pi) & wqe_m;
		/* Recovery uses Verbs; attempt it in the primary process only. */
		if ((rte_eal_process_type() == RTE_PROC_PRIMARY) &&
		    tx_recover_qp(txq, txq_ctrl->ibv->qp) == 0) {
			txq->cq_ci++;
			/* Release all the remaining buffers. */
			return txq->elts_head;
		}
		/* Recovering failed - try again later on the same WQE. */
	} else {
		/* Flush error: just consume the CQE. */
		txq->cq_ci++;
	}
	/* Do not release buffers. */
	return txq->elts_tail;
}

/**
* DPDK callback for TX.
*
Expand Down Expand Up @@ -709,7 +844,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
wqe->ctrl = (rte_v128u32_t){
rte_cpu_to_be_32(txq->wqe_ci << 8),
rte_cpu_to_be_32(txq->qp_num_8s | 1),
0,
rte_cpu_to_be_32
(MLX5_COMP_ONLY_FIRST_ERR <<
MLX5_COMP_MODE_OFFSET),
0,
};
ds = 1;
Expand Down Expand Up @@ -882,7 +1019,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
rte_cpu_to_be_32((txq->wqe_ci << 8) |
MLX5_OPCODE_TSO),
rte_cpu_to_be_32(txq->qp_num_8s | ds),
0,
rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
MLX5_COMP_MODE_OFFSET),
0,
};
wqe->eseg = (rte_v128u32_t){
Expand All @@ -897,7 +1035,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
rte_cpu_to_be_32((txq->wqe_ci << 8) |
MLX5_OPCODE_SEND),
rte_cpu_to_be_32(txq->qp_num_8s | ds),
0,
rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
MLX5_COMP_MODE_OFFSET),
0,
};
wqe->eseg = (rte_v128u32_t){
Expand Down Expand Up @@ -926,7 +1065,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* A CQE slot must always be available. */
assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
/* Request completion on last WQE. */
last_wqe->ctrl2 = rte_cpu_to_be_32(8);
last_wqe->ctrl2 = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
/* Save elts_head in unused "immediate" field of WQE. */
last_wqe->ctrl3 = txq->elts_head;
txq->elts_comp = 0;
Expand Down Expand Up @@ -973,7 +1113,8 @@ mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
(txq->wqe_ci << 8) |
MLX5_OPCODE_TSO);
mpw->wqe->ctrl[2] = 0;
mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
MLX5_COMP_MODE_OFFSET);
mpw->wqe->ctrl[3] = 0;
mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
(((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
Expand Down Expand Up @@ -1145,7 +1286,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* A CQE slot must always be available. */
assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
/* Request completion on last WQE. */
wqe->ctrl[2] = rte_cpu_to_be_32(8);
wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
/* Save elts_head in unused "immediate" field of WQE. */
wqe->ctrl[3] = elts_head;
txq->elts_comp = 0;
Expand Down Expand Up @@ -1189,7 +1331,8 @@ mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
(txq->wqe_ci << 8) |
MLX5_OPCODE_TSO);
mpw->wqe->ctrl[2] = 0;
mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
MLX5_COMP_MODE_OFFSET);
mpw->wqe->ctrl[3] = 0;
mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
mpw->wqe->eseg.inline_hdr_sz = 0;
Expand Down Expand Up @@ -1447,7 +1590,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
/* A CQE slot must always be available. */
assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
/* Request completion on last WQE. */
wqe->ctrl[2] = rte_cpu_to_be_32(8);
wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
/* Save elts_head in unused "immediate" field of WQE. */
wqe->ctrl[3] = elts_head;
txq->elts_comp = 0;
Expand Down Expand Up @@ -1491,7 +1635,8 @@ mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
(txq->wqe_ci << 8) |
MLX5_OPCODE_ENHANCED_MPSW);
mpw->wqe->ctrl[2] = 0;
mpw->wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ONLY_FIRST_ERR <<
MLX5_COMP_MODE_OFFSET);
mpw->wqe->ctrl[3] = 0;
memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
if (unlikely(padding)) {
Expand Down Expand Up @@ -1738,7 +1883,8 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
/* A CQE slot must always be available. */
assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
/* Request completion on last WQE. */
wqe->ctrl[2] = rte_cpu_to_be_32(8);
wqe->ctrl[2] = rte_cpu_to_be_32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
/* Save elts_head in unused "immediate" field of WQE. */
wqe->ctrl[3] = elts_head;
txq->elts_comp = 0;
Expand Down
81 changes: 50 additions & 31 deletions drivers/net/mlx5/mlx5_rxtx.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,8 @@ struct mlx5_txq_ctrl {
struct mlx5_priv *priv; /* Back pointer to private data. */
off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
void *bf_reg; /* BlueFlame register from Verbs. */
uint32_t cqn; /* CQ number. */
uint16_t dump_file_n; /* Number of dump files. */
};

#define MLX5_TX_BFREG(txq) \
Expand Down Expand Up @@ -334,6 +336,8 @@ uint16_t mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n);
uint16_t mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n);
__rte_noinline uint16_t mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
volatile struct mlx5_err_cqe *err_cqe);
uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n);
void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq);
__rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq,
Expand Down Expand Up @@ -488,6 +492,51 @@ tx_mlx5_wqe(struct mlx5_txq_data *txq, uint16_t ci)
return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
}

/**
 * Handle the next CQE.
 *
 * Checks CQE ownership, delegates error CQEs to the slow-path handler,
 * and on a regular completion advances wqe_pi/cq_ci and rings the CQ
 * doorbell.
 *
 * @param txq
 *   Pointer to TX queue structure.
 *
 * @return
 *   The last Tx buffer element to free (elts_tail when nothing to free).
 */
static __rte_always_inline uint16_t
mlx5_tx_cqe_handle(struct mlx5_txq_data *txq)
{
	const unsigned int cqe_n = 1 << txq->cqe_n;
	const unsigned int cqe_cnt = cqe_n - 1;
	uint16_t last_elts;
	/* View the same CQE slot as either a regular or an error CQE. */
	union {
		volatile struct mlx5_cqe *cqe;
		volatile struct mlx5_err_cqe *err_cqe;
	} u = {
		.cqe = &(*txq->cqes)[txq->cq_ci & cqe_cnt],
	};
	int ret = check_cqe(u.cqe, cqe_n, txq->cq_ci);

	if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
		if (unlikely(ret == MLX5_CQE_STATUS_ERR))
			last_elts = mlx5_tx_error_cqe_handle(txq, u.err_cqe);
		else
			/* No CQE owned by SW yet. Do not release buffers. */
			return txq->elts_tail;
	} else {
		uint16_t new_wqe_pi = rte_be_to_cpu_16(u.cqe->wqe_counter);
		volatile struct mlx5_wqe_ctrl *ctrl =
				(volatile struct mlx5_wqe_ctrl *)
					tx_mlx5_wqe(txq, new_wqe_pi);

		/*
		 * Release completion burst buffers: elts_head was saved in
		 * the unused ctrl3 "immediate" field when the WQE was posted.
		 */
		last_elts = ctrl->ctrl3;
		txq->wqe_pi = new_wqe_pi;
		txq->cq_ci++;
	}
	/* Ring the CQ doorbell only after the CQE fields were read. */
	rte_compiler_barrier();
	*txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
	return last_elts;
}

/**
* Manage TX completions.
*
Expand All @@ -501,39 +550,13 @@ mlx5_tx_complete(struct mlx5_txq_data *txq)
{
const uint16_t elts_n = 1 << txq->elts_n;
const uint16_t elts_m = elts_n - 1;
const unsigned int cqe_n = 1 << txq->cqe_n;
const unsigned int cqe_cnt = cqe_n - 1;
uint16_t elts_free = txq->elts_tail;
uint16_t elts_tail;
uint16_t cq_ci = txq->cq_ci;
volatile struct mlx5_cqe *cqe = NULL;
volatile struct mlx5_wqe_ctrl *ctrl;
struct rte_mbuf *m, *free[elts_n];
struct rte_mempool *pool = NULL;
unsigned int blk_n = 0;

cqe = &(*txq->cqes)[cq_ci & cqe_cnt];
if (unlikely(check_cqe(cqe, cqe_n, cq_ci)))
return;
#ifndef NDEBUG
if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
(MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
if (!check_cqe_seen(cqe)) {
DRV_LOG(ERR, "unexpected error CQE, Tx stopped");
rte_hexdump(stderr, "MLX5 TXQ:",
(const void *)((uintptr_t)txq->wqes),
((1 << txq->wqe_n) *
MLX5_WQE_SIZE));
}
return;
}
#endif /* NDEBUG */
++cq_ci;
rte_cio_rmb();
txq->wqe_pi = rte_be_to_cpu_16(cqe->wqe_counter);
ctrl = (volatile struct mlx5_wqe_ctrl *)
tx_mlx5_wqe(txq, txq->wqe_pi);
elts_tail = ctrl->ctrl3;
elts_tail = mlx5_tx_cqe_handle(txq);
assert((elts_tail & elts_m) < (1 << txq->wqe_n));
/* Free buffers. */
while (elts_free != elts_tail) {
Expand Down Expand Up @@ -564,11 +587,7 @@ mlx5_tx_complete(struct mlx5_txq_data *txq)
++elts_free;
}
#endif
txq->cq_ci = cq_ci;
txq->elts_tail = elts_tail;
/* Update the consumer index. */
rte_compiler_barrier();
*txq->cq_db = rte_cpu_to_be_32(cq_ci);
}

/**
Expand Down
Loading

0 comments on commit 957e45f

Please sign in to comment.