Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
blk-mq: make restart mechanism more reliable
BLK_STS_RESOURCE can be returned from driver when any resource
is running out of. And the resource may not be related with tags,
such as kmalloc(GFP_ATOMIC), when queue is idle under this kind of
BLK_STS_RESOURCE, restart can't work any more, then IO hang may
be caused.

This patch deals with this situation by running the queue after 10ms,
and serves as the last straw to avoid IO hang.

The introduced cost is small:

- the combined condition of 'need_restart and BLK_STS_RESOURCE' can
seldom be triggered

- for TAG_SHARED, hctx->nr_active is used to check if queue is busy

- for non-TAG_SHARED, sbitmap_any_bit_set() is used to check if queue
is busy in case of io scheduler. When BLK_STS_RESOURCE is returned and
SCHED_RESTART is set, most of times it means there are too many inflight
requests, and sbitmap_any_bit_set() can return very quickly. For none
scheduler, we have to walk on the tag bitmap to see if there is any
in-flight request, which should be quick too.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
  • Loading branch information
Ming Lei committed Jan 18, 2018
1 parent dfd672c commit f866ce0
Showing 1 changed file with 92 additions and 2 deletions.
94 changes: 92 additions & 2 deletions block/blk-mq.c
Expand Up @@ -1162,13 +1162,92 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
return true;
}

struct hctx_busy_data {
struct blk_mq_hw_ctx *hctx;
bool reserved;
bool busy;
};

static bool check_busy_hctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
struct hctx_busy_data *busy_data = data;
struct blk_mq_hw_ctx *hctx = busy_data->hctx;
struct request *rq;

if (busy_data->reserved)
bitnr += hctx->tags->nr_reserved_tags;

rq = hctx->tags->static_rqs[bitnr];
if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
busy_data->busy = true;
return false;
}
return true;
}

/* Check if there is any in-flight request */
static bool __blk_mq_hctx_is_busy(struct blk_mq_hw_ctx *hctx)
{
struct hctx_busy_data data = {
.hctx = hctx,
.busy = false,
.reserved = true,
};

sbitmap_for_each_set(&hctx->tags->breserved_tags.sb,
check_busy_hctx, &data);
if (data.busy)
return true;

data.reserved = false;
sbitmap_for_each_set(&hctx->tags->bitmap_tags.sb,
check_busy_hctx, &data);
if (data.busy)
return true;

return false;
}

/* Simply check if there are driver tag allocated for io scheduler case */
static bool __blk_mq_sched_hctx_is_busy(struct blk_mq_hw_ctx *hctx)
{
if (sbitmap_any_bit_set(&hctx->tags->breserved_tags.sb))
return true;

if (sbitmap_any_bit_set(&hctx->tags->bitmap_tags.sb))
return true;
return false;
}

static bool blk_mq_hctx_is_busy(struct blk_mq_hw_ctx *hctx)
{
/*
* This helper is called after rq is added to hctx->dispatch_list,
* the barrier makes sure that adding rq to hctx->dispatch_list and
* checking queue busy won't be reordered.
*
* The pair of this barrier is the one in sbq_wake_up() which is
* called from blk_mq_put_tag() in blk_mq_free_request().
*/
smp_mb();

if (hctx->flags & BLK_MQ_F_TAG_SHARED)
return atomic_read(&hctx->nr_active) > 0;

if (hctx->queue->elevator)
return __blk_mq_sched_hctx_is_busy(hctx);
else
return __blk_mq_hctx_is_busy(hctx);
}

bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool got_budget)
{
struct blk_mq_hw_ctx *hctx;
struct request *rq, *nxt;
bool no_tag = false;
int errors, queued;
blk_status_t ret = BLK_STS_OK;

if (list_empty(list))
return false;
Expand All @@ -1181,7 +1260,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
errors = queued = 0;
do {
struct blk_mq_queue_data bd;
blk_status_t ret;

rq = list_first_entry(list, struct request, queuelist);
if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
Expand Down Expand Up @@ -1287,8 +1365,20 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
else if (needs_restart)
/*
* Drivers may return BLK_STS_RESOURCE which isn't related with
* tags, such as run out of memory, then we can't rely on
* S_SCHED_RESTART to rerun queue when the queue is idle(no any
* inflight request). Rerun the queue after 10ms under this
* situation as the last straw, and it is safe to do this way
* since the request has been added to dispatch list before
* checking S_SCHED_RESTART and queue busy.
*/
else if (needs_restart && (ret == BLK_STS_RESOURCE)) {
hctx->restart++;
if (!blk_mq_hctx_is_busy(hctx))
blk_mq_delay_run_hw_queue(hctx, 10/*ms*/);
}
}

return (queued + errors) != 0;
Expand Down

0 comments on commit f866ce0

Please sign in to comment.