blk-mq: make restart mechanism more reliable
BLK_STS_RESOURCE can be returned from a driver when it runs out of
some resource, and that resource may not be related to tags at all,
e.g. a kmalloc(GFP_ATOMIC) failure. If the queue is idle when this
kind of BLK_STS_RESOURCE is returned, the restart mechanism can't
work any more, and an IO hang may follow.
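
For illustration only, here is a minimal sketch (not part of this
patch; the demo_* names are hypothetical) of how a driver's
->queue_rq() can return a BLK_STS_RESOURCE that has nothing to do
with tags:

	#include <linux/blk-mq.h>
	#include <linux/slab.h>

	struct demo_cmd { int dummy; };	/* hypothetical per-request state */

	static blk_status_t demo_queue_rq(struct blk_mq_hw_ctx *hctx,
					  const struct blk_mq_queue_data *bd)
	{
		struct demo_cmd *cmd = kmalloc(sizeof(*cmd), GFP_ATOMIC);

		if (!cmd)
			/*
			 * Not a tag shortage: the queue may be completely
			 * idle by the time memory is available again, so
			 * SCHED_RESTART alone can't guarantee a rerun.
			 */
			return BLK_STS_RESOURCE;

		/* ... set up cmd and issue bd->rq to the hardware ... */
		kfree(cmd);
		return BLK_STS_OK;
	}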

This patch deals with that situation by running the queue again after
10ms; it serves as a last resort to avoid the IO hang.

The introduced cost is small and can be ignored because:

- the combined condition of 'needs_restart and BLK_STS_RESOURCE' is
seldom triggered

- for TAG_SHARED, hctx->nr_active is used to check if the queue is busy

- for non-TAG_SHARED, sbitmap_any_bit_set() is used to check if the
queue is busy. When BLK_STS_RESOURCE is returned and SCHED_RESTART is
set, it usually means there are too many in-flight requests, so
sbitmap_any_bit_set() can return very quickly.

The test was run on dm-mpath over scsi-debug, set up as:

- modprobe scsi_debug virtual_gb=128 submit_queues=1 max_luns=2 ndelay=1000 max_queue=4

- fio on dm-mpath: randread, bs 4k, libaio, direct IO, 32 jobs, 64
iodepth (an example command line is sketched after this list)

- run/restart counters observed from debugfs (thanks Mike for enabling
blk-mq debugfs on dm-rq)
	root@ming:/sys/kernel/debug/block/dm-4/hctx0# cat run
	19235342
	root@ming:/sys/kernel/debug/block/dm-4/hctx0# cat restart
	45
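
For reference, a fio command line matching the setup above might look
like this (the dm-mpath device path is an assumption of this sketch):

	fio --name=mpath-randread --filename=/dev/mapper/mpathX \
		--rw=randread --bs=4k --ioengine=libaio --direct=1 \
		--numjobs=32 --iodepth=64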

So from the above debugfs counters, this patch's effect on performance
can be ignored: the condition is extremely hard to trigger (45 restarts
out of 19235342 runs, i.e. 0.0002339%).

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Ming Lei committed Jan 18, 2018
1 parent dfd672c commit 68a6690
Showing 1 changed file with 92 additions and 2 deletions: block/blk-mq.c
@@ -1162,13 +1162,92 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
	return true;
}

struct hctx_busy_data {
	struct blk_mq_hw_ctx *hctx;
	bool reserved;
	bool busy;
};

static bool check_busy_hctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct hctx_busy_data *busy_data = data;
	struct blk_mq_hw_ctx *hctx = busy_data->hctx;
	struct request *rq;

	if (busy_data->reserved)
		bitnr += hctx->tags->nr_reserved_tags;

	rq = hctx->tags->static_rqs[bitnr];
	if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
		busy_data->busy = true;
		return false;
	}
	return true;
}

/* Check if there is any in-flight request */
static bool __blk_mq_hctx_is_busy(struct blk_mq_hw_ctx *hctx)
{
	struct hctx_busy_data data = {
		.hctx = hctx,
		.busy = false,
		.reserved = true,
	};

	sbitmap_for_each_set(&hctx->tags->breserved_tags.sb,
			     check_busy_hctx, &data);
	if (data.busy)
		return true;

	data.reserved = false;
	sbitmap_for_each_set(&hctx->tags->bitmap_tags.sb,
			     check_busy_hctx, &data);
	if (data.busy)
		return true;

	return false;
}

/* Simply check if any driver tags are allocated, for the I/O scheduler case */
static bool __blk_mq_sched_hctx_is_busy(struct blk_mq_hw_ctx *hctx)
{
	if (sbitmap_any_bit_set(&hctx->tags->breserved_tags.sb))
		return true;

	if (sbitmap_any_bit_set(&hctx->tags->bitmap_tags.sb))
		return true;
	return false;
}

static bool blk_mq_hctx_is_busy(struct blk_mq_hw_ctx *hctx)
{
	/*
	 * This helper is called after rq has been added to
	 * hctx->dispatch_list; the barrier makes sure that adding rq to
	 * hctx->dispatch_list and checking queue busyness can't be
	 * reordered.
	 *
	 * This barrier pairs with the one in sbq_wake_up(), which is
	 * called from blk_mq_put_tag() in blk_mq_free_request().
	 */
	smp_mb();

	if (hctx->flags & BLK_MQ_F_TAG_SHARED)
		return atomic_read(&hctx->nr_active) > 0;

	if (hctx->queue->elevator)
		return __blk_mq_sched_hctx_is_busy(hctx);
	else
		return __blk_mq_hctx_is_busy(hctx);
}

bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
			     bool got_budget)
{
	struct blk_mq_hw_ctx *hctx;
	struct request *rq, *nxt;
	bool no_tag = false;
	int errors, queued;
	blk_status_t ret = BLK_STS_OK;

	if (list_empty(list))
		return false;
@@ -1181,7 +1260,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
	errors = queued = 0;
	do {
		struct blk_mq_queue_data bd;

		rq = list_first_entry(list, struct request, queuelist);
		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
@@ -1287,8 +1365,20 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
		if (!needs_restart ||
		    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
			blk_mq_run_hw_queue(hctx, true);
		/*
		 * Drivers may return BLK_STS_RESOURCE for reasons that
		 * aren't related to tags, such as running out of memory;
		 * in that case we can't rely on SCHED_RESTART to rerun
		 * the queue when it is idle (no in-flight requests).
		 * Rerun the queue after 10ms as a last resort; this is
		 * safe because the request has already been added to the
		 * dispatch list before SCHED_RESTART and queue busyness
		 * are checked.
		 */
		else if (needs_restart && (ret == BLK_STS_RESOURCE)) {
			hctx->restart++;
			if (!blk_mq_hctx_is_busy(hctx))
				blk_mq_delay_run_hw_queue(hctx, 10/*ms*/);
		}
	}

	return (queued + errors) != 0;
