From d705ae6b133f9f6a8beee617b1224b6a5c99c5da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Feb 2012 09:45:49 +0100 Subject: [PATCH 01/11] block: replace icq->changed with icq->flags icq->changed was used for ICQ_*_CHANGED bits. Rename it to flags and access it under ioc->lock instead of using atomic bitops. ioc_get_changed() is added so that the changed part can be fetched and cleared as before. icq->flags will be used to carry other flags. Signed-off-by: Tejun Heo Tested-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-ioc.c | 30 ++++++++++++++++++++++++++---- block/cfq-iosched.c | 12 ++++++------ include/linux/iocontext.h | 9 ++++++--- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 8b782a63c29705..811879c752e475 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -363,13 +363,13 @@ struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) return icq; } -void ioc_set_changed(struct io_context *ioc, int which) +void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags) { struct io_cq *icq; struct hlist_node *n; hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) - set_bit(which, &icq->changed); + icq->flags |= flags; } /** @@ -387,7 +387,7 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio) spin_lock_irqsave(&ioc->lock, flags); ioc->ioprio = ioprio; - ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED); + ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED); spin_unlock_irqrestore(&ioc->lock, flags); } @@ -404,11 +404,33 @@ void ioc_cgroup_changed(struct io_context *ioc) unsigned long flags; spin_lock_irqsave(&ioc->lock, flags); - ioc_set_changed(ioc, ICQ_CGROUP_CHANGED); + ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED); spin_unlock_irqrestore(&ioc->lock, flags); } EXPORT_SYMBOL(ioc_cgroup_changed); +/** + * icq_get_changed - fetch and clear icq changed mask + * @icq: icq of interest + * + * Fetch and clear ICQ_*_CHANGED bits from @icq. Grabs and releases + * @icq->ioc->lock. + */ +unsigned icq_get_changed(struct io_cq *icq) +{ + unsigned int changed = 0; + unsigned long flags; + + if (unlikely(icq->flags & ICQ_CHANGED_MASK)) { + spin_lock_irqsave(&icq->ioc->lock, flags); + changed = icq->flags & ICQ_CHANGED_MASK; + icq->flags &= ~ICQ_CHANGED_MASK; + spin_unlock_irqrestore(&icq->ioc->lock, flags); + } + return changed; +} +EXPORT_SYMBOL(icq_get_changed); + static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d0ba505336685f..457295253566e9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3470,20 +3470,20 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) const int rw = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; + unsigned int changed; might_sleep_if(gfp_mask & __GFP_WAIT); spin_lock_irq(q->queue_lock); /* handle changed notifications */ - if (unlikely(cic->icq.changed)) { - if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed)) - changed_ioprio(cic); + changed = icq_get_changed(&cic->icq); + if (unlikely(changed & ICQ_IOPRIO_CHANGED)) + changed_ioprio(cic); #ifdef CONFIG_CFQ_GROUP_IOSCHED - if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed)) - changed_cgroup(cic); + if (unlikely(changed & ICQ_CGROUP_CHANGED)) + changed_cgroup(cic); #endif - } new_queue: cfqq = cic_to_cfqq(cic, is_sync); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 119773eebe3144..17839c7b9614ff 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -6,8 +6,10 @@ #include enum { - ICQ_IOPRIO_CHANGED, - ICQ_CGROUP_CHANGED, + ICQ_IOPRIO_CHANGED = 1 << 0, + ICQ_CGROUP_CHANGED = 1 << 1, + + ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED, }; /* @@ -88,7 +90,7 @@ struct io_cq { struct rcu_head __rcu_head; }; - unsigned long changed; + unsigned int flags; }; /* @@ -139,6 +141,7 @@ struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); void ioc_ioprio_changed(struct io_context *ioc, int ioprio); void ioc_cgroup_changed(struct io_context *ioc); +unsigned int icq_get_changed(struct io_cq *icq); #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } From 2274b029f640cd652ab59c363e5beebf5f50e609 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Feb 2012 09:45:52 +0100 Subject: [PATCH 02/11] block: simplify ioc_release_fn() Reverse double lock dancing in ioc_release_fn() can be simplified by just using trylock on the queue_lock and back out from ioc lock on trylock failure. Simplify it. Signed-off-by: Tejun Heo Tested-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-ioc.c | 46 ++++++++++------------------------------------ 1 file changed, 10 insertions(+), 36 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 811879c752e475..f53c80ecaf076c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -79,7 +79,6 @@ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); - struct request_queue *last_q = NULL; unsigned long flags; /* @@ -93,44 +92,19 @@ static void ioc_release_fn(struct work_struct *work) while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, struct io_cq, ioc_node); - struct request_queue *this_q = icq->q; - - if (this_q != last_q) { - /* - * Need to switch to @this_q. Once we release - * @ioc->lock, it can go away along with @cic. - * Hold on to it. - */ - __blk_get_queue(this_q); - - /* - * blk_put_queue() might sleep thanks to kobject - * idiocy. Always release both locks, put and - * restart. - */ - if (last_q) { - spin_unlock(last_q->queue_lock); - spin_unlock_irqrestore(&ioc->lock, flags); - blk_put_queue(last_q); - } else { - spin_unlock_irqrestore(&ioc->lock, flags); - } - - last_q = this_q; - spin_lock_irqsave(this_q->queue_lock, flags); - spin_lock_nested(&ioc->lock, 1); - continue; + struct request_queue *q = icq->q; + + if (spin_trylock(q->queue_lock)) { + ioc_exit_icq(icq); + spin_unlock(q->queue_lock); + } else { + spin_unlock_irqrestore(&ioc->lock, flags); + cpu_relax(); + spin_lock_irqsave_nested(&ioc->lock, flags, 1); } - ioc_exit_icq(icq); } - if (last_q) { - spin_unlock(last_q->queue_lock); - spin_unlock_irqrestore(&ioc->lock, flags); - blk_put_queue(last_q); - } else { - spin_unlock_irqrestore(&ioc->lock, flags); - } + spin_unlock_irqrestore(&ioc->lock, flags); kmem_cache_free(iocontext_cachep, ioc); } From 621032ad6eaabf2fe771c4fa0d8f58e1fcfcdba6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Feb 2012 09:45:53 +0100 Subject: [PATCH 03/11] block: exit_io_context() should call elevator_exit_icq_fn() While updating locking, b2efa05265 "block, cfq: unlink cfq_io_context's immediately" moved elevator_exit_icq_fn() invocation from exit_io_context() to the final ioc put. While this doesn't cause catastrophic failure, it effectively removes task exit notification to elevator and cause noticeable IO performance degradation with CFQ. On task exit, CFQ used to immediately expire the slice if it was being used by the exiting task as no more IO would be issued by the task; however, after b2efa05265, the notification is lost and disk could sit idle needlessly, leading to noticeable IO performance degradation for certain workloads. This patch renames ioc_exit_icq() to ioc_destroy_icq(), separates elevator_exit_icq_fn() invocation into ioc_exit_icq() and invokes it from exit_io_context(). ICQ_EXITED flag is added to avoid invoking the callback more than once for the same icq. Walking icq_list from ioc side and invoking elevator callback requires reverse double locking. This may be better implemented using RCU; unfortunately, using RCU isn't trivial. e.g. RCU protection would need to cover request_queue and queue_lock switch on cleanup makes grabbing queue_lock from RCU unsafe. Reverse double locking should do, at least for now. Signed-off-by: Tejun Heo Reported-and-bisected-by: Shaohua Li LKML-Reference: Tested-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-ioc.c | 55 +++++++++++++++++++++++++++++++++------ include/linux/iocontext.h | 1 + 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f53c80ecaf076c..92bf55540d87b2 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -36,10 +36,22 @@ static void icq_free_icq_rcu(struct rcu_head *head) kmem_cache_free(icq->__rcu_icq_cache, icq); } -/* - * Exit and free an icq. Called with both ioc and q locked. - */ +/* Exit an icq. Called with both ioc and q locked. */ static void ioc_exit_icq(struct io_cq *icq) +{ + struct elevator_type *et = icq->q->elevator->type; + + if (icq->flags & ICQ_EXITED) + return; + + if (et->ops.elevator_exit_icq_fn) + et->ops.elevator_exit_icq_fn(icq); + + icq->flags |= ICQ_EXITED; +} + +/* Release an icq. Called with both ioc and q locked. */ +static void ioc_destroy_icq(struct io_cq *icq) { struct io_context *ioc = icq->ioc; struct request_queue *q = icq->q; @@ -60,8 +72,7 @@ static void ioc_exit_icq(struct io_cq *icq) if (rcu_dereference_raw(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); - if (et->ops.elevator_exit_icq_fn) - et->ops.elevator_exit_icq_fn(icq); + ioc_exit_icq(icq); /* * @icq->q might have gone away by the time RCU callback runs @@ -95,7 +106,7 @@ static void ioc_release_fn(struct work_struct *work) struct request_queue *q = icq->q; if (spin_trylock(q->queue_lock)) { - ioc_exit_icq(icq); + ioc_destroy_icq(icq); spin_unlock(q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); @@ -142,13 +153,41 @@ EXPORT_SYMBOL(put_io_context); void exit_io_context(struct task_struct *task) { struct io_context *ioc; + struct io_cq *icq; + struct hlist_node *n; + unsigned long flags; task_lock(task); ioc = task->io_context; task->io_context = NULL; task_unlock(task); - atomic_dec(&ioc->nr_tasks); + if (!atomic_dec_and_test(&ioc->nr_tasks)) { + put_io_context(ioc); + return; + } + + /* + * Need ioc lock to walk icq_list and q lock to exit icq. Perform + * reverse double locking. Read comment in ioc_release_fn() for + * explanation on the nested locking annotation. + */ +retry: + spin_lock_irqsave_nested(&ioc->lock, flags, 1); + hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) { + if (icq->flags & ICQ_EXITED) + continue; + if (spin_trylock(icq->q->queue_lock)) { + ioc_exit_icq(icq); + spin_unlock(icq->q->queue_lock); + } else { + spin_unlock_irqrestore(&ioc->lock, flags); + cpu_relax(); + goto retry; + } + } + spin_unlock_irqrestore(&ioc->lock, flags); + put_io_context(ioc); } @@ -168,7 +207,7 @@ void ioc_clear_queue(struct request_queue *q) struct io_context *ioc = icq->ioc; spin_lock(&ioc->lock); - ioc_exit_icq(icq); + ioc_destroy_icq(icq); spin_unlock(&ioc->lock); } } diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 17839c7b9614ff..1a30180630343d 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -8,6 +8,7 @@ enum { ICQ_IOPRIO_CHANGED = 1 << 0, ICQ_CGROUP_CHANGED = 1 << 1, + ICQ_EXITED = 1 << 2, ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED, }; From fe316bf2d5847bc5dd975668671a7b1067603bc7 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Fri, 2 Mar 2012 10:38:33 +0100 Subject: [PATCH 04/11] block: Fix NULL pointer dereference in sd_revalidate_disk Since 2.6.39 (1196f8b), when a driver returns -ENOMEDIUM for open(), __blkdev_get() calls rescan_partitions() to remove in-kernel partition structures and raise KOBJ_CHANGE uevent. However it ends up calling driver's revalidate_disk without open and could cause oops. In the case of SCSI: process A process B ---------------------------------------------- sys_open __blkdev_get sd_open returns -ENOMEDIUM scsi_remove_device rescan_partitions sd_revalidate_disk Oopses are reported here: http://marc.info/?l=linux-scsi&m=132388619710052 This patch separates the partition invalidation from rescan_partitions() and use it for -ENOMEDIUM case. Reported-by: Huajun Li Signed-off-by: Jun'ichi Nomura Acked-by: Tejun Heo Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/partition-generic.c | 48 ++++++++++++++++++++++++++++++++------- fs/block_dev.c | 16 +++++++++---- include/linux/genhd.h | 1 + 3 files changed, 53 insertions(+), 12 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index d06ec1c829c291..6df5d6928a440c 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -389,17 +389,11 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +static int drop_partitions(struct gendisk *disk, struct block_device *bdev) { - struct parsed_partitions *state = NULL; struct disk_part_iter piter; struct hd_struct *part; - int p, highest, res; -rescan: - if (state && !IS_ERR(state)) { - kfree(state); - state = NULL; - } + int res; if (bdev->bd_part_count) return -EBUSY; @@ -412,6 +406,24 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) delete_partition(disk, part->partno); disk_part_iter_exit(&piter); + return 0; +} + +int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct parsed_partitions *state = NULL; + struct hd_struct *part; + int p, highest, res; +rescan: + if (state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } + + res = drop_partitions(disk, bdev); + if (res) + return res; + if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); check_disk_size_change(disk, bdev); @@ -515,6 +527,26 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } +int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) +{ + int res; + + if (!bdev->bd_invalidated) + return 0; + + res = drop_partitions(disk, bdev); + if (res) + return res; + + set_capacity(disk, 0); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + return 0; +} + unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) { struct address_space *mapping = bdev->bd_inode->i_mapping; diff --git a/fs/block_dev.c b/fs/block_dev.c index 0e575d1304b461..5e9f198f77129b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1183,8 +1183,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) * The latter is necessary to prevent ghost * partitions on a removed medium. */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } if (ret) goto out_clear; } else { @@ -1214,8 +1218,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(bdev->bd_disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); + } if (ret) goto out_unlock_bdev; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index fe23ee768589c7..e61d3192448e90 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -596,6 +596,7 @@ extern char *disk_name (struct gendisk *hd, int partno, char *buf); extern int disk_expand_part_tbl(struct gendisk *disk, int target); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); +extern int invalidate_partitions(struct gendisk *disk, struct block_device *bdev); extern struct hd_struct * __must_check add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, From 12ebffd146768556ab7c415d0ff9ab78e3d16b7a Mon Sep 17 00:00:00 2001 From: Muthukumar R Date: Fri, 2 Mar 2012 10:40:58 +0100 Subject: [PATCH 05/11] block: Fix setting bio flags in drivers (sd_dif/floppy) Fix setting bio flags in drivers (sd_dif/floppy). Signed-off-by: Muthukumar R Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 2 +- drivers/scsi/sd_dif.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 9baf11e8636205..744f078f4dd831 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3832,7 +3832,7 @@ static int __floppy_read_block_0(struct block_device *bdev) bio.bi_size = size; bio.bi_bdev = bdev; bio.bi_sector = 0; - bio.bi_flags = BIO_QUIET; + bio.bi_flags = (1 << BIO_QUIET); init_completion(&complete); bio.bi_private = &complete; bio.bi_end_io = floppy_rb0_complete; diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 0cb39ff21171b3..f8fb2d691c0a69 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -408,7 +408,7 @@ int sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_s kunmap_atomic(sdt, KM_USER0); } - bio->bi_flags |= BIO_MAPPED_INTEGRITY; + bio->bi_flags |= (1 << BIO_MAPPED_INTEGRITY); } return 0; From 9f53d2fe815b4011ff930a7b6db98385d45faa68 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Fri, 2 Mar 2012 10:43:28 +0100 Subject: [PATCH 06/11] block: fix __blkdev_get and add_disk race condition The following situation might occur: __blkdev_get: add_disk: register_disk() get_gendisk() disk_block_events() disk->ev == NULL disk_add_events() __disk_unblock_events() disk->ev != NULL --ev->block Then we unblock events, when they are suppose to be blocked. This can trigger events related block/genhd.c warnings, but also can crash in sd_check_events() or other places. I'm able to reproduce crashes with the following scripts (with connected usb dongle as sdb disk). DEV=/dev/sdb ENABLE=/sys/bus/usb/devices/1-2/bConfigurationValue function stop_me() { for i in `jobs -p` ; do kill $i 2> /dev/null ; done exit } trap stop_me SIGHUP SIGINT SIGTERM for ((i = 0; i < 10; i++)) ; do while true; do fdisk -l $DEV 2>&1 > /dev/null ; done & done while true ; do echo 1 > $ENABLE sleep 1 echo 0 > $ENABLE done I use the script to verify patch fixing oops in sd_revalidate_disk http://marc.info/?l=linux-scsi&m=132935572512352&w=2 Without Jun'ichi Nomura patch titled "Fix NULL pointer dereference in sd_revalidate_disk" or this one, script easily crash kernel within a few seconds. With both patches applied I do not observe crash. Unfortunately after some time (dozen of minutes), script will hung in: [ 1563.906432] [] schedule_timeout_uninterruptible+0x15/0x20 [ 1563.906437] [] msleep+0x15/0x20 [ 1563.906443] [] blk_drain_queue+0x32/0xd0 [ 1563.906447] [] blk_cleanup_queue+0xd0/0x170 [ 1563.906454] [] scsi_free_queue+0x3f/0x60 [ 1563.906459] [] __scsi_remove_device+0x6e/0xb0 [ 1563.906463] [] scsi_forget_host+0x4f/0x60 [ 1563.906468] [] scsi_remove_host+0x5a/0xf0 [ 1563.906482] [] quiesce_and_remove_host+0x5b/0xa0 [usb_storage] [ 1563.906490] [] usb_stor_disconnect+0x13/0x20 [usb_storage] Anyway I think this patch is some step forward. As drawback, I do not teardown on sysfs file create error, because I do not know how to nullify disk->ev (since it can be used). However add_disk error handling practically does not exist too, and things will work without this sysfs file, except events will not be exported to user space. Signed-off-by: Stanislaw Gruszka Acked-by: Tejun Heo Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 23b4f7063322c3..b26c4085590da7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -35,6 +35,7 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_alloc_events(struct gendisk *disk); static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); @@ -601,6 +602,8 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + disk_alloc_events(disk); + /* Register BDI before referencing it from bdev */ bdi = &disk->queue->backing_dev_info; bdi_register_dev(bdi, disk_devt(disk)); @@ -1733,9 +1736,9 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, &disk_events_dfl_poll_msecs, 0644); /* - * disk_{add|del|release}_events - initialize and destroy disk_events. + * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. */ -static void disk_add_events(struct gendisk *disk) +static void disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; @@ -1748,16 +1751,6 @@ static void disk_add_events(struct gendisk *disk) return; } - if (sysfs_create_files(&disk_to_dev(disk)->kobj, - disk_events_attrs) < 0) { - pr_warn("%s: failed to create sysfs files for events\n", - disk->disk_name); - kfree(ev); - return; - } - - disk->ev = ev; - INIT_LIST_HEAD(&ev->node); ev->disk = disk; spin_lock_init(&ev->lock); @@ -1766,8 +1759,21 @@ static void disk_add_events(struct gendisk *disk) ev->poll_msecs = -1; INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + disk->ev = ev; +} + +static void disk_add_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + /* FIXME: error handling */ + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + mutex_lock(&disk_events_mutex); - list_add_tail(&ev->node, &disk_events); + list_add_tail(&disk->ev->node, &disk_events); mutex_unlock(&disk_events_mutex); /* From bca505f1097c725708ddc055cf8055e922b0904b Mon Sep 17 00:00:00 2001 From: Danny Kukawka Date: Fri, 2 Mar 2012 10:48:32 +0100 Subject: [PATCH 07/11] drivers/block/DAC960: fix DAC960_V2_IOCTL_Opcode_T -Wenum-compare warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed compiler warning: comparison between ‘DAC960_V2_IOCTL_Opcode_T’ and ‘enum ’ Renamed enum, added a new enum for SCSI_10.CommandOpcode in DAC960_V2_ProcessCompletedCommand(). Signed-off-by: Danny Kukawka Signed-off-by: Jens Axboe --- drivers/block/DAC960.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index e086fbbbe853ed..487ce64c5493fc 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -4627,7 +4627,8 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) DAC960_Controller_T *Controller = Command->Controller; DAC960_CommandType_T CommandType = Command->CommandType; DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; - DAC960_V2_IOCTL_Opcode_T CommandOpcode = CommandMailbox->Common.IOCTL_Opcode; + DAC960_V2_IOCTL_Opcode_T IOCTLOpcode = CommandMailbox->Common.IOCTL_Opcode; + DAC960_V2_CommandOpcode_T CommandOpcode = CommandMailbox->SCSI_10.CommandOpcode; DAC960_V2_CommandStatus_T CommandStatus = Command->V2.CommandStatus; if (CommandType == DAC960_ReadCommand || @@ -4699,7 +4700,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) { if (Controller->ShutdownMonitoringTimer) return; - if (CommandOpcode == DAC960_V2_GetControllerInfo) + if (IOCTLOpcode == DAC960_V2_GetControllerInfo) { DAC960_V2_ControllerInfo_T *NewControllerInfo = Controller->V2.NewControllerInformation; @@ -4719,14 +4720,14 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) memcpy(ControllerInfo, NewControllerInfo, sizeof(DAC960_V2_ControllerInfo_T)); } - else if (CommandOpcode == DAC960_V2_GetEvent) + else if (IOCTLOpcode == DAC960_V2_GetEvent) { if (CommandStatus == DAC960_V2_NormalCompletion) { DAC960_V2_ReportEvent(Controller, Controller->V2.Event); } Controller->V2.NextEventSequenceNumber++; } - else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid && + else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid && CommandStatus == DAC960_V2_NormalCompletion) { DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo = @@ -4915,7 +4916,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) NewPhysicalDeviceInfo->LogicalUnit++; Controller->V2.PhysicalDeviceIndex++; } - else if (CommandOpcode == DAC960_V2_GetPhysicalDeviceInfoValid) + else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid) { unsigned int DeviceIndex; for (DeviceIndex = Controller->V2.PhysicalDeviceIndex; @@ -4938,7 +4939,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) } Controller->V2.NeedPhysicalDeviceInformation = false; } - else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid && + else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid && CommandStatus == DAC960_V2_NormalCompletion) { DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo = @@ -5065,7 +5066,7 @@ static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) [LogicalDeviceNumber] = true; NewLogicalDeviceInfo->LogicalDeviceNumber++; } - else if (CommandOpcode == DAC960_V2_GetLogicalDeviceInfoValid) + else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid) { int LogicalDriveNumber; for (LogicalDriveNumber = 0; From cecd353a02fb1405c8a72a324b26b5acf97e7411 Mon Sep 17 00:00:00 2001 From: Danny Kukawka Date: Fri, 2 Mar 2012 10:48:35 +0100 Subject: [PATCH 08/11] drivers/block/DAC960: fix -Wuninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set CommandMailbox with memset before use it. Fix for: drivers/block/DAC960.c: In function ‘DAC960_V1_EnableMemoryMailboxInterface’: arch/x86/include/asm/io.h:61:1: warning: ‘CommandMailbox.Bytes[12]’ may be used uninitialized in this function [-Wuninitialized] drivers/block/DAC960.c:1175:30: note: ‘CommandMailbox.Bytes[12]’ was declared here Signed-off-by: Danny Kukawka Signed-off-by: Jens Axboe --- drivers/block/DAC960.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 487ce64c5493fc..8db9089127c5da 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -1177,7 +1177,8 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T int TimeoutCounter; int i; - + memset(&CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T)); + if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32))) return DAC960_Failure(Controller, "DMA mask out of range"); Controller->BounceBufferLimit = DMA_BIT_MASK(32); From 62d3c5439c534b0e6c653fc63e6d8c67be3a57b1 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 2 Mar 2012 10:51:00 +0100 Subject: [PATCH 09/11] Block: use a freezable workqueue for disk-event polling This patch (as1519) fixes a bug in the block layer's disk-events polling. The polling is done by a work routine queued on the system_nrt_wq workqueue. Since that workqueue isn't freezable, the polling continues even in the middle of a system sleep transition. Obviously, polling a suspended drive for media changes and such isn't a good thing to do; in the case of USB mass-storage devices it can lead to real problems requiring device resets and even re-enumeration. The patch fixes things by creating a new system-wide, non-reentrant, freezable workqueue and using it for disk-events polling. Signed-off-by: Alan Stern CC: Acked-by: Tejun Heo Acked-by: Rafael J. Wysocki Signed-off-by: Jens Axboe --- block/genhd.c | 10 +++++----- include/linux/workqueue.h | 4 ++++ kernel/workqueue.c | 7 ++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index b26c4085590da7..df9816ede75bc3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1478,9 +1478,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) intv = disk_events_poll_jiffies(disk); set_timer_slack(&ev->dwork.timer, intv / 4); if (check_now) - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); else if (intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } @@ -1524,7 +1524,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) ev->clearing |= mask; if (!ev->block) { cancel_delayed_work(&ev->dwork); - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); } spin_unlock_irq(&ev->lock); } @@ -1561,7 +1561,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) /* uncondtionally schedule event check and wait for it to finish */ disk_block_events(disk); - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); flush_delayed_work(&ev->dwork); __disk_unblock_events(disk, false); @@ -1598,7 +1598,7 @@ static void disk_events_workfn(struct work_struct *work) intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv); spin_unlock_irq(&ev->lock); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index eb8b9f15f2e03b..af155450cabb8a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -289,12 +289,16 @@ enum { * * system_freezable_wq is equivalent to system_wq except that it's * freezable. + * + * system_nrt_freezable_wq is equivalent to system_nrt_wq except that + * it's freezable. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_nrt_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; +extern struct workqueue_struct *system_nrt_freezable_wq; extern struct workqueue_struct * __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bec7b5b53e03db..f2c5638bb5ab1a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -253,11 +253,13 @@ struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); EXPORT_SYMBOL_GPL(system_freezable_wq); +EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include @@ -3833,8 +3835,11 @@ static int __init init_workqueues(void) WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); + system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", + WQ_NON_REENTRANT | WQ_FREEZABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezable_wq); + !system_unbound_wq || !system_freezable_wq || + !system_nrt_freezable_wq); return 0; } early_initcall(init_workqueues); From ea5f4db8ece896c2ab9eafa0924148a2596c52e4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 3 Mar 2012 12:09:17 +0100 Subject: [PATCH 10/11] block, sx8: fix pointer math issue getting fw version "mem" is type u8. We need parenthesis here or it screws up the pointer math probably leading to an oops. Signed-off-by: Dan Carpenter Cc: stable@kernel.org Acked-by: Jeff Garzik Signed-off-by: Jens Axboe --- drivers/block/sx8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index e7472f567c9de7..3fb6ab4c8b4e9e 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1120,7 +1120,7 @@ static inline void carm_handle_resp(struct carm_host *host, break; case MISC_GET_FW_VER: { struct carm_fw_ver *ver = (struct carm_fw_ver *) - mem + sizeof(struct carm_msg_get_fw_ver); + (mem + sizeof(struct carm_msg_get_fw_ver)); if (!error) { host->fw_ver = le32_to_cpu(ver->version); host->flags |= (ver->features & FL_FW_VER_MASK); From ff8c1474cc2f5e11414c71ec4d739c18e6e669c0 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Wed, 14 Mar 2012 15:34:48 +0100 Subject: [PATCH 11/11] block: fix ioc leak in put_io_context When put_io_context is called, if ioc->icq_list is empty and refcount is 1, kernel will not free the ioc. This is caught by following kmemleak: unreferenced object 0xffff880036349fe0 (size 216): comm "sh", pid 2137, jiffies 4294931140 (age 290579.412s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 01 00 01 00 ad 4e ad de ff ff ff ff 00 00 00 00 .....N.......... backtrace: [] kmemleak_alloc+0x26/0x50 [] kmem_cache_alloc_node+0x1cc/0x2a0 [] create_io_context_slowpath+0x27/0x130 [] get_task_io_context+0xbb/0xf0 [] copy_process+0x188e/0x18b0 [] do_fork+0x11b/0x420 [] sys_clone+0x28/0x30 [] stub_clone+0x13/0x20 [] 0xffffffffffffffff ioc should be freed if ioc->icq_list is empty. Signed-off-by: Xiaotian Feng Acked-by: Vivek Goyal Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-ioc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 92bf55540d87b2..fb95dd2f889a60 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -130,6 +130,7 @@ static void ioc_release_fn(struct work_struct *work) void put_io_context(struct io_context *ioc) { unsigned long flags; + bool free_ioc = false; if (ioc == NULL) return; @@ -144,8 +145,13 @@ void put_io_context(struct io_context *ioc) spin_lock_irqsave(&ioc->lock, flags); if (!hlist_empty(&ioc->icq_list)) schedule_work(&ioc->release_work); + else + free_ioc = true; spin_unlock_irqrestore(&ioc->lock, flags); } + + if (free_ioc) + kmem_cache_free(iocontext_cachep, ioc); } EXPORT_SYMBOL(put_io_context);