Skip to content
Permalink
Browse files
btrfs: implement RWF_ENCODED writes
The implementation resembles direct I/O: we have to flush any ordered
extents, invalidate the page cache, and do the io tree/delalloc/extent
map/ordered extent dance. From there, we can reuse the compression code
with a minor modification to distinguish the write from writeback. This
also creates inline extents when possible.

Now that read and write are implemented, this also sets the
FMODE_ENCODED_IO flag in btrfs_file_open().

Signed-off-by: Omar Sandoval <osandov@fb.com>
  • Loading branch information
osandov committed Nov 13, 2020
1 parent a348fa8 commit 68812638bf2d878a7c63f705fa098efe9c900806
Show file tree
Hide file tree
Showing 7 changed files with 304 additions and 12 deletions.
@@ -336,7 +336,8 @@ static void end_compressed_bio_write(struct bio *bio)
bio->bi_status == BLK_STS_OK);
cb->compressed_pages[0]->mapping = NULL;

end_compressed_writeback(inode, cb);
if (cb->writeback)
end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */

/*
@@ -372,7 +373,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned long nr_pages,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css)
struct cgroup_subsys_state *blkcg_css,
bool writeback)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
@@ -396,6 +398,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
cb->writeback = writeback;
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;

@@ -49,6 +49,9 @@ struct compressed_bio {
/* the compression algorithm for this bio */
int compress_type;

/* Whether this is a write for writeback. */
bool writeback;

/* number of compressed pages in the array */
unsigned long nr_pages;

@@ -96,7 +99,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned long nr_pages,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css);
struct cgroup_subsys_state *blkcg_css,
bool writeback);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);

@@ -3134,6 +3134,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
struct encoded_iov *encoded);

extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
@@ -1994,6 +1994,32 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
return written ? written : err;
}

/*
 * Entry point for an encoded (RWF_ENCODED) write.
 *
 * Reads the struct encoded_iov header from the front of @from, then, with
 * the inode locked, runs the generic encoded-write checks and the btrfs
 * write checks before handing the remaining payload to
 * btrfs_do_encoded_write().
 *
 * Return: the result of btrfs_do_encoded_write() when all checks pass; 0
 * when the checks pass but the encoded length is zero; otherwise the error
 * reported by the failing step.
 */
ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct encoded_iov encoded;
	ssize_t err;

	/* The header is consumed before the inode lock is taken. */
	err = copy_encoded_iov_from_iter(&encoded, from);
	if (err)
		return err;

	btrfs_inode_lock(inode, 0);

	err = generic_encoded_write_checks(iocb, &encoded);
	/* A zero-length write is a no-op: fall through with err == 0. */
	if (!err && encoded.len != 0) {
		err = btrfs_write_check(iocb, from, encoded.len);
		if (err >= 0)
			err = btrfs_do_encoded_write(iocb, from, &encoded);
	}

	btrfs_inode_unlock(inode, 0);
	return err;
}

static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
@@ -2012,14 +2038,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;

if (!(iocb->ki_flags & IOCB_DIRECT) &&
(iocb->ki_flags & IOCB_NOWAIT))
if ((iocb->ki_flags & IOCB_NOWAIT) &&
(!(iocb->ki_flags & IOCB_DIRECT) ||
(iocb->ki_flags & IOCB_ENCODED)))
return -EOPNOTSUPP;

if (sync)
atomic_inc(&BTRFS_I(inode)->sync_writers);

if (iocb->ki_flags & IOCB_DIRECT)
if (iocb->ki_flags & IOCB_ENCODED)
num_written = btrfs_encoded_write(iocb, from);
else if (iocb->ki_flags & IOCB_DIRECT)
num_written = btrfs_direct_write(iocb, from);
else
num_written = btrfs_buffered_write(iocb, from);
@@ -3586,7 +3615,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)

/*
 * Open callback for btrfs files: set the optional f_mode capability flags
 * on @filp, then defer to generic_file_open().
 *
 * Return: whatever generic_file_open() returns.
 */
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
	/*
	 * One OR sets every flag. FMODE_ENCODED_IO is included because both
	 * encoded reads and encoded writes are implemented.
	 */
	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_ENCODED_IO;
	return generic_file_open(inode, filp);
}

@@ -930,7 +930,7 @@ static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
ins.offset, async_extent->pages,
async_extent->nr_pages,
async_chunk->write_flags,
async_chunk->blkcg_css)) {
async_chunk->blkcg_css, true)) {
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
@@ -2732,7 +2732,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)

if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
clear_bits |= EXTENT_DELALLOC_NEW;

freespace_inode = btrfs_is_free_space_inode(inode);
@@ -10421,6 +10422,251 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
return ret;
}

/*
 * Write one pre-compressed ("encoded") extent described by @encoded to the
 * file at iocb->ki_pos, taking the compressed bytes from @from.
 *
 * Steps, in order: validate the request, copy the compressed bytes into
 * freshly allocated pages, wait for ordered extents and invalidate the page
 * cache over the target range, lock the range in the io tree, reserve
 * data/qgroup/metadata space, then either create an inline extent or reserve
 * a disk extent, add the extent map and ordered extent, and submit the pages
 * as a compressed write.
 *
 * btrfs_encoded_write() calls this with the inode locked via
 * btrfs_inode_lock().
 *
 * @iocb:    kiocb for the write; ki_pos is the file offset and is advanced
 *           by encoded->len whenever a non-negative value is returned
 * @from:    iterator holding the compressed data
 * @encoded: description of the encoding (compression, encryption, len,
 *           unencoded_len, unencoded_offset)
 *
 * Return: the number of bytes that were in @from on entry on success, or a
 * negative errno (-EINVAL for rejected parameters, -ENOMEM, -EFAULT, -EIO,
 * or an error propagated from a helper).
 */
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
struct encoded_iov *encoded)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
/*
 * NOTE(review): data_reserved is passed to btrfs_qgroup_reserve_data() and
 * btrfs_qgroup_free_data() but is never released in this function -
 * confirm whether an extent_changeset_free() is required.
 */
struct extent_changeset *data_reserved = NULL;
struct extent_state *cached_state = NULL;
int compression;
size_t orig_count;
u64 start, end;
u64 num_bytes, ram_bytes, disk_num_bytes;
unsigned long nr_pages, i;
struct page **pages;
struct btrfs_key ins;
/* Set once btrfs_reserve_extent() succeeds; changes the unwind path. */
bool extent_reserved = false;
struct extent_map *em;
ssize_t ret;

/* Translate the generic compression identifier into the btrfs one. */
switch (encoded->compression) {
case ENCODED_IOV_COMPRESSION_ZLIB:
compression = BTRFS_COMPRESS_ZLIB;
break;
case ENCODED_IOV_COMPRESSION_LZO:
compression = BTRFS_COMPRESS_LZO;
break;
case ENCODED_IOV_COMPRESSION_ZSTD:
compression = BTRFS_COMPRESS_ZSTD;
break;
default:
return -EINVAL;
}
/* Any encryption setting other than "none" is rejected. */
if (encoded->encryption != ENCODED_IOV_ENCRYPTION_NONE)
return -EINVAL;

/* Size of the compressed payload; this is also the success return value. */
orig_count = iov_iter_count(from);

/* The extent size must be sane. */
if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
return -EINVAL;

/*
* The compressed data must be smaller than the decompressed data.
*
* It's of course possible for data to compress to larger or the same
* size, but the buffered I/O path falls back to no compression for such
* data, and we don't want to break any assumptions by creating these
* extents.
*
* Note that this is less strict than the current check we have that the
* compressed data must be at least one sector smaller than the
* decompressed data. We only want to enforce the weaker requirement
* from old kernels that it is at least one byte smaller.
*/
if (orig_count >= encoded->unencoded_len)
return -EINVAL;

/* The extent must start on a sector boundary. */
start = iocb->ki_pos;
if (!IS_ALIGNED(start, fs_info->sectorsize))
return -EINVAL;

/*
* The extent must end on a sector boundary. However, we allow a write
* which ends at or extends i_size to have an unaligned length; we round
* up the extent size and set i_size to the unaligned end.
*/
if (start + encoded->len < inode->i_size &&
!IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
return -EINVAL;

/* Finally, the offset in the unencoded data must be sector-aligned. */
if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
return -EINVAL;

/*
 * Round the file range and the unencoded size up to whole sectors; 'end'
 * is the inclusive last byte of the file range.
 */
num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
end = start + num_bytes - 1;

/*
* If the extent cannot be inline, the compressed data on disk must be
* sector-aligned. For convenience, we extend it with zeroes if it
* isn't.
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
/* Zero-initialized, so out_pages can tell which slots were populated. */
pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
if (!pages)
return -ENOMEM;
/*
 * Copy the payload from the iterator one page at a time. Whatever part of
 * a page is not covered by payload is zero-filled.
 */
for (i = 0; i < nr_pages; i++) {
size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
char *kaddr;

pages[i] = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM);
if (!pages[i]) {
ret = -ENOMEM;
goto out_pages;
}
kaddr = kmap(pages[i]);
if (copy_from_iter(kaddr, bytes, from) != bytes) {
kunmap(pages[i]);
ret = -EFAULT;
goto out_pages;
}
if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
kunmap(pages[i]);
}

/*
 * Repeat until the range is locked in the io tree with no ordered extent
 * and no page cache page inside it. Each pass waits for ordered extents
 * and invalidates the page cache before taking the extent lock; if
 * something reappeared in between, the lock is dropped and we retry.
 */
for (;;) {
struct btrfs_ordered_extent *ordered;

ret = btrfs_wait_ordered_range(inode, start, num_bytes);
if (ret)
goto out_pages;
ret = invalidate_inode_pages2_range(inode->i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
if (ret)
goto out_pages;
lock_extent_bits(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
num_bytes);
/* Range is clean: leave the loop with the extent lock still held. */
if (!ordered &&
!filemap_range_has_page(inode->i_mapping, start, end))
break;
if (ordered)
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(io_tree, start, end, &cached_state);
cond_resched();
}

/*
* We don't use the higher-level delalloc space functions because our
* num_bytes and disk_num_bytes are different.
*/
ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), disk_num_bytes);
if (ret)
goto out_unlock;
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, start,
num_bytes);
if (ret)
goto out_free_data_space;
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), num_bytes,
disk_num_bytes);
if (ret)
goto out_qgroup_free_data;

/* Try an inline extent first. */
if (start == 0 && encoded->unencoded_len == encoded->len &&
encoded->unencoded_offset == 0) {
ret = cow_file_range_inline(BTRFS_I(inode), encoded->len,
orig_count, compression, pages,
true);
/*
 * 0 is treated as success and a negative value as an error; both finish
 * the write through the reservation-release labels. A positive value
 * falls through to a regular extent (presumably "could not be inlined" -
 * TODO confirm against cow_file_range_inline()).
 */
if (ret <= 0) {
if (ret == 0)
ret = orig_count;
goto out_delalloc_release;
}
}

/* Reserve exactly disk_num_bytes on disk; the result is returned in ins. */
ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
disk_num_bytes, 0, 0, &ins, 1, 1);
if (ret)
goto out_delalloc_release;
extent_reserved = true;

/*
 * Create the extent map for the file range. The fourth argument is
 * start minus unencoded_offset.
 */
em = create_io_em(BTRFS_I(inode), start, num_bytes,
start - encoded->unencoded_offset, ins.objectid,
ins.offset, ins.offset, ram_bytes, compression,
BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_free_reserved;
}
/* The returned extent map is not used further in this function. */
free_extent_map(em);

/* The ordered extent is flagged both ENCODED and COMPRESSED. */
ret = btrfs_add_ordered_extent(BTRFS_I(inode), start, num_bytes,
ram_bytes, ins.objectid, ins.offset,
encoded->unencoded_offset,
(1 << BTRFS_ORDERED_ENCODED) |
(1 << BTRFS_ORDERED_COMPRESSED),
compression);
if (ret) {
/* Undo create_io_em() before releasing the reserved extent. */
btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
goto out_free_reserved;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);

/* Extend i_size to the (possibly unaligned) end of the written data. */
if (start + encoded->len > inode->i_size)
i_size_write(inode, start + encoded->len);

unlock_extent_cached(io_tree, start, end, &cached_state);

btrfs_delalloc_release_extents(BTRFS_I(inode), num_bytes);

/*
 * Submit the pages as a compressed write. The last argument (writeback)
 * is false because this write was not issued for writeback.
 */
if (btrfs_submit_compressed_write(BTRFS_I(inode), start, num_bytes,
ins.objectid, ins.offset, pages,
nr_pages, 0, NULL, false)) {
struct page *page = pages[0];

/*
 * Finish the ordered extent as failed (uptodate == 0). page->mapping
 * points at the inode's mapping only for the duration of that call.
 */
page->mapping = inode->i_mapping;
btrfs_writepage_endio_finish_ordered(page, start, end, 0);
page->mapping = NULL;
ret = -EIO;
goto out_pages;
}
/*
 * Success: jump past every cleanup label, so the pages are not freed
 * here (presumably they are released when the compressed bio completes -
 * TODO confirm).
 */
ret = orig_count;
goto out;

/*
 * Unwind labels, entered at the point matching how far setup got. Each one
 * falls through to the labels below it.
 */
out_free_reserved:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_delalloc_release:
/* Also reached with ret >= 0 after a successful inline extent. */
btrfs_delalloc_release_extents(BTRFS_I(inode), num_bytes);
btrfs_delalloc_release_metadata(BTRFS_I(inode), disk_num_bytes,
ret < 0);
out_qgroup_free_data:
if (ret < 0) {
btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, start,
num_bytes);
}
out_free_data_space:
/*
* If btrfs_reserve_extent() succeeded, then we already decremented
* bytes_may_use.
*/
if (!extent_reserved)
btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
unlock_extent_cached(io_tree, start, end, &cached_state);
out_pages:
/* Slots that were never populated are still NULL from kvcalloc(). */
for (i = 0; i < nr_pages; i++) {
if (pages[i])
__free_page(pages[i]);
}
kvfree(pages);
out:
/* Advance the file position by the unencoded length actually written. */
if (ret >= 0)
iocb->ki_pos += encoded->len;
return ret;
}

#ifdef CONFIG_SWAP
/*
* Add an entry indicating a block group or device which is pinned by a

0 comments on commit 6881263

Please sign in to comment.