Skip to content

Commit

Permalink
Remove dummy znode from zvol_state
Browse files Browse the repository at this point in the history
struct zvol_state contains a dummy znode, which is around 1KB on x64, only for
zfs_range_lock. But in reality, other than z_range_lock and z_range_avl,
zfs_range_lock only need znode on regular file, which means we add 1KB on a
structure and gain nothing.

In this patch, we remove the dummy znode for zvol_state. In order to do that,
we also need to refactor zfs_range_lock a bit. We move z_range_lock and
z_range_avl pair out of znode_t to form zfs_rlock_t. This new struct replaces
znode_t as the main handle inside the range lock functions. Since regular
files still need znode for RL_WRITER and RL_APPEND, we make znode an optional
argument in zfs_range_lock_impl.

To reduce possible merge conflict, we retain the prototype of zfs_range_lock.
And zvol now should call zvol_range_lock instead.

Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
  • Loading branch information
Chunwei Chen committed Apr 12, 2016
1 parent 2b54cb1 commit c8009a1
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 66 deletions.
6 changes: 2 additions & 4 deletions include/sys/trace_acl.h
Expand Up @@ -63,7 +63,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__field(uint32_t, z_sync_cnt)
__field(mode_t, z_mode)
__field(boolean_t, z_is_sa)
__field(boolean_t, z_is_zvol)
__field(boolean_t, z_is_mapped)
__field(boolean_t, z_is_ctldir)
__field(boolean_t, z_is_stale)
Expand Down Expand Up @@ -101,7 +100,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__entry->z_sync_cnt = zn->z_sync_cnt;
__entry->z_mode = zn->z_mode;
__entry->z_is_sa = zn->z_is_sa;
__entry->z_is_zvol = zn->z_is_zvol;
__entry->z_is_mapped = zn->z_is_mapped;
__entry->z_is_ctldir = zn->z_is_ctldir;
__entry->z_is_stale = zn->z_is_stale;
Expand All @@ -125,7 +123,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
"zn_prefetch %u moved %u blksz %u seq %u "
"mapcnt %llu gen %llu size %llu "
"links %llu pflags %llu uid %llu gid %llu "
"sync_cnt %u mode 0x%x is_sa %d is_zvol %d "
"sync_cnt %u mode 0x%x is_sa %d "
"is_mapped %d is_ctldir %d is_stale %d inode { "
"ino %lu nlink %u version %llu size %lli blkbits %u "
"bytes %u mode 0x%x generation %x } } ace { type %u "
Expand All @@ -136,7 +134,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__entry->z_size,
__entry->z_links, __entry->z_pflags, __entry->z_uid,
__entry->z_gid, __entry->z_sync_cnt, __entry->z_mode,
__entry->z_is_sa, __entry->z_is_zvol, __entry->z_is_mapped,
__entry->z_is_sa, __entry->z_is_mapped,
__entry->z_is_ctldir, __entry->z_is_stale, __entry->i_ino,
__entry->i_nlink, __entry->i_version, __entry->i_size,
__entry->i_blkbits, __entry->i_bytes, __entry->i_mode,
Expand Down
35 changes: 32 additions & 3 deletions include/sys/zfs_rlock.h
Expand Up @@ -32,16 +32,23 @@ extern "C" {

#ifdef _KERNEL

#include <sys/zfs_znode.h>
#include <sys/list.h>
#include <sys/avl.h>
#include <sys/condvar.h>

typedef enum {
RL_READER,
RL_WRITER,
RL_APPEND
} rl_type_t;

typedef struct zfs_rlock {
kmutex_t zr_mutex; /* protects changes to zr_avl */
avl_tree_t zr_avl; /* avl tree of range locks */
} zfs_rlock_t;

typedef struct rl {
znode_t *r_zp; /* znode this lock applies to */
zfs_rlock_t *r_zrl;
avl_node_t r_node; /* avl node link */
uint64_t r_off; /* file range offset */
uint64_t r_len; /* file range length */
Expand All @@ -55,16 +62,26 @@ typedef struct rl {
list_node_t rl_node; /* used for deferred release */
} rl_t;

struct znode;
/*
* Lock a range (offset, length) as either shared (RL_READER)
* or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that
* is converted to RL_WRITER that specified to lock from the start of the
* end of file. Returns the range lock structure.
*
* Filesystem should call zfs_range_lock.
* Zvol should call zvol_range_lock.
*/
rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
rl_t *zfs_range_lock_impl(zfs_rlock_t *zrl, uint64_t off, uint64_t len,
rl_type_t type, struct znode *zp);
#define zfs_range_lock(zp, off, len, type) \
zfs_range_lock_impl(&(zp)->z_range_lock, off, len, type, zp)
#define zvol_range_lock(zrl, off, len, type) \
zfs_range_lock_impl(zrl, off, len, type, NULL)

/* Unlock range and destroy range lock structure. */
void zfs_range_unlock(rl_t *rl);
#define zvol_range_unlock(rl) zfs_range_unlock(rl)

/*
* Reduce range locked as RW_WRITER from whole file to specified range.
Expand All @@ -78,6 +95,18 @@ void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
*/
int zfs_range_compare(const void *arg1, const void *arg2);

static inline void zfs_rlock_init(zfs_rlock_t *zrl)
{
mutex_init(&zrl->zr_mutex, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zrl->zr_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
}

static inline void zfs_rlock_destroy(zfs_rlock_t *zrl)
{
avl_destroy(&zrl->zr_avl);
mutex_destroy(&zrl->zr_mutex);
}
#endif /* _KERNEL */

#ifdef __cplusplus
Expand Down
5 changes: 2 additions & 3 deletions include/sys/zfs_znode.h
Expand Up @@ -37,6 +37,7 @@
#include <sys/rrwlock.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>
#include <sys/zfs_rlock.h>
#endif
#include <sys/zfs_acl.h>
#include <sys/zil.h>
Expand Down Expand Up @@ -187,8 +188,7 @@ typedef struct znode {
krwlock_t z_parent_lock; /* parent lock for directories */
krwlock_t z_name_lock; /* "master" lock for dirent locks */
zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
kmutex_t z_range_lock; /* protects changes to z_range_avl */
avl_tree_t z_range_avl; /* avl tree of file range locks */
zfs_rlock_t z_range_lock; /* file range lock */
uint8_t z_unlinked; /* file has been unlinked */
uint8_t z_atime_dirty; /* atime needs to be synced */
uint8_t z_zn_prefetch; /* Prefetch znodes? */
Expand All @@ -212,7 +212,6 @@ typedef struct znode {
list_node_t z_link_node; /* all znodes in fs link */
sa_handle_t *z_sa_hdl; /* handle to sa data */
boolean_t z_is_sa; /* are we native sa? */
boolean_t z_is_zvol; /* are we used by the zvol */
boolean_t z_is_mapped; /* are we mmap'ed */
boolean_t z_is_ctldir; /* are we .zfs entry */
boolean_t z_is_stale; /* are we stale due to rollback? */
Expand Down
1 change: 0 additions & 1 deletion module/zfs/zfs_ctldir.c
Expand Up @@ -484,7 +484,6 @@ zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
zp->z_gid = 0;
zp->z_mode = 0;
zp->z_sync_cnt = 0;
zp->z_is_zvol = B_FALSE;
zp->z_is_mapped = B_FALSE;
zp->z_is_ctldir = B_TRUE;
zp->z_is_sa = B_FALSE;
Expand Down
68 changes: 36 additions & 32 deletions module/zfs/zfs_rlock.c
Expand Up @@ -96,14 +96,15 @@
*/

#include <sys/zfs_rlock.h>
#include <sys/zfs_znode.h>

/*
* Check if a write lock can be grabbed, or wait and recheck until available.
*/
static void
zfs_range_lock_writer(znode_t *zp, rl_t *new)
zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new, znode_t *zp)
{
avl_tree_t *tree = &zp->z_range_avl;
avl_tree_t *tree = &zrl->zr_avl;
rl_t *rl;
avl_index_t where;
uint64_t end_size;
Expand All @@ -112,17 +113,16 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)

for (;;) {
/*
* Range locking is also used by zvol and uses a
* dummied up znode. However, for zvol, we don't need to
* append or grow blocksize, and besides we don't have
* a "sa" data or zfs_sb_t - so skip that processing.
* Range locking is also used by zvol. However, for zvol, we
* don't need to append or grow blocksize, so skip that
* processing.
*
* Yes, this is ugly, and would be solved by not handling
* grow or append in range lock code. If that was done then
* we could make the range locking code generically available
* to other non-zfs consumers.
*/
if (!zp->z_is_zvol) { /* caller is ZPL */
if (zp) { /* caller is ZPL */
/*
* If in append mode pick up the current end of file.
* This is done under z_range_lock to avoid races.
Expand Down Expand Up @@ -175,7 +175,7 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
rl->r_write_wanted = B_TRUE;
}
cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
cv_wait(&rl->r_wr_cv, &zrl->zr_mutex);

/* reset to original */
new->r_off = off;
Expand Down Expand Up @@ -353,9 +353,9 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
* Check if a reader lock can be grabbed, or wait and recheck until available.
*/
static void
zfs_range_lock_reader(znode_t *zp, rl_t *new)
zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new)
{
avl_tree_t *tree = &zp->z_range_avl;
avl_tree_t *tree = &zrl->zr_avl;
rl_t *prev, *next;
avl_index_t where;
uint64_t off = new->r_off;
Expand All @@ -378,7 +378,7 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new)
cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
prev->r_read_wanted = B_TRUE;
}
cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
cv_wait(&prev->r_rd_cv, &zrl->zr_mutex);
goto retry;
}
if (off + len < prev->r_off + prev->r_len)
Expand All @@ -401,7 +401,7 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new)
cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
next->r_read_wanted = B_TRUE;
}
cv_wait(&next->r_rd_cv, &zp->z_range_lock);
cv_wait(&next->r_rd_cv, &zrl->zr_mutex);
goto retry;
}
if (off + len <= next->r_off + next->r_len)
Expand All @@ -421,16 +421,20 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new)
* or exclusive (RL_WRITER). Returns the range lock structure
* for later unlocking or reduce range (if entire file
* previously locked as RL_WRITER).
*
* Filesystem should call zfs_range_lock.
* Zvol should call zvol_range_lock.
*/
rl_t *
zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
zfs_range_lock_impl(zfs_rlock_t *zrl, uint64_t off, uint64_t len,
rl_type_t type, znode_t *zp)
{
rl_t *new;

ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
new->r_zp = zp;
new->r_zrl = zrl;
new->r_off = off;
if (len + off < off) /* overflow */
len = UINT64_MAX - off;
Expand All @@ -441,18 +445,18 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
new->r_write_wanted = B_FALSE;
new->r_read_wanted = B_FALSE;

mutex_enter(&zp->z_range_lock);
mutex_enter(&zrl->zr_mutex);
if (type == RL_READER) {
/*
* First check for the usual case of no locks
*/
if (avl_numnodes(&zp->z_range_avl) == 0)
avl_add(&zp->z_range_avl, new);
if (avl_numnodes(&zrl->zr_avl) == 0)
avl_add(&zrl->zr_avl, new);
else
zfs_range_lock_reader(zp, new);
} else
zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
mutex_exit(&zp->z_range_lock);
zfs_range_lock_reader(zrl, new);
} else /* RL_WRITER or RL_APPEND */
zfs_range_lock_writer(zrl, new, zp);
mutex_exit(&zrl->zr_mutex);
return (new);
}

Expand All @@ -474,9 +478,9 @@ zfs_range_free(void *arg)
* Unlock a reader lock
*/
static void
zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list)
zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
{
avl_tree_t *tree = &zp->z_range_avl;
avl_tree_t *tree = &zrl->zr_avl;
rl_t *rl, *next = NULL;
uint64_t len;

Expand Down Expand Up @@ -543,7 +547,7 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list)
void
zfs_range_unlock(rl_t *rl)
{
znode_t *zp = rl->r_zp;
zfs_rlock_t *zrl = rl->r_zrl;
list_t free_list;
rl_t *free_rl;

Expand All @@ -552,10 +556,10 @@ zfs_range_unlock(rl_t *rl)
ASSERT(!rl->r_proxy);
list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node));

mutex_enter(&zp->z_range_lock);
mutex_enter(&zrl->zr_mutex);
if (rl->r_type == RL_WRITER) {
/* writer locks can't be shared or split */
avl_remove(&zp->z_range_avl, rl);
avl_remove(&zrl->zr_avl, rl);
if (rl->r_write_wanted)
cv_broadcast(&rl->r_wr_cv);

Expand All @@ -568,9 +572,9 @@ zfs_range_unlock(rl_t *rl)
* lock may be shared, let zfs_range_unlock_reader()
* release the zp->z_range_lock lock and free the rl_t
*/
zfs_range_unlock_reader(zp, rl, &free_list);
zfs_range_unlock_reader(zrl, rl, &free_list);
}
mutex_exit(&zp->z_range_lock);
mutex_exit(&zrl->zr_mutex);

while ((free_rl = list_head(&free_list)) != NULL) {
list_remove(&free_list, free_rl);
Expand All @@ -588,17 +592,17 @@ zfs_range_unlock(rl_t *rl)
void
zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
{
znode_t *zp = rl->r_zp;
zfs_rlock_t *zrl = rl->r_zrl;

/* Ensure there are no other locks */
ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
ASSERT(avl_numnodes(&zrl->zr_avl) == 1);
ASSERT(rl->r_off == 0);
ASSERT(rl->r_type == RL_WRITER);
ASSERT(!rl->r_proxy);
ASSERT3U(rl->r_len, ==, UINT64_MAX);
ASSERT3U(rl->r_cnt, ==, 1);

mutex_enter(&zp->z_range_lock);
mutex_enter(&zrl->zr_mutex);
rl->r_off = off;
rl->r_len = len;

Expand All @@ -607,7 +611,7 @@ zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
if (rl->r_read_wanted)
cv_broadcast(&rl->r_rd_cv);

mutex_exit(&zp->z_range_lock);
mutex_exit(&zrl->zr_mutex);
}

/*
Expand Down
8 changes: 2 additions & 6 deletions module/zfs/zfs_znode.c
Expand Up @@ -113,9 +113,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);

mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zp->z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
zfs_rlock_init(&zp->z_range_lock);

zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
Expand All @@ -137,8 +135,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock);
rw_destroy(&zp->z_xattr_lock);
avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock);
zfs_rlock_destroy(&zp->z_range_lock);

ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
Expand Down Expand Up @@ -615,7 +612,6 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
zp->z_is_zvol = B_FALSE;
zp->z_is_mapped = B_FALSE;
zp->z_is_ctldir = B_FALSE;
zp->z_is_stale = B_FALSE;
Expand Down

0 comments on commit c8009a1

Please sign in to comment.