Skip to content

Commit 82a3718

Browse files
committed
Implement SA based xattrs
The current ZFS implementation stores xattrs on disk using a hidden directory. In this directory a file name represents the xattr name and the file contents are the xattr binary data. This approach is very flexible and allows for arbitrarily large xattrs. However, it also suffers from a significant performance penalty. Accessing a single xattr can require up to three disk seeks. 1) Lookup the dnode object. 2) Lookup the dnode's xattr directory object. 3) Lookup the xattr object in the directory. To avoid this performance penalty Linux filesystems such as ext3 and xfs try to store the xattr as part of the inode on disk. When the xattr is too large to store in the inode then a single external block is allocated for them. In practice most xattrs are small and this approach works well. The addition of System Attributes (SA) to zfs provides us a clean way to make this optimization. When the dataset property 'xattr=sa' is set then xattrs will be preferentially stored as System Attributes. This allows tiny xattrs (~100 bytes) to be stored with the dnode and up to 64k of xattrs to be stored in the spill block. If additional xattr space is required, which is unlikely under Linux, they will be stored using the traditional directory approach. This optimization results in roughly a 3x performance improvement when accessing xattrs which brings zfs roughly to parity with ext4 and xfs (see table below). When multiple xattrs are stored per-file the performance improvements are even greater because all of the xattrs stored in the spill block will be cached. However, by default SA based xattrs are disabled in the Linux port to maximize compatibility with other implementations. If you do enable SA based xattrs then they will not be visible on platforms which do not support this feature.
----------------------------------------------------------------------
   Time in seconds to get/set one xattr of N bytes on 100,000 files
------+--------------------------------+------------------------------
      |            setxattr            |            getxattr
bytes |  ext4     xfs zfs-dir  zfs-sa  |  ext4     xfs zfs-dir  zfs-sa
------+--------------------------------+------------------------------
1     |  2.33   31.88   21.50    4.57  |  2.35    2.64    6.29    2.43
32    |  2.79   30.68   21.98    4.60  |  2.44    2.59    6.78    2.48
256   |  3.25   31.99   21.36    5.92  |  2.32    2.71    6.22    3.14
1024  |  3.30   32.61   22.83    8.45  |  2.40    2.79    6.24    3.27
4096  |  3.57  317.46   22.52   10.73  |  2.78   28.62    6.90    3.94
16384 |   n/a 2342.39   34.30   19.20  |   n/a   45.44  145.90    7.55
65536 |   n/a 2941.39  128.15  131.32* |   n/a  141.92  256.85  262.12*

Legend:
* ext4    - Stock RHEL6.1 ext4 mounted with '-o user_xattr'.
* xfs     - Stock RHEL6.1 xfs mounted with default options.
* zfs-dir - Directory based xattrs only.
* zfs-sa  - Prefer SAs but spill into directories as needed, a
            trailing * indicates overflow into directories occurred.

NOTE: Ext4 supports 4096 bytes of xattr name/value pairs per file.
NOTE: XFS and ZFS have no limit on xattr name/value pairs per file.
NOTE: Linux limits individual name/value pairs to 65536 bytes.
NOTE: All setxattr/getxattr calls were done after dropping the cache.
NOTE: All tests were run against a single hard drive.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #443
1 parent e89236f commit 82a3718

File tree

12 files changed

+425
-76
lines changed

12 files changed

+425
-76
lines changed

include/sys/fs/zfs.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,11 @@ typedef enum {
309309
ZFS_SYNC_DISABLED = 2
310310
} zfs_sync_type_t;
311311

312+
/*
 * Values of the "xattr" dataset property.
 *
 * ZFS_XATTR_OFF - xattrs are disabled (property value "off").
 * ZFS_XATTR_DIR - xattrs are enabled using the directory based
 *                 implementation (property values "on" and "dir").
 * ZFS_XATTR_SA  - xattrs are enabled and may be stored as System
 *                 Attributes (property value "sa").
 */
typedef enum {
313+
ZFS_XATTR_OFF = 0,
314+
ZFS_XATTR_DIR = 1,
315+
ZFS_XATTR_SA = 2
316+
} zfs_xattr_type_t;
312317

313318
/*
314319
* On-disk version number.

include/sys/sa.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
149149
boolean_t sa_enabled(objset_t *);
150150
void sa_cache_init(void);
151151
void sa_cache_fini(void);
152+
void *sa_spill_alloc(int);
153+
void sa_spill_free(void *);
152154
int sa_set_sa_object(objset_t *, uint64_t);
153155
int sa_hdrsize(void *);
154156
void sa_handle_lock(sa_handle_t *);

include/sys/zfs_sa.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ typedef enum zpl_attr {
7373
ZPL_SYMLINK,
7474
ZPL_SCANSTAMP,
7575
ZPL_DACL_ACES,
76+
ZPL_DXATTR,
7677
ZPL_END
7778
} zpl_attr_t;
7879

@@ -126,12 +127,20 @@ typedef struct znode_phys {
126127
} znode_phys_t;
127128

128129
#ifdef _KERNEL
130+
131+
#define DXATTR_MAX_ENTRY_SIZE (32768)
132+
#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1)
133+
129134
int zfs_sa_readlink(struct znode *, uio_t *);
130135
void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
131136
void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
132137
void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
138+
int zfs_sa_get_xattr(struct znode *);
139+
int zfs_sa_set_xattr(struct znode *);
133140
void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *);
134141
void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
142+
void zfs_sa_init(void);
143+
void zfs_sa_fini(void);
135144
#endif
136145

137146
#ifdef __cplusplus

include/sys/zfs_vfsops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ typedef struct zfs_sb {
7676
boolean_t z_use_fuids; /* version allows fuids */
7777
boolean_t z_replay; /* set during ZIL replay */
7878
boolean_t z_use_sa; /* version allow system attributes */
79+
boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */
7980
uint64_t z_version; /* ZPL version */
8081
uint64_t z_shares_dir; /* hidden shares dir */
8182
kmutex_t z_lock;

include/sys/zfs_znode.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ extern "C" {
105105
#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS]
106106
#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE]
107107
#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL]
108+
#define SA_ZPL_DXATTR(z) z->z_attr_table[ZPL_DXATTR]
108109
#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD]
109110

110111
/*
@@ -206,6 +207,8 @@ typedef struct znode {
206207
uint32_t z_sync_cnt; /* synchronous open count */
207208
kmutex_t z_acl_lock; /* acl data lock */
208209
zfs_acl_t *z_acl_cached; /* cached acl */
210+
krwlock_t z_xattr_lock; /* xattr data lock */
211+
nvlist_t *z_xattr_cached;/* cached xattrs */
209212
list_node_t z_link_node; /* all znodes in fs link */
210213
sa_handle_t *z_sa_hdl; /* handle to sa data */
211214
boolean_t z_is_sa; /* are we native sa? */

module/nvpair/nvpair_alloc_spl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
static void *
3131
nv_alloc_sleep_spl(nv_alloc_t *nva, size_t size)
3232
{
33-
return (kmem_alloc(size, KM_SLEEP));
33+
return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
3434
}
3535

3636
static void *

module/zcommon/zfs_prop.c

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,14 @@ zfs_prop_init(void)
186186
{ NULL }
187187
};
188188

189+
static zprop_index_t xattr_table[] = {
190+
{ "off", ZFS_XATTR_OFF },
191+
{ "on", ZFS_XATTR_DIR },
192+
{ "sa", ZFS_XATTR_SA },
193+
{ "dir", ZFS_XATTR_DIR },
194+
{ NULL }
195+
};
196+
189197
/* inherit index properties */
190198
zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
191199
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
@@ -226,6 +234,9 @@ zfs_prop_init(void)
226234
zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
227235
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
228236
"latency | throughput", "LOGBIAS", logbias_table);
237+
zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR,
238+
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
239+
"on | off | dir | sa", "XATTR", xattr_table);
229240

230241
/* inherit index (boolean) properties */
231242
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
@@ -244,12 +255,8 @@ zfs_prop_init(void)
244255
boolean_table);
245256
zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
246257
ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table);
247-
zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
248-
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
249-
boolean_table);
250258
zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
251-
ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
252-
boolean_table);
259+
ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table);
253260
zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
254261
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
255262
boolean_table);

module/zfs/sa.c

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
201201

202202
static int sa_legacy_attr_count = 16;
203203
static kmem_cache_t *sa_cache = NULL;
204+
static kmem_cache_t *spill_cache = NULL;
204205

205206
/*ARGSUSED*/
206207
static int
@@ -232,13 +233,30 @@ sa_cache_init(void)
232233
sa_cache = kmem_cache_create("sa_cache",
233234
sizeof (sa_handle_t), 0, sa_cache_constructor,
234235
sa_cache_destructor, NULL, NULL, NULL, 0);
236+
spill_cache = kmem_cache_create("spill_cache",
237+
SPA_MAXBLOCKSIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
235238
}
236239

237240
void
238241
sa_cache_fini(void)
239242
{
240243
if (sa_cache)
241244
kmem_cache_destroy(sa_cache);
245+
246+
if (spill_cache)
247+
kmem_cache_destroy(spill_cache);
248+
}
249+
250+
void *
251+
sa_spill_alloc(int flags)
252+
{
253+
return kmem_cache_alloc(spill_cache, flags);
254+
}
255+
256+
void
257+
sa_spill_free(void *obj)
258+
{
259+
kmem_cache_free(spill_cache, obj);
242260
}
243261

244262
static int
@@ -1618,7 +1636,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
16181636
sa_bulk_attr_t *attr_desc;
16191637
void *old_data[2];
16201638
int bonus_attr_count = 0;
1621-
int bonus_data_size = 0, spill_data_size = 0;
1639+
int bonus_data_size = 0;
16221640
int spill_attr_count = 0;
16231641
int error;
16241642
uint16_t length;
@@ -1648,8 +1666,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
16481666
/* Bring spill buffer online if it isn't currently */
16491667

16501668
if ((error = sa_get_spill(hdl)) == 0) {
1651-
spill_data_size = hdl->sa_spill->db_size;
1652-
old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
1669+
ASSERT3U(hdl->sa_spill->db_size, <=, SPA_MAXBLOCKSIZE);
1670+
old_data[1] = sa_spill_alloc(KM_SLEEP);
16531671
bcopy(hdl->sa_spill->db_data, old_data[1],
16541672
hdl->sa_spill->db_size);
16551673
spill_attr_count =
@@ -1729,7 +1747,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
17291747
if (old_data[0])
17301748
kmem_free(old_data[0], bonus_data_size);
17311749
if (old_data[1])
1732-
kmem_free(old_data[1], spill_data_size);
1750+
sa_spill_free(old_data[1]);
17331751
kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
17341752

17351753
return (error);
@@ -1998,6 +2016,8 @@ EXPORT_SYMBOL(sa_replace_all_by_template_locked);
19982016
EXPORT_SYMBOL(sa_enabled);
19992017
EXPORT_SYMBOL(sa_cache_init);
20002018
EXPORT_SYMBOL(sa_cache_fini);
2019+
EXPORT_SYMBOL(sa_spill_alloc);
2020+
EXPORT_SYMBOL(sa_spill_free);
20012021
EXPORT_SYMBOL(sa_set_sa_object);
20022022
EXPORT_SYMBOL(sa_hdrsize);
20032023
EXPORT_SYMBOL(sa_handle_lock);

module/zfs/zfs_sa.c

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
6363
{"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
6464
{"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
6565
{"ZPL_DACL_ACES", 0, SA_ACL, 0},
66+
{"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0},
6667
{NULL, 0, 0, 0}
6768
};
6869

@@ -183,6 +184,83 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
183184
}
184185
}
185186

187+
int
188+
zfs_sa_get_xattr(znode_t *zp)
189+
{
190+
zfs_sb_t *zsb = ZTOZSB(zp);
191+
char *obj;
192+
int size;
193+
int error;
194+
195+
ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
196+
ASSERT(!zp->z_xattr_cached);
197+
ASSERT(zp->z_is_sa);
198+
199+
error = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), &size);
200+
if (error) {
201+
if (error == ENOENT)
202+
return nvlist_alloc(&zp->z_xattr_cached,
203+
NV_UNIQUE_NAME, KM_SLEEP);
204+
else
205+
return (error);
206+
}
207+
208+
obj = sa_spill_alloc(KM_SLEEP);
209+
210+
error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), obj, size);
211+
if (error == 0)
212+
error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP);
213+
214+
sa_spill_free(obj);
215+
216+
return (error);
217+
}
218+
219+
int
220+
zfs_sa_set_xattr(znode_t *zp)
221+
{
222+
zfs_sb_t *zsb = ZTOZSB(zp);
223+
dmu_tx_t *tx;
224+
char *obj;
225+
size_t size;
226+
int error;
227+
228+
ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
229+
ASSERT(zp->z_xattr_cached);
230+
ASSERT(zp->z_is_sa);
231+
232+
error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR);
233+
if (error)
234+
goto out;
235+
236+
obj = sa_spill_alloc(KM_SLEEP);
237+
238+
error = nvlist_pack(zp->z_xattr_cached, &obj, &size,
239+
NV_ENCODE_XDR, KM_SLEEP);
240+
if (error)
241+
goto out_free;
242+
243+
tx = dmu_tx_create(zsb->z_os);
244+
dmu_tx_hold_sa_create(tx, size);
245+
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
246+
247+
error = dmu_tx_assign(tx, TXG_WAIT);
248+
if (error) {
249+
dmu_tx_abort(tx);
250+
} else {
251+
error = sa_update(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb),
252+
obj, size, tx);
253+
if (error)
254+
dmu_tx_abort(tx);
255+
else
256+
dmu_tx_commit(tx);
257+
}
258+
out_free:
259+
sa_spill_free(obj);
260+
out:
261+
return (error);
262+
}
263+
186264
/*
187265
* I'm not convinced we should do any of this upgrade.
188266
* since the SA code can read both old/new znode formats
@@ -338,6 +416,8 @@ EXPORT_SYMBOL(zfs_sa_readlink);
338416
EXPORT_SYMBOL(zfs_sa_symlink);
339417
EXPORT_SYMBOL(zfs_sa_get_scanstamp);
340418
EXPORT_SYMBOL(zfs_sa_set_scanstamp);
419+
EXPORT_SYMBOL(zfs_sa_get_xattr);
420+
EXPORT_SYMBOL(zfs_sa_set_xattr);
341421
EXPORT_SYMBOL(zfs_sa_upgrade);
342422
EXPORT_SYMBOL(zfs_sa_upgrade_txholds);
343423

module/zfs/zfs_vfsops.c

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,16 @@ xattr_changed_cb(void *arg, uint64_t newval)
140140
{
141141
zfs_sb_t *zsb = arg;
142142

143-
if (newval == TRUE)
144-
zsb->z_flags |= ZSB_XATTR;
145-
else
143+
if (newval == ZFS_XATTR_OFF) {
146144
zsb->z_flags &= ~ZSB_XATTR;
145+
} else {
146+
zsb->z_flags |= ZSB_XATTR;
147+
148+
if (newval == ZFS_XATTR_SA)
149+
zsb->z_xattr_sa = B_TRUE;
150+
else
151+
zsb->z_xattr_sa = B_FALSE;
152+
}
147153
}
148154

149155
static void
@@ -641,6 +647,10 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
641647
&sa_obj);
642648
if (error)
643649
goto out;
650+
651+
error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &zval);
652+
if ((error == 0) && (zval == ZFS_XATTR_SA))
653+
zsb->z_xattr_sa = B_TRUE;
644654
} else {
645655
/*
646656
* Pre SA versions file systems should never touch

module/zfs/zfs_znode.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,15 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
106106
rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
107107
rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
108108
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
109+
rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
109110

110111
mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
111112
avl_create(&zp->z_range_avl, zfs_range_compare,
112113
sizeof (rl_t), offsetof(rl_t, r_node));
113114

114115
zp->z_dirlocks = NULL;
115116
zp->z_acl_cached = NULL;
117+
zp->z_xattr_cached = NULL;
116118
zp->z_moved = 0;
117119
return (0);
118120
}
@@ -128,11 +130,13 @@ zfs_znode_cache_destructor(void *buf, void *arg)
128130
rw_destroy(&zp->z_parent_lock);
129131
rw_destroy(&zp->z_name_lock);
130132
mutex_destroy(&zp->z_acl_lock);
133+
rw_destroy(&zp->z_xattr_lock);
131134
avl_destroy(&zp->z_range_avl);
132135
mutex_destroy(&zp->z_range_lock);
133136

134137
ASSERT(zp->z_dirlocks == NULL);
135138
ASSERT(zp->z_acl_cached == NULL);
139+
ASSERT(zp->z_xattr_cached == NULL);
136140
}
137141

138142
void
@@ -272,6 +276,11 @@ zfs_inode_destroy(struct inode *ip)
272276
zp->z_acl_cached = NULL;
273277
}
274278

279+
if (zp->z_xattr_cached) {
280+
nvlist_free(zp->z_xattr_cached);
281+
zp->z_xattr_cached = NULL;
282+
}
283+
275284
kmem_cache_free(znode_cache, zp);
276285
}
277286

0 commit comments

Comments
 (0)