Skip to content

Commit f2eebe0

Browse files
committed
Fix lseek(SEEK_DATA/SEEK_HOLE) mmap consistency
When using lseek(2) to report data/holes memory mapped regions of the file were ignored. This could result in incorrect results. To handle this zfs_holey_common() was updated to asynchronously writeback any dirty mmap(2) regions prior to reporting holes. Additionally, while not strictly required the dn_struct_rwlock is now held over the dirty check to prevent the dnode structure from changing. This ensures that a clean dnode can't be dirtied before the data/hole is located. The range lock is now also taken to ensure the call cannot race with zfs_write(). Furthermore, the code was refactored to provide a dnode_is_dirty() helper function which checks the dnode for any dirty records to determine its dirtiness. A test case was added which does some basic verification that data/holes are reported correctly. As part of this change the zfs_dmu_offset_next_sync default value was changed to 1. This will ensure holes are reported on all files (even dirty ones) at some performance cost. TODO: - Implement zn_flush_cached_data() for FreeBSD Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #11900
1 parent 321c1b6 commit f2eebe0

File tree

14 files changed

+274
-32
lines changed

14 files changed

+274
-32
lines changed

configure.ac

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ AC_CONFIG_FILES([
221221
tests/zfs-tests/cmd/mktree/Makefile
222222
tests/zfs-tests/cmd/mmap_exec/Makefile
223223
tests/zfs-tests/cmd/mmap_libaio/Makefile
224+
tests/zfs-tests/cmd/mmap_seek/Makefile
224225
tests/zfs-tests/cmd/mmapwrite/Makefile
225226
tests/zfs-tests/cmd/nvlist_to_lua/Makefile
226227
tests/zfs-tests/cmd/randfree_file/Makefile

include/os/linux/zfs/sys/zfs_znode_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ extern "C" {
7171
#define Z_ISDIR(type) S_ISDIR(type)
7272

7373
#define zn_has_cached_data(zp) ((zp)->z_is_mapped)
74+
#define zn_flush_cached_data(zp, sync) (write_inode_now(ZTOI(zp), sync))
7475
#define zn_rlimit_fsize(zp, uio) (0)
7576

7677
/*

include/sys/dnode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ boolean_t dnode_add_ref(dnode_t *dn, void *ref);
425425
void dnode_rele(dnode_t *dn, void *ref);
426426
void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
427427
int dnode_try_claim(objset_t *os, uint64_t object, int slots);
428+
boolean_t dnode_is_dirty(dnode_t *dn);
428429
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
429430
void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag);
430431
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);

man/man4/zfs.4

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1586,7 +1586,7 @@ Allow no-operation writes.
15861586
The occurrence of nopwrites will further depend on other pool properties
15871587
.Pq i.a. the checksumming and compression algorithms .
15881588
.
1589-
.It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 0 Ns | ns 1 Pq int
1589+
.It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 1 Ns | Ns 0 Pq int
15901590
Enable forcing TXG sync to find holes.
15911591
When enabled forces ZFS to act like prior versions when
15921592
.Sy SEEK_HOLE No or Sy SEEK_DATA

module/zfs/dmu.c

Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ unsigned long zfs_per_txg_dirty_frees_percent = 5;
7575
/*
7676
* Enable/disable forcing txg sync when dirty in dmu_offset_next.
7777
*/
78-
int zfs_dmu_offset_next_sync = 0;
78+
int zfs_dmu_offset_next_sync = 1;
7979

8080
/*
8181
* Limit the amount we can prefetch with one call to this amount. This
@@ -2093,42 +2093,37 @@ int
20932093
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
20942094
{
20952095
dnode_t *dn;
2096-
int i, err;
2097-
boolean_t clean = B_TRUE;
2096+
int err;
20982097

2098+
restart:
20992099
err = dnode_hold(os, object, FTAG, &dn);
21002100
if (err)
21012101
return (err);
21022102

2103-
/*
2104-
* Check if dnode is dirty
2105-
*/
2106-
for (i = 0; i < TXG_SIZE; i++) {
2107-
if (multilist_link_active(&dn->dn_dirty_link[i])) {
2108-
clean = B_FALSE;
2109-
break;
2110-
}
2111-
}
2103+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
21122104

2113-
/*
2114-
* If compatibility option is on, sync any current changes before
2115-
* we go trundling through the block pointers.
2116-
*/
2117-
if (!clean && zfs_dmu_offset_next_sync) {
2118-
clean = B_TRUE;
2119-
dnode_rele(dn, FTAG);
2120-
txg_wait_synced(dmu_objset_pool(os), 0);
2121-
err = dnode_hold(os, object, FTAG, &dn);
2122-
if (err)
2123-
return (err);
2124-
}
2105+
if (dnode_is_dirty(dn)) {
2106+
/*
2107+
* If the zfs_dmu_offset_next_sync module option is enabled
2108+
* then strict hole reporting has been requested. Dirty
2109+
* dnodes must be synced to disk to accurately report all
2110+
* holes. When disabled (the default) dirty dnodes are
2111+
* reported to not have any holes which is always safe.
2112+
*/
2113+
if (zfs_dmu_offset_next_sync) {
2114+
rw_exit(&dn->dn_struct_rwlock);
2115+
dnode_rele(dn, FTAG);
2116+
txg_wait_synced(dmu_objset_pool(os), 0);
2117+
goto restart;
2118+
}
21252119

2126-
if (clean)
2127-
err = dnode_next_offset(dn,
2128-
(hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
2129-
else
21302120
err = SET_ERROR(EBUSY);
2121+
} else {
2122+
err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK |
2123+
(hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
2124+
}
21312125

2126+
rw_exit(&dn->dn_struct_rwlock);
21322127
dnode_rele(dn, FTAG);
21332128

21342129
return (err);

module/zfs/dnode.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1648,6 +1648,26 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
16481648
slots, NULL, NULL));
16491649
}
16501650

1651+
/*
1652+
* Checks if the dnode contains any uncommitted dirty records.
1653+
*/
1654+
boolean_t
1655+
dnode_is_dirty(dnode_t *dn)
1656+
{
1657+
mutex_enter(&dn->dn_mtx);
1658+
1659+
for (int i = 0; i < TXG_SIZE; i++) {
1660+
if (list_head(&dn->dn_dirty_records[i]) != NULL) {
1661+
mutex_exit(&dn->dn_mtx);
1662+
return (B_TRUE);
1663+
}
1664+
}
1665+
1666+
mutex_exit(&dn->dn_mtx);
1667+
1668+
return (B_FALSE);
1669+
}
1670+
16511671
void
16521672
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
16531673
{

module/zfs/zfs_vnops.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
8585
static int
8686
zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
8787
{
88+
zfs_locked_range_t *lr;
8889
uint64_t noff = (uint64_t)*off; /* new offset */
8990
uint64_t file_sz;
9091
int error;
@@ -100,12 +101,18 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
100101
else
101102
hole = B_FALSE;
102103

104+
/* Flush any mmap()'d data to disk */
105+
if (zn_has_cached_data(zp))
106+
zn_flush_cached_data(zp, B_FALSE);
107+
108+
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER);
103109
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
110+
zfs_rangelock_exit(lr);
104111

105112
if (error == ESRCH)
106113
return (SET_ERROR(ENXIO));
107114

108-
/* file was dirty, so fall back to using generic logic */
115+
/* File was dirty, so fall back to using generic logic */
109116
if (error == EBUSY) {
110117
if (hole)
111118
*off = file_sz;

tests/runfiles/common.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -675,7 +675,7 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
675675
tags = ['functional', 'migration']
676676

677677
[tests/functional/mmap]
678-
tests = ['mmap_write_001_pos', 'mmap_read_001_pos']
678+
tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos']
679679
tags = ['functional', 'mmap']
680680

681681
[tests/functional/mount]

tests/zfs-tests/cmd/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ SUBDIRS = \
1919
mktree \
2020
mmap_exec \
2121
mmap_libaio \
22+
mmap_seek \
2223
mmapwrite \
2324
nvlist_to_lua \
2425
randwritecomp \
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
include $(top_srcdir)/config/Rules.am
2+
3+
pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
4+
5+
pkgexec_PROGRAMS = mmap_seek
6+
mmap_seek_SOURCES = mmap_seek.c

0 commit comments

Comments
 (0)