Skip to content

Commit

Permalink
Fix fork done handler wrongly update fsync metrics and enhance AOF_ F…
Browse files Browse the repository at this point in the history
…SYNC_ALWAYS (redis#11973)

This PR fix several unrelated bugs that were discovered by the same set of tests
(WAITAOF tests in redis#11713), could make the `WAITAOF` test hang. 

The change in `backgroundRewriteDoneHandler` is about MP-AOF.
That leftover / old code assumes that we started a new AOF file just now
(when we have a new base into which we're gonna incrementally write), but
the fact is that with MP-AOF, the fork done handler doesn't really affect the
incremental file being maintained by the parent process, there's no reason to
re-issue `SELECT`, and no reason to update any of the fsync variables in that flow.
This should have been deleted with MP-AOF (introduced in redis#9788, 7.0).
The damage is that the update to `aof_fsync_offset` will cause us to miss an fsync
in `flushAppendOnlyFile`, that happens if we stop write commands in `AOF_FSYNC_EVERYSEC`
while an AOFRW is in progress. This caused a new `WAITAOF` test to sometime hang forever.

Also because of MP-AOF, we needed to change `aof_fsync_offset` to `aof_last_incr_fsync_offset`
and match it to `aof_last_incr_size` in `flushAppendOnlyFile`. This is because in the past we compared
`aof_fsync_offset` and `aof_current_size`, but with MP-AOF it could be the total AOF file will be
smaller after AOFRW, and the (already existing) incr file still has data that needs to be fsynced.

The change in `flushAppendOnlyFile`, about the `AOF_FSYNC_ALWAYS`, it is follow redis#6053
(the details is in redis#5985), we also check `AOF_FSYNC_ALWAYS` to handle a case where
appendfsync is changed from everysec to always while there is data that's written but not yet fsynced.
  • Loading branch information
enjoy-binbin authored Mar 29, 2023
1 parent 557ca05 commit cb17178
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 13 deletions.
31 changes: 19 additions & 12 deletions src/aof.c
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,7 @@ void aofOpenIfNeededOnServerStart(void) {
}

server.aof_last_incr_size = getAppendOnlyFileSize(aof_name, NULL);
server.aof_last_incr_fsync_offset = server.aof_last_incr_size;

if (incr_aof_len) {
serverLog(LL_NOTICE, "Opening AOF incr file %s on server start", aof_name);
Expand Down Expand Up @@ -832,13 +833,14 @@ int openNewIncrAofForAppend(void) {
* is already synced at this point so fsync doesn't matter. */
if (server.aof_fd != -1) {
aof_background_fsync_and_close(server.aof_fd);
server.aof_fsync_offset = server.aof_current_size;
server.aof_last_fsync = server.unixtime;
}
server.aof_fd = newfd;

/* Reset the aof_last_incr_size. */
server.aof_last_incr_size = 0;
/* Reset the aof_last_incr_fsync_offset. */
server.aof_last_incr_fsync_offset = 0;
/* Update `server.aof_manifest`. */
if (temp_am) aofManifestFreeAndUpdate(temp_am);
return C_OK;
Expand Down Expand Up @@ -952,7 +954,6 @@ void stopAppendOnly(void) {
if (redis_fsync(server.aof_fd) == -1) {
serverLog(LL_WARNING,"Fail to fsync the AOF file: %s",strerror(errno));
} else {
server.aof_fsync_offset = server.aof_current_size;
server.aof_last_fsync = server.unixtime;
}
close(server.aof_fd);
Expand All @@ -962,6 +963,7 @@ void stopAppendOnly(void) {
server.aof_state = AOF_OFF;
server.aof_rewrite_scheduled = 0;
server.aof_last_incr_size = 0;
server.aof_last_incr_fsync_offset = 0;
server.fsynced_reploff = -1;
atomicSet(server.fsynced_reploff_pending, 0);
killAppendOnlyChild();
Expand Down Expand Up @@ -1083,10 +1085,19 @@ void flushAppendOnlyFile(int force) {
* stop write commands before fsync called in one second,
* the data in page cache cannot be flushed in time. */
if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.aof_fsync_offset != server.aof_current_size &&
server.aof_last_incr_fsync_offset != server.aof_last_incr_size &&
server.unixtime > server.aof_last_fsync &&
!(sync_in_progress = aofFsyncInProgress())) {
goto try_fsync;

/* Check if we need to do fsync even the aof buffer is empty,
* the reason is described in the previous AOF_FSYNC_EVERYSEC block,
* and AOF_FSYNC_ALWAYS is also checked here to handle a case where
* aof_fsync is changed from everysec to always. */
} else if (server.aof_fsync == AOF_FSYNC_ALWAYS &&
server.aof_last_incr_fsync_offset != server.aof_last_incr_size)
{
goto try_fsync;
} else {
return;
}
Expand Down Expand Up @@ -1253,14 +1264,14 @@ void flushAppendOnlyFile(int force) {
}
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("aof-fsync-always",latency);
server.aof_fsync_offset = server.aof_current_size;
server.aof_last_incr_fsync_offset = server.aof_last_incr_size;
server.aof_last_fsync = server.unixtime;
atomicSet(server.fsynced_reploff_pending, server.master_repl_offset);
} else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.unixtime > server.aof_last_fsync)) {
} else if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.unixtime > server.aof_last_fsync) {
if (!sync_in_progress) {
aof_background_fsync(server.aof_fd);
server.aof_fsync_offset = server.aof_current_size;
server.aof_last_incr_fsync_offset = server.aof_last_incr_size;
}
server.aof_last_fsync = server.unixtime;
}
Expand Down Expand Up @@ -1766,7 +1777,6 @@ int loadAppendOnlyFiles(aofManifest *am) {
* executed early, but that shouldn't be a problem since everything will be
* fine after the first AOFRW. */
server.aof_rewrite_base_size = base_size;
server.aof_fsync_offset = server.aof_current_size;

cleanup:
stopLoading(ret == AOF_OK || ret == AOF_TRUNCATED);
Expand Down Expand Up @@ -2666,13 +2676,10 @@ void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
/* We can safely let `server.aof_manifest` point to 'temp_am' and free the previous one. */
aofManifestFreeAndUpdate(temp_am);

if (server.aof_fd != -1) {
if (server.aof_state != AOF_OFF) {
/* AOF enabled. */
server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
server.aof_current_size = getAppendOnlyFileSize(new_base_filename, NULL) + server.aof_last_incr_size;
server.aof_rewrite_base_size = server.aof_current_size;
server.aof_fsync_offset = server.aof_current_size;
server.aof_last_fsync = server.unixtime;
}

/* We don't care about the return value of `aofDelHistoryFiles`, because the history
Expand Down
1 change: 1 addition & 0 deletions src/server.c
Original file line number Diff line number Diff line change
Expand Up @@ -2018,6 +2018,7 @@ void initServerConfig(void) {
server.aof_selected_db = -1; /* Make sure the first time will not match */
server.aof_flush_postponed_start = 0;
server.aof_last_incr_size = 0;
server.aof_last_incr_fsync_offset = 0;
server.active_defrag_running = 0;
server.notify_keyspace_events = 0;
server.blocked_clients = 0;
Expand Down
3 changes: 2 additions & 1 deletion src/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -1742,7 +1742,8 @@ struct redisServer {
off_t aof_rewrite_base_size; /* AOF size on latest startup or rewrite. */
off_t aof_current_size; /* AOF current size (Including BASE + INCRs). */
off_t aof_last_incr_size; /* The size of the latest incr AOF. */
off_t aof_fsync_offset; /* AOF offset which is already synced to disk. */
off_t aof_last_incr_fsync_offset; /* AOF offset which is already requested to be synced to disk.
* Compare with the aof_last_incr_size. */
int aof_flush_sleep; /* Micros to sleep before flush. (used by tests) */
int aof_rewrite_scheduled; /* Rewrite once BGSAVE terminates. */
sds aof_buf; /* AOF buffer, written before entering the event loop */
Expand Down
42 changes: 42 additions & 0 deletions tests/unit/wait.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,49 @@ tags {"wait aof network external:skip"} {
$replica config set appendfsync everysec

test {WAITAOF replica copy everysec} {
$replica config set appendfsync everysec
waitForBgrewriteaof $replica ;# Make sure there is no AOFRW

$master incr foo
assert_equal [$master waitaof 0 1 0] {1 1}
}

test {WAITAOF replica copy everysec with AOFRW} {
$replica config set appendfsync everysec

# When we trigger an AOFRW, a fsync is triggered when closing the old INCR file,
# so with the everysec, we will skip that second of fsync, and in the next second
# after that, we will eventually do the fsync.
$replica bgrewriteaof
waitForBgrewriteaof $replica

$master incr foo
assert_equal [$master waitaof 0 1 0] {1 1}
}

test {WAITAOF replica copy everysec with slow AOFRW} {
$replica config set appendfsync everysec
$replica config set rdb-key-save-delay 1000000 ;# 1 sec

$replica bgrewriteaof

$master incr foo
assert_equal [$master waitaof 0 1 0] {1 1}

$replica config set rdb-key-save-delay 0
waitForBgrewriteaof $replica
}

test {WAITAOF replica copy everysec->always with AOFRW} {
$replica config set appendfsync everysec

# Try to fit all of them in the same round second, although there's no way to guarantee
# that, it can be done on fast machine. In any case, the test shouldn't fail either.
$replica bgrewriteaof
$master incr foo
waitForBgrewriteaof $replica
$replica config set appendfsync always

assert_equal [$master waitaof 0 1 0] {1 1}
}

Expand Down

0 comments on commit cb17178

Please sign in to comment.