Skip to content

Commit b407590

Browse files
committed
Fix redis#7306 less aggressively.
Citing from the issue: btw I suggest we change this fix to something else: * We revert the fix. * We add a call that disconnects chained replicas in the place where we trim the replica (that is a master i this case) offset. This way we can avoid disconnections when there is no trimming of the backlog. Note that we now want to disconnect replicas asynchronously in disconnectSlaves(), because it's in general safer now that we can call it from freeClient(). Otherwise for instance the command: CLIENT KILL TYPE master May crash: clientCommand() starts running the linked of of clients, looking for clients to kill. However it finds the master, kills it calling freeClient(), but this in turn calls replicationCacheMaster() that may also call disconnectSlaves() now. So the linked list iterator of the clientCommand() will no longer be valid.
1 parent 285817b commit b407590

File tree

2 files changed

+29
-17
lines changed

2 files changed

+29
-17
lines changed

src/networking.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,9 +1039,12 @@ static void freeClientArgv(client *c) {
10391039
* when we resync with our own master and want to force all our slaves to
10401040
* resync with us as well. */
10411041
void disconnectSlaves(void) {
1042-
while (listLength(server.slaves)) {
1042+
listIter li;
1043+
listNode *ln;
1044+
listRewind(server.slaves,&li);
1045+
while((ln = listNext(&li))) {
10431046
listNode *ln = listFirst(server.slaves);
1044-
freeClient((client*)ln->value);
1047+
freeClientAsync((client*)ln->value);
10451048
}
10461049
}
10471050

src/replication.c

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
#include <sys/socket.h>
4040
#include <sys/stat.h>
4141

42-
long long adjustMeaningfulReplOffset();
42+
long long adjustMeaningfulReplOffset(int *adjusted);
4343
void replicationDiscardCachedMaster(void);
4444
void replicationResurrectCachedMaster(connection *conn);
4545
void replicationSendAck(void);
@@ -2027,19 +2027,12 @@ int slaveTryPartialResynchronization(connection *conn, int read_reply) {
20272027
* new one. */
20282028
memcpy(server.replid,new,sizeof(server.replid));
20292029
memcpy(server.cached_master->replid,new,sizeof(server.replid));
2030+
2031+
/* Disconnect all the sub-slaves: they need to be notified. */
2032+
disconnectSlaves();
20302033
}
20312034
}
20322035

2033-
/* Disconnect all the sub-replicas: they need to be notified always because
2034-
* in case the master has last replicated some non-meaningful commands
2035-
* (e.g. PINGs), the primary replica will request the PSYNC offset for the
2036-
* last known meaningful command. This means the master will again replicate
2037-
* the non-meaningful commands. If the sub-replicas still remains connected,
2038-
* they will receive those commands a second time and increment the master
2039-
* replication offset to be higher than the master's offset forever.
2040-
*/
2041-
disconnectSlaves();
2042-
20432036
/* Setup the replication to continue. */
20442037
sdsfree(reply);
20452038
replicationResurrectCachedMaster(conn);
@@ -2708,7 +2701,8 @@ void replicationCacheMaster(client *c) {
27082701

27092702
/* Adjust reploff and read_reploff to the last meaningful offset we
27102703
* executed. This is the offset the replica will use for future PSYNC. */
2711-
server.master->reploff = adjustMeaningfulReplOffset();
2704+
int offset_adjusted;
2705+
server.master->reploff = adjustMeaningfulReplOffset(&offset_adjusted);
27122706
server.master->read_reploff = server.master->reploff;
27132707
if (c->flags & CLIENT_MULTI) discardTransaction(c);
27142708
listEmpty(c->reply);
@@ -2731,6 +2725,13 @@ void replicationCacheMaster(client *c) {
27312725
* so make sure to adjust the replication state. This function will
27322726
* also set server.master to NULL. */
27332727
replicationHandleMasterDisconnection();
2728+
2729+
/* If we trimmed this replica backlog, we need to disconnect our chained
2730+
* replicas (if any), otherwise they may have the PINGs we removed
2731+
* from the stream and their offset would no longer match: upon
2732+
* disconnection they will also trim the final PINGs and will be able
2733+
* to incrementally sync without issues. */
2734+
if (offset_adjusted) disconnectSlaves();
27342735
}
27352736

27362737
/* If the "meaningful" offset, that is the offset without the final PINGs
@@ -2740,8 +2741,13 @@ void replicationCacheMaster(client *c) {
27402741
* offset because of the PINGs and will not be able to incrementally
27412742
* PSYNC with the new master.
27422743
* This function trims the replication backlog when needed, and returns
2743-
* the offset to be used for future partial sync. */
2744-
long long adjustMeaningfulReplOffset() {
2744+
* the offset to be used for future partial sync.
2745+
*
2746+
* If the integer 'adjusted' was passed by reference, it is set to 1
2747+
* if the function call actually modified the offset and the replication
2748+
* backlog, otherwise it is set to 0. It can be NULL if the caller is
2749+
* not interested in getting this info. */
2750+
long long adjustMeaningfulReplOffset(int *adjusted) {
27452751
if (server.master_repl_offset > server.master_repl_meaningful_offset) {
27462752
long long delta = server.master_repl_offset -
27472753
server.master_repl_meaningful_offset;
@@ -2761,6 +2767,9 @@ long long adjustMeaningfulReplOffset() {
27612767
(server.repl_backlog_idx + (server.repl_backlog_size - delta)) %
27622768
server.repl_backlog_size;
27632769
}
2770+
if (adjusted) *adjusted = 1;
2771+
} else {
2772+
if (adjusted) *adjusted = 0;
27642773
}
27652774
return server.master_repl_offset;
27662775
}
@@ -2784,7 +2793,7 @@ void replicationCacheMasterUsingMyself(void) {
27842793
* by replicationCreateMasterClient(). We'll later set the created
27852794
* master as server.cached_master, so the replica will use such
27862795
* offset for PSYNC. */
2787-
server.master_initial_offset = adjustMeaningfulReplOffset();
2796+
server.master_initial_offset = adjustMeaningfulReplOffset(NULL);
27882797

27892798
/* The master client we create can be set to any DBID, because
27902799
* the new master will start its replication stream with SELECT. */

0 commit comments

Comments
 (0)