From 7997874f4d2be9da1e26c77804569b0057c1e0a2 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Mon, 13 Mar 2023 22:12:29 +0800
Subject: [PATCH] Fix tail->repl_offset update in feedReplicationBuffer
 (#11905)

In #11666, we added a while loop and will split a big reply
node to multiple nodes. The update of tail->repl_offset may
be wrong. Like before #11666, we would have created at most
one new reply node, and now we will create multiple nodes if
it is a big reply node.

Now we are creating more than one node, and the tail->repl_offset
of all the nodes except the last one are incorrect. Because we
update master_repl_offset at the beginning, and then use it to
update the tail->repl_offset. This would have lead to an assertion
during PSYNC, a test was added to validate that case.

Besides that, the calculation of size was adjusted to fix
tests that failed due to a combination of a very low backlog size,
and some thresholds of that get violated because of the relatively
high overhead of replBufBlock. So now if the backlog size / 16 is too
small, we'll take PROTO_REPLY_CHUNK_BYTES instead.

Co-authored-by: Oran Agra <oran@redislabs.com>
---
 src/replication.c                        | 11 +++++++----
 src/server.h                             |  2 +-
 tests/integration/replication-buffer.tcl | 12 +++++++++++-
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/replication.c b/src/replication.c
index 854688f2c814..5aef2369876e 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -332,8 +332,6 @@ void feedReplicationBuffer(char *s, size_t len) {
     static long long repl_block_id = 0;
 
     if (server.repl_backlog == NULL) return;
-    server.master_repl_offset += len;
-    server.repl_backlog->histlen += len;
 
     while(len > 0) {
         size_t start_pos = 0; /* The position of referenced block to start sending. */
@@ -354,6 +352,8 @@ void feedReplicationBuffer(char *s, size_t len) {
             tail->used += copy;
             s += copy;
             len -= copy;
+            server.master_repl_offset += copy;
+            server.repl_backlog->histlen += copy;
         }
         if (len) {
             /* Create a new node, make sure it is allocated to at
@@ -362,14 +362,15 @@ void feedReplicationBuffer(char *s, size_t len) {
             /* Avoid creating nodes smaller than PROTO_REPLY_CHUNK_BYTES, so that we can append more data into them,
              * and also avoid creating nodes bigger than repl_backlog_size / 16, so that we won't have huge nodes that can't
              * trim when we only still need to hold a small portion from them. */
-            size_t size = min(max(len, (size_t)PROTO_REPLY_CHUNK_BYTES), (size_t)server.repl_backlog_size / 16);
+            size_t limit = max((size_t)server.repl_backlog_size / 16, (size_t)PROTO_REPLY_CHUNK_BYTES);
+            size_t size = min(max(len, (size_t)PROTO_REPLY_CHUNK_BYTES), limit);
             tail = zmalloc_usable(size + sizeof(replBufBlock), &usable_size);
             /* Take over the allocation's internal fragmentation */
             tail->size = usable_size - sizeof(replBufBlock);
             size_t copy = (tail->size >= len) ? len : tail->size;
             tail->used = copy;
             tail->refcount = 0;
-            tail->repl_offset = server.master_repl_offset - tail->used + 1;
+            tail->repl_offset = server.master_repl_offset + 1;
             tail->id = repl_block_id++;
             memcpy(tail->buf, s, copy);
             listAddNodeTail(server.repl_buffer_blocks, tail);
@@ -382,6 +383,8 @@ void feedReplicationBuffer(char *s, size_t len) {
             }
             s += copy;
             len -= copy;
+            server.master_repl_offset += copy;
+            server.repl_backlog->histlen += copy;
         }
 
         /* For output buffer of replicas. */
diff --git a/src/server.h b/src/server.h
index 04371e9590f0..a3d9aa0883a5 100644
--- a/src/server.h
+++ b/src/server.h
@@ -921,7 +921,7 @@ typedef struct clientReplyBlock {
  *      |                                            /       \
  *      |                                           /         \
  *      |                                          /           \
- *  Repl Backlog                               Replia_A      Replia_B
+ *  Repl Backlog                               Replica_A    Replica_B
  * 
  * Each replica or replication backlog increments only the refcount of the
  * 'ref_repl_buf_node' which it points to. So when replica walks to the next
diff --git a/tests/integration/replication-buffer.tcl b/tests/integration/replication-buffer.tcl
index fe85632ae14d..2e402480d032 100644
--- a/tests/integration/replication-buffer.tcl
+++ b/tests/integration/replication-buffer.tcl
@@ -267,7 +267,7 @@ test {Replica client-output-buffer size is limited to backlog_limit/16 when no r
             wait_for_ofs_sync $master $replica
 
             # Write another key to force the test to wait for another event loop iteration
-            # to give the serverCron a chance to disconnect replicas with COB size exeeeding the limits
+            # to give the serverCron a chance to disconnect replicas with COB size exceeding the limits
             $master set key1 "1"
             wait_for_ofs_sync $master $replica
 
@@ -280,6 +280,16 @@ test {Replica client-output-buffer size is limited to backlog_limit/16 when no r
             }
 
             assert {[status $master sync_partial_ok] == 0}
+
+            # Before this fix (#11905), the test would trigger an assertion in 'o->used >= c->ref_block_pos'
+            test {The update of replBufBlock's repl_offset is ok - Regression test for #11666} {
+                set rd [redis_deferring_client]
+                set replid [status $master master_replid]
+                set offset [status $master repl_backlog_first_byte_offset]
+                $rd psync $replid $offset
+                assert_equal {PONG} [$master ping] ;# Make sure the master doesn't crash.
+                $rd close
+            }
         }
     }
 }