Skip to content

Commit 55bd69c

Browse files
committed
Fix membership change bugs in the Raft core
* When the last appended entry of a follower is a membership change, it must update its status to ACTIVE once it installs a snapshot, because snapshots always contain a committed member list.
* Multiple membership changes can be committed before a slow follower appends & commits them. When a slow follower appends these changes, it needs to commit them one by one.
* Handle invocation exceptions properly on the Raft service layer.
1 parent 9a0d299 commit 55bd69c

13 files changed

Lines changed: 280 additions & 94 deletions

File tree

hazelcast/src/main/java/com/hazelcast/cp/internal/MetadataRaftGroupManager.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -765,12 +765,11 @@ public boolean addActiveMember(long commitIndex, CPMemberInfo member) {
765765

766766
List<CPGroupMembershipChangeContext> changes = getGroupMembershipChangesForNewMember(member);
767767
if (changes.size() > 0) {
768+
membershipChangeContext = MembershipChangeContext.memberAdded(singletonList(commitIndex), member, changes);
768769
if (logger.isFineEnabled()) {
769-
logger.fine("CP group rebalancing is triggered for " + changes);
770+
logger.fine("CP group rebalancing is triggered for " + member + ", changes: " + membershipChangeContext);
770771
}
771772

772-
membershipChangeContext = MembershipChangeContext.memberAdded(singletonList(commitIndex), member, changes);
773-
774773
return false;
775774
}
776775

hazelcast/src/main/java/com/hazelcast/cp/internal/RaftGroupMembershipManager.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,9 @@ public void run() {
245245
return;
246246
}
247247

248-
logger.fine("Handling " + membershipChangeContext);
248+
if (logger.isFineEnabled()) {
249+
logger.fine("Handling " + membershipChangeContext);
250+
}
249251

250252
List<CPGroupMembershipChangeContext> changes = membershipChangeContext.getChanges();
251253
Map<CPGroupId, Tuple2<Long, Long>> changedGroups = new ConcurrentHashMap<CPGroupId, Tuple2<Long, Long>>();
@@ -340,9 +342,8 @@ private ICompletableFuture<Long> newCompletedFuture(long idx) {
340342
}
341343

342344
private long getMemberAddCommitIndex(CPGroupMembershipChangeContext ctx, Throwable t) {
343-
if (t.getCause() instanceof MismatchingGroupMembersCommitIndexException) {
344-
MismatchingGroupMembersCommitIndexException m = (MismatchingGroupMembersCommitIndexException) t.getCause();
345-
345+
if (t instanceof MismatchingGroupMembersCommitIndexException) {
346+
MismatchingGroupMembersCommitIndexException m = (MismatchingGroupMembersCommitIndexException) t;
346347
String msg = "MEMBER ADD commit of " + ctx.getMemberToAdd() + " to " + ctx.getGroupId()
347348
+ " with members commit index: " + ctx.getMembersCommitIndex() + " failed. Actual group members: "
348349
+ m.getMembers() + " with commit index: " + m.getCommitIndex();
@@ -377,9 +378,8 @@ private long getMemberAddCommitIndex(CPGroupMembershipChangeContext ctx, Throwab
377378
private long getMemberRemoveCommitIndex(CPGroupMembershipChangeContext ctx, Throwable t) {
378379
CPMemberInfo removedMember = ctx.getMemberToRemove();
379380

380-
if (t.getCause() instanceof MismatchingGroupMembersCommitIndexException) {
381-
MismatchingGroupMembersCommitIndexException m = (MismatchingGroupMembersCommitIndexException) t.getCause();
382-
381+
if (t instanceof MismatchingGroupMembersCommitIndexException) {
382+
MismatchingGroupMembersCommitIndexException m = (MismatchingGroupMembersCommitIndexException) t;
383383
String msg = "MEMBER REMOVE commit of " + removedMember + " to " + ctx.getGroupId()
384384
+ " failed. Actual group members: " + m.getMembers() + " with commit index: " + m.getCommitIndex();
385385

hazelcast/src/main/java/com/hazelcast/cp/internal/RaftInvocationManager.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
import java.util.Collections;
4848
import java.util.Comparator;
4949
import java.util.List;
50+
import java.util.concurrent.ExecutionException;
5051
import java.util.concurrent.Executor;
5152

5253
import static com.hazelcast.cp.internal.raft.QueryPolicy.LEADER_LOCAL;
@@ -128,7 +129,7 @@ public void onResponse(List<CPMemberInfo> members) {
128129

129130
@Override
130131
public void onFailure(Throwable t) {
131-
resultFuture.setResult(t);
132+
resultFuture.setResult(new ExecutionException(t));
132133
}
133134
});
134135
}
@@ -146,7 +147,7 @@ public void onResponse(RaftGroupId groupId) {
146147

147148
@Override
148149
public void onFailure(Throwable t) {
149-
if (t.getCause() instanceof CannotCreateRaftGroupException) {
150+
if (t instanceof CannotCreateRaftGroupException) {
150151
logger.fine("Could not create CP group: " + groupName + " with members: " + members,
151152
t.getCause());
152153
invokeGetMembersToCreateRaftGroup(groupName, groupSize, resultFuture);

hazelcast/src/main/java/com/hazelcast/cp/internal/RaftService.java

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@
7777
import java.util.ArrayList;
7878
import java.util.Collection;
7979
import java.util.Collections;
80-
import java.util.List;
8180
import java.util.Map.Entry;
8281
import java.util.Properties;
8382
import java.util.Set;
@@ -97,7 +96,6 @@
9796
import static com.hazelcast.internal.config.ConfigValidator.checkCPSubsystemConfig;
9897
import static com.hazelcast.spi.ExecutionService.ASYNC_EXECUTOR;
9998
import static com.hazelcast.spi.ExecutionService.SYSTEM_EXECUTOR;
100-
import static com.hazelcast.util.ExceptionUtil.peel;
10199
import static com.hazelcast.util.Preconditions.checkFalse;
102100
import static com.hazelcast.util.Preconditions.checkTrue;
103101
import static java.util.Collections.newSetFromMap;
@@ -722,13 +720,6 @@ private ICompletableFuture<Void> invokeTriggerRemoveMember(CPMemberInfo member)
722720
return invocationManager.invoke(getMetadataGroupId(), new TriggerRemoveCPMemberOp(member));
723721
}
724722

725-
private boolean isRemoved(CPMemberInfo member) {
726-
RaftOp op = new GetActiveCPMembersOp();
727-
InternalCompletableFuture<List<CPMemberInfo>> f = invocationManager.query(getMetadataGroupId(), op, LEADER_LOCAL);
728-
List<CPMemberInfo> members = f.join();
729-
return !members.contains(member);
730-
}
731-
732723
public static String withoutDefaultGroupName(String name) {
733724
name = name.trim();
734725
int i = name.indexOf("@");
@@ -809,9 +800,8 @@ public void onResponse(CPGroupInfo group) {
809800

810801
@Override
811802
public void onFailure(Throwable t) {
812-
RuntimeException cause = peel(t);
813-
if (cause instanceof CPGroupDestroyedException) {
814-
CPGroupId destroyedGroupId = ((CPGroupDestroyedException) cause).getGroupId();
803+
if (t instanceof CPGroupDestroyedException) {
804+
CPGroupId destroyedGroupId = ((CPGroupDestroyedException) t).getGroupId();
815805
destroyedGroupIds.add(destroyedGroupId);
816806
}
817807

hazelcast/src/main/java/com/hazelcast/cp/internal/raft/impl/RaftNodeImpl.java

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@
6161
import com.hazelcast.util.RandomPicker;
6262
import com.hazelcast.util.collection.Long2ObjectHashMap;
6363

64-
import java.util.Arrays;
6564
import java.util.Collection;
6665
import java.util.Iterator;
6766
import java.util.List;
@@ -70,7 +69,7 @@
7069
import java.util.concurrent.TimeUnit;
7170

7271
import static com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.ACTIVE;
73-
import static com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.CHANGING_MEMBERSHIP;
72+
import static com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.UPDATING_GROUP_MEMBER_LIST;
7473
import static com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.STEPPED_DOWN;
7574
import static com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.TERMINATED;
7675
import static com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.TERMINATING;
@@ -342,7 +341,7 @@ public boolean canReplicateNewEntry(Object operation) {
342341

343342
if (status == TERMINATING) {
344343
return false;
345-
} else if (status == CHANGING_MEMBERSHIP) {
344+
} else if (status == UPDATING_GROUP_MEMBER_LIST) {
346345
return !(operation instanceof RaftGroupCmd);
347346
}
348347

@@ -493,11 +492,6 @@ public void sendAppendRequest(Endpoint follower) {
493492

494493
assert prevEntry != null : "Follower: " + follower + ", next index: " + nextIndex;
495494

496-
if (prevEntry.index() < state.commitIndex()) {
497-
// send at most one ApplyRaftGroupMembersOp in single batch
498-
entries = trimEntriesIfContainsMultipleMembershipChanges(entries);
499-
}
500-
501495
AppendRequest appendRequest = new AppendRequest(getLocalMember(), state.term(), prevEntry.term(), prevEntry.index(),
502496
state.commitIndex(), entries);
503497

@@ -513,32 +507,6 @@ public void sendAppendRequest(Endpoint follower) {
513507
send(appendRequest, follower);
514508
}
515509

516-
/**
517-
* If log entries contains multiple membership change entries, then splits entries to send only a single
518-
* membership change in single append-entries request.
519-
*/
520-
private LogEntry[] trimEntriesIfContainsMultipleMembershipChanges(LogEntry[] entries) {
521-
int trim = entries.length;
522-
boolean found = false;
523-
for (int i = 0; i < entries.length; i++) {
524-
LogEntry entry = entries[i];
525-
if (entry.operation() instanceof ApplyRaftGroupMembersCmd) {
526-
if (found) {
527-
trim = i;
528-
break;
529-
} else {
530-
found = true;
531-
}
532-
}
533-
}
534-
535-
if (trim < entries.length) {
536-
logger.fine("Trimming append entries up to index of the second ApplyRaftGroupMembersOp: " + trim);
537-
return Arrays.copyOf(entries, trim);
538-
}
539-
return entries;
540-
}
541-
542510
/**
543511
* Applies committed log entries between {@code lastApplied} and {@code commitIndex}, if there's any available.
544512
* If new entries are applied, {@link RaftState}'s {@code lastApplied} field is updated.
@@ -592,10 +560,17 @@ private void applyLogEntry(LogEntry entry) {
592560
Object operation = entry.operation();
593561
if (operation instanceof RaftGroupCmd) {
594562
if (operation instanceof DestroyRaftGroupCmd) {
595-
assert status == TERMINATING;
596563
setStatus(TERMINATED);
597564
} else if (operation instanceof ApplyRaftGroupMembersCmd) {
598-
assert status == CHANGING_MEMBERSHIP : "STATUS: " + status;
565+
if (state.lastGroupMembers().index() < entry.index()) {
566+
setStatus(UPDATING_GROUP_MEMBER_LIST);
567+
ApplyRaftGroupMembersCmd op = (ApplyRaftGroupMembersCmd) operation;
568+
updateGroupMembers(entry.index(), op.getMembers());
569+
}
570+
571+
assert status == UPDATING_GROUP_MEMBER_LIST : "STATUS: " + status;
572+
assert state.lastGroupMembers().index() == entry.index();
573+
599574
state.commitGroupMembers();
600575
ApplyRaftGroupMembersCmd cmd = (ApplyRaftGroupMembersCmd) operation;
601576
if (cmd.getMember().equals(localMember) && cmd.getChangeType() == MembershipChangeType.REMOVE) {
@@ -692,7 +667,9 @@ public void invalidateFuturesFrom(long entryIndex) {
692667
}
693668
}
694669

695-
logger.warning("Invalidated " + count + " futures from log index: " + entryIndex);
670+
if (count > 0) {
671+
logger.warning("Invalidated " + count + " futures from log index: " + entryIndex);
672+
}
696673
}
697674

698675
/**
@@ -712,7 +689,9 @@ private void invalidateFuturesUntil(long entryIndex) {
712689
}
713690
}
714691

715-
logger.warning("Invalidated " + count + " futures until log index: " + entryIndex);
692+
if (count > 0) {
693+
logger.warning("Invalidated " + count + " futures until log index: " + entryIndex);
694+
}
716695
}
717696

718697
/**
@@ -777,8 +756,12 @@ public boolean installSnapshot(SnapshotEntry snapshot) {
777756

778757
raftIntegration.restoreSnapshot(snapshot.operation(), snapshot.index());
779758

780-
// If I am installing a snapshot, it means I am still present in the last member list so I don't need to update status.
759+
// If I am installing a snapshot, it means I am still present in the last member list,
760+
// but it is possible that the last entry I appended before the snapshot could be a membership change.
761+
// Because of this, I need to update my status.
781762
// Nevertheless, I may not be present in the restored member list, which is ok.
763+
764+
setStatus(ACTIVE);
782765
state.restoreGroupMembers(snapshot.groupMembersLogIndex(), snapshot.groupMembers());
783766
printMemberState();
784767

hazelcast/src/main/java/com/hazelcast/cp/internal/raft/impl/RaftNodeStatus.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ public enum RaftNodeStatus {
2929

3030
/**
3131
* During membership changes, node statuses become
32-
* {@code CHANGING_MEMBERSHIP} and they apply requested change once
32+
* {@code UPDATING_GROUP_MEMBER_LIST} and they apply requested change once
3333
* the entry is appended to the log. Once the entry is committed, if the related
3434
* node is the one being removed from the group, status becomes
3535
* {@link #STEPPED_DOWN}, otherwise {@link #ACTIVE}.
3636
*/
37-
CHANGING_MEMBERSHIP,
37+
UPDATING_GROUP_MEMBER_LIST,
3838

3939
/**
4040
* When a node is removed from the cluster after a membership change is

hazelcast/src/main/java/com/hazelcast/cp/internal/raft/impl/handler/AppendRequestHandlerTask.java

Lines changed: 55 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package com.hazelcast.cp.internal.raft.impl.handler;
1818

1919
import com.hazelcast.cp.internal.raft.command.DestroyRaftGroupCmd;
20+
import com.hazelcast.cp.internal.raft.command.RaftGroupCmd;
2021
import com.hazelcast.cp.internal.raft.impl.RaftNodeImpl;
2122
import com.hazelcast.cp.internal.raft.impl.RaftNodeStatus;
2223
import com.hazelcast.cp.internal.raft.impl.command.ApplyRaftGroupMembersCmd;
@@ -28,12 +29,12 @@
2829
import com.hazelcast.cp.internal.raft.impl.state.RaftState;
2930
import com.hazelcast.cp.internal.raft.impl.task.RaftNodeStatusAwareTask;
3031

32+
import java.util.ArrayList;
3133
import java.util.Arrays;
3234
import java.util.List;
3335

3436
import static com.hazelcast.cp.internal.raft.impl.RaftRole.FOLLOWER;
3537
import static java.lang.Math.min;
36-
import static java.util.Arrays.asList;
3738

3839
/**
3940
* Handles {@link AppendRequest} sent by the leader. Responds with
@@ -126,12 +127,12 @@ protected void innerRun() {
126127
}
127128
}
128129

130+
LogEntry[] newEntries = null;
129131
// Process any new entries
130132
if (req.entryCount() > 0) {
131133
// Delete any conflicting entries, skip any duplicates
132134
long lastLogIndex = raftLog.lastLogOrSnapshotIndex();
133135

134-
LogEntry[] newEntries = null;
135136
for (int i = 0; i < req.entryCount(); i++) {
136137
LogEntry reqEntry = req.entries()[i];
137138

@@ -156,7 +157,7 @@ protected void innerRun() {
156157
}
157158

158159
raftNode.invalidateFuturesFrom(reqEntry.index());
159-
handleRaftGroupCmd(truncated, true);
160+
revertRaftGroupCmd(truncated);
160161

161162
newEntries = Arrays.copyOfRange(req.entries(), i, req.entryCount());
162163
break;
@@ -170,7 +171,6 @@ protected void innerRun() {
170171
}
171172

172173
raftLog.appendEntries(newEntries);
173-
handleRaftGroupCmd(asList(newEntries), false);
174174
}
175175
}
176176

@@ -185,34 +185,68 @@ protected void innerRun() {
185185
logger.fine("Setting commit index: " + newCommitIndex);
186186
}
187187
state.commitIndex(newCommitIndex);
188-
raftNode.applyLogEntries();
189188
}
190189

191190
raftNode.updateLastAppendEntriesTimestamp();
192191

193-
// If I just appended any new entry or the leader is trying to adjust my match index, I must send a response.
194-
// Otherwise, I just learnt the last commit index and I don't need to send a response.
195-
if (req.entryCount() > 0 || oldCommitIndex == state.commitIndex()) {
196-
AppendSuccessResponse resp = new AppendSuccessResponse(raftNode.getLocalMember(), state.term(), lastLogIndex);
197-
raftNode.send(resp, req.leader());
192+
try {
193+
// If I just appended any new entry or the leader is trying to adjust my match index, I must send a response.
194+
// Otherwise, I just learnt the last commit index and I don't need to send a response.
195+
if (req.entryCount() > 0 || oldCommitIndex == state.commitIndex()) {
196+
AppendSuccessResponse resp = new AppendSuccessResponse(raftNode.getLocalMember(), state.term(), lastLogIndex);
197+
raftNode.send(resp, req.leader());
198+
}
199+
} finally {
200+
if (state.commitIndex() > oldCommitIndex) {
201+
raftNode.applyLogEntries();
202+
}
203+
if (newEntries != null) {
204+
preApplyRaftGroupCmd(newEntries, state.commitIndex());
205+
}
206+
}
207+
}
208+
209+
210+
private void preApplyRaftGroupCmd(LogEntry[] entries, long commitIndex) {
211+
// There can be at most one appended & not-committed command in the log
212+
for (LogEntry entry : entries) {
213+
Object operation = entry.operation();
214+
if (entry.index() <= commitIndex || !(operation instanceof RaftGroupCmd)) {
215+
continue;
216+
}
217+
218+
if (operation instanceof DestroyRaftGroupCmd) {
219+
raftNode.setStatus(RaftNodeStatus.TERMINATING);
220+
} else if (operation instanceof ApplyRaftGroupMembersCmd) {
221+
raftNode.setStatus(RaftNodeStatus.UPDATING_GROUP_MEMBER_LIST);
222+
ApplyRaftGroupMembersCmd op = (ApplyRaftGroupMembersCmd) operation;
223+
raftNode.updateGroupMembers(entry.index(), op.getMembers());
224+
} else {
225+
assert false : "Invalid command: " + operation + " in " + raftNode.getGroupId();
226+
}
227+
228+
return;
198229
}
199230
}
200231

201-
private void handleRaftGroupCmd(List<LogEntry> entries, boolean revert) {
232+
private void revertRaftGroupCmd(List<LogEntry> entries) {
233+
// I am reverting appended-but-uncommitted entries and there can be at most 1 uncommitted Raft command...
234+
List<LogEntry> commandEntries = new ArrayList<LogEntry>();
235+
for (LogEntry entry : entries) {
236+
if (entry.operation() instanceof RaftGroupCmd) {
237+
commandEntries.add(entry);
238+
}
239+
}
240+
241+
assert commandEntries.size() <= 1 : "Reverted command entries: " + commandEntries;
242+
202243
for (LogEntry entry : entries) {
203244
if (entry.operation() instanceof DestroyRaftGroupCmd) {
204-
RaftNodeStatus status = revert ? RaftNodeStatus.ACTIVE : RaftNodeStatus.TERMINATING;
205-
raftNode.setStatus(status);
245+
raftNode.setStatus(RaftNodeStatus.ACTIVE);
206246
return;
207247
} else if (entry.operation() instanceof ApplyRaftGroupMembersCmd) {
208-
RaftNodeStatus status = revert ? RaftNodeStatus.ACTIVE : RaftNodeStatus.CHANGING_MEMBERSHIP;
209-
raftNode.setStatus(status);
210-
if (revert) {
211-
raftNode.resetGroupMembers();
212-
} else {
213-
ApplyRaftGroupMembersCmd op = (ApplyRaftGroupMembersCmd) entry.operation();
214-
raftNode.updateGroupMembers(entry.index(), op.getMembers());
215-
}
248+
raftNode.setStatus(RaftNodeStatus.ACTIVE);
249+
raftNode.resetGroupMembers();
216250
return;
217251
}
218252
}

0 commit comments

Comments
 (0)