Skip to content

Commit

Permalink
ZOOKEEPER-3701: Split brain on log disk full
Browse files Browse the repository at this point in the history
Issue described here:
https://issues.apache.org/jira/browse/ZOOKEEPER-3701

Proposing a fix with catching `IOException` within the truncate method to properly return with `false` if truncate fails.

Author: Andor Molnar <andor@apache.org>
Author: Andor Molnar <andor@cloudera.com>

Reviewers: Ivan Kelly <ivank@apache.org>, Enrico Olivelli <eolivelli@apache.org>, Norbert Kalmar <nkalmar@apache.org>

Closes apache#1233 from anmolnar/ZOOKEEPER-3701
  • Loading branch information
anmolnar authored and nkalmar committed Jan 28, 2020
1 parent 57be7ae commit a4bc985
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,8 @@ public synchronized void serialize(
Util.getZxidFromName(snapShot.getName(), SNAPSHOT_FILE_PREFIX),
snapShot.lastModified() / 1000);
}
} else {
throw new IOException("FileSnap has already been closed");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
Expand Down Expand Up @@ -93,7 +94,7 @@
* 0 padded to EOF (filled during preallocation stage)
* </pre></blockquote>
*/
public class FileTxnLog implements TxnLog {
public class FileTxnLog implements TxnLog, Closeable {

private static final Logger LOG;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -499,23 +499,28 @@ public void save(
* @return true if able to truncate the log, false if not
* @throws IOException
*/
public boolean truncateLog(long zxid) throws IOException {
// close the existing txnLog and snapLog
close();

// truncate it
FileTxnLog truncLog = new FileTxnLog(dataDir);
boolean truncated = truncLog.truncate(zxid);
truncLog.close();

// re-open the txnLog and snapLog
// I'd rather just close/reopen this object itself, however that
// would have a big impact outside ZKDatabase as there are other
// objects holding a reference to this object.
txnLog = new FileTxnLog(dataDir);
snapLog = new FileSnap(snapDir);

return truncated;
public boolean truncateLog(long zxid) {
try {
// close the existing txnLog and snapLog
close();

// truncate it
try (FileTxnLog truncLog = new FileTxnLog(dataDir)) {
boolean truncated = truncLog.truncate(zxid);

// re-open the txnLog and snapLog
// I'd rather just close/reopen this object itself, however that
// would have a big impact outside ZKDatabase as there are other
// objects holding a reference to this object.
txnLog = new FileTxnLog(dataDir);
snapLog = new FileSnap(snapDir);

return truncated;
}
} catch (IOException e) {
LOG.error("Unable to truncate Txn log", e);
return false;
}
}

/**
Expand Down Expand Up @@ -594,8 +599,14 @@ public void rollLog() throws IOException {
* @throws IOException
*/
public void close() throws IOException {
txnLog.close();
snapLog.close();
if (txnLog != null) {
txnLog.close();
txnLog = null;
}
if (snapLog != null) {
snapLog.close();
snapLog = null;
}
}

@SuppressWarnings("serial")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ private void reloadSnapshotAndCheckDigest() throws Exception {
startServer();
QuorumPeerMainTest.waitForOne(zk, States.CONNECTED);

server = serverFactory.getZooKeeperServer();

// Snapshot digests always match
assertEquals(0L, ServerMetrics.getMetrics().DIGEST_MISMATCHES_COUNT.get());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@

import static java.util.Arrays.asList;
import static java.util.Collections.emptySet;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Mockito.mock;
Expand All @@ -36,17 +39,20 @@
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Consumer;
import org.apache.jute.BinaryInputArchive;
import org.apache.jute.BinaryOutputArchive;
import org.apache.zookeeper.ZKTestCase;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.common.X509Exception;
import org.apache.zookeeper.data.ACL;
import org.apache.zookeeper.server.ExitCode;
import org.apache.zookeeper.server.ZKDatabase;
import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
import org.apache.zookeeper.test.TestUtils;
import org.apache.zookeeper.txn.CreateTxn;
import org.apache.zookeeper.txn.TxnHeader;
import org.apache.zookeeper.util.ServiceUtils;
import org.junit.Test;

public class LearnerTest extends ZKTestCase {
Expand Down Expand Up @@ -300,4 +306,51 @@ public void syncTest() throws Exception {
}
}

@Test
public void truncFailTest() throws Exception {
final boolean[] exitProcCalled = {false};

ServiceUtils.setSystemExitProcedure(new Consumer<Integer>() {
@Override
public void accept(Integer exitCode) {
exitProcCalled[0] = true;
assertThat("System.exit() was called with invalid exit code", exitCode, equalTo(ExitCode.QUORUM_PACKET_ERROR.getValue()));
}
});

File tmpFile = File.createTempFile("test", ".dir", testData);
tmpFile.delete();
try {
FileTxnSnapLog txnSnapLog = new FileTxnSnapLog(tmpFile, tmpFile);
SimpleLearner sl = new SimpleLearner(txnSnapLog);
long startZxid = sl.zk.getLastProcessedZxid();

// Set up bogus streams
ByteArrayOutputStream baos = new ByteArrayOutputStream();
BinaryOutputArchive oa = BinaryOutputArchive.getArchive(baos);
sl.leaderOs = BinaryOutputArchive.getArchive(new ByteArrayOutputStream());

// make streams and socket do something innocuous
sl.bufferedOutput = new BufferedOutputStream(System.out);
sl.sock = new Socket();

// fake messages from the server
QuorumPacket qp = new QuorumPacket(Leader.TRUNC, 0, null, null);
oa.writeRecord(qp, null);

// setup the messages to be streamed to follower
sl.leaderIs = BinaryInputArchive.getArchive(new ByteArrayInputStream(baos.toByteArray()));

try {
sl.syncWithLeader(3);
} catch (EOFException e) {
}

sl.zk.shutdown();

assertThat("System.exit() should have been called", exitProcCalled[0], is(true));
} finally {
TestUtils.deleteFileRecursively(tmpFile);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@

package org.apache.zookeeper.test;

import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.File;
Expand Down Expand Up @@ -125,10 +127,7 @@ public void testTruncationNullLog() throws Exception {
assertTrue("Failed to delete log file: " + logs[i].getName(), logs[i].delete());
}
try {
zkdb.truncateLog(1);
assertTrue("Should not get here", false);
} catch (IOException e) {
assertTrue("Should have received an IOException", true);
assertThat("truncateLog() should return false if truncation fails instead of throwing exception", zkdb.truncateLog(1), is(false));
} catch (NullPointerException npe) {
fail("This should not throw NPE!");
}
Expand Down

0 comments on commit a4bc985

Please sign in to comment.