enterprise/core-edge/src/main/java/org/neo4j/coreedge/raft/replication/shipping/RaftLogShipper.java

/*
 * Copyright (c) 2002-2016 "Neo Technology,"
 * Network Engine for Objects in Lund AB [http://neotechnology.com]
 *
 * This file is part of Neo4j.
 *
 * Neo4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.neo4j.coreedge.raft.replication.shipping;

import java.io.IOException;
import java.time.Clock;

import org.neo4j.coreedge.catchup.storecopy.LocalDatabase;
import org.neo4j.coreedge.network.Message;
import org.neo4j.coreedge.raft.DelayedRenewableTimeoutService;
import org.neo4j.coreedge.raft.LeaderContext;
import org.neo4j.coreedge.raft.RaftMessages;
import org.neo4j.coreedge.raft.RenewableTimeoutService;
import org.neo4j.coreedge.raft.log.RaftLogEntry;
import org.neo4j.coreedge.raft.log.ReadableRaftLog;
import org.neo4j.coreedge.raft.log.segmented.InFlightMap;
import org.neo4j.coreedge.raft.net.Outbound;
import org.neo4j.coreedge.raft.state.InFlightLogEntrySupplier;
import org.neo4j.logging.Log;
import org.neo4j.logging.LogProvider;

import static java.lang.Long.max;
import static java.lang.Long.min;
import static java.lang.String.format;

import static org.neo4j.coreedge.raft.RenewableTimeoutService.RenewableTimeout;
import static org.neo4j.coreedge.raft.replication.shipping.RaftLogShipper.Mode.PIPELINE;
import static org.neo4j.coreedge.raft.replication.shipping.RaftLogShipper.Timeouts.RESEND;

/// Optimizations
// TODO: Have several outstanding batches in catchup mode, to bridge the latency gap.
// TODO: Bisect search for mismatch.
// TODO: Maximum bound on size of batch in bytes, not just entry count.

// Production ready
// TODO: Replace sender service with something more appropriate. No need for queue and multiplex capability, in fact
// is it bad to have?
//  TODO Should we drop messages to unconnected channels instead? Use UDP? Because we are not allowed to go below a
// certain cluster size (safety)
//  TODO then leader will keep trying to replicate to gone members, thus queuing things up is hurtful.

// TODO: Replace the timeout service with something better. More efficient for the constantly rescheduling use case
// and also useful for deterministic unit tests.

// Core functionality
// TODO: Consider making even CommitUpdate a raft-message of its own.

/**
 * This class handles the shipping of raft logs from this node when it is the leader to the followers.
 * Each instance handles a single follower and acts on events and associated state updates originating
 * within the main raft state machine.
 * <p>
 * It is crucial that all actions happen within the context of the leaders state at some point in time.
 *
 * @param <MEMBER> The member type.
 */
public class RaftLogShipper<MEMBER>
{
    enum Mode
    {
        /**
         * In the mismatch mode we are unsure about the follower state, thus
         * we tread with caution, going backwards trying to find the point where
         * our logs match.
         */
        MISMATCH,
        /**
         * In the catchup mode we are trying to catch up the follower as quickly
         * as possible. The follower receives batches of entries in series until
         * it is fully caught up.
         */
        CATCHUP,
        /**
         * In the pipeline mode the follower is treated as caught up and we
         * optimistically ship any latest entries without waiting for responses,
         * expecting successful responses.
         */
        PIPELINE
    }

    private final Outbound<MEMBER, RaftMessages.RaftMessage<MEMBER>> outbound;
    private final LogProvider logProvider;
    private final Log log;
    private final ReadableRaftLog raftLog;
    private final Clock clock;

    private final MEMBER follower;
    private final MEMBER leader;

    /** Created in {@link #start()}; remains {@code null} until then. */
    private DelayedRenewableTimeoutService timeoutService;

    public enum Timeouts implements RenewableTimeoutService.TimeoutName
    {
        RESEND
    }

    /** Delay, in milliseconds, used for every resend timeout scheduled after a send. */
    private final long retryTimeMillis;
    /** Upper bound on the number of entries shipped in one batch by {@link #sendNextBatchAfterMatch}. */
    private final int catchupBatchSize;
    /** Largest tolerated distance between the index being pipelined and {@link #matchIndex}. */
    private final int maxAllowedShippingLag;
    /** Handle of the most recently scheduled timeout, or {@code null} if none was ever scheduled. */
    private RenewableTimeout timeout;

    /** Clock time (millis) at which the current timeout is due; {@code 0} means no timeout is pending. */
    private long timeoutAbsoluteMillis;
    /** Index recorded by the most recent send operation. */
    private long lastSentIndex;

    /** Highest index reported through {@link #onMatch}; {@code -1} until the first match. */
    private long matchIndex = -1;

    InFlightMap<Long, RaftLogEntry> inFlightMap;

    /** The leader context seen by the most recent event; used when a timeout triggers a resend. */
    private LeaderContext lastLeaderContext;

    private Mode mode = Mode.MISMATCH;

    /**
     * Creates a shipper for a single follower. The shipper starts out in {@link Mode#MISMATCH} and does
     * nothing until {@link #start()} is called.
     *
     * @param outbound channel used to send messages to the follower
     * @param logProvider provider for this shipper's log and for the timeout service created on start
     * @param raftLog the log that entries are read from
     * @param clock time source for timeout bookkeeping
     * @param leader the member sending the entries
     * @param follower the member receiving the entries
     * @param leaderTerm term used for the initial leader context
     * @param leaderCommit commit index used for the initial leader context
     * @param retryTimeMillis resend timeout in milliseconds
     * @param catchupBatchSize maximum number of entries per catchup batch
     * @param maxAllowedShippingLag maximum lag tolerated while in pipeline mode
     * @param inFlightMap map handed to the {@link InFlightLogEntrySupplier} when reading entries
     */
    RaftLogShipper( Outbound<MEMBER, RaftMessages.RaftMessage<MEMBER>> outbound, LogProvider logProvider,
                    ReadableRaftLog raftLog, Clock clock,
                    MEMBER leader, MEMBER follower, long leaderTerm, long leaderCommit, long retryTimeMillis,
                    int catchupBatchSize, int maxAllowedShippingLag, InFlightMap<Long, RaftLogEntry> inFlightMap )
    {
        this.outbound = outbound;
        this.catchupBatchSize = catchupBatchSize;
        this.maxAllowedShippingLag = maxAllowedShippingLag;
        this.logProvider = logProvider;
        this.log = logProvider.getLog( getClass() );
        this.raftLog = raftLog;
        this.clock = clock;
        this.follower = follower;
        this.leader = leader;
        this.retryTimeMillis = retryTimeMillis;
        this.lastLeaderContext = new LeaderContext( leaderTerm, leaderCommit );
        this.inFlightMap = inFlightMap;
    }

    /**
     * @return the follower this shipper is responsible for
     */
    public Object identity()
    {
        return follower;
    }

    /**
     * Creates and starts the timeout service, then sends the entry at the current append index of the
     * local log to the follower.
     */
    public synchronized void start()
    {
        log.info( "Starting log shipper: %s", statusAsString() );
        timeoutService = new DelayedRenewableTimeoutService( clock, logProvider );
        timeoutService.init();
        timeoutService.start();
        sendSingle( raftLog.appendIndex(), lastLeaderContext );
    }

    /**
     * Stops and shuts down the timeout service and cancels any pending timeout. Failures while stopping
     * the service are logged and do not propagate.
     */
    public synchronized void stop()
    {
        log.info( "Stopping log shipper %s", statusAsString() );

        try
        {
            // The service only exists once start() has run; stopping a never-started shipper is a no-op here.
            if ( timeoutService != null )
            {
                timeoutService.stop();
                timeoutService.shutdown();
            }
        }
        catch ( Throwable e )
        {
            log.error( "Failed to stop log shipper " + statusAsString(), e );
        }
        abortTimeout();
    }

    /**
     * Handles a mismatch response from the follower.
     * <p>
     * In {@link Mode#MISMATCH} the next probe is the lower of the entry before the last one sent and the
     * follower's reported append index, never going below 0. In the other modes the shipper switches to
     * {@link Mode#MISMATCH} and resends the entry at {@code lastSentIndex}.
     *
     * @param lastRemoteAppendIndex the append index reported by the follower
     * @param leaderContext the leader state to ship with
     */
    public synchronized void onMismatch( long lastRemoteAppendIndex, LeaderContext leaderContext )
    {
        switch ( mode )
        {
            case MISMATCH:
                long logIndex = max( min( lastSentIndex - 1, lastRemoteAppendIndex ), 0 );
                sendSingle( logIndex, leaderContext );
                break;
            case PIPELINE:
            case CATCHUP:
                log.info( "%s: mismatch in mode %s from follower %s, moving to MISMATCH mode",
                        statusAsString(), mode, follower );
                mode = Mode.MISMATCH;
                sendSingle( lastSentIndex, leaderContext );
                break;
        }

        lastLeaderContext = leaderContext;
    }

    /**
     * Handles a match response from the follower. The match index only ever moves forward.
     * <p>
     * In {@link Mode#MISMATCH} the next batch is sent and the mode becomes {@link Mode#PIPELINE} if that
     * batch reached the end of the log, otherwise {@link Mode#CATCHUP}. In {@link Mode#CATCHUP} the next
     * batch is sent once everything previously sent has been matched. In {@link Mode#PIPELINE} the timeout
     * is cancelled when the follower has matched everything sent, or renewed when it made progress.
     *
     * @param newMatchIndex the index the follower reports as matching
     * @param leaderContext the leader state to ship with
     */
    public synchronized void onMatch( long newMatchIndex, LeaderContext leaderContext )
    {
        boolean progress = newMatchIndex > matchIndex;
        matchIndex = max( newMatchIndex, matchIndex );

        switch ( mode )
        {
            case MISMATCH:
                if ( sendNextBatchAfterMatch( leaderContext ) )
                {
                    log.info( "%s: caught up after mismatch, moving to PIPELINE mode", statusAsString() );
                    mode = PIPELINE;
                }
                else
                {
                    log.info( "%s: starting catch up after mismatch, moving to CATCHUP mode", statusAsString() );
                    mode = Mode.CATCHUP;
                }
                break;
            case CATCHUP:
                if ( matchIndex >= lastSentIndex )
                {
                    if ( sendNextBatchAfterMatch( leaderContext ) )
                    {
                        log.info( "%s: caught up, moving to PIPELINE mode", statusAsString() );
                        mode = PIPELINE;
                    }
                }
                break;
            case PIPELINE:
                if ( matchIndex == lastSentIndex )
                {
                    abortTimeout();
                }
                else if ( progress )
                {
                    scheduleTimeout( retryTimeMillis );
                }
                break;
        }

        lastLeaderContext = leaderContext;
    }

    /**
     * Handles new entries appended to the leader's log. Only acts in {@link Mode#PIPELINE}: the entries
     * are shipped if the follower is within {@code maxAllowedShippingLag} of {@code prevLogIndex},
     * otherwise the shipper falls back to {@link Mode#CATCHUP} without sending.
     *
     * @param prevLogIndex index of the entry preceding the new entries
     * @param prevLogTerm term of the entry preceding the new entries
     * @param newLogEntries the entries to ship
     * @param leaderContext the leader state to ship with
     */
    public synchronized void onNewEntries( long prevLogIndex, long prevLogTerm, RaftLogEntry[] newLogEntries,
                                           LeaderContext leaderContext )
    {
        switch ( mode )
        {
            case PIPELINE:
                while ( lastSentIndex <= prevLogIndex )
                {
                    if ( prevLogIndex - matchIndex <= maxAllowedShippingLag )
                    {
                        // All sending functions update lastSentIndex, which is what terminates this loop.
                        sendNewEntries( prevLogIndex, prevLogTerm, newLogEntries, leaderContext );
                    }
                    else
                    {
                        /* The timer is still set at this point. Either we will send the next batch
                         * as soon as the follower has caught up with the last pipelined entry,
                         * or when we timeout and resend. */
                        log.info( "%s: follower has fallen behind (target prevLogIndex was %d, maxAllowedShippingLag " +
                                "is %d), moving to CATCHUP mode", statusAsString(), prevLogIndex,
                                maxAllowedShippingLag );
                        mode = Mode.CATCHUP;
                        break;
                    }
                }
                break;
        }

        lastLeaderContext = leaderContext;
    }

    /**
     * Handles an updated commit index on the leader. A commit update is sent to the follower only in
     * {@link Mode#PIPELINE}; in every mode the given context is remembered.
     *
     * @param leaderContext the leader state carrying the new commit index
     */
    public synchronized void onCommitUpdate( LeaderContext leaderContext )
    {
        switch ( mode )
        {
            case PIPELINE:
                sendCommitUpdate( leaderContext );
                break;
        }

        lastLeaderContext = leaderContext;
    }

    /**
     * Callback invoked by the timeout service. Because the deadline may have been moved or cleared since
     * the callback was scheduled, it is re-evaluated against the clock before acting.
     */
    private synchronized void onScheduledTimeoutExpiry()
    {
        if ( timedOut() )
        {
            onTimeout();
            return;
        }

        // No deadline is pending (it was aborted), so there is nothing to do.
        if ( timeoutAbsoluteMillis <= 0 )
        {
            return;
        }

        long timeLeft = timeoutAbsoluteMillis - clock.millis();
        if ( timeLeft > 0 )
        {
            // Fired early relative to the current deadline: wait out the remainder.
            scheduleTimeout( timeLeft );
        }
        else
        {
            onTimeout();
        }
    }

    /**
     * Reacts to an expired resend timeout: drops out of {@link Mode#PIPELINE} into {@link Mode#CATCHUP}
     * and resends the entry at {@code lastSentIndex} using the last known leader context.
     */
    private void onTimeout()
    {
        if ( mode == PIPELINE )
        {
            /* The follower seems unresponsive and we do not want to spam it with new entries */
            log.info( "%s: timed out, moving to CATCHUP mode", statusAsString() );
            mode = Mode.CATCHUP;
        }

        if ( lastLeaderContext != null )
        {
            sendSingle( lastSentIndex, lastLeaderContext );
        }
    }

    /**
     * @return {@code true} if a deadline is pending and the clock has reached or passed it
     */
    private boolean timedOut()
    {
        return timeoutAbsoluteMillis != 0 && (clock.millis() - timeoutAbsoluteMillis) >= 0;
    }

    /**
     * Records a new deadline {@code deltaMillis} from now and replaces any previously created timeout
     * with a fresh one.
     *
     * @param deltaMillis delay in milliseconds until the timeout should fire
     */
    private void scheduleTimeout( long deltaMillis )
    {
        // TODO: This cancel/create dance is a bit inefficient... consider something better.

        timeoutAbsoluteMillis = clock.millis() + deltaMillis;

        if ( timeout != null )
        {
            timeout.cancel();
        }
        timeout = timeoutService.create( RESEND, deltaMillis, 0, ignored -> onScheduledTimeoutExpiry() );
    }

    /**
     * Cancels the current timeout, if any, and clears the pending deadline.
     */
    private void abortTimeout()
    {
        if ( timeout != null )
        {
            timeout.cancel();
        }
        timeoutAbsoluteMillis = 0;
    }

    /**
     * Sends the next batch of at most {@code catchupBatchSize} entries following {@code matchIndex}.
     * Nothing is sent when the local log has no entries beyond {@code matchIndex}.
     *
     * @param leaderContext the leader state to ship with
     * @return {@code true} if the batch reached the local append index, or if there was nothing to send
     */
    private boolean sendNextBatchAfterMatch( LeaderContext leaderContext )
    {
        long lastIndex = raftLog.appendIndex();

        if ( lastIndex > matchIndex )
        {
            long endIndex = min( lastIndex, matchIndex + catchupBatchSize );

            scheduleTimeout( retryTimeMillis );
            sendRange( matchIndex + 1, endIndex, leaderContext );
            return endIndex == lastIndex;
        }
        else
        {
            return true;
        }
    }

    /**
     * Sends a heartbeat carrying the commit index of the given context to the follower.
     *
     * @param leaderContext the leader state carrying the term and commit index
     */
    private void sendCommitUpdate( LeaderContext leaderContext )
    {
        /*
         * This is a commit update. That means that we just received enough success responses to an append
         * request to allow us to send a commit. By Raft invariants, this means that the term for the committed
         * entry is the current term.
         */
        RaftMessages.Heartbeat<MEMBER> heartbeat =
                new RaftMessages.Heartbeat<>( leader, leaderContext.term, leaderContext.commitIndex,
                        leaderContext.term );

        outbound.send( follower, heartbeat );
    }

    /**
     * Sends exactly one entry and (re)schedules the resend timeout. The requested index is raised, if
     * necessary, to the first index after the log's {@code prevIndex()}.
     *
     * @param logIndex index of the entry to send
     * @param leaderContext the leader state to ship with
     */
    private void sendSingle( long logIndex, LeaderContext leaderContext )
    {
        logIndex = max( raftLog.prevIndex() + 1, logIndex );

        scheduleTimeout( retryTimeMillis );

        sendRange( logIndex, logIndex, leaderContext );
    }

    /**
     * Ships the given entries as-is in a single append request, (re)schedules the resend timeout and
     * records {@code prevLogIndex + 1} as the last sent index.
     *
     * @param prevLogIndex index of the entry preceding the new entries
     * @param prevLogTerm term of the entry preceding the new entries
     * @param newEntries the entries to ship
     * @param leaderContext the leader state to ship with
     */
    private void sendNewEntries( long prevLogIndex, long prevLogTerm, RaftLogEntry[] newEntries,
                                 LeaderContext leaderContext )
    {
        scheduleTimeout( retryTimeMillis );

        lastSentIndex = prevLogIndex + 1;

        RaftMessages.AppendEntries.Request<MEMBER> appendRequest = new RaftMessages.AppendEntries.Request<>(
                leader, leaderContext.term, prevLogIndex, prevLogTerm, newEntries, leaderContext.commitIndex
        );

        outbound.send( follower, appendRequest );
    }

    /**
     * Reads the entries in {@code [startIndex, endIndex]} and ships them in one append request.
     * <p>
     * {@code lastSentIndex} is set to {@code endIndex} before anything is read, so it is updated even when
     * the send is subsequently aborted. The send is aborted when the preceding entry or any entry in the
     * range has a term greater than the leader context's term. When exactly one of the preceding index and
     * its term is {@code -1}, a {@code LogCompactionInfo} message is sent instead. An {@link IOException}
     * while reading is logged and not propagated.
     *
     * @param startIndex index of the first entry to send (inclusive)
     * @param endIndex index of the last entry to send (inclusive); an empty range is a no-op
     * @param leaderContext the leader state to ship with
     */
    private void sendRange( long startIndex, long endIndex, LeaderContext leaderContext )
    {
        if ( startIndex > endIndex )
        {
            return;
        }

        lastSentIndex = endIndex;

        try
        {
            int batchSize = (int) (endIndex - startIndex + 1);
            RaftLogEntry[] entries = new RaftLogEntry[batchSize];

            long prevLogIndex = startIndex - 1;
            long prevLogTerm = raftLog.readEntryTerm( prevLogIndex );

            if ( prevLogTerm > leaderContext.term )
            {
                log.warn( "%s aborting send. Not leader anymore? %s, prevLogTerm=%d",
                        statusAsString(), leaderContext, prevLogTerm );
                return;
            }

            // Index and term of the preceding entry disagree about whether that entry exists.
            if ( (prevLogIndex == -1 && prevLogTerm != -1) || (prevLogTerm == -1 && prevLogIndex != -1) )
            {
                log.warn( "%s aborting append entry request since someone has pruned away the entries we needed. " +
                                "Sending a LogCompactionInfo instead. Leader context=%s, prevLogTerm=%d",
                        statusAsString(), leaderContext, prevLogTerm );
                outbound.send( follower, new RaftMessages.LogCompactionInfo<>( leader, leaderContext.term,
                        prevLogIndex ) );
                return;
            }

            // The request holds a reference to the array, which is filled in below before sending.
            RaftMessages.AppendEntries.Request<MEMBER> appendRequest =
                    new RaftMessages.AppendEntries.Request<>( leader, leaderContext.term, prevLogIndex, prevLogTerm,
                            entries, leaderContext.commitIndex );

            try ( InFlightLogEntrySupplier logEntrySupplier = new InFlightLogEntrySupplier( raftLog, inFlightMap ) )
            {
                for ( int offset = 0; offset < batchSize; offset++ )
                {
                    entries[offset] = logEntrySupplier.get( startIndex + offset );
                    if ( entries[offset].term() > leaderContext.term )
                    {
                        log.warn( "%s aborting send. Not leader anymore? %s, entryTerm=%d",
                                statusAsString(), leaderContext, entries[offset].term() );
                        return;
                    }
                }
            }

            outbound.send( follower, appendRequest );
        }
        catch ( IOException e )
        {
            log.warn( statusAsString() + " exception during batch send", e );
        }
    }

    /**
     * @return a one-line summary of the follower, match index, last sent index, local append index and
     * mode, for use in log messages
     */
    private String statusAsString()
    {
        return format( "%s[matchIndex: %d, lastSentIndex: %d, localAppendIndex: %d, mode: %s]", follower, matchIndex,
                lastSentIndex, raftLog.appendIndex(), mode );
    }
}