Recovery #37

Merged · 33 commits · Nov 13, 2017

Commits
d1d7914
Initial recovery implementation
ekmartin Oct 18, 2017
0a8e44b
Handle recovery in domains
ekmartin Oct 18, 2017
976c1a9
Add helper function for log paths
ekmartin Oct 18, 2017
59b889e
Add a recovery test
ekmartin Oct 18, 2017
5707c08
Remove sleep after .recovery() calls
ekmartin Oct 18, 2017
4de6523
Handle transactional log entries
ekmartin Oct 22, 2017
ce3f847
Use group_commit_queues in handle_recovery
ekmartin Oct 22, 2017
d69815c
Move recovery log to persistence
ekmartin Oct 22, 2017
a4003ef
Use a cow for records
ekmartin Oct 22, 2017
8889c75
Add tcp support to retrieve_recovery_packets
ekmartin Oct 22, 2017
455ddee
Ignore all log files
ekmartin Oct 22, 2017
8335cb4
Use is_transactional instead of LogEntry.packet_type
ekmartin Oct 22, 2017
b81f15f
Move retrieve_recovery_packets out of GroupCommitQueueSet
ekmartin Oct 22, 2017
2f95ebb
Don't use path.exists()
ekmartin Oct 22, 2017
5971ca3
Start base node recoveries in parallel
ekmartin Oct 22, 2017
a71c73c
Ignore JSON parsing errors during recovery
ekmartin Oct 24, 2017
1ff66e9
Merge recovery packets into batches
ekmartin Oct 25, 2017
7486556
Add a checktable method for recovery
ekmartin Oct 25, 2017
4bacd40
Only set timestamp for tracked columns
ekmartin Oct 27, 2017
6daa9d4
Move retrieve_recovery_packets into handle_recovery
ekmartin Nov 6, 2017
f79d6ef
Remove unused PacketType enum
ekmartin Nov 6, 2017
57130b3
Fix ControllerBuilder compatibility
ekmartin Nov 7, 2017
ce619f9
Remove indices vec in recover()
ekmartin Nov 8, 2017
5fed8cd
Properly chunk based on individual records
ekmartin Nov 8, 2017
25c6425
Add sleep() after recovery for Travis
ekmartin Nov 8, 2017
768e968
Add a warning for non-existant log files
ekmartin Nov 8, 2017
0084c57
Add a recovery test with multiple nodes
ekmartin Nov 10, 2017
5a2d135
Suffix test log files with a timestamp
ekmartin Nov 10, 2017
3b76e91
Drop original graph before recovering in tests
ekmartin Nov 10, 2017
03774c5
Clean up persistent log files using Drop in tests
ekmartin Nov 11, 2017
4a5dad0
Remove index.clone() and use expect() for BufReader
ekmartin Nov 12, 2017
a618597
Use SystemTime instead of time
ekmartin Nov 12, 2017
8126b20
Recover each node synchronously
ekmartin Nov 12, 2017
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,4 +1,4 @@
soup-log-*.json
*-log-*.json
*.png
*.log
plotting/*.png
7 changes: 7 additions & 0 deletions Cargo.lock

Generated lockfile; diff not shown.

3 changes: 2 additions & 1 deletion Cargo.toml
@@ -14,7 +14,7 @@ binaries = ["default"]
generate_mysql_tests = ["default"]

[dependencies]
arccstr = "1.0.2"
arccstr = "1.0.3"
arrayvec = "0.4.0"
bincode = "0.9.0"
buf_redux = "0.6.1"
@@ -78,6 +78,7 @@ default-features = false
backtrace = { version = "0.3.2", features = ["serialize-serde"] }
toml = "0.4.1"
diff = "0.1.10"
glob = "0.2.11"

[profile.release]
debug=true
8 changes: 6 additions & 2 deletions benchmarks/vote/vote-server.rs
@@ -67,8 +67,12 @@ fn main() {

println!("Attempting to start soup on {}", addr);

let persistence_params =
distributary::PersistenceParameters::new(durability, 512, time::Duration::from_millis(1));
let persistence_params = distributary::PersistenceParameters::new(
durability,
512,
time::Duration::from_millis(1),
Some(String::from("vote")),
);

let sock_addr: SocketAddr = addr.parse()
.expect("ADDR must be a valid HOST:PORT combination");
7 changes: 6 additions & 1 deletion benchmarks/vote/vote.rs
@@ -249,7 +249,12 @@ fn main() {
DurabilityMode::MemoryOnly
};

let persistence_params = PersistenceParameters::new(mode, queue_length, flush_timeout);
let persistence_params = PersistenceParameters::new(
mode,
queue_length,
flush_timeout,
Some(String::from("vote")),
);

// setup db
let mut s = graph::Setup::default();
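Both benchmark call sites now pass a fourth argument, an Option<String> that appears to act as a prefix for persistent log filenames (compare the .gitignore change from soup-log-*.json to *-log-*.json). The updated constructor itself is not part of this section, so the following is only a sketch under assumed names: the parameter name log_prefix and the field layout are placeholders, with only the argument order and types taken from the call sites above.

use std::time;

// Sketch only: field names such as `queue_capacity` and `log_prefix` are
// placeholders; only the argument order and types come from the call sites
// above. `DurabilityMode` is the crate's existing enum (see
// `DurabilityMode::MemoryOnly` in vote.rs above).
pub struct PersistenceParameters {
    pub mode: DurabilityMode,
    pub queue_capacity: usize,
    pub flush_timeout: time::Duration,
    pub log_prefix: Option<String>,
}

impl PersistenceParameters {
    pub fn new(
        mode: DurabilityMode,
        queue_capacity: usize,
        flush_timeout: time::Duration,
        log_prefix: Option<String>,
    ) -> Self {
        PersistenceParameters { mode, queue_capacity, flush_timeout, log_prefix }
    }
}

With a shape like this, callers that pass None would presumably fall back to a default prefix, which would explain why the ignore pattern was widened rather than replaced.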
23 changes: 23 additions & 0 deletions dataflow/src/checktable/mod.rs
@@ -285,6 +285,29 @@ impl CheckTable {
}
}

// Reserve a timestamp for the given base node, and update each column to said timestamp.
// This should be called for each batch of recovery updates.
pub fn recover(&mut self, base: NodeIndex) -> (i64, Option<Box<HashMap<domain::Index, i64>>>) {
// Take timestamp
let ts = self.next_timestamp;
self.next_timestamp += 1;

// Compute the previous timestamp that each domain will see before getting this one
let prev_times = self.compute_previous_timestamps(Some(base));

// Update checktables
self.last_base = Some(base);
self.toplevel.insert(base, ts);

let t = &mut self.granular.entry(base).or_default();
for (_column, g) in t.iter_mut() {
assert!(g.0.is_empty(), "checktable should be empty before recovery");
g.1 = ts;
}

(ts, prev_times)
}

pub fn apply_unconditional(
&mut self,
base: NodeIndex,
6 changes: 6 additions & 0 deletions dataflow/src/checktable/service.rs
@@ -29,6 +29,7 @@ pub struct TimestampReply {

service! {
rpc apply_batch(request: TimestampRequest) -> Option<TimestampReply>;
rpc recover(base: NodeIndex) -> (i64, Option<Box<HashMap<domain::Index, i64>>>);
rpc claim_replay_timestamp(tag: Tag) -> (i64, Option<Box<HashMap<domain::Index, i64>>>);
rpc track(token_generator: TokenGenerator);
rpc perform_migration(deps: HashMap<domain::Index, (IngressFromBase, EgressForBase)>)
@@ -106,4 +107,9 @@ impl FutureService for CheckTableServer {
fn validate_token(&self, token: Token) -> Self::ValidateTokenFut {
Ok(self.checktable.lock().unwrap().validate_token(&token))
}

type RecoverFut = Result<(i64, Option<Box<HashMap<domain::Index, i64>>>), Never>;
fn recover(&self, base: NodeIndex) -> Self::RecoverFut {
Ok(self.checktable.lock().unwrap().recover(base))
}
}
83 changes: 83 additions & 0 deletions dataflow/src/domain/mod.rs
@@ -5,6 +5,8 @@ use std::thread;
use std::time;
use std::collections::hash_map::Entry;
use std::rc::Rc;
use std::io::{BufRead, BufReader, ErrorKind};
use std::fs::File;

use std::net::SocketAddr;

@@ -18,6 +20,8 @@ use transactions;
use persistence;
use debug;
use checktable;
use serde_json;
use itertools::Itertools;
use slog::Logger;
use timekeeper::{RealTime, SimpleTracker, ThreadTime, Timer, TimerSet};
use tarpc::sync::client::{self, ClientExt};
@@ -30,6 +34,7 @@ pub struct Config {
}

const BATCH_SIZE: usize = 256;
const RECOVERY_BATCH_SIZE: usize = 512;
Contributor: Why not larger? We know nothing else is happening in the graph until it has recovered anyway.

Member Author: @fintelia suggested 512 - I'm not completely sure what the trade-offs are here. What do you think would be a good number?

const NANOS_PER_SEC: u64 = 1_000_000_000;
macro_rules! dur_to_ns {
@@ -803,6 +808,9 @@ impl Domain {
Packet::ReplayPiece { .. } => {
self.handle_replay(m);
}
Packet::StartRecovery { .. } => {
self.handle_recovery();
}
consumed => {
match consumed {
// workaround #16223
@@ -1566,6 +1574,81 @@ impl Domain {
}
}

fn handle_recovery(&mut self) {
let checktable = self.transaction_state.get_checktable();
let node_info: Vec<_> = self.nodes
.iter()
.map(|(index, node)| {
let n = node.borrow();
(index, n.global_addr(), n.is_transactional())
})
.collect();

for (local_addr, global_addr, is_transactional) in node_info {
let path = self.persistence_parameters.log_path(
&local_addr,
self.index,
self.shard.unwrap_or(0),
);

let file = match File::open(&path) {
Ok(f) => f,
Err(ref e) if e.kind() == ErrorKind::NotFound => {
warn!(
self.log,
"No log file found for node {}, starting out empty",
local_addr
);

continue;
}
Err(e) => panic!("Could not open log file {:?}: {}", path, e),
};

BufReader::new(file)
.lines()
.filter_map(|line| {
let line = line
.expect(&format!("Failed to read line from log file: {:?}", path));
let entries: Result<Vec<Records>, _> = serde_json::from_str(&line);
entries.ok()
})
// Parsing each individual line gives us an iterator over Vec<Records>.
// We're interested in chunking each record, so let's flat_map twice:
// Iter<Vec<Records>> -> Iter<Records> -> Iter<Record>
.flat_map(|r| r)
.flat_map(|r| r)
// Merge individual records into batches of RECOVERY_BATCH_SIZE:
.chunks(RECOVERY_BATCH_SIZE)
.into_iter()
// Then create Packet objects from the data:
.map(|chunk| {
let data: Records = chunk.collect();
let link = Link::new(local_addr, local_addr);
if is_transactional {
let (ts, prevs) = checktable.recover(global_addr).unwrap();
Contributor: Hmm, shouldn't it be possible to just claim a single timestamp for the entire recovery (across all base nodes)? @fintelia may be able to shed some light.

Member: This is a limitation of our transaction logic. Every timestamped message must originate from only a single base node. Further, every message from a base must also have its own distinct timestamp (which is why there can't be one timestamp per base either).

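// Note (from the review discussion above): each recovered batch claims its own
// timestamp here because every timestamped message must originate from a single
// base node and carry a distinct timestamp, so one timestamp cannot cover the
// whole recovery.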
Packet::Transaction {
link,
data,
tracer: None,
state: TransactionState::Committed(ts, global_addr, prevs),
}
} else {
Packet::Message {
link,
data,
tracer: None,
}
}
})
.for_each(|packet| self.handle(box packet));
Contributor: I'm pretty sure this could deadlock if you had an A-B-A domain assignment. But we already disallow that elsewhere, so probably not an issue.

}

self.control_reply_tx
.send(ControlReplyPacket::ack())
.unwrap();
}

fn handle_replay(&mut self, m: Box<Packet>) {
let tag = m.tag().unwrap();
let mut finished = None;
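handle_recovery locates each node's log through the log_path helper added in commit 976c1a9 and moved into the persistence module in d69815c; the helper itself is outside this section. Purely as an illustration, a version consistent with the call above (node address, domain index, shard) and with the widened *-log-*.json ignore pattern might look like the following; the filename layout and the simplified usize parameters are assumptions, not quotes from the codebase.

use std::path::PathBuf;

// Hypothetical sketch, not the PR's actual helper: parameter types are
// simplified to usize, and the filename layout is inferred from the
// "*-log-*.json" ignore pattern rather than taken from the real code.
impl PersistenceParameters {
    fn log_path(&self, node: usize, domain: usize, shard: usize) -> PathBuf {
        // Fall back to a generic prefix when none is configured; "soup" matches
        // the filename pattern this PR replaces in .gitignore.
        let prefix = self.log_prefix.clone().unwrap_or_else(|| String::from("soup"));
        PathBuf::from(format!("{}-log-{}-{}-{}.json", prefix, node, domain, shard))
    }
}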
3 changes: 3 additions & 0 deletions dataflow/src/payload.rs
@@ -223,6 +223,9 @@ pub enum Packet {
/// A packet used solely to drive the event loop forward.
Spin,

/// Signal that a base node's domain should start replaying logs.
StartRecovery,

// Transaction time messages
//
/// Instruct domain to flush pending transactions and notify upon completion. `prev_ts` is the