diff --git a/Cargo.lock b/Cargo.lock index 01269091dd..56dbf7ba89 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -468,9 +468,9 @@ checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" [[package]] name = "digest" -version = "0.10.3" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506" +checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c" dependencies = [ "block-buffer", "crypto-common", @@ -754,6 +754,7 @@ dependencies = [ "bytes", "chrono", "config", + "digest", "git2", "hex", "insta", diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 5e3365b142..026ac561fc 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -24,6 +24,7 @@ bytes = "1.2.1" byteorder = "1.4.3" chrono = { version = "0.4.22", default-features = false, features = ["std", "clock"] } config = { version = "0.13.2", default-features = false, features = ["toml"] } +digest = "0.10.5" git2 = "0.15.0" hex = "0.4.3" itertools = "0.10.5" diff --git a/lib/src/backend.rs b/lib/src/backend.rs index 08cd46fbed..b3a323adad 100644 --- a/lib/src/backend.rs +++ b/lib/src/backend.rs @@ -22,8 +22,10 @@ use thiserror::Error; use crate::repo_path::{RepoPath, RepoPathComponent}; -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] -pub struct CommitId(Vec); +content_hash! { + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] + pub struct CommitId(Vec); +} impl Debug for CommitId { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { @@ -225,14 +227,18 @@ pub enum Phase { Draft, } -#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)] -pub struct MillisSinceEpoch(pub i64); +content_hash! { + #[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)] + pub struct MillisSinceEpoch(pub i64); +} -#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)] -pub struct Timestamp { - pub timestamp: MillisSinceEpoch, - // time zone offset in minutes - pub tz_offset: i32, +content_hash! { + #[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)] + pub struct Timestamp { + pub timestamp: MillisSinceEpoch, + // time zone offset in minutes + pub tz_offset: i32, + } } impl Timestamp { diff --git a/lib/src/content_hash.rs b/lib/src/content_hash.rs new file mode 100644 index 0000000000..9d228f0c14 --- /dev/null +++ b/lib/src/content_hash.rs @@ -0,0 +1,165 @@ +use itertools::Itertools as _; + +pub fn hash(x: &impl ContentHash) -> digest::Output { + let mut hasher = T::new(); + x.hash(&mut hasher); + hasher.finalize() +} + +/// Portable, stable hashing suitable for identifying values +/// +/// Variable-length sequences should hash a 64-bit little-endian representation of their length, +/// then their elements in order. Unordered containers should order their elements according to +/// their `Ord` implementation. Enums should hash a 32-bit little-endian encoding of the ordinal +/// number of the enum variant, then the variant's fields in lexical order. +pub trait ContentHash { + fn hash(&self, state: &mut impl digest::Update); +} + +impl ContentHash for u8 { + fn hash(&self, state: &mut impl digest::Update) { + state.update(&[*self]); + } +} + +impl ContentHash for i32 { + fn hash(&self, state: &mut impl digest::Update) { + state.update(&self.to_le_bytes()); + } +} + +impl ContentHash for i64 { + fn hash(&self, state: &mut impl digest::Update) { + state.update(&self.to_le_bytes()); + } +} + +// TODO: Specialize for [u8] once specialization exists +impl ContentHash for [T] { + fn hash(&self, state: &mut impl digest::Update) { + state.update(&(self.len() as u64).to_le_bytes()); + for x in self { + x.hash(state); + } + } +} + +impl ContentHash for Vec { + fn hash(&self, state: &mut impl digest::Update) { + self.as_slice().hash(state) + } +} + +impl ContentHash for String { + fn hash(&self, state: &mut impl digest::Update) { + self.as_bytes().hash(state); + } +} + +impl ContentHash for Option { + fn hash(&self, state: &mut impl digest::Update) { + match *self { + None => state.update(&[0]), + Some(ref x) => { + state.update(&[1]); + x.hash(state) + } + } + } +} + +impl ContentHash for std::collections::HashMap +where + K: ContentHash + Ord, + V: ContentHash + Ord, +{ + fn hash(&self, state: &mut impl digest::Update) { + state.update(&(self.len() as u64).to_le_bytes()); + for (k, v) in self.iter().sorted() { + k.hash(state); + v.hash(state); + } + } +} + +impl ContentHash for std::collections::HashSet +where + K: ContentHash + Ord, +{ + fn hash(&self, state: &mut impl digest::Update) { + state.update(&(self.len() as u64).to_le_bytes()); + for k in self.iter().sorted() { + k.hash(state); + } + } +} + +impl ContentHash for std::collections::BTreeMap +where + K: ContentHash, + V: ContentHash, +{ + fn hash(&self, state: &mut impl digest::Update) { + state.update(&(self.len() as u64).to_le_bytes()); + for (k, v) in self.iter() { + k.hash(state); + v.hash(state); + } + } +} + +macro_rules! content_hash { + ($(#[$meta:meta])* $vis:vis struct $name:ident { + $($(#[$field_meta:meta])* $field_vis:vis $field:ident : $ty:ty),* $(,)? + }) => { + $(#[$meta])* + $vis struct $name { + $($(#[$field_meta])* $field_vis $field : $ty),* + } + + impl crate::content_hash::ContentHash for $name { + fn hash(&self, state: &mut impl digest::Update) { + $(<$ty as crate::content_hash::ContentHash>::hash(&self.$field, state);)* + } + } + }; + ($(#[$meta:meta])* $vis:vis struct $name:ident($field_vis:vis $ty:ty);) => { + $(#[$meta])* + $vis struct $name($field_vis $ty); + + impl crate::content_hash::ContentHash for $name { + fn hash(&self, state: &mut impl digest::Update) { + <$ty as crate::content_hash::ContentHash>::hash(&self.0, state); + } + } + }; +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use blake2::{Blake2b512, Digest}; + + use super::*; + + #[test] + fn hash_map_uniqueness() { + let a = [("ab".to_string(), "cd".to_string())] + .into_iter() + .collect::>(); + let b = [("a".to_string(), "bcd".to_string())] + .into_iter() + .collect::>(); + + let mut hasher = Blake2b512::default(); + a.hash(&mut hasher); + let hash_a = hasher.finalize(); + + let mut hasher = Blake2b512::default(); + b.hash(&mut hasher); + let hash_b = hasher.finalize(); + + assert_ne!(hash_a, hash_b); + } +} diff --git a/lib/src/lib.rs b/lib/src/lib.rs index a705396763..612285aa5e 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -14,6 +14,9 @@ #![deny(unused_must_use)] +#[macro_use] +mod content_hash; + pub mod backend; pub mod commit; pub mod commit_builder; diff --git a/lib/src/op_store.rs b/lib/src/op_store.rs index 3d74c7f555..1e7387fe6c 100644 --- a/lib/src/op_store.rs +++ b/lib/src/op_store.rs @@ -18,9 +18,12 @@ use std::fmt::{Debug, Error, Formatter}; use thiserror::Error; use crate::backend::{CommitId, Timestamp}; +use crate::content_hash::ContentHash; -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] -pub struct WorkspaceId(String); +content_hash! { + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] + pub struct WorkspaceId(String); +} impl Debug for WorkspaceId { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { @@ -44,8 +47,10 @@ impl WorkspaceId { } } -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] -pub struct ViewId(Vec); +content_hash! { + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] + pub struct ViewId(Vec); +} impl Debug for ViewId { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { @@ -75,8 +80,10 @@ impl ViewId { } } -#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] -pub struct OperationId(Vec); +content_hash! { + #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] + pub struct OperationId(Vec); +} impl Debug for OperationId { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> { @@ -115,6 +122,26 @@ pub enum RefTarget { }, } +impl ContentHash for RefTarget { + fn hash(&self, state: &mut impl digest::Update) { + use RefTarget::*; + match *self { + Normal(ref id) => { + state.update(&0u32.to_le_bytes()); + id.hash(state); + } + Conflict { + ref removes, + ref adds, + } => { + state.update(&1u32.to_le_bytes()); + removes.hash(state); + adds.hash(state); + } + } + } +} + impl RefTarget { pub fn is_conflict(&self) -> bool { matches!(self, RefTarget::Conflict { .. }) @@ -146,67 +173,75 @@ impl RefTarget { } } -#[derive(Default, PartialEq, Eq, Clone, Debug)] -pub struct BranchTarget { - /// The commit the branch points to locally. `None` if the branch has been - /// deleted locally. - pub local_target: Option, - // TODO: Do we need to support tombstones for remote branches? For example, if the branch - // has been deleted locally and you pull from a remote, maybe it should make a difference - // whether the branch is known to have existed on the remote. We may not want to resurrect - // the branch if the branch's state on the remote was just not known. - pub remote_targets: BTreeMap, -} - -/// Represents the way the repo looks at a given time, just like how a Tree -/// object represents how the file system looks at a given time. -#[derive(PartialEq, Eq, Clone, Debug, Default)] -pub struct View { - /// All head commits - pub head_ids: HashSet, - /// Heads of the set of public commits. - pub public_head_ids: HashSet, - pub branches: BTreeMap, - pub tags: BTreeMap, - pub git_refs: BTreeMap, - /// The commit the Git HEAD points to. - // TODO: Support multiple Git worktrees? - // TODO: Do we want to store the current branch name too? - pub git_head: Option, - // The commit that *should be* checked out in the workspace. Note that the working copy - // (.jj/working_copy/) has the source of truth about which commit *is* checked out (to be - // precise: the commit to which we most recently completed an update to). - pub wc_commit_ids: HashMap, -} - -/// Represents an operation (transaction) on the repo view, just like how a -/// Commit object represents an operation on the tree. -/// -/// Operations and views are not meant to be exchanged between repos or users; -/// they represent local state and history. -/// -/// The operation history will almost always be linear. It will only have -/// forks when parallel operations occurred. The parent is determined when -/// the transaction starts. When the transaction commits, a lock will be -/// taken and it will be checked that the current head of the operation -/// graph is unchanged. If the current head has changed, there has been -/// concurrent operation. -#[derive(PartialEq, Eq, Clone, Debug)] -pub struct Operation { - pub view_id: ViewId, - pub parents: Vec, - pub metadata: OperationMetadata, +content_hash! { + #[derive(Default, PartialEq, Eq, Clone, Debug)] + pub struct BranchTarget { + /// The commit the branch points to locally. `None` if the branch has been + /// deleted locally. + pub local_target: Option, + // TODO: Do we need to support tombstones for remote branches? For example, if the branch + // has been deleted locally and you pull from a remote, maybe it should make a difference + // whether the branch is known to have existed on the remote. We may not want to resurrect + // the branch if the branch's state on the remote was just not known. + pub remote_targets: BTreeMap, + } } -#[derive(PartialEq, Eq, Clone, Debug)] -pub struct OperationMetadata { - pub start_time: Timestamp, - pub end_time: Timestamp, - // Whatever is useful to the user, such as exact command line call - pub description: String, - pub hostname: String, - pub username: String, - pub tags: HashMap, +content_hash! { + /// Represents the way the repo looks at a given time, just like how a Tree + /// object represents how the file system looks at a given time. + #[derive(PartialEq, Eq, Clone, Debug, Default)] + pub struct View { + /// All head commits + pub head_ids: HashSet, + /// Heads of the set of public commits. + pub public_head_ids: HashSet, + pub branches: BTreeMap, + pub tags: BTreeMap, + pub git_refs: BTreeMap, + /// The commit the Git HEAD points to. + // TODO: Support multiple Git worktrees? + // TODO: Do we want to store the current branch name too? + pub git_head: Option, + // The commit that *should be* checked out in the workspace. Note that the working copy + // (.jj/working_copy/) has the source of truth about which commit *is* checked out (to be + // precise: the commit to which we most recently completed an update to). + pub wc_commit_ids: HashMap, + } +} + +content_hash! { + /// Represents an operation (transaction) on the repo view, just like how a + /// Commit object represents an operation on the tree. + /// + /// Operations and views are not meant to be exchanged between repos or users; + /// they represent local state and history. + /// + /// The operation history will almost always be linear. It will only have + /// forks when parallel operations occurred. The parent is determined when + /// the transaction starts. When the transaction commits, a lock will be + /// taken and it will be checked that the current head of the operation + /// graph is unchanged. If the current head has changed, there has been + /// concurrent operation. + #[derive(PartialEq, Eq, Clone, Debug)] + pub struct Operation { + pub view_id: ViewId, + pub parents: Vec, + pub metadata: OperationMetadata, + } +} + +content_hash! { + #[derive(PartialEq, Eq, Clone, Debug)] + pub struct OperationMetadata { + pub start_time: Timestamp, + pub end_time: Timestamp, + // Whatever is useful to the user, such as exact command line call + pub description: String, + pub hostname: String, + pub username: String, + pub tags: HashMap, + } } impl OperationMetadata { diff --git a/lib/src/simple_op_store.rs b/lib/src/simple_op_store.rs index 372b19cff5..b4fe558512 100644 --- a/lib/src/simple_op_store.rs +++ b/lib/src/simple_op_store.rs @@ -19,12 +19,13 @@ use std::fs::File; use std::io::{ErrorKind, Write}; use std::path::PathBuf; -use blake2::{Blake2b512, Digest}; +use blake2::Blake2b512; use itertools::Itertools; use protobuf::{Message, MessageField}; use tempfile::{NamedTempFile, PersistError}; use crate::backend::{CommitId, MillisSinceEpoch, Timestamp}; +use crate::content_hash::hash; use crate::file_util::persist_content_addressed_temp_file; use crate::op_store::{ BranchTarget, OpStore, OpStoreError, OpStoreResult, Operation, OperationId, OperationMetadata, @@ -100,7 +101,7 @@ impl OpStore for SimpleOpStore { temp_file.as_file().write_all(&proto_bytes)?; - let id = ViewId::new(Blake2b512::digest(&proto_bytes).to_vec()); + let id = ViewId::new(hash::(view).to_vec()); persist_content_addressed_temp_file(temp_file, self.view_path(&id))?; Ok(id) @@ -123,7 +124,7 @@ impl OpStore for SimpleOpStore { temp_file.as_file().write_all(&proto_bytes)?; - let id = OperationId::new(Blake2b512::digest(&proto_bytes).to_vec()); + let id = OperationId::new(hash::(operation).to_vec()); persist_content_addressed_temp_file(temp_file, self.operation_path(&id))?; Ok(id)