diff --git a/Cargo.lock b/Cargo.lock
index 824c4b329e..56dbf7ba89 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -468,9 +468,9 @@ checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8"
 
 [[package]]
 name = "digest"
-version = "0.10.3"
+version = "0.10.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506"
+checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c"
 dependencies = [
   "block-buffer",
   "crypto-common",
diff --git a/lib/Cargo.toml b/lib/Cargo.toml
index 5e3365b142..026ac561fc 100644
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@@ -24,6 +24,7 @@ bytes = "1.2.1"
 byteorder = "1.4.3"
 chrono = { version = "0.4.22", default-features = false, features = ["std", "clock"] }
 config = { version = "0.13.2", default-features = false, features = ["toml"] }
+digest = "0.10.5"
 git2 = "0.15.0"
 hex = "0.4.3"
 itertools = "0.10.5"
diff --git a/lib/src/backend.rs b/lib/src/backend.rs
index 08cd46fbed..ba281d48d9 100644
--- a/lib/src/backend.rs
+++ b/lib/src/backend.rs
@@ -20,11 +20,18 @@ use std::vec::Vec;
 
 use thiserror::Error;
 
+use crate::content_hash::ContentHash;
 use crate::repo_path::{RepoPath, RepoPathComponent};
 
 #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
 pub struct CommitId(Vec<u8>);
 
+impl ContentHash for CommitId {
+    fn hash(&self, state: &mut impl digest::Update) {
+        self.0.hash(state);
+    }
+}
+
 impl Debug for CommitId {
     fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
         f.debug_tuple("CommitId").field(&self.hex()).finish()
@@ -228,11 +235,19 @@ pub enum Phase {
 #[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)]
 pub struct MillisSinceEpoch(pub i64);
 
-#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)]
-pub struct Timestamp {
-    pub timestamp: MillisSinceEpoch,
-    // time zone offset in minutes
-    pub tz_offset: i32,
+impl ContentHash for MillisSinceEpoch {
+    fn hash(&self, state: &mut impl digest::Update) {
+        self.0.hash(state);
+    }
+}
+
+content_hash! {
+    #[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)]
+    pub struct Timestamp {
+        pub timestamp: MillisSinceEpoch,
+        // time zone offset in minutes
+        pub tz_offset: i32,
+    }
 }
 
 impl Timestamp {
diff --git a/lib/src/content_hash.rs b/lib/src/content_hash.rs
new file mode 100644
index 0000000000..2ccecdbbb6
--- /dev/null
+++ b/lib/src/content_hash.rs
@@ -0,0 +1,133 @@
+use itertools::Itertools as _;
+
+/// Portable, stable hashing suitable for identifying values
+pub trait ContentHash {
+    fn hash(&self, state: &mut impl digest::Update);
+}
+
+impl ContentHash for u8 {
+    fn hash(&self, state: &mut impl digest::Update) {
+        state.update(&[*self]);
+    }
+}
+
+impl ContentHash for i32 {
+    fn hash(&self, state: &mut impl digest::Update) {
+        state.update(&self.to_le_bytes());
+    }
+}
+
+impl ContentHash for i64 {
+    fn hash(&self, state: &mut impl digest::Update) {
+        state.update(&self.to_le_bytes());
+    }
+}
+
+// TODO: Specialize for [u8] once specialization exists
+impl<T: ContentHash> ContentHash for [T] {
+    fn hash(&self, state: &mut impl digest::Update) {
+        state.update(&(self.len() as u64).to_le_bytes());
+        for x in self {
+            x.hash(state);
+        }
+    }
+}
+
+impl<T: ContentHash> ContentHash for Vec<T> {
+    fn hash(&self, state: &mut impl digest::Update) {
+        self.as_slice().hash(state)
+    }
+}
+
+impl ContentHash for String {
+    fn hash(&self, state: &mut impl digest::Update) {
+        self.as_bytes().hash(state);
+    }
+}
+
+impl<T: ContentHash> ContentHash for Option<T> {
+    fn hash(&self, state: &mut impl digest::Update) {
+        match *self {
+            None => state.update(&[0]),
+            Some(ref x) => {
+                state.update(&[1]);
+                x.hash(state)
+            }
+        }
+    }
+}
+
+impl<K, V> ContentHash for std::collections::HashMap<K, V>
+where
+    K: ContentHash + Ord,
+    V: ContentHash + Ord,
+{
+    fn hash(&self, state: &mut impl digest::Update) {
+        state.update(&(self.len() as u64).to_le_bytes());
+        for (k, v) in self.iter().sorted() {
+            k.hash(state);
+            v.hash(state);
+        }
+    }
+}
+
+impl<K> ContentHash for std::collections::HashSet<K>
+where
+    K: ContentHash + Ord,
+{
+    fn hash(&self, state: &mut impl digest::Update) {
+        state.update(&(self.len() as u64).to_le_bytes());
+        for k in self.iter().sorted() {
+            k.hash(state);
+        }
+    }
+}
+
+impl<K, V> ContentHash for std::collections::BTreeMap<K, V>
+where
+    K: ContentHash,
+    V: ContentHash,
+{
+    fn hash(&self, state: &mut impl digest::Update) {
+        state.update(&(self.len() as u64).to_le_bytes());
+        for (k, v) in self.iter() {
+            k.hash(state);
+            v.hash(state);
+        }
+    }
+}
+
+macro_rules! content_hash {
+    ($(#[$meta:meta])* $vis:vis struct $name:ident { $($(#[$field_meta:meta])* $field_vis:vis $field:ident : $ty:ty),* $(,)? }) => {
+        $(#[$meta])*
+        $vis struct $name {
+            $($(#[$field_meta])* $field_vis $field : $ty),*
+        }
+
+        impl crate::content_hash::ContentHash for $name {
+            fn hash(&self, state: &mut impl digest::Update) {
+                $(<$ty as crate::content_hash::ContentHash>::hash(&self.$field, state);)*
+            }
+        }
+    };
+    ($(#[$meta:meta])* $vis:vis enum $name:ident { $($variant:ident { $($field:ident : $ty:ty),* $(,)? }),* $(,)? }) => {
+        $(#[$meta])*
+        $vis enum $name {
+            $($variant { $($field : $ty),* }),*
+        }
+
+        impl crate::content_hash::ContentHash for $name {
+            fn hash(&self, state: &mut impl digest::Update) {
+                let mut counter: u32 = 0;
+                $(
+                    if let Self::$variant { $(ref $field,)* } = *self {
+                        state.update(&counter.to_le_bytes());
+                        $(<$ty as crate::content_hash::ContentHash>::hash($field, state);)*
+                    }
+                    counter += 1;
+                )*
+                _ = counter;
+            }
+        }
+    };
+}
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index a705396763..612285aa5e 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -14,6 +14,9 @@
 
 #![deny(unused_must_use)]
 
+#[macro_use]
+mod content_hash;
+
 pub mod backend;
 pub mod commit;
 pub mod commit_builder;
diff --git a/lib/src/op_store.rs b/lib/src/op_store.rs
index 707ba487ac..8b7fded6b7 100644
--- a/lib/src/op_store.rs
+++ b/lib/src/op_store.rs
@@ -18,10 +18,17 @@ use std::fmt::{Debug, Error, Formatter};
 use thiserror::Error;
 
 use crate::backend::{CommitId, Timestamp};
+use crate::content_hash::ContentHash;
 
 #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
 pub struct WorkspaceId(String);
 
+impl ContentHash for WorkspaceId {
+    fn hash(&self, state: &mut impl digest::Update) {
+        self.0.hash(state);
+    }
+}
+
 impl Debug for WorkspaceId {
     fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
         f.debug_tuple("WorkspaceId").field(&self.0).finish()
@@ -47,6 +54,12 @@ impl WorkspaceId {
 #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
 pub struct ViewId(Vec<u8>);
 
+impl ContentHash for ViewId {
+    fn hash(&self, state: &mut impl digest::Update) {
+        self.0.hash(state);
+    }
+}
+
 impl Debug for ViewId {
     fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
         f.debug_tuple("ViewId").field(&self.hex()).finish()
@@ -78,6 +91,12 @@ impl ViewId {
 #[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
 pub struct OperationId(Vec<u8>);
 
+impl ContentHash for OperationId {
+    fn hash(&self, state: &mut impl digest::Update) {
+        self.0.hash(state);
+    }
+}
+
 impl Debug for OperationId {
     fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
         f.debug_tuple("OperationId").field(&self.hex()).finish()
@@ -106,13 +125,15 @@ impl OperationId {
     }
 }
 
-#[derive(PartialEq, Eq, Clone, Debug)]
-pub enum RefTarget {
-    Normal { commit: CommitId },
-    Conflict {
-        removes: Vec<CommitId>,
-        adds: Vec<CommitId>,
-    },
+content_hash! {
+    #[derive(PartialEq, Eq, Clone, Debug)]
+    pub enum RefTarget {
+        Normal { commit: CommitId },
+        Conflict {
+            removes: Vec<CommitId>,
+            adds: Vec<CommitId>,
+        },
+    }
 }
 
 impl RefTarget {
@@ -146,67 +167,75 @@ impl RefTarget {
     }
 }
 
-#[derive(Default, PartialEq, Eq, Clone, Debug)]
-pub struct BranchTarget {
-    /// The commit the branch points to locally. `None` if the branch has been
-    /// deleted locally.
-    pub local_target: Option<RefTarget>,
-    // TODO: Do we need to support tombstones for remote branches? For example, if the branch
-    // has been deleted locally and you pull from a remote, maybe it should make a difference
-    // whether the branch is known to have existed on the remote. We may not want to resurrect
-    // the branch if the branch's state on the remote was just not known.
-    pub remote_targets: BTreeMap<String, RefTarget>,
-}
-
-/// Represents the way the repo looks at a given time, just like how a Tree
-/// object represents how the file system looks at a given time.
-#[derive(PartialEq, Eq, Clone, Debug, Default)]
-pub struct View {
-    /// All head commits
-    pub head_ids: HashSet<CommitId>,
-    /// Heads of the set of public commits.
-    pub public_head_ids: HashSet<CommitId>,
-    pub branches: BTreeMap<String, BranchTarget>,
-    pub tags: BTreeMap<String, RefTarget>,
-    pub git_refs: BTreeMap<String, RefTarget>,
-    /// The commit the Git HEAD points to.
-    // TODO: Support multiple Git worktrees?
-    // TODO: Do we want to store the current branch name too?
-    pub git_head: Option<CommitId>,
-    // The commit that *should be* checked out in the workspace. Note that the working copy
-    // (.jj/working_copy/) has the source of truth about which commit *is* checked out (to be
-    // precise: the commit to which we most recently completed an update to).
-    pub wc_commit_ids: HashMap<WorkspaceId, CommitId>,
-}
-
-/// Represents an operation (transaction) on the repo view, just like how a
-/// Commit object represents an operation on the tree.
-///
-/// Operations and views are not meant to be exchanged between repos or users;
-/// they represent local state and history.
-///
-/// The operation history will almost always be linear. It will only have
-/// forks when parallel operations occurred. The parent is determined when
-/// the transaction starts. When the transaction commits, a lock will be
-/// taken and it will be checked that the current head of the operation
-/// graph is unchanged. If the current head has changed, there has been
-/// concurrent operation.
-#[derive(PartialEq, Eq, Clone, Debug)]
-pub struct Operation {
-    pub view_id: ViewId,
-    pub parents: Vec<OperationId>,
-    pub metadata: OperationMetadata,
-}
-
-#[derive(PartialEq, Eq, Clone, Debug)]
-pub struct OperationMetadata {
-    pub start_time: Timestamp,
-    pub end_time: Timestamp,
-    // Whatever is useful to the user, such as exact command line call
-    pub description: String,
-    pub hostname: String,
-    pub username: String,
-    pub tags: HashMap<String, String>,
+content_hash! {
+    #[derive(Default, PartialEq, Eq, Clone, Debug)]
+    pub struct BranchTarget {
+        /// The commit the branch points to locally. `None` if the branch has been
+        /// deleted locally.
+        pub local_target: Option<RefTarget>,
+        // TODO: Do we need to support tombstones for remote branches? For example, if the branch
+        // has been deleted locally and you pull from a remote, maybe it should make a difference
+        // whether the branch is known to have existed on the remote. We may not want to resurrect
+        // the branch if the branch's state on the remote was just not known.
+        pub remote_targets: BTreeMap<String, RefTarget>,
+    }
+}
+
+content_hash! {
+    /// Represents the way the repo looks at a given time, just like how a Tree
+    /// object represents how the file system looks at a given time.
+    #[derive(PartialEq, Eq, Clone, Debug, Default)]
+    pub struct View {
+        /// All head commits
+        pub head_ids: HashSet<CommitId>,
+        /// Heads of the set of public commits.
+        pub public_head_ids: HashSet<CommitId>,
+        pub branches: BTreeMap<String, BranchTarget>,
+        pub tags: BTreeMap<String, RefTarget>,
+        pub git_refs: BTreeMap<String, RefTarget>,
+        /// The commit the Git HEAD points to.
+        // TODO: Support multiple Git worktrees?
+        // TODO: Do we want to store the current branch name too?
+        pub git_head: Option<CommitId>,
+        // The commit that *should be* checked out in the workspace. Note that the working copy
+        // (.jj/working_copy/) has the source of truth about which commit *is* checked out (to be
+        // precise: the commit to which we most recently completed an update to).
+        pub wc_commit_ids: HashMap<WorkspaceId, CommitId>,
+    }
+}
+
+content_hash! {
+    /// Represents an operation (transaction) on the repo view, just like how a
+    /// Commit object represents an operation on the tree.
+    ///
+    /// Operations and views are not meant to be exchanged between repos or users;
+    /// they represent local state and history.
+    ///
+    /// The operation history will almost always be linear. It will only have
+    /// forks when parallel operations occurred. The parent is determined when
+    /// the transaction starts. When the transaction commits, a lock will be
+    /// taken and it will be checked that the current head of the operation
+    /// graph is unchanged. If the current head has changed, there has been
+    /// concurrent operation.
+    #[derive(PartialEq, Eq, Clone, Debug)]
+    pub struct Operation {
+        pub view_id: ViewId,
+        pub parents: Vec<OperationId>,
+        pub metadata: OperationMetadata,
+    }
+}
+
+content_hash! {
+    #[derive(PartialEq, Eq, Clone, Debug)]
+    pub struct OperationMetadata {
+        pub start_time: Timestamp,
+        pub end_time: Timestamp,
+        // Whatever is useful to the user, such as exact command line call
+        pub description: String,
+        pub hostname: String,
+        pub username: String,
+        pub tags: HashMap<String, String>,
+    }
 }
 
 impl OperationMetadata {
diff --git a/lib/src/simple_op_store.rs b/lib/src/simple_op_store.rs
index 15a206a3ea..e42bbc8111 100644
--- a/lib/src/simple_op_store.rs
+++ b/lib/src/simple_op_store.rs
@@ -25,6 +25,7 @@ use protobuf::{Message, MessageField};
 use tempfile::{NamedTempFile, PersistError};
 
 use crate::backend::{CommitId, MillisSinceEpoch, Timestamp};
+use crate::content_hash::ContentHash;
 use crate::file_util::persist_content_addressed_temp_file;
 use crate::op_store::{
     BranchTarget, OpStore, OpStoreError, OpStoreResult, Operation, OperationId, OperationMetadata,
@@ -100,7 +101,9 @@ impl OpStore for SimpleOpStore {
 
         temp_file.as_file().write_all(&proto_bytes)?;
 
-        let id = ViewId::new(Blake2b512::digest(&proto_bytes).to_vec());
+        let mut hasher = Blake2b512::default();
+        view.hash(&mut hasher);
+        let id = ViewId::new(hasher.finalize().to_vec());
 
         persist_content_addressed_temp_file(temp_file, self.view_path(&id))?;
         Ok(id)
@@ -123,7 +126,9 @@ impl OpStore for SimpleOpStore {
 
         temp_file.as_file().write_all(&proto_bytes)?;
 
-        let id = OperationId::new(Blake2b512::digest(&proto_bytes).to_vec());
+        let mut hasher = Blake2b512::default();
+        operation.hash(&mut hasher);
+        let id = OperationId::new(hasher.finalize().to_vec());
 
         persist_content_addressed_temp_file(temp_file, self.operation_path(&id))?;
         Ok(id)