Revert "Make SnapshotSubset() faster (#9779)" #10143

Merged
6 changes: 3 additions & 3 deletions src/python/pants/backend/python/rules/run_setup_py_test.py
@@ -166,7 +166,7 @@ def test_generate_chroot(self) -> None:
"name": "foo",
"version": "1.2.3",
"package_dir": {"": "src"},
"packages": ["foo", "foo.qux", "foo.resources.js"],
"packages": ["foo", "foo.qux"],
"namespace_packages": ["foo"],
"package_data": {"foo": ["resources/js/code.js"]},
"install_requires": ["baz==1.1.1"],
@@ -293,7 +293,7 @@ def test_get_sources(self) -> None:
"foo/__init__.py",
"foo/resources/js/code.js",
],
expected_packages=["foo", "foo.bar", "foo.bar.baz", "foo.qux", "foo.resources.js"],
expected_packages=["foo", "foo.bar", "foo.bar.baz", "foo.qux"],
expected_namespace_packages=["foo.bar"],
expected_package_data={"foo": ("resources/js/code.js",)},
addrs=["src/python/foo/bar/baz:baz1", "src/python/foo/qux", "src/python/foo/resources"],
@@ -309,7 +309,7 @@ def test_get_sources(self) -> None:
"foo/__init__.py",
"foo/resources/js/code.js",
],
expected_packages=["foo", "foo.bar", "foo.bar.baz", "foo.qux", "foo.resources.js"],
expected_packages=["foo", "foo.bar", "foo.bar.baz", "foo.qux"],
expected_namespace_packages=["foo.bar"],
expected_package_data={"foo": ("resources/js/code.js",)},
addrs=[
2 changes: 0 additions & 2 deletions src/rust/engine/Cargo.lock

Some generated files are not rendered by default.

28 changes: 4 additions & 24 deletions src/rust/engine/fs/src/glob_matching.rs
@@ -4,7 +4,6 @@
use std::collections::HashSet;
use std::ffi::OsStr;
use std::fmt::Display;
-use std::iter::Iterator;
use std::path::{Component, Path, PathBuf};
use std::sync::Arc;

@@ -22,9 +21,9 @@ use crate::{

lazy_static! {
static ref PARENT_DIR: &'static str = "..";
-pub static ref SINGLE_STAR_GLOB: Pattern = Pattern::new("*").unwrap();
+static ref SINGLE_STAR_GLOB: Pattern = Pattern::new("*").unwrap();
static ref DOUBLE_STAR: &'static str = "**";
-pub static ref DOUBLE_STAR_GLOB: Pattern = Pattern::new(*DOUBLE_STAR).unwrap();
+static ref DOUBLE_STAR_GLOB: Pattern = Pattern::new(*DOUBLE_STAR).unwrap();
static ref MISSING_GLOB_SOURCE: GlobParsedSource = GlobParsedSource(String::from(""));
static ref PATTERN_MATCH_OPTIONS: MatchOptions = MatchOptions {
require_literal_separator: true,
@@ -33,7 +32,7 @@ lazy_static! {
}

#[derive(Clone, Debug, Eq, Hash, PartialEq)]
-pub enum PathGlob {
+pub(crate) enum PathGlob {
Wildcard {
canonical_dir: Dir,
symbolic_path: PathBuf,
@@ -239,19 +238,7 @@ impl PathGlob {
}
}

-///
-/// This struct extracts out just the include and exclude globs from the `PreparedPathGlobs`
-/// struct. It is a temporary measure to try to share some code between the glob matching
-/// implementation in this file and in snapshot_ops.rs.
-///
-/// TODO(#9967): Remove this struct!
-///
-pub struct ExpandablePathGlobs {
-pub include: Vec<PathGlob>,
-pub exclude: Arc<GitignoreStyleExcludes>,
-}
-
-#[derive(Debug, Clone)]
+#[derive(Debug)]
pub struct PreparedPathGlobs {
pub(crate) include: Vec<PathGlobIncludeEntry>,
pub(crate) exclude: Arc<GitignoreStyleExcludes>,
@@ -261,13 +248,6 @@ pub struct PreparedPathGlobs {
}

impl PreparedPathGlobs {
-pub fn as_expandable_globs(&self) -> ExpandablePathGlobs {
-ExpandablePathGlobs {
-include: Iterator::flatten(self.include.iter().map(|pgie| pgie.globs.clone())).collect(),
-exclude: self.exclude.clone(),
-}
-}
-
fn parse_patterns_from_include(
include: &[PathGlobIncludeEntry],
) -> Result<Vec<glob::Pattern>, String> {
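The hunks above return SINGLE_STAR_GLOB and DOUBLE_STAR_GLOB to crate-private visibility and delete the temporary ExpandablePathGlobs bridge (tracked in #9967). The PATTERN_MATCH_OPTIONS shown earlier sets require_literal_separator: true, which is exactly what makes the two patterns behave differently. A minimal standalone sketch of that behavior, assuming the glob crate's 0.3-style by-value matches_with API (illustrative only, not Pants code):

use glob::{MatchOptions, Pattern};

fn main() {
    let options = MatchOptions {
        require_literal_separator: true,
        ..MatchOptions::new()
    };
    let single = Pattern::new("*").unwrap();
    let double = Pattern::new("**").unwrap();

    assert!(single.matches_with("foo.rs", options));
    // With require_literal_separator, `*` cannot cross a `/` ...
    assert!(!single.matches_with("dir/foo.rs", options));
    // ... but `**` still can, which is why the two patterns are kept distinct.
    assert!(double.matches_with("dir/foo.rs", options));
}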
11 changes: 4 additions & 7 deletions src/rust/engine/fs/src/lib.rs
@@ -33,10 +33,7 @@ mod glob_matching_tests;
#[cfg(test)]
mod posixfs_tests;

-pub use crate::glob_matching::{
-ExpandablePathGlobs, GlobMatching, PathGlob, PreparedPathGlobs, DOUBLE_STAR_GLOB,
-SINGLE_STAR_GLOB,
-};
+pub use crate::glob_matching::{GlobMatching, PreparedPathGlobs};

use std::cmp::min;
use std::io::{self, Read};
@@ -201,7 +198,7 @@ impl GitignoreStyleExcludes {
self.is_ignored_path(stat.path(), is_dir)
}

-pub fn is_ignored_path(&self, path: &Path, is_dir: bool) -> bool {
+fn is_ignored_path(&self, path: &Path, is_dir: bool) -> bool {
match self.gitignore.matched(path, is_dir) {
::ignore::Match::None | ::ignore::Match::Whitelist(_) => false,
::ignore::Match::Ignore(_) => true,
@@ -216,7 +213,7 @@
}
}

-#[derive(Debug, Clone)]
+#[derive(Debug)]
pub enum StrictGlobMatching {
// NB: the Error and Warn variants store a description of the origin of the PathGlob
// request so that we can make the error message more helpful to users when globs fail to match.
@@ -260,7 +257,7 @@ impl StrictGlobMatching {
}
}

-#[derive(Debug, Clone)]
+#[derive(Debug)]
pub enum GlobExpansionConjunction {
AllMatch,
AnyMatch,
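With this revert, is_ignored_path is private again, so callers outside the fs crate go through is_ignored on a Stat. The match arms spell out the semantics: both "no match" and a whitelist match (a ! pattern) mean "not excluded". A hedged sketch of the same logic written directly against the ignore crate — the GitignoreBuilder setup and the patterns here are illustrative assumptions, not how Pants assembles its excludes:

use std::path::Path;

use ignore::gitignore::{Gitignore, GitignoreBuilder};

fn build_excludes() -> Gitignore {
    // Hypothetical patterns; Pants builds these from target-level excludes.
    let mut builder = GitignoreBuilder::new("");
    builder.add_line(None, "*.pyc").unwrap();
    builder.add_line(None, "!keep.pyc").unwrap();
    builder.build().unwrap()
}

fn is_ignored_path(excludes: &Gitignore, path: &Path, is_dir: bool) -> bool {
    // Same match arms as the (now private) method above.
    match excludes.matched(path, is_dir) {
        ignore::Match::None | ignore::Match::Whitelist(_) => false,
        ignore::Match::Ignore(_) => true,
    }
}

fn main() {
    let excludes = build_excludes();
    assert!(is_ignored_path(&excludes, Path::new("cache.pyc"), false));
    assert!(!is_ignored_path(&excludes, Path::new("keep.pyc"), false));
}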
2 changes: 0 additions & 2 deletions src/rust/engine/fs/store/Cargo.toml
@@ -5,7 +5,6 @@ authors = ["Daniel Wagner-Hall <dwagnerhall@twitter.com>"]
edition = "2018"

[dependencies]
-async-trait = "0.1"
bazel_protos = { path = "../../process_execution/bazel_protos" }
boxfuture = { path = "../../boxfuture" }
bytes = "0.4.5"
@@ -15,7 +14,6 @@ dirs = "1"
fs = { path = ".." }
futures01 = { package = "futures", version = "0.1" }
futures = { version = "0.3", features = ["compat"] }
glob = "0.2.11"
# TODO: This is 0.5.1 + https://github.com/tikv/grpc-rs/pull/457 + a workaround for https://github.com/rust-lang/cargo/issues/8258
grpcio = { git = "https://github.com/pantsbuild/grpc-rs.git", rev = "ed3afa3c24ddf1fdd86826e836f57c00757dfc00", default_features = false, features = ["protobuf-codec", "secure"] }
hashing = { path = "../../hashing" }
135 changes: 10 additions & 125 deletions src/rust/engine/fs/store/benches/store.rs
@@ -29,27 +29,23 @@

use criterion::{criterion_group, criterion_main, Criterion};

-use std::collections::HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::os::unix::ffi::OsStrExt;
use std::path::PathBuf;
use std::time::Duration;

-use bazel_protos::remote_execution as remexec;
use bytes::Bytes;
-use fs::{GlobExpansionConjunction, PreparedPathGlobs, StrictGlobMatching};
use futures::compat::Future01CompatExt;
use futures::future;
-use hashing::{Digest, EMPTY_DIGEST};
-use protobuf;
+use hashing::Digest;
use task_executor::Executor;
use tempfile::TempDir;
use tokio::runtime::Runtime;

-use store::{SnapshotOps, Store, SubsetParams};
+use store::{Snapshot, Store};

-pub fn criterion_benchmark_materialize(c: &mut Criterion) {
+pub fn criterion_benchmark(c: &mut Criterion) {
// Create an executor, store containing the stuff to materialize, and a digest for the stuff.
// To avoid benchmarking the deleting of things, we create a parent temporary directory (which
// will be deleted at the end of the benchmark) and then skip deletion of the per-run directories.
@@ -66,125 +62,16 @@ pub fn criterion_benchmark_materialize(c: &mut Criterion) {
.measurement_time(Duration::from_secs(60))
.bench_function("materialize_directory", |b| {
b.iter(|| {
-// NB: We forget this child tempdir to avoid deleting things during the run.
-let new_temp = TempDir::new_in(parent_dest_path).unwrap();
-let dest = new_temp.path().to_path_buf();
-std::mem::forget(new_temp);
+// NB: We take ownership of this child tempdir to avoid deleting things during the run.
+let dest = TempDir::new_in(parent_dest_path).unwrap().into_path();
let _ = executor
.block_on(store.materialize_directory(dest, digest).compat())
.unwrap();
})
});
}
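Both the reverted and the restored code keep each per-run destination directory alive so that deletion never lands inside the timed loop: the reverted code leaked the TempDir guard with std::mem::forget, while the restored code consumes it with into_path(). A small sketch contrasting the two, using only the tempfile API (directory names are illustrative):

use tempfile::TempDir;

fn main() -> std::io::Result<()> {
    let parent = TempDir::new()?;

    // Reverted approach: leak the guard so its Drop (which deletes) never runs.
    let child = TempDir::new_in(parent.path())?;
    let dest_forget = child.path().to_path_buf();
    std::mem::forget(child);

    // Restored approach: into_path() consumes the TempDir and disables cleanup.
    let dest_own = TempDir::new_in(parent.path())?.into_path();

    assert!(dest_forget.exists() && dest_own.exists());
    Ok(())
}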

-pub fn criterion_benchmark_subset_wildcard(c: &mut Criterion) {
-let rt = Runtime::new().unwrap();
-let executor = Executor::new(rt.handle().clone());
-// NB: We use a much larger snapshot size compared to the materialize benchmark!
-let (store, _tempdir, digest) = large_snapshot(&executor, 1000);
-
-let mut cgroup = c.benchmark_group("snapshot_subset");
-
-cgroup
-.sample_size(10)
-.measurement_time(Duration::from_secs(80))
-.bench_function("wildcard", |b| {
-b.iter(|| {
-let get_subset = store.subset(
-digest,
-SubsetParams {
-globs: PreparedPathGlobs::create(
-vec!["**/*".to_string()],
-StrictGlobMatching::Ignore,
-GlobExpansionConjunction::AllMatch,
-)
-.unwrap(),
-},
-);
-let _ = executor.block_on(get_subset).unwrap();
-})
-});
-}
-
-pub fn criterion_benchmark_merge(c: &mut Criterion) {
-let rt = Runtime::new().unwrap();
-let executor = Executor::new(rt.handle().clone());
-let num_files: usize = 4000;
-let (store, _tempdir, digest) = large_snapshot(&executor, num_files);
-
-let (directory, _metadata) = executor
-.block_on(store.load_directory(digest))
-.unwrap()
-.unwrap();
-// Modify half of the files in the top-level directory by setting them to have the empty
-// fingerprint (zero content).
-let mut all_file_nodes = directory.get_files().to_vec();
-let mut file_nodes_to_modify = all_file_nodes.split_off(all_file_nodes.len() / 2);
-for file_node in file_nodes_to_modify.iter_mut() {
-let mut empty_bazel_digest = remexec::Digest::new();
-empty_bazel_digest.set_hash(EMPTY_DIGEST.0.to_hex());
-empty_bazel_digest.set_size_bytes(0);
-file_node.set_digest(empty_bazel_digest);
-}
-let modified_file_names: HashSet<String> = file_nodes_to_modify
-.iter()
-.map(|file_node| file_node.get_name().to_string())
-.collect();
-
-let mut bazel_modified_files_directory = remexec::Directory::new();
-bazel_modified_files_directory.set_files(protobuf::RepeatedField::from_vec(
-all_file_nodes
-.iter()
-.cloned()
-.chain(file_nodes_to_modify.into_iter())
-.collect(),
-));
-bazel_modified_files_directory.set_directories(directory.directories.clone());
-
-let modified_digest = executor
-.block_on(store.record_directory(&bazel_modified_files_directory, true))
-.unwrap();
-
-let mut bazel_removed_files_directory = remexec::Directory::new();
-bazel_removed_files_directory.set_files(protobuf::RepeatedField::from_vec(
-all_file_nodes
-.into_iter()
-.filter(|file_node| !modified_file_names.contains(file_node.get_name()))
-.collect(),
-));
-bazel_removed_files_directory.set_directories(directory.directories.clone());
-let removed_digest = executor
-.block_on(store.record_directory(&bazel_removed_files_directory, true))
-.unwrap();
-
-let mut cgroup = c.benchmark_group("snapshot_merge");
-
-cgroup
-.sample_size(10)
-.measurement_time(Duration::from_secs(80))
-.bench_function("snapshot_merge", |b| {
-b.iter(|| {
-// Merge the old and the new snapshot together, allowing any file to be duplicated.
-let old_first: Digest = executor
-.block_on(store.merge(vec![removed_digest, modified_digest]))
-.unwrap();
-
-// Test the performance of either ordering of snapshots.
-let new_first: Digest = executor
-.block_on(store.merge(vec![modified_digest, removed_digest]))
-.unwrap();
-
-assert_eq!(old_first, new_first);
-})
-});
-}
-
-criterion_group!(
-benches,
-criterion_benchmark_materialize,
-criterion_benchmark_subset_wildcard,
-criterion_benchmark_merge
-);
+criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

///
@@ -220,10 +107,8 @@ pub fn large_snapshot(executor: &Executor, max_files: usize) -> (Store, TempDir,
})
.collect::<String>();

-// NB: Split the line by whitespace, then accumulate a PathBuf using each word as a path
-// component!
-let path_buf = clean_line.split_whitespace().collect::<PathBuf>();
+// Drop empty or too-long candidates.
+let path_buf = clean_line.split_whitespace().collect::<PathBuf>();
let components_too_long = path_buf.components().any(|c| c.as_os_str().len() > 255);
if components_too_long || path_buf.as_os_str().is_empty() || path_buf.as_os_str().len() > 512
{
@@ -237,10 +122,9 @@ pub fn large_snapshot(executor: &Executor, max_files: usize) -> (Store, TempDir,
let storedir = TempDir::new().unwrap();
let store = Store::local_only(executor.clone(), storedir.path()).unwrap();

-let store2 = store.clone();
let digests = henries_paths
.map(|mut path| {
-let store = store2.clone();
+let store = store.clone();
async move {
// We use the path as the content as well: would be interesting to make this tunable.
let content = Bytes::from(path.as_os_str().as_bytes());
@@ -257,9 +141,10 @@

let digest = executor
.block_on({
+let store = store.clone();
async move {
let digests = future::try_join_all(digests).await?;
-store2.merge(digests).await
+Snapshot::merge_directories(store, digests).await
}
})
.unwrap();
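large_snapshot stores one file per path concurrently and then folds the resulting digests into a single directory via Snapshot::merge_directories. The fan-out/fan-in shape is ordinary future::try_join_all; a minimal sketch with stand-ins (store_file and merge_all are hypothetical placeholders, not Pants APIs):

use futures::executor::block_on;
use futures::future;

async fn store_file(name: &str) -> Result<u64, String> {
    // Stand-in for storing bytes and returning a digest: use the length.
    Ok(name.len() as u64)
}

async fn merge_all(digests: Vec<u64>) -> Result<u64, String> {
    // Stand-in for Snapshot::merge_directories: fold the digests together.
    Ok(digests.into_iter().sum())
}

fn main() -> Result<(), String> {
    // Fan out: one future per "file", all driven concurrently.
    let futures = vec!["a", "bb", "ccc"].into_iter().map(store_file);
    let digests = block_on(future::try_join_all(futures))?;
    // Fan in: combine the per-file results into one value.
    let merged = block_on(merge_all(digests))?;
    assert_eq!(merged, 6);
    Ok(())
}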