Fix excessive mmap-ing in LMDB (#11513)
### Problem

Core dumps collected on macOS are currently very large because they include all mmap'd data. According to `vmmap`, this is because LMDB is `mmap`'ing 1.7TB of data. And... it is! But not intentionally.
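
For reference, the 1.7TB lines up almost exactly with the pre-fix sizes in this diff: three `ShardedLmdb` stores were configured at 100GiB, 5GiB, and 5GiB respectively, and each of their 16 shards reserved the full amount. A rough reconstruction (illustrative snippet only, not part of this change):

```rust
// Back-of-the-envelope reconstruction of the ~1.7TB that `vmmap` reports, using the
// pre-fix sizes from this diff. Illustrative only; not part of the change itself.
const GIGABYTES: usize = 1024 * 1024 * 1024;
const NUM_SHARDS: usize = 16;

fn main() {
    // Pre-fix max_size values passed to ShardedLmdb::new: 100GiB (file store),
    // 5GiB (directory store), 5GiB (local process-execution cache).
    let max_sizes = [100 * GIGABYTES, 5 * GIGABYTES, 5 * GIGABYTES];

    // Bug: each of the 16 shards opened its own Environment with the full max_size,
    // so every store reserved NUM_SHARDS * max_size of address space.
    let reserved: usize = max_sizes.iter().map(|&s| s * NUM_SHARDS).sum();

    // Prints 1760 GiB, i.e. roughly the 1.7TB observed.
    println!("{} GiB reserved", reserved / GIGABYTES);
}
```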

### Solution

`ShardedLmdb` was not dividing the `max_size` by the number of shards, and since each shard uses its own LMDB `Environment`, we ended up with `16 * max_size` `mmap`'d. Additionally, we had set some of our databases to sizes well beyond what we expect them to grow to when GC is running consistently.
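
In outline, the fix divides the configured `max_size` by `NUM_SHARDS` before it reaches LMDB's `set_map_size`, so the shards together reserve roughly `max_size` rather than `16 * max_size`. A simplified sketch (the real change lives in `ShardedLmdb::new`/`envs`/`make_env` below; per-shard directory layout and environment flags are elided here):

```rust
// Simplified sketch of the fix: give each shard's Environment only its share of the
// total budget, rather than the full max_size.
use lmdb::Environment;
use std::path::Path;

const NUM_SHARDS: usize = 16;

fn open_shard(dir: &Path, max_size: usize) -> Result<Environment, String> {
    // Previously the full max_size was passed to every shard's environment.
    let max_size_per_shard = max_size / NUM_SHARDS;
    Environment::new()
        .set_max_dbs(2)
        .set_map_size(max_size_per_shard)
        .open(dir)
        .map_err(|e| format!("Error making env for store at {:?}: {}", dir, e))
}
```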

### Result

It is now possible to gather core dumps on macOS, which should assist in the debugging of #11364.

[ci skip-build-wheels]
stuhood committed Feb 3, 2021
1 parent 81fd031 commit 457e666
Showing 4 changed files with 62 additions and 42 deletions.
9 changes: 7 additions & 2 deletions src/rust/engine/fs/store/src/lib.rs
@@ -66,8 +66,13 @@ use remexec::Tree;
const MEGABYTES: usize = 1024 * 1024;
const GIGABYTES: usize = 1024 * MEGABYTES;

// This is the target number of bytes which should be present in all combined LMDB store files
// after garbage collection. We almost certainly want to make this configurable.
///
/// This is the target number of bytes which should be present in all combined LMDB store files
/// after garbage collection. We almost certainly want to make this configurable.
///
/// Because LMDB is sharded (by ShardedLmdb::NUM_SHARDS), this value also bounds the maximum size
/// of individual items in the store: see the relevant calls to `ShardedLmdb::new` for more info.
///
pub const DEFAULT_LOCAL_STORE_GC_TARGET_BYTES: usize = 4 * GIGABYTES;

mod local;
44 changes: 27 additions & 17 deletions src/rust/engine/fs/store/src/local.rs
@@ -1,4 +1,4 @@
use super::{EntryType, ShrinkBehavior, GIGABYTES};
use super::{EntryType, ShrinkBehavior, DEFAULT_LOCAL_STORE_GC_TARGET_BYTES};

use std::collections::BinaryHeap;
use std::path::Path;
@@ -45,20 +45,32 @@ impl ByteStore {
let directories_root = root.join("directories");
Ok(ByteStore {
inner: Arc::new(InnerStore {
// We want these stores to be allowed to grow very large, in case we are on a system with
// large disks which doesn't want to GC a lot.
// It doesn't reflect space allocated on disk, or RAM allocated (it may be reflected in
// VIRT but not RSS). There is no practical upper bound on this number, so we set them
// ridiculously high.
// However! We set them lower than we'd otherwise choose because sometimes we see tests on
// travis fail because they can't allocate virtual memory, if there are multiple Stores
// in memory at the same time. We don't know why they're not efficiently garbage collected
// by python, but they're not, so...
file_dbs: ShardedLmdb::new(files_root, 100 * GIGABYTES, executor.clone(), lease_time)
.map(Arc::new),
// The size value bounds the total size of all shards, but it also bounds the per item
// size to `max_size / ShardedLmdb::NUM_SHARDS`.
//
// We want these stores to be allowed to grow to a large multiple of the `StoreGCService`
// size target (see DEFAULT_LOCAL_STORE_GC_TARGET_BYTES), in case it is a very long time
// between GC runs. It doesn't reflect space allocated on disk, or RAM allocated (it may
// be reflected in VIRT but not RSS).
//
// However! We set them lower than we'd otherwise choose for a few reasons:
// 1. we see tests on travis fail because they can't allocate virtual memory if there
// are too many open Stores at the same time.
// 2. macOS creates core dumps that apparently include MMAP'd pages, and setting this too
// high will use up an unnecessary amount of disk if core dumps are enabled.
//
file_dbs: ShardedLmdb::new(
files_root,
// We set this larger than we do other stores in order to avoid applying too low a bound
// on the size of stored files.
16 * DEFAULT_LOCAL_STORE_GC_TARGET_BYTES,
executor.clone(),
lease_time,
)
.map(Arc::new),
directory_dbs: ShardedLmdb::new(
directories_root,
5 * GIGABYTES,
2 * DEFAULT_LOCAL_STORE_GC_TARGET_BYTES,
executor.clone(),
lease_time,
)
@@ -155,10 +167,8 @@ impl ByteStore {
env
.begin_rw_txn()
.and_then(|mut txn| {
let key = VersionedFingerprint::new(
aged_fingerprint.fingerprint,
ShardedLmdb::schema_version(),
);
let key =
VersionedFingerprint::new(aged_fingerprint.fingerprint, ShardedLmdb::SCHEMA_VERSION);
txn.del(database, &key, None)?;

txn
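
For concreteness, the arithmetic implied by the new sizes above (with `DEFAULT_LOCAL_STORE_GC_TARGET_BYTES` at 4GiB and 16 shards): the file store reserves 64GiB in total, a 4GiB cap per shard and therefore per stored file, while the directory store reserves 8GiB in total, a 512MiB cap per shard. Illustrative snippet only, not part of the commit:

```rust
// Illustrative arithmetic for the new sizes in local.rs; not part of the commit.
const GIGABYTES: usize = 1024 * 1024 * 1024;
const MEGABYTES: usize = 1024 * 1024;
const DEFAULT_LOCAL_STORE_GC_TARGET_BYTES: usize = 4 * GIGABYTES;
const NUM_SHARDS: usize = 16;

fn main() {
    // file_dbs: deliberately large, so that the per-shard (and so per-file) bound stays high.
    let file_store_total = 16 * DEFAULT_LOCAL_STORE_GC_TARGET_BYTES; // 64GiB
    // directory_dbs: individual directory entries are small, so a lower bound is fine.
    let dir_store_total = 2 * DEFAULT_LOCAL_STORE_GC_TARGET_BYTES; // 8GiB

    println!(
        "file store: {}GiB total, {}GiB per shard",
        file_store_total / GIGABYTES,
        file_store_total / NUM_SHARDS / GIGABYTES, // 4GiB
    );
    println!(
        "directory store: {}GiB total, {}MiB per shard",
        dir_store_total / GIGABYTES,
        dir_store_total / NUM_SHARDS / MEGABYTES, // 512MiB
    );
}
```
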
45 changes: 26 additions & 19 deletions src/rust/engine/sharded_lmdb/src/lib.rs
@@ -108,21 +108,22 @@ pub struct ShardedLmdb {
// First Database is content, second is leases.
lmdbs: HashMap<u8, (Arc<Environment>, Database, Database)>,
root_path: PathBuf,
max_size: usize,
max_size_per_shard: usize,
executor: task_executor::Executor,
lease_time: Duration,
}

impl ShardedLmdb {
const NUM_SHARDS: u8 = 0x10;
const NUM_SHARDS_SHIFT: u8 = 4;

// Whenever we change the byte format of data stored in lmdb, we will
// need to increment this schema version. This schema version will
// be appended to the Fingerprint-derived keys to create the key
// we actually store in the database. This way, data stored with a version
// of pants on one schema version will not conflict with data stored
// with a different version of pants on a different schema version.
pub fn schema_version() -> u8 {
2
}
pub const SCHEMA_VERSION: u8 = 2;

// max_size is the maximum size the databases together will be allowed to grow to.
// When calling this function, we will attempt to allocate that much virtual (not resident) memory
@@ -138,7 +139,9 @@ impl ShardedLmdb {
trace!("Initializing ShardedLmdb at root {:?}", root_path);
let mut lmdbs = HashMap::new();

for (env, dir, fingerprint_prefix) in ShardedLmdb::envs(&root_path, max_size)? {
let max_size_per_shard = max_size / (Self::NUM_SHARDS as usize);

for (env, dir, fingerprint_prefix) in ShardedLmdb::envs(&root_path, max_size_per_shard)? {
let content_database = env
.create_db(Some("content-versioned"), DatabaseFlags::empty())
.map_err(|e| {
@@ -166,32 +169,36 @@ impl ShardedLmdb {
Ok(ShardedLmdb {
lmdbs,
root_path,
max_size,
max_size_per_shard,
executor,
lease_time,
})
}

fn envs(root_path: &Path, max_size: usize) -> Result<Vec<(Environment, PathBuf, u8)>, String> {
let mut envs = Vec::with_capacity(0x10);
for b in 0x00..0x10 {
let fingerprint_prefix = b << 4;
fn envs(
root_path: &Path,
max_size_per_shard: usize,
) -> Result<Vec<(Environment, PathBuf, u8)>, String> {
let mut envs = Vec::with_capacity(Self::NUM_SHARDS as usize);
for b in 0x00..Self::NUM_SHARDS {
// NB: This shift is NUM_SHARDS dependent.
let fingerprint_prefix = b << Self::NUM_SHARDS_SHIFT;
let mut dirname = String::new();
fmt::Write::write_fmt(&mut dirname, format_args!("{:x}", fingerprint_prefix)).unwrap();
let dirname = dirname[0..1].to_owned();
let dir = root_path.join(dirname);
fs::safe_create_dir_all(&dir)
.map_err(|err| format!("Error making directory for store at {:?}: {:?}", dir, err))?;
envs.push((
ShardedLmdb::make_env(&dir, max_size)?,
ShardedLmdb::make_env(&dir, max_size_per_shard)?,
dir,
fingerprint_prefix,
));
}
Ok(envs)
}

fn make_env(dir: &Path, max_size: usize) -> Result<Environment, String> {
fn make_env(dir: &Path, max_size_per_shard: usize) -> Result<Environment, String> {
Environment::new()
// NO_SYNC
// =======
@@ -226,7 +233,7 @@ impl ShardedLmdb {
.set_flags(EnvironmentFlags::NO_SYNC | EnvironmentFlags::NO_TLS)
// 2 DBs; one for file contents, one for leases.
.set_max_dbs(2)
.set_map_size(max_size)
.set_map_size(max_size_per_shard)
.open(dir)
.map_err(|e| format!("Error making env for store at {:?}: {}", dir, e))
}
@@ -245,7 +252,7 @@ impl ShardedLmdb {
self
.executor
.spawn_blocking(move || {
let effective_key = VersionedFingerprint::new(fingerprint, ShardedLmdb::schema_version());
let effective_key = VersionedFingerprint::new(fingerprint, ShardedLmdb::SCHEMA_VERSION);
let (env, db, _lease_database) = store.get(&fingerprint);
let del_res = env.begin_rw_txn().and_then(|mut txn| {
txn.del(db, &effective_key, None)?;
@@ -267,7 +274,7 @@ impl ShardedLmdb {

pub async fn exists(&self, fingerprint: Fingerprint) -> Result<bool, String> {
let store = self.clone();
let effective_key = VersionedFingerprint::new(fingerprint, ShardedLmdb::schema_version());
let effective_key = VersionedFingerprint::new(fingerprint, ShardedLmdb::SCHEMA_VERSION);
self
.executor
.spawn_blocking(move || {
@@ -298,7 +305,7 @@ impl ShardedLmdb {
self
.executor
.spawn_blocking(move || {
let effective_key = VersionedFingerprint::new(fingerprint, ShardedLmdb::schema_version());
let effective_key = VersionedFingerprint::new(fingerprint, ShardedLmdb::SCHEMA_VERSION);
let (env, db, lease_database) = store.get(&fingerprint);
let put_res = env.begin_rw_txn().and_then(|mut txn| {
txn.put(db, &effective_key, &bytes, WriteFlags::NO_OVERWRITE)?;
@@ -336,7 +343,7 @@ impl ShardedLmdb {
env.begin_rw_txn().and_then(|mut txn| {
store.lease_inner(
lease_database,
&VersionedFingerprint::new(fingerprint, ShardedLmdb::schema_version()),
&VersionedFingerprint::new(fingerprint, ShardedLmdb::SCHEMA_VERSION),
until_secs_since_epoch,
&mut txn,
)?;
@@ -377,7 +384,7 @@ impl ShardedLmdb {
f: F,
) -> Result<Option<T>, String> {
let store = self.clone();
let effective_key = VersionedFingerprint::new(fingerprint, ShardedLmdb::schema_version());
let effective_key = VersionedFingerprint::new(fingerprint, ShardedLmdb::SCHEMA_VERSION);
self
.executor
.spawn_blocking(move || {
@@ -400,7 +407,7 @@ impl ShardedLmdb {

#[allow(clippy::useless_conversion)] // False positive: https://github.com/rust-lang/rust-clippy/issues/3913
pub fn compact(&self) -> Result<(), String> {
for (env, old_dir, _) in ShardedLmdb::envs(&self.root_path, self.max_size)? {
for (env, old_dir, _) in ShardedLmdb::envs(&self.root_path, self.max_size_per_shard)? {
let new_dir = TempDir::new_in(old_dir.parent().unwrap()).expect("TODO");
env
.copy(new_dir.path(), EnvironmentCopyFlags::COMPACT)
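
As an aside on the new `NUM_SHARDS`/`NUM_SHARDS_SHIFT` constants used above: a fingerprint is bucketed by its top four bits, and each shard lives under a single-hex-digit directory. A small illustration (the shard-selection detail is inferred from the prefix construction in `envs` and is not shown in this diff):

```rust
// Inferred sketch of the sharding scheme implied by NUM_SHARDS = 16 and
// NUM_SHARDS_SHIFT = 4; the actual lookup in ShardedLmdb is not part of this diff.
const NUM_SHARDS_SHIFT: u8 = 4;

fn shard_for(first_fingerprint_byte: u8) -> (u8, String) {
    // Keep only the top nibble: 0xab -> 0xa0.
    let fingerprint_prefix =
        (first_fingerprint_byte >> NUM_SHARDS_SHIFT) << NUM_SHARDS_SHIFT;
    // Shard directories are named by the first hex digit of the prefix: 0xa0 -> "a".
    let dirname = format!("{:x}", fingerprint_prefix)[0..1].to_owned();
    (fingerprint_prefix, dirname)
}

fn main() {
    assert_eq!(shard_for(0xab), (0xa0, "a".to_owned()));
    assert_eq!(shard_for(0x07), (0x00, "0".to_owned()));
}
```
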
6 changes: 2 additions & 4 deletions src/rust/engine/src/context.rs
@@ -29,13 +29,11 @@ use rand::seq::SliceRandom;
use regex::Regex;
use rule_graph::RuleGraph;
use sharded_lmdb::{ShardedLmdb, DEFAULT_LEASE_TIME};
use store::Store;
use store::{Store, DEFAULT_LOCAL_STORE_GC_TARGET_BYTES};
use task_executor::Executor;
use uuid::Uuid;
use watch::{Invalidatable, InvalidationWatcher};

const GIGABYTES: usize = 1024 * 1024 * 1024;

// The reqwest crate has no support for ingesting multiple certificates in a single file,
// and requires single PEM blocks. There is a crate (https://crates.io/crates/pem) that can decode
// multiple certificates from a single buffer, but it is not suitable for our use because we don't
@@ -287,7 +285,7 @@ impl Core {
let maybe_local_cached_command_runner = if exec_strategy_opts.use_local_cache {
let process_execution_store = ShardedLmdb::new(
local_store_dir.join("processes"),
5 * GIGABYTES,
2 * DEFAULT_LOCAL_STORE_GC_TARGET_BYTES,
executor.clone(),
DEFAULT_LEASE_TIME,
)