From 299ccedfeca1fb3497978c288e76008a5c08e899 Mon Sep 17 00:00:00 2001 From: Gabor Cselle Date: Wed, 5 Oct 2011 16:30:28 -0700 Subject: [PATCH] A number of bugfixes: - Added DB::CompactRange() method. Changed manual compaction code so it breaks up compactions of big ranges into smaller compactions. Changed the code that pushes the output of memtable compactions to higher levels to obey the grandparent constraint: i.e., we must never have a single file in level L that overlaps too much data in level L+1 (to avoid very expensive L-1 compactions). Added code to pretty-print internal keys. - Fixed bug where we would not detect overlap with files in level-0 because we were incorrectly using binary search on an array of files with overlapping ranges. Added "leveldb.sstables" property that can be used to dump all of the sstables and ranges that make up the db state. - Removing post_write_snapshot support. Email to leveldb mailing list brought up no users, just confusion from one person about what it meant. - Fixing static_cast char to unsigned on BIG_ENDIAN platforms. Fixes Issue 35 and Issue 36. - Comment clarification to address leveldb Issue 37. - Change license in posix_logger.h to match other files. - A build problem where uint32 was used instead of uint32_t. 
Sync with upstream @24408625 --- build_detect_platform | 5 - db/corruption_test.cc | 6 +- db/db_bench.cc | 15 +-- db/db_impl.cc | 98 ++++++++++++------ db/db_impl.h | 14 +-- db/db_test.cc | 159 +++++++++++++++++++++++------ db/dbformat.cc | 12 +++ db/dbformat.h | 2 + db/version_edit.cc | 14 ++- db/version_set.cc | 205 +++++++++++++++++++++++++++----------- db/version_set.h | 40 +++++--- db/version_set_test.cc | 57 ++++++++++- doc/index.html | 22 +--- include/leveldb/db.h | 15 ++- include/leveldb/env.h | 8 +- include/leveldb/options.h | 15 +-- util/coding.h | 8 +- util/posix_logger.h | 5 +- 18 files changed, 483 insertions(+), 217 deletions(-) diff --git a/build_detect_platform b/build_detect_platform index 7f0df317a1..d1804e09de 100644 --- a/build_detect_platform +++ b/build_detect_platform @@ -35,11 +35,6 @@ case `uname -s` in echo "PLATFORM_CFLAGS=-D_REENTRANT -DOS_FREEBSD" >> build_config.mk echo "PLATFORM_LDFLAGS=-lpthread" >> build_config.mk ;; - GNU/kFreeBSD) - PLATFORM=OS_FREEBSD - echo "PLATFORM_CFLAGS=-pthread -DOS_FREEBSD" >> build_config.mk - echo "PLATFORM_LDFLAGS=-lpthread -lrt" >> build_config.mk - ;; *) echo "Unknown platform!" 
exit 1 diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 69fa03a428..1edcd84b7d 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -229,8 +229,8 @@ TEST(CorruptionTest, TableFile) { Build(100); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); - dbi->TEST_CompactRange(1, "", "~"); + dbi->TEST_CompactRange(0, NULL, NULL); + dbi->TEST_CompactRange(1, NULL, NULL); Corrupt(kTableFile, 100, 1); Check(99, 99); @@ -278,7 +278,7 @@ TEST(CorruptionTest, CorruptedDescriptor) { ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, "", "~"); + dbi->TEST_CompactRange(0, NULL, NULL); Corrupt(kDescriptorFile, 0, 1000); Status s = TryReopen(); diff --git a/db/db_bench.cc b/db/db_bench.cc index bb63e59d14..cf9bb6583a 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -796,20 +796,7 @@ class Benchmark { } void Compact(ThreadState* thread) { - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - std::string property; - char name[100]; - snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level); - if (db_->GetProperty(name, &property) && atoi(property.c_str()) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbi->TEST_CompactRange(level, "", "~"); - } + db_->CompactRange(NULL, NULL); } void PrintStats() { diff --git a/db/db_impl.cc b/db/db_impl.cc index 0ca638651c..56182a07dd 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -454,13 +454,8 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, if (s.ok() && meta.file_size > 0) { const Slice min_user_key = meta.smallest.user_key(); const Slice max_user_key = meta.largest.user_key(); - if (base != NULL && !base->OverlapInLevel(0, min_user_key, max_user_key)) { - // 
Push the new sstable to a higher level if possible to reduce - // expensive manifest file ops. - while (level < config::kMaxMemCompactLevel && - !base->OverlapInLevel(level + 1, min_user_key, max_user_key)) { - level++; - } + if (base != NULL) { + level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } edit->AddFile(level, meta.number, meta.file_size, meta.smallest, meta.largest); @@ -506,25 +501,55 @@ Status DBImpl::CompactMemTable() { return s; } -void DBImpl::TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end) { +void DBImpl::CompactRange(const Slice* begin, const Slice* end) { + int max_level_with_files = 1; + { + MutexLock l(&mutex_); + Version* base = versions_->current(); + for (int level = 1; level < config::kNumLevels; level++) { + if (base->OverlapInLevel(level, begin, end)) { + max_level_with_files = level; + } + } + } + TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap + for (int level = 0; level < max_level_with_files; level++) { + TEST_CompactRange(level, begin, end); + } +} + +void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { assert(level >= 0); assert(level + 1 < config::kNumLevels); - MutexLock l(&mutex_); - while (manual_compaction_ != NULL) { - bg_cv_.Wait(); - } + InternalKey begin_storage, end_storage; + ManualCompaction manual; manual.level = level; - manual.begin = begin; - manual.end = end; - manual_compaction_ = &manual; - MaybeScheduleCompaction(); - while (manual_compaction_ == &manual) { - bg_cv_.Wait(); + manual.done = false; + if (begin == NULL) { + manual.begin = NULL; + } else { + begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); + manual.begin = &begin_storage; + } + if (end == NULL) { + manual.end = NULL; + } else { + end_storage = InternalKey(*end, 0, static_cast(0)); + manual.end = &end_storage; + } + + MutexLock l(&mutex_); + while (!manual.done) { + while (manual_compaction_ != NULL) { + 
bg_cv_.Wait(); + } + manual_compaction_ = &manual; + MaybeScheduleCompaction(); + while (manual_compaction_ == &manual) { + bg_cv_.Wait(); + } } } @@ -590,12 +615,20 @@ void DBImpl::BackgroundCompaction() { Compaction* c; bool is_manual = (manual_compaction_ != NULL); + InternalKey manual_end; if (is_manual) { - const ManualCompaction* m = manual_compaction_; - c = versions_->CompactRange( + ManualCompaction* m = manual_compaction_; + c = versions_->CompactRange(m->level, m->begin, m->end); + m->done = (c == NULL); + if (c != NULL) { + manual_end = c->input(0, c->num_input_files(0) - 1)->largest; + } + Log(options_.info_log, + "Manual compaction at level-%d from %s .. %s; will stop at %s\n", m->level, - InternalKey(m->begin, kMaxSequenceNumber, kValueTypeForSeek), - InternalKey(m->end, 0, static_cast(0))); + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)"), + (m->done ? "(end)" : manual_end.DebugString().c_str())); } else { c = versions_->PickCompaction(); } @@ -638,7 +671,13 @@ void DBImpl::BackgroundCompaction() { } if (is_manual) { - // Mark it as done + ManualCompaction* m = manual_compaction_; + if (!m->done) { + // We only compacted part of the requested range. Update *m + // to the range that is left to be compacted. + m->tmp_storage = manual_end; + m->begin = &m->tmp_storage; + } manual_compaction_ = NULL; } } @@ -1109,10 +1148,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) { versions_->SetLastSequence(last_sequence); } - if (options.post_write_snapshot != NULL) { - *options.post_write_snapshot = - status.ok() ? 
snapshots_.New(last_sequence) : NULL; - } ReleaseLoggingResponsibility(&self); return status; } @@ -1225,6 +1260,9 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } } return true; + } else if (in == "sstables") { + *value = versions_->current()->DebugString(); + return true; } return false; diff --git a/db/db_impl.h b/db/db_impl.h index 5268137603..ab03181108 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -38,14 +38,12 @@ class DBImpl : public DB { virtual void ReleaseSnapshot(const Snapshot* snapshot); virtual bool GetProperty(const Slice& property, std::string* value); virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); + virtual void CompactRange(const Slice* begin, const Slice* end); // Extra methods (for testing) that are not in the public DB interface - // Compact any files in the named level that overlap [begin,end] - void TEST_CompactRange( - int level, - const std::string& begin, - const std::string& end); + // Compact any files in the named level that overlap [*begin,*end] + void TEST_CompactRange(int level, const Slice* begin, const Slice* end); // Force current memtable contents to be compacted. 
Status TEST_CompactMemTable(); @@ -145,8 +143,10 @@ class DBImpl : public DB { // Information for a manual compaction struct ManualCompaction { int level; - std::string begin; - std::string end; + bool done; + const InternalKey* begin; // NULL means beginning of key range + const InternalKey* end; // NULL means end of key range + InternalKey tmp_storage; // Used to keep track of compaction progress }; ManualCompaction* manual_compaction_; diff --git a/db/db_test.cc b/db/db_test.cc index daa9c03088..ab71c5170d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -195,6 +195,23 @@ class DBTest { return result; } + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < config::kNumLevels; level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + uint64_t Size(const Slice& start, const Slice& limit) { Range r(start, limit); uint64_t size; @@ -203,26 +220,23 @@ class DBTest { } void Compact(const Slice& start, const Slice& limit) { - dbfull()->TEST_CompactMemTable(); - int max_level_with_files = 1; - for (int level = 1; level < config::kNumLevels; level++) { - if (NumTableFilesAtLevel(level) > 0) { - max_level_with_files = level; - } - } - for (int level = 0; level < max_level_with_files; level++) { - dbfull()->TEST_CompactRange(level, "", "~"); + db_->CompactRange(&start, &limit); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. 
+ void MakeTables(int n, const std::string& small, const std::string& large) { + for (int i = 0; i < n; i++) { + Put(small, "begin"); + Put(large, "end"); + dbfull()->TEST_CompactMemTable(); } } // Prevent pushing of new sstables into deeper levels by adding // tables that cover a specified range to all levels. void FillLevels(const std::string& smallest, const std::string& largest) { - for (int level = 0; level < config::kNumLevels; level++) { - Put(smallest, "begin"); - Put(largest, "end"); - dbfull()->TEST_CompactMemTable(); - } + MakeTables(config::kNumLevels, smallest, largest); } void DumpFileCounts(const char* label) { @@ -238,6 +252,12 @@ class DBTest { } } + std::string DumpSSTableList() { + std::string property; + db_->GetProperty("leveldb.sstables", &property); + return property; + } + std::string IterStatus(Iterator* iter) { std::string result; if (iter->Valid()) { @@ -367,7 +387,7 @@ TEST(DBTest, GetEncountersEmptyLevel) { } // Step 2: clear level 1 if necessary. - dbfull()->TEST_CompactRange(1, "a", "z"); + dbfull()->TEST_CompactRange(1, NULL, NULL); ASSERT_EQ(NumTableFilesAtLevel(0), 1); ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 1); @@ -693,7 +713,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { // Reopening moves updates to level-0 Reopen(&options); - dbfull()->TEST_CompactRange(0, "", Key(100000)); + dbfull()->TEST_CompactRange(0, NULL, NULL); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GT(NumTableFilesAtLevel(1), 1); @@ -744,7 +764,7 @@ TEST(DBTest, SparseMerge) { } Put("C", "vc"); dbfull()->TEST_CompactMemTable(); - dbfull()->TEST_CompactRange(0, "A", "Z"); + dbfull()->TEST_CompactRange(0, NULL, NULL); // Make sparse update Put("A", "va2"); @@ -755,9 +775,9 @@ TEST(DBTest, SparseMerge) { // Compactions should not cause us to create a situation where // a file overlaps too much data at the next level. 
ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(0, "", "z"); + dbfull()->TEST_CompactRange(0, NULL, NULL); ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(1, "", "z"); + dbfull()->TEST_CompactRange(1, NULL, NULL); ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); } @@ -808,9 +828,11 @@ TEST(DBTest, ApproximateSizes) { ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000)); ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000)); - dbfull()->TEST_CompactRange(0, - Key(compact_start), - Key(compact_start + 9)); + std::string cstart_str = Key(compact_start); + std::string cend_str = Key(compact_start + 9); + Slice cstart = cstart_str; + Slice cend = cend_str; + dbfull()->TEST_CompactRange(0, &cstart, &cend); } ASSERT_EQ(NumTableFilesAtLevel(0), 0); @@ -850,7 +872,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); - dbfull()->TEST_CompactRange(0, Key(0), Key(100)); + dbfull()->TEST_CompactRange(0, NULL, NULL); } } @@ -921,11 +943,12 @@ TEST(DBTest, HiddenValuesAreRemoved) { ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); db_->ReleaseSnapshot(snapshot); ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); - dbfull()->TEST_CompactRange(0, "", "x"); + Slice x("x"); + dbfull()->TEST_CompactRange(0, NULL, &x); ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GE(NumTableFilesAtLevel(1), 1); - dbfull()->TEST_CompactRange(1, "", "x"); + dbfull()->TEST_CompactRange(1, NULL, &x); ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); @@ -949,11 +972,12 @@ TEST(DBTest, DeletionMarkers1) { ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - 
dbfull()->TEST_CompactRange(last-2, "", "z"); + Slice z("z"); + dbfull()->TEST_CompactRange(last-2, NULL, &z); // DEL eliminated, but v1 remains because we aren't compacting that level // (DEL can be eliminated because v2 hides v1). ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(last-1, "", "z"); + dbfull()->TEST_CompactRange(last-1, NULL, NULL); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); @@ -976,15 +1000,54 @@ TEST(DBTest, DeletionMarkers2) { ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-2, "", "z"); + dbfull()->TEST_CompactRange(last-2, NULL, NULL); // DEL kept: "last" file overlaps ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-1, "", "z"); + dbfull()->TEST_CompactRange(last-1, NULL, NULL); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); } +TEST(DBTest, OverlapInLevel0) { + ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config"; + + // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. + ASSERT_OK(Put("100", "v100")); + ASSERT_OK(Put("999", "v999")); + dbfull()->TEST_CompactMemTable(); + ASSERT_OK(Delete("100")); + ASSERT_OK(Delete("999")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("0,1,1", FilesPerLevel()); + + // Make files spanning the following ranges in level-0: + // files[0] 200 .. 900 + // files[1] 300 .. 500 + // Note that files are sorted by smallest key. 
+ ASSERT_OK(Put("300", "v300")); + ASSERT_OK(Put("500", "v500")); + dbfull()->TEST_CompactMemTable(); + ASSERT_OK(Put("200", "v200")); + ASSERT_OK(Put("600", "v600")); + ASSERT_OK(Put("900", "v900")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("2,1,1", FilesPerLevel()); + + // Compact away the placeholder files we created initially + dbfull()->TEST_CompactRange(1, NULL, NULL); + dbfull()->TEST_CompactRange(2, NULL, NULL); + ASSERT_EQ("2", FilesPerLevel()); + + // Do a memtable compaction. Before bug-fix, the compaction would + // not detect the overlap with level-0 files and would incorrectly place + // the deletion in a deeper level. + ASSERT_OK(Delete("600")); + dbfull()->TEST_CompactMemTable(); + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_EQ("NOT_FOUND", Get("600")); +} + TEST(DBTest, ComparatorCheck) { class NewComparator : public Comparator { public: @@ -1008,6 +1071,40 @@ TEST(DBTest, ComparatorCheck) { << s.ToString(); } +TEST(DBTest, ManualCompaction) { + ASSERT_EQ(config::kMaxMemCompactLevel, 2) + << "Need to update this test to match kMaxMemCompactLevel"; + + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); + + // Compact just the new range + Compact("b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel()); + + // Compact all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(NULL, NULL); + ASSERT_EQ("0,0,1", FilesPerLevel()); +} + TEST(DBTest, DBOpen_Options) { std::string dbname = test::TmpDir() + "/db_options_test"; DestroyDB(dbname, Options()); @@ -1187,7 +1284,6 @@ class ModelDB: public DB { delete 
reinterpret_cast(snapshot); } virtual Status Write(const WriteOptions& options, WriteBatch* batch) { - assert(options.post_write_snapshot == NULL); // Not supported class Handler : public WriteBatch::Handler { public: KVMap* map_; @@ -1211,6 +1307,9 @@ class ModelDB: public DB { sizes[i] = 0; } } + virtual void CompactRange(const Slice* start, const Slice* end) { + } + private: class ModelIter: public Iterator { public: diff --git a/db/dbformat.cc b/db/dbformat.cc index af2e0776e2..4fb3531ada 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -31,6 +31,18 @@ std::string ParsedInternalKey::DebugString() const { return result; } +std::string InternalKey::DebugString() const { + std::string result; + ParsedInternalKey parsed; + if (ParseInternalKey(rep_, &parsed)) { + result = parsed.DebugString(); + } else { + result = "(bad)"; + result.append(EscapeString(rep_)); + } + return result; +} + const char* InternalKeyComparator::Name() const { return "leveldb.InternalKeyComparator"; } diff --git a/db/dbformat.h b/db/dbformat.h index 7344cbfb94..d046990f35 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -149,6 +149,8 @@ class InternalKey { } void Clear() { rep_.clear(); } + + std::string DebugString() const; }; inline int InternalKeyComparator::Compare( diff --git a/db/version_edit.cc b/db/version_edit.cc index f6b9e9c606..9891c32361 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -235,9 +235,8 @@ std::string VersionEdit::DebugString() const { for (size_t i = 0; i < compact_pointers_.size(); i++) { r.append("\n CompactPointer: "); AppendNumberTo(&r, compact_pointers_[i].first); - r.append(" '"); - AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode()); - r.append("'"); + r.append(" "); + r.append(compact_pointers_[i].second.DebugString()); } for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); @@ -255,11 +254,10 @@ std::string VersionEdit::DebugString() const { AppendNumberTo(&r, f.number); r.append(" "); 
AppendNumberTo(&r, f.file_size); - r.append(" '"); - AppendEscapedStringTo(&r, f.smallest.Encode()); - r.append("' .. '"); - AppendEscapedStringTo(&r, f.largest.Encode()); - r.append("'"); + r.append(" "); + r.append(f.smallest.DebugString()); + r.append(" .. "); + r.append(f.largest.DebugString()); } r.append("\n}\n"); return r; diff --git a/db/version_set.cc b/db/version_set.cc index d75b34771f..8b96af0037 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -41,6 +41,14 @@ static uint64_t MaxFileSizeForLevel(int level) { return kTargetFileSize; // We could vary per level to reduce number of files? } +static int64_t TotalFileSize(const std::vector& files) { + int64_t sum = 0; + for (size_t i = 0; i < files.size(); i++) { + sum += files[i]->file_size; + } + return sum; +} + namespace { std::string IntSetToString(const std::set& s) { std::string result = "{"; @@ -96,17 +104,55 @@ int FindFile(const InternalKeyComparator& icmp, return right; } +static bool AfterFile(const Comparator* ucmp, + const Slice* user_key, const FileMetaData* f) { + // NULL user_key occurs before all keys and is therefore never after *f + return (user_key != NULL && + ucmp->Compare(*user_key, f->largest.user_key()) > 0); +} + +static bool BeforeFile(const Comparator* ucmp, + const Slice* user_key, const FileMetaData* f) { + // NULL user_key occurs after all keys and is therefore never before *f + return (user_key != NULL && + ucmp->Compare(*user_key, f->smallest.user_key()) < 0); +} + bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, + bool disjoint_sorted_files, const std::vector& files, - const Slice& smallest_user_key, - const Slice& largest_user_key) { - // Find the earliest possible internal key for smallest_user_key - InternalKey small(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); - const uint32_t index = FindFile(icmp, files, small.Encode()); - return ((index < files.size()) && - icmp.user_comparator()->Compare( - largest_user_key, 
files[index]->smallest.user_key()) >= 0); + const Slice* smallest_user_key, + const Slice* largest_user_key) { + const Comparator* ucmp = icmp.user_comparator(); + if (!disjoint_sorted_files) { + // Need to check against all files + for (int i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + if (AfterFile(ucmp, smallest_user_key, f) || + BeforeFile(ucmp, largest_user_key, f)) { + // No overlap + } else { + return true; // Overlap + } + } + return false; + } + + // Binary search over file list + uint32_t index = 0; + if (smallest_user_key != NULL) { + // Find the earliest possible internal key for smallest_user_key + InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek); + index = FindFile(icmp, files, small.Encode()); + } + + if (index >= files.size()) { + // beginning of range is after all files, so no overlap. + return false; + } + + return !BeforeFile(ucmp, largest_user_key, files[index]); } // An internal iterator. For a given version/level pair, yields @@ -358,11 +404,64 @@ void Version::Unref() { } bool Version::OverlapInLevel(int level, - const Slice& smallest_user_key, - const Slice& largest_user_key) { - return SomeFileOverlapsRange(vset_->icmp_, files_[level], - smallest_user_key, - largest_user_key); + const Slice* smallest_user_key, + const Slice* largest_user_key) { + return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level], + smallest_user_key, largest_user_key); +} + +int Version::PickLevelForMemTableOutput( + const Slice& smallest_user_key, + const Slice& largest_user_key) { + int level = 0; + if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { + // Push to next level if there is no overlap in next level, + // and the #bytes overlapping in the level after that are limited. 
+ InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey limit(largest_user_key, 0, static_cast(0)); + std::vector overlaps; + while (level < config::kMaxMemCompactLevel) { + if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { + break; + } + GetOverlappingInputs(level + 2, &start, &limit, &overlaps); + const int64_t sum = TotalFileSize(overlaps); + if (sum > kMaxGrandParentOverlapBytes) { + break; + } + level++; + } + } + return level; +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +void Version::GetOverlappingInputs( + int level, + const InternalKey* begin, + const InternalKey* end, + std::vector* inputs) { + inputs->clear(); + Slice user_begin, user_end; + if (begin != NULL) { + user_begin = begin->user_key(); + } + if (end != NULL) { + user_end = end->user_key(); + } + const Comparator* user_cmp = vset_->icmp_.user_comparator(); + for (size_t i = 0; i < files_[level].size(); i++) { + FileMetaData* f = files_[level][i]; + if (begin != NULL && + user_cmp->Compare(f->largest.user_key(), user_begin) < 0) { + // "f" is completely before specified range; skip it + } else if (end != NULL && + user_cmp->Compare(f->smallest.user_key(), user_end) > 0) { + // "f" is completely after specified range; skip it + } else { + inputs->push_back(f); + } + } } std::string Version::DebugString() const { @@ -381,11 +480,11 @@ std::string Version::DebugString() const { AppendNumberTo(&r, files[i]->number); r.push_back(':'); AppendNumberTo(&r, files[i]->file_size); - r.append("['"); - AppendEscapedStringTo(&r, files[i]->smallest.Encode()); - r.append("' .. '"); - AppendEscapedStringTo(&r, files[i]->largest.Encode()); - r.append("']\n"); + r.append("["); + r.append(files[i]->smallest.DebugString()); + r.append(" .. 
"); + r.append(files[i]->largest.DebugString()); + r.append("]\n"); } } return r; @@ -540,8 +639,8 @@ class VersionSet::Builder { const InternalKey& this_begin = v->files_[level][i]->smallest; if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) { fprintf(stderr, "overlapping ranges in same level %s vs. %s\n", - EscapeString(prev_end.Encode()).c_str(), - EscapeString(this_begin.Encode()).c_str()); + prev_end.DebugString().c_str(), + this_begin.DebugString().c_str()); abort(); } } @@ -814,14 +913,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { } } -static int64_t TotalFileSize(const std::vector& files) { - int64_t sum = 0; - for (size_t i = 0; i < files.size(); i++) { - sum += files[i]->file_size; - } - return sum; -} - void VersionSet::Finalize(Version* v) { // Precomputed best level for next compaction int best_level = -1; @@ -967,7 +1058,8 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() { for (int level = 1; level < config::kNumLevels - 1; level++) { for (size_t i = 0; i < current_->files_[level].size(); i++) { const FileMetaData* f = current_->files_[level][i]; - GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps); + current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest, + &overlaps); const int64_t sum = TotalFileSize(overlaps); if (sum > result) { result = sum; @@ -977,27 +1069,6 @@ int64_t VersionSet::MaxNextLevelOverlappingBytes() { return result; } -// Store in "*inputs" all files in "level" that overlap [begin,end] -void VersionSet::GetOverlappingInputs( - int level, - const InternalKey& begin, - const InternalKey& end, - std::vector* inputs) { - inputs->clear(); - Slice user_begin = begin.user_key(); - Slice user_end = end.user_key(); - const Comparator* user_cmp = icmp_.user_comparator(); - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (user_cmp->Compare(f->largest.user_key(), user_begin) < 0 || - user_cmp->Compare(f->smallest.user_key(), 
user_end) > 0) { - // Either completely before or after range; skip it - } else { - inputs->push_back(f); - } - } -} - // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. // REQUIRES: inputs is not empty @@ -1113,7 +1184,7 @@ Compaction* VersionSet::PickCompaction() { // Note that the next call will discard the file we placed in // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. - GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]); + current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); assert(!c->inputs_[0].empty()); } @@ -1127,7 +1198,7 @@ void VersionSet::SetupOtherInputs(Compaction* c) { InternalKey smallest, largest; GetRange(c->inputs_[0], &smallest, &largest); - GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]); + current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]); // Get entire range covered by compaction InternalKey all_start, all_limit; @@ -1137,12 +1208,13 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // changing the number of "level+1" files we pick up. 
if (!c->inputs_[1].empty()) { std::vector expanded0; - GetOverlappingInputs(level, all_start, all_limit, &expanded0); + current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0); if (expanded0.size() > c->inputs_[0].size()) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector expanded1; - GetOverlappingInputs(level+1, new_start, new_limit, &expanded1); + current_->GetOverlappingInputs(level+1, &new_start, &new_limit, + &expanded1); if (expanded1.size() == c->inputs_[1].size()) { Log(options_->info_log, "Expanding@%d %d+%d to %d+%d\n", @@ -1163,14 +1235,15 @@ void VersionSet::SetupOtherInputs(Compaction* c) { // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) if (level + 2 < config::kNumLevels) { - GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_); + current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); } if (false) { Log(options_->info_log, "Compacting %d '%s' .. '%s'", level, - EscapeString(smallest.Encode()).c_str(), - EscapeString(largest.Encode()).c_str()); + smallest.DebugString().c_str(), + largest.DebugString().c_str()); } // Update the place where we will do the next compaction for this level. @@ -1183,14 +1256,26 @@ void VersionSet::SetupOtherInputs(Compaction* c) { Compaction* VersionSet::CompactRange( int level, - const InternalKey& begin, - const InternalKey& end) { + const InternalKey* begin, + const InternalKey* end) { std::vector inputs; - GetOverlappingInputs(level, begin, end, &inputs); + current_->GetOverlappingInputs(level, begin, end, &inputs); if (inputs.empty()) { return NULL; } + // Avoid compacting too much in one shot in case the range is large. 
+ const uint64_t limit = MaxFileSizeForLevel(level); + uint64_t total = 0; + for (int i = 0; i < inputs.size(); i++) { + uint64_t s = inputs[i]->file_size; + total += s; + if (total >= limit) { + inputs.resize(i + 1); + break; + } + } + Compaction* c = new Compaction(level); c->input_version_ = current_; c->input_version_->Ref(); diff --git a/db/version_set.h b/db/version_set.h index 2dbd9480cd..b866b2a1c7 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -43,12 +43,17 @@ extern int FindFile(const InternalKeyComparator& icmp, const Slice& key); // Returns true iff some file in "files" overlaps the user key range -// [smallest,largest]. +// [*smallest,*largest]. +// smallest==NULL represents a key smaller than all keys in the DB. +// largest==NULL represents a key largest than all keys in the DB. +// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges +// in sorted order. extern bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, + bool disjoint_sorted_files, const std::vector& files, - const Slice& smallest_user_key, - const Slice& largest_user_key); + const Slice* smallest_user_key, + const Slice* largest_user_key); class Version { public: @@ -77,11 +82,24 @@ class Version { void Ref(); void Unref(); + void GetOverlappingInputs( + int level, + const InternalKey* begin, // NULL means before all keys + const InternalKey* end, // NULL means after all keys + std::vector* inputs); + // Returns true iff some file in the specified level overlaps - // some part of [smallest_user_key,largest_user_key]. + // some part of [*smallest_user_key,*largest_user_key]. + // smallest_user_key==NULL represents a key smaller than all keys in the DB. + // largest_user_key==NULL represents a key largest than all keys in the DB. 
bool OverlapInLevel(int level, - const Slice& smallest_user_key, - const Slice& largest_user_key); + const Slice* smallest_user_key, + const Slice* largest_user_key); + + // Return the level at which we should place a new memtable compaction + // result that covers the range [smallest_user_key,largest_user_key]. + int PickLevelForMemTableOutput(const Slice& smallest_user_key, + const Slice& largest_user_key); int NumFiles(int level) const { return files_[level].size(); } @@ -192,8 +210,8 @@ class VersionSet { // the result. Compaction* CompactRange( int level, - const InternalKey& begin, - const InternalKey& end); + const InternalKey* begin, + const InternalKey* end); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. @@ -232,12 +250,6 @@ class VersionSet { void Finalize(Version* v); - void GetOverlappingInputs( - int level, - const InternalKey& begin, - const InternalKey& end, - std::vector* inputs); - void GetRange(const std::vector& inputs, InternalKey* smallest, InternalKey* largest); diff --git a/db/version_set_test.cc b/db/version_set_test.cc index ecfd62bf98..06f8bbd40f 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -12,6 +12,9 @@ namespace leveldb { class FindFileTest { public: std::vector files_; + bool disjoint_sorted_files_; + + FindFileTest() : disjoint_sorted_files_(true) { } ~FindFileTest() { for (int i = 0; i < files_.size(); i++) { @@ -37,13 +40,20 @@ class FindFileTest { bool Overlaps(const char* smallest, const char* largest) { InternalKeyComparator cmp(BytewiseComparator()); - return SomeFileOverlapsRange(cmp, files_, smallest, largest); + Slice s(smallest != NULL ? smallest : ""); + Slice l(largest != NULL ? largest : ""); + return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_, + (smallest != NULL ? &s : NULL), + (largest != NULL ? &l : NULL)); } }; TEST(FindFileTest, Empty) { ASSERT_EQ(0, Find("foo")); ASSERT_TRUE(! Overlaps("a", "z")); + ASSERT_TRUE(! 
Overlaps(NULL, "z")); + ASSERT_TRUE(! Overlaps("a", NULL)); + ASSERT_TRUE(! Overlaps(NULL, NULL)); } TEST(FindFileTest, Single) { @@ -67,6 +77,13 @@ TEST(FindFileTest, Single) { ASSERT_TRUE(Overlaps("p1", "z")); ASSERT_TRUE(Overlaps("q", "q")); ASSERT_TRUE(Overlaps("q", "q1")); + + ASSERT_TRUE(! Overlaps(NULL, "j")); + ASSERT_TRUE(! Overlaps("r", NULL)); + ASSERT_TRUE(Overlaps(NULL, "p")); + ASSERT_TRUE(Overlaps(NULL, "p1")); + ASSERT_TRUE(Overlaps("q", NULL)); + ASSERT_TRUE(Overlaps(NULL, NULL)); } @@ -108,6 +125,26 @@ TEST(FindFileTest, Multiple) { ASSERT_TRUE(Overlaps("450", "500")); } +TEST(FindFileTest, MultipleNullBoundaries) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_TRUE(! Overlaps(NULL, "149")); + ASSERT_TRUE(! Overlaps("451", NULL)); + ASSERT_TRUE(Overlaps(NULL, NULL)); + ASSERT_TRUE(Overlaps(NULL, "150")); + ASSERT_TRUE(Overlaps(NULL, "199")); + ASSERT_TRUE(Overlaps(NULL, "200")); + ASSERT_TRUE(Overlaps(NULL, "201")); + ASSERT_TRUE(Overlaps(NULL, "400")); + ASSERT_TRUE(Overlaps(NULL, "800")); + ASSERT_TRUE(Overlaps("100", NULL)); + ASSERT_TRUE(Overlaps("200", NULL)); + ASSERT_TRUE(Overlaps("449", NULL)); + ASSERT_TRUE(Overlaps("450", NULL)); +} + TEST(FindFileTest, OverlapSequenceChecks) { Add("200", "200", 5000, 3000); ASSERT_TRUE(! Overlaps("199", "199")); @@ -117,6 +154,24 @@ TEST(FindFileTest, OverlapSequenceChecks) { ASSERT_TRUE(Overlaps("200", "210")); } +TEST(FindFileTest, OverlappingFiles) { + Add("150", "600"); + Add("400", "500"); + disjoint_sorted_files_ = false; + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! 
Overlaps("601", "700")); + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); + ASSERT_TRUE(Overlaps("450", "700")); + ASSERT_TRUE(Overlaps("600", "700")); +} + } int main(int argc, char** argv) { diff --git a/doc/index.html b/doc/index.html index 8d03c45d91..472f7cd709 100644 --- a/doc/index.html +++ b/doc/index.html @@ -193,7 +193,7 @@

Snapshots

If ReadOptions::snapshot is NULL, the read will operate on an implicit snapshot of the current state.

-Snapshots typically are created by the DB::GetSnapshot() method: +Snapshots are created by the DB::GetSnapshot() method:

   leveldb::ReadOptions options;
@@ -208,26 +208,6 @@ 

Snapshots

using the DB::ReleaseSnapshot interface. This allows the implementation to get rid of state that was being maintained just to support reading as of that snapshot. -

-A Write operation can also return a snapshot that -represents the state of the database just after applying a particular -set of updates: -

-

-  leveldb::Snapshot* snapshot;
-  leveldb::WriteOptions write_options;
-  write_options.post_write_snapshot = &snapshot;
-  leveldb::Status status = db->Write(write_options, ...);
-  ... perform other mutations to db ...
-
-  leveldb::ReadOptions read_options;
-  read_options.snapshot = snapshot;
-  leveldb::Iterator* iter = db->NewIterator(read_options);
-  ... read as of the state just after the Write call returned ...
-  delete iter;
-
-  db->ReleaseSnapshot(snapshot);
-

Slice

The return value of the it->key() and it->value() calls above diff --git a/include/leveldb/db.h b/include/leveldb/db.h index f945dd71d4..7fb2965f7a 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -112,6 +112,8 @@ class DB { // where <N> is an ASCII representation of a level number (e.g. "0"). // "leveldb.stats" - returns a multi-line string that describes statistics // about the internal operation of the DB. + // "leveldb.sstables" - returns a multi-line string that describes all + // of the sstables that make up the db contents. virtual bool GetProperty(const Slice& property, std::string* value) = 0; // For each i in [0,n-1], store in "sizes[i]", the approximate @@ -125,8 +127,17 @@ class DB { virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) = 0; - // Possible extensions: - // (1) Add a method to compact a range of keys + // Compact the underlying storage for the key range [*begin,*end]. + // In particular, deleted and overwritten versions are discarded, + // and the data is rearranged to reduce the cost of operations + // needed to access the data. This operation should typically only + // be invoked by users who understand the underlying implementation. + // + // begin==NULL is treated as a key before all keys in the database. + // end==NULL is treated as a key after all keys in the database. + // Therefore the following call will compact the entire database: + // db->CompactRange(NULL, NULL); + virtual void CompactRange(const Slice* begin, const Slice* end) = 0; private: // No copying allowed diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 1a8ff6bc32..a39d66f34f 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -160,6 +160,8 @@ class SequentialFile { // Read up to "n" bytes from the file. "scratch[0..n-1]" may be // written by this routine. Sets "*result" to the data that was // read (including if fewer than "n" bytes were successfully read). 
+ // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // // REQUIRES: External synchronization @@ -184,8 +186,10 @@ class RandomAccessFile { // Read up to "n" bytes from the file starting at "offset". // "scratch[0..n-1]" may be written by this routine. Sets "*result" // to the data that was read (including if fewer than "n" bytes were - // successfully read). If an error was encountered, returns a - // non-OK status. + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. // // Safe for concurrent use by multiple threads. virtual Status Read(uint64_t offset, size_t n, Slice* result, diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 381f22891e..84ac7fca22 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -177,21 +177,8 @@ struct WriteOptions { // Default: false bool sync; - // If "post_write_snapshot" is non-NULL, and the write succeeds, - // *post_write_snapshot will be modified to point to a snapshot of - // the DB state immediately after this write. The caller must call - // DB::ReleaseSnapshot(*post_write_snapshotsnapshot) when the - // snapshot is no longer needed. - // - // If "post_write_snapshot" is non-NULL, and the write fails, - // *post_write_snapshot will be set to NULL. 
- // - // Default: NULL - const Snapshot** post_write_snapshot; - WriteOptions() - : sync(false), - post_write_snapshot(NULL) { + : sync(false) { } }; diff --git a/util/coding.h b/util/coding.h index 87559684aa..c47b9d8ddf 100644 --- a/util/coding.h +++ b/util/coding.h @@ -62,10 +62,10 @@ inline uint32_t DecodeFixed32(const char* ptr) { memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load return result; } else { - return ((static_cast(ptr[0])) - | (static_cast(ptr[1]) << 8) - | (static_cast(ptr[2]) << 16) - | (static_cast(ptr[3]) << 24)); + return ((static_cast(static_cast(ptr[0]))) + | (static_cast(static_cast(ptr[1])) << 8) + | (static_cast(static_cast(ptr[2])) << 16) + | (static_cast(static_cast(ptr[3])) << 24)); } } diff --git a/util/posix_logger.h b/util/posix_logger.h index 0dbdeaa3a5..55428e52c5 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -1,5 +1,6 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// Author: sanjay@google.com (Sanjay Ghemawat) +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. // // Logger implementation that can be shared by all environments // where enough posix functionality is available.