Skip to content

Commit

Permalink
[clangd] Introduce PostingList interface
Browse files Browse the repository at this point in the history
This patch abstracts `PostingList` interface and reuses existing
implementation. It will be used later to test different `PostingList`
representations.

No functionality change is introduced, this patch is mostly refactoring
so that the following patches could focus on functionality while not
being too hard to review.

Reviewed By: sammccall, ioeric

Differential Revision: https://reviews.llvm.org/D51982

llvm-svn: 342155
  • Loading branch information
kirillbobyrev committed Sep 13, 2018
1 parent f29b36c commit 249c586
Show file tree
Hide file tree
Showing 8 changed files with 216 additions and 161 deletions.
1 change: 1 addition & 0 deletions clang-tools-extra/clangd/CMakeLists.txt
Expand Up @@ -48,6 +48,7 @@ add_clang_library(clangDaemon

index/dex/Dex.cpp
index/dex/Iterator.cpp
index/dex/PostingList.cpp
index/dex/Trigram.cpp

LINK_LIBS
Expand Down
18 changes: 12 additions & 6 deletions clang-tools-extra/clangd/index/dex/Dex.cpp
Expand Up @@ -82,7 +82,7 @@ std::vector<std::unique_ptr<Iterator>> createFileProximityIterators(
// FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
PathProximitySignals.SymbolURI = ParentURI;
BoostingIterators.push_back(
createBoost(create(It->second), PathProximitySignals.evaluate()));
createBoost(It->second.iterator(), PathProximitySignals.evaluate()));
}
}
return BoostingIterators;
Expand Down Expand Up @@ -112,13 +112,19 @@ void Dex::buildIndex() {
Symbols[I] = ScoredSymbols[I].second;
}

// Populate TempInvertedIndex with posting lists for index symbols.
// Populate TempInvertedIndex with lists for index symbols.
llvm::DenseMap<Token, std::vector<DocID>> TempInvertedIndex;
for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
const auto *Sym = Symbols[SymbolRank];
for (const auto &Token : generateSearchTokens(*Sym))
InvertedIndex[Token].push_back(SymbolRank);
TempInvertedIndex[Token].push_back(SymbolRank);
}

// Convert lists of items to posting lists.
for (const auto &TokenToPostingList : TempInvertedIndex)
InvertedIndex.insert({TokenToPostingList.first,
PostingList(move(TokenToPostingList.second))});

vlog("Built Dex with estimated memory usage {0} bytes.",
estimateMemoryUsage());
}
Expand All @@ -142,7 +148,7 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
for (const auto &Trigram : TrigramTokens) {
const auto It = InvertedIndex.find(Trigram);
if (It != InvertedIndex.end())
TrigramIterators.push_back(create(It->second));
TrigramIterators.push_back(It->second.iterator());
}
if (!TrigramIterators.empty())
TopLevelChildren.push_back(createAnd(move(TrigramIterators)));
Expand All @@ -152,7 +158,7 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
for (const auto &Scope : Req.Scopes) {
const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope));
if (It != InvertedIndex.end())
ScopeIterators.push_back(create(It->second));
ScopeIterators.push_back(It->second.iterator());
}
// Add OR iterator for scopes if there are any Scope Iterators.
if (!ScopeIterators.empty())
Expand Down Expand Up @@ -233,7 +239,7 @@ size_t Dex::estimateMemoryUsage() const {
Bytes += LookupTable.getMemorySize();
Bytes += InvertedIndex.getMemorySize();
for (const auto &P : InvertedIndex)
Bytes += P.second.size() * sizeof(DocID);
Bytes += P.second.bytes();
return Bytes + BackingDataSize;
}

Expand Down
1 change: 1 addition & 0 deletions clang-tools-extra/clangd/index/dex/Dex.h
Expand Up @@ -21,6 +21,7 @@
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEX_H

#include "Iterator.h"
#include "PostingList.h"
#include "Token.h"
#include "Trigram.h"
#include "index/Index.h"
Expand Down
67 changes: 0 additions & 67 deletions clang-tools-extra/clangd/index/dex/Iterator.cpp
Expand Up @@ -18,69 +18,6 @@ namespace dex {

namespace {

/// Implements Iterator over a PostingList. DocumentIterator is the most basic
/// iterator: it doesn't have any children (hence it is the leaf of iterator
/// tree) and is simply a wrapper around PostingList::const_iterator.
class DocumentIterator : public Iterator {
public:
explicit DocumentIterator(PostingListRef Documents)
: Documents(Documents), Index(std::begin(Documents)) {}

bool reachedEnd() const override { return Index == std::end(Documents); }

/// Advances cursor to the next item.
void advance() override {
assert(!reachedEnd() && "DOCUMENT iterator can't advance() at the end.");
++Index;
}

/// Applies binary search to advance cursor to the next item with DocID equal
/// or higher than the given one.
void advanceTo(DocID ID) override {
assert(!reachedEnd() && "DOCUMENT iterator can't advance() at the end.");
// If current ID is beyond requested one, iterator is already in the right
// state.
if (peek() < ID)
Index = std::lower_bound(Index, std::end(Documents), ID);
}

DocID peek() const override {
assert(!reachedEnd() && "DOCUMENT iterator can't peek() at the end.");
return *Index;
}

float consume() override {
assert(!reachedEnd() && "DOCUMENT iterator can't consume() at the end.");
return DEFAULT_BOOST_SCORE;
}

size_t estimateSize() const override { return Documents.size(); }

private:
llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
OS << '[';
auto Separator = "";
for (auto It = std::begin(Documents); It != std::end(Documents); ++It) {
OS << Separator;
if (It == Index)
OS << '{' << *It << '}';
else
OS << *It;
Separator = ", ";
}
OS << Separator;
if (Index == std::end(Documents))
OS << "{END}";
else
OS << "END";
OS << ']';
return OS;
}

PostingListRef Documents;
PostingListRef::const_iterator Index;
};

/// Implements Iterator over the intersection of other iterators.
///
/// AndIterator iterates through common items among all children. It becomes
Expand Down Expand Up @@ -399,10 +336,6 @@ std::vector<std::pair<DocID, float>> consume(Iterator &It) {
return Result;
}

std::unique_ptr<Iterator> create(PostingListRef Documents) {
return llvm::make_unique<DocumentIterator>(Documents);
}

std::unique_ptr<Iterator>
createAnd(std::vector<std::unique_ptr<Iterator>> Children) {
// If there is exactly one child, pull it one level up: AND(Child) -> Child.
Expand Down
19 changes: 0 additions & 19 deletions clang-tools-extra/clangd/index/dex/Iterator.h
Expand Up @@ -44,20 +44,6 @@ namespace dex {
/// Symbol position in the list of all index symbols sorted by a pre-computed
/// symbol quality.
using DocID = uint32_t;
/// Contains sorted sequence of DocIDs all of which belong to symbols matching
/// certain criteria, i.e. containing a Search Token. PostingLists are values
/// for the inverted index.
// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are
// likely to be very dense and hence require special attention so that the index
// doesn't use too much memory. Possible solution would be to construct
// compressed posting lists which consist of ranges of DocIDs instead of
// distinct DocIDs. A special case would be the empty query: for that case
// TrueIterator should be implemented - an iterator which doesn't actually store
// any PostingList within itself, but "contains" all DocIDs in range
// [0, IndexSize).
using PostingList = std::vector<DocID>;
/// Immutable reference to PostingList object.
using PostingListRef = llvm::ArrayRef<DocID>;

/// Iterator is the interface for Query Tree node. The simplest type of Iterator
/// is DocumentIterator which is simply a wrapper around PostingList iterator
Expand Down Expand Up @@ -131,11 +117,6 @@ class Iterator {
/// to acquire preliminary scores of requested items.
std::vector<std::pair<DocID, float>> consume(Iterator &It);

/// Returns a document iterator over given PostingList.
///
/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item.
std::unique_ptr<Iterator> create(PostingListRef Documents);

/// Returns AND Iterator which performs the intersection of the PostingLists of
/// its children.
///
Expand Down
84 changes: 84 additions & 0 deletions clang-tools-extra/clangd/index/dex/PostingList.cpp
@@ -0,0 +1,84 @@
//===--- PostingList.cpp - Symbol identifiers storage interface -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PostingList.h"
#include "Iterator.h"

namespace clang {
namespace clangd {
namespace dex {

namespace {

/// Implements Iterator over std::vector<DocID>. This is the most basic
/// iterator and is simply a wrapper around
/// std::vector<DocID>::const_iterator.
class PlainIterator : public Iterator {
public:
explicit PlainIterator(llvm::ArrayRef<DocID> Documents)
: Documents(Documents), Index(std::begin(Documents)) {}

bool reachedEnd() const override { return Index == std::end(Documents); }

/// Advances cursor to the next item.
void advance() override {
assert(!reachedEnd() &&
"Posting List iterator can't advance() at the end.");
++Index;
}

/// Applies binary search to advance cursor to the next item with DocID
/// equal or higher than the given one.
void advanceTo(DocID ID) override {
assert(!reachedEnd() &&
"Posting List iterator can't advance() at the end.");
// If current ID is beyond requested one, iterator is already in the right
// state.
if (peek() < ID)
Index = std::lower_bound(Index, std::end(Documents), ID);
}

DocID peek() const override {
assert(!reachedEnd() &&
"Posting List iterator can't peek() at the end.");
return *Index;
}

float consume() override {
assert(!reachedEnd() &&
"Posting List iterator can't consume() at the end.");
return DEFAULT_BOOST_SCORE;
}

size_t estimateSize() const override { return Documents.size(); }

private:
llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
OS << '[';
if (Index != std::end(Documents))
OS << *Index;
else
OS << "END";
OS << ']';
return OS;
}

llvm::ArrayRef<DocID> Documents;
llvm::ArrayRef<DocID>::const_iterator Index;
};

} // namespace

std::unique_ptr<Iterator> PostingList::iterator() const {
return llvm::make_unique<PlainIterator>(Documents);
}

} // namespace dex
} // namespace clangd
} // namespace clang
52 changes: 52 additions & 0 deletions clang-tools-extra/clangd/index/dex/PostingList.h
@@ -0,0 +1,52 @@
//===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This defines posting list interface: a storage for identifiers of symbols
// which can be characterized by a specific feature (such as fuzzy-find trigram,
// scope, type or any other Search Token). Posting lists can be traversed in
// order using an iterator and are values for inverted index, which maps search
// tokens to corresponding posting lists.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H

#include "Iterator.h"
#include "llvm/ADT/ArrayRef.h"
#include <cstdint>
#include <vector>

namespace clang {
namespace clangd {
namespace dex {

class Iterator;

/// PostingList is the storage of DocIDs which can be inserted to the Query
/// Tree as a leaf by constructing Iterator over the PostingList object.
// FIXME(kbobyrev): Use VByte algorithm to compress underlying data.
class PostingList {
public:
explicit PostingList(const std::vector<DocID> &&Documents)
: Documents(std::move(Documents)) {}

std::unique_ptr<Iterator> iterator() const;

size_t bytes() const { return Documents.size() * sizeof(DocID); }

private:
const std::vector<DocID> Documents;
};

} // namespace dex
} // namespace clangd
} // namespace clang

#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H

0 comments on commit 249c586

Please sign in to comment.