Skip to content

Commit

Permalink
[clangd] Don't create as much garbage while building Dex index.
Browse files Browse the repository at this point in the history
Summary:
The Token objects are relatively expensive and we were spending a lot of
CPU creating them for each trigram emitted. Instead, use a tiny trigram
structure until we're ready to finalize the index.

This improves the new BuildDex benchmark by 20%. This code is hot and on
the critical path in clangd: it runs after a new preamble is built.

Reviewers: kbobyrev

Subscribers: ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D79918
  • Loading branch information
sam-mccall committed May 14, 2020
1 parent dac6e9c commit 735ab46
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 68 deletions.
6 changes: 6 additions & 0 deletions clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp
Expand Up @@ -83,6 +83,12 @@ static void DexQueries(benchmark::State &State) {
}
BENCHMARK(DexQueries);

static void DexBuild(benchmark::State &State) {
for (auto _ : State)
buildDex();
}
BENCHMARK(DexBuild);

} // namespace
} // namespace clangd
} // namespace clang
Expand Down
93 changes: 59 additions & 34 deletions clang-tools-extra/clangd/index/dex/Dex.cpp
Expand Up @@ -39,28 +39,60 @@ namespace {
const Token RestrictedForCodeCompletion =
Token(Token::Kind::Sentinel, "Restricted For Code Completion");

// Returns the tokens which are given symbol's characteristics. Currently, the
// generated tokens only contain fuzzy matching trigrams and symbol's scope,
// but in the future this will also return path proximity tokens and other
// types of tokens such as symbol type (if applicable).
// Returns the tokens which are given symbols's characteristics. For example,
// trigrams and scopes.
// FIXME(kbobyrev): Support more token types:
// * Namespace proximity
std::vector<Token> generateSearchTokens(const Symbol &Sym) {
std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
Result.emplace_back(Token::Kind::Scope, Sym.Scope);
// Skip token generation for symbols with unknown declaration location.
if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
for (const auto &ProximityURI :
generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
Result.emplace_back(Token::Kind::ProximityURI, ProximityURI);
if (Sym.Flags & Symbol::IndexedForCodeCompletion)
Result.emplace_back(RestrictedForCodeCompletion);
if (!Sym.Type.empty())
Result.emplace_back(Token::Kind::Type, Sym.Type);
return Result;
}
// Helper to efficiently assemble the inverse index (token -> matching docs).
// The output is a nice uniform structure keyed on Token, but constructing
// the Token object every time we want to insert into the map is wasteful.
// Instead we have various maps keyed on things that are cheap to compute,
// and produce the Token keys once at the end.
class IndexBuilder {
llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
std::vector<DocID> RestrictedCCDocs;
llvm::StringMap<std::vector<DocID>> TypeDocs;
llvm::StringMap<std::vector<DocID>> ScopeDocs;
llvm::StringMap<std::vector<DocID>> ProximityDocs;
std::vector<Trigram> TrigramScratch;

public:
// Add the tokens which are given symbol's characteristics.
// This includes fuzzy matching trigrams, symbol's scope, etc.
// FIXME(kbobyrev): Support more token types:
// * Namespace proximity
void add(const Symbol &Sym, DocID D) {
generateIdentifierTrigrams(Sym.Name, TrigramScratch);
for (Trigram T : TrigramScratch)
TrigramDocs[T].push_back(D);
ScopeDocs[Sym.Scope].push_back(D);
if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
for (const auto &ProximityURI :
generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
ProximityDocs[ProximityURI].push_back(D);
if (Sym.Flags & Symbol::IndexedForCodeCompletion)
RestrictedCCDocs.push_back(D);
if (!Sym.Type.empty())
TypeDocs[Sym.Type].push_back(D);
}

// Assemble the final compressed posting lists for the added symbols.
llvm::DenseMap<Token, PostingList> build() {
llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
TrigramDocs.size() +
RestrictedCCDocs.size() +
TypeDocs.size() +
ScopeDocs.size() +
ProximityDocs.size());
for (const auto &E : TrigramDocs)
Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
for (const auto &E : TypeDocs)
Result.try_emplace(Token(Token::Kind::Type, E.first()), E.second);
for (const auto &E : ScopeDocs)
Result.try_emplace(Token(Token::Kind::Scope, E.first()), E.second);
for (const auto &E : ProximityDocs)
Result.try_emplace(Token(Token::Kind::ProximityURI, E.first()), E.second);
if (!RestrictedCCDocs.empty())
Result.try_emplace(RestrictedForCodeCompletion, RestrictedCCDocs);
return Result;
}
};

} // namespace

Expand All @@ -86,18 +118,11 @@ void Dex::buildIndex() {
Symbols[I] = ScoredSymbols[I].second;
}

// Populate TempInvertedIndex with lists for index symbols.
llvm::DenseMap<Token, std::vector<DocID>> TempInvertedIndex;
for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
const auto *Sym = Symbols[SymbolRank];
for (const auto &Token : generateSearchTokens(*Sym))
TempInvertedIndex[Token].push_back(SymbolRank);
}

// Convert lists of items to posting lists.
for (const auto &TokenToPostingList : TempInvertedIndex)
InvertedIndex.insert(
{TokenToPostingList.first, PostingList(TokenToPostingList.second)});
// Build posting lists for symbols.
IndexBuilder Builder;
for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
Builder.add(*Symbols[SymbolRank], SymbolRank);
InvertedIndex = Builder.build();
}

std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
Expand Down
41 changes: 28 additions & 13 deletions clang-tools-extra/clangd/index/dex/Trigram.cpp
Expand Up @@ -11,6 +11,7 @@
#include "Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include <cctype>
#include <queue>
Expand All @@ -20,7 +21,9 @@ namespace clang {
namespace clangd {
namespace dex {

std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
// Produce trigrams (including duplicates) and pass them to Out().
template <typename Func>
static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
// Apply fuzzy matching text segmentation.
std::vector<CharRole> Roles(Identifier.size());
calculateRoles(Identifier,
Expand All @@ -46,12 +49,6 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
}
}

llvm::DenseSet<Token> UniqueTrigrams;

auto Add = [&](std::string Chars) {
UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
};

// Iterate through valid sequences of three characters Fuzzy Matcher can
// process.
for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
Expand All @@ -64,23 +61,41 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
for (const unsigned K : Next[J]) {
if (K == 0)
continue;
Add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
LowercaseIdentifier[K]}});
Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
LowercaseIdentifier[K]));
}
}
}
// Emit short-query trigrams: FooBar -> f, fo, fb.
if (!LowercaseIdentifier.empty())
Add({LowercaseIdentifier[0]});
Out(Trigram(LowercaseIdentifier[0]));
if (LowercaseIdentifier.size() >= 2)
Add({LowercaseIdentifier[0], LowercaseIdentifier[1]});
Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1]));
for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
if (Roles[I] == Head) {
Add({LowercaseIdentifier[0], LowercaseIdentifier[I]});
Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I]));
break;
}
}

return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
void generateIdentifierTrigrams(llvm::StringRef Identifier,
std::vector<Trigram> &Result) {
// Empirically, scanning for duplicates is faster with fewer trigrams, and
// a hashtable is faster with more. This is a hot path, so dispatch based on
// expected number of trigrams. Longer identifiers produce more trigrams.
// The magic number was tuned by running IndexBenchmark.DexBuild.
constexpr unsigned ManyTrigramsIdentifierThreshold = 14;
Result.clear();
if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
identifierTrigrams(Identifier, [&](Trigram T) {
if (!llvm::is_contained(Result, T))
Result.push_back(T);
});
} else {
identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); });
llvm::sort(Result);
Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
}
}

std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
Expand Down
53 changes: 50 additions & 3 deletions clang-tools-extra/clangd/index/dex/Trigram.h
Expand Up @@ -24,14 +24,35 @@
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H

#include "Token.h"
#include "llvm/ADT/bit.h"

#include <string>

namespace clang {
namespace clangd {
namespace dex {

/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
// Compact representation of a trigram (string with up to 3 characters).
// Trigram generation is the hot path of indexing, so Token is too wasteful.
class Trigram {
std::array<char, 4> Data; // Last element is length.
// Steal some invalid bit patterns for DenseMap sentinels.
enum class Sentinel { Tombstone = 4, Empty = 5 };
Trigram(Sentinel S) : Data{0, 0, 0, static_cast<char>(S)} {}
uint32_t id() const { return llvm::bit_cast<uint32_t>(Data); }

public:
Trigram() : Data{0, 0, 0, 0} {}
Trigram(char A) : Data{A, 0, 0, 1} {}
Trigram(char A, char B) : Data{A, B, 0, 2} {}
Trigram(char A, char B, char C) : Data{A, B, C, 3} {}
std::string str() const { return std::string(Data.data(), Data[3]); }
friend struct ::llvm::DenseMapInfo<Trigram>;
friend bool operator==(Trigram L, Trigram R) { return L.id() == R.id(); }
friend bool operator<(Trigram L, Trigram R) { return L.id() < R.id(); }
};

/// Produces list of unique fuzzy-search trigrams from unqualified symbol.
/// The trigrams give the 3-character query substrings this symbol can match.
///
/// The symbol's name is broken into segments, e.g. "FooBar" has two segments.
Expand All @@ -46,8 +67,9 @@ namespace dex {
/// {f, fo, fb, foo, fob, fba, oob, oba, bar}.
///
/// Trigrams are lowercase, as trigram matching is case-insensitive.
/// Trigrams in the returned list are deduplicated.
std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
/// Trigrams in the list are deduplicated.
void generateIdentifierTrigrams(llvm::StringRef Identifier,
std::vector<Trigram> &Out);

/// Returns list of unique fuzzy-search trigrams given a query.
///
Expand All @@ -64,4 +86,29 @@ std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
} // namespace clangd
} // namespace clang

namespace llvm {
template <> struct llvm::DenseMapInfo<clang::clangd::dex::Trigram> {
using Trigram = clang::clangd::dex::Trigram;
static inline Trigram getEmptyKey() {
return Trigram(Trigram::Sentinel::Empty);
}
static inline Trigram getTombstoneKey() {
return Trigram(Trigram::Sentinel::Tombstone);
}
static unsigned getHashValue(Trigram V) {
// Finalize step from MurmurHash3.
uint32_t X = V.id();
X ^= X >> 16;
X *= uint32_t{0x85ebca6b};
X ^= X >> 13;
X *= uint32_t{0xc2b2ae35};
X ^= X >> 16;
return X;
}
static bool isEqual(const Trigram &LHS, const Trigram &RHS) {
return LHS == RHS;
}
};
} // namespace llvm

#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
14 changes: 7 additions & 7 deletions clang-tools-extra/clangd/test/Inputs/requests.json
@@ -1,7 +1,7 @@
[{"Limit":100,"ProximityPaths":["/usr/home/user/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp"],"Query":"OMP","RestrictForCodeCompletion":true,"Scopes":["clang::"], "AnyScope":false},
{"Limit":100,"ProximityPaths":[],"Query":"s","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
{"Limit":100,"ProximityPaths":[],"Query":"sy","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false},
{"Limit":100,"ProximityPaths":[],"Query":"Dex","RestrictForCodeCompletion":true,"Scopes":["clang::clangd::", "clang::", "clang::clangd::dex::"],"AnyScope":false},
{"Limit":100,"ProximityPaths":[],"Query":"Variable","RestrictForCodeCompletion":true,"Scopes":[""], "AnyScope":false}]
[{"Limit":100,"ProximityPaths":["/usr/home/user/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp"],"Query":"OMP","RestrictForCodeCompletion":true,"Scopes":["clang::"], "AnyScope":false, "PreferredTypes":[]},
{"Limit":100,"ProximityPaths":[],"Query":"s","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
{"Limit":100,"ProximityPaths":[],"Query":"sy","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]},
{"Limit":100,"ProximityPaths":[],"Query":"Dex","RestrictForCodeCompletion":true,"Scopes":["clang::clangd::", "clang::", "clang::clangd::dex::"],"AnyScope":false, "PreferredTypes":[]},
{"Limit":100,"ProximityPaths":[],"Query":"Variable","RestrictForCodeCompletion":true,"Scopes":[""], "AnyScope":false, "PreferredTypes":[]}]
30 changes: 19 additions & 11 deletions clang-tools-extra/clangd/unittests/DexTests.cpp
Expand Up @@ -366,38 +366,46 @@ trigramsAre(std::initializer_list<std::string> Trigrams) {
return tokensAre(Trigrams, Token::Kind::Trigram);
}

std::vector<Token> identifierTrigramTokens(llvm::StringRef S) {
std::vector<Trigram> Trigrams;
generateIdentifierTrigrams(S, Trigrams);
std::vector<Token> Tokens;
for (Trigram T : Trigrams)
Tokens.emplace_back(Token::Kind::Trigram, T.str());
return Tokens;
}

TEST(DexTrigrams, IdentifierTrigrams) {
EXPECT_THAT(generateIdentifierTrigrams("X86"),
trigramsAre({"x86", "x", "x8"}));
EXPECT_THAT(identifierTrigramTokens("X86"), trigramsAre({"x86", "x", "x8"}));

EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl", "n"}));
EXPECT_THAT(identifierTrigramTokens("nl"), trigramsAre({"nl", "n"}));

EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n"}));
EXPECT_THAT(identifierTrigramTokens("n"), trigramsAre({"n"}));

EXPECT_THAT(generateIdentifierTrigrams("clangd"),
EXPECT_THAT(identifierTrigramTokens("clangd"),
trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"}));

EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
EXPECT_THAT(identifierTrigramTokens("abc_def"),
trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde",
"cde", "def"}));

EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"),
EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"),
trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"}));

EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
EXPECT_THAT(identifierTrigramTokens("unique_ptr"),
trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip",
"npt", "iqu", "iqp", "ipt", "que", "qup", "qpt",
"uep", "ept", "ptr"}));

EXPECT_THAT(
generateIdentifierTrigrams("TUDecl"),
identifierTrigramTokens("TUDecl"),
trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"}));

EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
EXPECT_THAT(identifierTrigramTokens("IsOK"),
trigramsAre({"i", "is", "io", "iso", "iok", "sok"}));

EXPECT_THAT(
generateIdentifierTrigrams("abc_defGhij__klm"),
identifierTrigramTokens("abc_defGhij__klm"),
trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd",
"bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk",
"efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl",
Expand Down

0 comments on commit 735ab46

Please sign in to comment.