Skip to content

Commit

Permalink
Implement a case-folding version of DJB hash
Browse files Browse the repository at this point in the history
Summary:
This patch implements a variant of the DJB hash function which folds the
input according to the algorithm in the Dwarf 5 specification (Section
6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18,
"Case Mappings").

To achieve this, I have added a llvm::sys::unicode::foldCharSimple
function, which performs this mapping. The implementation of this
function was generated from the CaseFolding.txt file from the Unicode
spec using a python script (which is also included in this patch). The
script tries to optimize the function by coalescing adjacent mappings
with the same shift and stride (terms I made up). Theoretically, it
could be made a bit smarter and merge adjacent blocks that were
interrupted by only one or two characters with exceptional mapping, but
this would save only a couple of branches, while it would greatly
complicate the implementation, so I deemed it was not worth it.

Since we assume that the vast majority of the input characters will be
US-ASCII, the folding hash function has a fast-path for handling these,
and only whips out the full decode+fold+encode logic if we encounter a
character outside of this range. It might be possible to implement the
folding directly on utf8 sequences, but this would also bring a lot of
complexity for the few cases where we will actually need to process
non-ascii characters.

Reviewers: JDevlieghere, aprantl, probinson, dblaikie

Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits

Differential Revision: https://reviews.llvm.org/D42740

llvm-svn: 325107
  • Loading branch information
labath committed Feb 14, 2018
1 parent eb18d77 commit f144097
Show file tree
Hide file tree
Showing 8 changed files with 1,055 additions and 1 deletion.
4 changes: 4 additions & 0 deletions llvm/include/llvm/Support/DJB.h
Expand Up @@ -20,6 +20,10 @@ namespace llvm {

/// The Bernstein hash function used by the DWARF accelerator tables.
///
/// \param Buffer the bytes to hash.
/// \param H the initial hash value; defaults to 5381.
/// \returns the hash obtained after mixing every byte of \p Buffer into \p H.
uint32_t djbHash(StringRef Buffer, uint32_t H = 5381);

/// Computes the Bernstein hash after folding the input according to the DWARF v5
/// standard case folding rules.
///
/// \param Buffer the text to fold and hash.
/// \param H the initial hash value; defaults to 5381.
/// \returns the hash of the case-folded form of \p Buffer.
uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381);
} // namespace llvm

#endif // LLVM_SUPPORT_DJB_H
4 changes: 4 additions & 0 deletions llvm/include/llvm/Support/Unicode.h
Expand Up @@ -60,6 +60,10 @@ bool isPrintable(int UCS);
/// * 1 for each of the remaining characters.
int columnWidthUTF8(StringRef Text);

/// Fold input unicode character according to the Simple unicode case folding
/// rules.
///
/// \param C the character to fold.
///        NOTE(review): presumably a Unicode code point value; the definition
///        is not visible from this declaration -- confirm.
/// \returns the folded character.
int foldCharSimple(int C);

} // namespace unicode
} // namespace sys
} // namespace llvm
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Support/CMakeLists.txt
Expand Up @@ -113,6 +113,7 @@ add_llvm_library(LLVMSupport
Triple.cpp
Twine.cpp
Unicode.cpp
UnicodeCaseFold.cpp
YAMLParser.cpp
YAMLTraits.cpp
raw_os_ostream.cpp
Expand Down
78 changes: 77 additions & 1 deletion llvm/lib/Support/DJB.cpp
Expand Up @@ -12,9 +12,85 @@
//===----------------------------------------------------------------------===//

#include "llvm/Support/DJB.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Unicode.h"

using namespace llvm;

/// Mixes a single input byte \p C into the running DJB hash \p H.
///
/// Computes H * 33 + C, with the multiplication spelled as a shift and an add.
///
/// The byte is taken as unsigned char on purpose: with a plain char parameter,
/// bytes with the high bit set (for example the bytes of a multi-byte UTF-8
/// sequence) would be sign-extended on targets where char is signed and
/// zero-extended where it is unsigned, so the same input would hash differently
/// depending on the target. Taking the byte as unsigned makes every byte
/// contribute a value in [0, 255] everywhere.
///
/// \param C the next input byte.
/// \param H the hash value accumulated so far.
/// \returns the updated hash value.
static inline uint32_t djbHashChar(unsigned char C, uint32_t H) {
  return (H << 5) + H + C;
}

/// Computes the Bernstein (DJB) hash of \p Buffer, starting from \p H.
///
/// Every byte of the buffer is mixed into the running value, in order, through
/// djbHashChar. An empty buffer leaves \p H unchanged.
///
/// \param Buffer the bytes to hash.
/// \param H the initial hash value.
/// \returns the hash value after the last byte has been mixed in.
uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) {
  // Exactly one update per byte. The former open-coded
  // `((H << 5) + H) + C` step is replaced by the shared helper so that this
  // function and the case-folding variant use the same per-byte step.
  for (char C : Buffer.bytes())
    H = djbHashChar(C, H);
  return H;
}

/// Decodes one UTF-32 value from the front of \p Buffer and removes the bytes
/// that the conversion consumed from \p Buffer.
///
/// \param Buffer the remaining input; must not be empty. On return it has been
///        advanced past the consumed bytes.
/// \returns the decoded value.
static UTF32 chopOneUTF32(StringRef &Buffer) {
  assert(!Buffer.empty());

  const UTF8 *const SrcBegin = reinterpret_cast<const UTF8 *>(Buffer.begin());
  const UTF8 *const SrcEnd = reinterpret_cast<const UTF8 *>(Buffer.end());

  // The destination is a single UTF32 slot, so at most one value is produced.
  UTF32 CodePoint;
  const UTF8 *SrcCursor = SrcBegin;
  UTF32 *DstCursor = &CodePoint;

  // Lenient conversion is used so that non-empty input always leaves a
  // "reasonable" value in CodePoint.
  ConvertUTF8toUTF32(&SrcCursor, SrcEnd, &DstCursor, &CodePoint + 1,
                     lenientConversion);

  // SrcCursor now points just past whatever the converter consumed.
  Buffer = Buffer.drop_front(SrcCursor - SrcBegin);
  return CodePoint;
}

/// Encodes \p C as UTF-8 into \p Storage.
///
/// \param C the value to encode.
/// \param Storage caller-provided scratch space that receives the bytes.
/// \returns a StringRef covering the bytes written at the start of \p Storage.
///          It refers to \p Storage, so it is only usable while that storage
///          is alive.
static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) {
  UTF8 *const OutBegin = Storage.begin();
  UTF8 *OutCursor = OutBegin;
  const UTF32 *InCursor = &C;

  // Strict mode: the case-folded output is expected to always be a valid
  // unicode character, so anything else is treated as a logic error.
  ConversionResult Result = ConvertUTF32toUTF8(
      &InCursor, &C + 1, &OutCursor, Storage.end(), strictConversion);
  assert(Result == conversionOK && "Case folding produced invalid char?");
  (void)Result; // Only read by the assert; silences unused warnings otherwise.

  return StringRef(reinterpret_cast<char *>(OutBegin), OutCursor - OutBegin);
}

/// Case-folds \p C using the simple unicode folding plus the two extra
/// mappings that DWARF v5 adds on top of it.
///
/// \param C the character to fold.
/// \returns 'i' for the two special-cased characters, otherwise the result of
///          sys::unicode::foldCharSimple.
static UTF32 foldCharDwarf(UTF32 C) {
  switch (C) {
  // DWARF v5 addition to the unicode folding rules: both of these fold to "i".
  case 0x130: // Latin Capital Letter I With Dot Above
  case 0x131: // Latin Small Letter Dotless I
    return 'i';
  default:
    return sys::unicode::foldCharSimple(C);
  }
}

/// Slow path of the case-folding hash: decodes one character from the front of
/// \p Buffer, folds it, re-encodes it as UTF-8 and hashes the resulting bytes.
///
/// \param Buffer the remaining input; advanced past the decoded character.
/// \param H the hash value accumulated so far.
/// \returns the hash value after the folded character has been mixed in.
static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
  // Decode (consuming input) and fold in one go.
  const UTF32 FoldedChar = foldCharDwarf(chopOneUTF32(Buffer));

  // Scratch space sized for the longest UTF-8 encoding of one code point.
  std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Scratch;
  return djbHash(toUTF8(FoldedChar, Scratch), H);
}

/// Computes the DJB hash of \p Buffer after case folding, starting from \p H.
///
/// Bytes in the US-ASCII range are folded and hashed inline; anything else is
/// handed to caseFoldingDjbHashCharSlow, which consumes input from the buffer
/// itself.
///
/// \param Buffer the text to fold and hash.
/// \param H the initial hash value.
/// \returns the hash of the folded input.
uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
  while (!Buffer.empty()) {
    const unsigned char Byte = Buffer.front();
    if (LLVM_LIKELY(Byte <= 0x7f)) {
      // US-ASCII is a single byte in utf-8 and by far the most common case,
      // so it is handled here without decoding. Uppercase letters are folded
      // into lowercase; every other ASCII byte is hashed as-is.
      const bool IsUpper = 'A' <= Byte && Byte <= 'Z';
      const unsigned char Folded = IsUpper ? Byte - 'A' + 'a' : Byte;
      H = djbHashChar(Folded, H);
      Buffer = Buffer.drop_front();
    } else {
      // Non-ASCII: full decode + fold + encode. The helper advances Buffer.
      H = caseFoldingDjbHashCharSlow(Buffer, H);
    }
  }
  return H;
}

0 comments on commit f144097

Please sign in to comment.